Files
Jungfraujoch/image_analysis/bragg_integration/BraggIntegrationEngineGPU.cu
T
leonarski_fandClaude Opus 4.8 ddddfb6ffc
Build Packages / build:windows:nocuda (pull_request) Successful in 14m41s
Build Packages / build:windows:cuda (pull_request) Successful in 16m48s
Build Packages / build:rpm (ubuntu2404_nocuda) (pull_request) Successful in 11m15s
Build Packages / build:rpm (rocky8_nocuda) (pull_request) Successful in 12m46s
Build Packages / build:rpm (ubuntu2204_nocuda) (pull_request) Successful in 12m38s
Build Packages / build:rpm (rocky9_nocuda) (pull_request) Successful in 13m11s
Build Packages / build:rpm (rocky8_sls9) (pull_request) Successful in 12m20s
Build Packages / build:rpm (rocky9_sls9) (pull_request) Successful in 12m22s
Build Packages / build:rpm (ubuntu2404) (pull_request) Successful in 11m7s
Build Packages / build:rpm (ubuntu2204) (pull_request) Successful in 11m55s
Build Packages / build:rpm (rocky8) (pull_request) Successful in 12m56s
Build Packages / Generate python client (pull_request) Successful in 14s
Build Packages / build:rpm (rocky9) (pull_request) Successful in 13m15s
Build Packages / Create release (pull_request) Skipped
Build Packages / Build documentation (pull_request) Successful in 41s
Build Packages / XDS test (durin plugin) (pull_request) Successful in 10m3s
Build Packages / DIALS test (pull_request) Successful in 13m6s
Build Packages / XDS test (neggia plugin) (pull_request) Successful in 6m58s
Build Packages / XDS test (JFJoch plugin) (pull_request) Successful in 7m30s
Build Packages / Unit tests (pull_request) Successful in 58m5s
Build Packages / Unit tests (push) Successful in 1h12m36s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 14m52s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 15m35s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 15m29s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 13m35s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 15m25s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 16m5s
Build Packages / build:rpm (rocky8) (push) Successful in 15m11s
Build Packages / build:rpm (rocky9) (push) Successful in 13m35s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m59s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 12m14s
Build Packages / DIALS test (push) Successful in 14m29s
Build Packages / XDS test (durin plugin) (push) Successful in 9m56s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 10m23s
Build Packages / XDS test (neggia plugin) (push) Successful in 9m3s
Build Packages / Generate python client (push) Successful in 20s
Build Packages / Build documentation (push) Successful in 1m10s
Build Packages / Create release (push) Skipped
Build Packages / build:windows:nocuda (push) Successful in 16m39s
Build Packages / build:windows:cuda (push) Successful in 18m40s
bragg_integration: GPU box + profile-fit integrator (standalone engine)
Reimplement BraggIntegrate2D (box sum) and ProfileIntegrate2D (Kabsch
profile fit) under one roof as a base + CPU + GPU engine, mirroring the
AzIntEngine / ROIIntegration pattern. Reads the preprocessed int32
ImagePreprocessorBuffer (masked=INT32_MIN, saturated=INT32_MAX), the same
buffer AzIntEngineGPU/ROIIntegrationGPU consume.

The CUDA engine runs one block per reflection with shared-memory
reductions across six kernels (reset, mask, box-sum, profile learning,
profile build, Kabsch fit); the resolution shell is computed inline. The
learning/fit hot path is single precision (FP64 is throttled on consumer
GPUs; reproduces the double CPU path to ~1e-4). Collapsing the per-frame
CUDA API calls into one reset kernel keeps launch-latency overhead low.

Standalone for now: NOT wired into IndexAndRefine. See
BRAGG_INTEGRATION_ENGINE.md for the design and the binding steps.
BraggIntegrationEngineGPUTest checks GPU == CPU across all three modes
(box/gaussian/empirical) within numeric tolerance, plus a [bragg_bench]
perf sweep.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-07-02 20:59:45 +02:00

481 lines
22 KiB
Plaintext

// SPDX-FileCopyrightText: 2026 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
#include "BraggIntegrationEngineGPU.h"
using namespace bragg_engine;
namespace {
// Convert a failing CUDA runtime status into a JFJochException (GPUCUDAError category)
// whose message is the runtime's own error string. cudaSuccess is a no-op.
inline void cuda_err(cudaError_t val) {
    if (val == cudaSuccess)
        return;
    throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}
// Per-run constants handed by value to every kernel (mirrors BraggIntegrationEngine's members).
// NOTE: the host fills this struct with ordered designated initialisers, so the member order
// below must not be changed.
struct BraggGpuParams {
    int W;                  // image width in pixels (also the row stride used for indexing)
    int H;                  // image height in pixels
    float r1_sq;            // squared radius of the inner (signal) disk
    float r2;               // radius of the masked disk / inner edge of the background annulus
    float r2_sq;            // r2 squared
    float r3;               // outer radius of the background annulus
    float r3_sq;            // r3 squared
    float min_sigma_ratio;  // lower bound on sigma expressed as a fraction of I
    int R;                  // profile grid half-width: grid offsets are (k % G - R, k / G - R)
    int G;                  // profile grid side length (presumably 2 * R + 1 - confirm on host side)
    int GG;                 // number of profile grid cells iterated by the kernels
    int do_clip;            // background sigma-clip (stills, profile modes)
    int empirical;          // ProfileEmpirical vs ProfileGaussian
    int broadband;          // non-zero: second-moment width uses the whole grid, not only r < r1
    int use_ellipse;        // non-zero: Gaussian profile may be elongated along the radial direction
    float bw_sigma;         // multiplies the radial pixel distance in the radial broadening term
    float c_radial;         // multiplies tan2t^2 in the radial broadening term
    float F_px;             // divisor turning radial pixel distance into tan2t
    float beam_x;           // x coordinate subtracted from predicted positions to get the radial vector
    float beam_y;           // y coordinate subtracted from predicted positions to get the radial vector
};
// A pixel is usable unless it carries one of the two sentinel values of the preprocessed
// buffer: INT32_MIN or INT32_MAX.
__device__ inline bool valid(int32_t v) {
    return !(v == INT32_MIN || v == INT32_MAX);
}
// --- Flag every pixel lying inside the r2 disk of any predicted reflection.
// Launch layout: one block per reflection (blockIdx.x is the reflection index); the threads of
// the block stride over the disk's bounding box. Every write stores the same value (1), so
// overlapping disks written by different blocks cannot conflict. ---
__global__ void mark_mask(const float *px_x, const float *px_y, uint8_t *mask, BraggGpuParams p, int n) {
    const int refl = blockIdx.x;
    if (refl >= n)
        return;

    const float cen_x = px_x[refl];
    const float cen_y = px_y[refl];

    // Bounding box of the disk with one pixel of slack, clamped to the image.
    const int col_lo = max(0, (int) floorf(cen_x - p.r2 - 1.0f));
    const int col_hi = min(p.W - 1, (int) ceilf(cen_x + p.r2 + 1.0f));
    const int row_lo = max(0, (int) floorf(cen_y - p.r2 - 1.0f));
    const int row_hi = min(p.H - 1, (int) ceilf(cen_y + p.r2 + 1.0f));
    const int box_w = col_hi - col_lo + 1;
    const int box_h = row_hi - row_lo + 1;
    if (box_w <= 0 || box_h <= 0)
        return;  // box lies entirely off the image

    const int box_area = box_w * box_h;
    for (int t = threadIdx.x; t < box_area; t += blockDim.x) {
        const int col = col_lo + t % box_w;
        const int row = row_lo + t / box_w;
        const float off_x = (float) col - cen_x;
        const float off_y = (float) row - cen_y;
        const bool inside = off_x * off_x + off_y * off_y < p.r2_sq;
        if (inside)
            mask[row * p.W + col] = 1;
    }
}
// --- Pass A box-sum: rough I / background / centroid / strong flag, one block per reflection. ---
//
// Launch layout: blockIdx.x is the reflection index; the threads of the block stride over the
// bounding box of the r3 disk and reduce into block-shared accumulators.
//
// Inputs : px_x / px_y  predicted positions, dd  per-reflection d value, img  preprocessed image
//          (sentinels rejected by valid()), mask  pixels flagged by mark_mask.
// Outputs: cx_o / cy_o  predicted position rounded to the nearest pixel, I_o  background-subtracted
//          sum over the inner disk, sigma_o  its error estimate, bkg_o  mean background per pixel,
//          obsx_o / obsy_o / hasobs_o  intensity-weighted centroid (only when the raw sum is > 0),
//          ok_o  acceptance flag, strong_o  I/sigma >= STRONG_I_OVER_SIGMA,
//          invd2mm[0] / invd2mm[1]  running min / max of 1/d^2 over accepted reflections.
// For rejected reflections only ok_o, strong_o and hasobs_o are written (all 0).
// Uses atomicAdd on double in shared memory, which requires compute capability 6.0 or newer.
__global__ void boxsum(const float *px_x, const float *px_y, const float *dd,
const int32_t *img, const uint8_t *mask, BraggGpuParams p, int n,
int *cx_o, int *cy_o, float *I_o, float *sigma_o, float *bkg_o,
float *obsx_o, float *obsy_o, uint8_t *ok_o, uint8_t *strong_o,
uint8_t *hasobs_o, unsigned long long *invd2mm) {
const int i = blockIdx.x;
// The exit depends only on blockIdx.x, so the whole block leaves together and no barrier is skipped.
if (i >= n) return;
// Integer sums are held as unsigned long long because that is the 64-bit type atomicAdd accepts;
// signed partials are added with two's-complement wrap and read back as long long below.
__shared__ unsigned long long s_Isum, s_Ix, s_Iy;
__shared__ int s_ninner, s_ninner_valid, s_nbkg;
__shared__ double s_bkgsum;
__shared__ int s_accept;
__shared__ double s_bkg, s_thr, s_clipsum;
__shared__ int s_clipn;
// Shared memory starts uninitialised: thread 0 zeroes the first-pass accumulators.
if (threadIdx.x == 0) {
s_Isum = 0; s_Ix = 0; s_Iy = 0;
s_ninner = 0; s_ninner_valid = 0; s_nbkg = 0; s_bkgsum = 0.0;
}
__syncthreads();
const float cx = px_x[i], cy = px_y[i];
// Bounding box of the r3 disk with one pixel of slack, clamped to the image.
const int x0 = max(0, (int) floorf(cx - p.r3 - 1.0f));
const int x1 = min(p.W - 1, (int) ceilf(cx + p.r3 + 1.0f));
const int y0 = max(0, (int) floorf(cy - p.r3 - 1.0f));
const int y1 = min(p.H - 1, (int) ceilf(cy + p.r3 + 1.0f));
const int bw = x1 - x0 + 1, bh = y1 - y0 + 1;
// area == 0 (box entirely off the image) makes both pixel loops no-ops; the reflection is then rejected.
const int area = (bw > 0 && bh > 0) ? bw * bh : 0;
// Per-thread partial sums, merged into the shared accumulators once after the loop.
long long l_Isum = 0, l_Ix = 0, l_Iy = 0;
int l_ni = 0, l_niv = 0, l_nb = 0;
double l_bkg = 0.0;
for (int t = threadIdx.x; t < area; t += blockDim.x) {
const int x = x0 + t % bw, y = y0 + t / bw;
const float ddx = (float) x - cx, ddy = (float) y - cy;
const float d2 = ddx * ddx + ddy * ddy;
const int32_t px = img[y * p.W + x];
if (d2 < p.r1_sq) {
// Inner disk: count every pixel, but sum only the valid ones (the two counts are compared later).
++l_ni;
if (valid(px)) { l_Isum += px; l_Ix += (long long) x * px; l_Iy += (long long) y * px; ++l_niv; }
} else if (d2 >= p.r2_sq && d2 < p.r3_sq) {
// Background annulus: skip pixels flagged in the mask and sentinel pixels.
if (mask[y * p.W + x]) continue;
if (!valid(px)) continue;
l_bkg += (double) px; ++l_nb;
}
}
atomicAdd(&s_Isum, (unsigned long long) l_Isum);
atomicAdd(&s_Ix, (unsigned long long) l_Ix);
atomicAdd(&s_Iy, (unsigned long long) l_Iy);
atomicAdd(&s_ninner, l_ni);
atomicAdd(&s_ninner_valid, l_niv);
atomicAdd(&s_nbkg, l_nb);
atomicAdd(&s_bkgsum, l_bkg);
__syncthreads();
if (threadIdx.x == 0) {
// Accept only when every inner-disk pixel was valid and more than 5 background pixels were found.
s_accept = (s_ninner_valid == s_ninner && s_nbkg > 5) ? 1 : 0;
s_bkg = s_accept ? (s_bkgsum / (double) s_nbkg) : 0.0;
// Clip threshold: mean background plus 3 * sqrt(max(mean, 1)).
s_thr = s_bkg + 3.0 * sqrt(fmax(s_bkg, 1.0));
s_clipsum = 0.0; s_clipn = 0;
}
__syncthreads();
// Second ring pass for the stills sigma-clip (re-reads the annulus; avoids storing bkg values).
// s_accept and p.do_clip are identical for all threads of the block, so the branch is uniform.
if (s_accept && p.do_clip) {
double c_l = 0.0; int cn_l = 0;
for (int t = threadIdx.x; t < area; t += blockDim.x) {
const int x = x0 + t % bw, y = y0 + t / bw;
const float ddx = (float) x - cx, ddy = (float) y - cy;
const float d2 = ddx * ddx + ddy * ddy;
if (!(d2 >= p.r2_sq && d2 < p.r3_sq)) continue;
if (mask[y * p.W + x]) continue;
const int32_t px = img[y * p.W + x];
if (!valid(px)) continue;
// Keep only annulus pixels at or below the threshold.
if ((double) px <= s_thr) { c_l += px; ++cn_l; }
}
atomicAdd(&s_clipsum, c_l); atomicAdd(&s_clipn, cn_l);
}
__syncthreads();
// Everything below is the per-reflection finalisation, done by thread 0 alone.
if (threadIdx.x != 0) return;
if (!s_accept) { ok_o[i] = 0; strong_o[i] = 0; hasobs_o[i] = 0; return; }
double bkg = s_bkg;
// The clipped mean replaces the plain mean only if more than 5 pixels survived the clip.
if (p.do_clip && s_clipn > 5) bkg = s_clipsum / (double) s_clipn;
const long long Isum = (long long) s_Isum;
// s_ninner equals the valid count here (acceptance condition), so every inner pixel gets bkg removed.
const double I = (double) Isum - (double) s_ninner * bkg;
// sigma is floored at 1 and at I * min_sigma_ratio; sqrt(raw sum) is folded in when the sum is positive.
double sigma = fmax(1.0, I * (double) p.min_sigma_ratio);
uint8_t hasobs = 0; double ox = 0.0, oy = 0.0;
if (Isum > 0) {
sigma = fmax(sigma, sqrt((double) Isum));
// Centroid of the raw (not background-subtracted) counts in the inner disk.
ox = (double) (long long) s_Ix / (double) Isum;
oy = (double) (long long) s_Iy / (double) Isum;
hasobs = 1;
}
cx_o[i] = (int) lroundf(cx);
cy_o[i] = (int) lroundf(cy);
I_o[i] = (float) I; sigma_o[i] = (float) sigma; bkg_o[i] = (float) bkg;
obsx_o[i] = (float) ox; obsy_o[i] = (float) oy; hasobs_o[i] = hasobs;
ok_o[i] = 1;
strong_o[i] = (sigma > 0.0 && I / sigma >= STRONG_I_OVER_SIGMA) ? 1 : 0;
const float d = dd[i];
if (d > 0.0f) {
// Positive doubles keep IEEE bit-pattern ordering, so atomicMin/Max on the ull view works.
const unsigned long long b = (unsigned long long) __double_as_longlong(1.0 / ((double) d * d));
atomicMin(&invd2mm[0], b);
atomicMax(&invd2mm[1], b);
}
}
// Map one reflection onto a resolution shell index in [0, N_SHELL) by placing its 1/d^2 linearly
// between the global minimum (invd2mm[0]) and maximum (invd2mm[1]), both stored as the bit
// patterns of positive doubles (mirrors CPU shell_of). Returns 0 when d is not positive or when
// the stored range is empty or degenerate. Evaluated inline by learn_profile and fit, so no
// separate shell array or kernel is needed.
__device__ inline int compute_shell(float d, const unsigned long long *invd2mm) {
    const unsigned long long lo_bits = invd2mm[0];
    const unsigned long long hi_bits = invd2mm[1];
    const bool has_range = hi_bits > lo_bits;
    if (!(d > 0.0f) || !has_range)
        return 0;

    const double lo = __longlong_as_double((long long) lo_bits);
    const double hi = __longlong_as_double((long long) hi_bits);
    const double inv_d2 = 1.0 / ((double) d * d);
    const int shell = (int) ((inv_d2 - lo) / (hi - lo) * N_SHELL);

    // Clamp to the valid shell range (the maximum itself would otherwise land on N_SHELL).
    if (shell < 0)
        return 0;
    if (shell >= N_SHELL)
        return N_SHELL - 1;
    return shell;
}
// --- Zero the profile accumulators and seed the inv-d^2 range, in one launch (replaces a handful
// of small cudaMemsetAsync calls, which matter when kernel-launch latency is high).
//
// shell_grid  : N_SHELL * GG floats, zeroed
// global_grid : GG floats, zeroed
// shell_n     : N_SHELL ints, zeroed
// global_n    : single int, zeroed
// invd2mm     : [0] seeded with all-ones (so any atomicMin wins), [1] with 0 (so any atomicMax wins)
//
// Every array is cleared with the same strided loop over the whole launch, so the result is
// correct for any grid/block size, including blocks narrower than N_SHELL. ---
__global__ void reset(float *shell_grid, float *global_grid, int *shell_n, int *global_n,
                      unsigned long long *invd2mm, int GG) {
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    for (int k = first; k < N_SHELL * GG; k += stride)
        shell_grid[k] = 0.0f;
    for (int k = first; k < GG; k += stride)
        global_grid[k] = 0.0f;
    // Previously only threads [0, N_SHELL) of block 0 cleared the counters, which silently
    // left some of them stale whenever blockDim.x < N_SHELL.
    for (int k = first; k < N_SHELL; k += stride)
        shell_n[k] = 0;

    if (first == 0) {
        *global_n = 0;
        invd2mm[0] = ~0ull; // min seed
        invd2mm[1] = 0ull;  // max seed
    }
}
// --- Profile learning. Each accepted, strong reflection with positive intensity contributes its
// background-subtracted pixel grid, divided by its own intensity, to the accumulator of its
// resolution shell and to the global accumulator, and bumps both spot counters.
// Launch layout: one block per reflection; threads stride over the GG grid cells. Grid cells that
// fall outside the image or hold a sentinel pixel are skipped. ---
__global__ void learn_profile(const int32_t *img, const int *cx_a, const int *cy_a, const float *dd,
                              const unsigned long long *invd2mm,
                              const float *I_a, const float *bkg_a, const uint8_t *ok_a, const uint8_t *strong_a,
                              float *shell_grid, float *global_grid, int *shell_n, int *global_n,
                              BraggGpuParams p, int n) {
    const int refl = blockIdx.x;
    if (refl >= n)
        return;
    if (!ok_a[refl] || !strong_a[refl])
        return;

    const float intensity = I_a[refl];
    if (!(intensity > 0.0f))
        return;  // also rejects NaN

    const int cen_x = cx_a[refl];
    const int cen_y = cy_a[refl];
    const int shell = compute_shell(dd[refl], invd2mm);
    const float background = bkg_a[refl];
    float *shell_acc = shell_grid + (size_t) shell * p.GG;

    for (int k = threadIdx.x; k < p.GG; k += blockDim.x) {
        const int x = cen_x + (k % p.G - p.R);
        const int y = cen_y + (k / p.G - p.R);
        const bool in_image = x >= 0 && y >= 0 && x < p.W && y < p.H;
        if (!in_image)
            continue;
        const int32_t px = img[y * p.W + x];
        if (valid(px)) {
            const float v = ((float) px - background) / intensity;
            atomicAdd(&shell_acc[k], v);
            atomicAdd(&global_grid[k], v);
        }
    }

    // One count per contributing reflection.
    if (threadIdx.x == 0) {
        atomicAdd(&shell_n[shell], 1);
        atomicAdd(global_n, 1);
    }
}
// --- Turn each accumulated grid into a width estimate and, in empirical mode, a unit-sum profile.
// Launch layout: one block per grid - blocks [0, N_SHELL) handle the shells, block N_SHELL the
// global grid. Negative accumulator cells are clamped to zero before use.
// Width: sigma^2 = max(0.25, <r^2> / 2), where <r^2> is the weight-averaged squared distance from
// the grid centre (restricted to r^2 < r1_sq unless p.broadband is set); 1.0 when the grid has no
// contributing spots or no weight.
// Profile (p.empirical only): clamped grid divided by its sum, or all zeros when nothing was learned. ---
__global__ void build_profiles(const float *shell_grid, const float *global_grid,
                               const int *shell_n, const int *global_n,
                               float *shell_P, float *global_P, float *shell_sigma2, float *global_sigma2,
                               BraggGpuParams p) {
    const int b = blockIdx.x;
    const bool is_shell = b < N_SHELL;

    // Select the accumulator, spot count and outputs this block is responsible for.
    const float *grid = is_shell ? shell_grid + (size_t) b * p.GG : global_grid;
    const int nstrong = is_shell ? shell_n[b] : *global_n;
    float *P = is_shell ? shell_P + (size_t) b * p.GG : global_P;
    float *sig2 = is_shell ? &shell_sigma2[b] : global_sigma2;

    __shared__ float s_m2, s_m2w, s_sum;
    if (threadIdx.x == 0) {
        s_m2 = 0.0f;
        s_m2w = 0.0f;
        s_sum = 0.0f;
    }
    __syncthreads();

    float part_m2 = 0.0f, part_w = 0.0f, part_sum = 0.0f;
    for (int k = threadIdx.x; k < p.GG; k += blockDim.x) {
        const int dx = k % p.G - p.R;
        const int dy = k / p.G - p.R;
        const int r2i = dx * dx + dy * dy;
        const float g = fmaxf(0.0f, grid[k]);
        if (p.broadband || (float) r2i < p.r1_sq) {
            part_m2 += g * (float) r2i;
            part_w += g;
        }
        part_sum += g;
        // Stash the clamped value; the same thread revisits the same k below to normalise it.
        if (p.empirical)
            P[k] = g;
    }
    atomicAdd(&s_m2, part_m2);
    atomicAdd(&s_m2w, part_w);
    atomicAdd(&s_sum, part_sum);
    __syncthreads();

    if (threadIdx.x == 0) {
        const bool learned = nstrong > 0 && s_m2w > 0.0f;
        *sig2 = learned ? fmaxf(0.25f, (s_m2 / s_m2w) / 2.0f) : 1.0f;
    }

    if (!p.empirical)
        return;

    const float sum = s_sum;
    const bool normalise = nstrong > 0 && sum > 0.0f;
    for (int k = threadIdx.x; k < p.GG; k += blockDim.x)
        P[k] = normalise ? P[k] / sum : 0.0f;
}
// --- Pass B Kabsch profile fit: I = sum P(c-B)/v over sum P^2/v, v = B + max(I,0)P (iterate).
// One block per reflection; the (possibly elongated) profile is built in shared memory. ---
//
// Launch layout: blockIdx.x is the reflection index; threads stride over the profile grid.
// Dynamic shared memory: Pbuf must hold the largest grid used, i.e. p.GG floats in empirical mode
// and up to (2 * 3 * p.R + 1)^2 floats in Gaussian mode (Rf is capped at 3 * p.R below).
// Inputs : I_seed  starting intensity, bkg_a  per-pixel background, ok_a  pass-A acceptance flag,
//          shell_P / global_P  learned profiles, shell_sigma2 / global_sigma2  learned widths,
//          shell_n  number of spots learned per shell.
// Outputs: I_o  fitted intensity, sigma_o  sqrt(1 / sum P^2/v), ok_o  1 on success, else 0.
// Each block reads and writes only index i, which is why the host launch in this file can pass
// the same buffers for I_seed / I_o and for ok_a / ok_o.
__global__ void fit(const int32_t *img, const float *px_x, const float *px_y,
const int *cx_a, const int *cy_a, const float *dd, const unsigned long long *invd2mm,
const float *I_seed, const float *bkg_a, const uint8_t *ok_a,
const float *shell_P, const float *global_P,
const float *shell_sigma2, const float *global_sigma2, const int *shell_n,
float *I_o, float *sigma_o, uint8_t *ok_o, BraggGpuParams p, int n) {
const int i = blockIdx.x;
if (i >= n) return;
extern __shared__ float Pbuf[];
__shared__ float s_gs, s_num, s_den, s_I;
__shared__ int s_Rf, s_Gf;
// Rejected in pass A: nothing to fit. The condition depends only on i, so the whole block exits together.
if (!ok_a[i]) { if (threadIdx.x == 0) ok_o[i] = 0; return; }
const int cx = cx_a[i], cy = cy_a[i];
const int sh = compute_shell(dd[i], invd2mm);
const bool use_shell = shell_n[sh] >= MIN_STRONG_PER_SHELL; // else fall back to the global profile
const float bkg = bkg_a[i];
if (p.empirical) {
// Empirical mode: copy the learned profile into shared memory unchanged (grid half-width R, side G).
const float *Psrc = use_shell ? (shell_P + (size_t) sh * p.GG) : global_P;
if (threadIdx.x == 0) { s_Rf = p.R; s_Gf = p.G; }
__syncthreads();
for (int k = threadIdx.x; k < p.GG; k += blockDim.x) Pbuf[k] = Psrc[k];
__syncthreads();
} else {
// Gaussian mode: synthesise the profile from the learned width s2t.
const float s2t = use_shell ? shell_sigma2[sh] : *global_sigma2;
// Radial vector from (beam_x, beam_y) to the predicted position, and its length in pixels.
const float rx = px_x[i] - p.beam_x, ry = px_y[i] - p.beam_y;
const float Rpx = sqrtf(rx * rx + ry * ry);
const float tan2t = Rpx / p.F_px;
// Defaults describe an isotropic Gaussian: radial variance equals s2t, radial axis along +x.
float s2r = s2t, ux = 1.0f, uy = 0.0f;
bool elong = false;
if (p.use_ellipse) {
const float sbw = p.bw_sigma * Rpx;
const float radial_extra = sbw * sbw + p.c_radial * tan2t * tan2t;
// Elongate only away from the beam position and when the extra radial variance exceeds 0.25.
if (Rpx > 1e-6f && radial_extra > 0.25f) { ux = rx / Rpx; uy = ry / Rpx; s2r = s2t + radial_extra; elong = true; }
}
// An elongated profile gets a larger grid, capped at 3 * R.
const int Rf = elong ? min(3 * p.R, (int) ceilf(p.r2 + 2.0f * sqrtf(s2r))) : p.R;
const int Gf = 2 * Rf + 1;
if (threadIdx.x == 0) { s_Rf = Rf; s_Gf = Gf; s_gs = 0.0f; }
__syncthreads();
// Offset of the predicted position from the integer grid centre, so the Gaussian is centred on the prediction.
const float fx = px_x[i] - cx, fy = px_y[i] - cy;
float l_gs = 0.0f;
for (int k = threadIdx.x; k < Gf * Gf; k += blockDim.x) {
const float ex = (k % Gf - Rf) - fx, ey = (k / Gf - Rf) - fy;
// Rotate into (radial, tangential) coordinates; the radial axis uses s2r, the tangential one s2t.
const float rad = ex * ux + ey * uy, tn = -ex * uy + ey * ux;
const float g = expf(-rad * rad / (2.0f * s2r) - tn * tn / (2.0f * s2t));
Pbuf[k] = g; l_gs += g;
}
atomicAdd(&s_gs, l_gs);
__syncthreads();
// Normalise the profile to unit sum over the grid.
const float gs = s_gs;
for (int k = threadIdx.x; k < Gf * Gf; k += blockDim.x) Pbuf[k] /= gs;
__syncthreads();
}
// From here on both modes share the same code; grid geometry comes from shared memory.
const int Rf = s_Rf, Gf = s_Gf, GfGf = Gf * Gf;
// Background used in the variance model is floored at 1.
const float B = fmaxf(bkg, 1.0f);
if (threadIdx.x == 0) s_I = I_seed[i];
__syncthreads();
// Fixed number of refinement passes: the variance v depends on the current intensity estimate.
for (int iter = 0; iter < 4; ++iter) {
if (threadIdx.x == 0) { s_num = 0.0f; s_den = 0.0f; }
__syncthreads();
const float Ihere = s_I;
float l_num = 0.0f, l_den = 0.0f;
for (int k = threadIdx.x; k < GfGf; k += blockDim.x) {
const float Pp = Pbuf[k];
// Cells with no profile weight, outside the image, or holding a sentinel pixel do not contribute.
if (Pp <= 0.0f) continue;
const int x = cx + (k % Gf - Rf), y = cy + (k / Gf - Rf);
if (x < 0 || y < 0 || x >= p.W || y >= p.H) continue;
const int32_t px = img[y * p.W + x];
if (!valid(px)) continue;
const float v = B + fmaxf(0.0f, Ihere) * Pp;
l_num += Pp * ((float) px - bkg) / v;
l_den += Pp * Pp / v;
}
atomicAdd(&s_num, l_num); atomicAdd(&s_den, l_den);
__syncthreads();
// Keep the previous estimate when no pixel contributed (s_den == 0).
if (threadIdx.x == 0 && s_den > 0.0f) s_I = s_num / s_den;
// This barrier also separates the s_den read above from its reset at the top of the next pass.
__syncthreads();
}
if (threadIdx.x == 0) {
// s_den still holds the value of the last pass; no contributing pixel means the fit failed.
if (s_den > 0.0f) { I_o[i] = s_I; sigma_o[i] = sqrtf(1.0f / s_den); ok_o[i] = 1; }
else ok_o[i] = 0;
}
}
} // namespace
// Construct the GPU engine: allocate the fixed-size device buffers (pixel mask, per-shell and
// global profile accumulators / profiles / widths / counters, inv-d^2 range) and work out how much
// dynamic shared memory the fit kernel needs.
//
// Throws JFJochException (GPUCUDAError) if a CUDA call fails or if the fit profile grid does not
// fit into the per-block shared memory of the device that is current on the calling thread.
BraggIntegrationEngineGPU::BraggIntegrationEngineGPU(const DiffractionExperiment &experiment,
                                                     std::shared_ptr<CudaStream> stream)
    : BraggIntegrationEngine(experiment),
      stream(std::move(stream)),
      d_mask(npixel),
      d_shell_grid(static_cast<size_t>(bragg_engine::N_SHELL) * GG),
      d_global_grid(GG),
      d_shell_P(static_cast<size_t>(bragg_engine::N_SHELL) * GG),
      d_global_P(GG),
      d_shell_sigma2(bragg_engine::N_SHELL),
      d_global_sigma2(1),
      d_shell_n(bragg_engine::N_SHELL),
      d_global_n(1),
      d_invd2(2) {
    threads = 128;

    // Fit profile grid: R for empirical / box, up to 3R (radially elongated) for the Gaussian.
    const int max_Rf = empirical ? R : 3 * R;
    const int max_Gf = 2 * max_Rf + 1;
    fit_shared_bytes = static_cast<size_t>(max_Gf) * max_Gf * sizeof(float);

    // Check the limit of the device this engine actually runs on (the current device), not of
    // device 0: on a multi-GPU host the two can differ.
    int device = 0;
    cuda_err(cudaGetDevice(&device));
    cudaDeviceProp prop{};
    cuda_err(cudaGetDeviceProperties(&prop, device));
    if (fit_shared_bytes > prop.sharedMemPerBlock)
        throw JFJochException(JFJochExceptionCategory::GPUCUDAError,
                              "BraggIntegrationEngineGPU: profile grid exceeds shared memory (r2 too large)");
}
// Make sure all per-reflection device buffers and host staging vectors can hold n entries.
// Buffers only grow: when the current capacity already covers n nothing is touched; otherwise
// every buffer is replaced by (device) or resized to (host) exactly n elements.
void BraggIntegrationEngineGPU::EnsureCapacity(size_t n) {
    if (capacity >= n)
        return;

    // Device side: kernel inputs.
    d_px_x = CudaDevicePtr<float>(n);
    d_px_y = CudaDevicePtr<float>(n);
    d_d = CudaDevicePtr<float>(n);

    // Device side: kernel outputs and intermediates.
    d_cx = CudaDevicePtr<int>(n);
    d_cy = CudaDevicePtr<int>(n);
    d_I = CudaDevicePtr<float>(n);
    d_sigma = CudaDevicePtr<float>(n);
    d_bkg = CudaDevicePtr<float>(n);
    d_obs_x = CudaDevicePtr<float>(n);
    d_obs_y = CudaDevicePtr<float>(n);
    d_ok = CudaDevicePtr<uint8_t>(n);
    d_strong = CudaDevicePtr<uint8_t>(n);
    d_has_obs = CudaDevicePtr<uint8_t>(n);

    // Host side: staging for uploads ...
    h_px_x.resize(n);
    h_px_y.resize(n);
    h_d.resize(n);
    // ... and for downloads.
    h_I.resize(n);
    h_sigma.resize(n);
    h_bkg.resize(n);
    h_obs_x.resize(n);
    h_obs_y.resize(n);
    h_ok.resize(n);
    h_has_obs.resize(n);

    capacity = n;
}
// Integrate the first npredicted entries of `predicted` on the GPU and hand the per-reflection
// results to Finalize().
//
// image        preprocessed image; must contain npixel elements and expose a GPU buffer
// predicted    predicted reflections (predicted_x, predicted_y and d are uploaded)
// npredicted   number of leading entries of `predicted` to process
// image_number forwarded unchanged to Finalize()
//
// Returns whatever Finalize() builds. If the image size does not match or npredicted is 0,
// Finalize() is called with default-constructed results and no GPU work is done.
// Throws JFJochException: InputParameterInvalid when the image has no GPU buffer or npredicted is
// out of range; GPUCUDAError when a CUDA call or kernel launch fails.
// Blocks until all work queued on the stream has completed.
std::vector<Reflection> BraggIntegrationEngineGPU::Run(const ImagePreprocessorBuffer &image,
                                                       const std::vector<Reflection> &predicted,
                                                       size_t npredicted, int64_t image_number) {
    std::vector<BraggFitResult> results(npredicted);
    if (image.size() != npixel || npredicted == 0)
        return Finalize(predicted, npredicted, results, image_number);

    const int32_t *img = image.getGPUBuffer();
    if (img == nullptr)
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
                              "BraggIntegrationEngineGPU: image buffer is not on the GPU");

    // predicted[i] is read for every i < npredicted, and npredicted becomes the (int) grid size.
    if (npredicted > predicted.size() || npredicted > static_cast<size_t>(INT32_MAX))
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
                              "BraggIntegrationEngineGPU: npredicted is out of range");

    EnsureCapacity(npredicted);
    const int n = static_cast<int>(npredicted);

    // Stage the kernel inputs as separate arrays and upload them.
    for (size_t i = 0; i < npredicted; ++i) {
        h_px_x[i] = predicted[i].predicted_x;
        h_px_y[i] = predicted[i].predicted_y;
        h_d[i] = predicted[i].d;
    }
    cuda_err(cudaMemcpyAsync(d_px_x, h_px_x.data(), sizeof(float) * npredicted, cudaMemcpyHostToDevice, *stream));
    cuda_err(cudaMemcpyAsync(d_px_y, h_px_y.data(), sizeof(float) * npredicted, cudaMemcpyHostToDevice, *stream));
    cuda_err(cudaMemcpyAsync(d_d, h_d.data(), sizeof(float) * npredicted, cudaMemcpyHostToDevice, *stream));

    BraggGpuParams p{
        .W = static_cast<int>(xpixel), .H = static_cast<int>(ypixel),
        .r1_sq = r1_sq, .r2 = r2, .r2_sq = r2_sq, .r3 = r3, .r3_sq = r3_sq,
        .min_sigma_ratio = min_sigma_ratio,
        .R = R, .G = G, .GG = GG,
        .do_clip = (apply_bkg_clip && mode != IntegratorMode::BoxSum) ? 1 : 0,
        .empirical = empirical ? 1 : 0,
        .broadband = broadband ? 1 : 0,
        .use_ellipse = use_ellipse ? 1 : 0,
        .bw_sigma = static_cast<float>(bw_sigma), .c_radial = static_cast<float>(c_radial),
        .F_px = static_cast<float>(F_px),
        .beam_x = beam_x, .beam_y = beam_y,
    };

    // A kernel launch reports a bad configuration only through cudaGetLastError(); without these
    // checks a failed launch would go unnoticed and stale device data would be copied back.

    // Pass A: reset accumulators, mask, then box-sum.
    cuda_err(cudaMemsetAsync(d_mask, 0, npixel, *stream));
    reset<<<32, 256, 0, *stream>>>(d_shell_grid, d_global_grid, d_shell_n, d_global_n, d_invd2, GG);
    cuda_err(cudaGetLastError());
    mark_mask<<<n, threads, 0, *stream>>>(d_px_x, d_px_y, d_mask, p, n);
    cuda_err(cudaGetLastError());
    boxsum<<<n, threads, 0, *stream>>>(d_px_x, d_px_y, d_d, img, d_mask, p, n,
                                       d_cx, d_cy, d_I, d_sigma, d_bkg, d_obs_x, d_obs_y,
                                       d_ok, d_strong, d_has_obs, d_invd2);
    cuda_err(cudaGetLastError());

    if (mode != IntegratorMode::BoxSum) {
        // Pass B: learn (shell computed inline) -> build -> fit.
        learn_profile<<<n, threads, 0, *stream>>>(img, d_cx, d_cy, d_d, d_invd2, d_I, d_bkg, d_ok, d_strong,
                                                  d_shell_grid, d_global_grid, d_shell_n, d_global_n, p, n);
        cuda_err(cudaGetLastError());
        build_profiles<<<bragg_engine::N_SHELL + 1, threads, 0, *stream>>>(
                d_shell_grid, d_global_grid, d_shell_n, d_global_n,
                d_shell_P, d_global_P, d_shell_sigma2, d_global_sigma2, p);
        cuda_err(cudaGetLastError());
        // The fit overwrites the box-sum I / sigma / ok in place (same buffers in and out).
        fit<<<n, threads, fit_shared_bytes, *stream>>>(img, d_px_x, d_px_y, d_cx, d_cy, d_d, d_invd2,
                                                       d_I, d_bkg, d_ok, d_shell_P, d_global_P,
                                                       d_shell_sigma2, d_global_sigma2, d_shell_n,
                                                       d_I, d_sigma, d_ok, p, n);
        cuda_err(cudaGetLastError());
    }

    cuda_err(cudaMemcpyAsync(h_I.data(), d_I, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
    cuda_err(cudaMemcpyAsync(h_sigma.data(), d_sigma, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
    cuda_err(cudaMemcpyAsync(h_bkg.data(), d_bkg, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
    cuda_err(cudaMemcpyAsync(h_ok.data(), d_ok, sizeof(uint8_t) * npredicted, cudaMemcpyDeviceToHost, *stream));

    // The observed centroid is only reported in box-sum mode, so only then is it downloaded.
    const bool boxsum_mode = mode == IntegratorMode::BoxSum;
    if (boxsum_mode) {
        cuda_err(cudaMemcpyAsync(h_obs_x.data(), d_obs_x, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
        cuda_err(cudaMemcpyAsync(h_obs_y.data(), d_obs_y, sizeof(float) * npredicted, cudaMemcpyDeviceToHost, *stream));
        cuda_err(cudaMemcpyAsync(h_has_obs.data(), d_has_obs, sizeof(uint8_t) * npredicted, cudaMemcpyDeviceToHost, *stream));
    }
    cuda_err(cudaStreamSynchronize(*stream));

    // Rejected reflections keep their default-constructed result.
    for (size_t i = 0; i < npredicted; ++i) {
        if (!h_ok[i])
            continue;
        results[i].I = h_I[i];
        results[i].sigma = h_sigma[i];
        results[i].bkg = h_bkg[i];
        results[i].ok = true;
        if (boxsum_mode && h_has_obs[i]) {
            results[i].observed_x = h_obs_x[i];
            results[i].observed_y = h_obs_y[i];
            results[i].has_observed = true;
        }
    }
    return Finalize(predicted, npredicted, results, image_number);
}