Files
Jungfraujoch/image_analysis/indexing/FFTIndexerGPU.cu
T
leonarski_fandClaude Opus 4.8 929e6e1fa0
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 26m41s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 29m5s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 25m41s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 24m59s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 29m21s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 30m14s
Build Packages / build:rpm (rocky8) (push) Successful in 24m32s
Build Packages / build:rpm (rocky9) (push) Successful in 28m25s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 25m5s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 23m57s
Build Packages / DIALS test (push) Successful in 33m44s
Build Packages / XDS test (durin plugin) (push) Successful in 20m34s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 16m31s
Build Packages / XDS test (neggia plugin) (push) Successful in 15m11s
Build Packages / Generate python client (push) Successful in 26s
Build Packages / Build documentation (push) Successful in 1m24s
Build Packages / Create release (push) Skipped
Build Packages / Unit tests (push) Successful in 41m41s
FFTIndexer: pick peaks by prominence above a local background, not raw magnitude
The per-direction peak search took argmax|spec|. The projected-spot histogram has a
broad low-frequency ENVELOPE (spots cluster near the origin) whose magnitude can
exceed the true lattice peaks, so on weak / pink-beam frames every direction reported
a short envelope vector (~10 A) and the real 38-79 A axes never surfaced -> 0 candidate
cells -> 0% indexing. (Diagnosed on the lyso jet: the FFT returned only 10-15 A vectors,
the true axes entirely absent.)

Subtract a running-mean background of half-width ~15 A and pick the peak by its
PROMINENCE (mag - background) instead. The smooth envelope cancels to ~0 while sharp
lattice peaks - fundamentals and harmonics alike - keep their height, so the real axes
win. The prominence is also reported as the magnitude, so FilterFFTResults ranks
directions by real-peak strength rather than envelope. Ported identically to CPU
(prefix-sum window) and GPU (sliding-window in the kernel).

Validation (lyso, de-novo): jet FFT 0% -> 20.5% (CPU and GPU identical; vs FFBIDX
27%); crystal 2 95.3% -> 95.5% (no regression, CC1/2 95.8 / CCref 92.7 unchanged).
The ~15 A window is the validated optimum (wider over-smooths, narrower under-removes
the envelope).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 07:25:06 +02:00

181 lines
7.8 KiB
Plaintext

// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
#include "FFTIndexerGPU.h"
#include <cufft.h>
#include <cmath>
#include <algorithm>
__device__ __host__ inline float complex_abs(const cufftComplex &z) {
return sqrtf(z.x * z.x + z.y * z.y);
}
__global__ void calculate_fft_result(
const cufftComplex *__restrict__ d_output,
const float max_length_A,
const float min_length_A,
const int histogram_size,
const int bg_half,
const int directions_size,
FFTResult *d_results) {
int i = blockIdx.x * blockDim.x + threadIdx.x; // Get thread index
if (i < directions_size) {
const int out_len = (histogram_size / 2) + 1;
size_t offset = static_cast<size_t>(out_len) * i;
float len_coeff = 2.0f * max_length_A / static_cast<float>(histogram_size);
// Pick the peak by PROMINENCE above a running-mean background of half-width bg_half:
// the projected histogram has a broad low-frequency envelope whose magnitude can
// exceed the true lattice peaks, so a plain argmax|spec| returns a short envelope
// vector on weak/pink-beam frames. Subtracting the local mean removes that envelope
// while sharp lattice peaks keep their height (mirrors FFTIndexerCPU).
double winsum = 0.0;
int wlo = 0;
int whi = min(bg_half, out_len - 1);
for (int k = wlo; k <= whi; ++k)
winsum += complex_abs(d_output[offset + k]);
float best_prom = 0.0f;
FFTResult result{.magnitude = 0.0f, .direction = i, .length = -1};
for (int j = 0; j < out_len; ++j) {
const int want_hi = min(j + bg_half, out_len - 1);
while (whi < want_hi) { ++whi; winsum += complex_abs(d_output[offset + whi]); }
const int want_lo = max(0, j - bg_half);
while (wlo < want_lo) { winsum -= complex_abs(d_output[offset + wlo]); ++wlo; }
const float len = len_coeff * static_cast<float>(j);
if (len <= min_length_A) continue;
const float mag = complex_abs(d_output[offset + j]);
const float bg = static_cast<float>(winsum / static_cast<double>(whi - wlo + 1));
const float prom = mag - bg;
if (prom > best_prom) {
best_prom = prom;
result.magnitude = prom;
result.length = len;
}
}
d_results[i] = result; // Store the result
}
}
__global__ void histogram_kernel(const float *__restrict__ coord_x,
const float *__restrict__ coord_y,
const float *__restrict__ coord_z,
const float *__restrict__ dir_x,
const float *__restrict__ dir_y,
const float *__restrict__ dir_z,
float histogram_spacing,
int histogram_size,
int coord_size,
int direction_vectors_size,
float *__restrict__ output) {
int direction_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (direction_idx < direction_vectors_size) {
int base_offset = direction_idx * histogram_size;
for (int i = 0; i < histogram_size; i++)
output[base_offset + i] = 0;
for (int i = 0; i < coord_size; i++) {
float dot = fabsf(
dir_x[direction_idx] * coord_x[i] + dir_y[direction_idx] * coord_y[i] + dir_z[direction_idx] * coord_z[
i]);
int64_t bin = static_cast<int64_t>(dot / histogram_spacing);
if (bin >= 0 && bin < histogram_size)
output[base_offset + bin] += 1.0;
}
}
}
inline void cuda_err(cudaError_t val) {
if (val != cudaSuccess)
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}
inline void cuda_err(cufftResult val) {
if (val != cufftResult::CUFFT_SUCCESS)
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, "CuFFT error");
}
FFTIndexerGPU::FFTIndexerGPU(const IndexingSettings &settings)
: FFTIndexer(settings), result_fft_reg(result_fft) {
d_input_fft = CudaDevicePtr<float>(input_size);
d_output_fft = CudaDevicePtr<cufftComplex>(output_size);
d_result_fft = CudaDevicePtr<FFTResult>(nDirections);
d_spot_x = CudaDevicePtr<float>(FFT_MAX_SPOTS);
d_spot_y = CudaDevicePtr<float>(FFT_MAX_SPOTS);
d_spot_z = CudaDevicePtr<float>(FFT_MAX_SPOTS);
spot_x = CudaHostPtr<float>(FFT_MAX_SPOTS);
spot_y = CudaHostPtr<float>(FFT_MAX_SPOTS);
spot_z = CudaHostPtr<float>(FFT_MAX_SPOTS);
d_dir_x = CudaDevicePtr<float>(nDirections);
d_dir_y = CudaDevicePtr<float>(nDirections);
d_dir_z = CudaDevicePtr<float>(nDirections);
std::vector<float> dir_x(nDirections);
std::vector<float> dir_y(nDirections);
std::vector<float> dir_z(nDirections);
for (int i = 0; i < nDirections; i++) {
dir_x[i] = direction_vectors.at(i).x;
dir_y[i] = direction_vectors.at(i).y;
dir_z[i] = direction_vectors.at(i).z;
}
cudaMemcpy(d_dir_x, dir_x.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_dir_y, dir_y.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_dir_z, dir_z.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
int n[1] = {static_cast<int32_t>(histogram_size)}; // Size of the FFT along a single dimension
plan = CudaFFTPlan(1, n, nullptr, 1, histogram_size, nullptr, 1, histogram_size / 2 + 1, CUFFT_R2C,
nDirections);
cuda_err(cufftSetStream(plan, stream));
}
void FFTIndexerGPU::ExecuteFFT(const std::vector<Coord> &coord, size_t nspots) {
int l_blockDim = 128;
int l_gridDim = (direction_vectors.size() + l_blockDim - 1) / l_blockDim;
for (int i = 0; i < nspots; i++) {
spot_x[i] = coord[i].x;
spot_y[i] = coord[i].y;
spot_z[i] = coord[i].z;
}
cudaMemcpyAsync(d_spot_x, spot_x, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_spot_y, spot_y, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_spot_z, spot_z, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
histogram_kernel<<<l_gridDim, l_blockDim, 0, stream>>>(d_spot_x, d_spot_y, d_spot_z,
d_dir_x, d_dir_y, d_dir_z,
histogram_spacing, histogram_size,
nspots,
direction_vectors.size(),
d_input_fft);
cuda_err(cufftExecR2C(plan, d_input_fft, d_output_fft));
// Background half-window ~15 A (length-based, so independent of histogram sizing); see
// FFTIndexerCPU for the prominence-vs-envelope rationale and the validated optimum.
const double len_coeff = 2.0 * static_cast<double>(max_length_A) / static_cast<double>(histogram_size);
const int bg_half = std::max(1, static_cast<int>(std::lround(15.0 / len_coeff)));
calculate_fft_result<<<l_gridDim, l_blockDim, 0, stream>>>(d_output_fft,
max_length_A, min_length_A, histogram_size,
bg_half,
direction_vectors.size(), d_result_fft);
cuda_err(cudaMemcpyAsync(result_fft.data(), d_result_fft, direction_vectors.size() * sizeof(FFTResult),
cudaMemcpyDeviceToHost, stream));
cuda_err(cudaStreamSynchronize(stream));
}