Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 26m41s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 29m5s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 25m41s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 24m59s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 29m21s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 30m14s
Build Packages / build:rpm (rocky8) (push) Successful in 24m32s
Build Packages / build:rpm (rocky9) (push) Successful in 28m25s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 25m5s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 23m57s
Build Packages / DIALS test (push) Successful in 33m44s
Build Packages / XDS test (durin plugin) (push) Successful in 20m34s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 16m31s
Build Packages / XDS test (neggia plugin) (push) Successful in 15m11s
Build Packages / Generate python client (push) Successful in 26s
Build Packages / Build documentation (push) Successful in 1m24s
Build Packages / Create release (push) Skipped
Build Packages / Unit tests (push) Successful in 41m41s
The per-direction peak search took argmax|spec|. The projected-spot histogram has a broad low-frequency ENVELOPE (spots cluster near the origin) whose magnitude can exceed the true lattice peaks, so on weak / pink-beam frames every direction reported a short envelope vector (~10 A) and the real 38-79 A axes never surfaced -> 0 candidate cells -> 0% indexing. (Diagnosed on the lyso jet: the FFT returned only 10-15 A vectors, the true axes entirely absent.) Subtract a running-mean background of half-width ~15 A and pick the peak by its PROMINENCE (mag - background) instead. The smooth envelope cancels to ~0 while sharp lattice peaks - fundamentals and harmonics alike - keep their height, so the real axes win. The prominence is also reported as the magnitude, so FilterFFTResults ranks directions by real-peak strength rather than envelope. Ported identically to CPU (prefix-sum window) and GPU (sliding-window in the kernel). Validation (lyso, de-novo): jet FFT 0% -> 20.5% (CPU and GPU identical; vs FFBIDX 27%); crystal 2 95.3% -> 95.5% (no regression, CC1/2 95.8 / CCref 92.7 unchanged). The ~15 A window is the validated optimum (wider over-smooths, narrower under-removes the envelope). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
181 lines
7.8 KiB
Plaintext
181 lines
7.8 KiB
Plaintext
// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
#include "FFTIndexerGPU.h"
|
|
#include <cufft.h>
|
|
#include <cmath>
|
|
#include <algorithm>
|
|
|
|
__device__ __host__ inline float complex_abs(const cufftComplex &z) {
|
|
return sqrtf(z.x * z.x + z.y * z.y);
|
|
}
|
|
|
|
__global__ void calculate_fft_result(
|
|
const cufftComplex *__restrict__ d_output,
|
|
const float max_length_A,
|
|
const float min_length_A,
|
|
const int histogram_size,
|
|
const int bg_half,
|
|
const int directions_size,
|
|
FFTResult *d_results) {
|
|
int i = blockIdx.x * blockDim.x + threadIdx.x; // Get thread index
|
|
|
|
if (i < directions_size) {
|
|
const int out_len = (histogram_size / 2) + 1;
|
|
size_t offset = static_cast<size_t>(out_len) * i;
|
|
float len_coeff = 2.0f * max_length_A / static_cast<float>(histogram_size);
|
|
|
|
// Pick the peak by PROMINENCE above a running-mean background of half-width bg_half:
|
|
// the projected histogram has a broad low-frequency envelope whose magnitude can
|
|
// exceed the true lattice peaks, so a plain argmax|spec| returns a short envelope
|
|
// vector on weak/pink-beam frames. Subtracting the local mean removes that envelope
|
|
// while sharp lattice peaks keep their height (mirrors FFTIndexerCPU).
|
|
double winsum = 0.0;
|
|
int wlo = 0;
|
|
int whi = min(bg_half, out_len - 1);
|
|
for (int k = wlo; k <= whi; ++k)
|
|
winsum += complex_abs(d_output[offset + k]);
|
|
|
|
float best_prom = 0.0f;
|
|
FFTResult result{.magnitude = 0.0f, .direction = i, .length = -1};
|
|
|
|
for (int j = 0; j < out_len; ++j) {
|
|
const int want_hi = min(j + bg_half, out_len - 1);
|
|
while (whi < want_hi) { ++whi; winsum += complex_abs(d_output[offset + whi]); }
|
|
const int want_lo = max(0, j - bg_half);
|
|
while (wlo < want_lo) { winsum -= complex_abs(d_output[offset + wlo]); ++wlo; }
|
|
|
|
const float len = len_coeff * static_cast<float>(j);
|
|
if (len <= min_length_A) continue;
|
|
|
|
const float mag = complex_abs(d_output[offset + j]);
|
|
const float bg = static_cast<float>(winsum / static_cast<double>(whi - wlo + 1));
|
|
const float prom = mag - bg;
|
|
if (prom > best_prom) {
|
|
best_prom = prom;
|
|
result.magnitude = prom;
|
|
result.length = len;
|
|
}
|
|
}
|
|
d_results[i] = result; // Store the result
|
|
}
|
|
}
|
|
|
|
__global__ void histogram_kernel(const float *__restrict__ coord_x,
|
|
const float *__restrict__ coord_y,
|
|
const float *__restrict__ coord_z,
|
|
const float *__restrict__ dir_x,
|
|
const float *__restrict__ dir_y,
|
|
const float *__restrict__ dir_z,
|
|
float histogram_spacing,
|
|
int histogram_size,
|
|
int coord_size,
|
|
int direction_vectors_size,
|
|
float *__restrict__ output) {
|
|
int direction_idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
if (direction_idx < direction_vectors_size) {
|
|
int base_offset = direction_idx * histogram_size;
|
|
|
|
for (int i = 0; i < histogram_size; i++)
|
|
output[base_offset + i] = 0;
|
|
|
|
for (int i = 0; i < coord_size; i++) {
|
|
float dot = fabsf(
|
|
dir_x[direction_idx] * coord_x[i] + dir_y[direction_idx] * coord_y[i] + dir_z[direction_idx] * coord_z[
|
|
i]);
|
|
int64_t bin = static_cast<int64_t>(dot / histogram_spacing);
|
|
if (bin >= 0 && bin < histogram_size)
|
|
output[base_offset + bin] += 1.0;
|
|
}
|
|
}
|
|
}
|
|
|
|
inline void cuda_err(cudaError_t val) {
|
|
if (val != cudaSuccess)
|
|
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
|
|
}
|
|
|
|
inline void cuda_err(cufftResult val) {
|
|
if (val != cufftResult::CUFFT_SUCCESS)
|
|
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, "CuFFT error");
|
|
}
|
|
|
|
FFTIndexerGPU::FFTIndexerGPU(const IndexingSettings &settings)
|
|
: FFTIndexer(settings), result_fft_reg(result_fft) {
|
|
d_input_fft = CudaDevicePtr<float>(input_size);
|
|
d_output_fft = CudaDevicePtr<cufftComplex>(output_size);
|
|
d_result_fft = CudaDevicePtr<FFTResult>(nDirections);
|
|
|
|
d_spot_x = CudaDevicePtr<float>(FFT_MAX_SPOTS);
|
|
d_spot_y = CudaDevicePtr<float>(FFT_MAX_SPOTS);
|
|
d_spot_z = CudaDevicePtr<float>(FFT_MAX_SPOTS);
|
|
|
|
spot_x = CudaHostPtr<float>(FFT_MAX_SPOTS);
|
|
spot_y = CudaHostPtr<float>(FFT_MAX_SPOTS);
|
|
spot_z = CudaHostPtr<float>(FFT_MAX_SPOTS);
|
|
|
|
d_dir_x = CudaDevicePtr<float>(nDirections);
|
|
d_dir_y = CudaDevicePtr<float>(nDirections);
|
|
d_dir_z = CudaDevicePtr<float>(nDirections);
|
|
|
|
std::vector<float> dir_x(nDirections);
|
|
std::vector<float> dir_y(nDirections);
|
|
std::vector<float> dir_z(nDirections);
|
|
|
|
for (int i = 0; i < nDirections; i++) {
|
|
dir_x[i] = direction_vectors.at(i).x;
|
|
dir_y[i] = direction_vectors.at(i).y;
|
|
dir_z[i] = direction_vectors.at(i).z;
|
|
}
|
|
|
|
cudaMemcpy(d_dir_x, dir_x.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
|
|
cudaMemcpy(d_dir_y, dir_y.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
|
|
cudaMemcpy(d_dir_z, dir_z.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
|
|
|
|
int n[1] = {static_cast<int32_t>(histogram_size)}; // Size of the FFT along a single dimension
|
|
|
|
plan = CudaFFTPlan(1, n, nullptr, 1, histogram_size, nullptr, 1, histogram_size / 2 + 1, CUFFT_R2C,
|
|
nDirections);
|
|
cuda_err(cufftSetStream(plan, stream));
|
|
}
|
|
|
|
|
|
void FFTIndexerGPU::ExecuteFFT(const std::vector<Coord> &coord, size_t nspots) {
|
|
int l_blockDim = 128;
|
|
int l_gridDim = (direction_vectors.size() + l_blockDim - 1) / l_blockDim;
|
|
|
|
for (int i = 0; i < nspots; i++) {
|
|
spot_x[i] = coord[i].x;
|
|
spot_y[i] = coord[i].y;
|
|
spot_z[i] = coord[i].z;
|
|
}
|
|
|
|
cudaMemcpyAsync(d_spot_x, spot_x, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
|
|
cudaMemcpyAsync(d_spot_y, spot_y, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
|
|
cudaMemcpyAsync(d_spot_z, spot_z, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
|
|
|
|
histogram_kernel<<<l_gridDim, l_blockDim, 0, stream>>>(d_spot_x, d_spot_y, d_spot_z,
|
|
d_dir_x, d_dir_y, d_dir_z,
|
|
histogram_spacing, histogram_size,
|
|
nspots,
|
|
direction_vectors.size(),
|
|
d_input_fft);
|
|
|
|
cuda_err(cufftExecR2C(plan, d_input_fft, d_output_fft));
|
|
|
|
// Background half-window ~15 A (length-based, so independent of histogram sizing); see
|
|
// FFTIndexerCPU for the prominence-vs-envelope rationale and the validated optimum.
|
|
const double len_coeff = 2.0 * static_cast<double>(max_length_A) / static_cast<double>(histogram_size);
|
|
const int bg_half = std::max(1, static_cast<int>(std::lround(15.0 / len_coeff)));
|
|
|
|
calculate_fft_result<<<l_gridDim, l_blockDim, 0, stream>>>(d_output_fft,
|
|
max_length_A, min_length_A, histogram_size,
|
|
bg_half,
|
|
direction_vectors.size(), d_result_fft);
|
|
|
|
cuda_err(cudaMemcpyAsync(result_fft.data(), d_result_fft, direction_vectors.size() * sizeof(FFTResult),
|
|
cudaMemcpyDeviceToHost, stream));
|
|
cuda_err(cudaStreamSynchronize(stream));
|
|
}
|