Jungfraujoch/image_analysis/indexing/FFTIndexerGPU.cu

// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

#include "FFTIndexerGPU.h"
#include <cufft.h>
#include <cmath>
#include <algorithm>

__device__ __host__ inline float complex_abs(const cufftComplex &z) {
    return sqrtf(z.x * z.x + z.y * z.y);
}

__global__ void calculate_fft_result(
    const cufftComplex *__restrict__ d_output,
    const float max_length_A,
    const float min_length_A,
    const int histogram_size,
    const int bg_half,
    const int directions_size,
    FFTResult *d_results) {
    int i = blockIdx.x * blockDim.x + threadIdx.x; // Get thread index

    if (i < directions_size) {
        const int out_len = (histogram_size / 2) + 1;
        size_t offset = static_cast<size_t>(out_len) * i;
        float len_coeff = 2.0f * max_length_A / static_cast<float>(histogram_size);

        // Pick the peak by PROMINENCE above a running-mean background of half-width bg_half:
        // the projected histogram has a broad low-frequency envelope whose magnitude can
        // exceed the true lattice peaks, so a plain argmax|spec| returns a short envelope
        // vector on weak/pink-beam frames. Subtracting the local mean removes that envelope
        // while sharp lattice peaks keep their height (mirrors FFTIndexerCPU).
        double winsum = 0.0;
        int wlo = 0;
        int whi = min(bg_half, out_len - 1);
        for (int k = wlo; k <= whi; ++k)
            winsum += complex_abs(d_output[offset + k]);

        float best_prom = 0.0f;
        FFTResult result{.magnitude = 0.0f, .direction = i, .length = -1};

        for (int j = 0; j < out_len; ++j) {
            const int want_hi = min(j + bg_half, out_len - 1);
            while (whi < want_hi) { ++whi; winsum += complex_abs(d_output[offset + whi]); }
            const int want_lo = max(0, j - bg_half);
            while (wlo < want_lo) { winsum -= complex_abs(d_output[offset + wlo]); ++wlo; }

            const float len = len_coeff * static_cast<float>(j);
            if (len <= min_length_A) continue;

            const float mag = complex_abs(d_output[offset + j]);
            const float bg = static_cast<float>(winsum / static_cast<double>(whi - wlo + 1));
            const float prom = mag - bg;
            if (prom > best_prom) {
                best_prom = prom;
                result.magnitude = prom;
                result.length = len;
            }
        }
        d_results[i] = result; // Store the result
    }
}

__global__ void histogram_kernel(const float *__restrict__ coord_x,
                                 const float *__restrict__ coord_y,
                                 const float *__restrict__ coord_z,
                                 const float *__restrict__ dir_x,
                                 const float *__restrict__ dir_y,
                                 const float *__restrict__ dir_z,
                                 float histogram_spacing,
                                 int histogram_size,
                                 int coord_size,
                                 int direction_vectors_size,
                                 float *__restrict__ output) {
    int direction_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (direction_idx < direction_vectors_size) {
        int base_offset = direction_idx * histogram_size;

        for (int i = 0; i < histogram_size; i++)
            output[base_offset + i] = 0;

        for (int i = 0; i < coord_size; i++) {
            float dot = fabsf(
                dir_x[direction_idx] * coord_x[i] + dir_y[direction_idx] * coord_y[i] + dir_z[direction_idx] * coord_z[
                    i]);
            int64_t bin = static_cast<int64_t>(dot / histogram_spacing);
            if (bin >= 0 && bin < histogram_size)
                output[base_offset + bin] += 1.0;
        }
    }
}

inline void cuda_err(cudaError_t val) {
    if (val != cudaSuccess)
        throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}

inline void cuda_err(cufftResult val) {
    if (val != cufftResult::CUFFT_SUCCESS)
        throw JFJochException(JFJochExceptionCategory::GPUCUDAError, "CuFFT error");
}

FFTIndexerGPU::FFTIndexerGPU(const IndexingSettings &settings)
    : FFTIndexer(settings), result_fft_reg(result_fft) {
    d_input_fft = CudaDevicePtr<float>(input_size);
    d_output_fft = CudaDevicePtr<cufftComplex>(output_size);
    d_result_fft = CudaDevicePtr<FFTResult>(nDirections);

    d_spot_x = CudaDevicePtr<float>(FFT_MAX_SPOTS);
    d_spot_y = CudaDevicePtr<float>(FFT_MAX_SPOTS);
    d_spot_z = CudaDevicePtr<float>(FFT_MAX_SPOTS);

    spot_x = CudaHostPtr<float>(FFT_MAX_SPOTS);
    spot_y = CudaHostPtr<float>(FFT_MAX_SPOTS);
    spot_z = CudaHostPtr<float>(FFT_MAX_SPOTS);

    d_dir_x = CudaDevicePtr<float>(nDirections);
    d_dir_y = CudaDevicePtr<float>(nDirections);
    d_dir_z = CudaDevicePtr<float>(nDirections);

    std::vector<float> dir_x(nDirections);
    std::vector<float> dir_y(nDirections);
    std::vector<float> dir_z(nDirections);

    for (int i = 0; i < nDirections; i++) {
        dir_x[i] = direction_vectors.at(i).x;
        dir_y[i] = direction_vectors.at(i).y;
        dir_z[i] = direction_vectors.at(i).z;
    }

    cudaMemcpy(d_dir_x, dir_x.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_dir_y, dir_y.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_dir_z, dir_z.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);

    int n[1] = {static_cast<int32_t>(histogram_size)}; // Size of the FFT along a single dimension

    plan = CudaFFTPlan(1, n, nullptr, 1, histogram_size, nullptr, 1, histogram_size / 2 + 1, CUFFT_R2C,
                       nDirections);
    cuda_err(cufftSetStream(plan, stream));
}


void FFTIndexerGPU::ExecuteFFT(const std::vector<Coord> &coord, size_t nspots) {
    int l_blockDim = 128;
    int l_gridDim = (direction_vectors.size() + l_blockDim - 1) / l_blockDim;

    for (int i = 0; i < nspots; i++) {
        spot_x[i] = coord[i].x;
        spot_y[i] = coord[i].y;
        spot_z[i] = coord[i].z;
    }

    cudaMemcpyAsync(d_spot_x, spot_x, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_spot_y, spot_y, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_spot_z, spot_z, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);

    histogram_kernel<<<l_gridDim, l_blockDim, 0, stream>>>(d_spot_x, d_spot_y, d_spot_z,
                                                           d_dir_x, d_dir_y, d_dir_z,
                                                           histogram_spacing, histogram_size,
                                                           nspots,
                                                           direction_vectors.size(),
                                                           d_input_fft);

    cuda_err(cufftExecR2C(plan, d_input_fft, d_output_fft));

    // Background half-window ~15 A (length-based, so independent of histogram sizing); see
    // FFTIndexerCPU for the prominence-vs-envelope rationale and the validated optimum.
    const double len_coeff = 2.0 * static_cast<double>(max_length_A) / static_cast<double>(histogram_size);
    const int bg_half = std::max(1, static_cast<int>(std::lround(15.0 / len_coeff)));

    calculate_fft_result<<<l_gridDim, l_blockDim, 0, stream>>>(d_output_fft,
                                                               max_length_A, min_length_A, histogram_size,
                                                               bg_half,
                                                               direction_vectors.size(), d_result_fft);

    cuda_err(cudaMemcpyAsync(result_fft.data(), d_result_fft, direction_vectors.size() * sizeof(FFTResult),
                             cudaMemcpyDeviceToHost, stream));
    cuda_err(cudaStreamSynchronize(stream));
}