Files
Jungfraujoch/image_analysis/indexing/FFTIndexerGPU.cu
2025-09-21 19:27:51 +02:00

153 lines
6.2 KiB
Plaintext

// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
#include "FFTIndexerGPU.h"
#include <cufft.h>
__device__ __host__ inline float complex_abs(const cufftComplex &z) {
return sqrtf(z.x * z.x + z.y * z.y);
}
__global__ void calculate_fft_result(
const cufftComplex *__restrict__ d_output,
const float max_length_A,
const float min_length_A,
const int histogram_size,
const int directions_size,
FFTResult *d_results) {
int i = blockIdx.x * blockDim.x + threadIdx.x; // Get thread index
if (i < directions_size) {
// Ensure within bounds
size_t offset = ((histogram_size / 2) + 1) * i;
float max_magnitude = 0.0f;
FFTResult result{.direction = i, .length = -1};
float len_coeff = 2.0f * max_length_A / static_cast<float>(histogram_size);
for (int j = 0; j < (histogram_size / 2) + 1; j++) {
float mag = complex_abs(d_output[offset + j]);
float len = len_coeff * static_cast<float>(j);
if (len > min_length_A && mag > max_magnitude) {
max_magnitude = mag;
result.magnitude = mag;
result.length = len;
}
}
d_results[i] = result; // Store the result
}
}
__global__ void histogram_kernel(const float *__restrict__ coord_x,
const float *__restrict__ coord_y,
const float *__restrict__ coord_z,
const float *__restrict__ dir_x,
const float *__restrict__ dir_y,
const float *__restrict__ dir_z,
float histogram_spacing,
int histogram_size,
int coord_size,
int direction_vectors_size,
float *__restrict__ output) {
int direction_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (direction_idx < direction_vectors_size) {
int base_offset = direction_idx * histogram_size;
for (int i = 0; i < histogram_size; i++)
output[base_offset + i] = 0;
for (int i = 0; i < coord_size; i++) {
float dot = fabsf(
dir_x[direction_idx] * coord_x[i] + dir_y[direction_idx] * coord_y[i] + dir_z[direction_idx] * coord_z[
i]);
int64_t bin = static_cast<int64_t>(dot / histogram_spacing);
if (bin >= 0 && bin < histogram_size)
output[base_offset + bin] += 1.0;
}
}
}
inline void cuda_err(cudaError_t val) {
if (val != cudaSuccess)
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}
inline void cuda_err(cufftResult val) {
if (val != cufftResult::CUFFT_SUCCESS)
throw JFJochException(JFJochExceptionCategory::GPUCUDAError, "CuFFT error");
}
FFTIndexerGPU::FFTIndexerGPU(const IndexingSettings &settings)
: FFTIndexer(settings), result_fft_reg(result_fft) {
d_input_fft = CudaDevicePtr<float>(input_size);
d_output_fft = CudaDevicePtr<cufftComplex>(output_size);
d_result_fft = CudaDevicePtr<FFTResult>(nDirections);
d_spot_x = CudaDevicePtr<float>(FFT_MAX_SPOTS);
d_spot_y = CudaDevicePtr<float>(FFT_MAX_SPOTS);
d_spot_z = CudaDevicePtr<float>(FFT_MAX_SPOTS);
spot_x = CudaHostPtr<float>(FFT_MAX_SPOTS);
spot_y = CudaHostPtr<float>(FFT_MAX_SPOTS);
spot_z = CudaHostPtr<float>(FFT_MAX_SPOTS);
d_dir_x = CudaDevicePtr<float>(nDirections);
d_dir_y = CudaDevicePtr<float>(nDirections);
d_dir_z = CudaDevicePtr<float>(nDirections);
std::vector<float> dir_x(nDirections);
std::vector<float> dir_y(nDirections);
std::vector<float> dir_z(nDirections);
for (int i = 0; i < nDirections; i++) {
dir_x[i] = direction_vectors.at(i).x;
dir_y[i] = direction_vectors.at(i).y;
dir_z[i] = direction_vectors.at(i).z;
}
cudaMemcpy(d_dir_x, dir_x.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_dir_y, dir_y.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_dir_z, dir_z.data(), nDirections * sizeof(float), cudaMemcpyHostToDevice);
int n[1] = {static_cast<int32_t>(histogram_size)}; // Size of the FFT along a single dimension
plan = CudaFFTPlan(1, n, nullptr, 1, histogram_size, nullptr, 1, histogram_size / 2 + 1, CUFFT_R2C,
nDirections);
cuda_err(cufftSetStream(plan, stream));
}
void FFTIndexerGPU::ExecuteFFT(const std::vector<Coord> &coord, size_t nspots) {
int l_blockDim = 128;
int l_gridDim = (direction_vectors.size() + l_blockDim - 1) / l_blockDim;
for (int i = 0; i < nspots; i++) {
spot_x[i] = coord[i].x;
spot_y[i] = coord[i].y;
spot_z[i] = coord[i].z;
}
cudaMemcpyAsync(d_spot_x, spot_x, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_spot_y, spot_y, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
cudaMemcpyAsync(d_spot_z, spot_z, nspots * sizeof(float), cudaMemcpyHostToDevice, stream);
histogram_kernel<<<l_gridDim, l_blockDim, 0, stream>>>(d_spot_x, d_spot_y, d_spot_z,
d_dir_x, d_dir_y, d_dir_z,
histogram_spacing, histogram_size,
nspots,
direction_vectors.size(),
d_input_fft);
cuda_err(cufftExecR2C(plan, d_input_fft, d_output_fft));
calculate_fft_result<<<l_gridDim, l_blockDim, 0, stream>>>(d_output_fft,
max_length_A, min_length_A, histogram_size,
direction_vectors.size(), d_result_fft);
cuda_err(cudaMemcpyAsync(result_fft.data(), d_result_fft, direction_vectors.size() * sizeof(FFTResult),
cudaMemcpyDeviceToHost, stream));
cuda_err(cudaStreamSynchronize(stream));
}