// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute
// SPDX-License-Identifier: GPL-3.0-only

#include "BraggPredictionGPU.h"

#ifdef JFJOCH_USE_CUDA

#include "../indexing/CUDAMemHelpers.h"
#include <cuda_runtime.h>

namespace {
    __device__ inline bool is_odd(int v) { return (v & 1) != 0; }

    __device__ inline bool compute_reflection(const KernelConsts &C, int h, int k, int l, Reflection &out) {
        if (h == 0 && k == 0 && l == 0)
            return false;

        // Systematic absences (lattice centering only); P, I, A, B, C, F and R (hexagonal setting) supported
        switch (C.centering) {
            case 'I':
                if (is_odd(h + k + l)) return false;
                break;
            case 'A':
                if (is_odd(k + l)) return false;
                break;
            case 'B':
                if (is_odd(h + l)) return false;
                break;
            case 'C':
                if (is_odd(h + k)) return false;
                break;
            case 'F':
                if (is_odd(h + k) || is_odd(h + l) || is_odd(k + l)) return false;
                break;
            case 'R': {
                // Rhombohedral in the hexagonal setting (hR, a_h = b_h, gamma = 120°):
                // reflection condition -h + k + l = 3n
                int mod = (-h + k + l) % 3;
                if (mod < 0) mod += 3;
                if (mod != 0) return false;
                break;
            }
            default:
                break;
        }

        // Reciprocal-lattice vector r = h*a* + k*b* + l*c*
        float Ah_x = C.Astar.x * h;
        float Ah_y = C.Astar.y * h;
        float Ah_z = C.Astar.z * h;

        float AhBk_x = Ah_x + C.Bstar.x * k;
        float AhBk_y = Ah_y + C.Bstar.y * k;
        float AhBk_z = Ah_z + C.Bstar.z * k;

        float rx = AhBk_x + C.Cstar.x * l;
        float ry = AhBk_y + C.Cstar.y * l;
        float rz = AhBk_z + C.Cstar.z * l;

        // Resolution cutoff: |r|^2 must not exceed (1/d_max)^2
        float dot = rx * rx + ry * ry + rz * rz;
        if (dot > C.one_over_dmax_sq)
            return false;

        // Distance of S = r + S0 from the Ewald sphere of radius 1/lambda
        float Sx = rx + C.S0.x;
        float Sy = ry + C.S0.y;
        float Sz = rz + C.S0.z;

        float S_len = sqrtf(Sx * Sx + Sy * Sy + Sz * Sz);
        float dist_ewald = fabsf(S_len - C.one_over_wavelength);
        if (dist_ewald > C.ewald_cutoff)
            return false;

        // Rotate into the detector frame; rays pointing away from the detector (Srz <= 0) are rejected
        float Srx = C.rot[0] * Sx + C.rot[1] * Sy + C.rot[2] * Sz;
        float Sry = C.rot[3] * Sx + C.rot[4] * Sy + C.rot[5] * Sz;
        float Srz = C.rot[6] * Sx + C.rot[7] * Sy + C.rot[8] * Sz;

        if (Srz <= 0.0f)
            return false;

        // Project onto the detector plane (pixel units) and check that the spot lands on the detector
        float coeff = C.coeff_const / Srz;
        float x = C.beam_x + Srx * coeff;
        float y = C.beam_y + Sry * coeff;

        if (x < 0.0f || x >= C.det_width_pxl || y < 0.0f || y >= C.det_height_pxl)
            return false;

        out.h = h;
        out.k = k;
        out.l = l;
        out.predicted_x = x;
        out.predicted_y = y;
        out.d = 1.0f / sqrtf(dot);
        out.dist_ewald = dist_ewald;
        return true;
    }

    __global__ void bragg_kernel_3d(const KernelConsts *__restrict__ kc,
                                    int max_hkl,
                                    int max_reflections,
                                    Reflection *__restrict__ out,
                                    int *__restrict__ counter) {
        int range = 2 * max_hkl;

        // Map the 3D thread index to Miller indices in [-max_hkl, max_hkl)
        int hi = blockIdx.x * blockDim.x + threadIdx.x;
        int ki = blockIdx.y * blockDim.y + threadIdx.y;
        int li = blockIdx.z * blockDim.z + threadIdx.z;

        if (hi >= range || ki >= range || li >= range)
            return;

        int h = hi - max_hkl;
        int k = ki - max_hkl;
        int l = li - max_hkl;

        Reflection r{};
        if (!compute_reflection(*kc, h, k, l, r))
            return;

        // Capacity-clamped append: reserve a slot, give it back if the output buffer is full
        int pos = atomicAdd(counter, 1);
        if (pos < max_reflections)
            out[pos] = r;
        else
            atomicSub(counter, 1);
    }

    inline KernelConsts BuildKernelConsts(const DiffractionExperiment &experiment,
                                          const CrystalLattice &lattice,
                                          float high_res_A,
                                          float ewald_dist_cutoff,
                                          char centering) {
        KernelConsts kc{};

        auto geom = experiment.GetDiffractionGeometry();

        kc.det_width_pxl = static_cast<float>(experiment.GetXPixelsNum());
        kc.det_height_pxl = static_cast<float>(experiment.GetYPixelsNum());
        kc.beam_x = geom.GetBeamX_pxl();
        kc.beam_y = geom.GetBeamY_pxl();
        kc.coeff_const = geom.GetDetectorDistance_mm() / geom.GetPixelSize_mm();

        float one_over_dmax = 1.0f / high_res_A;
        kc.one_over_dmax_sq = one_over_dmax * one_over_dmax;
        kc.one_over_wavelength = 1.0f / geom.GetWavelength_A();
        kc.ewald_cutoff = ewald_dist_cutoff;

        kc.Astar = lattice.Astar();
        kc.Bstar = lattice.Bstar();
        kc.Cstar = lattice.Cstar();
        kc.S0 = geom.GetScatteringVector();
        kc.centering = centering;

        auto rotT = geom.GetPoniRotMatrix().transpose().arr();
        for (int i = 0; i < 9; ++i)
            kc.rot[i] = rotT[i];

        return kc;
    }
} // namespace

BraggPredictionGPU::BraggPredictionGPU(int max_reflections)
        : BraggPrediction(max_reflections),
          reg_out(reflections),
          d_out(max_reflections),
          dK(1),
          d_count(1),
          h_count(1) {}

int BraggPredictionGPU::Calc(const DiffractionExperiment &experiment,
                             const CrystalLattice &lattice,
                             const BraggPredictionSettings &settings) {
    // Build constants on host and upload them together with a zeroed counter
    KernelConsts hK = BuildKernelConsts(experiment, lattice, settings.high_res_A,
                                        settings.ewald_dist_cutoff, settings.centering);

    cudaMemcpyAsync(dK, &hK, sizeof(KernelConsts), cudaMemcpyHostToDevice, stream);
    cudaMemsetAsync(d_count, 0, sizeof(int), stream);

    // Configure and launch on the stream
    const int range = 2 * settings.max_hkl;
    dim3 block(8, 8, 8);
    dim3 grid((range + block.x - 1) / block.x,
              (range + block.y - 1) / block.y,
              (range + block.z - 1) / block.z);

    bragg_kernel_3d<<<grid, block, 0, stream>>>(dK, settings.max_hkl, max_reflections, d_out, d_count);

    // Async D2H copy of the count and synchronize
    cudaMemcpyAsync(h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    int count = *h_count.get();
    if (count > max_reflections)
        count = max_reflections;
    if (count == 0)
        return 0;

    // Copy back only the accepted reflections
    cudaMemcpyAsync(reflections.data(), d_out, sizeof(Reflection) * count, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    return count;
}

#endif
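
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file's build, hence the `#if 0` guard):
// a minimal, standalone restatement of the capacity-clamped atomicAdd append
// pattern used by bragg_kernel_3d above, so the buffer-overflow handling can be
// read in isolation. All names below (append_even, capacity, counter) are
// hypothetical and chosen for this sketch only; compile it as its own
// translation unit, e.g. `nvcc append_sketch.cu`, if you want to run it.
// ---------------------------------------------------------------------------
#if 0
#include <cuda_runtime.h>
#include <cstdio>

__global__ void append_even(const int *in, int n, int *out, int capacity, int *counter) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    if (in[i] % 2 != 0) return;        // stand-in for the compute_reflection() acceptance test
    int pos = atomicAdd(counter, 1);   // reserve an output slot
    if (pos < capacity)
        out[pos] = in[i];              // slot is valid: store the result
    else
        atomicSub(counter, 1);         // buffer full: give the slot back
}

int main() {
    const int n = 1 << 20, capacity = 1024;
    int *in, *out, *counter;
    cudaMallocManaged(&in, n * sizeof(int));
    cudaMallocManaged(&out, capacity * sizeof(int));
    cudaMallocManaged(&counter, sizeof(int));
    for (int i = 0; i < n; ++i) in[i] = i;
    *counter = 0;

    append_even<<<(n + 255) / 256, 256>>>(in, n, out, capacity, counter);
    cudaDeviceSynchronize();

    int count = *counter;
    if (count > capacity) count = capacity;   // host-side clamp, mirroring Calc() above
    std::printf("kept %d of %d candidates\n", count, n);

    cudaFree(in);
    cudaFree(out);
    cudaFree(counter);
    return 0;
}
#endif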