0e9f7cc956
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 14m41s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 15m22s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 17m23s
Build Packages / build:rpm (rocky8) (push) Successful in 17m45s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 18m8s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 18m16s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 18m18s
Build Packages / build:rpm (rocky9) (push) Successful in 11m29s
Build Packages / Generate python client (push) Successful in 33s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m21s
Build Packages / Create release (push) Skipped
Build Packages / Build documentation (push) Successful in 56s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 11m42s
Build Packages / XDS test (neggia plugin) (push) Successful in 10m51s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 11m18s
Build Packages / XDS test (durin plugin) (push) Successful in 11m31s
Build Packages / DIALS test (push) Successful in 13m54s
Build Packages / Unit tests (push) Successful in 1h1m24s
208 lines
8.1 KiB
Plaintext
208 lines
8.1 KiB
Plaintext
// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
#include "BraggPredictionGPU.h"
|
|
|
|
#ifdef JFJOCH_USE_CUDA
|
|
#include "../indexing/CUDAMemHelpers.h"
|
|
#include <cuda_runtime.h>
|
|
|
|
namespace {
|
|
__device__ inline bool is_odd(int v) { return (v & 1) != 0; }
|
|
|
|
__device__ inline float angle_from_ewald_sphere_deg(const Coord &S0, float recip_x, float recip_y, float recip_z, float recip_sq) {
|
|
const float epsilon = 1e-5f;
|
|
const float rad_to_deg = 180.0f / static_cast<float>(M_PI);
|
|
|
|
const float s0_sq = S0.x * S0.x + S0.y * S0.y + S0.z * S0.z;
|
|
const float s0_p0 = S0.x * recip_x + S0.y * recip_y + S0.z * recip_z;
|
|
const float val = s0_sq * recip_sq - s0_p0 * s0_p0;
|
|
|
|
if (fabsf(val) < epsilon || s0_sq < epsilon) return NAN;
|
|
|
|
const float a_num = (s0_sq - 0.25f * recip_sq) * recip_sq;
|
|
if (a_num < 0.0f) return NAN;
|
|
|
|
const float A = sqrtf(a_num / val);
|
|
const float B = (A * s0_p0 + 0.5f * recip_sq) / s0_sq;
|
|
|
|
const float p_star_x = A * recip_x - B * S0.x;
|
|
const float p_star_y = A * recip_y - B * S0.y;
|
|
const float p_star_z = A * recip_z - B * S0.z;
|
|
|
|
const float p_star_sq = p_star_x * p_star_x + p_star_y * p_star_y + p_star_z * p_star_z;
|
|
const float denom = sqrtf(p_star_sq * recip_sq);
|
|
if (denom < epsilon) return NAN;
|
|
|
|
float c = (p_star_x * recip_x + p_star_y * recip_y + p_star_z * recip_z) / denom;
|
|
c = fmaxf(-1.0f, fminf(1.0f, c));
|
|
|
|
return acosf(c) * rad_to_deg;
|
|
}
|
|
|
|
__device__ inline bool compute_reflection(const KernelConsts &C, int h, int k, int l, Reflection &out) {
|
|
if (h == 0 && k == 0 && l == 0)
|
|
return false;
|
|
|
|
// Systematic absences (centering only)
|
|
// P, I, A, B, C, F supported
|
|
switch (C.centering) {
|
|
case 'I':
|
|
if (is_odd(h + k + l))
|
|
return false;
|
|
break;
|
|
case 'A':
|
|
if (is_odd(k + l))
|
|
return false;
|
|
break;
|
|
case 'B':
|
|
if (is_odd(h + l))
|
|
return false;
|
|
break;
|
|
case 'C':
|
|
if (is_odd(h + k))
|
|
return false;
|
|
break;
|
|
case 'F':
|
|
if ((is_odd(h + k)) || (is_odd(h + l)) || (is_odd(k + l)))
|
|
return false;
|
|
break;
|
|
case 'R': {
|
|
// Rhombohedral in hexagonal setting (hR, a_h=b_h, gamma=120°):
|
|
// Condition: -h + k + l = 3n
|
|
int mod = (-h + k + l) % 3;
|
|
if (mod < 0) mod += 3;
|
|
if (mod != 0) return false;
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
float Ah_x = C.Astar.x * h;
|
|
float Ah_y = C.Astar.y * h;
|
|
float Ah_z = C.Astar.z * h;
|
|
float AhBk_x = Ah_x + C.Bstar.x * k;
|
|
float AhBk_y = Ah_y + C.Bstar.y * k;
|
|
float AhBk_z = Ah_z + C.Bstar.z * k;
|
|
float recip_x = AhBk_x + C.Cstar.x * l;
|
|
float recip_y = AhBk_y + C.Cstar.y * l;
|
|
float recip_z = AhBk_z + C.Cstar.z * l;
|
|
float recip_sq = recip_x * recip_x + recip_y * recip_y + recip_z * recip_z;
|
|
if (recip_sq > C.one_over_dmax_sq) return false;
|
|
float Sx = recip_x + C.S0.x;
|
|
float Sy = recip_y + C.S0.y;
|
|
float Sz = recip_z + C.S0.z;
|
|
float S_len = sqrtf(Sx * Sx + Sy * Sy + Sz * Sz);
|
|
float dist_ewald = fabsf(S_len - C.one_over_wavelength);
|
|
if (dist_ewald > C.ewald_cutoff) return false;
|
|
float Srx = C.rot[0] * Sx + C.rot[1] * Sy + C.rot[2] * Sz;
|
|
float Sry = C.rot[3] * Sx + C.rot[4] * Sy + C.rot[5] * Sz;
|
|
float Srz = C.rot[6] * Sx + C.rot[7] * Sy + C.rot[8] * Sz;
|
|
if (Srz <= 0.0f) return false;
|
|
float coeff = C.coeff_const / Srz;
|
|
float x = C.beam_x + Srx * coeff;
|
|
float y = C.beam_y + Sry * coeff;
|
|
if (x < 0.0f || x >= C.det_width_pxl || y < 0.0f || y >= C.det_height_pxl) return false;
|
|
out.h = h;
|
|
out.k = k;
|
|
out.l = l;
|
|
out.delta_phi_deg = angle_from_ewald_sphere_deg(C.S0, recip_x, recip_y, recip_z, recip_sq);
|
|
out.predicted_x = x;
|
|
out.predicted_y = y;
|
|
out.d = 1.0f / sqrtf(recip_sq);
|
|
out.dist_ewald = dist_ewald;
|
|
out.rlp = 1.0f;
|
|
out.partiality = 1.0f;
|
|
out.zeta = 1.0f;
|
|
out.image_scale_corr = 1.0f;
|
|
return true;
|
|
}
|
|
|
|
__global__ void bragg_kernel_3d(const KernelConsts *__restrict__ kc,
|
|
int max_hkl,
|
|
int max_reflections,
|
|
Reflection *__restrict__ out,
|
|
int *__restrict__ counter) {
|
|
int range = 2 * max_hkl + 1;
|
|
int hi = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int ki = blockIdx.y * blockDim.y + threadIdx.y;
|
|
int li = blockIdx.z * blockDim.z + threadIdx.z;
|
|
if (hi >= range || ki >= range || li >= range) return;
|
|
int h = hi - max_hkl;
|
|
int k = ki - max_hkl;
|
|
int l = li - max_hkl;
|
|
Reflection r{};
|
|
if (!compute_reflection(*kc, h, k, l, r)) return;
|
|
int pos = atomicAdd(counter, 1);
|
|
if (pos < max_reflections) out[pos] = r;
|
|
else atomicSub(counter, 1);
|
|
}
|
|
|
|
inline KernelConsts BuildKernelConsts(const DiffractionExperiment &experiment,
|
|
const CrystalLattice &lattice,
|
|
float high_res_A,
|
|
float ewald_dist_cutoff,
|
|
char centering) {
|
|
KernelConsts kc{};
|
|
auto geom = experiment.GetDiffractionGeometry();
|
|
kc.det_width_pxl = static_cast<float>(experiment.GetXPixelsNum());
|
|
kc.det_height_pxl = static_cast<float>(experiment.GetYPixelsNum());
|
|
kc.beam_x = geom.GetBeamX_pxl();
|
|
kc.beam_y = geom.GetBeamY_pxl();
|
|
kc.coeff_const = geom.GetDetectorDistance_mm() / geom.GetPixelSize_mm();
|
|
float one_over_dmax = 1.0f / high_res_A;
|
|
kc.one_over_dmax_sq = one_over_dmax * one_over_dmax;
|
|
kc.one_over_wavelength = 1.0f / geom.GetWavelength_A();
|
|
kc.ewald_cutoff = ewald_dist_cutoff;
|
|
kc.Astar = lattice.Astar();
|
|
kc.Bstar = lattice.Bstar();
|
|
kc.Cstar = lattice.Cstar();
|
|
kc.S0 = geom.GetScatteringVector();
|
|
kc.centering = centering;
|
|
auto rotT = geom.GetPoniRotMatrix().transpose().arr();
|
|
for (int i = 0; i < 9; ++i) kc.rot[i] = rotT[i];
|
|
return kc;
|
|
}
|
|
} // namespace
|
|
|
|
BraggPredictionGPU::BraggPredictionGPU(int max_reflections)
|
|
: BraggPrediction(max_reflections),
|
|
reg_out(reflections), d_out(max_reflections),
|
|
dK(1), d_count(1), h_count(1) {
|
|
}
|
|
|
|
int BraggPredictionGPU::Calc(const DiffractionExperiment &experiment,
|
|
const CrystalLattice &lattice,
|
|
const BraggPredictionSettings &settings) {
|
|
// Build constants on host
|
|
KernelConsts hK = BuildKernelConsts(experiment, lattice, settings.high_res_A, settings.ewald_dist_cutoff, settings.centering);
|
|
cudaMemcpyAsync(dK, &hK, sizeof(KernelConsts), cudaMemcpyHostToDevice, stream);
|
|
cudaMemsetAsync(d_count, 0, sizeof(int), stream);
|
|
|
|
// Configure and launch on the stream
|
|
const int range = 2 * settings.max_hkl;
|
|
dim3 block(8, 8, 8);
|
|
dim3 grid((range + block.x - 1) / block.x,
|
|
(range + block.y - 1) / block.y,
|
|
(range + block.z - 1) / block.z);
|
|
|
|
bragg_kernel_3d<<<grid, block, 0, stream>>>(dK, settings.max_hkl, max_reflections, d_out, d_count);
|
|
|
|
// Async D2H count and synchronize
|
|
cudaMemcpyAsync(h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost, stream);
|
|
cudaStreamSynchronize(stream);
|
|
|
|
int count = *h_count.get();
|
|
if (count > max_reflections) count = max_reflections;
|
|
if (count == 0)
|
|
return {};
|
|
|
|
cudaMemcpyAsync(reflections.data(), d_out, sizeof(Reflection) * count, cudaMemcpyDeviceToHost, stream);
|
|
cudaStreamSynchronize(stream);
|
|
|
|
return count;
|
|
}
|
|
|
|
#endif
|