0e9f7cc956
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 14m41s
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 15m22s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 17m23s
Build Packages / build:rpm (rocky8) (push) Successful in 17m45s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 18m8s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 18m16s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 18m18s
Build Packages / build:rpm (rocky9) (push) Successful in 11m29s
Build Packages / Generate python client (push) Successful in 33s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 11m21s
Build Packages / Create release (push) Skipped
Build Packages / Build documentation (push) Successful in 56s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 11m42s
Build Packages / XDS test (neggia plugin) (push) Successful in 10m51s
Build Packages / XDS test (JFJoch plugin) (push) Successful in 11m18s
Build Packages / XDS test (durin plugin) (push) Successful in 11m31s
Build Packages / DIALS test (push) Successful in 13m54s
Build Packages / Unit tests (push) Successful in 1h1m24s
295 lines
11 KiB
Plaintext
295 lines
11 KiB
Plaintext
// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
|
|
// SPDX-License-Identifier: GPL-3.0-only
|
|
|
|
#include "BraggPredictionRotGPU.h"
|
|
|
|
#ifdef JFJOCH_USE_CUDA
|
|
#include "../indexing/CUDAMemHelpers.h"
|
|
#include <cuda_runtime.h>
|
|
#include <cmath>
|
|
|
|
namespace {
|
|
__host__ __device__ inline bool is_odd(int v) { return (v & 1) != 0; }
|
|
|
|
__host__ __device__ inline void cross3(float ax, float ay, float az,
|
|
float bx, float by, float bz,
|
|
float &cx, float &cy, float &cz) {
|
|
cx = ay * bz - az * by;
|
|
cy = az * bx - ax * bz;
|
|
cz = ax * by - ay * bx;
|
|
}
|
|
|
|
__host__ __device__ inline float dot3(float ax, float ay, float az,
|
|
float bx, float by, float bz) {
|
|
return ax * bx + ay * by + az * bz;
|
|
}
|
|
|
|
__host__ __device__ inline void normalize3(float &x, float &y, float &z) {
|
|
float len = sqrtf(x * x + y * y + z * z);
|
|
if (len < 1e-12f) { x = 0.0f; y = 0.0f; z = 0.0f; return; }
|
|
float inv = 1.0f / len;
|
|
x *= inv; y *= inv; z *= inv;
|
|
}
|
|
|
|
__device__ inline int compute_reflections_rot(const KernelConstsRot &C, int h, int k, int l, Reflection out[2]) {
|
|
if (h == 0 && k == 0 && l == 0)
|
|
return 0;
|
|
|
|
switch (C.centering) {
|
|
case 'I':
|
|
if (is_odd(h + k + l)) return false;
|
|
break;
|
|
case 'A':
|
|
if (is_odd(k + l)) return false;
|
|
break;
|
|
case 'B':
|
|
if (is_odd(h + l)) return false;
|
|
break;
|
|
case 'C':
|
|
if (is_odd(h + k)) return false;
|
|
break;
|
|
case 'F':
|
|
if (is_odd(h + k) || is_odd(h + l) || is_odd(k + l)) return false;
|
|
break;
|
|
case 'R': {
|
|
int mod = (-h + k + l) % 3;
|
|
if (mod < 0) mod += 3;
|
|
if (mod != 0) return false;
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
// p0 = A* h + B* k + C* l
|
|
float p0x = C.Astar.x * h + C.Bstar.x * k + C.Cstar.x * l;
|
|
float p0y = C.Astar.y * h + C.Bstar.y * k + C.Cstar.y * l;
|
|
float p0z = C.Astar.z * h + C.Bstar.z * k + C.Cstar.z * l;
|
|
|
|
float p0_sq = p0x * p0x + p0y * p0y + p0z * p0z;
|
|
if (p0_sq <= 0.0f || p0_sq > C.one_over_dmax_sq)
|
|
return 0;
|
|
|
|
float p0_m1 = p0x * C.m1.x + p0y * C.m1.y + p0z * C.m1.z;
|
|
float p0_m2 = p0x * C.m2.x + p0y * C.m2.y + p0z * C.m2.z;
|
|
float p0_m3 = p0x * C.m3.x + p0y * C.m3.y + p0z * C.m3.z;
|
|
|
|
float rho_sq = p0_sq - (p0_m2 * p0_m2);
|
|
float p_m3 = (-p0_sq / 2.0f - p0_m2 * C.m2_S0) / C.m3_S0;
|
|
float p_m2 = p0_m2;
|
|
|
|
if (rho_sq < p_m3 * p_m3) return 0;
|
|
if (p0_sq > 4.0f * dot3(C.S0.x, C.S0.y, C.S0.z, C.S0.x, C.S0.y, C.S0.z)) return 0;
|
|
|
|
float p_m1_pos = sqrtf(rho_sq - p_m3 * p_m3);
|
|
float p_m1_arr[2] = {p_m1_pos, -p_m1_pos};
|
|
|
|
int count = 0;
|
|
for (int idx = 0; idx < 2; ++idx) {
|
|
float p_m1 = p_m1_arr[idx];
|
|
|
|
float cosphi = (p_m1 * p0_m1 + p_m3 * p0_m3) / rho_sq;
|
|
float sinphi = (p_m1 * p0_m3 - p_m3 * p0_m1) / rho_sq;
|
|
|
|
float px = C.m1.x * p_m1 + C.m2.x * p_m2 + C.m3.x * p_m3;
|
|
float py = C.m1.y * p_m1 + C.m2.y * p_m2 + C.m3.y * p_m3;
|
|
float pz = C.m1.z * p_m1 + C.m2.z * p_m2 + C.m3.z * p_m3;
|
|
|
|
float Sx = C.S0.x + px;
|
|
float Sy = C.S0.y + py;
|
|
float Sz = C.S0.z + pz;
|
|
|
|
float phi = -1.0f * atan2f(sinphi, cosphi);
|
|
|
|
// e1 = normalize(S x S0) - direction perpendicular to both S and S0
|
|
float e1x, e1y, e1z;
|
|
cross3(Sx, Sy, Sz, C.S0.x, C.S0.y, C.S0.z, e1x, e1y, e1z);
|
|
normalize3(e1x, e1y, e1z);
|
|
|
|
// zeta = |m2 · e1| - the "lorentz-like" geometric factor for partiality
|
|
float zeta_abs = fabsf(dot3(C.m2.x, C.m2.y, C.m2.z, e1x, e1y, e1z));
|
|
|
|
// Check min_zeta threshold (consistent with CPU)
|
|
if (zeta_abs < C.min_zeta)
|
|
continue;
|
|
|
|
// epsilon3 cutoff check (consistent with CPU, Kabsch formulation)
|
|
float epsilon3 = fabsf(phi * zeta_abs);
|
|
if (epsilon3 > C.mosaicity_multiplier * C.mos_angle_rad)
|
|
continue;
|
|
|
|
float cx, cy, cz;
|
|
cross3(Sx, Sy, Sz, C.S0.x, C.S0.y, C.S0.z, cx, cy, cz);
|
|
float S_dot_S0 = dot3(Sx, Sy, Sz, C.S0.x, C.S0.y, C.S0.z);
|
|
float lorentz = fabsf(dot3(C.m2.x, C.m2.y, C.m2.z, cx, cy, cz)) / S_dot_S0;
|
|
|
|
// Partiality calculation (Kabsch formulation)
|
|
// c1 = sqrt(2) * sigma / zeta, where sigma = mosaicity
|
|
float c1 = zeta_abs / (sqrtf(2.0f) * C.mos_angle_rad);
|
|
float half_wedge = C.wedge_angle_rad / 2.0f;
|
|
float partiality = (erff((phi + half_wedge) * c1)
|
|
- erff((phi - half_wedge) * c1)) / 2.0f;
|
|
|
|
|
|
// Use S (rotated) for projection
|
|
float Srx = C.rot[0] * Sx + C.rot[1] * Sy + C.rot[2] * Sz;
|
|
float Sry = C.rot[3] * Sx + C.rot[4] * Sy + C.rot[5] * Sz;
|
|
float Srz = C.rot[6] * Sx + C.rot[7] * Sy + C.rot[8] * Sz;
|
|
|
|
if (Srz <= 0.0f) continue;
|
|
|
|
float coeff = C.coeff_const / Srz;
|
|
float x = C.beam_x + Srx * coeff;
|
|
float y = C.beam_y + Sry * coeff;
|
|
|
|
if (x < 0.0f || x >= C.det_width_pxl || y < 0.0f || y >= C.det_height_pxl)
|
|
continue;
|
|
|
|
float dist_ewald = fabsf(sqrtf(Sx * Sx + Sy * Sy + Sz * Sz) - C.one_over_wavelength);
|
|
|
|
out[count].h = h;
|
|
out[count].k = k;
|
|
out[count].l = l;
|
|
out[count].delta_phi_deg = phi * 180.0 / M_PI;
|
|
out[count].predicted_x = x;
|
|
out[count].predicted_y = y;
|
|
out[count].d = 1.0f / sqrtf(p0_sq);
|
|
out[count].dist_ewald = dist_ewald;
|
|
out[count].rlp = lorentz;
|
|
out[count].partiality = partiality;
|
|
out[count].zeta = zeta_abs;
|
|
out[count].image_scale_corr = lorentz / partiality;
|
|
count++;
|
|
}
|
|
return count;
|
|
}
|
|
|
|
__global__ void bragg_rot_kernel_3d(const KernelConstsRot *__restrict__ kc,
|
|
int max_hkl,
|
|
int max_reflections,
|
|
Reflection *__restrict__ out,
|
|
int *__restrict__ counter) {
|
|
int range = 2 * max_hkl + 1;
|
|
int hi = blockIdx.x * blockDim.x + threadIdx.x;
|
|
int ki = blockIdx.y * blockDim.y + threadIdx.y;
|
|
int li = blockIdx.z * blockDim.z + threadIdx.z;
|
|
if (hi >= range || ki >= range || li >= range) return;
|
|
int h = hi - max_hkl;
|
|
int k = ki - max_hkl;
|
|
int l = li - max_hkl;
|
|
|
|
Reflection r[2];
|
|
int n = compute_reflections_rot(*kc, h, k, l, r);
|
|
|
|
for (int i = 0; i < n; ++i) {
|
|
int pos = atomicAdd(counter, 1);
|
|
if (pos < max_reflections)
|
|
out[pos] = r[i];
|
|
else
|
|
atomicSub(counter, 1);
|
|
}
|
|
}
|
|
|
|
inline KernelConstsRot BuildKernelConstsRot(const DiffractionExperiment &experiment,
|
|
const CrystalLattice &lattice,
|
|
const BraggPredictionSettings &settings) {
|
|
KernelConstsRot kc{};
|
|
auto geom = experiment.GetDiffractionGeometry();
|
|
|
|
kc.det_width_pxl = static_cast<float>(experiment.GetXPixelsNum());
|
|
kc.det_height_pxl = static_cast<float>(experiment.GetYPixelsNum());
|
|
kc.beam_x = geom.GetBeamX_pxl();
|
|
kc.beam_y = geom.GetBeamY_pxl();
|
|
kc.coeff_const = geom.GetDetectorDistance_mm() / geom.GetPixelSize_mm();
|
|
|
|
float one_over_dmax = 1.0f / settings.high_res_A;
|
|
kc.one_over_dmax_sq = one_over_dmax * one_over_dmax;
|
|
kc.one_over_wavelength = 1.0f / geom.GetWavelength_A();
|
|
|
|
// Store mosaicity and wedge in radians for partiality calculation
|
|
kc.mos_angle_rad = settings.mosaicity_deg * static_cast<float>(M_PI) / 180.0f;
|
|
kc.wedge_angle_rad = settings.wedge_deg * static_cast<float>(M_PI) / 180.0f;
|
|
kc.min_zeta = settings.min_zeta;
|
|
kc.mosaicity_multiplier = settings.mosaicity_multiplier;
|
|
|
|
kc.Astar = lattice.Astar();
|
|
kc.Bstar = lattice.Bstar();
|
|
kc.Cstar = lattice.Cstar();
|
|
kc.S0 = geom.GetScatteringVector();
|
|
|
|
auto rotT = geom.GetPoniRotMatrix().transpose().arr();
|
|
for (int i = 0; i < 9; ++i) kc.rot[i] = rotT[i];
|
|
|
|
kc.centering = settings.centering;
|
|
return kc;
|
|
}
|
|
|
|
inline void BuildGoniometerBasis(const DiffractionExperiment &experiment, KernelConstsRot &kc) {
|
|
const auto gon_opt = experiment.GetGoniometer();
|
|
if (!gon_opt.has_value())
|
|
throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
|
|
"BraggPredictionRotationGPU requires a goniometer axis");
|
|
const GoniometerAxis &gon = *gon_opt;
|
|
|
|
// m2 = normalize(axis)
|
|
float m2x = gon.GetAxis().x;
|
|
float m2y = gon.GetAxis().y;
|
|
float m2z = gon.GetAxis().z;
|
|
normalize3(m2x, m2y, m2z);
|
|
|
|
// m1 = normalize(m2 x S0)
|
|
float m1x, m1y, m1z;
|
|
cross3(m2x, m2y, m2z, kc.S0.x, kc.S0.y, kc.S0.z, m1x, m1y, m1z);
|
|
normalize3(m1x, m1y, m1z);
|
|
|
|
// m3 = normalize(m1 x m2)
|
|
float m3x, m3y, m3z;
|
|
cross3(m1x, m1y, m1z, m2x, m2y, m2z, m3x, m3y, m3z);
|
|
normalize3(m3x, m3y, m3z);
|
|
|
|
kc.m1 = Coord(m1x, m1y, m1z);
|
|
kc.m2 = Coord(m2x, m2y, m2z);
|
|
kc.m3 = Coord(m3x, m3y, m3z);
|
|
kc.m2_S0 = dot3(m2x, m2y, m2z, kc.S0.x, kc.S0.y, kc.S0.z);
|
|
kc.m3_S0 = dot3(m3x, m3y, m3z, kc.S0.x, kc.S0.y, kc.S0.z);
|
|
}
|
|
} // namespace
|
|
|
|
BraggPredictionRotGPU::BraggPredictionRotGPU(int max_reflections)
|
|
: BraggPrediction(max_reflections),
|
|
reg_out(reflections), d_out(max_reflections),
|
|
dK(1), d_count(1), h_count(1) {
|
|
}
|
|
|
|
int BraggPredictionRotGPU::Calc(const DiffractionExperiment &experiment,
|
|
const CrystalLattice &lattice,
|
|
const BraggPredictionSettings &settings) {
|
|
KernelConstsRot hK = BuildKernelConstsRot(experiment, lattice, settings);
|
|
BuildGoniometerBasis(experiment, hK);
|
|
|
|
cudaMemcpyAsync(dK, &hK, sizeof(KernelConstsRot), cudaMemcpyHostToDevice, stream);
|
|
cudaMemsetAsync(d_count, 0, sizeof(int), stream);
|
|
|
|
const int range = 2 * settings.max_hkl;
|
|
dim3 block(8, 8, 8);
|
|
dim3 grid((range + block.x - 1) / block.x,
|
|
(range + block.y - 1) / block.y,
|
|
(range + block.z - 1) / block.z);
|
|
|
|
bragg_rot_kernel_3d<<<grid, block, 0, stream>>>(dK, settings.max_hkl, max_reflections, d_out, d_count);
|
|
|
|
cudaMemcpyAsync(h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost, stream);
|
|
cudaStreamSynchronize(stream);
|
|
|
|
int count = *h_count.get();
|
|
if (count > max_reflections) count = max_reflections;
|
|
if (count == 0) return 0;
|
|
|
|
cudaMemcpyAsync(reflections.data(), d_out, sizeof(Reflection) * count, cudaMemcpyDeviceToHost, stream);
|
|
cudaStreamSynchronize(stream);
|
|
|
|
return count;
|
|
}
|
|
|
|
#endif
|