Files
Jungfraujoch/image_analysis/bragg_prediction/BraggPredictionRotGPU.cu
Filip Leonarski 1ab257af6c
All checks were successful
Build Packages / build:rpm (ubuntu2404_nocuda) (push) Successful in 12m8s
Build Packages / build:rpm (rocky8_nocuda) (push) Successful in 12m57s
Build Packages / build:rpm (ubuntu2204_nocuda) (push) Successful in 12m55s
Build Packages / build:rpm (rocky8_sls9) (push) Successful in 12m0s
Build Packages / build:rpm (rocky9_nocuda) (push) Successful in 13m30s
Build Packages / Generate python client (push) Successful in 20s
Build Packages / Unit tests (push) Has been skipped
Build Packages / Create release (push) Has been skipped
Build Packages / Build documentation (push) Successful in 39s
Build Packages / build:rpm (rocky8) (push) Successful in 9m23s
Build Packages / build:rpm (rocky9_sls9) (push) Successful in 10m33s
Build Packages / build:rpm (ubuntu2404) (push) Successful in 8m2s
Build Packages / build:rpm (ubuntu2204) (push) Successful in 8m42s
Build Packages / build:rpm (rocky9) (push) Successful in 9m38s
v1.0.0-rc.125 (#32)
This is an UNSTABLE release. This version adds scaling and merging. These are experimental at the moment, and should not be used for production analysis.
If things go wrong with analysis, it is better to revert to 1.0.0-rc.124.

* jfjoch_broker: Improve logic on switching on/off spot finding
* jfjoch_broker: Increase maximum spot count for FFBIDX to 65536
* jfjoch_broker: Increase default maximum unit cell for FFT to 500 A (could have performance impact, TBD)
* jfjoch_process: Add scaling and merging functionality - program is experimental at the moment and should not be used for production analysis
* jfjoch_viewer: Display partiality and reciprocal Lorentz-polarization correction for each reflection
* jfjoch_writer: Save more information about each reflection

Reviewed-on: #32
Co-authored-by: Filip Leonarski <filip.leonarski@psi.ch>
Co-committed-by: Filip Leonarski <filip.leonarski@psi.ch>
2026-02-18 16:17:21 +01:00

294 lines
11 KiB
Plaintext

// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only
#include "BraggPredictionRotGPU.h"
#ifdef JFJOCH_USE_CUDA
#include "../indexing/CUDAMemHelpers.h"
#include <cuda_runtime.h>
#include <cmath>
namespace {
// True when v is odd (correct for negative v as well: -3 % 2 == -1 != 0).
__host__ __device__ inline bool is_odd(int v) { return (v % 2) != 0; }
// Right-handed cross product: (cx, cy, cz) = a x b.
// Inputs are passed by value, so outputs may safely alias caller storage.
__host__ __device__ inline void cross3(float ax, float ay, float az,
                                       float bx, float by, float bz,
                                       float &cx, float &cy, float &cz) {
    const float rx = ay * bz - az * by;
    const float ry = az * bx - ax * bz;
    const float rz = ax * by - ay * bx;
    cx = rx;
    cy = ry;
    cz = rz;
}
// Scalar (dot) product of two 3-vectors.
// Accumulation order matches left-to-right ((ax*bx + ay*by) + az*bz).
__host__ __device__ inline float dot3(float ax, float ay, float az,
                                      float bx, float by, float bz) {
    float acc = ax * bx;
    acc += ay * by;
    acc += az * bz;
    return acc;
}
// Normalize (x, y, z) in place.  Vectors with length below 1e-12 are
// zeroed out instead of producing inf/NaN from the division.
__host__ __device__ inline void normalize3(float &x, float &y, float &z) {
    const float norm = sqrtf(x * x + y * y + z * z);
    if (norm < 1e-12f) {
        x = 0.0f;
        y = 0.0f;
        z = 0.0f;
        return;
    }
    const float scale = 1.0f / norm;
    x *= scale;
    y *= scale;
    z *= scale;
}
// Predict the (at most two) rotation positions at which reciprocal-lattice
// point (h, k, l) crosses the Ewald sphere during a rotation wedge, filling
// `out` with the corresponding Reflection records (detector position,
// resolution, reciprocal Lorentz factor, partiality, zeta).
// Returns the number of reflections written (0, 1 or 2).
//
// Geometry follows the Kabsch rotation-method formulation: (m1, m2, m3) is
// an orthonormal goniometer frame with m2 along the rotation axis; p0 is
// the unrotated reciprocal-lattice vector, p its rotated image.
// Fixes vs. previous revision: consistent `return 0` (was `return false`
// in an int-returning function), float-only degree conversion (was a
// silent double promotion), and the S x S0 cross product computed once.
__device__ inline int compute_reflections_rot(const KernelConstsRot &C, int h, int k, int l, Reflection out[2]) {
    if (h == 0 && k == 0 && l == 0)
        return 0;
    // Reject systematic absences implied by the lattice centering.
    switch (C.centering) {
        case 'I':
            if (is_odd(h + k + l)) return 0;
            break;
        case 'A':
            if (is_odd(k + l)) return 0;
            break;
        case 'B':
            if (is_odd(h + l)) return 0;
            break;
        case 'C':
            if (is_odd(h + k)) return 0;
            break;
        case 'F':
            if (is_odd(h + k) || is_odd(h + l) || is_odd(k + l)) return 0;
            break;
        case 'R': {
            // Rhombohedral (obverse setting): require -h + k + l == 0 (mod 3).
            int mod = (-h + k + l) % 3;
            if (mod < 0) mod += 3;
            if (mod != 0) return 0;
            break;
        }
        default:
            break;
    }
    // p0 = A* h + B* k + C* l  (reciprocal-lattice vector, lab frame)
    float p0x = C.Astar.x * h + C.Bstar.x * k + C.Cstar.x * l;
    float p0y = C.Astar.y * h + C.Bstar.y * k + C.Cstar.y * l;
    float p0z = C.Astar.z * h + C.Bstar.z * k + C.Cstar.z * l;
    float p0_sq = p0x * p0x + p0y * p0y + p0z * p0z;
    // High-resolution cutoff: |p0|^2 must not exceed (1/d_max)^2.
    if (p0_sq <= 0.0f || p0_sq > C.one_over_dmax_sq)
        return 0;
    // Components of p0 in the goniometer frame.
    float p0_m1 = p0x * C.m1.x + p0y * C.m1.y + p0z * C.m1.z;
    float p0_m2 = p0x * C.m2.x + p0y * C.m2.y + p0z * C.m2.z;
    float p0_m3 = p0x * C.m3.x + p0y * C.m3.y + p0z * C.m3.z;
    // rho: radius of the circle p0 traces around the rotation axis m2;
    // the m2 component is invariant under rotation.
    float rho_sq = p0_sq - (p0_m2 * p0_m2);
    // m3 component of the rotated p required to land on the Ewald sphere.
    // NOTE(review): divides by C.m3_S0 without a zero guard — this is zero
    // only when the beam is parallel to the rotation axis; confirm upstream
    // configuration rules that out.
    float p_m3 = (-p0_sq / 2.0f - p0_m2 * C.m2_S0) / C.m3_S0;
    float p_m2 = p0_m2;
    // No intersection if the required m3 component exceeds the circle radius.
    if (rho_sq < p_m3 * p_m3) return 0;
    // Blind-region check: |p0| must not exceed the Ewald sphere diameter 2|S0|.
    if (p0_sq > 4.0f * dot3(C.S0.x, C.S0.y, C.S0.z, C.S0.x, C.S0.y, C.S0.z)) return 0;
    // The circle intersects the sphere at two m1 components, +/-sqrt(...).
    float p_m1_pos = sqrtf(rho_sq - p_m3 * p_m3);
    float p_m1_arr[2] = {p_m1_pos, -p_m1_pos};
    int count = 0;
    for (int idx = 0; idx < 2; ++idx) {
        float p_m1 = p_m1_arr[idx];
        // Rotation angle taking p0 to p, from its sine/cosine around m2.
        float cosphi = (p_m1 * p0_m1 + p_m3 * p0_m3) / rho_sq;
        float sinphi = (p_m1 * p0_m3 - p_m3 * p0_m1) / rho_sq;
        // Rotated reciprocal vector p, back in the lab frame.
        float px = C.m1.x * p_m1 + C.m2.x * p_m2 + C.m3.x * p_m3;
        float py = C.m1.y * p_m1 + C.m2.y * p_m2 + C.m3.y * p_m3;
        float pz = C.m1.z * p_m1 + C.m2.z * p_m2 + C.m3.z * p_m3;
        // Scattered-beam wavevector S = S0 + p.
        float Sx = C.S0.x + px;
        float Sy = C.S0.y + py;
        float Sz = C.S0.z + pz;
        float phi = -1.0f * atan2f(sinphi, cosphi);
        // S x S0 is needed both normalized (e1, for zeta) and unnormalized
        // (for the Lorentz factor), so compute it once.
        float cx, cy, cz;
        cross3(Sx, Sy, Sz, C.S0.x, C.S0.y, C.S0.z, cx, cy, cz);
        // e1 = normalize(S x S0) - direction perpendicular to both S and S0
        float e1x = cx, e1y = cy, e1z = cz;
        normalize3(e1x, e1y, e1z);
        // zeta = |m2 . e1| - the "lorentz-like" geometric factor for partiality
        float zeta_abs = fabsf(dot3(C.m2.x, C.m2.y, C.m2.z, e1x, e1y, e1z));
        // Check min_zeta threshold (consistent with CPU)
        if (zeta_abs < C.min_zeta)
            continue;
        // epsilon3 cutoff check (consistent with CPU, Kabsch formulation)
        float epsilon3 = fabsf(phi * zeta_abs);
        if (epsilon3 > C.mosaicity_multiplier * C.mos_angle_rad)
            continue;
        // Reciprocal Lorentz-polarization-like factor from the same cross product.
        float S_dot_S0 = dot3(Sx, Sy, Sz, C.S0.x, C.S0.y, C.S0.z);
        float lorentz = fabsf(dot3(C.m2.x, C.m2.y, C.m2.z, cx, cy, cz)) / S_dot_S0;
        // Partiality calculation (Kabsch formulation)
        // c1 = sqrt(2) * sigma / zeta, where sigma = mosaicity
        float c1 = zeta_abs / (sqrtf(2.0f) * C.mos_angle_rad);
        float half_wedge = C.wedge_angle_rad / 2.0f;
        float partiality = (erff((phi + half_wedge) * c1)
                            - erff((phi - half_wedge) * c1)) / 2.0f;
        // Project S onto the detector using the (transposed) PONI rotation.
        float Srx = C.rot[0] * Sx + C.rot[1] * Sy + C.rot[2] * Sz;
        float Sry = C.rot[3] * Sx + C.rot[4] * Sy + C.rot[5] * Sz;
        float Srz = C.rot[6] * Sx + C.rot[7] * Sy + C.rot[8] * Sz;
        if (Srz <= 0.0f) continue;   // reflection travels away from the detector
        float coeff = C.coeff_const / Srz;
        float x = C.beam_x + Srx * coeff;
        float y = C.beam_y + Sry * coeff;
        // Discard predictions that fall outside the detector area.
        if (x < 0.0f || x >= C.det_width_pxl || y < 0.0f || y >= C.det_height_pxl)
            continue;
        // Distance of the rotated point from the Ewald sphere surface.
        float dist_ewald = fabsf(sqrtf(Sx * Sx + Sy * Sy + Sz * Sz) - C.one_over_wavelength);
        out[count].h = h;
        out[count].k = k;
        out[count].l = l;
        // float-only conversion; previously `phi * 180.0 / M_PI` promoted to double
        out[count].delta_phi_deg = phi * 180.0f / static_cast<float>(M_PI);
        out[count].predicted_x = x;
        out[count].predicted_y = y;
        out[count].d = 1.0f / sqrtf(p0_sq);
        out[count].dist_ewald = dist_ewald;
        out[count].rlp = lorentz;
        out[count].partiality = partiality;
        out[count].zeta = zeta_abs;
        count++;
    }
    return count;
}
// One thread per (h, k, l) triple with each index in [-max_hkl, max_hkl]
// (2*max_hkl + 1 values per axis — the host grid must cover this span).
// Predicted reflections are appended to `out` via an atomic cursor in
// `counter`; on overflow the reservation is rolled back with atomicSub,
// so `counter` may transiently exceed max_reflections (host clamps it).
__global__ void bragg_rot_kernel_3d(const KernelConstsRot *__restrict__ kc,
                                    int max_hkl,
                                    int max_reflections,
                                    Reflection *__restrict__ out,
                                    int *__restrict__ counter) {
    const int span = 2 * max_hkl + 1;
    const int ih = blockIdx.x * blockDim.x + threadIdx.x;
    const int ik = blockIdx.y * blockDim.y + threadIdx.y;
    const int il = blockIdx.z * blockDim.z + threadIdx.z;
    if (ih >= span || ik >= span || il >= span)
        return;
    Reflection refl[2];
    const int n = compute_reflections_rot(*kc, ih - max_hkl, ik - max_hkl, il - max_hkl, refl);
    for (int i = 0; i < n; ++i) {
        const int slot = atomicAdd(counter, 1);
        if (slot < max_reflections)
            out[slot] = refl[i];
        else
            atomicSub(counter, 1);   // buffer full: undo the reservation
    }
}
// Gather detector geometry, resolution limits, mosaicity settings and the
// reciprocal lattice into the POD constants block consumed by the kernel.
// The goniometer basis (m1/m2/m3) is filled separately by BuildGoniometerBasis.
inline KernelConstsRot BuildKernelConstsRot(const DiffractionExperiment &experiment,
                                            const CrystalLattice &lattice,
                                            const BraggPredictionSettings &settings) {
    KernelConstsRot kc{};
    const auto geometry = experiment.GetDiffractionGeometry();

    // Detector description and beam center (pixels).
    kc.det_width_pxl = static_cast<float>(experiment.GetXPixelsNum());
    kc.det_height_pxl = static_cast<float>(experiment.GetYPixelsNum());
    kc.beam_x = geometry.GetBeamX_pxl();
    kc.beam_y = geometry.GetBeamY_pxl();
    // Converts normalized detector-plane coordinates to pixels.
    kc.coeff_const = geometry.GetDetectorDistance_mm() / geometry.GetPixelSize_mm();

    // Resolution cutoff and wavelength, stored in the reciprocal forms the kernel uses.
    const float inv_dmax = 1.0f / settings.high_res_A;
    kc.one_over_dmax_sq = inv_dmax * inv_dmax;
    kc.one_over_wavelength = 1.0f / geometry.GetWavelength_A();

    // Mosaicity and wedge angle, kept in radians for the partiality model.
    kc.mos_angle_rad = settings.mosaicity_deg * static_cast<float>(M_PI) / 180.0f;
    kc.wedge_angle_rad = settings.wedge_deg * static_cast<float>(M_PI) / 180.0f;
    kc.min_zeta = settings.min_zeta;
    kc.mosaicity_multiplier = settings.mosaicity_multiplier;

    // Reciprocal lattice basis and incident scattering vector S0.
    kc.Astar = lattice.Astar();
    kc.Bstar = lattice.Bstar();
    kc.Cstar = lattice.Cstar();
    kc.S0 = geometry.GetScatteringVector();

    // Transposed PONI rotation matrix, flattened row-major into kc.rot.
    const auto rot_t = geometry.GetPoniRotMatrix().transpose().arr();
    for (int i = 0; i < 9; ++i)
        kc.rot[i] = rot_t[i];

    kc.centering = settings.centering;
    return kc;
}
// Derive the orthonormal goniometer frame (m1, m2, m3) from the rotation
// axis and the incident beam S0, plus the S0 projections onto m2 and m3
// used by the kernel.  Throws when the experiment has no goniometer.
inline void BuildGoniometerBasis(const DiffractionExperiment &experiment, KernelConstsRot &kc) {
    const auto gon_opt = experiment.GetGoniometer();
    if (!gon_opt.has_value())
        throw JFJochException(JFJochExceptionCategory::InputParameterInvalid,
                              "BraggPredictionRotationGPU requires a goniometer axis");
    const GoniometerAxis &gon = *gon_opt;

    // m2: unit vector along the rotation axis.
    float ax = gon.GetAxis().x;
    float ay = gon.GetAxis().y;
    float az = gon.GetAxis().z;
    normalize3(ax, ay, az);

    // m1: unit vector perpendicular to both the axis and the beam (m2 x S0).
    float ux, uy, uz;
    cross3(ax, ay, az, kc.S0.x, kc.S0.y, kc.S0.z, ux, uy, uz);
    normalize3(ux, uy, uz);

    // m3: completes the right-handed frame (m1 x m2).
    float wx, wy, wz;
    cross3(ux, uy, uz, ax, ay, az, wx, wy, wz);
    normalize3(wx, wy, wz);

    kc.m1 = Coord(ux, uy, uz);
    kc.m2 = Coord(ax, ay, az);
    kc.m3 = Coord(wx, wy, wz);
    kc.m2_S0 = dot3(ax, ay, az, kc.S0.x, kc.S0.y, kc.S0.z);
    kc.m3_S0 = dot3(wx, wy, wz, kc.S0.x, kc.S0.y, kc.S0.z);
}
} // namespace
// Allocate GPU resources once at construction time:
//  - reg_out(reflections): presumably registers/pins the host `reflections`
//    buffer for async transfers — see CUDAMemHelpers; TODO confirm
//  - d_out: device buffer for up to `max_reflections` Reflection records
//  - dK: single-element device copy of the kernel constants
//  - d_count / h_count: single-int device and host result counters
BraggPredictionRotGPU::BraggPredictionRotGPU(int max_reflections)
: BraggPrediction(max_reflections),
reg_out(reflections), d_out(max_reflections),
dK(1), d_count(1), h_count(1) {
}
// Run rotation-method Bragg prediction on the GPU.
// Uploads the kernel constants, launches one thread per (h, k, l) triple,
// then copies the predicted reflections back into `reflections`.
// Returns the number of reflections written (clamped to max_reflections).
int BraggPredictionRotGPU::Calc(const DiffractionExperiment &experiment,
                                const CrystalLattice &lattice,
                                const BraggPredictionSettings &settings) {
    KernelConstsRot hK = BuildKernelConstsRot(experiment, lattice, settings);
    BuildGoniometerBasis(experiment, hK);

    cudaMemcpyAsync(dK, &hK, sizeof(KernelConstsRot), cudaMemcpyHostToDevice, stream);
    cudaMemsetAsync(d_count, 0, sizeof(int), stream);

    // The kernel enumerates each index in [-max_hkl, max_hkl], i.e.
    // 2*max_hkl + 1 values per axis (it bounds-checks against that span).
    // BUG FIX: the grid was previously sized from 2*max_hkl, which silently
    // dropped the +max_hkl index layer whenever 2*max_hkl was a multiple
    // of the block dimension (8).
    const int range = 2 * settings.max_hkl + 1;
    dim3 block(8, 8, 8);
    dim3 grid((range + block.x - 1) / block.x,
              (range + block.y - 1) / block.y,
              (range + block.z - 1) / block.z);
    bragg_rot_kernel_3d<<<grid, block, 0, stream>>>(dK, settings.max_hkl, max_reflections, d_out, d_count);
    // NOTE(review): no cudaGetLastError()/return-code checks after the launch
    // and async calls — consider adding checks consistent with the project's
    // error-handling convention so launch failures don't pass silently.

    cudaMemcpyAsync(h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    // The device counter may transiently overshoot when the output buffer
    // fills (kernel rolls back with atomicSub); clamp defensively.
    int count = *h_count.get();
    if (count > max_reflections) count = max_reflections;
    if (count == 0) return 0;

    cudaMemcpyAsync(reflections.data(), d_out, sizeof(Reflection) * count, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    return count;
}
#endif