// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute
// SPDX-License-Identifier: GPL-3.0-only

#include "BraggPredictionGPU.h"

#ifdef JFJOCH_USE_CUDA

#include "../indexing/CUDAMemHelpers.h"
#include <cuda_runtime.h>

namespace {
    __device__ inline bool is_odd(int v) { return (v & 1) != 0; }

    __device__ inline bool compute_reflection(const KernelConsts &C, int h, int k, int l, Reflection &out) {
        if (h == 0 && k == 0 && l == 0)
            return false;

        // Systematic absences (lattice centering only); P, I, A, B, C, F and R (hexagonal setting) supported
        switch (C.centering) {
            case 'I':
                if (is_odd(h + k + l)) return false;
                break;
            case 'A':
                if (is_odd(k + l)) return false;
                break;
            case 'B':
                if (is_odd(h + l)) return false;
                break;
            case 'C':
                if (is_odd(h + k)) return false;
                break;
            case 'F':
                if (is_odd(h + k) || is_odd(h + l) || is_odd(k + l)) return false;
                break;
            case 'R': {
                // Rhombohedral in the hexagonal setting (hR, a_h = b_h, gamma = 120°):
                // reflection condition -h + k + l = 3n
                int mod = (-h + k + l) % 3;
                if (mod < 0) mod += 3;
                if (mod != 0) return false;
                break;
            }
            default:
                break;
        }

        // Reciprocal-lattice vector r = h*a* + k*b* + l*c*
        float Ah_x = C.Astar.x * h;
        float Ah_y = C.Astar.y * h;
        float Ah_z = C.Astar.z * h;

        float AhBk_x = Ah_x + C.Bstar.x * k;
        float AhBk_y = Ah_y + C.Bstar.y * k;
        float AhBk_z = Ah_z + C.Bstar.z * k;

        float rx = AhBk_x + C.Cstar.x * l;
        float ry = AhBk_y + C.Cstar.y * l;
        float rz = AhBk_z + C.Cstar.z * l;

        // Resolution cutoff: |r|^2 must not exceed (1/d_max)^2
        float dot = rx * rx + ry * ry + rz * rz;
        if (dot > C.one_over_dmax_sq)
            return false;

        // Distance of S = r + S0 from the Ewald sphere of radius 1/lambda
        float Sx = rx + C.S0.x;
        float Sy = ry + C.S0.y;
        float Sz = rz + C.S0.z;

        float S_len = sqrtf(Sx * Sx + Sy * Sy + Sz * Sz);
        float dist_ewald = fabsf(S_len - C.one_over_wavelength);
        if (dist_ewald > C.ewald_cutoff)
            return false;

        // Rotate into the detector frame; rays pointing away from the detector (Srz <= 0) are rejected
        float Srx = C.rot[0] * Sx + C.rot[1] * Sy + C.rot[2] * Sz;
        float Sry = C.rot[3] * Sx + C.rot[4] * Sy + C.rot[5] * Sz;
        float Srz = C.rot[6] * Sx + C.rot[7] * Sy + C.rot[8] * Sz;

        if (Srz <= 0.0f)
            return false;

        // Project onto the detector plane (pixel units) and check that the spot lands on the detector
        float coeff = C.coeff_const / Srz;
        float x = C.beam_x + Srx * coeff;
        float y = C.beam_y + Sry * coeff;

        if (x < 0.0f || x >= C.det_width_pxl || y < 0.0f || y >= C.det_height_pxl)
            return false;

        out.h = h;
        out.k = k;
        out.l = l;
        out.predicted_x = x;
        out.predicted_y = y;
        out.d = 1.0f / sqrtf(dot);
        out.dist_ewald = dist_ewald;
        return true;
    }

    __global__ void bragg_kernel_3d(const KernelConsts *__restrict__ kc,
                                    int max_hkl,
                                    int max_reflections,
                                    Reflection *__restrict__ out,
                                    int *__restrict__ counter) {
        int range = 2 * max_hkl;

        // Map the 3D thread index to Miller indices in [-max_hkl, max_hkl)
        int hi = blockIdx.x * blockDim.x + threadIdx.x;
        int ki = blockIdx.y * blockDim.y + threadIdx.y;
        int li = blockIdx.z * blockDim.z + threadIdx.z;

        if (hi >= range || ki >= range || li >= range)
            return;

        int h = hi - max_hkl;
        int k = ki - max_hkl;
        int l = li - max_hkl;

        Reflection r{};
        if (!compute_reflection(*kc, h, k, l, r))
            return;

        // Capacity-clamped append: reserve a slot, give it back if the output buffer is full
        int pos = atomicAdd(counter, 1);
        if (pos < max_reflections)
            out[pos] = r;
        else
            atomicSub(counter, 1);
    }

    inline KernelConsts BuildKernelConsts(const DiffractionExperiment &experiment,
                                          const CrystalLattice &lattice,
                                          float high_res_A,
                                          float ewald_dist_cutoff,
                                          char centering) {
        KernelConsts kc{};

        auto geom = experiment.GetDiffractionGeometry();

        kc.det_width_pxl = static_cast<float>(experiment.GetXPixelsNum());
        kc.det_height_pxl = static_cast<float>(experiment.GetYPixelsNum());
        kc.beam_x = geom.GetBeamX_pxl();
        kc.beam_y = geom.GetBeamY_pxl();
        kc.coeff_const = geom.GetDetectorDistance_mm() / geom.GetPixelSize_mm();

        float one_over_dmax = 1.0f / high_res_A;
        kc.one_over_dmax_sq = one_over_dmax * one_over_dmax;
        kc.one_over_wavelength = 1.0f / geom.GetWavelength_A();
        kc.ewald_cutoff = ewald_dist_cutoff;

        kc.Astar = lattice.Astar();
        kc.Bstar = lattice.Bstar();
        kc.Cstar = lattice.Cstar();
        kc.S0 = geom.GetScatteringVector();
        kc.centering = centering;

        auto rotT = geom.GetPoniRotMatrix().transpose().arr();
        for (int i = 0; i < 9; ++i)
            kc.rot[i] = rotT[i];

        return kc;
    }
} // namespace

BraggPredictionGPU::BraggPredictionGPU(int max_reflections)
        : BraggPrediction(max_reflections),
          reg_out(reflections),
          d_out(max_reflections),
          dK(1),
          d_count(1),
          h_count(1) {}

int BraggPredictionGPU::Calc(const DiffractionExperiment &experiment,
                             const CrystalLattice &lattice,
                             const BraggPredictionSettings &settings) {
    // Build constants on host and upload them together with a zeroed counter
    KernelConsts hK = BuildKernelConsts(experiment, lattice, settings.high_res_A,
                                        settings.ewald_dist_cutoff, settings.centering);

    cudaMemcpyAsync(dK, &hK, sizeof(KernelConsts), cudaMemcpyHostToDevice, stream);
    cudaMemsetAsync(d_count, 0, sizeof(int), stream);

    // Configure and launch on the stream
    const int range = 2 * settings.max_hkl;
    dim3 block(8, 8, 8);
    dim3 grid((range + block.x - 1) / block.x,
              (range + block.y - 1) / block.y,
              (range + block.z - 1) / block.z);

    bragg_kernel_3d<<<grid, block, 0, stream>>>(dK, settings.max_hkl, max_reflections, d_out, d_count);

    // Async D2H copy of the count and synchronize
    cudaMemcpyAsync(h_count, d_count, sizeof(int), cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    int count = *h_count.get();
    if (count > max_reflections)
        count = max_reflections;
    if (count == 0)
        return 0;

    // Copy back only the accepted reflections
    cudaMemcpyAsync(reflections.data(), d_out, sizeof(Reflection) * count, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    return count;
}

#endif
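
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this file's build, hence the `#if 0` guard):
// a minimal, standalone restatement of the capacity-clamped atomicAdd append
// pattern used by bragg_kernel_3d above, so the buffer-overflow handling can be
// read in isolation. All names below (append_even, capacity, counter) are
// hypothetical and chosen for this sketch only; compile it as its own
// translation unit, e.g. `nvcc append_sketch.cu`, if you want to run it.
// ---------------------------------------------------------------------------
#if 0
#include <cuda_runtime.h>
#include <cstdio>

__global__ void append_even(const int *in, int n, int *out, int capacity, int *counter) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    if (in[i] % 2 != 0) return;        // stand-in for the compute_reflection() acceptance test
    int pos = atomicAdd(counter, 1);   // reserve an output slot
    if (pos < capacity)
        out[pos] = in[i];              // slot is valid: store the result
    else
        atomicSub(counter, 1);         // buffer full: give the slot back
}

int main() {
    const int n = 1 << 20, capacity = 1024;
    int *in, *out, *counter;
    cudaMallocManaged(&in, n * sizeof(int));
    cudaMallocManaged(&out, capacity * sizeof(int));
    cudaMallocManaged(&counter, sizeof(int));
    for (int i = 0; i < n; ++i) in[i] = i;
    *counter = 0;

    append_even<<<(n + 255) / 256, 256>>>(in, n, out, capacity, counter);
    cudaDeviceSynchronize();

    int count = *counter;
    if (count > capacity) count = capacity;   // host-side clamp, mirroring Calc() above
    std::printf("kept %d of %d candidates\n", count, n);

    cudaFree(in);
    cudaFree(out);
    cudaFree(counter);
    return 0;
}
#endif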