// SPDX-FileCopyrightText: 2025 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: GPL-3.0-only

// GPU Spot finding developed by Hans-Christian Stadler (PSI)
// Copyright (2019-2023) Paul Scherrer Institute

#include "ImageSpotFinderGPU.h"
#include "../../common/JFJochException.h"

struct spot_parameters {
    int32_t width;
    int32_t height;
    float strong_pixel_threshold2;
    int32_t count_threshold;
};

// input      X x Y pixels array
// output     X x Y bit array

static constexpr int WARP_SIZE = 32;    // assume warp size of 32 cuda threads per warp

inline void cuda_err(cudaError_t val) {
    if (val != cudaSuccess)
        throw JFJochException(JFJochExceptionCategory::GPUCUDAError, cudaGetErrorString(val));
}

// Write pixel results to bit array
// params: spot finding parameters
// out: pixel result bit array
// pixel: flat pixel index = bit index into bit array
// val: pixel result
// **NOTE**: assumes sizeof(*out) * 8 == WARP_SIZE
__device__ __forceinline__ void write_result(const spot_parameters& params, uint32_t* out, int32_t pixel, uint8_t val)
{
    static_assert(sizeof(*out) * 8 == WARP_SIZE, "Violation of essential implementation assumption: WARP_SIZE must match output array element type bit size!");
    static constexpr unsigned ALL_THREADS = UINT32_MAX;
    const int32_t laneid = threadIdx.x & (WARP_SIZE - 1);
    unsigned result = __ballot_sync(ALL_THREADS, val);
    const int32_t idx = pixel / WARP_SIZE;      // global uint32_t index
    const int32_t bit = pixel % WARP_SIZE;      // local bit index

    if ((bit >= laneid) && (laneid == 0)) {      // write to upper part of uint32_t
        result <<= bit;
        if (result)
            atomicOr(&out[idx], result);
    } else if ((bit < laneid) && (bit == 0)) {   // write to lower part of uint32_t
        result >>= laneid;
        if (result)
            atomicOr(&out[idx], result);
    }
}

// Determine if pixel could be a spot
// params: spot finding parameters
// val: pixel value
// sum: window sum
// sum2: window sum of squares
// count: window valid pixels count
// return the pixel result: 0-no spot / 1-spot candidate
__device__ __forceinline__ uint8_t pixel_result(const spot_parameters& params, const int64_t val, int64_t sum, int64_t sum2, int64_t count)
{
    sum -= val;
    sum2 -= val * val;
    count -= 1;

    const int64_t var = count * sum2 - (sum * sum);   // This should be divided by ((2*NBX+1) * (2*NBY+1)-1)*((2*NBX+1) * (2*NBY+1))
    const int64_t in_minus_mean = val * count - sum;  // Should be divided by ((2*NBX+1) * (2*NBY+1));

    const int64_t tmp1 = in_minus_mean * in_minus_mean;
    const float tmp2 = var * params.strong_pixel_threshold2;
    bool snr_criterion;

    if (params.strong_pixel_threshold2 == 0.0f)
        snr_criterion = true;
    else
        snr_criterion = (count > ImageSpotFinder::MIN_VALID_PIXELS) && (in_minus_mean > 0) && (tmp1 > tmp2);

    bool count_criterion = (params.count_threshold == 0.0f) || (val > params.count_threshold);

    bool strong_pixel = snr_criterion && count_criterion;
    if (val == INT32_MAX)
        strong_pixel = true;
    else if (val == INT32_MIN)
        strong_pixel = false;
    return strong_pixel ? 1 : 0;
}

// Find pixels that could be spots
// in: image input values
// out: pixel result bit array, 1 bit per pixel (0:no/1:candidate spot)
// params: spot finding parameters
//
// The algorithm uses multiple waves (blockDim.y) that run over sections of rows.
// Each wave will write output at the back row and read input at the front row.
// Each wave is split into column output sections (blockDim.x)
// A wave section (block) is responsible for a particular row/column section and
// maintains sum/sum2/count values per column for the output row.
// Every cuda thread is associated with a particular column. The thread maintains
// the sum/sum2/count values in shared memory for it's column. To do this, the input
// pixel values for the hight of the aggregation window are saved in shared memory.
__global__ void analyze_pixel(const int32_t *in, uint32_t *prev_out, uint32_t *out, const spot_parameters params)
{
    // assumption: 2 * params.nby + 1 <= params.rows and 2 * params.nbx + 1 <= params.width
    const int32_t window = 2 * (int)ImageSpotFinder::NBX + 1;                                         // vertical window

    const int32_t writeSize = blockDim.x - 2 * ImageSpotFinder::NBX;                                  // output columns per block
    const int32_t cmin = blockIdx.x * writeSize;                                            // lowest output column
    const int32_t cmax = min(cmin + writeSize, static_cast<int32_t>(params.width));     // past highest output column
    const int32_t col = cmin + threadIdx.x - ImageSpotFinder::NBX;                                    // thread -> column mapping
    const bool data_col = (col >= 0) && (col < static_cast<int32_t>(params.width));           // read global mem
    const bool result_col = (col >= cmin) && (col < cmax);                                   // write result
    const int32_t nWaves = gridDim.y;                                                       // number of waves
    const int32_t rowsPerWave = (params.height + nWaves - 1) / nWaves;                       // rows per wave
    const int32_t rmin = blockIdx.y * rowsPerWave;                                          // lowest result row for this wave
    const int32_t rmax = min(rmin + rowsPerWave, static_cast<int32_t>(params.height));  // past highest result row for this wave

    const int32_t left = max(static_cast<int32_t>(threadIdx.x) - static_cast<int32_t>(ImageSpotFinder::NBX), 0); // leftmost column touched by this thread
    const int32_t right = min(static_cast<int32_t>(threadIdx.x) + static_cast<int32_t>(ImageSpotFinder::NBX) + 1, static_cast<int32_t>(params.width)); // past rightmost column touched by this thread

    int32_t back = rmin;                                                                    // back of wave for writing
    int32_t front = max(back - static_cast<int32_t>(ImageSpotFinder::NBX), 0);                   // front of wave for reading (needs to overtake back initially)

    extern __shared__ int64_t shared_mem[];
    int64_t* shared_sum = shared_mem;                                                       // shared buffer [blockDim.x]
    int64_t* shared_sum2 = &shared_sum[blockDim.x];                                         // shared buffer [blockDim.x]
    auto shared_count = reinterpret_cast<int16_t*>(&shared_sum2[blockDim.x]);           // shared buffer [blockDim.x]
    auto shared_val = reinterpret_cast<int32_t *>(&shared_count[blockDim.x]);                                        // shared cyclic buffer [window, blockDim.x]

    int64_t total_sum;                                                                      // totals
    int64_t total_sum2;
    int32_t total_count;

    // initialize sum, sum2, count, val buffers
    shared_sum[threadIdx.x] = 0;                                                            // shared values without effect on totals
    shared_sum2[threadIdx.x] = 0;
    shared_count[threadIdx.x] = 0;
    for (int i=0; i<window; i++)
        shared_val[i * blockDim.x + threadIdx.x] = INT32_MIN;  // value that is NOT counted

    // wave front up to rmin + nby + 1
    do {
        if (data_col) {     // read at the front end of the wave
            const int32_t npixel = front * params.width + col;
            const bool sat = ((prev_out[npixel / 32] & (1U << (npixel % 32))) != 0);
            const int32_t val = sat ? INT32_MAX : in[npixel];
            shared_val[(front % window) * blockDim.x + threadIdx.x] = val;
            if (val != INT32_MAX && val != INT32_MIN) {
                shared_sum[threadIdx.x] += val;
                shared_sum2[threadIdx.x] += val * val;
                shared_count[threadIdx.x] += 1;
            }
        }
        front++;
    } while (front < rmin + static_cast<int32_t>(ImageSpotFinder::NBX) + 1);
    // wave front up to rmax
    do {
        __syncthreads();    // make others see the shared values
        uint8_t val = 0;
        if (result_col) {   // write at the back end of the wave
            total_sum = total_sum2 = total_count = 0;
            for (auto j = left; j < right; j++) {
                total_sum += shared_sum[j];
                total_sum2 += shared_sum2[j];
                total_count += shared_count[j];
            }
            val = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count);
        }
        write_result(params, out, back * params.width + col, val);
        back++;
        __syncthreads();    // keep shared values until others have seen them
        if (data_col) {     // read at the front end of the wave
            int16_t cnt = 0;
            int32_t old = shared_val[(front % window) * blockDim.x + threadIdx.x];
            if (old == INT32_MAX || old == INT32_MIN) {
                old = 0;    // no effect value
                cnt = 1;    // bring count to normal
            }
            int32_t val = in[front * params.width + col];
            shared_val[(front % window) * blockDim.x + threadIdx.x] = val;
            if (val == INT32_MAX || val == INT32_MIN) {
                val = 0;    // no effect value
                cnt -= 1;   // count diff from normal
            }
            shared_sum[threadIdx.x] += val - old;
            shared_sum2[threadIdx.x] += val * val - old * old;
            shared_count[threadIdx.x] += cnt;
        }
        front++;
    } while (front < rmax);
    // wave back up to rmax
    do {
        __syncthreads();    // make others see the shared values
        uint8_t val = 0;
        if (result_col) {   // write at the back end of the wave
            total_sum = total_sum2 = total_count = 0;
            for (auto j = left; j < right; j++) {
                total_sum += shared_sum[j];
                total_sum2 += shared_sum2[j];
                total_count += shared_count[j];
            }
            val = pixel_result(params, shared_val[(back % window) * blockDim.x + threadIdx.x], total_sum, total_sum2, total_count);
        }
        write_result(params, out, back * params.width + col, val);
        back++;
        __syncthreads();    // keep shared values until others have seen them
        if (data_col) {     // read at the front end of the wave if possible
            int16_t cnt = -1; // normal count diff
            int32_t old = shared_val[(front % window) * blockDim.x + threadIdx.x];
            if (old == INT32_MAX || old == INT32_MIN) {
                old = 0;    // no effect value
                cnt += 1;   // bring count to normal
            }
            int32_t val = 0;
            if (front < params.height) {
                const int32_t npixel = front * params.width + col;
                const bool sat = ((prev_out[npixel / 32] & (1U << (npixel % 32))) != 0);
                val = sat ? INT32_MAX : in[npixel];
                if (val == INT32_MAX || val == INT32_MIN)
                    val = 0;    // no effect value
                else
                    cnt += 1;   // count diff from normal
            }
            shared_sum[threadIdx.x] += val - old;
            shared_sum2[threadIdx.x] += val * val - old * old;
            shared_count[threadIdx.x] += cnt;
        }
        front++;
    } while (back < rmax);
}

ImageSpotFinderGPU::ImageSpotFinderGPU(int32_t in_width, int32_t in_height) :
        ImageSpotFinder(in_width, in_height),
        input_buffer_reg(input_buffer),
        output_buffer_reg(output_buffer) {
    gpu_in = CudaDevicePtr<int32_t>(width * height);
    gpu_out_0 = CudaDevicePtr<uint32_t>(OutputSize());
    gpu_out_1 = CudaDevicePtr<uint32_t>(OutputSize());
}

std::vector<DiffractionSpot> ImageSpotFinderGPU::Run(const SpotFindingSettings &settings, const std::vector<bool> &res_mask) {
    spot_parameters spot_params{};
    spot_params.height = height;
    spot_params.width = width;
    spot_params.strong_pixel_threshold2 = settings.signal_to_noise_threshold * settings.signal_to_noise_threshold;
    spot_params.count_threshold = settings.photon_count_threshold;

    if (2 * NBX + 1 > windowSizeLimit)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "nbx exceeds window size limit");
    if (2 * NBX + 1 > windowSizeLimit)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "nby exceeds window size limit");
    if (windowSizeLimit > numberOfCudaThreads)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of cuda threads");
    if (windowSizeLimit > spot_params.width)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of columns");
    if (windowSizeLimit > spot_params.height)
        throw JFJochException(JFJochExceptionCategory::SpotFinderError, "window size limit exceeds number of height");

    const auto nWriters = numberOfCudaThreads - 2 * NBX;
    const auto nBlocks = (spot_params.width + nWriters - 1) / nWriters;
    const auto window = 2 * NBX + 1;
    const auto sharedSize = (2 * sizeof(int64_t) + // sum, sum2
                             window * sizeof(int32_t) + // val
                             1 * sizeof(int16_t) // count
                            ) * numberOfCudaThreads;
    const dim3 blocks(nBlocks, numberOfWaves);

    cuda_err(cudaMemcpyAsync(gpu_in, input_buffer.data(), width * height * sizeof(int32_t), cudaMemcpyHostToDevice, stream));
    cuda_err(cudaMemsetAsync(gpu_out_0, 0, OutputSize() * sizeof(uint32_t), stream));
    cuda_err(cudaMemsetAsync(gpu_out_1, 0, OutputSize() * sizeof(uint32_t), stream));
    analyze_pixel<<<blocks, numberOfCudaThreads, sharedSize, stream>>>
            (gpu_in, gpu_out_1, gpu_out_0, spot_params);
    analyze_pixel<<<blocks, numberOfCudaThreads, sharedSize, stream>>>
            (gpu_in, gpu_out_0, gpu_out_1, spot_params);
    cuda_err(cudaMemcpyAsync(output_buffer.data(), gpu_out_1, OutputSize() * sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));

    cuda_err(cudaStreamSynchronize(stream));

    return ExtractSpots(settings, res_mask);
}