// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: CERN-OHL-S-2.0

#include "hls_jfjoch.h"

// o(field): byte offset of `field` inside the ModuleStatistics structure.
#define o(field) offsetof(ModuleStatistics, field)
// sf(msg, field, s): selects the `s`-bit wide range of `msg` that starts at the bit offset of
// ModuleStatistics::field (byte offset * 8), i.e. msg(offset + s - 1, offset).
// `s` is parenthesized so that any width expression (not only a literal) expands correctly.
#define sf(msg, field, s) msg(o(field)*8 + (s) - 1, o(field)*8)

// Packs per-module metadata into a 1024-bit word. Each value is written at the bit position
// given by the offset of the matching ModuleStatistics member (see the sf() macro).
//   cmpl               - completion record supplying frame/timing/detector metadata
//   pixel_count_result - packed pixel statistics, sliced as:
//                        [31:0] error pixels, [63:32] saturated pixels, [95:64] max value,
//                        [127:96] min value, [191:128] pixel sum, [223:192] masked pixels
// Returns the packed word; bits not covered by any assignment below remain zero.
inline ap_uint<1024> fill_module_info(axis_completion &cmpl, ap_uint<PIXEL_COUNT_WIDTH> &pixel_count_result) {
    ap_uint<1024> module_info = 0;

    // 64-bit values taken from the completion record
    sf(module_info, frame_number, 64) = cmpl.frame_number;
    sf(module_info, timestamp, 64) = cmpl.timestamp;
    sf(module_info, pulse_id, 64) = cmpl.bunchid;

    // 32-bit values taken from the completion record
    sf(module_info, detector_type, 32) = cmpl.detector_type;
    sf(module_info, exptime, 32) = cmpl.exptime;
    sf(module_info, debug, 32) = cmpl.debug;
    sf(module_info, pedestal, 32) = cmpl.pedestal;
    sf(module_info, packet_count, 32) = cmpl.packet_count;
    sf(module_info, module_number, 32) = cmpl.module;

    // Pixel statistics, unpacked from consecutive slices of pixel_count_result
    sf(module_info, err_pixels, 32) = pixel_count_result.range(31, 0);
    sf(module_info, saturated_pixels, 32) = pixel_count_result.range(63, 32);
    sf(module_info, max_value, 32) = pixel_count_result.range(95, 64);
    sf(module_info, min_value, 32) = pixel_count_result.range(127, 96);
    sf(module_info, pixel_sum, 64) = pixel_count_result.range(191, 128);
    sf(module_info, masked_pixels, 32) = pixel_count_result.range(223, 192);

    return module_info;
}

// Emits one 32-bit completion word on m_axis_completion:
// bits [31:16] carry data_collection_id, bits [15:0] carry handle.
inline void write_completion(hls::stream<ap_uint<32> > &m_axis_completion,
                             const ap_uint<16> &handle,
                             const ap_uint<16> &data_collection_id) {
#pragma HLS INLINE
    ap_uint<32> completion_word = 0;
    completion_word.range(31, 16) = data_collection_id;
    completion_word.range(15, 0) = handle;
    m_axis_completion.write(completion_word);
}


// Pops one 32-bit work request from s_axis_work_request (blocking stream read) and
// returns its lower 16 bits in `handle`; the upper 16 bits are discarded.
void read_request(hls::stream<ap_uint<32> > &s_axis_work_request, ap_uint<16> &handle) {
#pragma HLS INLINE
    ap_uint<32> request_word = s_axis_work_request.read();
    handle = request_word.range(15, 0);
}

// Writes module images and the accompanying per-module results to host memory through
// an AXI datamover, handing buffers back to the host via completion words.
//
// Flow, as implemented below:
//  1. While data_in is empty, every incoming work request is returned immediately on
//     m_axis_completion, tagged with DATA_COLLECTION_ID_PURGE.
//  2. The first packet on data_in supplies the data collection mode word (ACT_REG_MODE):
//     its upper 16 bits are the data collection ID, and the MODE_32BIT_OUTPUT /
//     MODE_8BIT_OUTPUT flags select the output pixel depth (4, 1 or otherwise 2 bytes).
//     A HANDLE_START completion is then written.
//  3. For every record read from s_axis_completion, until one has `last` set:
//       - wait for a work request or for in_cancel;
//       - if a request is available, look up its host address in dma_address_table
//         (an address of 0 disables writing for this module);
//       - drain image data, spot finder, integration, ADU histogram and ROI streams plus the
//         pixel statistics, forwarding them to host_memory_out only when writing is enabled.
//  4. After the last record: read one more packet from data_in, wait, complete the pending
//     handle (if any) and write a HANDLE_END completion.
//
// Status outputs:
//   idle              - 1 before the collection starts and after it finishes, 0 in between
//   packets_processed - running sum of cmpl.packet_count over modules written to host memory
//   state             - progress code: 0 waiting for first data packet, 1 collection started,
//                       2 proceeding without a work request (in_cancel asserted),
//                       3 work request available, 4 streams drained without host write,
//                       5 streams drained with host write, 6 waiting for next completion record,
//                       7 next completion record received
void host_writer(STREAM_512 &data_in,
                 hls::stream<ap_uint<512>> &adu_histo_in,
                 hls::stream<ap_uint<512>> &integration_in,
                 hls::stream<ap_uint<512>> &spot_finder_in,
                 hls::stream<ap_uint<512>> &roi_count_in,
                 hls::stream<ap_uint<PIXEL_COUNT_WIDTH>> &pixel_calc_in,
                 hls::stream<axis_completion > &s_axis_completion,
                 hls::stream<ap_axiu<512,1,1,1> > &host_memory_out,
                 hls::stream<axis_datamover_ctrl> &datamover_out_cmd,
                 hls::stream<ap_uint<32> > &s_axis_work_request,
                 hls::stream<ap_uint<32> > &m_axis_completion,
                 const uint64_t *dma_address_table,
                 volatile uint64_t &packets_processed,
                 volatile ap_uint<1> &idle,
                 volatile ap_uint<1> &in_cancel,
                 volatile ap_uint<3> &state) {
#pragma HLS INTERFACE ap_ctrl_none port=return
#pragma HLS INTERFACE register both axis port=data_in
#pragma HLS INTERFACE register both axis port=adu_histo_in
#pragma HLS INTERFACE register both axis port=integration_in
#pragma HLS INTERFACE register both axis port=spot_finder_in
#pragma HLS INTERFACE register both axis port=roi_count_in
#pragma HLS INTERFACE register both axis port=pixel_calc_in
#pragma HLS INTERFACE register both axis port=s_axis_completion
#pragma HLS INTERFACE register both axis port=host_memory_out
#pragma HLS INTERFACE register both axis port=datamover_out_cmd
#pragma HLS INTERFACE register both axis port=m_axis_completion
#pragma HLS INTERFACE register both axis port=s_axis_work_request
#pragma HLS INTERFACE register ap_vld port=packets_processed
#pragma HLS INTERFACE register ap_none port=idle
#pragma HLS INTERFACE register ap_none port=in_cancel
#pragma HLS INTERFACE register ap_none port=state

#pragma HLS INTERFACE mode=m_axi port=dma_address_table bundle=dma_address_table depth=65536 offset=off \
		max_read_burst_length=2  max_write_burst_length=2 latency=10 num_write_outstanding=1 num_read_outstanding=1

    idle = 1;

    state = 0;

    ap_uint<16> req_handle;
    // Handle of the most recently written buffer whose completion has not been sent yet;
    // HANDLE_START means "nothing pending".
    ap_uint<16> req_handle_send = HANDLE_START;
    ap_uint<64> req_host_offset;

    // Before the collection starts: hand every work request straight back, marked as purged.
    while (data_in.empty()) {
        if (!s_axis_work_request.empty()) {
#pragma HLS PIPELINE II=1
            read_request(s_axis_work_request, req_handle);
            write_completion(m_axis_completion, req_handle, DATA_COLLECTION_ID_PURGE);
        }
    }

    // First packet of the collection carries the mode word.
    packet_512_t packet;
    {
#pragma HLS PROTOCOL fixed
        data_in >> packet;
        ap_wait();
    }
    ap_uint<32> data_collection_mode = ACT_REG_MODE(packet.data);
    ap_uint<32> data_collection_id = data_collection_mode(31, 16);
    ap_uint<1> mode_32bit = (data_collection_mode & MODE_32BIT_OUTPUT) ? 1 : 0;
    ap_uint<1> mode_8bit = (data_collection_mode & MODE_8BIT_OUTPUT) ? 1 : 0;
    uint64_t internal_packets_processed = 0;
    packets_processed = internal_packets_processed;

    write_completion(m_axis_completion, HANDLE_START, data_collection_id);

    idle = 0;

    // Bytes per output pixel; 32-bit mode takes precedence over 8-bit mode, default is 16-bit.
    size_t pixel_depth;
    if (mode_32bit)
        pixel_depth = 4;
    else if (mode_8bit)
        pixel_depth = 1;
    else
        pixel_depth = 2;

    axis_completion cmpl;
    s_axis_completion >> cmpl;

    state = 1;
    while (!cmpl.last) {
        ap_uint<1> send_images = 1;

        // Block until either a host buffer is offered or cancellation is signalled.
        while (s_axis_work_request.empty() && !in_cancel.read()) {
#pragma HLS PIPELINE II=1
            ap_wait();
        }
        if (s_axis_work_request.empty()) {
            send_images = 0;
            state = 2;
        } else
            state = 3;

        // Either send_images == 0 (so collection can proceed without writing to host mem)
        // or s_axis_work_request is not empty (so collection can proceed with writing to host mem)
        if (send_images) {
            read_request(s_axis_work_request, req_handle);
            req_host_offset = dma_address_table[req_handle];
            // A zero address disables writing for this module.
            // NOTE(review): in that case no completion is written for req_handle - confirm this is intended.
            send_images = (req_host_offset != 0);
        }

        packet_512_t packet_out;
        packet_out.strb = UINT64_MAX;
        packet_out.keep = UINT64_MAX;
        packet_out.dest = 0;
        packet_out.user = 0;
        packet_out.id = 0;
        packet_out.last = 0;

        // One datamover command per output section; the byte counts match the number of
        // 64-byte (512-bit) beats produced by the corresponding loops below.
        if (send_images) {
            setup_datamover(datamover_out_cmd, req_host_offset, RAW_MODULE_SIZE * pixel_depth);
            setup_datamover(datamover_out_cmd, req_host_offset + offsetof(DeviceOutput, spot_finding_result),
                            (RAW_MODULE_SIZE / 8 + 64));
            setup_datamover(datamover_out_cmd, req_host_offset + offsetof(DeviceOutput, integration_result),
                            (FPGA_INTEGRATION_BIN_COUNT / 8) * 64);
            setup_datamover(datamover_out_cmd, req_host_offset + offsetof(DeviceOutput, adu_histogram),
                            ADU_HISTO_BIN_COUNT / 16 * 64);
            setup_datamover(datamover_out_cmd, req_host_offset + offsetof(DeviceOutput, roi_counts),
                            FPGA_ROI_COUNT * 64);
            setup_datamover(datamover_out_cmd, req_host_offset + offsetof(DeviceOutput, module_statistics),
                            3 * 64);
        }

        // Image data: RAW_MODULE_SIZE * pixel_depth / 64 transfers x 512-bit.
        // Input is always consumed; it is forwarded only when send_images is set.
        for (int i = 0; i < RAW_MODULE_SIZE * pixel_depth / 64; i++) {
#pragma HLS PIPELINE II=1
            data_in >> packet;
            packet_out.data = packet.data;
            if (i == RAW_MODULE_SIZE * pixel_depth / 64 - 1)
                packet_out.last = 1;
            else
                packet_out.last = 0;
            if (send_images)
                host_memory_out << packet_out;
        }

        packet_out.last = 0;

        // Spot finder result: RAW_MODULE_SIZE / (8 * 64) + 1 transfers x 512-bit
        for (int i = 0; i < RAW_MODULE_SIZE / (8 * 64) + 1; i++) {
#pragma HLS PIPELINE II=1
            spot_finder_in >> packet_out.data;
            packet_out.last = (i == RAW_MODULE_SIZE / (8 * 64));
            if (send_images)
                host_memory_out << packet_out;
        }

        // Integration result: FPGA_INTEGRATION_BIN_COUNT / 8 transfers x 512-bit
        for (int i = 0; i < FPGA_INTEGRATION_BIN_COUNT / 8; i++) {
#pragma HLS PIPELINE II=1
            integration_in >> packet_out.data;
            packet_out.last = (i == FPGA_INTEGRATION_BIN_COUNT / 8 - 1);
            if (send_images)
                host_memory_out << packet_out;
        }

        // ADU histogram: ADU_HISTO_BIN_COUNT / 16 transfers x 512-bit
        for (int i = 0; i < ADU_HISTO_BIN_COUNT / 16; i++) {
#pragma HLS PIPELINE II=1
            adu_histo_in >> packet_out.data;
            packet_out.last = (i == ADU_HISTO_BIN_COUNT / 16 - 1);
            if (send_images)
                host_memory_out << packet_out;
        }

        // ROI counts: FPGA_ROI_COUNT transfers x 512-bit
        for (int i = 0; i < FPGA_ROI_COUNT; i++) {
#pragma HLS PIPELINE II=1
            roi_count_in >> packet_out.data;
            packet_out.last = (i == FPGA_ROI_COUNT - 1);
            if (send_images)
                host_memory_out << packet_out;
        }

        if (send_images)
            state = 5;
        else
            state = 4;

        ap_uint<PIXEL_COUNT_WIDTH> pixel_count_p;
        pixel_calc_in >> pixel_count_p;

        ap_uint<1024> tmp = fill_module_info(cmpl, pixel_count_p);

        // Module statistics: 3 transfers x 512-bit - two halves of the packed module info,
        // followed by the packet mask as the final beat.
        if (send_images) {
            packet_out.data = tmp(511,0);
            packet_out.last = 0;
            host_memory_out << packet_out;

            packet_out.data = tmp(1023,512);
            packet_out.last = 0;
            host_memory_out << packet_out;
        }

        packet_out.data = cmpl.packet_mask;
        packet_out.last = 1;

        if (send_images)
            host_memory_out << packet_out;

        if (send_images) {
            // Completion is deferred by one module: the previously written handle is completed
            // now, and the current handle stays pending until the next written module
            // (or the end of the collection).
            if (req_handle_send != HANDLE_START)
                write_completion(m_axis_completion, req_handle_send, data_collection_id);
            req_handle_send = req_handle;
            internal_packets_processed += cmpl.packet_count;
            packets_processed = internal_packets_processed;
        }

        state = 6;
        s_axis_completion >> cmpl;
        state = 7;
    }

    // Consume one further packet from data_in after the last completion record.
    data_in >> packet;

#ifdef JFJOCH_HLS_NOSYNTH
    // Simulation build: wait until the consumer has emptied host_memory_out.
    while (!host_memory_out.empty())
        std::this_thread::sleep_for(std::chrono::milliseconds(1));
#else
    {
        // wait for 5 ms (ensure that PCIe buffer is clean)
        // 5 * 200 * 1000 cycles; this equals 5 ms assuming a 200 MHz clock - TODO confirm
#pragma HLS PROTOCOL
        for (int i = 0; i < 5 * 200 * 1000; i++)
                ap_wait();
    }
#endif

    // Complete the last pending handle (if any), then signal the end of the collection.
    if (req_handle_send != HANDLE_START)
        write_completion(m_axis_completion, req_handle_send, data_collection_id);

    write_completion(m_axis_completion, HANDLE_END, data_collection_id);

    idle = 1;
}