// Jungfraujoch/fpga/hls/integration.cpp

// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute <filip.leonarski@psi.ch>
// SPDX-License-Identifier: CERN-OHL-S-2.0

#include "hls_jfjoch.h"

// Per-bin weighted integration of the pixel stream.
//
// Every packet read from data_in is forwarded unchanged to data_out. For each
// completion record received on s_axis_completion (until one with `last` set),
// the record is forwarded to m_axis_completion and RAW_MODULE_SIZE / 32 data
// packets are processed: each packet carries 32 pixel values, for which a bin
// index and a coefficient are fetched from HBM. Pixel * coefficient is added
// to a per-bin sum and a per-bin counter is incremented. After these packets,
// FPGA_INTEGRATION_BIN_COUNT words are written to result_out (count in bits
// 31..0, sum as 32-bit float in bits 63..32) and the accumulators are cleared.
//
// Parameters:
//   data_in            - input packet stream
//   data_out           - output packet stream (copy of data_in)
//   result_out         - 64-bit result stream, one word per bin per completion
//   s_axis_completion  - incoming completion records (`module`, `last` are used)
//   m_axis_completion  - outgoing completion records (forwarded unchanged)
//   d_hbm_p0, d_hbm_p1 - HBM ports from which bin indices are read
//   d_hbm_p2, d_hbm_p3 - HBM ports from which coefficients are read
//   idle               - driven to 1 on entry, 0 after the first packet has been
//                        forwarded, and 1 again just before returning
//   hbm_size_bytes     - byte size used to derive the table offsets in HBM
void integration(STREAM_768 &data_in,
                 STREAM_768 &data_out,
                 hls::stream<ap_axiu<64,1,1,1>> &result_out,
                 hls::stream<axis_completion > &s_axis_completion,
                 hls::stream<axis_completion > &m_axis_completion,
                 ap_uint<256> *d_hbm_p0,
                 ap_uint<256> *d_hbm_p1,
                 ap_uint<256> *d_hbm_p2,
                 ap_uint<256> *d_hbm_p3,
                 volatile ap_uint<1> &idle,
                 ap_uint<32> hbm_size_bytes) {
    // All streams are registered AXI-Stream ports; the two scalars are plain wires.
#pragma HLS INTERFACE register both axis port=data_in
#pragma HLS INTERFACE register both axis port=data_out
#pragma HLS INTERFACE register both axis port=result_out
#pragma HLS INTERFACE register both axis port=m_axis_completion
#pragma HLS INTERFACE register both axis port=s_axis_completion
#pragma HLS INTERFACE register ap_none port=hbm_size_bytes
#pragma HLS INTERFACE register ap_none port=idle

    // Each HBM pointer gets its own AXI master bundle.
#pragma HLS INTERFACE m_axi port=d_hbm_p0 bundle=d_hbm_p0 depth=16384 offset=off \
        max_read_burst_length=16 max_write_burst_length=2 latency=120 num_write_outstanding=2 num_read_outstanding=8
#pragma HLS INTERFACE m_axi port=d_hbm_p1 bundle=d_hbm_p1 depth=16384 offset=off \
        max_read_burst_length=16 max_write_burst_length=2 latency=120 num_write_outstanding=2 num_read_outstanding=8
#pragma HLS INTERFACE m_axi port=d_hbm_p2 bundle=d_hbm_p2 depth=16384 offset=off \
        max_read_burst_length=16 max_write_burst_length=2 latency=120 num_write_outstanding=2 num_read_outstanding=8
#pragma HLS INTERFACE m_axi port=d_hbm_p3 bundle=d_hbm_p3 depth=16384 offset=off \
        max_read_burst_length=16 max_write_burst_length=2 latency=120 num_write_outstanding=2 num_read_outstanding=8

    idle = 1;

    // 64 independent accumulator lanes (dimension 1 fully partitioned), each
    // holding one partial sum per bin. Lanes are merged when results are emitted.
    ap_fixed<50,44, AP_RND_CONV> sum[64][FPGA_INTEGRATION_BIN_COUNT];
    // log2(32768*512*1024/64) = 32 + sign 1 bit
    // NOTE(review): 32768*512*1024/64 evaluates to 2^28, so the "= 32" above does
    // not match the expression as written - confirm the intended sizing.
#pragma HLS BIND_STORAGE variable=sum type=ram_t2p impl=bram
#pragma HLS ARRAY_PARTITION variable=sum type=complete dim=1
    // Per-lane, per-bin number of pixels that contributed to `sum`.
    ap_uint<18> count[64][FPGA_INTEGRATION_BIN_COUNT]; // log2(16*512*1024/64) = 17
#pragma HLS BIND_STORAGE variable=count type=ram_t2p impl=bram
#pragma HLS ARRAY_PARTITION variable=count type=complete dim=1


    // Clear all accumulators before any data is processed.
    for (int j = 0; j < FPGA_INTEGRATION_BIN_COUNT; j++) {
#pragma HLS PIPELINE II=1
        for (int i = 0; i < 64; i++) {
            sum[i][j] = 0;
            count[i][j] = 0;
        }
    }

    // Per-packet working set: 32 pixel values with their bin index and coefficient.
    ap_int<24> in_val[32];
    ap_uint<16> in_bin[32];
    integration_factor_t in_coeff[32];

    // Raw 256-bit words read from the four HBM ports.
    ap_uint<256> bins_0, bins_1, coeff_0, coeff_1;

    packet_768_t packet_in;
    {
        // Fixed-protocol region: the first packet is read and forwarded, and only
        // afterwards is idle deasserted, with the ap_wait() calls separating the steps.
        // NOTE(review): this first packet is presumably a header/configuration
        // packet; its contents are not inspected here - confirm against the sender.
#pragma HLS PROTOCOL fixed
        data_in >> packet_in;
        ap_wait();
        data_out << packet_in;
        ap_wait();
        idle = 0;
        ap_wait();
    }
    // Base indices into the HBM ports. The pointers address 256-bit (32-byte)
    // elements, hence the division of the byte quantity by 32.
    // NOTE(review): presumably hbm_size_bytes is the size of one HBM region, so
    // these select regions 12..15 - confirm.
    ap_uint<32> offset_hbm_0         = 12 * hbm_size_bytes / 32;
    ap_uint<32> offset_hbm_1         = 13 * hbm_size_bytes / 32;
    ap_uint<32> offset_hbm_2         = 14 * hbm_size_bytes / 32;
    ap_uint<32> offset_hbm_3         = 15 * hbm_size_bytes / 32;

    axis_completion cmpl;
    s_axis_completion >> cmpl;
    // One pass of this loop per completion record; a record with `last` set ends it.
    while (!cmpl.last) {
        m_axis_completion << cmpl;
        // Each pipelined iteration handles two consecutive packets (k = 0, 1);
        // packet k updates accumulator lanes k*32 .. k*32+31.
        for (int i = 0; i < RAW_MODULE_SIZE / 32 / 2; i++) {
#pragma HLS PIPELINE II=2
            for (int k = 0; k < 2; k++) {
                data_in >> packet_in;
                data_out << packet_in;
                // Table reads: base offset + segment selected by cmpl.module +
                // index of the current packet (i * 2 + k).
                bins_0 = d_hbm_p0[offset_hbm_0 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k];
                bins_1 = d_hbm_p1[offset_hbm_1 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k];
                coeff_0 = d_hbm_p2[offset_hbm_2 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k];
                coeff_1 = d_hbm_p3[offset_hbm_3 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k];

                // NOTE(review): helpers are defined elsewhere; presumably each pair of
                // 256-bit words is split into 32 16-bit entries - confirm in hls_jfjoch.h.
                unpack_2xhbm_to_32x16bit(bins_0, bins_1, in_bin);
                unpack_2xhbm_to_32x16bit(coeff_0, coeff_1, in_coeff);

                unpack32(packet_in.data, in_val);

                for (int j = 0; j < 32; j++) {
                    // Weighted pixel value, converted to 32-bit fixed point (16 integer bits).
                    ap_fixed<32, 16, AP_RND_CONV> tmp = in_val[j] * in_coeff[j];
                    // A pixel is accumulated only if it is neither INT24_MAX nor INT24_MIN
                    // (presumably sentinel values - confirm), its strobe bit is set, and
                    // its bin index is within range.
                    if ((in_val[j] != INT24_MAX)
                        && (in_val[j] != INT24_MIN)
                        && (packet_in.strb[j])
                        && (in_bin[j] < FPGA_INTEGRATION_BIN_COUNT)) {
                        sum[k * 32 + j][in_bin[j]] += tmp;
                        count[k * 32 + j][in_bin[j]] += 1;
                    }
                }
            }
        }
        // The next completion is fetched before results are written out; it is
        // evaluated by the while condition after the result loop below.
        s_axis_completion >> cmpl;

        // Emit one result word per bin: merge the 64 lanes and clear them for
        // the next completion.
        for (int i = 0; i < FPGA_INTEGRATION_BIN_COUNT; i++) {
#pragma HLS PIPELINE II=1
            ap_axiu<64,1,1,1> res;
            ap_fixed<64, 50, AP_RND_CONV> main_sum = 0;
            ap_uint<22> main_count = 0;
            for (int j = 0; j < 64; j++) {
                main_sum += sum[j][i];
                main_count += count[j][i];
                sum[j][i] = 0;
                count[j][i] = 0;
            }
            // Lower half of the word: number of contributing pixels.
            res.data(31,    0) = main_count;

            // Upper half of the word: sum converted to float and stored via float_uint32
            // (presumably a float/uint32 union yielding the raw bit pattern - confirm).
            float_uint32 conv;
            conv.f = main_sum.to_float();

            res.data(63, 32) = conv.u;

            res.dest = 0;
            res.id = 0;
            res.keep = UINT8_MAX;
            res.strb = UINT8_MAX;
            res.user = 0;
            // `last` is asserted on the final bin of each completion.
            res.last = ((i == FPGA_INTEGRATION_BIN_COUNT - 1) ? 1 : 0);
            result_out << res;
        }
    }
    // Forward the terminating completion record (the one with `last` set).
    m_axis_completion << cmpl;

    // One extra result word with user = 1; no other field is set explicitly.
    // NOTE(review): presumably an end-of-stream marker for the consumer - confirm.
    result_out << ap_axiu<64,1,1,1>{.user = 1};

    // One more packet is read and forwarded after the completion loop, then idle is raised.
    // NOTE(review): presumably a trailer packet - confirm against the sender.
    data_in >> packet_in;
    data_out << packet_in;
    idle = 1;
}