// Copyright (2019-2023) Paul Scherrer Institute #include "hls_jfjoch.h" void integration(STREAM_512 &data_in, STREAM_512 &data_out, hls::stream> &result_out, hls::stream &s_axis_completion, hls::stream &m_axis_completion, ap_uint<256> *d_hbm_p0, ap_uint<256> *d_hbm_p1, ap_uint<32> hbm_size_bytes) { #pragma HLS INTERFACE ap_ctrl_none port=return #pragma HLS INTERFACE register both axis port=data_in #pragma HLS INTERFACE register both axis port=data_out #pragma HLS INTERFACE register both axis port=result_out #pragma HLS INTERFACE register both axis port=m_axis_completion #pragma HLS INTERFACE register both axis port=s_axis_completion #pragma HLS INTERFACE register ap_none port=hbm_size_bytes #pragma HLS INTERFACE m_axi port=d_hbm_p0 bundle=d_hbm_p0 depth=512 offset=off \ max_read_burst_length=16 max_write_burst_length=2 latency=120 num_write_outstanding=2 num_read_outstanding=8 #pragma HLS INTERFACE m_axi port=d_hbm_p1 bundle=d_hbm_p1 depth=512 offset=off \ max_read_burst_length=16 max_write_burst_length=2 latency=120 num_write_outstanding=2 num_read_outstanding=8 ap_int<30> sum[64][FPGA_INTEGRATION_BIN_COUNT]; // log2(32768*512*1024/64) = 28 + sign 1 bit #pragma HLS BIND_STORAGE variable=sum type=ram_t2p impl=bram #pragma HLS ARRAY_PARTITION variable=sum type=complete dim=1 ap_uint<14> count[64][FPGA_INTEGRATION_BIN_COUNT]; // log2(512*1024/64) = 13 #pragma HLS BIND_STORAGE variable=count type=ram_t2p impl=bram #pragma HLS ARRAY_PARTITION variable=count type=complete dim=1 ap_uint<44> sum2[64][FPGA_INTEGRATION_BIN_COUNT]; // log2(32768*32768*512*1024/64) = 43 #pragma HLS BIND_STORAGE variable=sum2 type=ram_t2p impl=bram #pragma HLS ARRAY_PARTITION variable=sum2 type=complete dim=1 for (int j = 0; j < FPGA_INTEGRATION_BIN_COUNT; j++) { #pragma HLS PIPELINE II=1 for (int i = 0; i < 64; i++) { sum[i][j] = 0; sum2[i][j] = 0; count[i][j] = 0; } } ap_int<16> in_val[32]; ap_uint<16> in_bin[32]; ap_uint<256> bins_0, bins_1; packet_512_t packet_in; data_in >> packet_in; data_out << packet_in; ap_uint<32> offset_hbm_0 = 16 * hbm_size_bytes / 32; ap_uint<32> offset_hbm_1 = 17 * hbm_size_bytes / 32; axis_completion cmpl; s_axis_completion >> cmpl; while (!cmpl.last) { m_axis_completion << cmpl; for (int i = 0; i < RAW_MODULE_SIZE / 32 / 2; i++) { #pragma HLS PIPELINE II=2 for (int k = 0; k < 2; k++) { data_in >> packet_in; data_out << packet_in; bins_0 = d_hbm_p0[offset_hbm_0 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k]; bins_1 = d_hbm_p1[offset_hbm_1 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k]; unpack_2xhbm_to_32x16bit(bins_0, bins_1, in_bin); unpack32(packet_in.data, in_val); for (int j = 0; j < 32; j++) { ap_uint<44> tmp = in_val[j] * in_val[j]; if ((in_val[j] != INT16_MAX) && (in_val[j] != INT16_MIN) && (in_bin[j] < FPGA_INTEGRATION_BIN_COUNT)) { sum[k * 32 + j][in_bin[j]] += in_val[j]; sum2[k * 32 + j][in_bin[j]] += tmp; count[k * 32 + j][in_bin[j]] += 1; } } } } for (int i = 0; i < FPGA_INTEGRATION_BIN_COUNT; i++) { #pragma HLS PIPELINE II=1 ap_uint<192> val = 0; ap_int<38> main_sum = 0; ap_int<52> main_sum2 = 0; ap_int<22> main_count = 0; for (int j = 0; j < 64; j++) { main_sum += sum[j][i]; main_sum2 += sum2[j][i]; main_count += count[j][i]; sum[j][i] = 0; sum2[j][i] = 0; count[j][i] = 0; } val(63, 0 ) = main_count; val(127, 64) = main_sum; val(191, 128) = main_sum2; result_out << val; } s_axis_completion >> cmpl; } m_axis_completion << cmpl; data_in >> packet_in; data_out << packet_in; }