// Copyright (2019-2023) Paul Scherrer Institute

#include "hls_jfjoch.h"

// Pass-through streaming kernel that accumulates, for every integration bin, the sum
// and the number of accepted 16-bit values of each module and emits the totals on
// result_out. Every packet read from data_in is forwarded unchanged to data_out.
//
// Stream protocol, as implemented below:
//   1. One leading packet is read from data_in and forwarded; the HBM region size
//      (in 256-bit words) is taken from its data field via ACT_REG_HBM_SIZE_256b.
//   2. For every completion read from s_axis_completion whose `last` flag is clear:
//        - the completion is forwarded to m_axis_completion,
//        - RAW_MODULE_SIZE / 32 packets are read from data_in, forwarded to data_out,
//          and the 32 values of each packet are accumulated into their bins,
//        - FPGA_INTEGRATION_BIN_COUNT / 4 words are written to result_out and the
//          accumulators are cleared for the next module.
//   3. The completion with `last` set is forwarded, then one trailing packet is read
//      from data_in and forwarded.
//
// Layout of each 512-bit result word (i-th word, k = 0..3):
//   bits [128*k + 63  : 128*k]      signed sum for bin 4*i + k
//   bits [128*k + 127 : 128*k + 64] number of accumulated values for bin 4*i + k
//
// Parameters:
//   data_in            input packet stream
//   data_out           output packet stream (copy of data_in)
//   result_out         per-module integration results, layout as above
//   s_axis_completion  incoming completions; `module` selects the bin-map region,
//                      `last` terminates processing
//   m_axis_completion  outgoing completions (copy of s_axis_completion)
//   d_hbm_p0, d_hbm_p1 memory ports from which the bin indices are read, 256 bits each
//
// NOTE(review): FPGA_INTEGRATION_BIN_COUNT / 4 is an integer division, so a bin count
// that is not a multiple of 4 would leave the trailing bins neither emitted nor reset.
// Confirm the constant is always a multiple of 4.
void integration(STREAM_512 &data_in,
                 STREAM_512 &data_out,
                 hls::stream<ap_uint<512>> &result_out,
                 hls::stream<axis_completion > &s_axis_completion,
                 hls::stream<axis_completion > &m_axis_completion,
                 ap_uint<256> *d_hbm_p0,
                 ap_uint<256> *d_hbm_p1) {
#pragma HLS INTERFACE ap_ctrl_none port=return
#pragma HLS INTERFACE register both axis port=data_in
#pragma HLS INTERFACE register both axis port=data_out
#pragma HLS INTERFACE register both axis port=result_out
#pragma HLS INTERFACE register both axis port=m_axis_completion
#pragma HLS INTERFACE register both axis port=s_axis_completion
#pragma HLS INTERFACE m_axi port=d_hbm_p0 bundle=d_hbm_p0 depth=512 offset=off \
        max_read_burst_length=16 max_write_burst_length=2 latency=120 num_write_outstanding=2 num_read_outstanding=8
#pragma HLS INTERFACE m_axi port=d_hbm_p1 bundle=d_hbm_p1 depth=512 offset=off \
        max_read_burst_length=16 max_write_burst_length=2 latency=120 num_write_outstanding=2 num_read_outstanding=8

    // 64 accumulator lanes, one per value position within a pair of consecutive packets
    // (lane = k * 32 + j below). The first dimension is completely partitioned.
    ap_int<30> sum[64][FPGA_INTEGRATION_BIN_COUNT];
    // log2(32768*512*1024/64) = 28 + sign 1 bit
#pragma HLS BIND_STORAGE variable=sum type=ram_t2p impl=bram
#pragma HLS ARRAY_PARTITION variable=sum type=complete dim=1
    ap_uint<14> count[64][FPGA_INTEGRATION_BIN_COUNT]; // log2(512*1024/64) = 13
#pragma HLS BIND_STORAGE variable=count type=ram_t2p impl=bram
#pragma HLS ARRAY_PARTITION variable=count type=complete dim=1

    // Clear all accumulators before the first module is processed.
    for (int j = 0; j < FPGA_INTEGRATION_BIN_COUNT; j++) {
#pragma HLS PIPELINE II=1
        for (int i = 0; i < 64; i++) {
            sum[i][j] = 0;
            count[i][j] = 0;
        }
    }

    ap_int<16> in_val[32];   // 32 values unpacked from one packet
    ap_uint<16> in_bin[32];  // bin index for each of those 32 values
    ap_uint<256> bins_0, bins_1;

    // Leading packet: forwarded as-is; its data field carries the HBM region size.
    // NOTE(review): presumably this is the action/configuration packet - confirm against the sender.
    packet_512_t packet_in;
    data_in >> packet_in;
    data_out << packet_in;

    // Bin indices are read starting 16 (port 0) and 17 (port 1) region sizes into the
    // respective address space, in units of 256-bit words.
    ap_uint<32> hbm_size_256b = ACT_REG_HBM_SIZE_256b(packet_in.data);
    ap_uint<32> offset_hbm_0 = 16 * hbm_size_256b;
    ap_uint<32> offset_hbm_1 = 17 * hbm_size_256b;

    axis_completion cmpl;
    s_axis_completion >> cmpl;
    while (!cmpl.last) {
        m_axis_completion << cmpl;

        // Accumulate one module: two packets per pipelined iteration (II=2), so that
        // packet k of the pair only ever touches lanes k*32 .. k*32+31.
        for (int i = 0; i < RAW_MODULE_SIZE / 32 / 2; i++) {
#pragma HLS PIPELINE II=2
            for (int k = 0; k < 2; k++) {
                data_in >> packet_in;
                data_out << packet_in;

                // One 256-bit word from each port per packet; the word index is the
                // module's base (RAW_MODULE_SIZE * sizeof(int16_t) / 64 words per module)
                // plus the packet number within the module.
                bins_0 = d_hbm_p0[offset_hbm_0 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k];
                bins_1 = d_hbm_p1[offset_hbm_1 + cmpl.module * RAW_MODULE_SIZE * sizeof(int16_t) / 64 + i * 2 + k];
                unpack_2xhbm_to_32x16bit(bins_0, bins_1, in_bin);
                unpack32(packet_in.data, in_val);

                for (int j = 0; j < 32; j++) {
                    // Values equal to INT16_MAX / INT16_MIN and bin indices outside the
                    // table are skipped.
                    // NOTE(review): presumably these two values mark saturated/invalid pixels - confirm.
                    if ((in_val[j] != INT16_MAX) && (in_val[j] != INT16_MIN) && (in_bin[j] < FPGA_INTEGRATION_BIN_COUNT)) {
                        sum[k * 32 + j][in_bin[j]] += in_val[j];
                        count[k * 32 + j][in_bin[j]] += 1;
                    }
                }
            }
        }

        // Reduce the 64 lanes of every bin, emit four bins per 512-bit word, and clear
        // the accumulators so the next module starts from zero.
        for (int i = 0; i < FPGA_INTEGRATION_BIN_COUNT/4; i++) {
#pragma HLS PIPELINE II=4
            ap_uint<512> val = 0;

            for (int k = 0; k < 4; k++) {
                ap_int<64> main_sum = 0;
                ap_int<64> main_count = 0;

                for (int j = 0; j < 64; j++) {
                    main_sum += sum[j][4 * i + k];
                    main_count += count[j][4 * i + k];
                    sum[j][4 * i + k] = 0;
                    count[j][4 * i + k] = 0;
                }
                val(128 * k + 63, 128 * k) = main_sum;
                val(128 * k + 127, 128 * k + 64) = main_count;
            }

            result_out << val;
        }
        s_axis_completion >> cmpl;
    }
    // Forward the terminating completion (the one with `last` set).
    m_axis_completion << cmpl;

    // Trailing packet: read and forwarded without further processing.
    data_in >> packet_in;
    data_out << packet_in;
}