diff --git a/fpga/hls/CMakeLists.txt b/fpga/hls/CMakeLists.txt index 098f9084..69aa737e 100644 --- a/fpga/hls/CMakeLists.txt +++ b/fpga/hls/CMakeLists.txt @@ -12,7 +12,8 @@ ADD_LIBRARY( HLSSimulation STATIC icmp.cpp arp.cpp ip_header_checksum.h udp.cpp - sls_detector.cpp) + sls_detector.cpp + save_to_hbm.cpp) TARGET_INCLUDE_DIRECTORIES(HLSSimulation PUBLIC ../include) TARGET_LINK_LIBRARIES(HLSSimulation CommonFunctions) @@ -45,6 +46,7 @@ MAKE_HLS_MODULE(ethernet.cpp ethernet) MAKE_HLS_MODULE(arp.cpp arp) MAKE_HLS_MODULE(udp.cpp udp) MAKE_HLS_MODULE(sls_detector.cpp sls_detector) +MAKE_HLS_MODULE(save_to_hbm.cpp save_to_hbm) SET (HLS_IPS psi_ch_hls_data_collection_fsm_1_0.zip psi_ch_hls_timer_host_1_0.zip @@ -58,7 +60,8 @@ SET (HLS_IPS psi_ch_hls_data_collection_fsm_1_0.zip psi_ch_hls_udp_1_0.zip psi_ch_hls_sls_detector_1_0.zip psi_ch_hls_icmp_1_0.zip - psi_ch_hls_host_writer_1_0.zip) + psi_ch_hls_host_writer_1_0.zip + psi_ch_hls_save_to_hbm_1_0.zip) SET (HLS_IPS ${HLS_IPS} PARENT_SCOPE) ADD_CUSTOM_TARGET(hls DEPENDS ${HLS_IPS}) diff --git a/fpga/hls/hls_jfjoch.h b/fpga/hls/hls_jfjoch.h index 58c3f845..09fed7ec 100644 --- a/fpga/hls/hls_jfjoch.h +++ b/fpga/hls/hls_jfjoch.h @@ -118,6 +118,18 @@ void internal_packet_generator(STREAM_512 &data_in, STREAM_512 &data_out, ap_uint<512> module_cache[RAW_MODULE_SIZE * sizeof(uint16_t) / 512 * 8], volatile ap_uint<1> &in_cancel); +void save_to_hbm(STREAM_512 &data_in, + hls::stream > &addr_in, + STREAM_512 &data_out, + hls::stream > &addr_out, + hls::burst_maxi d_hbm_p0, hls::burst_maxi d_hbm_p1, + hls::burst_maxi d_hbm_p2, hls::burst_maxi d_hbm_p3, + STREAM_512 &completion_out, + volatile uint64_t &packets_processed, + volatile ap_uint<1> &idle, + ap_uint<8> &err_reg, + uint32_t hbm_size); + template ap_uint pack32(ap_int in[32]) { #pragma HLS INLINE ap_uint out; diff --git a/fpga/hls/save_to_hbm.cpp b/fpga/hls/save_to_hbm.cpp new file mode 100644 index 00000000..559967c2 --- /dev/null +++ b/fpga/hls/save_to_hbm.cpp @@ -0,0 
+1,289 @@
+// Copyright (2019-2022) Paul Scherrer Institute
+// SPDX-License-Identifier: CERN-OHL-S-2.0 or GPL-3.0-or-later
+
+#include "hls_jfjoch.h"
+
+// NOTE(review): in this copy of the patch the header name of the include below
+// and the angle-bracket arguments of several hls::stream, hls::burst_maxi and
+// ap_uint types appear to be missing. They are left untouched here - restore
+// them from the original commit.
+#ifndef __SYNTHESIS__
+#include
+#endif
+
+#define PACKET_SIZE 8192
+#define HBM_BURST_SIZE 64
+
+// Computes the status byte of a frame completion record.
+//
+//   status[0] - every bit of packet_mask is set
+//   status[1] - at least one bit of packet_mask is set
+//   status[2] - value of flushing
+//   status[7] - XOR of all bits of (handle, packet_count, status,
+//               module_number, frame_num)
+//
+// packet_mask is 128 bits wide, the same width as the per-slot masks kept by
+// save_to_hbm(); a zero-extended wider mask would make and_reduce() return 0
+// for every frame.
+//
+// The code that pushes the record into m_axis_completion is commented out, so
+// nothing is written to the stream at present.
+inline void write_completion(STREAM_512 &m_axis_completion,
+                             const ap_uint<32> &handle,
+                             const ap_uint<8> &module_number,
+                             const ap_uint<64> &frame_num,
+                             const ap_uint<128> &packet_mask,
+                             const ap_uint<16> &packet_count,
+                             const ap_uint<32> &debug,
+                             const ap_uint<64> &timestamp,
+                             const ap_uint<64> &bunchid,
+                             const ap_uint<32> &exptime,
+                             const ap_uint<32> &data_collection_id,
+                             const ap_uint<1> &flushing) {
+#pragma HLS INLINE
+
+    ap_uint<1> all_packets_ok = packet_mask.and_reduce();
+    ap_uint<1> any_packets_received = packet_mask.or_reduce();
+    ap_uint<8> status = 0;
+    status[0] = all_packets_ok;
+    status[1] = any_packets_received;
+    status[2] = flushing;
+    ap_uint<128> tmp = (handle, packet_count, status, module_number, frame_num);
+    status[7] = tmp.xor_reduce(); // ensure completion has even parity
+/*
+    if (handle != HANDLE_SKIP_FRAME) {
+        m_axis_completion << handle;
+        m_axis_completion << (packet_count, status, module_number);
+        m_axis_completion << frame_num(63, 32);
+        m_axis_completion << frame_num(31, 0);
+
+        m_axis_completion << timestamp(63,32);
+        m_axis_completion << timestamp(31,0);
+        m_axis_completion << bunchid(63,32);
+        m_axis_completion << bunchid(31,0);
+
+        m_axis_completion << exptime;
+        m_axis_completion << debug;
+        m_axis_completion << 0;
+        m_axis_completion << data_collection_id;
+
+        m_axis_completion << packet_mask(127,96);
+        m_axis_completion << packet_mask( 95,64);
+        m_axis_completion << packet_mask( 63,32);
+        m_axis_completion << packet_mask( 31, 0);
+    } */
+
+}
+
+// Forwards data_in to data_out and addr_in to addr_out unchanged while
+// storing each packet in HBM through the four d_hbm_p* ports.
+//
+// Flow, as implemented below:
+//   1. One addr word and one data word are forwarded first; nmodules and the
+//      collection mode are read from that data word.
+//   2. Until an addr word carries the last flag, each addr word describes one
+//      packet of 128 data words. The packet is forwarded and written to HBM,
+//      and packet mask / packet count are tracked per (module, frame parity)
+//      slot. When a slot moves on to a new frame, the previous frame is
+//      handed to write_completion().
+//   3. Slots 0 .. nmodules * 2 - 1 that still hold packets go to
+//      write_completion() with flushing = 1; one more data word is forwarded.
+//
+//   packets_processed - number of packets handled by the main loop so far
+//   idle              - 1 until the first addr/data words have been forwarded
+//                       and again after step 3, 0 in between
+//   err_reg           - bit 1 is set when the final word of a packet does not
+//                       have `last` set
+//   hbm_size          - frame handles cycle through 0 .. hbm_size - 1, so the
+//                       value must not be 0
+void save_to_hbm(STREAM_512 &data_in,
+                 hls::stream > &addr_in,
+                 STREAM_512 &data_out,
+                 hls::stream > &addr_out,
+                 hls::burst_maxi d_hbm_p0, hls::burst_maxi d_hbm_p1,
+                 hls::burst_maxi d_hbm_p2, hls::burst_maxi d_hbm_p3,
+                 STREAM_512 &completion_out,
+                 volatile uint64_t &packets_processed,
+                 volatile ap_uint<1> &idle,
+                 ap_uint<8> &err_reg,
+                 uint32_t hbm_size) {
+#pragma HLS INTERFACE ap_ctrl_none port=return
+#pragma HLS INTERFACE register both axis port=data_in
+#pragma HLS INTERFACE register both axis port=addr_in
+#pragma HLS INTERFACE register both axis port=data_out
+#pragma HLS INTERFACE register both axis port=addr_out
+#pragma HLS INTERFACE register both axis port=completion_out
+#pragma HLS INTERFACE register ap_vld port=packets_processed
+#pragma HLS INTERFACE register ap_vld port=err_reg
+#pragma HLS INTERFACE register ap_none port=idle
+#pragma HLS INTERFACE register ap_stable port=hbm_size
+
+#pragma HLS INTERFACE m_axi port=d_hbm_p0 bundle=d_hbm_p0 depth=512 offset=off \
+        max_read_burst_length=2 max_write_burst_length=16 latency=120 num_write_outstanding=8 num_read_outstanding=2
+#pragma HLS INTERFACE m_axi port=d_hbm_p1 bundle=d_hbm_p1 depth=512 offset=off \
+        max_read_burst_length=2 max_write_burst_length=16 latency=120 num_write_outstanding=8 num_read_outstanding=2
+#pragma HLS INTERFACE m_axi port=d_hbm_p2 bundle=d_hbm_p2 depth=512 offset=off \
+        max_read_burst_length=2 max_write_burst_length=16 latency=120 num_write_outstanding=8 num_read_outstanding=2
+#pragma HLS INTERFACE m_axi port=d_hbm_p3 bundle=d_hbm_p3 depth=512 offset=off \
+        max_read_burst_length=2 max_write_burst_length=16 latency=120 num_write_outstanding=8 num_read_outstanding=2
+
+    // Per-slot state; slot index = module_number * 2 + (frame_number % 2).
+    ap_uint<128> packet_mask[MAX_MODULES_FPGA*2];
+#pragma HLS RESOURCE variable=packet_mask core=RAM_1P
+    ap_uint<16> packet_count[MAX_MODULES_FPGA*2];
+#pragma HLS RESOURCE variable=packet_count core=RAM_1P
+    ap_uint<32> handle[MAX_MODULES_FPGA*2];
+#pragma HLS RESOURCE variable=handle core=RAM_1P
+    ap_uint<64> curr_frame[MAX_MODULES_FPGA*2];
+#pragma HLS RESOURCE variable=curr_frame core=RAM_1P
+    ap_uint<32> debug[MAX_MODULES_FPGA*2];
+#pragma HLS RESOURCE variable=debug core=RAM_1P
+    ap_uint<64> timestamp[MAX_MODULES_FPGA*2];
+#pragma HLS RESOURCE variable=timestamp core=RAM_1P
+    ap_uint<32> exptime[MAX_MODULES_FPGA*2];
+#pragma HLS RESOURCE variable=exptime core=RAM_1P
+    ap_uint<64> jf_bunchid[MAX_MODULES_FPGA*2];
+#pragma HLS RESOURCE variable=jf_bunchid core=RAM_1P
+
+    idle = 1;
+
+    for (int i = 0; i < MAX_MODULES_FPGA*2; i++) {
+#pragma HLS UNROLL
+        curr_frame[i] = UINT64_MAX;
+        handle[i] = 0;
+        packet_mask[i] = 0;
+        packet_count[i] = 0;
+        debug[i] = 0;
+        timestamp[i] = 0;
+        exptime[i] = 0;
+        jf_bunchid[i] = 0;
+    }
+
+    uint32_t handle_val = 0;
+
+    // First addr word and first data word: forwarded, data word also decoded.
+    ap_uint addr;
+    addr_in >> addr;
+    addr_out << addr;
+
+    packet_512_t packet_in;
+    data_in >> packet_in;
+    data_out << packet_in;
+
+    ap_uint<5> nmodules = ACT_REG_NMODULES(packet_in.data);
+    ap_uint<32> data_collection_mode = ACT_REG_MODE(packet_in.data);
+    ap_uint<32> data_collection_id = data_collection_mode(31, 16); // upper 16-bit of mode
+
+    ap_uint<1> mode_nonblocking = (data_collection_mode & MODE_NONBLOCKING_ON_WR) ? 1 : 0;
+
+    ap_uint<8> internal_err_reg = 0;
+    err_reg = internal_err_reg;
+
+    idle = 0;
+    uint64_t total_counter = 0;
+    packets_processed = 0;
+    addr_in >> addr;
+    addr_out << addr;
+
+    Loop_good_packet:
+    while (!addr_last_flag(addr)) {
+        // Process one UDP packet per iteration
+#pragma HLS PIPELINE II=128
+        ap_uint<64> frame_number = addr_frame_number(addr);
+        ap_uint<4> module_number = addr_module(addr);
+        ap_uint<7> eth_packet = addr_eth_packet(addr);
+        ap_uint<5> id = module_number * 2 + (frame_number % 2);
+
+        if (curr_frame[id] != frame_number) {
+            // The slot is taken over by a new frame: report the frame it held.
+            if (packet_mask[id] != 0) {
+                ap_uint<32> comp_handle = handle[id];
+                ap_uint<64> comp_frame = curr_frame[id];
+                ap_uint<128> comp_packet_mask = packet_mask[id];
+                ap_uint<16> comp_packet_count = packet_count[id];
+                ap_uint<32> comp_debug = debug[id];
+                ap_uint<64> comp_timestamp = timestamp[id];
+                ap_uint<64> comp_bunchid = jf_bunchid[id];
+                ap_uint<32> comp_exptime = exptime[id];
+
+                write_completion(completion_out, comp_handle, module_number,
+                                 comp_frame, comp_packet_mask, comp_packet_count,
+                                 comp_debug, comp_timestamp, comp_bunchid,
+                                 comp_exptime, data_collection_id, 0);
+            }
+
+            handle[id] = handle_val;
+            curr_frame[id] = frame_number;
+
+            debug[id] = addr_jf_debug(addr);
+            timestamp[id] = addr_timestamp(addr);
+            jf_bunchid[id] = addr_bunch_id(addr);
+            exptime[id] = addr_exptime(addr);
+
+            packet_mask[id] = ap_uint<128>(1) << eth_packet;
+            packet_count[id] = 1;
+
+            handle_val = (handle_val + 1) % hbm_size;
+        } else {
+            packet_count[id]++;
+            packet_mask[id] |= ap_uint<128>(1) << eth_packet;
+        }
+
+        // Offset in port words: 64 words per packet on each port, 128 packet
+        // positions per handle.
+        size_t out_frame_addr = (handle[id] * 128 + eth_packet) * 64;
+
+        // Each iteration takes two 512-bit words and sends one 256-bit half to
+        // each port; writes are grouped into bursts of 16.
+        for (int i = 0; i < 64; i++) {
+            if (i % 16 == 0) {
+                d_hbm_p0.write_request(out_frame_addr + i, 16);
+                d_hbm_p1.write_request(out_frame_addr + i, 16);
+                d_hbm_p2.write_request(out_frame_addr + i, 16);
+                d_hbm_p3.write_request(out_frame_addr + i, 16);
+            }
+
+            data_in >> packet_in;
+            data_out << packet_in;
+            d_hbm_p0.write(packet_in.data(255, 0));
+            d_hbm_p1.write(packet_in.data(511, 256));
+
+            data_in >> packet_in;
+            data_out << packet_in;
+            d_hbm_p2.write(packet_in.data(255, 0));
+            d_hbm_p3.write(packet_in.data(511, 256));
+
+            if (i % 16 == 15) {
+                d_hbm_p0.write_response();
+                d_hbm_p1.write_response();
+                d_hbm_p2.write_response();
+                d_hbm_p3.write_response();
+            }
+        }
+        // packet_in now holds the final word of the packet.
+        if (packet_in.last != 1)
+            internal_err_reg[1] = 1;
+
+        total_counter++;
+        packets_processed = total_counter;
+        addr_in >> addr;
+        addr_out << addr;
+        err_reg = internal_err_reg;
+    }
+
+    // Flush: report every slot that still holds packets.
+    for (ap_uint<8> m = 0; m < nmodules * 2; m++) {
+#pragma HLS PIPELINE II=16
+        if (packet_mask[m] != 0)
+            write_completion(completion_out, handle[m], m / 2, curr_frame[m],
+                             packet_mask[m], packet_count[m],
+                             debug[m], timestamp[m], jf_bunchid[m],
+                             exptime[m], data_collection_id, 1);
+    }
+
+    // One more data word is forwarded after the last addr word.
+    data_in >> packet_in;
+    data_out << packet_in;
+
+    idle = 1;
+}
diff --git a/fpga/scripts/bd_pcie.tcl b/fpga/scripts/bd_pcie.tcl index df18c8fa..6c10cfab 100644 --- 
a/fpga/scripts/bd_pcie.tcl +++ b/fpga/scripts/bd_pcie.tcl @@ -390,6 +390,10 @@ proc create_root_design { parentCell } { connect_bd_intf_net -intf_net jungfraujoch_0_m_axi_d_hbm_p9 [get_bd_intf_pins hbm_infrastructure/s_axi_hbm_9] [get_bd_intf_pins jungfraujoch_0/m_axi_d_hbm_p9] connect_bd_intf_net -intf_net jungfraujoch_0_m_axi_d_hbm_p10 [get_bd_intf_pins hbm_infrastructure/s_axi_hbm_10] [get_bd_intf_pins jungfraujoch_0/m_axi_d_hbm_p10] connect_bd_intf_net -intf_net jungfraujoch_0_m_axi_d_hbm_p11 [get_bd_intf_pins hbm_infrastructure/s_axi_hbm_11] [get_bd_intf_pins jungfraujoch_0/m_axi_d_hbm_p11] + connect_bd_intf_net -intf_net jungfraujoch_0_m_axi_d_hbm_p12 [get_bd_intf_pins hbm_infrastructure/s_axi_hbm_12] [get_bd_intf_pins jungfraujoch_0/m_axi_d_hbm_p12] + connect_bd_intf_net -intf_net jungfraujoch_0_m_axi_d_hbm_p13 [get_bd_intf_pins hbm_infrastructure/s_axi_hbm_13] [get_bd_intf_pins jungfraujoch_0/m_axi_d_hbm_p13] + connect_bd_intf_net -intf_net jungfraujoch_0_m_axi_d_hbm_p14 [get_bd_intf_pins hbm_infrastructure/s_axi_hbm_14] [get_bd_intf_pins jungfraujoch_0/m_axi_d_hbm_p14] + connect_bd_intf_net -intf_net jungfraujoch_0_m_axi_d_hbm_p15 [get_bd_intf_pins hbm_infrastructure/s_axi_hbm_15] [get_bd_intf_pins jungfraujoch_0/m_axi_d_hbm_p15] connect_bd_intf_net -intf_net jungfraujoch_0_m_axis_c2h_data [get_bd_intf_pins jungfraujoch_0/m_axis_c2h_data] [get_bd_intf_pins pcie_dma_0/s_axis_c2h_data] connect_bd_intf_net -intf_net jungfraujoch_0_m_axis_c2h_datamover_cmd [get_bd_intf_pins jungfraujoch_0/m_axis_c2h_datamover_cmd] [get_bd_intf_pins pcie_dma_0/s_axis_c2h_cmd] connect_bd_intf_net -intf_net jungfraujoch_0_m_axis_h2c_datamover_cmd [get_bd_intf_pins jungfraujoch_0/m_axis_h2c_datamover_cmd] [get_bd_intf_pins pcie_dma_0/s_axis_h2c_cmd] diff --git a/fpga/scripts/jfjoch.tcl b/fpga/scripts/jfjoch.tcl index 52ef5551..526355f6 100644 --- a/fpga/scripts/jfjoch.tcl +++ b/fpga/scripts/jfjoch.tcl @@ -64,6 +64,14 @@ proc create_hier_cell_jungfraujoch { parentCell nameHier } { 
create_bd_intf_pin -mode Master -vlnv xilinx.com:interface:aximm_rtl:1.0 m_axi_d_hbm_p11 + create_bd_intf_pin -mode Master -vlnv xilinx.com:interface:aximm_rtl:1.0 m_axi_d_hbm_p12 + + create_bd_intf_pin -mode Master -vlnv xilinx.com:interface:aximm_rtl:1.0 m_axi_d_hbm_p13 + + create_bd_intf_pin -mode Master -vlnv xilinx.com:interface:aximm_rtl:1.0 m_axi_d_hbm_p14 + + create_bd_intf_pin -mode Master -vlnv xilinx.com:interface:aximm_rtl:1.0 m_axi_d_hbm_p15 + create_bd_intf_pin -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_c2h_data create_bd_intf_pin -mode Master -vlnv xilinx.com:interface:axis_rtl:1.0 m_axis_c2h_datamover_cmd diff --git a/receiver/HLSSimulatedDevice.cpp b/receiver/HLSSimulatedDevice.cpp index 601022cf..cda6db15 100644 --- a/receiver/HLSSimulatedDevice.cpp +++ b/receiver/HLSSimulatedDevice.cpp @@ -216,11 +216,13 @@ void HLSSimulatedDevice::HLSMainThread() { STREAM_512 converted_1; STREAM_512 converted_2; + STREAM_512 converted_3; hls::stream > addr0; hls::stream > addr1; hls::stream > addr2; hls::stream > addr3; + hls::stream > addr4; hls::stream > udp_metadata; ap_uint<1> idle_data_collection; @@ -306,10 +308,25 @@ void HLSSimulatedDevice::HLSMainThread() { // Timer procedure - count how many times write_data is not accepting input (to help track down latency issues) hls_cores.emplace_back([&] { timer_host(converted_1, converted_2, counter_host); }); + STREAM_512 save_to_hbm_completion; + ap_uint<8> save_to_hbm_err_reg; + uint64_t save_to_hbm_packets_processed; + ap_uint<1> save_to_hbm_idle; + + hls_cores.emplace_back([&] { save_to_hbm(converted_2, addr3, converted_3, addr4, + (hbm256_t *) (hbm_memory[12].data()), + (hbm256_t *) (hbm_memory[13].data()), + (hbm256_t *) (hbm_memory[14].data()), + (hbm256_t *) (hbm_memory[15].data()), + save_to_hbm_completion, + save_to_hbm_packets_processed, + save_to_hbm_idle, + save_to_hbm_err_reg, + 16); }); // 3. 
Prepare data to write to host memory hls_cores.emplace_back([&] { - host_writer(converted_2, addr3, datamover_out.GetDataStream(), + host_writer(converted_3, addr4, datamover_out.GetDataStream(), datamover_out.GetCtrlStream(), work_request_stream, completion_stream, packets_processed, host_writer_idle, err_reg); }); @@ -328,6 +345,9 @@ void HLSSimulatedDevice::HLSMainThread() { if (!addr3.empty()) throw std::runtime_error("Addr3 queue not empty"); + if (!addr4.empty()) + throw std::runtime_error("Addr4 queue not empty"); + if (!raw1.empty()) throw std::runtime_error("Raw1 queue not empty"); @@ -349,6 +369,9 @@ void HLSSimulatedDevice::HLSMainThread() { if (!converted_2.empty()) throw std::runtime_error("Converted_2 queue not empty"); + if (!converted_3.empty()) + throw std::runtime_error("Converted_3 queue not empty"); + if (!datamover_in.GetDataStream().empty()) throw std::runtime_error("Datamover queue is not empty");