378 lines
12 KiB
C++
378 lines
12 KiB
C++
// Copyright (2019-2022) Paul Scherrer Institute
|
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
#include "HLSSimulatedDevice.h"
|
|
|
|
#include <bitset>
|
|
#include <arpa/inet.h>
|
|
#include "datamover_model.h"
|
|
|
|
/*
 * Compute the Internet checksum for "count" bytes beginning at location "addr".
 *
 * The buffer is summed as 16-bit words in host byte order; when "count" is odd the
 * final byte of the buffer is added on its own. Carries are folded back into the
 * low 16 bits and the bit-inverted result is returned.
 *
 * addr  - start of the buffer (read as 16-bit words)
 * count - number of BYTES (not words) to include in the sum
 * Returns the one's complement of the folded 16-bit sum (0xFFFF for count == 0).
 */
uint16_t checksum(const uint16_t *addr, size_t count) {
    // Unsigned 64-bit accumulator: the width of "long" is platform dependent, and
    // shifting/inverting a signed value is best avoided.
    uint64_t sum = 0;

    const size_t full_words = count / 2;
    for (size_t i = 0; i < full_words; i++)
        sum += addr[i];

    /* Add left-over byte, if any. It is the LAST byte of the buffer (byte index
     * count - 1); byte index count / 2 lies in the middle of the data. */
    if (count % 2 == 1)
        sum += reinterpret_cast<const uint8_t *>(addr)[count - 1];

    /* Fold the sum to 16 bits */
    while (sum >> 16)
        sum = (sum & 0xffff) + (sum >> 16);

    return static_cast<uint16_t>(~sum);
}
|
|
|
|
// Construct the simulated device: set the module limit, map the host buffers, publish the
// address of every device buffer through in_mem_location and size the simulated HBM blocks.
//
// data_stream                  - forwarded to the FPGAAcquisitionDevice base class
// in_frame_buffer_size_modules - forwarded to MapBuffersStandard()
// numa_node                    - forwarded to MapBuffersStandard()
HLSSimulatedDevice::HLSSimulatedDevice(uint16_t data_stream, size_t in_frame_buffer_size_modules, int16_t numa_node)
        : FPGAAcquisitionDevice(data_stream),
          datamover_in(Direction::Input),
          datamover_out(Direction::Output, nullptr, 256),
          idle(true) {
    max_modules = MAX_MODULES_FPGA;

    MapBuffersStandard(in_frame_buffer_size_modules, (3 + 3 * 16) * max_modules + 2, numa_node);

    // Same count as requested from MapBuffersStandard() above: (3 + 3 * 16) buffers per module plus 2.
    const auto device_buffer_count = max_modules * (3 + 3 * 16) + 2;

    // Each buffer address is stored as two consecutive 32-bit words: low half first, then high half.
    auto address_table = (uint32_t *) in_mem_location;
    for (int slot = 0; slot < device_buffer_count; slot++) {
        const auto buffer_address = (uint64_t) buffer_device[slot];
        address_table[2 * slot] = buffer_address & UINT32_MAX;
        address_table[2 * slot + 1] = buffer_address >> 32;
    }

    // Full-size alternative: hbm_block.resize(SIZE_OF_HBM_BLOCK_IN_BYTES);
    // only 32 MiB instead of 256 MiB per HBM interface (should be more than enough for all the tests anyway)
    for (auto &hbm_block : hbm_memory)
        hbm_block.resize(32 * 1024 * 1024);
}
|
|
|
|
// Queue one packet via CreatePacketJF() with frame number UINT64_MAX, packet 0, module 0,
// no pixel payload and the trigger flag off. The remaining CreatePacketJF() arguments
// are left at their defaults.
void HLSSimulatedDevice::CreateFinalPacket(const DiffractionExperiment& experiment) {
    const uint64_t frame_number = UINT64_MAX;
    const uint32_t eth_packet = 0;
    const uint32_t module_number = 0;
    const uint16_t *payload = nullptr;
    const bool trigger = false;

    CreatePacketJF(experiment, frame_number, eth_packet, module_number, payload, trigger);
}
|
|
|
|
void HLSSimulatedDevice::SendPacket(char *buffer, int len, uint8_t user) {
|
|
auto obuff = (ap_uint<512> *)buffer;
|
|
|
|
for (int i = 0; i < (len + 63) / 64; i++) {
|
|
packet_512_t packet_in;
|
|
if (i == (len + 63) / 64 - 1) packet_in.last = 1;
|
|
else packet_in.last = 0;
|
|
packet_in.keep = 0xFFFFFFFFFFFFFFFF;
|
|
packet_in.user = user;
|
|
packet_in.data = obuff[i];
|
|
din_eth.write(packet_in);
|
|
}
|
|
|
|
}
|
|
void HLSSimulatedDevice::CreatePacketJF(const DiffractionExperiment& experiment, uint64_t frame_number, uint32_t eth_packet,
|
|
uint32_t module_number, const uint16_t *data, bool trigger, int8_t adjust_axis, uint8_t user) {
|
|
|
|
char buff[256*64];
|
|
memset(buff, 0, 256*64);
|
|
|
|
auto packet = (jf_raw_packet *)buff;
|
|
|
|
packet->ether_type = htons(0x0800);
|
|
packet->sour_mac[0] = 0x00; // module 0
|
|
|
|
uint64_t tmp_mac = fpga_mac_addr;
|
|
for (int i = 0; i < 6; i++)
|
|
packet->dest_mac[i] = (tmp_mac >> (8*i)) % 256;
|
|
|
|
uint32_t half_module = 2 * module_number | ((eth_packet >= 64) ? 1 : 0);
|
|
|
|
packet->ipv4_header_h = htons(0x4500); // Big endian in IP header!
|
|
packet->ipv4_header_total_length = htons(8268); // Big endian in IP header!
|
|
packet->ipv4_header_dest_ip = fpga_ipv4_addr;
|
|
packet->ipv4_header_sour_ip = experiment.GetSrcIPv4Address(data_stream, half_module);
|
|
|
|
packet->ipv4_header_ttl_protocol = htons(0x0011);
|
|
packet->ipv4_header_checksum = checksum( (uint16_t *) &packet->ipv4_header_h, 20); // checksum is already in network order
|
|
|
|
packet->udp_dest_port = htons(GetUDPPort()); // module number
|
|
packet->udp_sour_port = htons(0xDFAC);
|
|
packet->udp_length = htons(8248);
|
|
|
|
// JF headers are little endian
|
|
packet->jf.timestamp = 0xABCDEF0000FEDCBAL;
|
|
packet->jf.bunchid = 0x1234567898765431L;
|
|
packet->jf.xCoord = half_module;
|
|
packet->jf.framenum = frame_number;
|
|
packet->jf.packetnum = eth_packet % 64;
|
|
if (trigger) packet->jf.debug = 1<<31;
|
|
if (data != nullptr) {
|
|
for (int i = 0; i < 4096; i++)
|
|
packet->jf.data[i] = data[i];
|
|
}
|
|
packet->udp_checksum = htons(checksum( (uint16_t *) (buff+42), 8192+48));
|
|
|
|
SendPacket(buff, (130+adjust_axis)*64, user);
|
|
}
|
|
|
|
void HLSSimulatedDevice::CreatePackets(const DiffractionExperiment& experiment, uint64_t frame_number_0, uint64_t frames,
|
|
uint32_t module_number, const uint16_t *data, bool trigger, int8_t adjust_axis,
|
|
uint8_t user) {
|
|
for (uint64_t i = 0; i < frames; i++) {
|
|
for (int j = 0; j < 128; j++)
|
|
CreatePacketJF(experiment, frame_number_0 + i, j, module_number, data + (i * 128 + j) * 4096, trigger, adjust_axis,
|
|
user);
|
|
}
|
|
}
|
|
|
|
// Accessor for dout_eth, the stream that HLSMainThread() passes to arp() and icmp().
// Returns a mutable reference; the caller does not take ownership.
AXI_STREAM & HLSSimulatedDevice::OutputStream() {
    AXI_STREAM &outgoing = dout_eth;
    return outgoing;
}
|
|
|
|
// Copy the currently stored action configuration (cfg) into *job, byte for byte.
// job must point to a valid ActionConfig.
void HLSSimulatedDevice::HW_ReadActionRegister(ActionConfig *job) {
    const size_t config_bytes = sizeof(*job);
    memcpy(job, &cfg, config_bytes);
}
|
|
|
|
// Overwrite the stored action configuration (cfg) with *job, byte for byte.
// job must point to a valid ActionConfig.
void HLSSimulatedDevice::HW_WriteActionRegister(const ActionConfig *job) {
    const size_t config_bytes = sizeof(*job);
    memcpy(&cfg, job, config_bytes);
}
|
|
|
|
void HLSSimulatedDevice::FPGA_StartAction() {
|
|
if (action_thread.joinable())
|
|
action_thread.join();
|
|
|
|
run_data_collection = 1;
|
|
cancel_data_collection = 0;
|
|
idle = false;
|
|
action_thread = std::thread(&HLSSimulatedDevice::HLSMainThread, this );
|
|
}
|
|
|
|
void HLSSimulatedDevice::FPGA_EndAction() {
|
|
if (action_thread.joinable())
|
|
action_thread.join();
|
|
}
|
|
|
|
// Join a still-joinable action thread before the object goes away, so the std::thread
// member is never destroyed while joinable.
HLSSimulatedDevice::~HLSSimulatedDevice() {
    if (!action_thread.joinable())
        return;

    action_thread.join();
}
|
|
|
|
bool HLSSimulatedDevice::HW_ReadMailbox(uint32_t values[12]) {
|
|
if (completion_stream.size() < 12)
|
|
return false;
|
|
|
|
for (int i = 0; i < 12; i++)
|
|
values[i] = completion_stream.read();
|
|
|
|
return true;
|
|
}
|
|
|
|
// Raise the cancel flag. HLSMainThread() hands cancel_data_collection to
// data_collection_fsm() and internal_packet_generator(); FPGA_StartAction() clears it again.
void HLSSimulatedDevice::HW_SetCancelDataCollectionBit() {
    cancel_data_collection = 1;
}
|
|
|
|
bool HLSSimulatedDevice::HW_IsIdle() const {
|
|
return idle && datamover_out.IsIdle();
|
|
}
|
|
|
|
|
|
bool HLSSimulatedDevice::HW_SendWorkRequest(uint32_t handle) {
|
|
uint64_t address = (handle == UINT32_MAX) ? 0 : (uint64_t) buffer_device.at(handle);
|
|
uint32_t parity = (std::bitset<32>(handle).count() + std::bitset<64>(address).count()) % 2;
|
|
|
|
work_request_stream.write(handle);
|
|
work_request_stream.write(address >> 32);
|
|
work_request_stream.write(address & UINT32_MAX);
|
|
work_request_stream.write(parity);
|
|
return true;
|
|
}
|
|
|
|
void HLSSimulatedDevice::HLSMainThread() {
|
|
uint64_t counter_hbm;
|
|
uint64_t counter_host;
|
|
uint64_t eth_packets;
|
|
uint64_t icmp_packets;
|
|
uint64_t udp_packets;
|
|
uint64_t sls_packets;
|
|
uint32_t udp_len_err;
|
|
uint32_t udp_eth_err;
|
|
|
|
ap_uint<1> clear_counters = 0;
|
|
|
|
uint64_t packets_processed;
|
|
|
|
std::vector<std::thread> hls_cores;
|
|
|
|
STREAM_512 ip1, udp1, udp2, icmp1, arp1;
|
|
|
|
STREAM_512 raw0;
|
|
STREAM_512 raw1;
|
|
STREAM_512 raw2;
|
|
STREAM_512 raw3;
|
|
STREAM_512 raw4;
|
|
|
|
hls::stream<ap_uint<18 * 32> > pedestalG0_subtracted;
|
|
|
|
STREAM_512 converted_1;
|
|
STREAM_512 converted_2;
|
|
|
|
hls::stream<ap_uint<ADDR_STREAM_WIDTH> > addr0;
|
|
hls::stream<ap_uint<ADDR_STREAM_WIDTH> > addr1;
|
|
hls::stream<ap_uint<ADDR_STREAM_WIDTH> > addr2;
|
|
hls::stream<ap_uint<ADDR_STREAM_WIDTH> > addr3;
|
|
|
|
hls::stream<ap_uint<UDP_METADATA_STREAM_WIDTH> > udp_metadata;
|
|
ap_uint<1> idle_data_collection;
|
|
|
|
ap_uint<8> err_reg;
|
|
|
|
std::vector<ap_uint<256>> d_uram_p0(MAX_MODULES_FPGA * RAW_MODULE_SIZE / 32);
|
|
std::vector<ap_uint<256>> d_uram_p1(MAX_MODULES_FPGA * RAW_MODULE_SIZE / 32);
|
|
|
|
while(!din_eth.empty())
|
|
ethernet(din_eth, ip1, arp1, fpga_mac_addr, eth_packets, clear_counters);
|
|
|
|
while(!ip1.empty())
|
|
ipv4(ip1, udp1, icmp1, fpga_ipv4_addr);
|
|
|
|
arp(arp1,
|
|
dout_eth,
|
|
fpga_mac_addr,
|
|
fpga_ipv4_addr,
|
|
1, run_data_collection);
|
|
|
|
while (!arp1.empty()) {
|
|
arp(arp1,
|
|
dout_eth,
|
|
fpga_mac_addr,
|
|
fpga_ipv4_addr,
|
|
1, run_data_collection);
|
|
}
|
|
|
|
// reset static counter
|
|
arp(arp1,
|
|
dout_eth,
|
|
fpga_mac_addr,
|
|
fpga_ipv4_addr,
|
|
0, run_data_collection);
|
|
|
|
while(!icmp1.empty())
|
|
icmp(icmp1, dout_eth, icmp_packets, clear_counters);
|
|
|
|
while (!udp1.empty())
|
|
udp(udp1, udp2, udp_metadata, udp_packets, clear_counters);
|
|
|
|
while (!udp2.empty())
|
|
sls_detector(udp2, udp_metadata, raw0, addr0, sls_packets, udp_eth_err, udp_len_err, clear_counters);
|
|
|
|
// 1. Parse incoming UDP packets
|
|
idle_data_collection = 0;
|
|
hls_cores.emplace_back([&] {
|
|
while (idle_data_collection == 0) {
|
|
data_collection_fsm(raw0, raw1,
|
|
addr0, addr1,
|
|
run_data_collection,
|
|
cancel_data_collection,
|
|
idle_data_collection,
|
|
cfg.mode,
|
|
cfg.one_over_energy,
|
|
cfg.nframes,
|
|
cfg.nmodules,
|
|
cfg.nstorage_cells);
|
|
run_data_collection = 0;
|
|
}
|
|
});
|
|
|
|
// Load external calibration
|
|
hls_cores.emplace_back([&] { load_calibration(raw1, raw2, datamover_in.GetCtrlStream(), datamover_in.GetDataStream(),
|
|
in_mem_location); });
|
|
|
|
// Generate internal packets
|
|
hls_cores.emplace_back([&] { internal_packet_generator(raw2, raw3, addr1, addr2, cancel_data_collection); });
|
|
|
|
// Timer procedure - count how many times pedestal_corr/gain_corr is not accepting input (to help track down latency issues)
|
|
hls_cores.emplace_back([&] { timer_hbm(raw3, raw4, counter_hbm); });
|
|
|
|
// 2. Apply pedestal & gain corrections
|
|
hls_cores.emplace_back([&] { jf_conversion(raw4, converted_1,
|
|
addr2, addr3,
|
|
d_uram_p0.data(), d_uram_p1.data(),
|
|
(hbm256_t *) (hbm_memory[0].data()), (hbm256_t *) (hbm_memory[1].data()),
|
|
(hbm256_t *) (hbm_memory[2].data()), (hbm256_t *) (hbm_memory[3].data()),
|
|
(hbm256_t *) (hbm_memory[4].data()), (hbm256_t *) (hbm_memory[5].data()),
|
|
(hbm256_t *) (hbm_memory[6].data()), (hbm256_t *) (hbm_memory[7].data()),
|
|
(hbm256_t *) (hbm_memory[8].data()), (hbm256_t *) (hbm_memory[9].data())); });
|
|
|
|
// Timer procedure - count how many times write_data is not accepting input (to help track down latency issues)
|
|
hls_cores.emplace_back([&] { timer_host(converted_1, converted_2, counter_host); });
|
|
|
|
|
|
// 3. Prepare data to write to host memory
|
|
hls_cores.emplace_back([&] {
|
|
host_writer(converted_2, addr3, datamover_out.GetDataStream(),
|
|
datamover_out.GetCtrlStream(), work_request_stream, completion_stream,
|
|
packets_processed, err_reg); });
|
|
|
|
for (auto &i : hls_cores)
|
|
i.join();
|
|
|
|
if (!din_eth.empty())
|
|
throw std::runtime_error("din_eth queue not empty");
|
|
|
|
if (!addr1.empty())
|
|
throw std::runtime_error("Addr1 queue not empty");
|
|
|
|
if (!addr2.empty())
|
|
throw std::runtime_error("Addr2 queue not empty");
|
|
|
|
if (!addr3.empty())
|
|
throw std::runtime_error("Addr3 queue not empty");
|
|
|
|
if (!raw1.empty())
|
|
throw std::runtime_error("Raw1 queue not empty");
|
|
|
|
if (!raw2.empty())
|
|
throw std::runtime_error("Raw2 queue not empty");
|
|
|
|
if (!raw3.empty())
|
|
throw std::runtime_error("Raw3 queue not empty");
|
|
|
|
if (!raw4.empty())
|
|
throw std::runtime_error("Raw4 queue not empty");
|
|
|
|
if (!pedestalG0_subtracted.empty())
|
|
throw std::runtime_error("PedestalG0_subtracted queue not empty");
|
|
|
|
if (!converted_1.empty())
|
|
throw std::runtime_error("Converted_1 queue not empty");
|
|
|
|
if (!converted_2.empty())
|
|
throw std::runtime_error("Converted_2 queue not empty");
|
|
|
|
if (!datamover_in.GetDataStream().empty())
|
|
throw std::runtime_error("Datamover queue is not empty");
|
|
|
|
if (!work_request_stream.empty())
|
|
throw std::runtime_error("Work request stream is not empty");
|
|
|
|
if (err_reg != 0)
|
|
throw std::runtime_error("Error reg for frame_statistics not zero, val=" + std::to_string(err_reg));
|
|
|
|
while (!datamover_out.IsIdle())
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
|
|
idle = true;
|
|
}
|
|
|
|
uint32_t HLSSimulatedDevice::HW_GetIPv4Address() const {
|
|
return fpga_ipv4_addr;
|
|
}
|
|
|
|
uint64_t HLSSimulatedDevice::HW_GetMACAddress() const {
|
|
return fpga_mac_addr;
|
|
}
|
|
|
|
// Fill *status for the simulated device: the whole structure is zeroed, then max_modules
// is copied from this object and modules_internal_packet_generator is set to 1.
// status must point to a valid ActionStatus.
void HLSSimulatedDevice::HW_GetStatus(ActionStatus *status) const {
    memset(status, 0, sizeof(*status));

    status->max_modules = max_modules;
    status->modules_internal_packet_generator = 1;
}