// Copyright (2019-2023) Paul Scherrer Institute #include "HLSSimulatedDevice.h" #include #include #include "../fpga/hls/datamover_model.h" #include "../fpga/hls/hls_jfjoch.h" uint16_t checksum(const uint16_t *addr, size_t count) { /* Compute Internet Checksum for "count" bytes * beginning at location "addr". */ long sum = 0; for (int i = 0; i < count / 2; i++) sum += addr[i]; /* Add left-over byte, if any */ if (count % 2 == 1) sum += ((uint8_t *) addr)[count / 2]; /* Fold 32-bit sum to 16 bits */ while (sum>>16) sum = (sum & 0xffff) + (sum >> 16); return ~sum; } HLSSimulatedDevice::HLSSimulatedDevice(uint16_t data_stream, size_t in_frame_buffer_size_modules, int16_t numa_node) : FPGAAcquisitionDevice(data_stream), datamover_in(Direction::Input, nullptr), datamover_out(Direction::Output, nullptr), datamover_out_hbm_0(Direction::Output, (char *) hbm.data()), datamover_out_hbm_1(Direction::Output, (char *) hbm.data()), datamover_in_hbm_0(Direction::Input, (char *) hbm.data()), datamover_in_hbm_1(Direction::Input, (char *) hbm.data()), idle(true), hbm(hbm_if_size / 32 * hbm_if_count), dma_address_table(65536) { mac_addr = 0xCCAA11223344; ipv4_addr = 0x0132010A; max_modules = MAX_MODULES_FPGA; MapBuffersStandard(in_frame_buffer_size_modules, numa_node); auto in_mem_location32 = (uint32_t *) dma_address_table.data(); for (int i = 0; i < buffer_device.size(); i++) { in_mem_location32[2 * i ] = ((uint64_t) buffer_device[i]) & UINT32_MAX; in_mem_location32[2 * i + 1] = ((uint64_t) buffer_device[i]) >> 32; } } void HLSSimulatedDevice::CreateFinalPacket(const DiffractionExperiment& experiment) { CreateJFPacket(experiment, UINT64_MAX, 0, 0, nullptr, false); } void HLSSimulatedDevice::SendPacket(char *buffer, int len, uint8_t user) { auto obuff = (ap_uint<512> *)buffer; for (int i = 0; i < (len + 63) / 64; i++) { packet_512_t packet_in; if (i == (len + 63) / 64 - 1) packet_in.last = 1; else packet_in.last = 0; packet_in.keep = 0xFFFFFFFFFFFFFFFF; packet_in.user = user; packet_in.data = obuff[i]; din_eth.write(packet_in); } } void HLSSimulatedDevice::CreateJFPacket(const DiffractionExperiment& experiment, uint64_t frame_number, uint32_t eth_packet, uint32_t module_number, const uint16_t *data, int8_t adjust_axis, uint8_t user) { char buff[256*64]; memset(buff, 0, 256*64); auto packet = (jf_raw_packet *)buff; packet->ether_type = htons(0x0800); packet->sour_mac[0] = 0x00; // module 0 uint64_t tmp_mac = mac_addr; for (int i = 0; i < 6; i++) packet->dest_mac[i] = (tmp_mac >> (8*i)) % 256; uint32_t half_module = 2 * module_number | ((eth_packet >= 64) ? 1 : 0); packet->ipv4_header_h = htons(0x4500); // Big endian in IP header! packet->ipv4_header_total_length = htons(8268); // Big endian in IP header! packet->ipv4_header_dest_ip = ipv4_addr; packet->ipv4_header_sour_ip = experiment.GetSrcIPv4Address(data_stream, half_module); packet->ipv4_header_ttl_protocol = htons(0x0011); packet->ipv4_header_checksum = checksum( (uint16_t *) &packet->ipv4_header_h, 20); // checksum is already in network order packet->udp_dest_port = htons(GetUDPPort()); // module number packet->udp_sour_port = htons(0xDFAC); packet->udp_length = htons(8248); // JF headers are little endian packet->jf.detectortype = SLS_DETECTOR_TYPE_JUNGFRAU; packet->jf.timestamp = 0xABCDEF0000FEDCBAL; packet->jf.bunchid = 0x1234567898765431L; packet->jf.row = half_module; packet->jf.column = 0; packet->jf.framenum = frame_number; packet->jf.packetnum = eth_packet % 64; if (data != nullptr) { for (int i = 0; i < 4096; i++) packet->jf.data[i] = data[i]; } packet->udp_checksum = htons(checksum( (uint16_t *) (buff+42), 8192+48)); SendPacket(buff, (130+adjust_axis)*64, user); } void HLSSimulatedDevice::CreateJFPackets(const DiffractionExperiment& experiment, uint64_t frame_number_0, uint64_t frames, uint32_t module_number, const uint16_t *data) { for (uint64_t i = 0; i < frames; i++) { for (int j = 0; j < 128; j++) CreateJFPacket(experiment, frame_number_0 + i, j, module_number, data + (i * 128 + j) * 4096, 0, 0); } } void HLSSimulatedDevice::CreateEIGERPacket(const DiffractionExperiment &experiment, uint64_t frame_number, uint32_t eth_packet, uint32_t module_number, uint32_t col, uint32_t row, const uint16_t *data) { char buff[256*64]; memset(buff, 0, 256*64); auto packet = (eiger_raw_packet *)buff; packet->ether_type = htons(0x0800); packet->sour_mac[0] = 0x00; // module 0 uint64_t tmp_mac = mac_addr; for (int i = 0; i < 6; i++) packet->dest_mac[i] = (tmp_mac >> (8*i)) % 256; packet->ipv4_header_h = htons(0x4500); // Big endian in IP header! packet->ipv4_header_total_length = htons(4172); // Big endian in IP header! packet->ipv4_header_dest_ip = ipv4_addr; packet->ipv4_header_sour_ip = experiment.GetSrcIPv4Address(data_stream, 0); packet->ipv4_header_ttl_protocol = htons(0x0011); packet->ipv4_header_checksum = checksum( (uint16_t *) &packet->ipv4_header_h, 20); // checksum is already in network order packet->udp_dest_port = htons(GetUDPPort()); // module number packet->udp_sour_port = htons(0xDFAC); packet->udp_length = htons(4152); // JF headers are little endian packet->eiger.detectortype = SLS_DETECTOR_TYPE_EIGER; packet->eiger.timestamp = 0xABCDEF0000FEDCBAL; packet->eiger.bunchid = 0x1234567898765431L; packet->eiger.row = (2 * module_number) | (row % 2); packet->eiger.column = col % 2; packet->eiger.framenum = frame_number; packet->eiger.packetnum = eth_packet % 64; if (data != nullptr) { for (int i = 0; i < 2048; i++) packet->eiger.data[i] = data[i]; } packet->udp_checksum = htons(checksum( (uint16_t *) (buff+42), 4096+48)); SendPacket(buff, 66*64, 0); } AXI_STREAM & HLSSimulatedDevice::OutputStream() { return dout_eth; } void HLSSimulatedDevice::HW_ReadActionRegister(DataCollectionConfig *job) { memcpy(job, &cfg, sizeof(DataCollectionConfig)); } void HLSSimulatedDevice::HW_WriteActionRegister(const DataCollectionConfig *job) { memcpy(&cfg, job, sizeof(DataCollectionConfig)); } void HLSSimulatedDevice::FPGA_StartAction(const DiffractionExperiment &experiment) { if (action_thread.joinable()) action_thread.join(); run_counter += 1; run_data_collection = 1; cancel_data_collection = 0; idle = false; while (!din_frame_generator.empty()) din_frame_generator.read(); datamover_out.ClearCompletedDescriptors(); action_thread = std::thread(&HLSSimulatedDevice::HLSMainThread, this ); } void HLSSimulatedDevice::FrameGeneratorFuture(FrameGeneratorConfig config) { frame_generator(din_frame_generator, hbm.data(), hbm.data(), hbm_if_size, mac_addr, ipv4_addr, cancel_data_collection, config); } void HLSSimulatedDevice::HW_RunInternalGenerator(const FrameGeneratorConfig &config) { frame_generator_future = std::async(std::launch::async, &HLSSimulatedDevice::FrameGeneratorFuture, this, config); } void HLSSimulatedDevice::FPGA_EndAction() { if (action_thread.joinable()) action_thread.join(); } HLSSimulatedDevice::~HLSSimulatedDevice() { if (action_thread.joinable()) action_thread.join(); } bool HLSSimulatedDevice::HW_ReadMailbox(uint32_t *values) { std::unique_lock ul(completion_mutex); ap_uint<32> tmp; bool ret = completion_stream.read_nb(tmp); values[0] = tmp; // equivalent to driver functionality if (ret) { uint32_t data_collection_id = (values[0] >> 16) & 0xFFFF; uint32_t handle = values[0] & 0xFFFF; if (handle == HANDLE_START) completion_count = 0; else if ((handle != HANDLE_END) && (data_collection_id != DATA_COLLECTION_ID_PURGE)) { completion_count++; while (completion_count * DMA_DESCRIPTORS_PER_MODULE > datamover_out.GetCompletedDescriptors()) std::this_thread::sleep_for(std::chrono::milliseconds(1)); } } return ret; } void HLSSimulatedDevice::Cancel() { cancel_data_collection = 1; } bool HLSSimulatedDevice::HW_IsIdle() const { return idle && datamover_out.IsIdle(); } bool HLSSimulatedDevice::HW_SendWorkRequest(uint32_t handle) { work_request_stream.write(handle); return true; } inline uint32_t float2uint(float f) { float_uint32 fu; fu.f = f; return fu.u; } void HLSSimulatedDevice::HLSMainThread() { ap_uint<1> clear_counters = 0; uint64_t packets_processed; std::vector hls_cores; STREAM_512 ip1, udp1, udp2, icmp1, arp1; STREAM_512 network0; STREAM_768 stream_768_0; STREAM_768 stream_768_1; STREAM_768 stream_768_2; STREAM_768 stream_768_3; STREAM_768 stream_768_4; STREAM_768 stream_768_5; STREAM_512 data_0; hls::stream, 2> data_1; hls::stream, 2> data_2; STREAM_512 data_3; STREAM_512 data_4; STREAM_512 data_5; STREAM_512 data_6; STREAM_512 data_7; STREAM_512 data_8; STREAM_512 data_9; STREAM_512 data_10; STREAM_512 data_12; STREAM_512 data_13; hls::stream addr0; hls::stream addr1; hls::stream addr2; hls::stream addr3; hls::stream axi_compl[12]; hls::stream> hbm_handles; hls::stream> adu_histo_result; hls::stream> integration_result_0; hls::stream> integration_result_1; hls::stream> spot_finder_result_0; hls::stream> spot_finder_result_1; hls::stream> spot_finder_conn_0; hls::stream> spot_finder_result_2; hls::stream> spot_finder_result_3; hls::stream> spot_finder_mask_0; hls::stream> roi_calc_result_0; hls::stream > udp_metadata; volatile ap_uint<1> idle_data_collection = 1; ap_uint<1> load_to_hbm_idle; ap_uint<1> save_to_hbm_idle; ap_uint<1> integration_idle; ap_uint<1> stream_conv_idle; ap_uint<1> frame_summation_idle; volatile bool done = false; volatile bool udp_done = false; // done AND udp_idle volatile bool sls_done = false; // done AND sls_idle // Sent gratuitous ARP message arp(arp1, dout_eth, mac_addr, ipv4_addr, 1, 1); Logger logger_hls("HLS"); volatile rcv_state_t state = RCV_INIT; // Start data collection data_collection_fsm(data_0, data_1, addr0, addr1, run_data_collection, cancel_data_collection, idle_data_collection, cfg.mode, float2uint(cfg.energy_kev), cfg.nframes, cfg.nmodules, cfg.nstorage_cells, cfg.nsummation , state); run_data_collection = 0; data_collection_fsm(data_0, data_1, addr0, addr1, run_data_collection, cancel_data_collection, idle_data_collection, cfg.mode, float2uint(cfg.energy_kev), cfg.nframes, cfg.nmodules, cfg.nstorage_cells, cfg.nsummation, state); hls_cores.emplace_back([&] { while (!udp_done) { if (din_eth.empty() && din_frame_generator.empty()) std::this_thread::sleep_for(std::chrono::microseconds(10)); else stream_merge(din_eth, din_frame_generator, network0, data_source); } logger_hls.Info("Stream_merge done"); }); hls_cores.emplace_back([&] { while (!udp_done) { if (network0.empty()) std::this_thread::sleep_for(std::chrono::microseconds(10)); else ethernet(network0, ip1, arp1, mac_addr, eth_packets, clear_counters); } logger_hls.Info("ethernet done"); }); hls_cores.emplace_back([&] { while (!udp_done) { if (ip1.empty()) std::this_thread::sleep_for(std::chrono::microseconds(10)); else ipv4(ip1, udp1, icmp1, ipv4_addr); } logger_hls.Info("ipv4 done"); }); hls_cores.emplace_back([&] { ap_uint<1> udp_idle = 1; while (!done || !udp_idle) { if (udp1.empty()) std::this_thread::sleep_for(std::chrono::microseconds(10)); else udp(udp1, udp2, udp_metadata, udp_packets, clear_counters, udp_idle); } udp_done = true; logger_hls.Info("udp done"); }); hls_cores.emplace_back([&] { ap_uint<1> sls_idle = 1; while (!done || !sls_idle) { if (udp2.empty()) std::this_thread::sleep_for(std::chrono::microseconds (10)); else sls_detector(udp2, udp_metadata, data_0, addr0, sls_packets, udp_eth_err, udp_len_err, current_pulse_id, clear_counters, sls_idle); } sls_done = true; logger_hls.Info("sls_detector done"); }); // 1. Parse incoming UDP packets hls_cores.emplace_back([&] { while ((state != RCV_WAIT_FOR_START) || (idle_data_collection.read() == 0) || (!data_0.empty())) { data_collection_fsm(data_0, data_1, addr0, addr1, run_data_collection, cancel_data_collection, idle_data_collection, cfg.mode, float2uint(cfg.energy_kev), cfg.nframes, cfg.nmodules, cfg.nstorage_cells, cfg.nsummation, state); } done = true; logger_hls.Info("data collection done"); }); // 2. Cache images in HBM hls_cores.emplace_back([&] { save_to_hbm(addr1, axi_compl[0], hbm_handles, datamover_out_hbm_0.GetCtrlStream(), datamover_out_hbm_1.GetCtrlStream(), save_to_hbm_idle, hbm_if_size);}); hls_cores.emplace_back([&] { save_to_hbm_data(data_1, data_3, datamover_out_hbm_0.GetDataStream(), datamover_out_hbm_1.GetDataStream());}); hls_cores.emplace_back([&] {frame_summation_reorder_compl(data_3, data_4, axi_compl[0], axi_compl[1]);}); hls_cores.emplace_back([&] { load_from_hbm(data_4, data_5, axi_compl[1], axi_compl[2], hbm_handles, datamover_in_hbm_0.GetDataStream(), datamover_in_hbm_1.GetDataStream(), datamover_in_hbm_0.GetCtrlStream(), datamover_in_hbm_1.GetCtrlStream(), load_to_hbm_idle, hbm_if_size);}); hls_cores.emplace_back([&] { pedestal(data_5, data_6, axi_compl[2], axi_compl[3], hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm_if_size);}); // 3. Calculate histogram of ADU values hls_cores.emplace_back([&] { adu_histo(data_6, data_7, adu_histo_result, axi_compl[3], axi_compl[4]);}); // 4. Mask missing pixels hls_cores.emplace_back([&] { mask_missing(data_7, data_8, axi_compl[4], axi_compl[5]);}); // 5. Handle EIGER packets properly hls_cores.emplace_back([&] { eiger_reorder(data_8, data_9, axi_compl[5], axi_compl[6]);}); // 6. Apply pedestal & gain corrections hls_cores.emplace_back([&] { jf_conversion(data_9, stream_768_0, axi_compl[6], axi_compl[7], hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm.data(), hbm_if_size); }); // 7. Frame summation hls_cores.emplace_back([&] { frame_summation(stream_768_0, stream_768_1, axi_compl[7], axi_compl[8], frame_summation_idle);}); // 8. Integration of pixels hls_cores.emplace_back([&] { integration(stream_768_1, stream_768_2, integration_result_0, axi_compl[8], axi_compl[9], hbm.data(), hbm.data(), hbm.data(), hbm.data(), integration_idle, hbm_if_size);}); hls_cores.emplace_back([&] { axis_64_to_512(integration_result_0, integration_result_1);}); // 9. Spot finding ap_uint<32> tmp_snr_threshold = float2uint(spot_finder_parameters.snr_threshold); ap_uint<32> min_d = float2uint(spot_finder_parameters.min_d); ap_uint<32> max_d = float2uint(spot_finder_parameters.max_d); ap_int<32> tmp_count_threshold = spot_finder_parameters.count_threshold; ap_uint<32> min_pix_per_spot = spot_finder_parameters.min_pix_per_spot; hls_cores.emplace_back([&] { spot_finder_mask(stream_768_2, stream_768_3, spot_finder_mask_0, axi_compl[9], axi_compl[10], hbm.data(), hbm.data(), min_d, max_d, hbm_if_size); logger_hls.Info("spot_finder_mask done"); }); hls_cores.emplace_back([&] { spot_finder(stream_768_3, spot_finder_mask_0, stream_768_4, spot_finder_result_0, tmp_count_threshold, tmp_snr_threshold); logger_hls.Info("spot_finder done"); }); hls_cores.emplace_back([&] { spot_finder_connectivity(spot_finder_result_0, spot_finder_result_1, spot_finder_conn_0); logger_hls.Info("spot_finder_connectivity done"); }); hls_cores.emplace_back([&] { spot_finder_merge(spot_finder_result_1, spot_finder_conn_0, spot_finder_result_2, min_pix_per_spot); logger_hls.Info("spot_finder_merge done"); }); hls_cores.emplace_back([&] { axis_32_to_512(spot_finder_result_2, spot_finder_result_3); logger_hls.Info("axis_32_to_512 done"); }); hls_cores.emplace_back([&] { roi_calc(stream_768_4, stream_768_5, roi_calc_result_0, axi_compl[10], axi_compl[11], hbm.data(), hbm.data(), hbm_if_size); logger_hls.Info("roi_calc done"); }); // 10. Reduce/extend 24-bit stream hls_cores.emplace_back([&] { stream_24bit_conv(stream_768_5, data_12, stream_conv_idle);}); // 11. Prepare data to write to host memory hls_cores.emplace_back([&] { ap_uint<3> state; host_writer(data_12, adu_histo_result, integration_result_1, spot_finder_result_3, roi_calc_result_0, axi_compl[11], datamover_out.GetDataStream(), datamover_out.GetCtrlStream(), work_request_stream, completion_stream, dma_address_table.data(), packets_processed, host_writer_idle, cancel_data_collection, state); logger_hls.Info("host_writer done"); }); for (auto &i : hls_cores) i.join(); if (frame_generator_future.valid()) frame_generator_future.get(); // reset static counter arp(arp1, dout_eth, mac_addr, ipv4_addr, 0, 1); try { while (!din_eth.empty()) din_eth.read(); if (!addr1.empty()) throw std::runtime_error("Addr1 queue not empty"); if (!addr2.empty()) throw std::runtime_error("Addr2 queue not empty"); if (!addr3.empty()) throw std::runtime_error("Addr3 queue not empty"); if (!data_1.empty()) throw std::runtime_error("data_1 queue not empty"); if (!data_2.empty()) throw std::runtime_error("data_2 queue not empty"); if (!data_3.empty()) throw std::runtime_error("data_3 queue not empty"); if (!data_4.empty()) throw std::runtime_error("data_4 queue not empty"); if (!data_5.empty()) throw std::runtime_error("data_5 queue not empty"); if (!data_7.empty()) throw std::runtime_error("data_7 queue not empty"); if (!data_8.empty()) throw std::runtime_error("data_8 queue not empty"); if (!data_9.empty()) throw std::runtime_error("data_9 queue not empty"); if (!data_10.empty()) throw std::runtime_error("data_10 queue not empty"); if (!data_12.empty()) throw std::runtime_error("data_12 queue not empty"); if (!data_13.empty()) throw std::runtime_error("data_13 queue not empty"); for (auto &c: axi_compl) { if (!c.empty()) throw std::runtime_error("Compl queue not empty"); } if (!stream_768_0.empty()) throw std::runtime_error("stream_768_0 queue not empty"); if (!stream_768_1.empty()) throw std::runtime_error("stream_768_1 queue not empty"); if (!stream_768_2.empty()) throw std::runtime_error("stream_768_2 queue not empty"); if (!stream_768_3.empty()) throw std::runtime_error("stream_768_3 queue not empty"); if (!stream_768_4.empty()) throw std::runtime_error("stream_768_4 queue not empty"); if (!hbm_handles.empty()) throw std::runtime_error("Handles queue not empty"); if (!integration_result_0.empty()) throw std::runtime_error("Integration result queue not empty"); if (!integration_result_1.empty()) throw std::runtime_error("Integration result queue not empty"); if (!datamover_in.GetDataStream().empty()) throw std::runtime_error("Datamover queue is not empty"); while (!datamover_out.IsIdle()) std::this_thread::sleep_for(std::chrono::milliseconds(100)); } catch (const std::runtime_error &e) { if (logger) logger->ErrorException(e); throw e; } if (logger) logger->Info("Packets Eth {} UDP {} SLS {} Proc {}", eth_packets, udp_packets, sls_packets, packets_processed); idle = true; } DataCollectionStatus HLSSimulatedDevice::GetDataCollectionStatus() const { DataCollectionStatus status{}; union { uint64_t u64; double d; } pulse_id_conv; pulse_id_conv.u64 = current_pulse_id; status.ctrl_reg = ap_uint<1>(host_writer_idle) ? (1 << 4) : 0; status.max_modules = max_modules; status.hbm_size_bytes = hbm_if_size; status.run_counter = run_counter; status.current_pulseid = pulse_id_conv.d; status.packets_udp = udp_packets; status.packets_eth = eth_packets; status.packets_sls = sls_packets; return status; } void HLSSimulatedDevice::HW_LoadCalibration(const LoadCalibrationConfig &config) { int ret = load_calibration(hbm.data(), hbm.data(), config, hbm_if_size, datamover_in.GetCtrlStream(), datamover_in.GetDataStream(), dma_address_table.data()); if (ret) throw JFJochException(JFJochExceptionCategory::AcquisitionDeviceError, "Error in loading calibration " + std::to_string(ret)); if (!datamover_in.GetDataStream().empty()) throw std::runtime_error("Datamover queue is not empty"); } void HLSSimulatedDevice::HW_SetSpotFinderParameters(const SpotFinderParameters ¶ms) { spot_finder_parameters = params; } void HLSSimulatedDevice::HW_SetDataSource(uint32_t val) { data_source = val; } uint32_t HLSSimulatedDevice::HW_GetDataSource() { return data_source; } void HLSSimulatedDevice::CreateXfelBunchIDPacket(double pulse_id, uint32_t event_code) { union { uint64_t u64; double d; } pulse_id_conv; pulse_id_conv.d = pulse_id; bunchid_raw_packet packet{}; packet.ether_type = htons(0x0800); for (int i = 0; i < 6; i++) packet.dest_mac[i] = 0xFF; packet.ipv4_header_h = htons(0x4500); // Big endian in IP header! packet.ipv4_header_total_length = htons(sizeof(bunchid_payload) + 8 + 20); // Big endian in IP header! packet.ipv4_header_dest_ip = UINT32_MAX; packet.ipv4_header_ttl_protocol = htons(0x0011); packet.ipv4_header_checksum = checksum( (uint16_t *) &packet.ipv4_header_h, 20); // checksum is already in network order packet.udp_length = htons(sizeof(bunchid_payload) + 8); packet.payload.magicn[0] = BUNCHID_MAGICN; packet.payload.magicn[1] = BUNCHID_MAGICN; packet.payload.magicn[2] = BUNCHID_MAGICN; packet.payload.bunchid_msb[0] = (pulse_id_conv.u64 >> 32) & UINT32_MAX; packet.payload.bunchid_msb[1] = (pulse_id_conv.u64 >> 32) & UINT32_MAX; packet.payload.bunchid_lsb[0] = pulse_id_conv.u64 & UINT32_MAX; packet.payload.bunchid_lsb[1] = pulse_id_conv.u64 & UINT32_MAX; packet.payload.montrig = event_code; SendPacket((char *) &packet, sizeof(bunchid_raw_packet)); }