// SPDX-FileCopyrightText: 2024 Filip Leonarski, Paul Scherrer Institute // SPDX-License-Identifier: CERN-OHL-S-2.0 #include "hls_jfjoch.h" #define o(field) offsetof(ModuleStatistics, field) #define sf(msg, field, s) msg(o(field)*8 + s - 1, o(field)*8) inline ap_uint<1024> fill_module_info(axis_completion &cmpl, ap_uint &pixel_count_result) { ap_uint<1024> msg = 0; sf(msg, frame_number, 64) = cmpl.frame_number; sf(msg, timestamp, 64) = cmpl.timestamp; sf(msg, pulse_id, 64) = cmpl.bunchid; sf(msg, detector_type, 32) = cmpl.detector_type; sf(msg, exptime, 32) = cmpl.exptime; sf(msg, debug, 32) = cmpl.debug; sf(msg, pedestal, 32) = cmpl.pedestal; sf(msg, packet_count, 32) = cmpl.packet_count; sf(msg, module_number, 32) = cmpl.module; sf(msg, err_pixels, 32) = pixel_count_result(31, 0); sf(msg, saturated_pixels, 32) = pixel_count_result(63, 32); sf(msg, max_value, 32) = pixel_count_result(95, 64); sf(msg, min_value, 32) = pixel_count_result(127,96); sf(msg, pixel_sum, 64) = pixel_count_result(191,128); sf(msg, masked_pixels, 32) = pixel_count_result(223, 192); return msg; } inline void write_completion(hls::stream > &m_axis_completion, const ap_uint<16> &handle, const ap_uint<16> &data_collection_id) { #pragma HLS INLINE m_axis_completion << (data_collection_id, handle); } void read_request(hls::stream > &s_axis_work_request, ap_uint<16> &handle) { #pragma HLS INLINE ap_uint<32> tmp1; s_axis_work_request >> tmp1; handle = tmp1(15, 0); } void host_writer(STREAM_512 &data_in, hls::stream> &adu_histo_in, hls::stream> &integration_in, hls::stream> &spot_finder_in, hls::stream> &roi_count_in, hls::stream> &pixel_calc_in, hls::stream &s_axis_completion, hls::stream > &host_memory_out, hls::stream &datamover_out_cmd, hls::stream > &s_axis_work_request, hls::stream > &m_axis_completion, const uint64_t *dma_address_table, volatile uint64_t &packets_processed, volatile ap_uint<1> &idle, volatile ap_uint<1> &in_cancel, volatile ap_uint<3> &state) { #pragma HLS INTERFACE ap_ctrl_none port=return #pragma HLS INTERFACE register both axis port=data_in #pragma HLS INTERFACE register both axis port=adu_histo_in #pragma HLS INTERFACE register both axis port=integration_in #pragma HLS INTERFACE register both axis port=spot_finder_in #pragma HLS INTERFACE register both axis port=roi_count_in #pragma HLS INTERFACE register both axis port=pixel_calc_in #pragma HLS INTERFACE register both axis port=s_axis_completion #pragma HLS INTERFACE register both axis port=host_memory_out #pragma HLS INTERFACE register both axis port=datamover_out_cmd #pragma HLS INTERFACE register both axis port=m_axis_completion #pragma HLS INTERFACE register both axis port=s_axis_work_request #pragma HLS INTERFACE register ap_vld port=packets_processed #pragma HLS INTERFACE register ap_none port=idle #pragma HLS INTERFACE register ap_none port=in_cancel #pragma HLS INTERFACE register ap_none port=state #pragma HLS INTERFACE mode=m_axi port=dma_address_table bundle=dma_address_table depth=65536 offset=off \ max_read_burst_length=2 max_write_burst_length=2 latency=10 num_write_outstanding=1 num_read_outstanding=1 idle = 1; state = 0; ap_uint<16> req_handle; ap_uint<16> req_handle_send = HANDLE_START; ap_uint<64> req_host_offset; while (data_in.empty()) { if (!s_axis_work_request.empty()) { #pragma HLS PIPELINE II=1 read_request(s_axis_work_request, req_handle); write_completion(m_axis_completion, req_handle, DATA_COLLECTION_ID_PURGE); } } packet_512_t packet; { #pragma HLS PROTOCOL fixed data_in >> packet; ap_wait(); } ap_uint<32> data_collection_mode = ACT_REG_MODE(packet.data); ap_uint<32> data_collection_id = data_collection_mode(31, 16); ap_uint<1> mode_32bit = (data_collection_mode & MODE_32BIT_OUTPUT) ? 1 : 0; ap_uint<1> mode_8bit = (data_collection_mode & MODE_8BIT_OUTPUT) ? 1 : 0; uint64_t internal_packets_processed = 0; packets_processed = internal_packets_processed; write_completion(m_axis_completion, HANDLE_START, data_collection_id); idle = 0; size_t pixel_depth; if (mode_32bit) pixel_depth = 4; else if (mode_8bit) pixel_depth = 1; else pixel_depth = 2; axis_completion cmpl; s_axis_completion >> cmpl; state = 1; while (!cmpl.last) { ap_uint<1> send_images = 1; while (s_axis_work_request.empty() && !in_cancel.read()) { #pragma HLS PIPELINE II=1 ap_wait(); } if (s_axis_work_request.empty()) { send_images = 0; state = 2; } else state = 3; // Either send_images == 0 (so collection can proceed without writing to host mem) // or s_axis_work_request is not empty (so collection can proceed with writing to host mem) if (send_images) { read_request(s_axis_work_request, req_handle); req_host_offset = dma_address_table[req_handle]; send_images = (req_host_offset != 0); } packet_512_t packet_out; packet_out.strb = UINT64_MAX; packet_out.keep = UINT64_MAX; packet_out.dest = 0; packet_out.user = 0; packet_out.id = 0; packet_out.last = 0; if (send_images) { setup_datamover(datamover_out_cmd, req_host_offset, RAW_MODULE_SIZE * pixel_depth); setup_datamover(datamover_out_cmd, req_host_offset + offsetof(DeviceOutput, spot_finding_result), (RAW_MODULE_SIZE / 8 + 64)); setup_datamover(datamover_out_cmd, req_host_offset + offsetof(DeviceOutput, integration_result), (FPGA_INTEGRATION_BIN_COUNT / 8) * 64); setup_datamover(datamover_out_cmd, req_host_offset + offsetof(DeviceOutput, adu_histogram), ADU_HISTO_BIN_COUNT / 16 * 64); setup_datamover(datamover_out_cmd, req_host_offset + offsetof(DeviceOutput, roi_counts), FPGA_ROI_COUNT * 64); setup_datamover(datamover_out_cmd, req_host_offset + offsetof(DeviceOutput, module_statistics), 3 * 64); } for (int i = 0; i < RAW_MODULE_SIZE * pixel_depth / 64; i++) { #pragma HLS PIPELINE II=1 data_in >> packet; packet_out.data = packet.data; if (i == RAW_MODULE_SIZE * pixel_depth / 64 - 1) packet_out.last = 1; else packet_out.last = 0; if (send_images) host_memory_out << packet_out; } packet_out.last = 0; // 256 transfers x 512-bit for (int i = 0; i < RAW_MODULE_SIZE / (8 * 64) + 1; i++) { #pragma HLS PIPELINE II=1 spot_finder_in >> packet_out.data; packet_out.last = (i == RAW_MODULE_SIZE / (8 * 64)); if (send_images) host_memory_out << packet_out; } // 128 transfers x 512-bit for (int i = 0; i < FPGA_INTEGRATION_BIN_COUNT / 8; i++) { #pragma HLS PIPELINE II=1 integration_in >> packet_out.data; packet_out.last = (i == FPGA_INTEGRATION_BIN_COUNT / 8 - 1); if (send_images) host_memory_out << packet_out; } // 128 transfers x 512-bit for (int i = 0; i < ADU_HISTO_BIN_COUNT / 16; i++) { #pragma HLS PIPELINE II=1 ap_uint<512> tmp; adu_histo_in >> packet_out.data; packet_out.last = (i == ADU_HISTO_BIN_COUNT / 16 - 1); if (send_images) host_memory_out << packet_out; } // 8 transfers x 512-bit for (int i = 0; i < FPGA_ROI_COUNT; i++) { #pragma HLS PIPELINE II=1 ap_uint<512> tmp; roi_count_in >> packet_out.data; packet_out.last = (i == FPGA_ROI_COUNT - 1); if (send_images) host_memory_out << packet_out; } if (send_images) state = 5; else state = 4; ap_uint pixel_count_p; pixel_calc_in >> pixel_count_p; ap_uint<1024> tmp = fill_module_info(cmpl, pixel_count_p); if (send_images) { packet_out.data = tmp(511,0); packet_out.last = 0; host_memory_out << packet_out; packet_out.data = tmp(1023,512); packet_out.last = 0; host_memory_out << packet_out; } packet_out.data = cmpl.packet_mask; packet_out.last = 1; if (send_images) host_memory_out << packet_out; if (send_images) { if (req_handle_send != HANDLE_START) write_completion(m_axis_completion, req_handle_send, data_collection_id); req_handle_send = req_handle; internal_packets_processed += cmpl.packet_count; packets_processed = internal_packets_processed; } state = 6; s_axis_completion >> cmpl; state = 7; } data_in >> packet; #ifdef JFJOCH_HLS_NOSYNTH while (!host_memory_out.empty()) std::this_thread::sleep_for(std::chrono::milliseconds(1)); #else { // wait for 5 ms (ensure that PCIe buffer is clean) #pragma HLS PROTOCOL for (int i = 0; i < 5 * 200 * 1000; i++) ap_wait(); } #endif if (req_handle_send != HANDLE_START) write_completion(m_axis_completion, req_handle_send, data_collection_id); write_completion(m_axis_completion, HANDLE_END, data_collection_id); idle = 1; }