From 8dd42a478a39e70e752eb2219c3fd9e655d79fcf Mon Sep 17 00:00:00 2001 From: Konrad Bucheli Date: Tue, 10 Oct 2023 11:39:26 +0200 Subject: [PATCH] how to handle PCIe bus errors --- _toc.yml | 1 + admin-guide/troubleshooting/pcie_bus_error.md | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 admin-guide/troubleshooting/pcie_bus_error.md diff --git a/_toc.yml b/_toc.yml index 6ae68a26..d21ad5e1 100644 --- a/_toc.yml +++ b/_toc.yml @@ -103,6 +103,7 @@ chapters: - file: admin-guide/troubleshooting/boot - file: admin-guide/troubleshooting/kerberos - file: admin-guide/troubleshooting/sssd + - file: admin-guide/troubleshooting/pcie_bus_error - file: admin-guide/order-vm - file: infrastructure-guide/index diff --git a/admin-guide/troubleshooting/pcie_bus_error.md b/admin-guide/troubleshooting/pcie_bus_error.md new file mode 100644 index 00000000..140c6df8 --- /dev/null +++ b/admin-guide/troubleshooting/pcie_bus_error.md @@ -0,0 +1,18 @@ +# PCIe Bus Error + +When there are PCI Express bus errors like +``` +Oct 05 11:26:19 pc16209.psi.ch kernel: pcieport 10000:e0:06.0: AER: TLP Header: 34000000 e1000010 89148914 00000000 +Oct 05 11:26:19 pc16209.psi.ch kernel: pcieport 10000:e0:06.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID) +Oct 05 11:26:19 pc16209.psi.ch kernel: pcieport 10000:e0:06.0: device [8086:464d] error status/mask=00100000/00010000 +Oct 05 11:26:19 pc16209.psi.ch kernel: pcieport 10000:e0:06.0: [20] UnsupReq (First) +``` +you might try disabling **Active State Power Management** (ASPM) in the kernel. + +To do so, set in Hiera + +``` +base::enable_pcie_aspm: false +``` + +then apply it with `puppet agent -t` and reboot.