diff --git a/_toc.yml b/_toc.yml index 476bcac5..83b239c4 100644 --- a/_toc.yml +++ b/_toc.yml @@ -95,16 +95,20 @@ chapters: - file: admin-guide/updates - file: admin-guide/troubleshooting sections: - - file: admin-guide/troubleshooting/methods-and-tools - file: admin-guide/troubleshooting/deployment + - file: admin-guide/troubleshooting/systemd + - file: admin-guide/troubleshooting/journald + - file: admin-guide/troubleshooting/network - file: admin-guide/troubleshooting/login - - file: admin-guide/troubleshooting/package_management + - file: admin-guide/troubleshooting/packages - file: admin-guide/troubleshooting/selinux - file: admin-guide/troubleshooting/boot - file: admin-guide/troubleshooting/kerberos - file: admin-guide/troubleshooting/gvfs - file: admin-guide/troubleshooting/sssd - file: admin-guide/troubleshooting/pam + - file: admin-guide/troubleshooting/filesystem + - file: admin-guide/troubleshooting/processes - file: admin-guide/troubleshooting/pcie_bus_error - file: admin-guide/order-vm diff --git a/admin-guide/certificates.md b/admin-guide/certificates.md index 747ce7bd..06943634 100644 --- a/admin-guide/certificates.md +++ b/admin-guide/certificates.md @@ -1,9 +1,9 @@ -# Managing HTTPS/SSL Certificates +# HTTPS Certificates We use DigiCert certificates. -## Requesting certificates +## Request a Certificate First create a certificate signing request (CSR) like this, replacing the values for `FQDN` and `ALIASES` @@ -56,12 +56,12 @@ Our teams practice is to always create a new private key and to back it up encry - for the rest in our [team secret store](https://git.psi.ch/linux-infra/core-linux-secrets) -## Renewing certificates +## Renew Certificate Using the same configuration file as above, generate a new private key and CSR, and submit the CSR as before.
-## Revoke certificates +## Revoke Certificate If you would like to revoke a DigiCert certificate, please send an e-mail to pki@psi.ch diff --git a/admin-guide/troubleshooting/filesystem.md b/admin-guide/troubleshooting/filesystem.md new file mode 100644 index 00000000..29110c4d --- /dev/null +++ b/admin-guide/troubleshooting/filesystem.md @@ -0,0 +1,37 @@ +# Filesystem + +Check filesystem capacity using `df`: + +```bash +[root@lx ~]# df -h +Filesystem Size Used Avail Use% Mounted on +/dev/mapper/vg_root-lv_root 8.0G 1.4G 6.7G 17% / +devtmpfs 909M 0 909M 0% /dev +tmpfs 920M 0 920M 0% /dev/shm +tmpfs 920M 816K 920M 1% /run +tmpfs 920M 0 920M 0% /sys/fs/cgroup +/dev/sda1 976M 198M 728M 22% /boot +/dev/mapper/vg_root-lv_tmp 1014M 34M 981M 4% /tmp +/dev/mapper/vg_root-lv_var 2.9G 1.4G 1.5G 47% /var +/dev/mapper/vg_root-lv_var_log 2.0G 160M 1.9G 8% /var/log +/dev/mapper/vg_root-lv_openafs 1008M 1.3M 956M 1% /var/cache/openafs +tmpfs 184M 4.0K 184M 1% /run/user/0 +``` + +Check available inodes (~ the maximum number of files that can be created): + +```bash +[root@lx ~]# df -i +Filesystem Inodes IUsed IFree IUse% Mounted on +/dev/mapper/vg_root-lv_root 4194304 48891 4145413 2% / +devtmpfs 232630 383 232247 1% /dev +tmpfs 235485 1 235484 1% /dev/shm +tmpfs 235485 575 234910 1% /run +tmpfs 235485 16 235469 1% /sys/fs/cgroup +/dev/sda1 65536 348 65188 1% /boot +/dev/mapper/vg_root-lv_tmp 524288 316 523972 1% /tmp +/dev/mapper/vg_root-lv_var 1474560 1042691 431869 71% /var +/dev/mapper/vg_root-lv_var_log 1048576 81 1048495 1% /var/log +/dev/mapper/vg_root-lv_openafs 65536 11 65525 1% /var/cache/openafs +tmpfs 235485 2 235483 1% /run/user/0 +``` \ No newline at end of file diff --git a/admin-guide/troubleshooting/journald.md b/admin-guide/troubleshooting/journald.md new file mode 100644 index 00000000..a93f5df0 --- /dev/null +++ b/admin-guide/troubleshooting/journald.md @@ -0,0 +1,45 @@ +# Journald + +- List all reboots/show logs starting at a specific reboot: +```bash
+journalctl --list-boots + -10 19d173f56d314912820486b9ddfd7d6c Thu 2018-06-21 11:20:55 CEST—Thu 2018-06-21 + -9 3a5a050289314221a3863b88a0eef367 Thu 2018-06-21 11:26:33 CEST—Thu 2018-06-21 + -8 f9726e6c9ce44678ab68a2fc12b1c12c Thu 2018-06-21 11:43:38 CEST—Thu 2018-06-21 + -7 b4e6bc84ff8840adbc698992cd1900d2 Thu 2018-06-21 14:55:42 CEST—Thu 2018-06-21 + -6 81b78d0d09934937a24a73bfcd3d8ede Thu 2018-06-21 15:06:18 CEST—Thu 2018-06-21 + -5 dd78e29c073448ad9731c6c18288c97a Thu 2018-06-21 15:23:15 CEST—Thu 2018-06-21 + -4 0fc2f05d12664d3aba6364102401d5fb Thu 2018-06-21 15:29:36 CEST—Thu 2018-06-21 + -3 412bbe36d12546bab749a2a63fad99ca Thu 2018-06-21 15:34:19 CEST—Thu 2018-06-21 + -2 c5189f2006c245d7833bb8fe20e62545 Thu 2018-06-21 16:07:51 CEST—Thu 2018-06-21 + -1 7c47950edd194ff4b6a67d3556672430 Thu 2018-06-21 16:11:28 CEST—Thu 2018-06-21 + 0 61ea098edc924030aafb7a822c2df0e3 Thu 2018-06-21 16:26:46 CEST—Wed 2018-06-27 + +journalctl -b + -- Logs begin at Thu 2018-06-21 11:20:55 CEST, end at Wed 2018-06-27 14:20:01 CEST. -- + Jun 21 16:26:46 lxdev05.psi.ch systemd-journal[85]: Runtime journal is using 8.0M (max allowed 91.9M, trying to leave 137.9M free of 911.8M av + Jun 21 16:26:46 lxdev05.psi.ch kernel: Initializing cgroup subsys cpuset + +journalctl -b -2 + -- Logs begin at Thu 2018-06-21 11:20:55 CEST, end at Wed 2018-06-27 14:20:01 CEST. 
-- + Jun 21 16:07:51 lxdev05.psi.ch systemd-journal[87]: Runtime journal is using 8.0M (max allowed 91.9M, trying to leave 137.9M free of 911.8M av + Jun 21 16:07:51 lxdev05.psi.ch kernel: Initializing cgroup subsys cpuset +``` + +- Show logs starting from a given date/time: +```bash +journalctl --since 2018-06-23 +journalctl --since '2018-06-23 18:13' +``` + +- Show logs for a given unit, e.g. a given service: +```bash +journalctl -u sshd.service +journalctl -u pli-puppet-run.timer +``` + +- Show the last N messages (1000 by default): +```bash +journalctl -e +journalctl -e -n 250 +``` diff --git a/admin-guide/troubleshooting/methods-and-tools.rst b/admin-guide/troubleshooting/methods-and-tools.rst deleted file mode 100644 index 3b694e8f..00000000 --- a/admin-guide/troubleshooting/methods-and-tools.rst +++ /dev/null @@ -1,269 +0,0 @@ -=================== - Methods and Tools -=================== - -This section covers the general methods and tools available for troubleshooting -RHEL Linux systems. - -Methodology -=========== - -When solving problems it is helpful to use a structured approach (as opposed to -randomly trying things until the system seems to work again) and to keep notes. - -The `Google SRE book `_ has useful -information, especially the `chapter on troubleshooting -`_ - - -Tools -===== - - -Services --------- - -Services can be inspected with :manpage:`systemctl(1)`.
Example:: - - ● sssd.service - System Security Services Daemon - Loaded: loaded (/usr/lib/systemd/system/sssd.service; enabled; vendor preset: disabled) - Active: active (running) since Thu 2018-06-21 16:26:48 CEST; 5 days ago - Main PID: 691 (sssd) - CGroup: /system.slice/sssd.service - ├─691 /usr/sbin/sssd -i --logger=files - ├─746 /usr/libexec/sssd/sssd_be --domain D.PSI.CH --uid 0 --gid 0 --logger=files - ├─758 /usr/libexec/sssd/sssd_nss --uid 0 --gid 0 --logger=files - └─759 /usr/libexec/sssd/sssd_pam --uid 0 --gid 0 --logger=files - - Jun 21 16:26:48 lxdev05.psi.ch systemd[1]: Starting System Security Services Daemon... - Jun 21 16:26:48 lxdev05.psi.ch sssd[691]: Starting up - Jun 21 16:26:48 lxdev05.psi.ch sssd[be[D.PSI.CH]][746]: Starting up - Jun 21 16:26:48 lxdev05.psi.ch sssd[pam][759]: Starting up - Jun 21 16:26:48 lxdev05.psi.ch sssd[nss][758]: Starting up - Jun 21 16:26:48 lxdev05.psi.ch systemd[1]: Started System Security Services Daemon. - Jun 25 10:59:22 lxdev05.psi.ch [sssd[krb5_child[5223]]][5223]: Preauthentication failed - Jun 25 10:59:24 lxdev05.psi.ch [sssd[krb5_child[5224]]][5224]: Preauthentication failed - Jun 25 10:59:24 lxdev05.psi.ch [sssd[krb5_child[5224]]][5224]: Preauthentication failed - - -Processes ---------- - -Processes can be investigated through a variety of tools: - -1. The files in ``/proc/$PID/``, in particular - - a) ``/proc/$PID/fd/*``: the open files of the process - b) ``/proc/$PID/environ``: the process' environment - -2. :manpage:`strace(1)` allows tracing a process' system calls. -3. :manpage:`ltrace(1)` allows tracing a process' library calls. - -.. note:: Both :manpage:`strace(1)` and :manpage:`ltrace(1)` slow the target - process down **a lot**, which might cause problems. - - -System and Application Logs ---------------------------- - -Starting with RHEL 7 almost all system logs end up in the journal, which can be -queried with ``journalctl``. 
One important exception is :manpage:`sssd(8)`, -which provides authentication against Active Directory. Its logs can be found in -``/var/log/sssd``. - - -:manpage:`journalctl` offers a lot of functionality. The following list shows -the most important features: - -1. List all reboots/show logs starting at a specific reboot:: - - # journalctl --list-boots - -10 19d173f56d314912820486b9ddfd7d6c Thu 2018-06-21 11:20:55 CEST—Thu 2018-06-21 - -9 3a5a050289314221a3863b88a0eef367 Thu 2018-06-21 11:26:33 CEST—Thu 2018-06-21 - -8 f9726e6c9ce44678ab68a2fc12b1c12c Thu 2018-06-21 11:43:38 CEST—Thu 2018-06-21 - -7 b4e6bc84ff8840adbc698992cd1900d2 Thu 2018-06-21 14:55:42 CEST—Thu 2018-06-21 - -6 81b78d0d09934937a24a73bfcd3d8ede Thu 2018-06-21 15:06:18 CEST—Thu 2018-06-21 - -5 dd78e29c073448ad9731c6c18288c97a Thu 2018-06-21 15:23:15 CEST—Thu 2018-06-21 - -4 0fc2f05d12664d3aba6364102401d5fb Thu 2018-06-21 15:29:36 CEST—Thu 2018-06-21 - -3 412bbe36d12546bab749a2a63fad99ca Thu 2018-06-21 15:34:19 CEST—Thu 2018-06-21 - -2 c5189f2006c245d7833bb8fe20e62545 Thu 2018-06-21 16:07:51 CEST—Thu 2018-06-21 - -1 7c47950edd194ff4b6a67d3556672430 Thu 2018-06-21 16:11:28 CEST—Thu 2018-06-21 - 0 61ea098edc924030aafb7a822c2df0e3 Thu 2018-06-21 16:26:46 CEST—Wed 2018-06-27 - - # journalctl -b - -- Logs begin at Thu 2018-06-21 11:20:55 CEST, end at Wed 2018-06-27 14:20:01 CEST. -- - Jun 21 16:26:46 lxdev05.psi.ch systemd-journal[85]: Runtime journal is using 8.0M (max allowed 91.9M, trying to leave 137.9M free of 911.8M av - Jun 21 16:26:46 lxdev05.psi.ch kernel: Initializing cgroup subsys cpuset - - # journalctl -b -2 - -- Logs begin at Thu 2018-06-21 11:20:55 CEST, end at Wed 2018-06-27 14:20:01 CEST. -- - Jun 21 16:07:51 lxdev05.psi.ch systemd-journal[87]: Runtime journal is using 8.0M (max allowed 91.9M, trying to leave 137.9M free of 911.8M av - Jun 21 16:07:51 lxdev05.psi.ch kernel: Initializing cgroup subsys cpuset - -2. 
Show logs starting from a given date/time:: - - # journalctl --since 2018-06-23 - # journalctl --since '2018-06-23 18:13' - -3. Show logs for a given unit, eg a given service:: - - # journalctl -u sshd.service - # journalctl -u pli-puppet-run.timer - -4. Show the last N messages (1000 by default):: - - # journalctl -e - # journalctl -e -n 250 - -5. List all systemd timers:: - - # journalctl list-timers - NEXT LEFT LAST PASSED UNIT ACTIVATES - Wed 2018-06-27 16:45:01 CEST 2h 16min left Tue 2018-06-26 16:45:01 CEST 21h ago systemd-tmpfiles-clean.timer systemd-tmpfiles-clean.service - Thu 2018-06-28 07:31:00 CEST 17h left Wed 2018-06-27 07:31:25 CEST 6h ago pli-puppet-run.timer pli-puppet-run.service - - 2 timers listed. - Pass --all to see loaded but inactive timers, too. - - -Filesystems and Storage ------------------------ - -Check filesystem capacity using :manpage:`df(1)`:: - - # df -h - Filesystem Size Used Avail Use% Mounted on - /dev/mapper/vg_root-lv_root 8.0G 1.4G 6.7G 17% / - devtmpfs 909M 0 909M 0% /dev - tmpfs 920M 0 920M 0% /dev/shm - tmpfs 920M 816K 920M 1% /run - tmpfs 920M 0 920M 0% /sys/fs/cgroup - /dev/sda1 976M 198M 728M 22% /boot - /dev/mapper/vg_root-lv_tmp 1014M 34M 981M 4% /tmp - /dev/mapper/vg_root-lv_var 2.9G 1.4G 1.5G 47% /var - /dev/mapper/vg_root-lv_var_log 2.0G 160M 1.9G 8% /var/log - /dev/mapper/vg_root-lv_openafs 1008M 1.3M 956M 1% /var/cache/openafs - tmpfs 184M 4.0K 184M 1% /run/user/0 - - -Check available inodes (~ the maximum number of files that can be created):: - - # df -i - Filesystem Inodes IUsed IFree IUse% Mounted on - /dev/mapper/vg_root-lv_root 4194304 48891 4145413 2% / - devtmpfs 232630 383 232247 1% /dev - tmpfs 235485 1 235484 1% /dev/shm - tmpfs 235485 575 234910 1% /run - tmpfs 235485 16 235469 1% /sys/fs/cgroup - /dev/sda1 65536 348 65188 1% /boot - /dev/mapper/vg_root-lv_tmp 524288 316 523972 1% /tmp - /dev/mapper/vg_root-lv_var 1474560 1042691 431869 71% /var - /dev/mapper/vg_root-lv_var_log 1048576 81 1048495 1% 
/var/log - /dev/mapper/vg_root-lv_openafs 65536 11 65525 1% /var/cache/openafs - tmpfs 235485 2 235483 1% /run/user/0 - - -Networking ----------- - -Test hostname resolution with :manpage:`getent(1)`, for example ``getent hosts -www.psi.ch``. Unlike :manpage:`nslookup(1)` or :manpage:`dig(1)`, it uses the -system resolver. - -The systems IP addresses and routes can be displayed using :manpage:`ip(8)`:: - - # ip address - 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1 - link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 - inet 127.0.0.1/8 scope host lo - valid_lft forever preferred_lft forever - inet6 ::1/128 scope host - valid_lft forever preferred_lft forever - 2: ens160: mtu 1500 qdisc pfifo_fast state UP group default qlen 1000 - link/ether 00:50:56:9d:6d:03 brd ff:ff:ff:ff:ff:ff - inet 10.129.160.195/24 brd 10.129.160.255 scope global ens160 - valid_lft forever preferred_lft forever - inet6 fe80::250:56ff:fe9d:6d03/64 scope link - valid_lft forever preferred_lft forever - - # ip route - default via 10.129.160.1 dev ens160 - 10.129.160.0/24 dev ens160 proto kernel scope link src 10.129.160.195 - 169.254.0.0/16 dev ens160 scope link metric 1002 - -The link status and other information of an interface can be displayed using -:manpage:`ethtool(8)`: - -1. Link status:: - - # ethtool ens160 - Settings for ens160: - [...] - Speed: 10000Mb/s - Duplex: Full - [...] - Link detected: yes - -2. 
Statistics (driver-specific, but look for errors/discards/dropped):: - - # ethtool -S ens160 - NIC statistics: - Tx Queue#: 0 - TSO pkts tx: 21529 - TSO bytes tx: 91036062 - ucast pkts tx: 1036632 - ucast bytes tx: 235421707 - mcast pkts tx: 8 - mcast bytes tx: 648 - bcast pkts tx: 7 - bcast bytes tx: 294 - pkts tx err: 0 - pkts tx discard: 0 - drv dropped tx total: 0 - too many frags: 0 - giant hdr: 0 - hdr err: 0 - tso: 0 - ring full: 0 - pkts linearized: 0 - hdr cloned: 0 - giant hdr: 0 - Rx Queue#: 0 - LRO pkts rx: 6913 - LRO byte rx: 100534073 - ucast pkts rx: 551554 - ucast bytes rx: 161369441 - mcast pkts rx: 4 - mcast bytes rx: 344 - bcast pkts rx: 753276 - bcast bytes rx: 45787629 - pkts rx OOB: 0 - pkts rx err: 0 - drv dropped rx total: 0 - err: 0 - fcs: 0 - rx buf alloc fail: 0 - tx timeout count: 0 - - -Packages --------- - -The integratity of installed package can be checked with :manpage:`rpm(8)`:: - - # rpm -Vv pciutils - ......... /usr/sbin/lspci - ......... /usr/sbin/setpci - ......... /usr/sbin/update-pciids - ......... /usr/share/doc/pciutils-3.5.1 - ......... d /usr/share/doc/pciutils-3.5.1/COPYING - ......... d /usr/share/doc/pciutils-3.5.1/ChangeLog - ......... d /usr/share/doc/pciutils-3.5.1/README - ......... d /usr/share/doc/pciutils-3.5.1/pciutils.lsm - ......... d /usr/share/man/man8/lspci.8.gz - ......... d /usr/share/man/man8/setpci.8.gz - ......... d /usr/share/man/man8/update-pciids.8.gz - -Running ``rpm -Vav`` will verify **all** installed packages and take a long -time. See the man page for details on the output format. Changes, especially in -configuration files, can be normal, though. diff --git a/admin-guide/troubleshooting/network.md b/admin-guide/troubleshooting/network.md new file mode 100644 index 00000000..aa9903b0 --- /dev/null +++ b/admin-guide/troubleshooting/network.md @@ -0,0 +1,80 @@ +# Network + +Test hostname resolution with `getent(1)`, for example ``getent hosts +www.psi.ch``.
Unlike `nslookup(1)` or `dig(1)`, it uses the +system resolver. + +The system's IP addresses and routes can be displayed using `ip(8)`: + + # ip address + 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1 + link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 + inet 127.0.0.1/8 scope host lo + valid_lft forever preferred_lft forever + inet6 ::1/128 scope host + valid_lft forever preferred_lft forever + 2: ens160: mtu 1500 qdisc pfifo_fast state UP group default qlen 1000 + link/ether 00:50:56:9d:6d:03 brd ff:ff:ff:ff:ff:ff + inet 10.129.160.195/24 brd 10.129.160.255 scope global ens160 + valid_lft forever preferred_lft forever + inet6 fe80::250:56ff:fe9d:6d03/64 scope link + valid_lft forever preferred_lft forever + + # ip route + default via 10.129.160.1 dev ens160 + 10.129.160.0/24 dev ens160 proto kernel scope link src 10.129.160.195 + 169.254.0.0/16 dev ens160 scope link metric 1002 + +The link status and other information of an interface can be displayed using +`ethtool(8)`: + +1. Link status: + + # ethtool ens160 + Settings for ens160: + [...] + Speed: 10000Mb/s + Duplex: Full + [...] + Link detected: yes + +2.
Statistics (driver-specific, but look for errors/discards/dropped): + + # ethtool -S ens160 + NIC statistics: + Tx Queue#: 0 + TSO pkts tx: 21529 + TSO bytes tx: 91036062 + ucast pkts tx: 1036632 + ucast bytes tx: 235421707 + mcast pkts tx: 8 + mcast bytes tx: 648 + bcast pkts tx: 7 + bcast bytes tx: 294 + pkts tx err: 0 + pkts tx discard: 0 + drv dropped tx total: 0 + too many frags: 0 + giant hdr: 0 + hdr err: 0 + tso: 0 + ring full: 0 + pkts linearized: 0 + hdr cloned: 0 + giant hdr: 0 + Rx Queue#: 0 + LRO pkts rx: 6913 + LRO byte rx: 100534073 + ucast pkts rx: 551554 + ucast bytes rx: 161369441 + mcast pkts rx: 4 + mcast bytes rx: 344 + bcast pkts rx: 753276 + bcast bytes rx: 45787629 + pkts rx OOB: 0 + pkts rx err: 0 + drv dropped rx total: 0 + err: 0 + fcs: 0 + rx buf alloc fail: 0 + tx timeout count: 0 diff --git a/admin-guide/troubleshooting/package_management.md b/admin-guide/troubleshooting/packages.md similarity index 57% rename from admin-guide/troubleshooting/package_management.md rename to admin-guide/troubleshooting/packages.md index 3be6d54d..a4d3f4a2 100644 --- a/admin-guide/troubleshooting/package_management.md +++ b/admin-guide/troubleshooting/packages.md @@ -1,25 +1,27 @@ -# Package Management +# Packages ## Dependency Resolution Failure / Duplicate Packages If e.g. an update attempt ends up in something like -``` -# yum update +```bash +yum update + Loaded plugins: langpacks, search-disabled-repos Resolving Dependencies ...
--> Finished Dependency Resolution Error: Package: sssd-dbus-1.16.5-10.el7_9.12.x86_64 (@rhel7) - Requires: sssd-common = 1.16.5-10.el7_9.12 - Removing: sssd-common-1.16.5-10.el7_9.12.x86_64 (@rhel7) - sssd-common = 1.16.5-10.el7_9.12 - Updated By: sssd-common-1.16.5-10.el7_9.15.x86_64 (rhel7) - sssd-common = 1.16.5-10.el7_9.15 - Removing: sssd-common-1.16.5-10.el7_9.13.x86_64 (@rhel7) + Requires: sssd-common = 1.16.5-10.el7_9.12 + Removing: sssd-common-1.16.5-10.el7_9.12.x86_64 (@rhel7) + sssd-common = 1.16.5-10.el7_9.12 + Updated By: sssd-common-1.16.5-10.el7_9.15.x86_64 (rhel7) + sssd-common = 1.16.5-10.el7_9.15 + Removing: sssd-common-1.16.5-10.el7_9.13.x86_64 (@rhel7) ... ``` + best check if one of these packages is twice listed as installed: -``` -# rpm -qa sssd-common +```bash +rpm -qa sssd-common sssd-common-1.16.5-10.el7_9.13.x86_64 sssd-common-1.16.5-10.el7_9.12.x86_64 ``` @@ -30,7 +32,7 @@ Now you can try to remove one or both packages, maybe you need to remove also pa ### BDB0087 DB_RUNRECOVERY If the error message is -``` +```bash error: rpmdb: BDB0113 Thread/process 21389/140229907089216 failed: BDB1507 Thread died in Berkeley DB library error: db5 error(-30973) from dbenv->failchk: BDB0087 DB_RUNRECOVERY: Fatal error, run database recovery error: cannot open Packages index using db5 - (-30973) @@ -42,7 +44,7 @@ Error: rpmdb open failed then do -``` +```bash mv /var/lib/rpm/__db* /tmp yum clean all ``` @@ -50,7 +52,7 @@ yum clean all ### BDB0091 DB_VERSION_MISMATCH For -``` +```bash RPM: error: db5 error(-30969) from dbenv->open: BDB0091 DB_VERSION_MISMATCH: Database environment version mismatch RPM: error: cannot open Packages index using db5 - (-30969) RPM: error: cannot open Packages database in /var/lib/rpm @@ -58,7 +60,7 @@ Error: Could not run transaction. 
``` do -``` +```bash yum clean all rpm --rebuilddb ``` @@ -67,3 +69,23 @@ rpm --rebuilddb ## Rebuild RPM Database When `yum`, `dnf` or `rpm` hang or have errors pointing to a problem with their database, then follow the [Guide to rebuild a corrupted RPM database](https://www.tecmint.com/rebuild-corrupted-rpm-database-in-centos/) + +## RPM Integrity Check +The integrity of installed packages can be checked with `rpm`: + +```bash +rpm -Vv pciutils +......... /usr/sbin/lspci +......... /usr/sbin/setpci +......... /usr/sbin/update-pciids +......... /usr/share/doc/pciutils-3.5.1 +......... d /usr/share/doc/pciutils-3.5.1/COPYING +......... d /usr/share/doc/pciutils-3.5.1/ChangeLog +......... d /usr/share/doc/pciutils-3.5.1/README +......... d /usr/share/doc/pciutils-3.5.1/pciutils.lsm +......... d /usr/share/man/man8/lspci.8.gz +......... d /usr/share/man/man8/setpci.8.gz +......... d /usr/share/man/man8/update-pciids.8.gz +``` + +Running `rpm -Vav` will verify **all** installed packages and take a long time. See the man page for details on the output format. Changes, especially in configuration files, can be normal, though. \ No newline at end of file diff --git a/admin-guide/troubleshooting/processes.md b/admin-guide/troubleshooting/processes.md new file mode 100644 index 00000000..4a65b0f1 --- /dev/null +++ b/admin-guide/troubleshooting/processes.md @@ -0,0 +1,13 @@ +# Processes + +Processes can be investigated through a variety of tools: + +1. The files in `/proc/$PID/`, in particular + - `/proc/$PID/fd/*`: the open files of the process + - `/proc/$PID/environ`: the process' environment +2. `strace` allows tracing a process' system calls. +3. `ltrace` allows tracing a process' library calls. + +```{note} +Both `strace` and `ltrace` slow the target process down **a lot**, which might cause problems.
+``` diff --git a/admin-guide/troubleshooting/systemd.md b/admin-guide/troubleshooting/systemd.md new file mode 100644 index 00000000..c01fce70 --- /dev/null +++ b/admin-guide/troubleshooting/systemd.md @@ -0,0 +1,26 @@ +# Systemd + + +- List status of services: +```bash +# list status of all services +systemctl status + +# list status of a specific service +systemctl status yourservice +``` + +- List all systemd timers: +```bash +systemctl list-timers + NEXT LEFT LAST PASSED UNIT ACTIVATES + Wed 2018-06-27 16:45:01 CEST 2h 16min left Tue 2018-06-26 16:45:01 CEST 21h ago systemd-tmpfiles-clean.timer systemd-tmpfiles-clean.service + Thu 2018-06-28 07:31:00 CEST 17h left Wed 2018-06-27 07:31:25 CEST 6h ago pli-puppet-run.timer pli-puppet-run.service + + 2 timers listed. + Pass --all to see loaded but inactive timers, too. +``` + +## Logs +Almost all system logs end up in journald. One important exception is __sssd__, which provides authentication against Active Directory. Its logs can be found in +`/var/log/sssd`. \ No newline at end of file