10 KiB
Troubleshooting
Find sources with issues in validation log:
tail -n 10000 /opt/dispatcher_node/latest/logs/data_validation.log | grep "Invalid and drop" | awk -e '{print($4)}' | sort | uniq
Overview
The following are the steps to set up the imagebuffer from scratch:
- Install Users
for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do cat user_settings.sh ../hostlists_daqbufs/env_settings.sh add_user.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
- Install Java
for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do cat user_settings.sh ../hostlists_daqbufs/env_settings.sh install_java.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
- Install Query Nodes - ImageBuffer
for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do cat user_settings.sh ../hostlists_daqbufs/env_settings.sh install_query_node.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
- Install Dispatcher Nodes ImageBuffer
for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do cat user_settings.sh ../hostlists_daqbufs/env_settings.sh install_dispatcher_node.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
- Create Required Config File
[root@sf-daq-5 /]# cat /home/daqusr/.config/daq/domain.properties
backend.default=sf-imagebuffer
chown -R daqusr:daq /home/daqusr/.config/daq/domain.properties
chown -R daqusr:daq /home/daqusr/.config/daq/dispatcher.properties
- Restart Services (if needed)
for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do echo -e "systemctl stop daq-query-node.service" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do echo -e "systemctl stop daq-dispatcher-node.service" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do echo -e "hostname \n systemctl start daq-dispatcher-node.service \n sleep 20" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do echo -e "hostname \n systemctl start daq-query-node.service \n sleep 10" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
Monitoring of the system is available via telegraf and grafana: https://hpc-monitor01.psi.ch/d/TW0pr_bik/gl2?refresh=30s&orgId=1
TODO CONVERT DOCUMENTATION
Dispatcher
Dispatcher Nodes
Install
- Go to ch.psi.daq.buildall and execute:
./gradlew dropItDispatcherNode -x test
- Login to the master node and follow these instructions to set up the git environment.
- Multihost command:
for THE_HOST in $(sort -u ../hostlists/DispatcherNodeHosts.txt); do cat user_settings.sh ../hostlists/env_settings.sh install_dispatcher_node.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
De-Install
Multihost command: for THE_HOST in $(sort -u ../hostlists/DispatcherNodeHosts.txt); do echo -e "systemctl stop daq-dispatcher-node.service \n systemctl disable daq-dispatcher-node.service \n rm /usr/lib/systemd/system/daq-dispatcher-node.service \n rm -rf /opt/dispatcher_node \n systemctl daemon-reload" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
Dispatcher REST Server
Install
-
Go to ch.psi.daq.buildall and execute:
./gradlew dropItDispatcherREST -x test
- Login to the master node and follow these instructions to set up the git environment.
-
Multihost command:
for THE_HOST in $(sort -u ../hostlists/DispatcherRESTHost.txt); do cat user_settings.sh ../hostlists/env_settings.sh install_dispatcher_rest.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
- Check if the UI is running by using a browser: http://sf-nube-13.psi.ch:8080/
De-Install
Multihost command: for THE_HOST in $(sort -u ../hostlists/DispatcherRESTHost.txt); do echo -e "systemctl stop daq-dispatcher-rest.service \n systemctl disable daq-dispatcher-rest.service \n rm /usr/lib/systemd/system/daq-dispatcher-rest.service \n rm -rf /opt/dispatcher_rest \n systemctl daemon-reload" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
Querying
Query Nodes
Install
- Go to ch.psi.daq.buildall and execute:
./gradlew dropItQueryNode -x test
- Login to the master node and follow these instructions to set up the git environment.
- Multihost command:
for THE_HOST in $(sort -u ../hostlists/QueryNodeHosts.txt); do cat user_settings.sh ../hostlists/env_settings.sh install_query_node.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
De-Install
Multihost command: for THE_HOST in $(sort -u ../hostlists/QueryNodeHosts.txt); do echo -e "systemctl stop daq-query-node.service \n systemctl disable daq-query-node.service \n rm /usr/lib/systemd/system/daq-query-node.service \n rm -rf /opt/query_node \n systemctl daemon-reload" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
Query REST Server
Install
- Go to ch.psi.daq.buildall and execute:
./gradlew dropItQueryREST -x test
- Login to the master node and follow these instructions to set up the git environment.
- Multihost command:
for THE_HOST in $(sort -u ../hostlists/QueryRESTHost.txt); do cat user_settings.sh ../hostlists/env_settings.sh install_query_rest.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
De-Install
Multihost command: for THE_HOST in $(sort -u ../hostlists/QueryRESTHost.txt); do echo -e "systemctl stop daq-query-rest.service \n systemctl disable daq-query-rest.service \n rm /usr/lib/systemd/system/daq-query-rest.service \n rm -rf /opt/query_rest \n systemctl daemon-reload" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
DAQLocal
DAQLocal
Install
- Go to ch.psi.daq.buildall and execute:
./gradlew dropItDAQLocal -x test
- Login to the master node and follow these instructions to set up the git environment.
- Multihost command:
cat user_settings.sh ../hostlists/env_settings.sh ../hostlists/stream_sources.sh install_daqlocal.sh | bash
De-Install
Multihost command: systemctl stop daq-daqlocal.service; systemctl disable daq-daqlocal.service; rm -rf /usr/lib/systemd/system/daq-daqlocal.service; rm -rf /opt/daqlocal
Helpful Commands
Dispatcher Node service:
for THE_HOST in $(sort -u ../hostlists/*Host*.txt); do echo -e "echo -e '\n\nHOST:${THE_HOST}' && ls /usr/lib/systemd/system/daq-dispatcher-node* | xargs -n1 basename | xargs -n1 systemctl stop" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
Miscellaneous
Remove log files:
for THE_HOST in $(sort -u ../hostlists/*Host*.txt); do echo -e "find /data_meta -name "*.log*" | grep "logs" | xargs rm" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done
Monitor log messages
systemd:
journalctl -f -u daq-dispatcher-rest.service
journalctl --since=today -u daq-dispatcher-rest.service
CPU/Disk usage:
dstat -d -D sdb1,sda5,total -cm -n
Docker Issues
systemctl stop docker
rm -rf /var/lib/docker
systemctl start docker
systemctl start nginx
Modify Logging
- Modify logback-server.xml (e.g. in /opt/dispatcher_node/latest/lib/)
- Run JConsole
a. /usr/java/latest/bin/jconsole
b. /usr/java/latest/bin/jconsole and use
Remote Process (application needs to be started with -Dcom.sun.management.jmxremote.port=3334 -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false)
   a. localhost:3334 for DispatcherNode
   b. localhost:3335 for DispatcherRest
   c. localhost:3336 for QueryNode
   d. localhost:3337 for QueryRest.
- Go to ch.qos.logback.classic -> ... -> Operations
- Press
reloadDefaultConfiguration
Profiling
- Run
/usr/java/latest/bin/jvisualvm (use /usr/java/latest/bin/jvisualvm -J-Dnetbeans.logger.console=true for debugging).
- Add
JMX Connection (see here for hostname:port settings)
Note: You might need to install additional packages first: yum install xorg-x11-xauth libXtst
Folder Crawler
The NAS system takes an incredibly long time to list folders. The current workaround is to use a folder crawler that periodically lists the folder structure and thus keeps it in the cache.
mkdir /home/daqusr/scripts && cp ../scripts/folder_crawler.sh /home/daqusr/scripts && chown -R daqusr:daq /home/daqusr/scripts
cp ../hostlists/systemd/folder-crawler.service /etc/systemd/system/ && systemctl enable folder-crawler.service && systemctl daemon-reload
cp ../hostlists/systemd/folder-crawler.timer /etc/systemd/system/ && systemctl enable folder-crawler.timer && systemctl daemon-reload && systemctl start folder-crawler.timer
Maintenance Utils
Find Largest Files
find /data/sf-databuffer/daq_swissfel/daq_swissfel_3 -mtime +5 -printf "%s %n %m %u %g %t %p" \( -type l -printf ' -> %l\n' -o -printf '\n' \) | sort -k1,1 -n
Count Disk Usage
find /data/sf-databuffer/daq_swissfel/daq_swissfel_3 -type f -mtime +5 -printf '%s\n' | awk '{a+=$1;} END {printf "%.1f GB\n", a/2**30;}'
find /data/sf-databuffer/daq_swissfel/daq_swissfel_3 -type f -newerct "2000-01-01" ! -newerct "2018-06-27 23:00" -printf '%s\n' | awk '{a+=$1;} END {printf "%.1f GB\n", a/2**30;}'
Delete Specific Files (do not forget -empty if needed!!!)
find /data/sf-databuffer/daq_swissfel/daq_swissfel_3 -type f -mtime +5 -regextype sed -regex '.*LOSS_SIGNAL_RAW.*' -delete
find /data/sf-databuffer/daq_swissfel/daq_swissfel_3 -type d -empty -delete
find /gpfs/sf-data/sf-imagebuffer/daq_swissfel/daq_swissfel_4 -type f -newerct "2000-01-01" ! -newerct "2018-06-25 23:00" -delete
find /gpfs/sf-data/sf-imagebuffer/daq_swissfel/daq_swissfel_4 -type d -empty -delete
Delete empty files in parallel
# use 32 parallel processes (xargs -P 32)
find /gls_data/gls-archive/daq_local/daq_local_*/byTime/ -maxdepth 1 | tail -n +2 | xargs -I {} -P 32 -n 1 find {} -type f -empty -delete
Parallel rsync
# see: https://stackoverflow.com/a/46611168
# SETUP OPTIONS
export SRCDIR="/home/maerki_f/Downloads/rsync_test/.snapshot/data/test"
# export SRCDIR="/gls_data/.snapshot/daily.2018-07-18_0010/gls-archive/daq_local/daq_local_2/byTime"
export DESTDIR="/home/maerki_f/Downloads/rsync_test/data/test"
# export DESTDIR="/gls_data/gls-archive/daq_local/daq_local_2/byTime"
# use 32 parallel rsync processes (xargs -P 32)
ls -1 $SRCDIR | xargs -I {} -P 32 -n 1 rsync -auvh --progress $SRCDIR/{} $DESTDIR/