# Troubleshooting Find sources with issues in the validation log: ``` tail -n 10000 /opt/dispatcher_node/latest/logs/data_validation.log | grep "Invalid and drop" | awk -e '{print($4)}' | sort | uniq ``` # Overview The following are the steps to set up the ImageBuffer from scratch: - Install Users ```bash for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do cat user_settings.sh ../hostlists_daqbufs/env_settings.sh add_user.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done ``` - Install Java ```bash for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do cat user_settings.sh ../hostlists_daqbufs/env_settings.sh install_java.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done ``` - Install Query Nodes - ImageBuffer ```bash for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do cat user_settings.sh ../hostlists_daqbufs/env_settings.sh install_query_node.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done ``` - Install Dispatcher Nodes - ImageBuffer ```bash for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do cat user_settings.sh ../hostlists_daqbufs/env_settings.sh install_dispatcher_node.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done ``` - Create Required Config File ``` [root@sf-daq-5 /]# cat /home/daqusr/.config/daq/domain.properties backend.default=sf-imagebuffer chown -R daqusr:daq /home/daqusr/.config/daq/domain.properties chown -R daqusr:daq /home/daqusr/.config/daq/dispatcher.properties ``` - Restart Services (if needed) ``` for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do echo -e "systemctl stop daq-query-node.service" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do echo -e "systemctl stop daq-dispatcher-node.service" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done ``` ``` for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do 
echo -e "hostname \n systemctl start daq-dispatcher-node.service \n sleep 20" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done for THE_HOST in $(sort -u ../hostlists_daqbufs/ImageBufferHosts.txt); do echo -e "hostname \n systemctl start daq-query-node.service \n sleep 10" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done ``` Monitoring of the system is available via Telegraf and Grafana: https://hpc-monitor01.psi.ch/d/TW0pr_bik/gl2?refresh=30s&orgId=1 ---- TODO CONVERT DOCUMENTATION ## Dispatcher ### Dispatcher Nodes #### Install 1. Go to ch.psi.daq.buildall and execute: `./gradlew dropItDispatcherNode -x test` 2. Log in to the master node and follow [these instructions](Readme.md#clone_git) to set up the git environment. 3. Multihost command: `for THE_HOST in $(sort -u ../hostlists/DispatcherNodeHosts.txt); do cat user_settings.sh ../hostlists/env_settings.sh install_dispatcher_node.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done` #### De-Install Multihost command: `for THE_HOST in $(sort -u ../hostlists/DispatcherNodeHosts.txt); do echo -e "systemctl stop daq-dispatcher-node.service \n systemctl disable daq-dispatcher-node.service \n rm /usr/lib/systemd/system/daq-dispatcher-node.service \n rm -rf /opt/dispatcher_node \n systemctl daemon-reload" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done` ### Dispatcher REST Server #### Install 1. Go to ch.psi.daq.buildall and execute: `./gradlew dropItDispatcherREST -x test` 2. Log in to the master node and follow [these instructions](Readme.md#clone_git) to set up the git environment. 3. Multihost command: `for THE_HOST in $(sort -u ../hostlists/DispatcherRESTHost.txt); do cat user_settings.sh ../hostlists/env_settings.sh install_dispatcher_rest.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done` 4. 
Check whether the UI is running by using a browser: http://sf-nube-13.psi.ch:8080/ #### De-Install Multihost command: `for THE_HOST in $(sort -u ../hostlists/DispatcherRESTHost.txt); do echo -e "systemctl stop daq-dispatcher-rest.service \n systemctl disable daq-dispatcher-rest.service \n rm /usr/lib/systemd/system/daq-dispatcher-rest.service \n rm -rf /opt/dispatcher_rest \n systemctl daemon-reload" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done` ## Querying ### Query Nodes #### Install 1. Go to ch.psi.daq.buildall and execute: `./gradlew dropItQueryNode -x test` 2. Log in to the master node and follow [these instructions](Readme.md#clone_git) to set up the git environment. 3. Multihost command: `for THE_HOST in $(sort -u ../hostlists/QueryNodeHosts.txt); do cat user_settings.sh ../hostlists/env_settings.sh install_query_node.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done` #### De-Install Multihost command: `for THE_HOST in $(sort -u ../hostlists/QueryNodeHosts.txt); do echo -e "systemctl stop daq-query-node.service \n systemctl disable daq-query-node.service \n rm /usr/lib/systemd/system/daq-query-node.service \n rm -rf /opt/query_node \n systemctl daemon-reload" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done` ### Query REST Server #### Install 1. Go to ch.psi.daq.buildall and execute: `./gradlew dropItQueryREST -x test` 2. Log in to the master node and follow [these instructions](Readme.md#clone_git) to set up the git environment. 3. 
Multihost command: `for THE_HOST in $(sort -u ../hostlists/QueryRESTHost.txt); do cat user_settings.sh ../hostlists/env_settings.sh install_query_rest.sh | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done` #### De-Install Multihost command: `for THE_HOST in $(sort -u ../hostlists/QueryRESTHost.txt); do echo -e "systemctl stop daq-query-rest.service \n systemctl disable daq-query-rest.service \n rm /usr/lib/systemd/system/daq-query-rest.service \n rm -rf /opt/query_rest \n systemctl daemon-reload" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done` ## DAQLocal ### DAQLocal #### Install 1. Go to ch.psi.daq.buildall and execute: `./gradlew dropItDAQLocal -x test` 2. Log in to the master node and follow [these instructions](Readme.md#clone_git) to set up the git environment. 3. Multihost command: `cat user_settings.sh ../hostlists/env_settings.sh ../hostlists/stream_sources.sh install_daqlocal.sh | bash` #### De-Install Multihost command: `systemctl stop daq-daqlocal.service; systemctl disable daq-daqlocal.service; rm -rf /usr/lib/systemd/system/daq-daqlocal.service; rm -rf /opt/daqlocal` ## Helpful Commands Dispatcher Node service: `for THE_HOST in $(sort -u ../hostlists/*Host*.txt); do echo -e "echo -e '\n\nHOST:${THE_HOST}' && ls /usr/lib/systemd/system/daq-dispatcher-node* | xargs -n1 basename | xargs -n1 systemctl stop" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done` ### Miscellaneous Remove log files: `for THE_HOST in $(sort -u ../hostlists/*Host*.txt); do echo -e "find /data_meta -name "*.log*" | grep "logs" | xargs rm" | ssh -i ${HOME}/.ssh/id_rsa_daq root@${THE_HOST} ; done` Monitor systemd log messages: `journalctl -f -u daq-dispatcher-rest.service` `journalctl --since=today -u daq-dispatcher-rest.service` CPU/Disk usage: `dstat -d -D sdb1,sda5,total -cm -n` ### Docker Issues ``` systemctl stop docker rm -rf /var/lib/docker systemctl start docker systemctl start nginx ``` ### Modify Logging 1. Modify logback-server.xml (e.g. 
in /opt/dispatcher_node/latest/lib/) 2. Run JConsole a. /usr/java/latest/bin/jconsole b. /usr/java/latest/bin/jconsole and use `Remote Process` (application needs to be started with `-Dcom.sun.management.jmxremote.port=3334 -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false`) a. localhost:3334 for DispatcherNode b. localhost:3335 for DispatcherRest c. localhost:3336 for QueryNode d. localhost:3337 for QueryRest. 3. Go to ch.qos.logback.classic -> ... -> Operations 4. Press `reloadDefaultConfiguration` ### Profiling 1. Run `/usr/java/latest/bin/jvisualvm` (use `/usr/java/latest/bin/jvisualvm -J-Dnetbeans.logger.console=true` for debugging). 2. Add `JMX Connection` (see [here](Readme.md#modify_logging) for `hostname:port` settings) Note: You might need to run `yum install xorg-x11-xauth libXtst` ### Folder Crawler The NAS system takes an extremely long time to list folders. The current workaround is to use a folder crawler that periodically lists the folder structure and thus keeps it in the cache. 1. `mkdir /home/daqusr/scripts && cp ../scripts/folder_crawler.sh /home/daqusr/scripts && chown -R daqusr:daq /home/daqusr/scripts` 2. `cp ../hostlists/systemd/folder-crawler.service /etc/systemd/system/ && systemctl enable folder-crawler.service && systemctl daemon-reload` 3. `cp ../hostlists/systemd/folder-crawler.timer /etc/systemd/system/ && systemctl enable folder-crawler.timer && systemctl daemon-reload && systemctl start folder-crawler.timer` ## Maintenance Utils ### Find Largest Files `find /data/sf-databuffer/daq_swissfel/daq_swissfel_3 -mtime +5 -printf "%s %n %m %u %g %t %p" \( -type l -printf ' -> %l\n' -o -printf '\n' \) | sort -k1,1 -n` ### Count Disk Usage `find /data/sf-databuffer/daq_swissfel/daq_swissfel_3 -type f -mtime +5 -printf '%s\n' | awk '{a+=$1;} END {printf "%.1f GB\n", a/2**30;}'` `find /data/sf-databuffer/daq_swissfel/daq_swissfel_3 -type f -newerct "2000-01-01" ! 
-newerct "2018-06-27 23:00" -printf '%s\n' | awk '{a+=$1;} END {printf "%.1f GB\n", a/2**30;}'` ### Delete Specific Files (do not forget -empty if needed!!!) `find /data/sf-databuffer/daq_swissfel/daq_swissfel_3 -type f -mtime +5 -regextype sed -regex '.*LOSS_SIGNAL_RAW.*' -delete` `find /data/sf-databuffer/daq_swissfel/daq_swissfel_3 -type d -empty -delete` `find /gpfs/sf-data/sf-imagebuffer/daq_swissfel/daq_swissfel_4 -type f -newerct "2000-01-01" ! -newerct "2018-06-25 23:00" -delete` `find /gpfs/sf-data/sf-imagebuffer/daq_swissfel/daq_swissfel_4 -type d -empty -delete` ### Delete empty files in parallel ```bash # use 32 threads find /gls_data/gls-archive/daq_local/daq_local_*/byTime/ -maxdepth 1 | tail -n +2 | xargs -I {} -P 32 -n 1 find {} -type f -empty -delete ``` ### Parallel rsync ```bash # see: https://stackoverflow.com/a/46611168 # SETUP OPTIONS export SRCDIR="/home/maerki_f/Downloads/rsync_test/.snapshot/data/test" # export SRCDIR="/gls_data/.snapshot/daily.2018-07-18_0010/gls-archive/daq_local/daq_local_2/byTime" export DESTDIR="/home/maerki_f/Downloads/rsync_test/data/test" # export DESTDIR="/gls_data/gls-archive/daq_local/daq_local_2/byTime" # use 32 threads ls -1 $SRCDIR | xargs -I {} -P 32 -n 1 rsync -auvh --progress $SRCDIR/{} $DESTDIR/ ```