Selaa lähdekoodia

unbound: fix healthcheck logging + added fail tolerance to checks (#6004)

* unbound: fix healthcheck logging to stdout + rewrote healthcheck logic

* compose: bump unbound tag

* unbound: fixed healthcheck logic
Niklas Meyer 1 vuosi sitten
vanhempi
sitoutus
b26ccc2019

+ 11 - 4
data/Dockerfiles/unbound/Dockerfile

@@ -5,14 +5,17 @@ LABEL maintainer = "The Infrastructure Company GmbH <info@servercow.de>"
 RUN apk add --update --no-cache \
 RUN apk add --update --no-cache \
 	curl \
 	curl \
 	bind-tools \
 	bind-tools \
+	coreutils \
 	unbound \
 	unbound \
 	bash \
 	bash \
 	openssl \
 	openssl \
 	drill \
 	drill \
 	tzdata \
 	tzdata \
+	syslog-ng \
+	supervisor \
 	&& curl -o /etc/unbound/root.hints https://www.internic.net/domain/named.cache \
 	&& curl -o /etc/unbound/root.hints https://www.internic.net/domain/named.cache \
 	&& chown root:unbound /etc/unbound \
 	&& chown root:unbound /etc/unbound \
-  && adduser unbound tty \
+    && adduser unbound tty \
 	&& chmod 775 /etc/unbound
 	&& chmod 775 /etc/unbound
 
 
 EXPOSE 53/udp 53/tcp
 EXPOSE 53/udp 53/tcp
@@ -21,9 +24,13 @@ COPY docker-entrypoint.sh /docker-entrypoint.sh
 
 
 # healthcheck (dig, ping)
 # healthcheck (dig, ping)
 COPY healthcheck.sh /healthcheck.sh
 COPY healthcheck.sh /healthcheck.sh
+COPY syslog-ng.conf /etc/syslog-ng/syslog-ng.conf
+COPY supervisord.conf /etc/supervisor/supervisord.conf
+COPY stop-supervisor.sh /usr/local/sbin/stop-supervisor.sh
+
 RUN chmod +x /healthcheck.sh
 RUN chmod +x /healthcheck.sh
-HEALTHCHECK --interval=30s --timeout=30s CMD [ "/healthcheck.sh" ]
+HEALTHCHECK --interval=30s --timeout=10s \
+  CMD sh -c '[ -f /tmp/healthcheck_status ] && [ "$(cat /tmp/healthcheck_status)" -eq 0 ] || exit 1'
 
 
 ENTRYPOINT ["/docker-entrypoint.sh"]
 ENTRYPOINT ["/docker-entrypoint.sh"]
-
-CMD ["/usr/sbin/unbound"]
+CMD exec /usr/bin/supervisord -c /etc/supervisor/supervisord.conf

+ 79 - 53
data/Dockerfiles/unbound/healthcheck.sh

@@ -1,76 +1,102 @@
 #!/bin/bash
 #!/bin/bash
 
 
-# Skip Unbound (DNS Resolver) Healthchecks (NOT Recommended!)
-if [[ "${SKIP_UNBOUND_HEALTHCHECK}" =~ ^([yY][eE][sS]|[yY])+$ ]]; then
-    SKIP_UNBOUND_HEALTHCHECK=y
-fi
-
-# Reset logfile
-echo "$(date +"%Y-%m-%d %H:%M:%S"): Starting health check - logs can be found in /var/log/healthcheck.log"
-echo "$(date +"%Y-%m-%d %H:%M:%S"): Starting health check" > /var/log/healthcheck.log
+STATUS_FILE="/tmp/healthcheck_status"
+RUNS=0
 
 
-# Declare log function for logfile inside container
-function log_to_file() {
-    echo "$(date +"%Y-%m-%d %H:%M:%S"): $1" >> /var/log/healthcheck.log
+# Log helper: prints a timestamped message to stdout
+function log_to_stdout() {
+echo "$(date +"%Y-%m-%d %H:%M:%S"): $1"
 }
 }
 
 
 # General Ping function to check general pingability
 # General Ping function to check general pingability
 function check_ping() {
 function check_ping() {
-    declare -a ipstoping=("1.1.1.1" "8.8.8.8" "9.9.9.9")
-
-    for ip in "${ipstoping[@]}" ; do
-            ping -q -c 3 -w 5 "$ip"
-            if [ $? -ne 0 ]; then
-                log_to_file "Healthcheck: Couldn't ping $ip for 5 seconds... Gave up!"
-                log_to_file "Please check your internet connection or firewall rules to fix this error, because a simple ping test should always go through from the unbound container!"
-                return 1
-            fi
+declare -a ipstoping=("1.1.1.1" "8.8.8.8" "9.9.9.9")
+local fail_tolerance=1
+local failures=0
+
+for ip in "${ipstoping[@]}" ; do
+    success=false
+    for ((i=1; i<=3; i++)); do
+        ping -q -c 3 -w 5 "$ip" > /dev/null
+        if [ $? -eq 0 ]; then
+            success=true
+            break
+        else
+            log_to_stdout "Healthcheck: Failed to ping $ip on attempt $i. Trying again..."
+        fi
     done
     done
+    
+    if [ "$success" = false ]; then
+        log_to_stdout "Healthcheck: Couldn't ping $ip after 3 attempts. Marking this IP as failed."
+        ((failures++))
+    fi
+done
+
+if [ $failures -gt $fail_tolerance ]; then
+    log_to_stdout "Healthcheck: Too many ping failures ($fail_tolerance failures allowed, you got $failures failures), marking Healthcheck as unhealthy..."
+    return 1
+fi
+
+return 0
 
 
-    log_to_file "Healthcheck: Ping Checks WORKING properly!"
-    return 0
 }
 }
 
 
 # General DNS resolution check against the Unbound resolver itself
 # General DNS resolution check against the Unbound resolver itself
 function check_dns() {
 function check_dns() {
-    declare -a domains=("mailcow.email" "github.com" "hub.docker.com")
-
-    for domain in "${domains[@]}" ; do
-        for ((i=1; i<=3; i++)); do
-            dig +short +timeout=2 +tries=1 "$domain" @127.0.0.1 > /dev/null
-        if [ $? -ne 0 ]; then
-            log_to_file "Healthcheck: DNS Resolution Failed on $i attempt! Trying again..."
-            if [ $i -eq 3 ]; then
-                log_to_file "Healthcheck: DNS Resolution not possible after $i attempts... Gave up!"
-                log_to_file "Maybe check your outbound firewall, as it needs to resolve DNS over TCP AND UDP!"
-                return 1
-            fi
+declare -a domains=("fuzzy.mailcow.email" "github.com" "hub.docker.com")
+local fail_tolerance=1
+local failures=0
+
+for domain in "${domains[@]}" ; do
+    success=false
+    for ((i=1; i<=3; i++)); do
+        dig_output=$(dig +short +timeout=2 +tries=1 "$domain" @127.0.0.1 2>/dev/null)
+        dig_rc=$?
+
+        if [ $dig_rc -ne 0 ] || [ -z "$dig_output" ]; then
+            log_to_stdout "Healthcheck: DNS Resolution Failed on attempt $i for $domain! Trying again..."
+        else
+            success=true
+            break
         fi
         fi
-        done
     done
     done
-
-    log_to_file "Healthcheck: DNS Resolver WORKING properly!"
-    return 0
     
     
-}
+    if [ "$success" = false ]; then
+        log_to_stdout "Healthcheck: DNS Resolution not possible after 3 attempts for $domain... Gave up!"
+        ((failures++))
+    fi
+done
 
 
-if [[ ${SKIP_UNBOUND_HEALTHCHECK} == "y" ]]; then
-    log_to_file "Healthcheck: ALL CHECKS WERE SKIPPED! Unbound is healthy!"
-    exit 0
+if [ $failures -gt $fail_tolerance ]; then
+    log_to_stdout "Healthcheck: Too many DNS failures ($fail_tolerance failures allowed, you got $failures failures), marking Healthcheck as unhealthy..."
+    return 1
 fi
 fi
 
 
-# run checks, if check is not returning 0 (return value if check is ok), healthcheck will exit with 1 (marked in docker as unhealthy)
-check_ping
+return 0
+}
 
 
-if [ $? -ne 0 ]; then
-    exit 1
-fi
+# Main loop: re-evaluate health every 30 seconds and publish the verdict to
+# $STATUS_FILE ("0" = healthy, "1" = unhealthy) instead of exiting.
+while true; do
 
 
-check_dns
+    # Accept y/Y/yes/YES (any case), matching the normalisation the previous
+    # version of this script applied before comparing against "y".
+    if [[ "${SKIP_UNBOUND_HEALTHCHECK}" =~ ^([yY][eE][sS]|[yY])+$ ]]; then
+        log_to_stdout "Healthcheck: ALL CHECKS WERE SKIPPED! Unbound is healthy!"
+        echo "0" > "$STATUS_FILE"
+        sleep 365d
+        # Re-enter the loop rather than falling through to the checks that
+        # were supposed to be skipped.
+        continue
+    fi
 
 
-if [ $? -ne 0 ]; then
-    exit 1
-fi
+    # Run both checks; each returns 0 when it passes. The script keeps running
+    # either way and only records the combined result in the status file.
+    check_ping
+    PING_STATUS=$?
+
+    check_dns
+    DNS_STATUS=$?
+
+    if [ "$PING_STATUS" -ne 0 ] || [ "$DNS_STATUS" -ne 0 ]; then
+        echo "1" > "$STATUS_FILE"
+
+    else
+        echo "0" > "$STATUS_FILE"
+    fi
+
+    sleep 30
 
 
-log_to_file "Healthcheck: ALL CHECKS WERE SUCCESSFUL! Unbound is healthy!"
-exit 0
+done

+ 10 - 0
data/Dockerfiles/unbound/stop-supervisor.sh

@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# Supervisor event listener. Every event line received on stdin makes this
+# script signal supervisord (PID taken from /var/run/supervisord.pid) with
+# SIGQUIT; once stdin is closed, the health status file is removed.
+
+# Tell supervisord this listener is ready to receive events.
+printf "READY\n";
+
+# -r keeps backslashes in the event line intact; IFS= keeps surrounding whitespace.
+while IFS= read -r line; do
+  echo "Processing Event: $line" >&2;
+  # SIGQUIT is signal 3; the substitution is quoted so the PID is passed as a single word.
+  kill -QUIT "$(cat "/var/run/supervisord.pid")"
+done < /dev/stdin
+
+# Reached only after stdin hits EOF: remove the health status file.
+rm -f /tmp/healthcheck_status

+ 32 - 0
data/Dockerfiles/unbound/supervisord.conf

@@ -0,0 +1,32 @@
+[supervisord]
+nodaemon=true
+user=root
+pidfile=/var/run/supervisord.pid
+
+[program:syslog-ng]
+command=/usr/sbin/syslog-ng --foreground --no-caps
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+autostart=true
+
+[program:unbound]
+command=/usr/sbin/unbound
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+autorestart=true
+
+[program:unbound-healthcheck]
+command=/bin/bash /healthcheck.sh
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+autorestart=true
+
+[eventlistener:processes]
+command=/usr/local/sbin/stop-supervisor.sh
+events=PROCESS_STATE_STOPPED, PROCESS_STATE_EXITED, PROCESS_STATE_FATAL

+ 21 - 0
data/Dockerfiles/unbound/syslog-ng.conf

@@ -0,0 +1,21 @@
+@version: 4.5
+@include "scl.conf"
+options {
+  chain_hostnames(off);
+  flush_lines(0);
+  use_dns(no);
+  use_fqdn(no);
+  owner("root"); group("adm"); perm(0640);
+  stats(freq(0));
+  keep_timestamp(no);
+  bad_hostname("^gconfd$");
+};
+source s_dgram {
+  unix-dgram("/dev/log");
+  internal();
+};
+destination d_stdout { pipe("/dev/stdout"); };
+log {
+  source(s_dgram);
+  destination(d_stdout);
+};

+ 1 - 1
docker-compose.yml

@@ -1,7 +1,7 @@
 services:
 services:
 
 
     unbound-mailcow:
     unbound-mailcow:
-      image: mailcow/unbound:1.22
+      image: mailcow/unbound:1.23
       environment:
       environment:
         - TZ=${TZ}
         - TZ=${TZ}
         - SKIP_UNBOUND_HEALTHCHECK=${SKIP_UNBOUND_HEALTHCHECK:-n}
         - SKIP_UNBOUND_HEALTHCHECK=${SKIP_UNBOUND_HEALTHCHECK:-n}