Bläddra i källkod

[Watchdog] Remove some check_ping checks
[Watchdog] Add ClamAV check (if SKIP_CLAMD=n)
[Watchdog] Add Unbound check
[Watchdog] Do not use Docker API by default to determine IP of containers (see "IP_BY_DOCKER_API")
[Watchdog] Minor changes

André 7 år sedan
förälder
incheckning
2da228a21e
1 ändrade filer med 134 tillägg och 31 borttagningar
  1. 134 31
      data/Dockerfiles/watchdog/watchdog.sh

+ 134 - 31
data/Dockerfiles/watchdog/watchdog.sh

@@ -43,6 +43,7 @@ log_msg() {
 function mail_error() {
   [[ -z ${1} ]] && return 1
   [[ -z ${2} ]] && return 2
+  [[ -z ${3} ]] && BODY="Service was restarted, please check your mailcow installation." || BODY="${3}"
   RCPT_DOMAIN=$(echo ${1} | awk -F @ {'print $NF'})
   RCPT_MX=$(dig +short ${RCPT_DOMAIN} mx | sort -n | awk '{print $2; exit}')
   if [[ -z ${RCPT_MX} ]]; then
@@ -50,8 +51,8 @@ function mail_error() {
     return 1
   fi
   ./smtp-cli --missing-modules-ok \
-    --subject="Watchdog: ${2} service hit the error rate limit" \
-    --body-plain="Service was restarted, please check your mailcow installation." \
+    --subject="Watchdog: ${2} hit the error rate limit" \
+    --body-plain="${BODY}" \
     --to=${1} \
     --from="watchdog@${MAILCOW_HOSTNAME}" \
     --server="${RCPT_MX}" \
@@ -66,28 +67,32 @@ get_container_ip() {
   CONTAINER_IP=
   LOOP_C=1
   until [[ ${CONTAINER_IP} =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]] || [[ ${LOOP_C} -gt 5 ]]; do
-    sleep 0.5
-    # get long container id for exact match
-    CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | .id"))
-    # returned id can have multiple elements (if scaled), shuffle for random test
-    CONTAINER_ID=($(printf "%s\n" "${CONTAINER_ID[@]}" | shuf))
-    if [[ ! -z ${CONTAINER_ID} ]]; then
-      for matched_container in "${CONTAINER_ID[@]}"; do
-        CONTAINER_IPS=($(curl --silent --insecure https://dockerapi/containers/${matched_container}/json | jq -r '.NetworkSettings.Networks[].IPAddress')) 
-        for ip_match in "${CONTAINER_IPS[@]}"; do
-          # grep will do nothing if one of these vars is empty
-          [[ -z ${ip_match} ]] && continue
-          [[ -z ${IPV4_NETWORK} ]] && continue
-          # only return ips that are part of our network
-          if ! grep -q ${IPV4_NETWORK} <(echo ${ip_match}); then
-            continue
-          else
-            CONTAINER_IP=${ip_match}
-            break
-          fi
+    if [ ${IP_BY_DOCKER_API} -eq 0 ]; then
+      CONTAINER_IP=$(dig a "${1}" +short)
+    else
+      sleep 0.5
+      # get long container id for exact match
+      CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | .id"))
+      # returned id can have multiple elements (if scaled), shuffle for random test
+      CONTAINER_ID=($(printf "%s\n" "${CONTAINER_ID[@]}" | shuf))
+      if [[ ! -z ${CONTAINER_ID} ]]; then
+        for matched_container in "${CONTAINER_ID[@]}"; do
+          CONTAINER_IPS=($(curl --silent --insecure https://dockerapi/containers/${matched_container}/json | jq -r '.NetworkSettings.Networks[].IPAddress')) 
+          for ip_match in "${CONTAINER_IPS[@]}"; do
+            # grep will do nothing if one of these vars is empty
+            [[ -z ${ip_match} ]] && continue
+            [[ -z ${IPV4_NETWORK} ]] && continue
+            # only return ips that are part of our network
+            if ! grep -q ${IPV4_NETWORK} <(echo ${ip_match}); then
+              continue
+            else
+              CONTAINER_IP=${ip_match}
+              break
+            fi
+          done
+          [[ ! -z ${CONTAINER_IP} ]] && break
         done
-        [[ ! -z ${CONTAINER_IP} ]] && break
-      done
+      fi
     fi
     LOOP_C=$((LOOP_C + 1))
   done
@@ -103,7 +108,6 @@ nginx_checks() {
   while [ ${err_count} -lt ${THRESHOLD} ]; do
     host_ip=$(get_container_ip nginx-mailcow)
     err_c_cur=${err_count}
-    /usr/lib/nagios/plugins/check_ping -4 -H ${host_ip} -w 2000,10% -c 4000,100% -p2 1>&2; err_count=$(( ${err_count} + $? ))
     /usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u / -p 8081 1>&2; err_count=$(( ${err_count} + $? ))
     [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
     [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
@@ -114,6 +118,32 @@ nginx_checks() {
   return 1
 }
 
+unbound_checks() {
+  err_count=0
+  diff_c=0
+  THRESHOLD=8
+  # Reduce error count by 2 after restarting an unhealthy container
+  trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
+  while [ ${err_count} -lt ${THRESHOLD} ]; do
+    host_ip=$(get_container_ip unbound-mailcow)
+    err_c_cur=${err_count}
+    /usr/lib/nagios/plugins/check_dns -s ${host_ip} -H google.com 1>&2; err_count=$(( ${err_count} + $? ))
+    DNSSEC=$(dig com +dnssec | egrep 'flags:.+ad')
+    if [[ -z ${DNSSEC} ]]; then
+      echo "DNSSEC failure" 1>&2
+      err_count=$(( ${err_count} + 1))
+    else
+      echo "DNSSEC check succeeded" 1>&2
+    fi
+    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
+    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
+    progress "Unbound" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
+    diff_c=0
+    sleep $(( ( RANDOM % 30 )  + 10 ))
+  done
+  return 1
+}
+
 mysql_checks() {
   err_count=0
   diff_c=0
@@ -137,7 +167,7 @@ mysql_checks() {
 sogo_checks() {
   err_count=0
   diff_c=0
-  THRESHOLD=20
+  THRESHOLD=10
   # Reduce error count by 2 after restarting an unhealthy container
   trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
   while [ ${err_count} -lt ${THRESHOLD} ]; do
@@ -156,7 +186,7 @@ sogo_checks() {
 postfix_checks() {
   err_count=0
   diff_c=0
-  THRESHOLD=16
+  THRESHOLD=8
   # Reduce error count by 2 after restarting an unhealthy container
   trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
   while [ ${err_count} -lt ${THRESHOLD} ]; do
@@ -173,10 +203,29 @@ postfix_checks() {
   return 1
 }
 
+clamd_checks() {
+  err_count=0
+  diff_c=0
+  THRESHOLD=6
+  # Reduce error count by 2 after restarting an unhealthy container
+  trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
+  while [ ${err_count} -lt ${THRESHOLD} ]; do
+	host_ip=$(get_container_ip clamd-mailcow)
+    err_c_cur=${err_count}
+    /usr/lib/nagios/plugins/check_clamd -4 -H ${host_ip} 1>&2; err_count=$(( ${err_count} + $? ))
+    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
+    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
+    progress "Clamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
+    diff_c=0
+    sleep $(( ( RANDOM % 30 )  + 10 ))
+  done
+  return 1
+}
+
 dovecot_checks() {
   err_count=0
   diff_c=0
-  THRESHOLD=24
+  THRESHOLD=20
   # Reduce error count by 2 after restarting an unhealthy container
   trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
   while [ ${err_count} -lt ${THRESHOLD} ]; do
@@ -199,7 +248,7 @@ dovecot_checks() {
 phpfpm_checks() {
   err_count=0
   diff_c=0
-  THRESHOLD=10
+  THRESHOLD=5
   # Reduce error count by 2 after restarting an unhealthy container
   trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
   while [ ${err_count} -lt ${THRESHOLD} ]; do
@@ -207,7 +256,6 @@ phpfpm_checks() {
     err_c_cur=${err_count}
     nc -z ${host_ip} 9001 ; err_count=$(( ${err_count} + ($? * 2)))
     nc -z ${host_ip} 9002 ; err_count=$(( ${err_count} + ($? * 2)))
-    /usr/lib/nagios/plugins/check_ping -4 -H ${host_ip} -w 2000,10% -c 4000,100% -p2 1>&2; err_count=$(( ${err_count} + $? ))
     [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
     [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
     progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
@@ -217,10 +265,30 @@ phpfpm_checks() {
   return 1
 }
 
+cert_checks() {
+  err_count=0
+  diff_c=0
+  THRESHOLD=1
+  # Reduce error count by 2 after restarting an unhealthy container
+  trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
+  while [ ${err_count} -lt ${THRESHOLD} ]; do
+    host_ip=$(get_container_ip nginx-mailcow)
+    err_c_cur=${err_count}
+    /usr/lib/nagios/plugins/check_http -H ${host_ip} -p ${HTTPS_PORT} -C 15 1>&2; err_count=$(( ${err_count} + $? ))
+    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
+    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
+    progress "TLS certificate" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
+    diff_c=0
+    # Sleep 1 day, fixme: 1 day lag
+    sleep 86400
+  done
+  return 1
+}
+
 rspamd_checks() {
   err_count=0
   diff_c=0
-  THRESHOLD=10
+  THRESHOLD=5
   # Reduce error count by 2 after restarting an unhealthy container
   trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
   while [ ${err_count} -lt ${THRESHOLD} ]; do
@@ -238,7 +306,6 @@ Empty
     else
       echo "Rspamd settings check succeeded" 1>&2
     fi
-    /usr/lib/nagios/plugins/check_ping -4 -H ${host_ip} -w 2000,10% -c 4000,100% -p2 1>&2; err_count=$(( ${err_count} + $? ))
     [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
     [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
     progress "Rspamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
@@ -293,6 +360,42 @@ done
 ) &
 BACKGROUND_TASKS+=($!)
 
+if [ ${CHECK_UNBOUND} -eq 1 ]; then
+(
+while true; do
+  if ! unbound_checks; then
+    log_msg "Unbound hit error limit"
+    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${WATCHDOG_NOTIFY_EMAIL}" "unbound-mailcow"
+    echo unbound-mailcow > /tmp/com_pipe
+  fi
+done
+) &
+BACKGROUND_TASKS+=($!)
+fi
+
+if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
+(
+while true; do
+  if ! clamd_checks; then
+    log_msg "Clamd hit error limit"
+    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${WATCHDOG_NOTIFY_EMAIL}" "clamd-mailcow"
+    echo clamd-mailcow > /tmp/com_pipe
+  fi
+done
+) &
+BACKGROUND_TASKS+=($!)
+fi
+
+(
+while true; do
+  if ! cert_checks; then
+    log_msg "TLS certificate hit error limit"
+    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${WATCHDOG_NOTIFY_EMAIL}" "TLS check" "TLS certificate expires soon!"
+  fi
+done
+) &
+BACKGROUND_TASKS+=($!)
+
 (
 while true; do
   if ! postfix_checks; then