Jelajahi Sumber

[Watchdog] Add certificate check for primary certificate; Add mail delay

andryyy 5 tahun lalu
induk
melakukan
bcad1af121
2 mengubah file dengan 57 tambahan dan 3 penghapusan
  1. 56 2
      data/Dockerfiles/watchdog/watchdog.sh
  2. 1 1
      docker-compose.yml

+ 56 - 2
data/Dockerfiles/watchdog/watchdog.sh

@@ -87,9 +87,22 @@ log_msg() {
 }
 
 function mail_error() {
+  THROTTLE=
   [[ -z ${1} ]] && return 1
   # If exists, body will be the content of "/tmp/${1}", even if ${2} is set
   [[ -z ${2} ]] && BODY="Service was restarted on $(date), please check your mailcow installation." || BODY="$(date) - ${2}"
+  # If exists, mail will be throttled by argument in seconds
+  [[ ! -z ${3} ]] && THROTTLE=${3}
+  if [[ ! -z ${THROTTLE} ]]; then
+    TTL_LEFT="$(${REDIS_CMDLINE} TTL THROTTLE_${1} 2> /dev/null)"
+    if [[ "${TTL_LEFT}" == "-2" ]]; then
+      # Delay key not found, setting a delay key now
+      ${REDIS_CMDLINE} SET THROTTLE_${1} 1 EX ${THROTTLE}
+    else
+      log_msg "Not sending notification email now, blocked for ${TTL_LEFT} seconds..."
+      return 1
+    fi
+  fi
   WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" | sed 's/"//;s|"$||')
   # Some exceptions for subject and body formats
   if [[ ${1} == "fail2ban" ]]; then
@@ -467,6 +480,28 @@ dovecot_repl_checks() {
   return 1
 }
 
+cert_checks() {
+  err_count=0
+  diff_c=0
+  THRESHOLD=1
+  # Reduce error count by 2 after restarting an unhealthy container
+  trap "[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))" USR1
+  while [ ${err_count} -lt ${THRESHOLD} ]; do
+    touch /tmp/certcheck; echo "$(tail -50 /tmp/certcheck)" > /tmp/certcheck
+    host_ip_postfix=$(get_container_ip postfix)
+    host_ip_dovecot=$(get_container_ip dovecot)
+    err_c_cur=${err_count}
+    /usr/lib/nagios/plugins/check_smtp -H ${host_ip_postfix} -p 25 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( ${err_count} + $? ))
+    /usr/lib/nagios/plugins/check_imap -H ${host_ip_dovecot} -p 993 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( ${err_count} + $? ))
+    [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
+    [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
+    progress "Certificate expiry check" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
+    # Always sleep 5 minutes, mail notifications are limited
+    sleep 300
+  done
+  return 1
+}
+
 phpfpm_checks() {
   err_count=0
   diff_c=0
@@ -927,6 +962,18 @@ PID=$!
 echo "Spawned fail2ban_checks with PID ${PID}"
 BACKGROUND_TASKS+=(${PID})
 
+(
+while true; do
+  if ! cert_checks; then
+    log_msg "Cert check hit error limit"
+    echo certcheck > /tmp/com_pipe
+  fi
+done
+) &
+PID=$!
+echo "Spawned cert_checks with PID ${PID}"
+BACKGROUND_TASKS+=(${PID})
+
 (
 while true; do
   if ! olefy_checks; then
@@ -1013,11 +1060,18 @@ while true; do
   elif [[ ${com_pipe_answer} == "mysql_repl_checks" ]]; then
     log_msg "MySQL replication is not working properly"
     # Define $2 to override message text, else print service was restarted at ...
-    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the SQL replication status"
+    # Once mail per 10 minutes
+    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the SQL replication status" 600
   elif [[ ${com_pipe_answer} == "dovecot_repl_checks" ]]; then
     log_msg "Dovecot replication is not working properly"
     # Define $2 to override message text, else print service was restarted at ...
-    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the Dovecot replicator status"
+    # Once mail per 10 minutes
+    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the Dovecot replicator status" 600
+  elif [[ ${com_pipe_answer} == "certcheck" ]]; then
+    log_msg "Certificates are about to expire"
+    # Define $2 to override message text, else print service was restarted at ...
+    # Only mail once a day
+    [[ ! -z ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please renew your certificate" 86400
   elif [[ ${com_pipe_answer} == "acme-mailcow" ]]; then
     log_msg "acme-mailcow did not complete successfully"
     # Define $2 to override message text, else print service was restarted at ...

+ 1 - 1
docker-compose.yml

@@ -401,7 +401,7 @@ services:
         - /lib/modules:/lib/modules:ro
 
     watchdog-mailcow:
-      image: mailcow/watchdog:1.82
+      image: mailcow/watchdog:1.83
       # Debug
       #command: /watchdog.sh
       dns: