watchdog.sh 42 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170
  #!/bin/bash
  # Watchdog bootstrap: optional debug tracing, signal handling, start-up delay,
  # verbosity setup, and waiting for SQL and Redis before any check is started.
  # Environment read here: DEV_MODE, USE_WATCHDOG, WATCHDOG_VERBOSE, DBUSER, DBPASS,
  # REDIS_SLAVEOF_IP, REDIS_SLAVEOF_PORT, REDISPASS.
  if [ "${DEV_MODE}" != "n" ]; then
    echo -e "\e[31mEnabled Debug Mode\e[0m"
    set -x
  fi

  # On INT/TERM run the normal exit path; on exit signal the whole process group
  trap "exit" INT TERM
  trap "kill 0" EXIT

  # Prepare
  BACKGROUND_TASKS=()
  echo "Waiting for containers to settle..."
  for i in {30..1}; do
    echo "${i}"
    sleep 1
  done

  if [[ "${USE_WATCHDOG}" =~ ^([nN][oO]|[nN])+$ ]]; then
    echo -e "$(date) - USE_WATCHDOG=n, skipping watchdog..."
    sleep 365d
    # Quoted so that a script path containing blanks is re-executed correctly
    exec "$(readlink -f "$0")"
  fi

  if [[ "${WATCHDOG_VERBOSE}" =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    SMTP_VERBOSE="--verbose"
    CURL_VERBOSE="--verbose"
    set -xv
  else
    SMTP_VERBOSE=""
    CURL_VERBOSE=""
    # Non-verbose mode discards everything written to stderr from here on
    exec 2>/dev/null
  fi

  # Checks pipe their corresponding container name in this pipe
  if [[ ! -p /tmp/com_pipe ]]; then
    mkfifo /tmp/com_pipe
  fi

  # Wait for containers
  # Credentials are quoted so that blanks or glob characters in them survive
  while ! mariadb-admin status --ssl=false --socket=/var/run/mysqld/mysqld.sock -u"${DBUSER}" -p"${DBPASS}" --silent; do
    echo "Waiting for SQL..."
    sleep 2
  done

  # Do not attempt to write to slave
  # NOTE: REDIS_CMDLINE stays a plain string; it is expanded unquoted (word-split)
  # wherever it is used, so it must not be turned into an array here.
  if [[ -n ${REDIS_SLAVEOF_IP} ]]; then
    REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT} -a ${REDISPASS} --no-auth-warning"
  else
    REDIS_CMDLINE="redis-cli -h redis -p 6379 -a ${REDISPASS} --no-auth-warning"
  fi
  until [[ $(${REDIS_CMDLINE} PING) == "PONG" ]]; do
    echo "Waiting for Redis..."
    sleep 2
  done
  ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
  49. # Common functions
  #######################################
  # Look up an IPv6 address via one of two lookup hosts.
  # Each attempt picks a host at random, fetches it with curl over IPv6 and keeps
  # the first returned line that looks like an IPv6 address. Up to 10 attempts are
  # made, with a one second pause before every retry.
  # Arguments: none
  # Outputs:   the address on stdout, or an empty line if no attempt succeeded
  #######################################
  get_ipv6() {
    local ipv6=""
    local try=0
    local src
    local -a ipv6_srcs=("ip6.mailcow.email" "ip6.nevondo.com")
    while [[ -z "${ipv6}" ]] && (( try < 10 )); do
      # Pause only before a retry, never after a successful lookup
      if (( try > 0 )); then
        sleep 1
      fi
      src="${ipv6_srcs[RANDOM % ${#ipv6_srcs[@]}]}"
      ipv6=$(curl --connect-timeout 3 -m 10 -L6s "${src}" | grep -m 1 "^\([0-9a-fA-F]\{0,4\}:\)\{1,7\}[0-9a-fA-F]\{0,4\}$")
      try=$((try + 1))
    done
    printf '%s\n' "${ipv6}"
  }
  #######################################
  # Store in the array named $1 every element of the array named $2 that does
  # not occur in the array named $3 (result is sorted).
  # Based on https://stackoverflow.com/questions/2312762, Alex Offshore
  # Arguments:
  #   $1 - name of the result array (assigned via mapfile)
  #   $2 - name of the array to take elements from
  #   $3 - name of the array whose elements are removed
  # The source arrays are read through indirect expansion instead of eval, and
  # an empty source array contributes no lines at all, so the result no longer
  # contains a spurious empty element when $2 is empty.
  #######################################
  array_diff() {
    local _ad_src1="${2}[@]"
    local _ad_src2="${3}[@]"
    local -a _ad_arr1=("${!_ad_src1}")
    local -a _ad_arr2=("${!_ad_src2}")
    mapfile -t "${1}" < <(
      comm -23 \
        <(if ((${#_ad_arr1[@]})); then printf '%s\n' "${_ad_arr1[@]}" | sort; fi) \
        <(if ((${#_ad_arr2[@]})); then printf '%s\n' "${_ad_arr2[@]}" | sort; fi)
    )
  }
  #######################################
  # Report the health level of a service to Redis (WATCHDOG_LOG) and to the log.
  # Globals:   REDIS_CMDLINE (read, expanded unquoted on purpose: it is a command string)
  # Arguments:
  #   $1 - service name
  #   $2 - total health points
  #   $3 - current health points (negative values are reported as 0)
  #   $4 - health trend, defaults to 0
  # Returns:
  #   10 if the current health is 0 or below (dead service), 0 otherwise.
  #   Nothing is reported (status 0) if $2 or $3 is empty, if $3 exceeds $2,
  #   or if $2 is not positive.
  #######################################
  progress() {
    local service=${1}
    local total=${2}
    local current=${3}
    local diff=${4}
    local percent
    [[ -z ${diff} ]] && diff=0
    if [[ -z ${total} || -z ${current} ]]; then
      return 0
    fi
    if [[ ${current} -gt ${total} ]]; then
      return 0
    fi
    # Protect the percentage calculation below from a division by zero
    if (( total <= 0 )); then
      return 0
    fi
    [[ ${current} -lt 0 ]] && current=0
    # Integer percentage, rounded half up
    percent=$(( 200 * current / total % 2 + 100 * current / total ))
    ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${service}\",\"lvl\":\"${percent}\",\"hpnow\":\"${current}\",\"hptotal\":\"${total}\",\"hpdiff\":\"${diff}\"}" > /dev/null
    log_msg "${service} health level: ${percent}% (${current}/${total}), health trend: ${diff}" no_redis
    # Return 10 to indicate a dead service
    if [ "${current}" -le 0 ]; then
      return 10
    fi
    return 0
  }
  #######################################
  # Write a log line "<date> <message>" to stdout and, unless $2 is "no_redis",
  # push the message to the Redis list WATCHDOG_LOG as a small JSON object.
  # Globals:   REDIS_CMDLINE (read, expanded unquoted on purpose: it is a command string)
  # Arguments:
  #   $1 - message text
  #   $2 - pass "no_redis" to skip the Redis push
  # Outputs:   one line on stdout; runs of whitespace (including newlines) in the
  #            date and message are collapsed to single blanks
  #######################################
  log_msg() {
    local IFS=$' \t\n'
    local -a words
    if [[ ${2} != "no_redis" ]]; then
      # The listed characters are replaced by blanks before the text is embedded in JSON
      ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"message\":\"$(printf '%s' "${1}" | tr '\r\n%&;$"_[]{}-' ' ')\"}" > /dev/null
    fi
    # Split into words without pathname expansion, so that a message containing
    # *, ? or [ is printed literally instead of being replaced by file names
    read -r -d '' -a words <<< "$(date) ${1}"
    printf '%s\n' "${words[*]}"
  }
  #######################################
  # Send a watchdog notification by e-mail (smtp-cli) and/or webhook (curl).
  # Globals read: WATCHDOG_NOTIFY_EMAIL, WATCHDOG_NOTIFY_WEBHOOK,
  #   WATCHDOG_NOTIFY_WEBHOOK_BODY, WATCHDOG_SUBJECT, MAILCOW_HOSTNAME,
  #   SMTP_VERBOSE, CURL_VERBOSE, REDIS_CMDLINE
  # Globals written: THROTTLE, BODY, SUBJECT, TTL_LEFT, MAIL_RCPTS, RCPT_DOMAIN,
  #   CHECK_FOR_VALID_MX, ESCAPED_SUBJECT, ESCAPED_BODY, WEBHOOK_BODY, and
  #   WATCHDOG_NOTIFY_EMAIL (surrounding double quotes are stripped)
  # Arguments:
  #   $1 - topic/service name: used in the subject, in the Redis throttle key
  #        THROTTLE_$1 and as the name of an optional body file /tmp/$1
  #   $2 - optional message text; a generic "restarted" text is used if empty
  #   $3 - optional throttle time in seconds
  # Returns:
  #   0 if neither e-mail nor webhook notification is configured,
  #   1 if $1 is empty, the notification is throttled, or the webhook is enabled
  #     without a body template
  #######################################
  function notify_error() {
    local rcpt
    local -a smtp_opts
    # Check if one of the notification options is enabled
    [[ -z ${WATCHDOG_NOTIFY_EMAIL} ]] && [[ -z ${WATCHDOG_NOTIFY_WEBHOOK} ]] && return 0
    THROTTLE=
    [[ -z ${1} ]] && return 1
    # If exists, body will be the content of "/tmp/${1}", even if ${2} is set
    if [[ -z ${2} ]]; then
      BODY="Service was restarted on $(date), please check your mailcow installation."
    else
      BODY="$(date) - ${2}"
    fi
    # If exists, mail will be throttled by argument in seconds
    [[ -n ${3} ]] && THROTTLE=${3}
    if [[ -n ${THROTTLE} ]]; then
      TTL_LEFT="$(${REDIS_CMDLINE} TTL THROTTLE_${1} 2> /dev/null)"
      if [[ "${TTL_LEFT}" == "-2" ]]; then
        # Delay key not found, setting a delay key now
        ${REDIS_CMDLINE} SET THROTTLE_${1} 1 EX ${THROTTLE}
      else
        log_msg "Not sending notification email now, blocked for ${TTL_LEFT} seconds..."
        return 1
      fi
    fi
    WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" | sed 's/"//;s|"$||')
    # Some exceptions for subject and body formats
    if [[ ${1} == "fail2ban" ]]; then
      SUBJECT="${BODY}"
      BODY="Please see netfilter-mailcow for more details and triggered rules."
    else
      SUBJECT="${WATCHDOG_SUBJECT}: ${1}"
    fi
    # Send mail notification if enabled
    if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
      IFS=',' read -r -a MAIL_RCPTS <<< "${WATCHDOG_NOTIFY_EMAIL}"
      for rcpt in "${MAIL_RCPTS[@]}"; do
        # Tolerate blanks around the comma-separated addresses
        rcpt="${rcpt//[[:space:]]/}"
        [[ -z ${rcpt} ]] && continue
        RCPT_DOMAIN=$(echo ${rcpt} | awk -F @ {'print $NF'})
        CHECK_FOR_VALID_MX=$(dig +short ${RCPT_DOMAIN} mx)
        if [[ -z ${CHECK_FOR_VALID_MX} ]]; then
          log_msg "Cannot determine MX for ${rcpt}, skipping email notification..."
          # Skip only this recipient; the remaining recipients and the webhook
          # are still notified
          continue
        fi
        [ -f "/tmp/${1}" ] && BODY="/tmp/${1}"
        smtp_opts=(--missing-modules-ok)
        # Pass the verbosity switch only when set, never as an empty argument
        [[ -n ${SMTP_VERBOSE} ]] && smtp_opts+=("${SMTP_VERBOSE}")
        smtp_opts+=(
          --charset=UTF-8
          --subject="${SUBJECT}"
          --body-plain="${BODY}"
          --add-header="X-Priority: 1"
          --to="${rcpt}"
          --from="watchdog@${MAILCOW_HOSTNAME}"
          --hello-host="${MAILCOW_HOSTNAME}"
          --ipv4
        )
        timeout 10s ./smtp-cli "${smtp_opts[@]}"
        if [[ $? -eq 1 ]]; then # exit code 1 is fine
          log_msg "Sent notification email to ${rcpt}"
        else
          if [[ "${SMTP_VERBOSE}" == "" ]]; then
            log_msg "Error while sending notification email to ${rcpt}. You can enable verbose logging by setting 'WATCHDOG_VERBOSE=y' in mailcow.conf."
          else
            log_msg "Error while sending notification email to ${rcpt}."
          fi
        fi
      done
    fi
    # Send webhook notification if enabled
    if [[ -n ${WATCHDOG_NOTIFY_WEBHOOK} ]]; then
      if [[ -z ${WATCHDOG_NOTIFY_WEBHOOK_BODY} ]]; then
        log_msg "No webhook body set, skipping webhook notification..."
        return 1
      fi
      # Escape subject and body (https://stackoverflow.com/a/2705678)
      # The unquoted expansions flatten whitespace and newlines to single blanks
      # before the text is used as a sed replacement
      ESCAPED_SUBJECT=$(echo ${SUBJECT} | sed -e 's/[\/&]/\\&/g')
      ESCAPED_BODY=$(echo ${BODY} | sed -e 's/[\/&]/\\&/g')
      # Replace subject and body placeholders
      WEBHOOK_BODY=$(echo ${WATCHDOG_NOTIFY_WEBHOOK_BODY} | sed -e "s/\$SUBJECT\|\${SUBJECT}/$ESCAPED_SUBJECT/g" -e "s/\$BODY\|\${BODY}/$ESCAPED_BODY/g")
      # POST to webhook
      curl -X POST -H "Content-Type: application/json" ${CURL_VERBOSE} -d "${WEBHOOK_BODY}" ${WATCHDOG_NOTIFY_WEBHOOK}
      log_msg "Sent notification using webhook"
    fi
  }
  #######################################
  # Resolve the IPv4 address of a container, trying up to 5 times.
  # With IP_BY_DOCKER_API equal to 0 the name is resolved with dig; otherwise the
  # dockerapi service is queried for containers of this compose project whose
  # service label equals $1, and the first address containing IPV4_NETWORK is used.
  # Globals:   IP_BY_DOCKER_API, COMPOSE_PROJECT_NAME, IPV4_NETWORK (read)
  # Arguments: $1 - container/service name
  # Outputs:   the address on stdout, or 240.0.0.0 if no valid IPv4 address was
  #            obtained. An address found on the last attempt is returned as well
  #            (previously it was discarded in favour of 240.0.0.0).
  #######################################
  get_container_ip() {
    # ${1} is container
    local container_ip=""
    local loop_c=1
    local matched_container ip_match
    local -a container_id=()
    local -a container_ips=()
    local ipv4_re='^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$'
    until [[ ${container_ip} =~ ${ipv4_re} ]] || [[ ${loop_c} -gt 5 ]]; do
      if [ "${IP_BY_DOCKER_API}" -eq 0 ]; then
        container_ip=$(dig a "${1}" +short)
      else
        sleep 0.5
        # get long container id for exact match
        container_id=($(curl --silent --insecure https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id"))
        # returned id can have multiple elements (if scaled), shuffle for random test
        container_id=($(printf "%s\n" "${container_id[@]}" | shuf))
        if [[ -n ${container_id} ]]; then
          for matched_container in "${container_id[@]}"; do
            container_ips=($(curl --silent --insecure https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/${matched_container}/json | jq -r '.NetworkSettings.Networks[].IPAddress'))
            for ip_match in "${container_ips[@]}"; do
              # grep will do nothing if one of these vars is empty
              [[ -z ${ip_match} ]] && continue
              [[ -z ${IPV4_NETWORK} ]] && continue
              # only return ips that are part of our network
              if grep -q "${IPV4_NETWORK}" <<< "${ip_match}"; then
                container_ip=${ip_match}
                break
              fi
            done
            [[ -n ${container_ip} ]] && break
          done
        fi
      fi
      loop_c=$((loop_c + 1))
    done
    # Decide on the result itself, not on the attempt counter: the counter is
    # also past 5 when the fifth attempt succeeded
    if [[ ${container_ip} =~ ${ipv4_re} ]]; then
      echo "${container_ip}"
    else
      echo 240.0.0.0
    fi
  }
  # One-time check
  # If the first three groups of IPV6_NETWORK show up in the output of "ip a s",
  # an IPv6 lookup is attempted and a notification is sent when it yields nothing.
  # The check is skipped when IPV6_NETWORK is empty: an empty grep pattern matches
  # every line and would otherwise trigger the lookup (and a possible false alarm).
  if [[ -n "${IPV6_NETWORK}" ]] && grep -qi "$(echo ${IPV6_NETWORK} | cut -d: -f1-3)" <<< "$(ip a s)"; then
    if [[ -z "$(get_ipv6)" ]]; then
      notify_error "ipv6-config" "enable_ipv6 is true in docker-compose.yml, but an IPv6 link could not be established. Please verify your IPv6 connection."
    fi
  fi
  #######################################
  # Check loop for the external checks service (https://checks.mailcow.email).
  # Posts the installation GUID over IPv4 and IPv6; every "critical" response adds
  # one error and stores the response's .out field in /tmp/external_checks.
  # A run without new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches EXTERNAL_CHECKS_THRESHOLD.
  # Globals:  EXTERNAL_CHECKS_THRESHOLD, DBUSER, DBPASS, DBNAME (read);
  #           err_count, diff_c, err_c_cur, THRESHOLD, GUID, CHECK_REPONSE,
  #           CHECK_REPONSE6 (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  external_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${EXTERNAL_CHECKS_THRESHOLD}
    GUID=$(mariadb --skip-ssl -u${DBUSER} -p${DBPASS} ${DBNAME} -e "SELECT version FROM versions WHERE application = 'GUID'" -BN)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      err_c_cur=${err_count}
      CHECK_REPONSE="$(curl --connect-timeout 3 -m 10 -4 -s https://checks.mailcow.email -X POST -dguid=${GUID} 2> /dev/null)"
      if [[ -n "${CHECK_REPONSE}" ]] && [[ "$(echo ${CHECK_REPONSE} | jq -r .response)" == "critical" ]]; then
        echo ${CHECK_REPONSE} | jq -r .out > /tmp/external_checks
        err_count=$(( ${err_count} + 1 ))
      fi
      CHECK_REPONSE6="$(curl --connect-timeout 3 -m 10 -6 -s https://checks.mailcow.email -X POST -dguid=${GUID} 2> /dev/null)"
      if [[ -n "${CHECK_REPONSE6}" ]] && [[ "$(echo ${CHECK_REPONSE6} | jq -r .response)" == "critical" ]]; then
        # Store the IPv6 response here (the IPv4 response was written before)
        echo ${CHECK_REPONSE6} | jq -r .out > /tmp/external_checks
        err_count=$(( ${err_count} + 1 ))
      fi
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "External checks" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 20 ) + 1800 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for nginx-mailcow.
  # Runs the check_http plugin against port 8081 of the container IP; the plugin's
  # exit status is added to err_count, its output is appended to /tmp/nginx-mailcow
  # (trimmed to the last 50 lines per round). A round without new errors lowers
  # err_count by one (not below 0). Loops until err_count reaches NGINX_THRESHOLD.
  # Globals:  NGINX_THRESHOLD (read); err_count, diff_c, err_c_cur, host_ip,
  #           THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  nginx_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${NGINX_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/nginx-mailcow; echo "$(tail -50 /tmp/nginx-mailcow)" > /tmp/nginx-mailcow
      host_ip=$(get_container_ip nginx-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u / -p 8081 2>> /tmp/nginx-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Nginx" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for unbound-mailcow.
  # Runs /usr/lib/mailcow/check_dns.sh against the container IP (exit status is
  # added to err_count) and looks for the "ad" flag in the output of
  # "dig com +dnssec" (one more error if it is missing). Output goes to
  # /tmp/unbound-mailcow (trimmed to the last 50 lines per round). A round without
  # new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches UNBOUND_THRESHOLD.
  # Globals:  UNBOUND_THRESHOLD (read); err_count, diff_c, err_c_cur, host_ip,
  #           THRESHOLD, DNSSEC (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  unbound_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${UNBOUND_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/unbound-mailcow; echo "$(tail -50 /tmp/unbound-mailcow)" > /tmp/unbound-mailcow
      host_ip=$(get_container_ip unbound-mailcow)
      err_c_cur=${err_count}
      /usr/lib/mailcow/check_dns.sh -s ${host_ip} -H stackoverflow.com 2>> /tmp/unbound-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      DNSSEC=$(dig com +dnssec | grep -E 'flags:.+ad')
      if [[ -z ${DNSSEC} ]]; then
        echo "DNSSEC failure" 2>> /tmp/unbound-mailcow 1>&2
        err_count=$(( ${err_count} + 1))
      else
        echo "DNSSEC check succeeded" 2>> /tmp/unbound-mailcow 1>&2
      fi
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Unbound" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for the local redis container.
  # Runs the check_tcp plugin against redis-mailcow:6379, sending AUTH and PING and
  # expecting PONG; the plugin's exit status is added to err_count, its output is
  # appended to /tmp/redis-mailcow (trimmed to the last 50 lines per round).
  # A round without new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches REDIS_THRESHOLD.
  # Globals:  REDIS_THRESHOLD, REDISPASS (read); err_count, diff_c, err_c_cur,
  #           host_ip, THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  redis_checks() {
    # A check for the local redis container
    err_count=0
    diff_c=0
    THRESHOLD=${REDIS_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/redis-mailcow; echo "$(tail -50 /tmp/redis-mailcow)" > /tmp/redis-mailcow
      # NOTE: host_ip is looked up but the plugin below is given the host name
      host_ip=$(get_container_ip redis-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -4 -H redis-mailcow -p 6379 -E -s "AUTH ${REDISPASS}\nPING\n" -q "QUIT" -e "PONG" 2>> /tmp/redis-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Redis" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for MySQL/MariaDB.
  # Runs the check_mysql and check_mysql_query plugins over the local socket;
  # their exit statuses are added to err_count, their output is appended to
  # /tmp/mysql-mailcow (trimmed to the last 50 lines per round). A round without
  # new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches MYSQL_THRESHOLD.
  # Globals:  MYSQL_THRESHOLD, DBUSER, DBPASS, DBNAME (read); err_count, diff_c,
  #           err_c_cur, THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  mysql_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MYSQL_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/mysql-mailcow; echo "$(tail -50 /tmp/mysql-mailcow)" > /tmp/mysql-mailcow
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_mysql -s /var/run/mysqld/mysqld.sock -u ${DBUSER} -p ${DBPASS} -d ${DBNAME} 2>> /tmp/mysql-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      /usr/lib/nagios/plugins/check_mysql_query -s /var/run/mysqld/mysqld.sock -u ${DBUSER} -p ${DBPASS} -d ${DBNAME} -q "SELECT COUNT(*) FROM information_schema.tables" 2>> /tmp/mysql-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "MySQL/MariaDB" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for MySQL/MariaDB replication.
  # Runs the check_mysql_slavestatus.sh plugin as root over the local socket; its
  # exit status is added to err_count, its output is appended to
  # /tmp/mysql_repl_checks (trimmed to the last 50 lines per round). A round
  # without new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches MYSQL_REPLICATION_THRESHOLD.
  # Globals:  MYSQL_REPLICATION_THRESHOLD, DBROOT (read); err_count, diff_c,
  #           err_c_cur, THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  mysql_repl_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MYSQL_REPLICATION_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/mysql_repl_checks; echo "$(tail -50 /tmp/mysql_repl_checks)" > /tmp/mysql_repl_checks
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_mysql_slavestatus.sh -S /var/run/mysqld/mysqld.sock -u root -p ${DBROOT} 2>> /tmp/mysql_repl_checks 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "MySQL/MariaDB replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for sogo-mailcow.
  # Runs the check_http plugin against /SOGo.index/ on port 20000 of the container
  # IP; the plugin's exit status is added to err_count, its output is appended to
  # /tmp/sogo-mailcow (trimmed to the last 50 lines per round). A round without
  # new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches SOGO_THRESHOLD.
  # Globals:  SOGO_THRESHOLD (read); err_count, diff_c, err_c_cur, host_ip,
  #           THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  sogo_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${SOGO_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/sogo-mailcow; echo "$(tail -50 /tmp/sogo-mailcow)" > /tmp/sogo-mailcow
      host_ip=$(get_container_ip sogo-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_http -4 -H ${host_ip} -u /SOGo.index/ -p 20000 2>> /tmp/sogo-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "SOGo" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for postfix-mailcow.
  # Runs the check_smtp plugin twice against port 589 of the container IP (an SMTP
  # dialogue expecting 250, and a run with -S); exit statuses are added to
  # err_count, output is appended to /tmp/postfix-mailcow (trimmed to the last 50
  # lines per round). A round without new errors lowers err_count by one (not
  # below 0). Loops until err_count reaches POSTFIX_THRESHOLD.
  # Globals:  POSTFIX_THRESHOLD (read); err_count, diff_c, err_c_cur, host_ip,
  #           THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  postfix_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${POSTFIX_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/postfix-mailcow; echo "$(tail -50 /tmp/postfix-mailcow)" > /tmp/postfix-mailcow
      host_ip=$(get_container_ip postfix-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 589 -f "watchdog@invalid" -C "RCPT TO:watchdog@localhost" -C DATA -C . -R 250 2>> /tmp/postfix-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      /usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 589 -S 2>> /tmp/postfix-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Postfix" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for postfix-tlspol-mailcow.
  # Runs the check_tcp plugin against port 8642 of the container IP; the plugin's
  # exit status is added to err_count, its output is appended to
  # /tmp/postfix-tlspol-mailcow (trimmed to the last 50 lines per round). A round
  # without new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches POSTFIX_TLSPOL_THRESHOLD.
  # Globals:  POSTFIX_TLSPOL_THRESHOLD (read); err_count, diff_c, err_c_cur,
  #           host_ip, THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  postfix-tlspol_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${POSTFIX_TLSPOL_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/postfix-tlspol-mailcow; echo "$(tail -50 /tmp/postfix-tlspol-mailcow)" > /tmp/postfix-tlspol-mailcow
      host_ip=$(get_container_ip postfix-tlspol-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 8642 2>> /tmp/postfix-tlspol-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Postfix TLS Policy companion" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for clamd-mailcow.
  # Runs the check_clamd plugin against the container IP; the plugin's exit status
  # is added to err_count, its output is appended to /tmp/clamd-mailcow (trimmed
  # to the last 50 lines per round). A round without new errors lowers err_count
  # by one (not below 0). Loops until err_count reaches CLAMD_THRESHOLD.
  # Globals:  CLAMD_THRESHOLD (read); err_count, diff_c, err_c_cur, host_ip,
  #           THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  clamd_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${CLAMD_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/clamd-mailcow; echo "$(tail -50 /tmp/clamd-mailcow)" > /tmp/clamd-mailcow
      host_ip=$(get_container_ip clamd-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_clamd -4 -H ${host_ip} 2>> /tmp/clamd-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Clamd" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 120 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for dovecot-mailcow.
  # Runs five plugin checks against the container IP: check_smtp on port 24,
  # check_imap on ports 993 (with -S) and 143, check_tcp on ports 10001 and 4190.
  # Exit statuses are added to err_count, output is appended to
  # /tmp/dovecot-mailcow (trimmed to the last 50 lines per round). A round without
  # new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches DOVECOT_THRESHOLD.
  # Globals:  DOVECOT_THRESHOLD (read); err_count, diff_c, err_c_cur, host_ip,
  #           THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  dovecot_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${DOVECOT_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/dovecot-mailcow; echo "$(tail -50 /tmp/dovecot-mailcow)" > /tmp/dovecot-mailcow
      host_ip=$(get_container_ip dovecot-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -4 -H ${host_ip} -p 24 -f "watchdog@invalid" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" 2>> /tmp/dovecot-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      /usr/lib/nagios/plugins/check_imap -4 -H ${host_ip} -p 993 -S -e "OK " 2>> /tmp/dovecot-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      /usr/lib/nagios/plugins/check_imap -4 -H ${host_ip} -p 143 -e "OK " 2>> /tmp/dovecot-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      /usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 10001 -e "VERSION" 2>> /tmp/dovecot-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      /usr/lib/nagios/plugins/check_tcp -4 -H ${host_ip} -p 4190 -e "Dovecot ready" 2>> /tmp/dovecot-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Dovecot" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for Dovecot replication.
  # Reads the Redis key DOVECOT_REPL_HEALTH each round; any value other than "1"
  # adds one error. A round without new errors lowers err_count by one (not below
  # 0). Loops until err_count reaches DOVECOT_REPL_THRESHOLD.
  # Globals:  DOVECOT_REPL_THRESHOLD, REDISPASS (read); err_count, diff_c,
  #           err_c_cur, THRESHOLD, D_REPL_STATUS (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  dovecot_repl_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${DOVECOT_REPL_THRESHOLD}
    # The former query before the loop was dropped: it passed "-r GET" (-r takes a
    # repeat count) and its result was overwritten before it was ever read.
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      err_c_cur=${err_count}
      D_REPL_STATUS=$(redis-cli --raw -h redis -a ${REDISPASS} --no-auth-warning GET DOVECOT_REPL_HEALTH)
      if [[ "${D_REPL_STATUS}" != "1" ]]; then
        err_count=$(( ${err_count} + 1 ))
      fi
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Dovecot replication" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for the primary certificate's expiry.
  # Runs check_smtp (port 589) and check_imap (port 993) with "-S -D 7" against
  # the postfix and dovecot addresses; exit statuses are added to err_count,
  # output is appended to /tmp/certcheck (trimmed to the last 50 lines per round).
  # A round without new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches the fixed threshold of 7, sleeping 300 seconds
  # per round.
  # Globals:  err_count, diff_c, err_c_cur, host_ip_postfix, host_ip_dovecot,
  #           THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  cert_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=7
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/certcheck; echo "$(tail -50 /tmp/certcheck)" > /tmp/certcheck
      host_ip_postfix=$(get_container_ip postfix)
      host_ip_dovecot=$(get_container_ip dovecot)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -H ${host_ip_postfix} -p 589 -4 -S -D 7 2>> /tmp/certcheck 1>&2
      err_count=$(( ${err_count} + $? ))
      /usr/lib/nagios/plugins/check_imap -H ${host_ip_dovecot} -p 993 -4 -S -D 7 2>> /tmp/certcheck 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Primary certificate expiry check" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      # Always sleep 5 minutes, mail notifications are limited
      sleep 300
    done
    return 1
  }
  #######################################
  # Check loop for php-fpm-mailcow.
  # Runs the check_tcp plugin against ports 9001 and 9002 of the container IP;
  # exit statuses are added to err_count, output is appended to
  # /tmp/php-fpm-mailcow (trimmed to the last 50 lines per round). A round without
  # new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches PHPFPM_THRESHOLD.
  # Globals:  PHPFPM_THRESHOLD (read); err_count, diff_c, err_c_cur, host_ip,
  #           THRESHOLD (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  phpfpm_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${PHPFPM_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/php-fpm-mailcow; echo "$(tail -50 /tmp/php-fpm-mailcow)" > /tmp/php-fpm-mailcow
      host_ip=$(get_container_ip php-fpm-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -H ${host_ip} -p 9001 2>> /tmp/php-fpm-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      /usr/lib/nagios/plugins/check_tcp -H ${host_ip} -p 9002 2>> /tmp/php-fpm-mailcow 1>&2
      err_count=$(( ${err_count} + $? ))
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "PHP-FPM" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for applied ratelimits.
  # Compares the .qid of the newest RL_LOG entry in Redis with the value seen in
  # the previous round; a change adds one error and writes a report with the
  # entries returned by "LRANGE RL_LOG 0 10" to /tmp/ratelimit. A round without
  # new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches RATELIMIT_THRESHOLD.
  # Globals:  RATELIMIT_THRESHOLD, REDISPASS (read); err_count, diff_c, err_c_cur,
  #           THRESHOLD, RL_LOG_STATUS, RL_LOG_STATUS_PREV (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  ratelimit_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${RATELIMIT_THRESHOLD}
    RL_LOG_STATUS=$(redis-cli -h redis -a ${REDISPASS} --no-auth-warning LRANGE RL_LOG 0 0 | jq .qid)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      err_c_cur=${err_count}
      RL_LOG_STATUS_PREV=${RL_LOG_STATUS}
      RL_LOG_STATUS=$(redis-cli -h redis -a ${REDISPASS} --no-auth-warning LRANGE RL_LOG 0 0 | jq .qid)
      # Right-hand side quoted: compare literally, not as a glob pattern
      if [[ "${RL_LOG_STATUS_PREV}" != "${RL_LOG_STATUS}" ]]; then
        err_count=$(( ${err_count} + 1 ))
        echo 'Last 10 applied ratelimits (may overlap with previous reports).' > /tmp/ratelimit
        echo 'Full ratelimit buckets can be emptied by deleting the ratelimit hash from within mailcow UI (see /debug -> Protocols -> Ratelimit):' >> /tmp/ratelimit
        echo >> /tmp/ratelimit
        redis-cli --raw -h redis -a ${REDISPASS} --no-auth-warning LRANGE RL_LOG 0 10 | jq . >> /tmp/ratelimit
      fi
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Ratelimit" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Check loop for the mail queue.
  # Counts the files below /var/spool/postfix/deferred each round and logs the
  # count to /tmp/mail_queue_status (trimmed to the last 50 lines per round);
  # a count of MAILQ_CRIT or more adds one error and logs the line once more.
  # A round without new errors lowers err_count by one (not below 0).
  # Loops until err_count reaches MAILQ_THRESHOLD.
  # Globals:  MAILQ_THRESHOLD, MAILQ_CRIT (read); err_count, diff_c, err_c_cur,
  #           THRESHOLD, MAILQ_LOG_STATUS (written)
  # Returns:  1 once the error threshold has been reached
  #######################################
  mailq_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MAILQ_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer expansion until USR1 arrives; in double quotes the
    # handler was expanded once with err_count=0 and never changed anything.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ ${err_count} -lt ${THRESHOLD} ]; do
      touch /tmp/mail_queue_status; echo "$(tail -50 /tmp/mail_queue_status)" > /tmp/mail_queue_status
      MAILQ_LOG_STATUS=$(find /var/spool/postfix/deferred -type f | wc -l)
      echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
      err_c_cur=${err_count}
      if [ ${MAILQ_LOG_STATUS} -ge ${MAILQ_CRIT} ]; then
        err_count=$(( ${err_count} + 1 ))
        echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
      fi
      [ ${err_c_cur} -eq ${err_count} ] && [ ! $((${err_count} - 1)) -lt 0 ] && err_count=$((${err_count} - 1)) diff_c=1
      [ ${err_c_cur} -ne ${err_count} ] && diff_c=$(( ${err_c_cur} - ${err_count} ))
      progress "Mail queue" ${THRESHOLD} $(( ${THRESHOLD} - ${err_count} )) ${diff_c}
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Watchdog agent: detects changes in the set of active bans.
  # Reads the keys of the Redis hash F2B_ACTIVE_BANS on every poll and hands
  # the current and previous key lists to array_diff (defined elsewhere). When
  # F2B_RES is non-empty afterwards, the error counter is raised by one and the
  # sanitized result is stored in the Redis key F2B_RES.
  # Globals:
  #   FAIL2BAN_THRESHOLD, REDIS_CMDLINE (read)
  #   THRESHOLD, err_count, err_c_cur, diff_c, F2B_LOG_STATUS,
  #   F2B_LOG_STATUS_PREV, F2B_RES (written)
  # Returns:
  #   1 once err_count reaches THRESHOLD; keeps looping until then.
  #######################################
  fail2ban_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${FAIL2BAN_THRESHOLD}
    # REDIS_CMDLINE holds a command plus its arguments, so it has to stay
    # unquoted; the reply is split on whitespace into one array item per key.
    F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
    F2B_RES=
    # Reduce error count by 2 after restarting an unhealthy container.
    # The handler must be single-quoted: inside double quotes ${err_count} is
    # expanded right here (while it is still 0), which turns the handler into
    # a no-op that never lowers the counter.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      # Quoted copy so the elements are not split or glob-expanded again.
      F2B_LOG_STATUS_PREV=("${F2B_LOG_STATUS[@]}")
      F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
      array_diff F2B_RES F2B_LOG_STATUS F2B_LOG_STATUS_PREV
      if [[ -n "${F2B_RES}" ]]; then
        err_count=$(( err_count + 1 ))
        # Strip everything but the allowed characters, then store the list via
        # stdin (-x) so it never appears on a command line.
        echo -n "${F2B_RES[@]}" | tr -cd "[a-fA-F0-9.:/] " | timeout 3s ${REDIS_CMDLINE} -x SET F2B_RES > /dev/null
        if [ $? -ne 0 ]; then
          # Storing failed: drop any stale value. No -x here - with -x
          # redis-cli would read stdin and append it as a further key to delete.
          ${REDIS_CMDLINE} DEL F2B_RES
        fi
      fi
      if [ "${err_c_cur}" -eq "${err_count}" ]; then
        # No change this round: let the counter decay, but never below 0.
        if [ "${err_count}" -gt 0 ]; then
          err_count=$(( err_count - 1 ))
          diff_c=1
        fi
      else
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Fail2ban" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress is defined elsewhere; its exit status 10 selects the short pause.
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Watchdog agent: detects a change of the Redis key ACME_FAIL_TIME.
  # The key is initialised to 0 when it is empty. Every poll re-reads it (up to
  # three attempts); a value that differs from the previous poll raises the
  # error counter by one, an unchanged value lowers it by one.
  # Globals:
  #   ACME_THRESHOLD, REDISPASS, REDIS_CMDLINE (read)
  #   THRESHOLD, err_count, err_c_cur, diff_c, ACME_LOG_STATUS,
  #   ACME_LOG_STATUS_PREV, ACME_LC (written)
  # Returns:
  #   1 once err_count reaches THRESHOLD; keeps looping until then.
  #######################################
  acme_checks() {
    local acme_fetched
    err_count=0
    diff_c=0
    THRESHOLD=${ACME_THRESHOLD}
    ACME_LOG_STATUS=$(redis-cli -h redis -a "${REDISPASS}" --no-auth-warning GET ACME_FAIL_TIME)
    if [[ -z "${ACME_LOG_STATUS}" ]]; then
      ${REDIS_CMDLINE} SET ACME_FAIL_TIME 0
      ACME_LOG_STATUS=0
    fi
    # Reduce error count by 2 after restarting an unhealthy container.
    # The handler must be single-quoted: inside double quotes ${err_count} is
    # expanded right here (while it is still 0), which turns the handler into
    # a no-op that never lowers the counter.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      ACME_LOG_STATUS_PREV=${ACME_LOG_STATUS}
      # Fetch into a scratch variable. Retrying on ACME_LOG_STATUS itself never
      # ran the loop, because that variable is always non-empty at this point,
      # so the value was never refreshed and no change could be detected.
      acme_fetched=
      ACME_LC=0
      until [[ -n "${acme_fetched}" ]] || [ "${ACME_LC}" -ge 3 ]; do
        acme_fetched=$(redis-cli -h redis -a "${REDISPASS}" --no-auth-warning GET ACME_FAIL_TIME 2> /dev/null)
        if [[ -z "${acme_fetched}" ]]; then
          sleep 3
        fi
        ACME_LC=$(( ACME_LC + 1 ))
      done
      # Keep the previous value when all attempts returned nothing, so an
      # unreachable Redis is not counted as an ACME failure.
      if [[ -n "${acme_fetched}" ]]; then
        ACME_LOG_STATUS=${acme_fetched}
      fi
      if [[ "${ACME_LOG_STATUS_PREV}" != "${ACME_LOG_STATUS}" ]]; then
        err_count=$(( err_count + 1 ))
      fi
      if [ "${err_c_cur}" -eq "${err_count}" ]; then
        # No change this round: let the counter decay, but never below 0.
        if [ "${err_count}" -gt 0 ]; then
          err_count=$(( err_count - 1 ))
          diff_c=1
        fi
      else
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "ACME" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress is defined elsewhere; its exit status 10 selects the short pause.
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Watchdog agent: probes Rspamd with two checks per poll.
  #   1. Scans a tiny test message through the Rspamd unix socket and expects
  #      a required_score whose integer part is 9999.
  #   2. Connects to port 9900 on the rspamd-mailcow container and treats only
  #      a curl timeout (exit status 28) as a failure.
  # Each failed check raises the error counter by one; a poll without failures
  # lowers it by one.
  # Globals:
  #   RSPAMD_THRESHOLD, COMPOSE_PROJECT_NAME (read)
  #   THRESHOLD, err_count, err_c_cur, diff_c, host_ip, SCORE (written)
  # Outputs:
  #   Appends check results to /tmp/rspamd-mailcow (trimmed to its last 50
  #   lines at the start of every poll).
  # Returns:
  #   1 once err_count reaches THRESHOLD; keeps looping until then.
  #######################################
  rspamd_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${RSPAMD_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # The handler must be single-quoted: inside double quotes ${err_count} is
    # expanded right here (while it is still 0), which turns the handler into
    # a no-op that never lowers the counter.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      # Keep only the last 50 lines; the substitution is expanded before the
      # redirection truncates the file.
      touch /tmp/rspamd-mailcow
      echo "$(tail -50 /tmp/rspamd-mailcow)" > /tmp/rspamd-mailcow
      host_ip=$(get_container_ip rspamd-mailcow)
      err_c_cur=${err_count}
      # curl is looked up via PATH: the former relative "usr/bin/curl" only
      # resolved when the working directory happened to be /.
      SCORE=$(printf 'To: null@localhost\nFrom: watchdog@localhost\nEmpty\n\n' \
        | curl --max-time 10 -s --data-binary @- --unix-socket /var/lib/rspamd/rspamd.sock \
          "http://rspamd.${COMPOSE_PROJECT_NAME}_mailcow-network/scan" \
        | jq -rc .default.required_score \
        | sed 's/\..*//')
      # Compare as a string: an arithmetic test on an unexpected response could
      # fail to evaluate and would then be reported as success.
      if [[ "${SCORE}" != "9999" ]]; then
        echo "Rspamd settings check failed, score returned: ${SCORE}" >> /tmp/rspamd-mailcow
        err_count=$(( err_count + 1 ))
      else
        echo "Rspamd settings check succeeded, score returned: ${SCORE}" >> /tmp/rspamd-mailcow
      fi
      # A dirty hack until a PING PONG event is implemented to worker proxy
      # We expect an empty response, not a timeout
      if [ "$(curl -s --max-time 10 "${host_ip}:9900" 2> /dev/null; echo $?)" == "28" ]; then
        echo "Milter check failed" >> /tmp/rspamd-mailcow
        err_count=$(( err_count + 1 ))
      else
        echo "Milter check succeeded" >> /tmp/rspamd-mailcow
      fi
      if [ "${err_c_cur}" -eq "${err_count}" ]; then
        # Both checks passed: let the counter decay, but never below 0.
        if [ "${err_count}" -gt 0 ]; then
          err_count=$(( err_count - 1 ))
          diff_c=1
        fi
      else
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Rspamd" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress is defined elsewhere; its exit status 10 selects the short pause.
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  #######################################
  # Watchdog agent: probes the olefy-mailcow container on TCP port 10055.
  # Runs the nagios check_tcp plugin with a "PING\n" payload and adds the
  # plugin's exit status to the error counter; a poll with exit status 0
  # lowers the counter by one.
  # Globals:
  #   OLEFY_THRESHOLD (read)
  #   THRESHOLD, err_count, err_c_cur, diff_c, host_ip (written)
  # Outputs:
  #   Appends the plugin output to /tmp/olefy-mailcow (trimmed to its last 50
  #   lines at the start of every poll).
  # Returns:
  #   1 once err_count reaches THRESHOLD; keeps looping until then.
  #######################################
  olefy_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${OLEFY_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # The handler must be single-quoted: inside double quotes ${err_count} is
    # expanded right here (while it is still 0), which turns the handler into
    # a no-op that never lowers the counter.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      # Keep only the last 50 lines; the substitution is expanded before the
      # redirection truncates the file.
      touch /tmp/olefy-mailcow
      echo "$(tail -50 /tmp/olefy-mailcow)" > /tmp/olefy-mailcow
      host_ip=$(get_container_ip olefy-mailcow)
      err_c_cur=${err_count}
      # $? must be read immediately after the plugin, so both stay on one line.
      /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10055 -s "PING\n" >> /tmp/olefy-mailcow 2>&1; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -eq "${err_count}" ]; then
        # Probe succeeded: let the counter decay, but never below 0.
        if [ "${err_count}" -gt 0 ]; then
          err_count=$(( err_count - 1 ))
          diff_c=1
        fi
      else
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Olefy" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress is defined elsewhere; its exit status 10 selects the short pause.
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Notify about start
  if [[ ${WATCHDOG_NOTIFY_START} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    notify_error "watchdog-mailcow" "Watchdog started monitoring mailcow."
  fi

  #######################################
  # Start one watchdog agent in a background subshell.
  # The subshell calls the check function in an endless loop; every time the
  # function returns non-zero it logs "<label> hit error limit" and writes the
  # token to /tmp/com_pipe, where the main loop picks it up.
  # Globals:
  #   PID (written: PID of the spawned subshell)
  #   BACKGROUND_TASKS (appended with that PID)
  # Arguments:
  #   $1 - name of the check function to run
  #   $2 - human readable label used in the log message
  #   $3 - token written to /tmp/com_pipe
  # Outputs:
  #   "Spawned <function> with PID <pid>" on stdout
  #######################################
  spawn_agent() {
    local check_fn=$1
    local label=$2
    local pipe_token=$3
    (
      while true; do
        if ! "${check_fn}"; then
          log_msg "${label} hit error limit"
          echo "${pipe_token}" > /tmp/com_pipe
        fi
      done
    ) &
    PID=$!
    echo "Spawned ${check_fn} with PID ${PID}"
    BACKGROUND_TASKS+=("${PID}")
  }

  # Create watchdog agents (the order of BACKGROUND_TASKS is kept as before)
  spawn_agent nginx_checks "Nginx" nginx-mailcow
  if [[ ${WATCHDOG_EXTERNAL_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    spawn_agent external_checks "External checks" external_checks
  fi
  if [[ ${WATCHDOG_MYSQL_REPLICATION_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    spawn_agent mysql_repl_checks "MySQL replication check" mysql_repl_checks
  fi
  spawn_agent mysql_checks "MySQL" mysql-mailcow
  spawn_agent redis_checks "Local Redis" redis-mailcow
  spawn_agent phpfpm_checks "PHP-FPM" php-fpm-mailcow
  if [[ "${SKIP_SOGO}" =~ ^([nN][oO]|[nN])+$ ]]; then
    spawn_agent sogo_checks "SOGo" sogo-mailcow
  fi
  # An unset CHECK_UNBOUND counts as "do not check" without a test error.
  if [ "${CHECK_UNBOUND:-0}" -eq 1 ]; then
    spawn_agent unbound_checks "Unbound" unbound-mailcow
  fi
  if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
    spawn_agent clamd_checks "Clamd" clamd-mailcow
  fi
  spawn_agent postfix_checks "Postfix" postfix-mailcow
  spawn_agent mailq_checks "Mail queue" mail_queue_status
  spawn_agent postfix-tlspol_checks "Postfix TLS Policy" postfix-tlspol-mailcow
  spawn_agent dovecot_checks "Dovecot" dovecot-mailcow
  # Was logged as plain "Dovecot", indistinguishable from dovecot_checks above.
  spawn_agent dovecot_repl_checks "Dovecot replication" dovecot_repl_checks
  spawn_agent rspamd_checks "Rspamd" rspamd-mailcow
  spawn_agent ratelimit_checks "Ratelimit" ratelimit
  spawn_agent fail2ban_checks "Fail2ban" fail2ban
  spawn_agent cert_checks "Cert check" certcheck
  if [[ "${SKIP_OLEFY}" =~ ^([nN][oO]|[nN])+$ ]]; then
    spawn_agent olefy_checks "Olefy" olefy-mailcow
  fi
  spawn_agent acme_checks "ACME client" acme-mailcow
  # Supervise the agents: walk the recorded PIDs one by one, 10 seconds apart.
  # When a PID no longer accepts signal 0, log it and send TERM to PID 1 so the
  # watchdog stops and Docker can respawn it (restart:always:n).
  (
    while :; do
      for worker_pid in "${BACKGROUND_TASKS[@]}"; do
        kill -0 "${worker_pid}" >&2 || {
          log_msg "Worker ${worker_pid} died, stopping watchdog and waiting for respawn..."
          kill -TERM 1
        }
        sleep 10
      done
    done
  ) &
  # Watch dockerapi on port 443. While it is unreachable all agents are
  # suspended (STOP); once it answers again they are resumed (CONT) and then
  # sent USR1.
  (
    while :; do
      # Idle for as long as the port answers.
      until ! nc -z dockerapi 443; do
        sleep 3
      done
      log_msg "Cannot find dockerapi-mailcow, waiting to recover..."
      kill -STOP "${BACKGROUND_TASKS[@]}"
      # Poll until the port answers again.
      while ! nc -z dockerapi 443; do
        sleep 3
      done
      for wake_signal in CONT USR1; do
        kill "-${wake_signal}" "${BACKGROUND_TASKS[@]}"
      done
    done
  ) &
  # Main loop: react to the tokens the agents write to /tmp/com_pipe once
  # their threshold is reached. Informational tokens only log and notify; any
  # other token containing "-mailcow" names a container that may be restarted
  # through dockerapi.
  while true; do
    CONTAINER_ID=
    HAS_INITDB=
    read com_pipe_answer </tmp/com_pipe
    # Print the report file the agent left for this token, if it is non-empty.
    [ -s "/tmp/${com_pipe_answer}" ] && cat "/tmp/${com_pipe_answer}"
    case "${com_pipe_answer}" in
      ratelimit)
        log_msg "At least one ratelimit was applied"
        notify_error "${com_pipe_answer}"
        ;;
      mail_queue_status)
        log_msg "Mail queue status is critical"
        notify_error "${com_pipe_answer}"
        ;;
      external_checks)
        log_msg "Your mailcow is an open relay!"
        # Define $2 to override message text, else print service was restarted at ...
        notify_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
        ;;
      mysql_repl_checks)
        log_msg "MySQL replication is not working properly"
        # Define $2 to override message text, else print service was restarted at ...
        # One mail per 10 minutes
        notify_error "${com_pipe_answer}" "Please check the SQL replication status" 600
        ;;
      dovecot_repl_checks)
        log_msg "Dovecot replication is not working properly"
        # Define $2 to override message text, else print service was restarted at ...
        # One mail per 10 minutes
        notify_error "${com_pipe_answer}" "Please check the Dovecot replicator status" 600
        ;;
      certcheck)
        log_msg "Certificates are about to expire"
        # Define $2 to override message text, else print service was restarted at ...
        # Only mail once a day
        notify_error "${com_pipe_answer}" "Please renew your certificate" 86400
        ;;
      acme-mailcow)
        log_msg "acme-mailcow did not complete successfully"
        # Define $2 to override message text, else print service was restarted at ...
        notify_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
        ;;
      fail2ban)
        # Pick up the list stored in the Redis key F2B_RES, one item per word.
        F2B_RES=($(timeout 4s ${REDIS_CMDLINE} --raw GET F2B_RES 2> /dev/null))
        if [[ -n "${F2B_RES}" ]]; then
          ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
          host=
          for host in "${F2B_RES[@]}"; do
            log_msg "Banned ${host}"
            rm /tmp/fail2ban 2> /dev/null
            timeout 2s whois "${host}" > /tmp/fail2ban
            if [[ ${WATCHDOG_NOTIFY_BAN} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
              notify_error "${com_pipe_answer}" "IP ban: ${host}"
            fi
          done
        fi
        ;;
      *?-mailcow*)
        # Any remaining token with at least one character before "-mailcow":
        # pause the agents while the container is inspected.
        kill -STOP "${BACKGROUND_TASKS[@]}"
        sleep 10
        CONTAINER_ID=$(
          curl --silent --insecure https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/json |
            jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" |
            jq -rc "select( .name | tostring | contains(\"${com_pipe_answer}\")) | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id"
        )
        if [[ -n ${CONTAINER_ID} ]]; then
          if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
            # Non-empty when the container's process list shows init_db running.
            HAS_INITDB=$(
              curl --silent --insecure -XPOST https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/${CONTAINER_ID}/top |
                jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' |
                grep true
            )
          fi
          # Seconds elapsed since the container's State.StartedAt timestamp.
          S_RUNNING=$(($(date +%s) - $(curl --silent --insecure https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/${CONTAINER_ID}/json | jq .State.StartedAt | xargs -n1 date +%s -d)))
          if [ ${S_RUNNING} -lt 360 ]; then
            log_msg "Container is running for less than 360 seconds, skipping action..."
          elif [[ -n ${HAS_INITDB} ]]; then
            log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
            sleep 60
          else
            log_msg "Sending restart command to ${CONTAINER_ID}..."
            curl --silent --insecure -XPOST https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/${CONTAINER_ID}/restart
            notify_error "${com_pipe_answer}"
            log_msg "Wait for restarted container to settle and continue watching..."
            sleep 35
          fi
        fi
        # Resume the agents, then send them USR1.
        kill -CONT "${BACKGROUND_TASKS[@]}"
        sleep 1
        kill -USR1 "${BACKGROUND_TASKS[@]}"
        ;;
    esac
  done