watchdog.sh 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133
  1. #!/bin/bash
  # --- Start-up ---------------------------------------------------------------
  # Tracing is switched on unless DEV_MODE is exactly "n".
  if [ "${DEV_MODE}" != "n" ]; then
    echo -e "\e[31mEnabled Debug Mode\e[0m"
    set -x
  fi

  # INT/TERM end the script; the EXIT trap then signals the whole process
  # group (kill 0).
  trap "exit" INT TERM
  trap "kill 0" EXIT

  # Prepare
  BACKGROUND_TASKS=()
  echo "Waiting for containers to settle..."
  for i in {30..1}; do
    echo "${i}"
    sleep 1
  done

  # Watchdog disabled: idle for 365 days, then re-exec this script.
  if [[ "${USE_WATCHDOG}" =~ ^([nN][oO]|[nN])+$ ]]; then
    echo -e "$(date) - USE_WATCHDOG=n, skipping watchdog..."
    sleep 365d
    # Quoted so that a script path containing whitespace stays one word
    exec "$(readlink -f "$0")"
  fi

  # Verbose mode traces everything; otherwise stderr of the whole script is
  # discarded from here on.
  if [[ "${WATCHDOG_VERBOSE}" =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    SMTP_VERBOSE="--verbose"
    CURL_VERBOSE="--verbose"
    set -xv
  else
    SMTP_VERBOSE=""
    CURL_VERBOSE=""
    exec 2>/dev/null
  fi

  # Checks pipe their corresponding container name in this pipe
  if [[ ! -p /tmp/com_pipe ]]; then
    mkfifo /tmp/com_pipe
  fi

  # Wait for containers
  # Credentials are quoted so that whitespace or glob characters in them do not
  # split or expand into extra arguments.
  while ! mariadb-admin status --ssl=false --socket=/var/run/mysqld/mysqld.sock -u"${DBUSER}" -p"${DBPASS}" --silent; do
    echo "Waiting for SQL..."
    sleep 2
  done

  # Do not attempt to write to slave
  # REDIS_CMDLINE stays a plain string because the rest of the file expands it
  # unquoted (${REDIS_CMDLINE} ...) and relies on word splitting.
  if [[ -n "${REDIS_SLAVEOF_IP}" ]]; then
    REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT} -a ${REDISPASS} --no-auth-warning"
  else
    REDIS_CMDLINE="redis-cli -h redis -p 6379 -a ${REDISPASS} --no-auth-warning"
  fi

  until [[ $(${REDIS_CMDLINE} PING) == "PONG" ]]; do
    echo "Waiting for Redis..."
    sleep 2
  done

  ${REDIS_CMDLINE} DEL F2B_RES > /dev/null

  # Common functions
  # Print the IPv6 address reported by one of two external lookup hosts.
  # A host is picked at random for every attempt and the reply is only accepted
  # if a line looks like an IPv6 address. Gives up after 10 attempts; from the
  # second attempt on, each attempt is followed by a one second pause.
  # Outputs: the address (empty line if none was obtained)
  get_ipv6() {
    local address=
    local attempts=0
    local -a endpoints=("ip6.mailcow.email" "ip6.nevondo.com")
    while [[ -z ${address} && ${attempts} -lt 10 ]]; do
      address=$(curl --connect-timeout 3 -m 10 -L6s ${endpoints[RANDOM % ${#endpoints[@]}]} | grep "^\([0-9a-fA-F]\{0,4\}:\)\{1,7\}[0-9a-fA-F]\{0,4\}$")
      # No pause after the very first attempt
      if (( attempts > 0 )); then
        sleep 1
      fi
      attempts=$((attempts + 1))
    done
    echo ${address}
  }
  # Store, in the array named by $1, the elements of the array named by $2
  # that do not occur in the array named by $3 (result is sorted).
  # Arguments: $1 - name of the result array (created/overwritten)
  #            $2 - name of the array to take elements from
  #            $3 - name of the array whose elements are removed
  # Based on https://stackoverflow.com/questions/2312762, Alex Offshore;
  # uses indirect expansion instead of eval so array names are never executed.
  array_diff() {
    local __ad_left_ref="${2}[@]"
    local __ad_right_ref="${3}[@]"
    local -a __ad_left=("${!__ad_left_ref}")
    local -a __ad_right=("${!__ad_right_ref}")
    # An empty array must feed *nothing* to comm; printing it unconditionally
    # would emit one blank line and yield a bogus empty-string element.
    mapfile -t "${1}" < <(
      comm -23 \
        <( (( ${#__ad_left[@]} )) && printf '%s\n' "${__ad_left[@]}" | sort ) \
        <( (( ${#__ad_right[@]} )) && printf '%s\n' "${__ad_right[@]}" | sort )
    )
  }
  # Report the health level of a service to Redis (WATCHDOG_LOG) and to stdout.
  # Arguments: $1 - service name
  #            $2 - total health points
  #            $3 - current health points (negative values are treated as 0)
  #            $4 - health trend, defaults to 0
  # Globals:   SERVICE, TOTAL, CURRENT, DIFF, PERCENT (written); REDIS_CMDLINE (read)
  # Returns:   0 without reporting when $2 or $3 is empty or $3 exceeds $2;
  #            10 when the service has no health points left
  progress() {
    SERVICE=${1}
    TOTAL=${2}
    CURRENT=${3}
    DIFF=${4}
    if [[ -z ${DIFF} ]]; then
      DIFF=0
    fi
    # Nothing to report without both figures
    if [[ -z ${TOTAL} || -z ${CURRENT} ]]; then
      return 0
    fi
    if [[ ${CURRENT} -gt ${TOTAL} ]]; then
      return 0
    fi
    if [[ ${CURRENT} -lt 0 ]]; then
      CURRENT=0
    fi
    # Integer percentage, rounded to the nearest whole number
    PERCENT=$(( 200 * ${CURRENT} / ${TOTAL} % 2 + 100 * ${CURRENT} / ${TOTAL} ))
    ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null
    log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis
    # Return 10 to indicate a dead service
    [ ${CURRENT} -le 0 ] && return 10
  }
  # Log a message to stdout (prefixed with the current date) and, unless $2 is
  # "no_redis", push it to the Redis list WATCHDOG_LOG as a small JSON object.
  # Arguments: $1 - message text
  #            $2 - pass "no_redis" to skip the Redis push
  # Globals:   REDIS_CMDLINE (read, expanded unquoted on purpose)
  log_msg() {
    if [[ ${2} != "no_redis" ]]; then
      # Characters that could break the JSON/consumer are replaced by spaces
      ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"message\":\"$(printf '%s' "${1}" | tr '\r\n%&;$"_[]{}-' ' ')\"}" > /dev/null
    fi
    # Fold runs of whitespace/newlines into single spaces (one log line per
    # message) without letting the shell glob-expand characters such as '*'.
    local IFS=$' \t\n'
    local -a words=()
    read -r -d '' -a words <<< "$(date) ${1}"
    printf '%s\n' "${words[*]}"
  }
  # Send a notification about a service by mail and/or webhook.
  # Arguments: $1 - service name; also used for the throttle key THROTTLE_$1 and
  #                 for the optional mail body file /tmp/$1
  #            $2 - optional message text (a default text is used when empty)
  #            $3 - optional throttle time in seconds
  # Globals:   WATCHDOG_NOTIFY_EMAIL, WATCHDOG_NOTIFY_WEBHOOK,
  #            WATCHDOG_NOTIFY_WEBHOOK_BODY, WATCHDOG_SUBJECT, MAILCOW_HOSTNAME,
  #            SMTP_VERBOSE, CURL_VERBOSE, REDIS_CMDLINE (read);
  #            THROTTLE, BODY, SUBJECT, TTL_LEFT, MAIL_RCPTS and helpers (written)
  # Returns:   0 when no notification channel is configured;
  #            1 when $1 is empty, the notification is throttled, a recipient
  #            domain has no MX record, or the webhook body template is missing
  notify_error() {
    # Check if one of the notification options is enabled
    [[ -z ${WATCHDOG_NOTIFY_EMAIL} ]] && [[ -z ${WATCHDOG_NOTIFY_WEBHOOK} ]] && return 0
    THROTTLE=
    [[ -z ${1} ]] && return 1
    # If exists, body will be the content of "/tmp/${1}", even if ${2} is set
    if [[ -z ${2} ]]; then
      BODY="Service was restarted on $(date), please check your mailcow installation."
    else
      BODY="$(date) - ${2}"
    fi
    # If exists, mail will be throttled by argument in seconds
    [[ -n ${3} ]] && THROTTLE=${3}
    if [[ -n ${THROTTLE} ]]; then
      TTL_LEFT="$(${REDIS_CMDLINE} TTL "THROTTLE_${1}" 2> /dev/null)"
      if [[ "${TTL_LEFT}" == "-2" ]]; then
        # Delay key not found, setting a delay key now
        ${REDIS_CMDLINE} SET "THROTTLE_${1}" 1 EX "${THROTTLE}"
      else
        log_msg "Not sending notification email now, blocked for ${TTL_LEFT} seconds..."
        return 1
      fi
    fi
    # Strip one leading and one trailing double quote from the recipient list
    WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" | sed 's/"//;s|"$||')
    # Some exceptions for subject and body formats
    if [[ ${1} == "fail2ban" ]]; then
      SUBJECT="${BODY}"
      BODY="Please see netfilter-mailcow for more details and triggered rules."
    else
      SUBJECT="${WATCHDOG_SUBJECT}: ${1}"
    fi
    # Send mail notification if enabled
    if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
      IFS=',' read -r -a MAIL_RCPTS <<< "${WATCHDOG_NOTIFY_EMAIL}"
      for rcpt in "${MAIL_RCPTS[@]}"; do
        # Tolerate "a@x, b@y": drop whitespace around list entries and skip
        # empty entries so --to= always receives exactly one address.
        rcpt=${rcpt//[[:space:]]/}
        [[ -z ${rcpt} ]] && continue
        RCPT_MX=
        # Everything after the last "@"
        RCPT_DOMAIN=${rcpt##*@}
        CHECK_FOR_VALID_MX=$(dig +short "${RCPT_DOMAIN}" mx)
        if [[ -z ${CHECK_FOR_VALID_MX} ]]; then
          log_msg "Cannot determine MX for ${rcpt}, skipping email notification..."
          # NOTE(review): this also skips remaining recipients and the webhook
          return 1
        fi
        [ -f "/tmp/${1}" ] && BODY="/tmp/${1}"
        # ${SMTP_VERBOSE:+...} expands to nothing when verbose is off, instead
        # of handing smtp-cli a stray empty argument.
        timeout 10s ./smtp-cli --missing-modules-ok \
          ${SMTP_VERBOSE:+"${SMTP_VERBOSE}"} \
          --charset=UTF-8 \
          --subject="${SUBJECT}" \
          --body-plain="${BODY}" \
          --add-header="X-Priority: 1" \
          --to="${rcpt}" \
          --from="watchdog@${MAILCOW_HOSTNAME}" \
          --hello-host="${MAILCOW_HOSTNAME}" \
          --ipv4
        if [[ $? -eq 1 ]]; then # exit code 1 is fine
          log_msg "Sent notification email to ${rcpt}"
        else
          if [[ "${SMTP_VERBOSE}" == "" ]]; then
            log_msg "Error while sending notification email to ${rcpt}. You can enable verbose logging by setting 'WATCHDOG_VERBOSE=y' in mailcow.conf."
          else
            log_msg "Error while sending notification email to ${rcpt}."
          fi
        fi
      done
    fi
    # Send webhook notification if enabled
    if [[ -n ${WATCHDOG_NOTIFY_WEBHOOK} ]]; then
      if [[ -z ${WATCHDOG_NOTIFY_WEBHOOK_BODY} ]]; then
        log_msg "No webhook body set, skipping webhook notification..."
        return 1
      fi
      # Escape subject and body (https://stackoverflow.com/a/2705678)
      # The echo arguments are left unquoted: word splitting folds newlines into
      # spaces, which the single-line sed replacement below depends on.
      ESCAPED_SUBJECT=$(echo ${SUBJECT} | sed -e 's/[\/&]/\\&/g')
      ESCAPED_BODY=$(echo ${BODY} | sed -e 's/[\/&]/\\&/g')
      # Replace subject and body placeholders
      WEBHOOK_BODY=$(echo ${WATCHDOG_NOTIFY_WEBHOOK_BODY} | sed -e "s/\$SUBJECT\|\${SUBJECT}/$ESCAPED_SUBJECT/g" -e "s/\$BODY\|\${BODY}/$ESCAPED_BODY/g")
      # POST to webhook
      curl -X POST -H "Content-Type: application/json" ${CURL_VERBOSE} -d "${WEBHOOK_BODY}" "${WATCHDOG_NOTIFY_WEBHOOK}"
      log_msg "Sent notification using webhook"
    fi
  }
  # Print the IPv4 address of a container, or 240.0.0.0 if none was found
  # within 5 attempts.
  # Arguments: $1 - name to resolve (DNS name, or compose service name when the
  #                 Docker API is used)
  # Globals:   IP_BY_DOCKER_API (0 = resolve with dig, otherwise query dockerapi),
  #            COMPOSE_PROJECT_NAME, IPV4_NETWORK (read);
  #            CONTAINER_ID, CONTAINER_IPS, CONTAINER_IP, LOOP_C (written)
  get_container_ip() {
    # ${1} is container
    local ipv4_re='^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$'
    CONTAINER_ID=()
    CONTAINER_IPS=()
    CONTAINER_IP=
    LOOP_C=1
    until [[ ${CONTAINER_IP} =~ ${ipv4_re} ]] || [[ ${LOOP_C} -gt 5 ]]; do
      if [ "${IP_BY_DOCKER_API}" -eq 0 ]; then
        CONTAINER_IP=$(dig a "${1}" +short)
      else
        sleep 0.5
        # get long container id for exact match
        CONTAINER_ID=($(curl --silent --insecure https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id"))
        # returned id can have multiple elements (if scaled), shuffle for random test
        CONTAINER_ID=($(printf "%s\n" "${CONTAINER_ID[@]}" | shuf))
        if [[ -n ${CONTAINER_ID} ]]; then
          for matched_container in "${CONTAINER_ID[@]}"; do
            CONTAINER_IPS=($(curl --silent --insecure https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/${matched_container}/json | jq -r '.NetworkSettings.Networks[].IPAddress'))
            for ip_match in "${CONTAINER_IPS[@]}"; do
              # grep will do nothing if one of these vars is empty
              [[ -z ${ip_match} ]] && continue
              [[ -z ${IPV4_NETWORK} ]] && continue
              # only return ips that are part of our network
              # (-F: the dots in IPV4_NETWORK are literal, not "any character")
              if grep -qF -- "${IPV4_NETWORK}" <<< "${ip_match}"; then
                CONTAINER_IP=${ip_match}
                break
              fi
            done
            [[ -n ${CONTAINER_IP} ]] && break
          done
        fi
      fi
      LOOP_C=$((LOOP_C + 1))
    done
    # Decide on the result itself, not on the attempt counter: the counter is
    # already past 5 when the fifth (last) attempt is the one that succeeds.
    if [[ ${CONTAINER_IP} =~ ${ipv4_re} ]]; then
      echo "${CONTAINER_IP}"
    else
      echo 240.0.0.0
    fi
  }
  # One-time check
  # If an address matching the first three groups of IPV6_NETWORK is configured
  # on a local interface but no IPv6 address can be obtained via get_ipv6,
  # raise a notification.
  if grep -qi "$(echo ${IPV6_NETWORK} | cut -d: -f1-3)" <<< "$(ip a s)" && [[ -z "$(get_ipv6)" ]]; then
    notify_error "ipv6-config" "enable_ipv6 is true in docker-compose.yml, but an IPv6 link could not be established. Please verify your IPv6 connection."
  fi
  # Ask https://checks.mailcow.email (once over IPv4, once over IPv6) about this
  # installation until the error count reaches EXTERNAL_CHECKS_THRESHOLD.
  # A reply whose .response is "critical" counts as one error; its .out field
  # is saved to /tmp/external_checks.
  # Globals:  EXTERNAL_CHECKS_THRESHOLD, DBUSER, DBPASS, DBNAME (read);
  #           err_count, diff_c, err_c_cur, THRESHOLD, GUID, CHECK_REPONSE,
  #           CHECK_REPONSE6 (written)
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  external_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${EXTERNAL_CHECKS_THRESHOLD}
    GUID=$(mariadb --skip-ssl -u"${DBUSER}" -p"${DBPASS}" "${DBNAME}" -e "SELECT version FROM versions WHERE application = 'GUID'" -BN)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      CHECK_REPONSE="$(curl --connect-timeout 3 -m 10 -4 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
      if [[ -n "${CHECK_REPONSE}" ]] && [[ "$(jq -r .response <<< "${CHECK_REPONSE}")" == "critical" ]]; then
        jq -r .out <<< "${CHECK_REPONSE}" > /tmp/external_checks
        err_count=$(( err_count + 1 ))
      fi
      CHECK_REPONSE6="$(curl --connect-timeout 3 -m 10 -6 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
      if [[ -n "${CHECK_REPONSE6}" ]] && [[ "$(jq -r .response <<< "${CHECK_REPONSE6}")" == "critical" ]]; then
        # Report the IPv6 result here (not the IPv4 one)
        jq -r .out <<< "${CHECK_REPONSE6}" > /tmp/external_checks
        err_count=$(( err_count + 1 ))
      fi
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "External checks" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 20 ) + 1800 ))
      fi
    done
    return 1
  }
  # Probe nginx-mailcow with check_http (port 8081, path /) until the error
  # count reaches NGINX_THRESHOLD. Plugin exit codes are added to the count.
  # Globals:  NGINX_THRESHOLD (read); err_count, diff_c, err_c_cur, THRESHOLD,
  #           host_ip (written)
  # Files:    /tmp/nginx-mailcow - plugin output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  nginx_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${NGINX_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/nginx-mailcow; echo "$(tail -50 /tmp/nginx-mailcow)" > /tmp/nginx-mailcow
      host_ip=$(get_container_ip nginx-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u / -p 8081 >> /tmp/nginx-mailcow 2>&1
      err_count=$(( err_count + $? ))
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Nginx" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Probe unbound-mailcow with check_dns.sh and verify that a DNSSEC lookup of
  # "com" carries the "ad" flag, until the error count reaches UNBOUND_THRESHOLD.
  # Globals:  UNBOUND_THRESHOLD (read); err_count, diff_c, err_c_cur, THRESHOLD,
  #           host_ip, DNSSEC (written)
  # Files:    /tmp/unbound-mailcow - check output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  unbound_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${UNBOUND_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/unbound-mailcow; echo "$(tail -50 /tmp/unbound-mailcow)" > /tmp/unbound-mailcow
      host_ip=$(get_container_ip unbound-mailcow)
      err_c_cur=${err_count}
      /usr/lib/mailcow/check_dns.sh -s "${host_ip}" -H stackoverflow.com >> /tmp/unbound-mailcow 2>&1
      err_count=$(( err_count + $? ))
      # grep -E replaces the deprecated egrep alias
      DNSSEC=$(dig com +dnssec | grep -E 'flags:.+ad')
      if [[ -z ${DNSSEC} ]]; then
        echo "DNSSEC failure" >> /tmp/unbound-mailcow 2>&1
        err_count=$(( err_count + 1 ))
      else
        echo "DNSSEC check succeeded" >> /tmp/unbound-mailcow 2>&1
      fi
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Unbound" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # A check for the local redis container: AUTH + PING via check_tcp against
  # redis-mailcow:6379 until the error count reaches REDIS_THRESHOLD.
  # Globals:  REDIS_THRESHOLD, REDISPASS (read); err_count, diff_c, err_c_cur,
  #           THRESHOLD, host_ip (written)
  # Files:    /tmp/redis-mailcow - plugin output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  redis_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${REDIS_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/redis-mailcow; echo "$(tail -50 /tmp/redis-mailcow)" > /tmp/redis-mailcow
      # NOTE(review): host_ip is looked up but the probe below addresses the
      # container by name - confirm which one is intended.
      host_ip=$(get_container_ip redis-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -4 -H redis-mailcow -p 6379 -E -s "AUTH ${REDISPASS}\nPING\n" -q "QUIT" -e "PONG" >> /tmp/redis-mailcow 2>&1
      err_count=$(( err_count + $? ))
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Redis" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Probe the database through its local socket with check_mysql and
  # check_mysql_query until the error count reaches MYSQL_THRESHOLD.
  # Globals:  MYSQL_THRESHOLD, DBUSER, DBPASS, DBNAME (read); err_count, diff_c,
  #           err_c_cur, THRESHOLD (written)
  # Files:    /tmp/mysql-mailcow - plugin output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  mysql_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MYSQL_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/mysql-mailcow; echo "$(tail -50 /tmp/mysql-mailcow)" > /tmp/mysql-mailcow
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_mysql -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" >> /tmp/mysql-mailcow 2>&1
      err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_mysql_query -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" -q "SELECT COUNT(*) FROM information_schema.tables" >> /tmp/mysql-mailcow 2>&1
      err_count=$(( err_count + $? ))
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "MySQL/MariaDB" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Run check_mysql_slavestatus.sh (as root, via the local socket) until the
  # error count reaches MYSQL_REPLICATION_THRESHOLD.
  # Globals:  MYSQL_REPLICATION_THRESHOLD, DBROOT (read); err_count, diff_c,
  #           err_c_cur, THRESHOLD (written)
  # Files:    /tmp/mysql_repl_checks - plugin output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  mysql_repl_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MYSQL_REPLICATION_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/mysql_repl_checks; echo "$(tail -50 /tmp/mysql_repl_checks)" > /tmp/mysql_repl_checks
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_mysql_slavestatus.sh -S /var/run/mysqld/mysqld.sock -u root -p "${DBROOT}" >> /tmp/mysql_repl_checks 2>&1
      err_count=$(( err_count + $? ))
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "MySQL/MariaDB replication" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Probe sogo-mailcow with check_http (port 20000, path /SOGo.index/) until the
  # error count reaches SOGO_THRESHOLD.
  # Globals:  SOGO_THRESHOLD (read); err_count, diff_c, err_c_cur, THRESHOLD,
  #           host_ip (written)
  # Files:    /tmp/sogo-mailcow - plugin output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  sogo_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${SOGO_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/sogo-mailcow; echo "$(tail -50 /tmp/sogo-mailcow)" > /tmp/sogo-mailcow
      host_ip=$(get_container_ip sogo-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u /SOGo.index/ -p 20000 >> /tmp/sogo-mailcow 2>&1
      err_count=$(( err_count + $? ))
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "SOGo" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Probe postfix-mailcow on port 589 with check_smtp (a test SMTP dialogue and
  # a second run with -S) until the error count reaches POSTFIX_THRESHOLD.
  # Globals:  POSTFIX_THRESHOLD (read); err_count, diff_c, err_c_cur, THRESHOLD,
  #           host_ip (written)
  # Files:    /tmp/postfix-mailcow - plugin output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  postfix_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${POSTFIX_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/postfix-mailcow; echo "$(tail -50 /tmp/postfix-mailcow)" > /tmp/postfix-mailcow
      host_ip=$(get_container_ip postfix-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -f "watchdog@invalid" -C "RCPT TO:watchdog@localhost" -C DATA -C . -R 250 >> /tmp/postfix-mailcow 2>&1
      err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -S >> /tmp/postfix-mailcow 2>&1
      err_count=$(( err_count + $? ))
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Postfix" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Probe clamd-mailcow with check_clamd until the error count reaches
  # CLAMD_THRESHOLD.
  # Globals:  CLAMD_THRESHOLD (read); err_count, diff_c, err_c_cur, THRESHOLD,
  #           host_ip (written)
  # Files:    /tmp/clamd-mailcow - plugin output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  clamd_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${CLAMD_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/clamd-mailcow; echo "$(tail -50 /tmp/clamd-mailcow)" > /tmp/clamd-mailcow
      host_ip=$(get_container_ip clamd-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_clamd -4 -H "${host_ip}" >> /tmp/clamd-mailcow 2>&1
      err_count=$(( err_count + $? ))
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Clamd" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 120 ) + 20 ))
      fi
    done
    return 1
  }
  # Probe dovecot-mailcow on ports 24 (check_smtp), 993 and 143 (check_imap),
  # 10001 and 4190 (check_tcp) until the error count reaches DOVECOT_THRESHOLD.
  # Globals:  DOVECOT_THRESHOLD (read); err_count, diff_c, err_c_cur, THRESHOLD,
  #           host_ip (written)
  # Files:    /tmp/dovecot-mailcow - plugin output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  dovecot_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${DOVECOT_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/dovecot-mailcow; echo "$(tail -50 /tmp/dovecot-mailcow)" > /tmp/dovecot-mailcow
      host_ip=$(get_container_ip dovecot-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 24 -f "watchdog@invalid" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" >> /tmp/dovecot-mailcow 2>&1
      err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 993 -S -e "OK " >> /tmp/dovecot-mailcow 2>&1
      err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 143 -e "OK " >> /tmp/dovecot-mailcow 2>&1
      err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10001 -e "VERSION" >> /tmp/dovecot-mailcow 2>&1
      err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 4190 -e "Dovecot ready" >> /tmp/dovecot-mailcow 2>&1
      err_count=$(( err_count + $? ))
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Dovecot" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Read the Redis key DOVECOT_REPL_HEALTH each round; any value other than "1"
  # counts as one error. Runs until the error count reaches
  # DOVECOT_REPL_THRESHOLD.
  # Globals:  DOVECOT_REPL_THRESHOLD, REDISPASS (read); err_count, diff_c,
  #           err_c_cur, THRESHOLD, D_REPL_STATUS (written)
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  dovecot_repl_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${DOVECOT_REPL_THRESHOLD}
    # --raw, as in the loop below ("-r" is redis-cli's repeat-count option and
    # would swallow GET as its argument)
    D_REPL_STATUS=$(redis-cli --raw -h redis -a "${REDISPASS}" --no-auth-warning GET DOVECOT_REPL_HEALTH)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      D_REPL_STATUS=$(redis-cli --raw -h redis -a "${REDISPASS}" --no-auth-warning GET DOVECOT_REPL_HEALTH)
      if [[ "${D_REPL_STATUS}" != "1" ]]; then
        err_count=$(( err_count + 1 ))
      fi
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Dovecot replication" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Check certificate expiry (-D 7) on SMTP port 589 and IMAP port 993 every
  # 5 minutes until the error count reaches the fixed threshold of 7.
  # Globals:  err_count, diff_c, err_c_cur, THRESHOLD, host_ip_postfix,
  #           host_ip_dovecot (written)
  # Files:    /tmp/certcheck - plugin output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  cert_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=7
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/certcheck; echo "$(tail -50 /tmp/certcheck)" > /tmp/certcheck
      # NOTE(review): other checks look up "postfix-mailcow"/"dovecot-mailcow";
      # confirm that the short names resolve in both lookup modes.
      host_ip_postfix=$(get_container_ip postfix)
      host_ip_dovecot=$(get_container_ip dovecot)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -H "${host_ip_postfix}" -p 589 -4 -S -D 7 >> /tmp/certcheck 2>&1
      err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_imap -H "${host_ip_dovecot}" -p 993 -4 -S -D 7 >> /tmp/certcheck 2>&1
      err_count=$(( err_count + $? ))
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Primary certificate expiry check" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # Always sleep 5 minutes, mail notifications are limited
      sleep 300
    done
    return 1
  }
  # Probe php-fpm-mailcow with check_tcp on ports 9001 and 9002 until the error
  # count reaches PHPFPM_THRESHOLD.
  # Globals:  PHPFPM_THRESHOLD (read); err_count, diff_c, err_c_cur, THRESHOLD,
  #           host_ip (written)
  # Files:    /tmp/php-fpm-mailcow - plugin output, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  phpfpm_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${PHPFPM_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/php-fpm-mailcow; echo "$(tail -50 /tmp/php-fpm-mailcow)" > /tmp/php-fpm-mailcow
      host_ip=$(get_container_ip php-fpm-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9001 >> /tmp/php-fpm-mailcow 2>&1
      err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9002 >> /tmp/php-fpm-mailcow 2>&1
      err_count=$(( err_count + $? ))
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "PHP-FPM" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Watch the newest entry of the Redis list RL_LOG: whenever its .qid differs
  # from the one seen in the previous round, count one error and dump the
  # latest entries to /tmp/ratelimit. Runs until the error count reaches
  # RATELIMIT_THRESHOLD.
  # Globals:  RATELIMIT_THRESHOLD, REDISPASS (read); err_count, diff_c,
  #           err_c_cur, THRESHOLD, RL_LOG_STATUS, RL_LOG_STATUS_PREV (written)
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  ratelimit_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${RATELIMIT_THRESHOLD}
    RL_LOG_STATUS=$(redis-cli -h redis -a "${REDISPASS}" --no-auth-warning LRANGE RL_LOG 0 0 | jq .qid)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      RL_LOG_STATUS_PREV=${RL_LOG_STATUS}
      RL_LOG_STATUS=$(redis-cli -h redis -a "${REDISPASS}" --no-auth-warning LRANGE RL_LOG 0 0 | jq .qid)
      # Right-hand side quoted: compare literally, not as a glob pattern
      if [[ "${RL_LOG_STATUS_PREV}" != "${RL_LOG_STATUS}" ]]; then
        err_count=$(( err_count + 1 ))
        echo 'Last 10 applied ratelimits (may overlap with previous reports).' > /tmp/ratelimit
        echo 'Full ratelimit buckets can be emptied by deleting the ratelimit hash from within mailcow UI (see /debug -> Protocols -> Ratelimit):' >> /tmp/ratelimit
        echo >> /tmp/ratelimit
        redis-cli --raw -h redis -a "${REDISPASS}" --no-auth-warning LRANGE RL_LOG 0 10 | jq . >> /tmp/ratelimit
      fi
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Ratelimit" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Count the files below /var/spool/postfix/deferred each round; a count of
  # MAILQ_CRIT or more is one error. Runs until the error count reaches
  # MAILQ_THRESHOLD.
  # Globals:  MAILQ_THRESHOLD, MAILQ_CRIT (read); err_count, diff_c, err_c_cur,
  #           THRESHOLD, MAILQ_LOG_STATUS (written)
  # Files:    /tmp/mail_queue_status - status lines, trimmed to 50 lines per round
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  mailq_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MAILQ_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/mail_queue_status; echo "$(tail -50 /tmp/mail_queue_status)" > /tmp/mail_queue_status
      MAILQ_LOG_STATUS=$(find /var/spool/postfix/deferred -type f | wc -l)
      echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
      err_c_cur=${err_count}
      if [ "${MAILQ_LOG_STATUS}" -ge "${MAILQ_CRIT}" ]; then
        err_count=$(( err_count + 1 ))
        echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
      fi
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Mail queue" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Watch the keys of the Redis hash F2B_ACTIVE_BANS: keys that were not present
  # in the previous round count as one error and are stored (filtered to
  # address characters) in the Redis key F2B_RES. Runs until the error count
  # reaches FAIL2BAN_THRESHOLD.
  # Globals:  FAIL2BAN_THRESHOLD, REDIS_CMDLINE (read); err_count, diff_c,
  #           err_c_cur, THRESHOLD, F2B_LOG_STATUS, F2B_LOG_STATUS_PREV,
  #           F2B_RES (written)
  # Returns:  1 once the loop ends (err_count >= THRESHOLD)
  fail2ban_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${FAIL2BAN_THRESHOLD}
    F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
    F2B_RES=
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count must be read when USR1 arrives, not when the
    # trap is installed (where it is always 0 and the trap would be a no-op).
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      F2B_LOG_STATUS_PREV=("${F2B_LOG_STATUS[@]}")
      F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
      array_diff F2B_RES F2B_LOG_STATUS F2B_LOG_STATUS_PREV
      if [[ -n "${F2B_RES}" ]]; then
        err_count=$(( err_count + 1 ))
        echo -n "${F2B_RES[@]}" | tr -cd "[a-fA-F0-9.:/] " | timeout 3s ${REDIS_CMDLINE} -x SET F2B_RES > /dev/null
        if [ $? -ne 0 ]; then
          # No -x here: DEL takes its key from the command line; -x would make
          # redis-cli read stdin as an additional argument.
          ${REDIS_CMDLINE} DEL F2B_RES
        fi
      fi
      # A clean round heals one error point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Fail2ban" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # progress returns 10 when no health points are left
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Watchdog agent that watches the Redis key ACME_FAIL_TIME.
  #
  # The key is initialised to 0 when it is empty. On every pass it is read
  # again (up to three attempts); a value that differs from the previous pass
  # counts as one error. When Redis gives no answer at all the last known value
  # is kept, so that an unreachable Redis is not reported as an ACME failure.
  #
  # Globals read:    ACME_THRESHOLD, REDISPASS, REDIS_CMDLINE
  # Globals written: err_count, diff_c, err_c_cur, THRESHOLD, ACME_LOG_STATUS,
  #                  ACME_LOG_STATUS_PREV, ACME_LC
  # Returns:         1 once err_count has reached THRESHOLD
  acme_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${ACME_THRESHOLD}
    ACME_LOG_STATUS=$(redis-cli -h redis -a "${REDISPASS}" --no-auth-warning GET ACME_FAIL_TIME)
    if [[ -z "${ACME_LOG_STATUS}" ]]; then
      ${REDIS_CMDLINE} SET ACME_FAIL_TIME 0
      ACME_LOG_STATUS=0
    fi
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion of ${err_count} until USR1 arrives;
    # double quotes would freeze it at 0 and make the handler a no-op.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      ACME_LOG_STATUS_PREV=${ACME_LOG_STATUS}
      # Clear the value before polling. Without this the retry loop below sees
      # the non-empty value of the previous pass, never runs, and the key is
      # never read again.
      ACME_LOG_STATUS=
      ACME_LC=0
      until [[ -n ${ACME_LOG_STATUS} ]] || [ "${ACME_LC}" -ge 3 ]; do
        ACME_LOG_STATUS=$(redis-cli -h redis -a "${REDISPASS}" --no-auth-warning GET ACME_FAIL_TIME 2> /dev/null)
        ACME_LC=$(( ACME_LC + 1 ))
        # Back off only when the read came back empty and a retry is left
        if [[ -z ${ACME_LOG_STATUS} ]] && [ "${ACME_LC}" -lt 3 ]; then
          sleep 3
        fi
      done
      # No answer after all attempts: carry the last known value forward
      if [[ -z ${ACME_LOG_STATUS} ]]; then
        ACME_LOG_STATUS=${ACME_LOG_STATUS_PREV}
      fi
      if [[ "${ACME_LOG_STATUS_PREV}" != "${ACME_LOG_STATUS}" ]]; then
        err_count=$(( err_count + 1 ))
      fi
      # A clean pass pays back one error; diff_c carries the change of
      # err_count that is handed to progress
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "ACME" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # Exit status 10 of progress selects the short pause, anything else a
      # random pause of 20-79 seconds
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Watchdog agent for Rspamd.
  #
  # Two probes run on every pass, each adding one error on failure:
  #   1. A minimal message is posted to the scan endpoint through the Rspamd
  #      unix socket; the integer part of .default.required_score in the reply
  #      has to be 9999.
  #   2. Port 9900 of the rspamd-mailcow container must not time out (curl
  #      exit status 28).
  # Results are appended to /tmp/rspamd-mailcow (trimmed to its last 50 lines).
  #
  # Globals read:    RSPAMD_THRESHOLD, COMPOSE_PROJECT_NAME
  # Globals written: err_count, diff_c, err_c_cur, THRESHOLD, host_ip, SCORE
  # Returns:         1 once err_count has reached THRESHOLD
  rspamd_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${RSPAMD_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion of ${err_count} until USR1 arrives;
    # double quotes would freeze it at 0 and make the handler a no-op.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      # Keep only the most recent 50 lines of the log file
      touch /tmp/rspamd-mailcow; echo "$(tail -50 /tmp/rspamd-mailcow)" > /tmp/rspamd-mailcow
      host_ip=$(get_container_ip rspamd-mailcow)
      err_c_cur=${err_count}
      # curl is resolved through PATH like the milter probe below; the former
      # relative path usr/bin/curl only worked with / as working directory
      SCORE=$(printf 'To: null@localhost\nFrom: watchdog@localhost\nEmpty\n\n' \
        | curl --max-time 10 -s --data-binary @- --unix-socket /var/lib/rspamd/rspamd.sock "http://rspamd.${COMPOSE_PROJECT_NAME}_mailcow-network/scan" \
        | jq -rc .default.required_score \
        | sed 's/\..*//')
      if [[ ${SCORE} -ne 9999 ]]; then
        echo "Rspamd settings check failed, score returned: ${SCORE}" >> /tmp/rspamd-mailcow 2>&1
        err_count=$(( err_count + 1 ))
      else
        echo "Rspamd settings check succeeded, score returned: ${SCORE}" >> /tmp/rspamd-mailcow 2>&1
      fi
      # A dirty hack until a PING PONG event is implemented to worker proxy
      # We expect an empty response, not a timeout
      if [ "$(curl -s --max-time 10 "${host_ip}:9900" 2> /dev/null ; echo $?)" == "28" ]; then
        echo "Milter check failed" >> /tmp/rspamd-mailcow 2>&1
        err_count=$(( err_count + 1 ))
      else
        echo "Milter check succeeded" >> /tmp/rspamd-mailcow 2>&1
      fi
      # A clean pass pays back one error; diff_c carries the change of
      # err_count that is handed to progress
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Rspamd" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # Exit status 10 of progress selects the short pause, anything else a
      # random pause of 20-79 seconds
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Watchdog agent for olefy.
  #
  # Sends "PING\n" to port 10055 of the olefy-mailcow container with the
  # Nagios check_tcp plugin and adds the plugin's exit status to err_count;
  # a pass that adds nothing lowers err_count by one, never below zero. Plugin
  # output is appended to /tmp/olefy-mailcow (trimmed to its last 50 lines).
  #
  # Globals read:    OLEFY_THRESHOLD
  # Globals written: err_count, diff_c, err_c_cur, THRESHOLD, host_ip
  # Returns:         1 once err_count has reached THRESHOLD
  olefy_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${OLEFY_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion of ${err_count} until USR1 arrives;
    # double quotes would freeze it at 0 and make the handler a no-op.
    trap '[ ${err_count} -gt 1 ] && err_count=$(( ${err_count} - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      # Keep only the most recent 50 lines of the log file
      touch /tmp/olefy-mailcow; echo "$(tail -50 /tmp/olefy-mailcow)" > /tmp/olefy-mailcow
      host_ip=$(get_container_ip olefy-mailcow)
      err_c_cur=${err_count}
      # The exit status has to be consumed right after the plugin call
      /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10055 -s "PING\n" >> /tmp/olefy-mailcow 2>&1; err_count=$(( err_count + $? ))
      # A clean pass pays back one error; diff_c carries the change of
      # err_count that is handed to progress
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Olefy" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      # Exit status 10 of progress selects the short pause, anything else a
      # random pause of 20-79 seconds
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Announce the watchdog start when WATCHDOG_NOTIFY_START holds a yes/y value
  # (any letter case)
  [[ ${WATCHDOG_NOTIFY_START} =~ ^([yY][eE][sS]|[yY])+$ ]] \
    && notify_error "watchdog-mailcow" "Watchdog started monitoring mailcow."
  # Create watchdog agents

  # Starts one agent in a background subshell and registers its PID.
  #
  # The subshell calls the check function in an endless loop. Whenever the
  # function returns non-zero, the message is logged through log_msg and the
  # token is written to /tmp/com_pipe for the action loop at the end of this
  # script.
  #
  # Arguments: $1 - name of the check function
  #            $2 - message to log when the function reports its error limit
  #            $3 - token written to /tmp/com_pipe
  # Globals written: PID, BACKGROUND_TASKS
  _spawn_watchdog_agent() {
    local check_fn=$1
    local limit_msg=$2
    local pipe_token=$3
    (
      while true; do
        if ! "${check_fn}"; then
          log_msg "${limit_msg}"
          echo "${pipe_token}" > /tmp/com_pipe
        fi
      done
    ) &
    PID=$!
    echo "Spawned ${check_fn} with PID ${PID}"
    BACKGROUND_TASKS+=(${PID})
  }

  _spawn_watchdog_agent nginx_checks "Nginx hit error limit" nginx-mailcow

  if [[ ${WATCHDOG_EXTERNAL_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    _spawn_watchdog_agent external_checks "External checks hit error limit" external_checks
  fi

  if [[ ${WATCHDOG_MYSQL_REPLICATION_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    _spawn_watchdog_agent mysql_repl_checks "MySQL replication check hit error limit" mysql_repl_checks
  fi

  _spawn_watchdog_agent mysql_checks "MySQL hit error limit" mysql-mailcow
  _spawn_watchdog_agent redis_checks "Local Redis hit error limit" redis-mailcow
  _spawn_watchdog_agent phpfpm_checks "PHP-FPM hit error limit" php-fpm-mailcow

  if [[ "${SKIP_SOGO}" =~ ^([nN][oO]|[nN])+$ ]]; then
    _spawn_watchdog_agent sogo_checks "SOGo hit error limit" sogo-mailcow
  fi

  # The default keeps the test well-formed when CHECK_UNBOUND is unset
  if [ "${CHECK_UNBOUND:-0}" -eq 1 ]; then
    _spawn_watchdog_agent unbound_checks "Unbound hit error limit" unbound-mailcow
  fi

  if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
    _spawn_watchdog_agent clamd_checks "Clamd hit error limit" clamd-mailcow
  fi

  _spawn_watchdog_agent postfix_checks "Postfix hit error limit" postfix-mailcow
  _spawn_watchdog_agent mailq_checks "Mail queue hit error limit" mail_queue_status
  _spawn_watchdog_agent dovecot_checks "Dovecot hit error limit" dovecot-mailcow
  # Worded differently from the dovecot_checks message so that the two agents
  # can be told apart in the log
  _spawn_watchdog_agent dovecot_repl_checks "Dovecot replication hit error limit" dovecot_repl_checks
  _spawn_watchdog_agent rspamd_checks "Rspamd hit error limit" rspamd-mailcow
  _spawn_watchdog_agent ratelimit_checks "Ratelimit hit error limit" ratelimit
  _spawn_watchdog_agent fail2ban_checks "Fail2ban hit error limit" fail2ban
  _spawn_watchdog_agent cert_checks "Cert check hit error limit" certcheck

  if [[ "${SKIP_OLEFY}" =~ ^([nN][oO]|[nN])+$ ]]; then
    _spawn_watchdog_agent olefy_checks "Olefy hit error limit" olefy-mailcow
  fi

  _spawn_watchdog_agent acme_checks "ACME client hit error limit" acme-mailcow
  # Monitor watchdog agents, stop script when an agent fails and wait for respawn by Docker (restart:always:n)
  # Every registered PID is probed with signal 0, one PID every 10 seconds; a
  # failed probe is logged and followed by a TERM signal to PID 1.
  (
    while :; do
      for worker_pid in "${BACKGROUND_TASKS[@]}"; do
        kill -0 "${worker_pid}" 1>&2 || {
          log_msg "Worker ${worker_pid} died, stopping watchdog and waiting for respawn..."
          kill -TERM 1
        }
        sleep 10
      done
    done
  ) &
  # Monitor dockerapi
  # While port 443 of dockerapi answers nothing happens. Once it stops
  # answering, all agents are paused with STOP; when it answers again they are
  # resumed with CONT and then sent USR1.
  (
    while :; do
      # Poll every 3 seconds for as long as dockerapi is reachable
      until ! nc -z dockerapi 443; do
        sleep 3
      done
      log_msg "Cannot find dockerapi-mailcow, waiting to recover..."
      kill -STOP "${BACKGROUND_TASKS[@]}"
      # Poll every 3 seconds until dockerapi is reachable again
      while ! nc -z dockerapi 443; do
        sleep 3
      done
      kill -CONT "${BACKGROUND_TASKS[@]}"
      kill -USR1 "${BACKGROUND_TASKS[@]}"
    done
  ) &
  # Actions when threshold limit is reached
  #
  # Reads one token per pass from /tmp/com_pipe (written by the agents above).
  # A non-empty file /tmp/<token> is printed first. Named tokens only log and
  # notify; any other token containing "-mailcow" is treated as a service name:
  # the agents are paused and the matching container is restarted through
  # dockerapi, unless it has been running for less than 360 seconds or
  # php-fpm-mailcow is still initialising the database.
  while true; do
    CONTAINER_ID=
    HAS_INITDB=
    read -r com_pipe_answer </tmp/com_pipe
    if [ -s "/tmp/${com_pipe_answer}" ]; then
      cat "/tmp/${com_pipe_answer}"
    fi
    case "${com_pipe_answer}" in
      ratelimit)
        log_msg "At least one ratelimit was applied"
        notify_error "${com_pipe_answer}"
        ;;
      mail_queue_status)
        log_msg "Mail queue status is critical"
        notify_error "${com_pipe_answer}"
        ;;
      external_checks)
        log_msg "Your mailcow is an open relay!"
        # Define $2 to override message text, else print service was restarted at ...
        notify_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
        ;;
      mysql_repl_checks)
        log_msg "MySQL replication is not working properly"
        # Define $2 to override message text, else print service was restarted at ...
        # One mail per 10 minutes
        notify_error "${com_pipe_answer}" "Please check the SQL replication status" 600
        ;;
      dovecot_repl_checks)
        log_msg "Dovecot replication is not working properly"
        # Define $2 to override message text, else print service was restarted at ...
        # One mail per 10 minutes
        notify_error "${com_pipe_answer}" "Please check the Dovecot replicator status" 600
        ;;
      certcheck)
        log_msg "Certificates are about to expire"
        # Define $2 to override message text, else print service was restarted at ...
        # Only mail once a day
        notify_error "${com_pipe_answer}" "Please renew your certificate" 86400
        ;;
      acme-mailcow)
        # Listed before the generic *-mailcow arm: acme-mailcow is only
        # reported, never restarted
        log_msg "acme-mailcow did not complete successfully"
        # Define $2 to override message text, else print service was restarted at ...
        notify_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
        ;;
      fail2ban)
        F2B_RES=($(timeout 4s ${REDIS_CMDLINE} --raw GET F2B_RES 2> /dev/null))
        if [[ -n "${F2B_RES}" ]]; then
          ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
          host=
          for host in "${F2B_RES[@]}"; do
            log_msg "Banned ${host}"
            rm /tmp/fail2ban 2> /dev/null
            timeout 2s whois "${host}" > /tmp/fail2ban
            [[ ${WATCHDOG_NOTIFY_BAN} =~ ^([yY][eE][sS]|[yY])+$ ]] && notify_error "${com_pipe_answer}" "IP ban: ${host}"
          done
        fi
        ;;
      ?*-mailcow*)
        # Same match as the former regex .+-mailcow: at least one character
        # before "-mailcow", anything after it
        kill -STOP "${BACKGROUND_TASKS[@]}"
        sleep 10
        CONTAINER_ID=$(curl --silent --insecure "https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/json" | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring | contains(\"${com_pipe_answer}\")) | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id")
        if [[ -n ${CONTAINER_ID} ]]; then
          if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
            HAS_INITDB=$(curl --silent --insecure -XPOST "https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/${CONTAINER_ID}/top" | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' | grep true)
          fi
          # The start time is validated before it is used in arithmetic: an
          # empty or non-numeric value would turn the subtraction into a syntax
          # error, and a failed arithmetic expansion can abort this whole loop
          STARTED_AT=$(curl --silent --insecure "https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/${CONTAINER_ID}/json" | jq .State.StartedAt | xargs -n1 date +%s -d)
          S_RUNNING=
          if [[ ${STARTED_AT} =~ ^[0-9]+$ ]]; then
            S_RUNNING=$(( $(date +%s) - STARTED_AT ))
          fi
          if [[ -z ${S_RUNNING} ]]; then
            log_msg "Could not determine the start time of ${com_pipe_answer}, skipping action..."
          elif [ "${S_RUNNING}" -lt 360 ]; then
            log_msg "Container is running for less than 360 seconds, skipping action..."
          elif [[ -n ${HAS_INITDB} ]]; then
            log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
            sleep 60
          else
            log_msg "Sending restart command to ${CONTAINER_ID}..."
            curl --silent --insecure -XPOST "https://dockerapi.${COMPOSE_PROJECT_NAME}_mailcow-network/containers/${CONTAINER_ID}/restart"
            notify_error "${com_pipe_answer}"
            log_msg "Wait for restarted container to settle and continue watching..."
            sleep 35
          fi
        fi
        # Resume the agents, then send them USR1 (handled by the trap each
        # check function installs)
        kill -CONT "${BACKGROUND_TASKS[@]}"
        sleep 1
        kill -USR1 "${BACKGROUND_TASKS[@]}"
        ;;
    esac
  done