watchdog.sh 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016
  #!/bin/bash
  # mailcow watchdog start-up: optionally idles when the watchdog is disabled,
  # creates the communication pipe and waits until MySQL and Redis answer.

  # Leave on INT/TERM; on exit, signal every process in our process group.
  trap "exit" INT TERM
  trap "kill 0" EXIT

  # Prepare
  BACKGROUND_TASKS=()
  echo "Waiting for containers to settle..."
  sleep 30

  if [[ "${USE_WATCHDOG}" =~ ^([nN][oO]|[nN])+$ ]]; then
    echo -e "$(date) - USE_WATCHDOG=n, skipping watchdog..."
    # Idle, then replace this process with a fresh run of the same script.
    sleep 365d
    # Quoted so a script path containing whitespace is not split.
    exec "$(readlink -f "$0")"
  fi

  # Checks write the name of their corresponding container into this pipe
  if [[ ! -p /tmp/com_pipe ]]; then
    mkfifo /tmp/com_pipe
  fi

  # Wait for containers
  # Credentials are quoted so special characters survive word splitting.
  while ! mysqladmin status --socket=/var/run/mysqld/mysqld.sock -u"${DBUSER}" -p"${DBPASS}" --silent; do
    echo "Waiting for SQL..."
    sleep 2
  done

  # Do not attempt to write to slave
  # REDIS_CMDLINE is deliberately a plain string that is expanded unquoted
  # wherever it is used, so it splits into command and arguments.
  if [[ -n ${REDIS_SLAVEOF_IP} ]]; then
    REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT}"
  else
    REDIS_CMDLINE="redis-cli -h redis -p 6379"
  fi

  until [[ $(${REDIS_CMDLINE} PING) == "PONG" ]]; do
    echo "Waiting for Redis..."
    sleep 2
  done

  ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
  33. # Common functions
  # Look up this host's public IPv6 address via an external echo service.
  # Makes up to 10 attempts, each against a randomly chosen source, and
  # pauses one second between attempts (never after a successful one).
  # Outputs: the address on stdout, or an empty line if every attempt failed
  get_ipv6() {
    local IPV6=
    local -a IPV6_SRCS=("ip6.korves.net" "ip6.mailcow.email")
    local TRY=0
    local SRC=
    while [[ -z ${IPV6} ]] && (( TRY < 10 )); do
      (( TRY > 0 )) && sleep 1
      SRC=${IPV6_SRCS[RANDOM % ${#IPV6_SRCS[@]}]}
      # grep only lets a reply through when it looks like an IPv6 address
      IPV6=$(curl --connect-timeout 3 -m 10 -L6s "${SRC}" | grep -m 1 "^\([0-9a-fA-F]\{0,4\}:\)\{1,7\}[0-9a-fA-F]\{0,4\}$")
      TRY=$((TRY+1))
    done
    echo "${IPV6}"
  }
  # Store the elements of one array that are missing from another.
  # Arguments: ${1} - name of the array variable that receives the result
  #            ${2} - name of the array to take elements from
  #            ${3} - name of the array whose elements are removed
  # The result is sorted, because comm works on sorted input.
  array_diff() {
    # https://stackoverflow.com/questions/2312762, Alex Offshore
    # Indirect expansion replaces eval; prefixed names avoid clashing with
    # the caller's array names.
    local _ad_ref1="${2}[@]"
    local _ad_ref2="${3}[@]"
    local -a _ad_arr1=("${!_ad_ref1}")
    local -a _ad_arr2=("${!_ad_ref2}")
    mapfile -t "${1}" < <(comm -23 <(printf '%s\n' "${_ad_arr1[@]}" | sort) <(printf '%s\n' "${_ad_arr2[@]}" | sort))
  }
  # Report the health level of a service.
  # Arguments: ${1} - service label
  #            ${2} - total health points (the check's threshold)
  #            ${3} - current health points
  #            ${4} - health trend since the last report (default 0)
  # Globals:   REDIS_CMDLINE (read, expanded unquoted on purpose)
  # Pushes a JSON record to the WATCHDOG_LOG list and logs a text line.
  # Returns:   10 when the service has no health points left, else 0.
  #            Nothing is reported when arguments are missing, the total is
  #            zero, or current exceeds total.
  progress() {
    local SERVICE=${1}
    local TOTAL=${2}
    local CURRENT=${3}
    local DIFF=${4:-0}
    local PERCENT
    [[ -z ${TOTAL} || -z ${CURRENT} ]] && return 0
    # A zero total would divide by zero below
    [[ ${TOTAL} -eq 0 ]] && return 0
    [[ ${CURRENT} -gt ${TOTAL} ]] && return 0
    [[ ${CURRENT} -lt 0 ]] && CURRENT=0
    # Integer percentage, rounded to nearest
    PERCENT=$(( 200 * CURRENT / TOTAL % 2 + 100 * CURRENT / TOTAL ))
    ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null
    log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis
    # Return 10 to indicate a dead service
    [[ ${CURRENT} -le 0 ]] && return 10
    return 0
  }
  # Log a message to stdout and, unless suppressed, to the Redis log list.
  # Arguments: ${1} - message text
  #            ${2} - pass "no_redis" to skip the LPUSH to WATCHDOG_LOG
  # Globals:   REDIS_CMDLINE (read, expanded unquoted on purpose)
  # Outputs:   "<date> <message>" on one line, runs of whitespace collapsed
  log_msg() {
    local IFS=$' \t\n'
    local -a WORDS=()
    if [[ ${2} != "no_redis" ]]; then
      # tr blanks out characters that would break the hand-built JSON
      ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"message\":\"$(printf '%s' "${1}" | tr '\r\n%&;$"_[]{}-' ' ')\"}" > /dev/null
    fi
    # Split into words without pathname expansion, so a message containing
    # "*" or "?" is printed literally instead of being matched against files.
    read -r -d '' -a WORDS <<< "$(date) ${1}"
    printf '%s\n' "${WORDS[*]}"
  }
  # Send a notification email to every address in WATCHDOG_NOTIFY_EMAIL.
  # Arguments: ${1} - service/topic name (required); if /tmp/${1} exists,
  #                   that path is handed to smtp-cli as the body
  #            ${2} - optional body text; a generic restart notice otherwise
  # Globals:   WATCHDOG_NOTIFY_EMAIL (comma separated list, surrounding
  #            double quotes are stripped), MAILCOW_HOSTNAME (read);
  #            BODY, SUBJECT, MAIL_RCPTS, RCPT_DOMAIN (written)
  # Returns:   1 when ${1} is empty
  mail_error() {
    [[ -z ${1} ]] && return 1
    if [[ -z ${2} ]]; then
      BODY="Service was restarted on $(date), please check your mailcow installation."
    else
      BODY="$(date) - ${2}"
    fi
    WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" | sed 's/"//;s|"$||')
    # Some exceptions for subject and body formats
    if [[ ${1} == "fail2ban" ]]; then
      SUBJECT="${BODY}"
      BODY="Please see netfilter-mailcow for more details and triggered rules."
    else
      SUBJECT="Watchdog ALERT: ${1}"
    fi
    IFS=',' read -r -a MAIL_RCPTS <<< "${WATCHDOG_NOTIFY_EMAIL}"
    for rcpt in "${MAIL_RCPTS[@]}"; do
      # Trim surrounding whitespace so "a@x, b@y" yields clean addresses
      rcpt="${rcpt#"${rcpt%%[![:space:]]*}"}"
      rcpt="${rcpt%"${rcpt##*[![:space:]]}"}"
      [[ -z ${rcpt} ]] && continue
      RCPT_DOMAIN=${rcpt##*@}
      # Latest smtp-cli looks up mx via dns, so no --server is passed
      [ -f "/tmp/${1}" ] && BODY="/tmp/${1}"
      timeout 10s ./smtp-cli --missing-modules-ok \
        --charset=UTF-8 \
        --subject="${SUBJECT}" \
        --body-plain="${BODY}" \
        --add-header="X-Priority: 1" \
        --to="${rcpt}" \
        --from="watchdog@${MAILCOW_HOSTNAME}" \
        --hello-host="${MAILCOW_HOSTNAME}" \
        --ipv4
      log_msg "Sent notification email to ${rcpt}"
    done
  }
  # Resolve the IPv4 address of a container.
  # Arguments: ${1} - container / compose service name
  # Globals:   IP_BY_DOCKER_API (read; 0 resolves via dig, anything else
  #            asks the dockerapi service), IPV4_NETWORK (read);
  #            CONTAINER_ID, CONTAINER_IPS, CONTAINER_IP, LOOP_C (written)
  # Outputs:   the address on stdout, or 240.0.0.0 if none was found within
  #            5 attempts
  get_container_ip() {
    # ${1} is container
    local IPV4_RE='^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$'
    CONTAINER_ID=()
    CONTAINER_IPS=()
    CONTAINER_IP=
    LOOP_C=1
    until [[ ${CONTAINER_IP} =~ ${IPV4_RE} ]] || [[ ${LOOP_C} -gt 5 ]]; do
      if [ "${IP_BY_DOCKER_API:-1}" -eq 0 ]; then
        CONTAINER_IP=$(dig a "${1}" +short)
      else
        sleep 0.5
        # get long container id for exact match
        # (word splitting into array elements is intended here)
        CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | .id"))
        # returned id can have multiple elements (if scaled), shuffle for random test
        CONTAINER_ID=($(printf "%s\n" "${CONTAINER_ID[@]}" | shuf))
        if [[ -n ${CONTAINER_ID[0]} ]]; then
          for matched_container in "${CONTAINER_ID[@]}"; do
            CONTAINER_IPS=($(curl --silent --insecure "https://dockerapi/containers/${matched_container}/json" | jq -r '.NetworkSettings.Networks[].IPAddress'))
            for ip_match in "${CONTAINER_IPS[@]}"; do
              # grep will do nothing if one of these vars is empty
              [[ -z ${ip_match} ]] && continue
              [[ -z ${IPV4_NETWORK} ]] && continue
              # only return ips that are part of our network
              # (-F: the dots of the network prefix are literal, not regex)
              if grep -qF -- "${IPV4_NETWORK}" <<< "${ip_match}"; then
                CONTAINER_IP=${ip_match}
                break
              fi
            done
            [[ -n ${CONTAINER_IP} ]] && break
          done
        fi
      fi
      LOOP_C=$((LOOP_C + 1))
    done
    # Decide on the result itself rather than on the loop counter, so an
    # address found on the final attempt is not thrown away.
    if [[ ${CONTAINER_IP} =~ ${IPV4_RE} ]]; then
      echo "${CONTAINER_IP}"
    else
      echo 240.0.0.0
    fi
  }
  150. # One-time check
  # If an interface address carries the first three groups of IPV6_NETWORK,
  # verify once that an IPv6 address can be obtained and send a mail if not.
  # An empty IPV6_NETWORK is skipped: an empty pattern would match any
  # "ip a s" output and trigger the check unconditionally.
  if [[ -n ${IPV6_NETWORK} ]] && grep -qiF -- "$(echo "${IPV6_NETWORK}" | cut -d: -f1-3)" <<< "$(ip a s)"; then
    if [[ -z "$(get_ipv6)" ]]; then
      mail_error "ipv6-config" "enable_ipv6 is true in docker-compose.yml, but an IPv6 link could not be established. Please verify your IPv6 connection."
    fi
  fi
  # Poll https://checks.mailcow.email over IPv4 and IPv6 with this
  # installation's GUID. A "critical" response adds one error and stores the
  # response's .out field in /tmp/external_checks; a round without new
  # errors removes one.
  # Globals:  EXTERNAL_CHECKS_THRESHOLD, DBUSER, DBPASS, DBNAME (read);
  #           err_count, diff_c, THRESHOLD, GUID, err_c_cur, CHECK_REPONSE,
  #           CHECK_REPONSE6 (written)
  # Returns:  1 once err_count reaches the threshold
  external_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${EXTERNAL_CHECKS_THRESHOLD}
    GUID=$(mysql -u"${DBUSER}" -p"${DBPASS}" "${DBNAME}" -e "SELECT version FROM versions WHERE application = 'GUID'" -BN)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      CHECK_REPONSE="$(curl --connect-timeout 3 -m 10 -4 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
      if [[ -n "${CHECK_REPONSE}" ]] && [[ "$(printf '%s\n' "${CHECK_REPONSE}" | jq -r .response)" == "critical" ]]; then
        printf '%s\n' "${CHECK_REPONSE}" | jq -r .out > /tmp/external_checks
        err_count=$(( err_count + 1 ))
      fi
      CHECK_REPONSE6="$(curl --connect-timeout 3 -m 10 -6 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
      if [[ -n "${CHECK_REPONSE6}" ]] && [[ "$(printf '%s\n' "${CHECK_REPONSE6}" | jq -r .response)" == "critical" ]]; then
        # Report the IPv6 response here, not the IPv4 one
        printf '%s\n' "${CHECK_REPONSE6}" | jq -r .out > /tmp/external_checks
        err_count=$(( err_count + 1 ))
      fi
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "External checks" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 20 ) + 120 ))
      fi
    done
    return 1
  }
  # Health loop for nginx-mailcow: probes HTTP on port 8081 with check_http.
  # Plugin output is appended to /tmp/nginx-mailcow (trimmed to its last 50
  # lines each round). A failing probe adds the plugin's exit status to
  # err_count; a round without new errors removes one.
  # Globals:  NGINX_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
  #           err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  nginx_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${NGINX_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/nginx-mailcow; echo "$(tail -50 /tmp/nginx-mailcow)" > /tmp/nginx-mailcow
      host_ip=$(get_container_ip nginx-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u / -p 8081 >> /tmp/nginx-mailcow 2>&1; err_count=$(( err_count + $? ))
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Nginx" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for unbound-mailcow: resolves stackoverflow.com through the
  # container with check_dns and verifies that "dig com +dnssec" reports the
  # ad flag. Output is appended to /tmp/unbound-mailcow (trimmed to its last
  # 50 lines each round).
  # Globals:  UNBOUND_THRESHOLD (read); err_count, diff_c, THRESHOLD,
  #           host_ip, err_c_cur, DNSSEC (written)
  # Returns:  1 once err_count reaches the threshold
  unbound_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${UNBOUND_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/unbound-mailcow; echo "$(tail -50 /tmp/unbound-mailcow)" > /tmp/unbound-mailcow
      host_ip=$(get_container_ip unbound-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_dns -s "${host_ip}" -H stackoverflow.com >> /tmp/unbound-mailcow 2>&1; err_count=$(( err_count + $? ))
      DNSSEC=$(dig com +dnssec | grep -E 'flags:.+ad')
      if [[ -z ${DNSSEC} ]]; then
        echo "DNSSEC failure" >> /tmp/unbound-mailcow 2>&1
        err_count=$(( err_count + 1 ))
      else
        echo "DNSSEC check succeeded" >> /tmp/unbound-mailcow 2>&1
      fi
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Unbound" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # A check for the local redis container: sends PING to redis-mailcow:6379
  # with check_tcp and expects PONG. Output is appended to
  # /tmp/redis-mailcow (trimmed to its last 50 lines each round).
  # Globals:  REDIS_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
  #           err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  redis_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${REDIS_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/redis-mailcow; echo "$(tail -50 /tmp/redis-mailcow)" > /tmp/redis-mailcow
      # NOTE(review): host_ip is looked up but the probe below addresses the
      # container by name; kept as in the original.
      host_ip=$(get_container_ip redis-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -4 -H redis-mailcow -p 6379 -E -s "PING\n" -q "QUIT" -e "PONG" >> /tmp/redis-mailcow 2>&1; err_count=$(( err_count + $? ))
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Redis" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for MySQL/MariaDB: runs check_mysql and check_mysql_query
  # over the local socket. Output is appended to /tmp/mysql-mailcow (trimmed
  # to its last 50 lines each round).
  # Globals:  MYSQL_THRESHOLD, DBUSER, DBPASS, DBNAME (read); err_count,
  #           diff_c, THRESHOLD, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  mysql_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MYSQL_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/mysql-mailcow; echo "$(tail -50 /tmp/mysql-mailcow)" > /tmp/mysql-mailcow
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_mysql -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" >> /tmp/mysql-mailcow 2>&1; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_mysql_query -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" -q "SELECT COUNT(*) FROM information_schema.tables" >> /tmp/mysql-mailcow 2>&1; err_count=$(( err_count + $? ))
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "MySQL/MariaDB" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for MySQL/MariaDB replication: runs
  # check_mysql_slavestatus.sh as root over the local socket. Output is
  # appended to /tmp/mysql_repl_checks (trimmed to its last 50 lines each
  # round).
  # Globals:  MYSQL_REPLICATION_THRESHOLD, DBROOT (read); err_count, diff_c,
  #           THRESHOLD, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  mysql_repl_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MYSQL_REPLICATION_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/mysql_repl_checks; echo "$(tail -50 /tmp/mysql_repl_checks)" > /tmp/mysql_repl_checks
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_mysql_slavestatus.sh -S /var/run/mysqld/mysqld.sock -u root -p "${DBROOT}" >> /tmp/mysql_repl_checks 2>&1; err_count=$(( err_count + $? ))
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "MySQL/MariaDB replication" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for sogo-mailcow: requests /SOGo.index/ on port 20000 with
  # check_http and expects the body to match "SOGo\.MainUI". Output is
  # appended to /tmp/sogo-mailcow (trimmed to its last 50 lines each round).
  # Globals:  SOGO_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
  #           err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  sogo_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${SOGO_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/sogo-mailcow; echo "$(tail -50 /tmp/sogo-mailcow)" > /tmp/sogo-mailcow
      host_ip=$(get_container_ip sogo-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u /SOGo.index/ -p 20000 -R "SOGo\.MainUI" >> /tmp/sogo-mailcow 2>&1; err_count=$(( err_count + $? ))
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "SOGo" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for postfix-mailcow: runs two check_smtp probes against
  # port 589 (a scripted SMTP dialogue expecting 250, and one with -S).
  # Output is appended to /tmp/postfix-mailcow (trimmed to its last 50 lines
  # each round).
  # Globals:  POSTFIX_THRESHOLD (read); err_count, diff_c, THRESHOLD,
  #           host_ip, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  postfix_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${POSTFIX_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/postfix-mailcow; echo "$(tail -50 /tmp/postfix-mailcow)" > /tmp/postfix-mailcow
      host_ip=$(get_container_ip postfix-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -f "watchdog@invalid" -C "RCPT TO:watchdog@localhost" -C DATA -C . -R 250 >> /tmp/postfix-mailcow 2>&1; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -S >> /tmp/postfix-mailcow 2>&1; err_count=$(( err_count + $? ))
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Postfix" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for clamd-mailcow: probes the daemon with check_clamd.
  # Output is appended to /tmp/clamd-mailcow (trimmed to its last 50 lines
  # each round).
  # Globals:  CLAMD_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
  #           err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  clamd_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${CLAMD_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/clamd-mailcow; echo "$(tail -50 /tmp/clamd-mailcow)" > /tmp/clamd-mailcow
      host_ip=$(get_container_ip clamd-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_clamd -4 -H "${host_ip}" >> /tmp/clamd-mailcow 2>&1; err_count=$(( err_count + $? ))
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Clamd" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 120 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for dovecot-mailcow: probes port 24 with check_smtp, ports
  # 993 and 143 with check_imap, and ports 10001 and 4190 with check_tcp.
  # Output is appended to /tmp/dovecot-mailcow (trimmed to its last 50 lines
  # each round).
  # Globals:  DOVECOT_THRESHOLD (read); err_count, diff_c, THRESHOLD,
  #           host_ip, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  dovecot_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${DOVECOT_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/dovecot-mailcow; echo "$(tail -50 /tmp/dovecot-mailcow)" > /tmp/dovecot-mailcow
      host_ip=$(get_container_ip dovecot-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 24 -f "watchdog@invalid" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" >> /tmp/dovecot-mailcow 2>&1; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 993 -S -e "OK " >> /tmp/dovecot-mailcow 2>&1; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 143 -e "OK " >> /tmp/dovecot-mailcow 2>&1; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10001 -e "VERSION" >> /tmp/dovecot-mailcow 2>&1; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 4190 -e "Dovecot ready" >> /tmp/dovecot-mailcow 2>&1; err_count=$(( err_count + $? ))
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Dovecot" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for Dovecot replication: reads the DOVECOT_REPL_HEALTH key
  # from the local redis; any value other than "1" counts as one error.
  # Globals:  DOVECOT_REPL_THRESHOLD (read); err_count, diff_c, THRESHOLD,
  #           D_REPL_STATUS, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  dovecot_repl_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${DOVECOT_REPL_THRESHOLD}
    # --raw, not -r: redis-cli's -r takes a repeat count, so "-r GET" would
    # consume GET as that count instead of running the command.
    D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
      if [[ "${D_REPL_STATUS}" != "1" ]]; then
        err_count=$(( err_count + 1 ))
      fi
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Dovecot replication" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for php-fpm-mailcow: probes TCP ports 9001 and 9002 with
  # check_tcp. Output is appended to /tmp/php-fpm-mailcow (trimmed to its
  # last 50 lines each round).
  # Globals:  PHPFPM_THRESHOLD (read); err_count, diff_c, THRESHOLD,
  #           host_ip, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  phpfpm_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${PHPFPM_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/php-fpm-mailcow; echo "$(tail -50 /tmp/php-fpm-mailcow)" > /tmp/php-fpm-mailcow
      host_ip=$(get_container_ip php-fpm-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9001 >> /tmp/php-fpm-mailcow 2>&1; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9002 >> /tmp/php-fpm-mailcow 2>&1; err_count=$(( err_count + $? ))
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "PHP-FPM" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Watch the newest RL_LOG entry in the local redis. Whenever its .qid
  # differs from the previous round, one error is counted and a report with
  # the latest entries is written to /tmp/ratelimit.
  # Globals:  RATELIMIT_THRESHOLD (read); err_count, diff_c, THRESHOLD,
  #           RL_LOG_STATUS, RL_LOG_STATUS_PREV, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  ratelimit_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${RATELIMIT_THRESHOLD}
    RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      RL_LOG_STATUS_PREV=${RL_LOG_STATUS}
      RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
      # Both sides quoted: compare literally, not as a glob pattern
      if [[ "${RL_LOG_STATUS_PREV}" != "${RL_LOG_STATUS}" ]]; then
        err_count=$(( err_count + 1 ))
        echo 'Last 10 applied ratelimits (may overlap with previous reports).' > /tmp/ratelimit
        echo 'Full ratelimit buckets can be emptied by deleting the ratelimit hash from within mailcow UI (see /debug -> Protocols -> Ratelimit):' >> /tmp/ratelimit
        echo >> /tmp/ratelimit
        redis-cli --raw -h redis LRANGE RL_LOG 0 10 | jq . >> /tmp/ratelimit
      fi
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Ratelimit" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Watch the keys of the F2B_ACTIVE_BANS hash. Keys that appear since the
  # previous round count as one error and are stored, filtered to address
  # characters, in the F2B_RES key.
  # Globals:  FAIL2BAN_THRESHOLD, REDIS_CMDLINE (read, expanded unquoted on
  #           purpose); err_count, diff_c, THRESHOLD, F2B_LOG_STATUS,
  #           F2B_LOG_STATUS_PREV, F2B_RES, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  fail2ban_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${FAIL2BAN_THRESHOLD}
    mapfile -t F2B_LOG_STATUS < <(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS)
    F2B_RES=
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      F2B_LOG_STATUS_PREV=("${F2B_LOG_STATUS[@]}")
      mapfile -t F2B_LOG_STATUS < <(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS)
      array_diff F2B_RES F2B_LOG_STATUS F2B_LOG_STATUS_PREV
      if [[ -n "${F2B_RES[0]}" ]]; then
        err_count=$(( err_count + 1 ))
        echo -n "${F2B_RES[@]}" | tr -cd "[a-fA-F0-9.:/] " | timeout 3s ${REDIS_CMDLINE} -x SET F2B_RES > /dev/null
        if [ $? -ne 0 ]; then
          # No -x here: DEL takes its key from the command line, and -x
          # would read an extra argument from stdin.
          ${REDIS_CMDLINE} DEL F2B_RES
        fi
      fi
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Fail2ban" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Watch the ACME_FAIL_TIME key in the local redis; each change of its
  # value counts as one error. The key is initialised to 0 when it is empty.
  # Globals:  ACME_THRESHOLD, REDIS_CMDLINE (read, expanded unquoted on
  #           purpose); err_count, diff_c, THRESHOLD, ACME_LOG_STATUS,
  #           ACME_LOG_STATUS_PREV, ACME_LC, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  acme_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${ACME_THRESHOLD}
    ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME)
    if [[ -z "${ACME_LOG_STATUS}" ]]; then
      ${REDIS_CMDLINE} SET ACME_FAIL_TIME 0
      ACME_LOG_STATUS=0
    fi
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      ACME_LOG_STATUS_PREV=${ACME_LOG_STATUS}
      # Clear the value before polling; otherwise the retry loop below is
      # skipped (the old value is non-empty) and the key is never re-read.
      ACME_LOG_STATUS=
      ACME_LC=0
      until [[ -n ${ACME_LOG_STATUS} ]] || [ "${ACME_LC}" -ge 3 ]; do
        # Wait only before a retry, not before the first read
        [ "${ACME_LC}" -gt 0 ] && sleep 3
        ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME 2> /dev/null)
        ACME_LC=$((ACME_LC+1))
      done
      # No answer after three tries: keep the last known value so that an
      # unreachable redis is not counted as an ACME failure.
      [[ -z ${ACME_LOG_STATUS} ]] && ACME_LOG_STATUS=${ACME_LOG_STATUS_PREV}
      if [[ "${ACME_LOG_STATUS_PREV}" != "${ACME_LOG_STATUS}" ]]; then
        err_count=$(( err_count + 1 ))
      fi
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "ACME" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Compare container start times reported by the dockerapi service: if an
  # ipv6nat-mailcow container exists and its latest start is less than 30
  # seconds after the latest start of any other container, count one error.
  # Globals:  IPV6NAT_THRESHOLD (read); err_count, diff_c, THRESHOLD,
  #           CONTAINERS, IPV6NAT_CONTAINER_ID, LATEST_STARTED,
  #           LATEST_IPV6NAT, DIFFERENCE_START_TIME, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  ipv6nat_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${IPV6NAT_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      CONTAINERS=$(curl --silent --insecure https://dockerapi/containers/json)
      # The JSON is passed on quoted so its brackets are never glob-expanded
      IPV6NAT_CONTAINER_ID=$(printf '%s\n' "${CONTAINERS}" | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], id: .Id}" | jq -rc "select( .name | tostring | contains(\"ipv6nat-mailcow\")) | .id")
      if [[ -n ${IPV6NAT_CONTAINER_ID} ]]; then
        LATEST_STARTED="$(printf '%s\n' "${CONTAINERS}" | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], StartedAt: .State.StartedAt}" | jq -rc "select( .name | tostring | contains(\"ipv6nat-mailcow\") | not)" | jq -rc .StartedAt | xargs -n1 date +%s -d | sort | tail -n1)"
        LATEST_IPV6NAT="$(printf '%s\n' "${CONTAINERS}" | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], StartedAt: .State.StartedAt}" | jq -rc "select( .name | tostring | contains(\"ipv6nat-mailcow\"))" | jq -rc .StartedAt | xargs -n1 date +%s -d | sort | tail -n1)"
        # A missing timestamp leaves the difference empty, which the
        # comparison below treats as 0 and therefore as an error.
        if [[ -z ${LATEST_IPV6NAT} || -z ${LATEST_STARTED} ]]; then
          DIFFERENCE_START_TIME=
        else
          DIFFERENCE_START_TIME=$(( LATEST_IPV6NAT - LATEST_STARTED ))
        fi
        if [[ "${DIFFERENCE_START_TIME}" -lt 30 ]]; then
          err_count=$(( err_count + 1 ))
        fi
      fi
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "IPv6 NAT" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 30
      else
        diff_c=0
        sleep 300
      fi
    done
    return 1
  }
  # Health loop for rspamd-mailcow: submits a small test message to the scan
  # endpoint over rspamd's unix socket and expects the returned
  # .default.required_score to be 9999. Results are appended to
  # /tmp/rspamd-mailcow (trimmed to its last 50 lines each round).
  # Globals:  RSPAMD_THRESHOLD (read); err_count, diff_c, THRESHOLD,
  #           host_ip, SCORE, err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  rspamd_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${RSPAMD_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/rspamd-mailcow; echo "$(tail -50 /tmp/rspamd-mailcow)" > /tmp/rspamd-mailcow
      # NOTE(review): host_ip is looked up but the scan below goes through
      # the unix socket; kept as in the original.
      host_ip=$(get_container_ip rspamd-mailcow)
      err_c_cur=${err_count}
      # Absolute path: the former relative "usr/bin/curl" only resolved when
      # the working directory happened to be /.
      SCORE=$(printf 'To: null@localhost\nFrom: watchdog@localhost\nEmpty\n\n' | /usr/bin/curl -s --data-binary @- --unix-socket /var/lib/rspamd/rspamd.sock http://rspamd/scan | jq -rc .default.required_score)
      if [[ ${SCORE} != "9999" ]]; then
        echo "Rspamd settings check failed" >> /tmp/rspamd-mailcow 2>&1
        err_count=$(( err_count + 1 ))
      else
        echo "Rspamd settings check succeeded" >> /tmp/rspamd-mailcow 2>&1
      fi
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Rspamd" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for olefy-mailcow: sends PING to TCP port 10055 with
  # check_tcp. Output is appended to /tmp/olefy-mailcow (trimmed to its last
  # 50 lines each round).
  # Globals:  OLEFY_THRESHOLD (read); err_count, diff_c, THRESHOLD, host_ip,
  #           err_c_cur (written)
  # Returns:  1 once err_count reaches the threshold
  olefy_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${OLEFY_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes: err_count is evaluated when USR1 arrives, not frozen to
    # its value at the time the trap is installed.
    trap '(( err_count > 1 )) && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/olefy-mailcow; echo "$(tail -50 /tmp/olefy-mailcow)" > /tmp/olefy-mailcow
      host_ip=$(get_container_ip olefy-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10055 -s "PING\n" >> /tmp/olefy-mailcow 2>&1; err_count=$(( err_count + $? ))
      # A round without new errors heals one point (never below zero)
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
      fi
      diff_c=$(( err_c_cur - err_count ))
      progress "Olefy" "${THRESHOLD}" "$(( THRESHOLD - err_count ))" "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Announce the start of monitoring by mail, but only when a notification
  # address has been configured.
  [[ -z ${WATCHDOG_NOTIFY_EMAIL} ]] || mail_error "watchdog-mailcow" "Watchdog started monitoring mailcow."
  # Create watchdog agents

  # Start one watchdog agent as a background subshell.
  # The subshell calls the check function in an endless loop; every time the
  # function returns non-zero it logs the message and writes the token to
  # /tmp/com_pipe.
  # Arguments: $1 - name of the check function
  #            $2 - message passed to log_msg when the check function fails
  #            $3 - token written to /tmp/com_pipe when the check function fails
  # Globals written: PID (PID of the new subshell), BACKGROUND_TASKS (PID appended)
  # Outputs:   "Spawned <function> with PID <pid>" on stdout
  spawn_watchdog_agent() {
    (
      while true; do
        if ! "${1}"; then
          log_msg "${2}"
          echo "${3}" > /tmp/com_pipe
        fi
      done
    ) &
    PID=$!
    echo "Spawned ${1} with PID ${PID}"
    BACKGROUND_TASKS+=(${PID})
  }

  spawn_watchdog_agent nginx_checks "Nginx hit error limit" nginx-mailcow
  # Optional agents are only started when their switch is set accordingly.
  if [[ ${WATCHDOG_EXTERNAL_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    spawn_watchdog_agent external_checks "External checks hit error limit" external_checks
  fi
  if [[ ${WATCHDOG_MYSQL_REPLICATION_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    spawn_watchdog_agent mysql_repl_checks "MySQL replication check hit error limit" mysql_repl_checks
  fi
  spawn_watchdog_agent mysql_checks "MySQL hit error limit" mysql-mailcow
  spawn_watchdog_agent redis_checks "Local Redis hit error limit" redis-mailcow
  spawn_watchdog_agent phpfpm_checks "PHP-FPM hit error limit" php-fpm-mailcow
  if [[ "${SKIP_SOGO}" =~ ^([nN][oO]|[nN])+$ ]]; then
    spawn_watchdog_agent sogo_checks "SOGo hit error limit" sogo-mailcow
  fi
  if [ ${CHECK_UNBOUND} -eq 1 ]; then
    spawn_watchdog_agent unbound_checks "Unbound hit error limit" unbound-mailcow
  fi
  if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
    spawn_watchdog_agent clamd_checks "Clamd hit error limit" clamd-mailcow
  fi
  spawn_watchdog_agent postfix_checks "Postfix hit error limit" postfix-mailcow
  spawn_watchdog_agent dovecot_checks "Dovecot hit error limit" dovecot-mailcow
  # Distinct message so this agent can be told apart from dovecot_checks in the log.
  spawn_watchdog_agent dovecot_repl_checks "Dovecot replication hit error limit" dovecot_repl_checks
  spawn_watchdog_agent rspamd_checks "Rspamd hit error limit" rspamd-mailcow
  spawn_watchdog_agent ratelimit_checks "Ratelimit hit error limit" ratelimit
  spawn_watchdog_agent fail2ban_checks "Fail2ban hit error limit" fail2ban
  spawn_watchdog_agent olefy_checks "Olefy hit error limit" olefy-mailcow
  spawn_watchdog_agent acme_checks "ACME client hit error limit" acme-mailcow
  spawn_watchdog_agent ipv6nat_checks "IPv6 NAT warning: ipv6nat-mailcow container was not started at least 30s after siblings (not an error)" ipv6nat-mailcow
  # Supervise the watchdog agents: as soon as one of them is found dead, stop
  # the script and wait for the respawn by Docker (restart:always:n)
  (
    while :; do
      for agent_pid in "${BACKGROUND_TASKS[@]}"; do
        # Signal 0 only probes whether the process still exists.
        kill -0 "${agent_pid}" 1>&2 || {
          log_msg "Worker ${agent_pid} died, stopping watchdog and waiting for respawn..."
          kill -TERM 1
        }
        # The pause sits inside the sweep, i.e. it follows every single probe.
        sleep 10
      done
    done
  ) &
  # Monitor dockerapi: pause all agents while it is unreachable on port 443
  (
    while :; do
      # Idle for as long as dockerapi accepts connections.
      until ! nc -z dockerapi 443; do
        sleep 3
      done
      log_msg "Cannot find dockerapi-mailcow, waiting to recover..."
      # Freeze every agent until dockerapi answers again.
      kill -STOP "${BACKGROUND_TASKS[@]}"
      while ! nc -z dockerapi 443; do
        sleep 3
      done
      # Resume the agents, then deliver USR1 to each of them.
      kill -CONT "${BACKGROUND_TASKS[@]}"
      kill -USR1 "${BACKGROUND_TASKS[@]}"
    done
  ) &
  # React to the tokens the agents write to /tmp/com_pipe once a check has hit
  # its error limit.
  while :; do
    CONTAINER_ID=
    HAS_INITDB=
    # Take one token from the pipe.
    read com_pipe_answer </tmp/com_pipe
    # Print the file /tmp/<token> when it exists and is not empty.
    if [ -s "/tmp/${com_pipe_answer}" ]; then
      cat "/tmp/${com_pipe_answer}"
    fi
    case "${com_pipe_answer}" in
      ratelimit)
        log_msg "At least one ratelimit was applied"
        if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
          mail_error "${com_pipe_answer}"
        fi
        ;;
      external_checks)
        log_msg "Your mailcow is an open relay!"
        if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
          mail_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
        fi
        ;;
      mysql_repl_checks)
        log_msg "MySQL replication is not working properly"
        if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
          mail_error "${com_pipe_answer}"
        fi
        ;;
      dovecot_repl_checks)
        # NOTE(review): the hint is handed to log_msg as a second argument here,
        # whereas sibling branches hand theirs to mail_error - confirm intent.
        log_msg "Dovecot replication is not working properly" "Please check doveadm replicator status"
        if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
          mail_error "${com_pipe_answer}"
        fi
        ;;
      acme-mailcow)
        log_msg "acme-mailcow did not complete successfully"
        if [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
          mail_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
        fi
        ;;
      fail2ban)
        # Word splitting is intended: every word of the reply becomes one entry.
        F2B_RES=($(timeout 4s ${REDIS_CMDLINE} --raw GET F2B_RES 2> /dev/null))
        if [[ -n "${F2B_RES}" ]]; then
          ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
          host=
          for host in "${F2B_RES[@]}"; do
            log_msg "Banned ${host}"
            # Replace /tmp/fail2ban with the whois output for this host.
            rm /tmp/fail2ban 2> /dev/null
            timeout 2s whois "${host}" > /tmp/fail2ban
            if [[ -n ${WATCHDOG_NOTIFY_EMAIL} && ${WATCHDOG_NOTIFY_BAN} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
              mail_error "${com_pipe_answer}" "IP ban: ${host}"
            fi
          done
        fi
        ;;
      ?*-mailcow*)
        # Any other token containing "-mailcow" after at least one character.
        # Freeze the agents while the container is being dealt with.
        kill -STOP "${BACKGROUND_TASKS[@]}"
        sleep 10
        # Look up the container id by its compose service label.
        CONTAINER_ID=$(curl --silent --insecure https://dockerapi/containers/json \
          | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], id: .Id}" \
          | jq -rc "select( .name | tostring | contains(\"${com_pipe_answer}\")) | .id")
        if [[ -n ${CONTAINER_ID} ]]; then
          if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
            # Non-empty when the init_db script shows up in the process list.
            HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top \
              | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' \
              | grep true)
          fi
          # Seconds elapsed since the container's State.StartedAt.
          S_RUNNING=$(($(date +%s) - $(curl --silent --insecure https://dockerapi/containers/${CONTAINER_ID}/json | jq .State.StartedAt | xargs -n1 date +%s -d)))
          if [ ${S_RUNNING} -lt 360 ]; then
            log_msg "Container is running for less than 360 seconds, skipping action..."
          elif [[ -n ${HAS_INITDB} ]]; then
            log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
            sleep 60
          else
            log_msg "Sending restart command to ${CONTAINER_ID}..."
            curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/restart
            # No mail is sent for restarts of ipv6nat-mailcow.
            if [[ ${com_pipe_answer} != "ipv6nat-mailcow" && -n ${WATCHDOG_NOTIFY_EMAIL} ]]; then
              mail_error "${com_pipe_answer}"
            fi
            log_msg "Wait for restarted container to settle and continue watching..."
            sleep 35
          fi
        fi
        # Resume the agents and, a second later, deliver USR1 to each of them.
        kill -CONT "${BACKGROUND_TASKS[@]}"
        sleep 1
        kill -USR1 "${BACKGROUND_TASKS[@]}"
        ;;
    esac
  done