watchdog.sh 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097
  1. #!/bin/bash
  # Startup sequence: install signal handling, wait for the stack to settle,
  # honour USE_WATCHDOG / WATCHDOG_VERBOSE, create the communication pipe and
  # block until MySQL and Redis answer.

  # INT/TERM leave through the EXIT trap; on exit, signal the whole process group.
  trap "exit" INT TERM
  trap "kill 0" EXIT
  # Prepare
  BACKGROUND_TASKS=()
  echo "Waiting for containers to settle..."
  for i in {30..1}; do
    echo "${i}"
    sleep 1
  done
  if [[ "${USE_WATCHDOG}" =~ ^([nN][oO]|[nN])+$ ]]; then
    echo -e "$(date) - USE_WATCHDOG=n, skipping watchdog..."
    sleep 365d
    # Quoted so a script path containing whitespace is re-executed intact
    exec "$(readlink -f "$0")"
  fi
  if [[ "${WATCHDOG_VERBOSE}" =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    SMTP_VERBOSE="--verbose"
    set -xv
  else
    SMTP_VERBOSE=""
    # Non-verbose mode: everything written to stderr from here on is discarded
    exec 2>/dev/null
  fi
  # Checks pipe their corresponding container name in this pipe
  if [[ ! -p /tmp/com_pipe ]]; then
    mkfifo /tmp/com_pipe
  fi
  # Wait for containers
  # Credentials are quoted so values with spaces or glob characters stay one argument
  while ! mysqladmin status --socket=/var/run/mysqld/mysqld.sock -u"${DBUSER}" -p"${DBPASS}" --silent; do
    echo "Waiting for SQL..."
    sleep 2
  done
  # Do not attempt to write to slave
  if [[ -n ${REDIS_SLAVEOF_IP} ]]; then
    REDIS_CMDLINE="redis-cli -h ${REDIS_SLAVEOF_IP} -p ${REDIS_SLAVEOF_PORT}"
  else
    REDIS_CMDLINE="redis-cli -h redis -p 6379"
  fi
  # REDIS_CMDLINE is a plain string; it is expanded unquoted on purpose so it
  # splits into the command and its arguments.
  until [[ $(${REDIS_CMDLINE} PING) == "PONG" ]]; do
    echo "Waiting for Redis..."
    sleep 2
  done
  ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
  43. # Common functions
  # Determine the public IPv6 address of this host.
  # Queries one of two lookup hosts (picked at random per attempt) over IPv6
  # and keeps the reply only if it looks like an IPv6 address.
  # Outputs: the address on stdout, or an empty line after 10 failed attempts.
  get_ipv6() {
    local ipv6=
    local -a ipv6_srcs=("ip6.mailcow.email" "ip6.nevondo.com")
    local try=0
    until [[ -n ${ipv6} ]] || (( try >= 10 )); do
      # Pause only before a retry, never after a successful lookup
      (( try > 0 )) && sleep 1
      ipv6=$(curl --connect-timeout 3 -m 10 -L6s "${ipv6_srcs[RANDOM % ${#ipv6_srcs[@]}]}" | grep "^\([0-9a-fA-F]\{0,4\}:\)\{1,7\}[0-9a-fA-F]\{0,4\}$")
      try=$(( try + 1 ))
    done
    echo "${ipv6}"
  }
  # Store in the array named $1 every element of the array named $2 that does
  # not occur in the array named $3 (result is sorted, one element per line).
  # Based on https://stackoverflow.com/questions/2312762, Alex Offshore
  # Arguments: $1 result array name, $2 first array name, $3 second array name
  array_diff() {
    # Indirect expansion replaces the former eval, so the array names are
    # never re-parsed as shell code.
    local _ad_ref_a="${2}[@]"
    local _ad_ref_b="${3}[@]"
    local -a _ad_a=("${!_ad_ref_a}")
    local -a _ad_b=("${!_ad_ref_b}")
    # Join the elements with newlines for sort/comm
    local IFS=$'\n'
    mapfile -t "${1}" < <(comm -23 <(echo "${_ad_a[*]}" | sort) <(echo "${_ad_b[*]}" | sort))
  }
  # Record and log the health level of one service.
  # Arguments: $1 service name
  #            $2 total health points
  #            $3 current health points
  #            $4 change since the previous pass (optional, defaults to 0)
  # Globals:   writes SERVICE, TOTAL, CURRENT, DIFF, PERCENT; reads REDIS_CMDLINE
  # Returns:   10 when the current value is <= 0 (dead service);
  #            0 when nothing was reported (missing values, total <= 0 or
  #            current > total); 1 after a normal report of a live service.
  progress() {
    SERVICE=${1}
    TOTAL=${2}
    CURRENT=${3}
    DIFF=${4}
    [[ -z ${DIFF} ]] && DIFF=0
    [[ -z ${TOTAL} || -z ${CURRENT} ]] && return
    # A total of zero (or less) would make the percentage below divide by zero
    [[ ${TOTAL} -le 0 ]] && return
    [[ ${CURRENT} -gt ${TOTAL} ]] && return
    [[ ${CURRENT} -lt 0 ]] && CURRENT=0
    # Integer percentage rounded to nearest: the "% 2" term adds the half-up carry
    PERCENT=$(( 200 * CURRENT / TOTAL % 2 + 100 * CURRENT / TOTAL ))
    ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"service\":\"${SERVICE}\",\"lvl\":\"${PERCENT}\",\"hpnow\":\"${CURRENT}\",\"hptotal\":\"${TOTAL}\",\"hpdiff\":\"${DIFF}\"}" > /dev/null
    log_msg "${SERVICE} health level: ${PERCENT}% (${CURRENT}/${TOTAL}), health trend: ${DIFF}" no_redis
    # Return 10 to indicate a dead service
    [ "${CURRENT}" -le 0 ] && return 10
  }
  # Print a timestamped message and, unless told otherwise, push it to Redis.
  # Arguments: $1 message text
  #            $2 pass "no_redis" to skip the LPUSH to the WATCHDOG_LOG list
  # Outputs:   "<date> <message>" on stdout
  log_msg() {
    if [[ ${2} != "no_redis" ]]; then
      # Characters that could break the hand-built JSON are replaced by spaces
      ${REDIS_CMDLINE} LPUSH WATCHDOG_LOG "{\"time\":\"$(date +%s)\",\"message\":\"$(printf '%s' "${1}" | tr '\r\n%&;$"_[]{}-' ' ')\"}" > /dev/null
    fi
    # Quoted so the message is printed verbatim: no globbing, no word splitting
    printf '%s %s\n' "$(date)" "${1}"
  }
  # Send a notification mail to every address in WATCHDOG_NOTIFY_EMAIL.
  # Arguments: $1 service name (used in the subject, the throttle key and as
  #               the name of an optional body file /tmp/$1)
  #            $2 optional message text for the body
  #            $3 optional throttle time in seconds
  # Globals:   reads REDIS_CMDLINE, SMTP_VERBOSE, WATCHDOG_SUBJECT,
  #            MAILCOW_HOSTNAME; rewrites WATCHDOG_NOTIFY_EMAIL without the
  #            surrounding double quotes; writes THROTTLE, BODY, SUBJECT,
  #            TTL_LEFT, MAIL_RCPTS, RCPT_DOMAIN, RCPT_MX, CHECK_FOR_VALID_MX
  # Returns:   1 when $1 is missing, the mail is throttled, or at least one
  #            recipient was skipped because no MX could be found; 0 otherwise
  function mail_error() {
    THROTTLE=
    [[ -z ${1} ]] && return 1
    # If exists, body will be the content of "/tmp/${1}", even if ${2} is set
    if [[ -z ${2} ]]; then
      BODY="Service was restarted on $(date), please check your mailcow installation."
    else
      BODY="$(date) - ${2}"
    fi
    # If exists, mail will be throttled by argument in seconds
    [[ -n ${3} ]] && THROTTLE=${3}
    if [[ -n ${THROTTLE} ]]; then
      TTL_LEFT="$(${REDIS_CMDLINE} TTL "THROTTLE_${1}" 2> /dev/null)"
      if [[ "${TTL_LEFT}" == "-2" ]]; then
        # Delay key not found, setting a delay key now
        ${REDIS_CMDLINE} SET "THROTTLE_${1}" 1 EX "${THROTTLE}"
      else
        log_msg "Not sending notification email now, blocked for ${TTL_LEFT} seconds..."
        return 1
      fi
    fi
    WATCHDOG_NOTIFY_EMAIL=$(echo "${WATCHDOG_NOTIFY_EMAIL}" | sed 's/"//;s|"$||')
    # Some exceptions for subject and body formats
    if [[ ${1} == "fail2ban" ]]; then
      SUBJECT="${BODY}"
      BODY="Please see netfilter-mailcow for more details and triggered rules."
    else
      SUBJECT="${WATCHDOG_SUBJECT}: ${1}"
    fi
    IFS=',' read -r -a MAIL_RCPTS <<< "${WATCHDOG_NOTIFY_EMAIL}"
    # Optional flags live in an array so an empty SMTP_VERBOSE adds no argument at all
    local -a smtp_opts=(--missing-modules-ok)
    [[ -n ${SMTP_VERBOSE} ]] && smtp_opts+=("${SMTP_VERBOSE}")
    local rc=0
    for rcpt in "${MAIL_RCPTS[@]}"; do
      # Trim surrounding whitespace so "a@x, b@y" yields two clean addresses
      rcpt="${rcpt#"${rcpt%%[![:space:]]*}"}"
      rcpt="${rcpt%"${rcpt##*[![:space:]]}"}"
      [[ -z ${rcpt} ]] && continue
      RCPT_DOMAIN=
      RCPT_MX=
      RCPT_DOMAIN=${rcpt##*@}
      CHECK_FOR_VALID_MX=$(dig +short "${RCPT_DOMAIN}" mx)
      if [[ -z ${CHECK_FOR_VALID_MX} ]]; then
        log_msg "Cannot determine MX for ${rcpt}, skipping email notification..."
        # Skip only this recipient; the remaining ones are still notified
        rc=1
        continue
      fi
      [ -f "/tmp/${1}" ] && BODY="/tmp/${1}"
      timeout 10s ./smtp-cli "${smtp_opts[@]}" \
        --charset=UTF-8 \
        --subject="${SUBJECT}" \
        --body-plain="${BODY}" \
        --add-header="X-Priority: 1" \
        --to="${rcpt}" \
        --from="watchdog@${MAILCOW_HOSTNAME}" \
        --hello-host="${MAILCOW_HOSTNAME}" \
        --ipv4
      if [[ $? -eq 1 ]]; then # exit code 1 is fine
        log_msg "Sent notification email to ${rcpt}"
      else
        if [[ "${SMTP_VERBOSE}" == "" ]]; then
          log_msg "Error while sending notification email to ${rcpt}. You can enable verbose logging by setting 'WATCHDOG_VERBOSE=y' in mailcow.conf."
        else
          log_msg "Error while sending notification email to ${rcpt}."
        fi
      fi
    done
    return "${rc}"
  }
  # Resolve the IPv4 address of a container.
  # Arguments: $1 container / compose service name
  # Globals:   reads IP_BY_DOCKER_API (0 = resolve through DNS with dig,
  #            anything else = ask the dockerapi service), IPV4_NETWORK and
  #            COMPOSE_PROJECT_NAME; writes CONTAINER_ID, CONTAINER_IPS,
  #            CONTAINER_IP, LOOP_C
  # Outputs:   the address on stdout, or 240.0.0.0 when no valid IPv4 address
  #            was found within 5 attempts
  get_container_ip() {
    # ${1} is container
    local ipv4_re='^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$'
    CONTAINER_ID=()
    CONTAINER_IPS=()
    CONTAINER_IP=
    LOOP_C=1
    until [[ ${CONTAINER_IP} =~ ${ipv4_re} ]] || [[ ${LOOP_C} -gt 5 ]]; do
      if [ "${IP_BY_DOCKER_API}" -eq 0 ]; then
        CONTAINER_IP=$(dig a "${1}" +short)
      else
        sleep 0.5
        # get long container id for exact match
        CONTAINER_ID=($(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring == \"${1}\") | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id"))
        # returned id can have multiple elements (if scaled), shuffle for random test
        CONTAINER_ID=($(printf "%s\n" "${CONTAINER_ID[@]}" | shuf))
        if [[ -n ${CONTAINER_ID} ]]; then
          for matched_container in "${CONTAINER_ID[@]}"; do
            CONTAINER_IPS=($(curl --silent --insecure "https://dockerapi/containers/${matched_container}/json" | jq -r '.NetworkSettings.Networks[].IPAddress'))
            for ip_match in "${CONTAINER_IPS[@]}"; do
              # nothing can match if one of these vars is empty
              [[ -z ${ip_match} ]] && continue
              [[ -z ${IPV4_NETWORK} ]] && continue
              # only return ips that are part of our network
              # (literal substring test, so the dots in IPV4_NETWORK are not wildcards)
              if [[ ${ip_match} == *"${IPV4_NETWORK}"* ]]; then
                CONTAINER_IP=${ip_match}
                break
              fi
            done
            [[ -n ${CONTAINER_IP} ]] && break
          done
        fi
      fi
      LOOP_C=$((LOOP_C + 1))
    done
    # Decide on the address itself, not on the attempt counter: a hit on the
    # 5th attempt leaves LOOP_C at 6 and must still be reported.
    if [[ ${CONTAINER_IP} =~ ${ipv4_re} ]]; then
      echo "${CONTAINER_IP}"
    else
      echo 240.0.0.0
    fi
  }
  # One-time check: when the first three groups of IPV6_NETWORK show up in the
  # local interface listing but no public IPv6 address can be fetched, send a
  # notification mail.
  if ip a s | grep -qi "$(echo ${IPV6_NETWORK} | cut -d: -f1-3)" && [[ -z "$(get_ipv6)" ]]; then
    mail_error "ipv6-config" "enable_ipv6 is true in docker-compose.yml, but an IPv6 link could not be established. Please verify your IPv6 connection."
  fi
  # Health loop for the external checks service.
  # Posts the installation GUID to https://checks.mailcow.email over IPv4 and
  # IPv6; each "critical" response adds one error and stores the reported
  # output in /tmp/external_checks. A pass without new errors removes one.
  # Globals: reads EXTERNAL_CHECKS_THRESHOLD, DBUSER, DBPASS, DBNAME; writes
  #          err_count, diff_c, THRESHOLD, GUID, err_c_cur, CHECK_REPONSE,
  #          CHECK_REPONSE6
  # Returns: 1 once err_count has reached THRESHOLD
  external_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${EXTERNAL_CHECKS_THRESHOLD}
    GUID=$(mysql -u"${DBUSER}" -p"${DBPASS}" "${DBNAME}" -e "SELECT version FROM versions WHERE application = 'GUID'" -BN)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      CHECK_REPONSE="$(curl --connect-timeout 3 -m 10 -4 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
      if [[ -n "${CHECK_REPONSE}" ]] && [[ "$(printf '%s\n' "${CHECK_REPONSE}" | jq -r .response)" == "critical" ]]; then
        printf '%s\n' "${CHECK_REPONSE}" | jq -r .out > /tmp/external_checks
        err_count=$(( err_count + 1 ))
      fi
      CHECK_REPONSE6="$(curl --connect-timeout 3 -m 10 -6 -s https://checks.mailcow.email -X POST -dguid="${GUID}" 2> /dev/null)"
      if [[ -n "${CHECK_REPONSE6}" ]] && [[ "$(printf '%s\n' "${CHECK_REPONSE6}" | jq -r .response)" == "critical" ]]; then
        # Store the IPv6 response here (the IPv4 one used to be written by mistake)
        printf '%s\n' "${CHECK_REPONSE6}" | jq -r .out > /tmp/external_checks
        err_count=$(( err_count + 1 ))
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "External checks" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 20 ) + 1800 ))
      fi
    done
    return 1
  }
  # Health loop for Nginx.
  # Probes HTTP on port 8081 of the nginx-mailcow container; the plugin's exit
  # status is added to err_count, a clean pass removes one error. Plugin
  # output goes to /tmp/nginx-mailcow, which is cut to its last 50 lines
  # before every pass.
  # Globals: reads NGINX_THRESHOLD; writes err_count, diff_c, THRESHOLD,
  #          host_ip, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  nginx_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${NGINX_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/nginx-mailcow
      echo "$(tail -50 /tmp/nginx-mailcow)" > /tmp/nginx-mailcow
      host_ip=$(get_container_ip nginx-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u / -p 8081 2>> /tmp/nginx-mailcow 1>&2; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Nginx" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for Unbound.
  # Resolves stackoverflow.com through the unbound-mailcow container and
  # checks that a DNSSEC query for "com" comes back with the "ad" flag; every
  # failure adds to err_count, a clean pass removes one error. Output goes to
  # /tmp/unbound-mailcow, which is cut to its last 50 lines before every pass.
  # Globals: reads UNBOUND_THRESHOLD; writes err_count, diff_c, THRESHOLD,
  #          host_ip, err_c_cur, DNSSEC
  # Returns: 1 once err_count has reached THRESHOLD
  unbound_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${UNBOUND_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/unbound-mailcow
      echo "$(tail -50 /tmp/unbound-mailcow)" > /tmp/unbound-mailcow
      host_ip=$(get_container_ip unbound-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_dns -s "${host_ip}" -H stackoverflow.com 2>> /tmp/unbound-mailcow 1>&2; err_count=$(( err_count + $? ))
      # grep -E replaces the deprecated egrep alias
      DNSSEC=$(dig com +dnssec | grep -E 'flags:.+ad')
      if [[ -z ${DNSSEC} ]]; then
        echo "DNSSEC failure" 2>> /tmp/unbound-mailcow 1>&2
        err_count=$(( err_count + 1 ))
      else
        echo "DNSSEC check succeeded" 2>> /tmp/unbound-mailcow 1>&2
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Unbound" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for the local Redis container.
  # Sends PING to redis-mailcow:6379 and expects PONG; the plugin's exit
  # status is added to err_count, a clean pass removes one error. Plugin
  # output goes to /tmp/redis-mailcow, which is cut to its last 50 lines
  # before every pass.
  # Globals: reads REDIS_THRESHOLD; writes err_count, diff_c, THRESHOLD,
  #          host_ip, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  redis_checks() {
    # A check for the local redis container
    err_count=0
    diff_c=0
    THRESHOLD=${REDIS_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/redis-mailcow
      echo "$(tail -50 /tmp/redis-mailcow)" > /tmp/redis-mailcow
      # host_ip is resolved but the probe below addresses the container by name
      host_ip=$(get_container_ip redis-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -4 -H redis-mailcow -p 6379 -E -s "PING\n" -q "QUIT" -e "PONG" 2>> /tmp/redis-mailcow 1>&2; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Redis" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for MySQL/MariaDB.
  # Runs a connection check and a simple query check over the local socket;
  # the plugins' exit statuses are added to err_count, a clean pass removes
  # one error. Plugin output goes to /tmp/mysql-mailcow, which is cut to its
  # last 50 lines before every pass.
  # Globals: reads MYSQL_THRESHOLD, DBUSER, DBPASS, DBNAME; writes err_count,
  #          diff_c, THRESHOLD, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  mysql_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MYSQL_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/mysql-mailcow
      echo "$(tail -50 /tmp/mysql-mailcow)" > /tmp/mysql-mailcow
      err_c_cur=${err_count}
      # Credentials are quoted so special characters stay inside one argument
      /usr/lib/nagios/plugins/check_mysql -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_mysql_query -s /var/run/mysqld/mysqld.sock -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" -q "SELECT COUNT(*) FROM information_schema.tables" 2>> /tmp/mysql-mailcow 1>&2; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "MySQL/MariaDB" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for MySQL/MariaDB replication.
  # Runs the slave status plugin as root over the local socket; its exit
  # status is added to err_count, a clean pass removes one error. Plugin
  # output goes to /tmp/mysql_repl_checks, which is cut to its last 50 lines
  # before every pass.
  # Globals: reads MYSQL_REPLICATION_THRESHOLD, DBROOT; writes err_count,
  #          diff_c, THRESHOLD, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  mysql_repl_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MYSQL_REPLICATION_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/mysql_repl_checks
      echo "$(tail -50 /tmp/mysql_repl_checks)" > /tmp/mysql_repl_checks
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_mysql_slavestatus.sh -S /var/run/mysqld/mysqld.sock -u root -p "${DBROOT}" 2>> /tmp/mysql_repl_checks 1>&2; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "MySQL/MariaDB replication" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for SOGo.
  # Probes HTTP path /SOGo.index/ on port 20000 of the sogo-mailcow container;
  # the plugin's exit status is added to err_count, a clean pass removes one
  # error. Plugin output goes to /tmp/sogo-mailcow, which is cut to its last
  # 50 lines before every pass.
  # Globals: reads SOGO_THRESHOLD; writes err_count, diff_c, THRESHOLD,
  #          host_ip, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  sogo_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${SOGO_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/sogo-mailcow
      echo "$(tail -50 /tmp/sogo-mailcow)" > /tmp/sogo-mailcow
      host_ip=$(get_container_ip sogo-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_http -4 -H "${host_ip}" -u /SOGo.index/ -p 20000 2>> /tmp/sogo-mailcow 1>&2; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "SOGo" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for Postfix.
  # Runs two SMTP probes against port 589 of the postfix-mailcow container (a
  # test transaction and a STARTTLS check); the plugins' exit statuses are
  # added to err_count, a clean pass removes one error. Plugin output goes to
  # /tmp/postfix-mailcow, which is cut to its last 50 lines before every pass.
  # Globals: reads POSTFIX_THRESHOLD; writes err_count, diff_c, THRESHOLD,
  #          host_ip, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  postfix_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${POSTFIX_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/postfix-mailcow
      echo "$(tail -50 /tmp/postfix-mailcow)" > /tmp/postfix-mailcow
      host_ip=$(get_container_ip postfix-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -f "watchdog@invalid" -C "RCPT TO:watchdog@localhost" -C DATA -C . -R 250 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 589 -S 2>> /tmp/postfix-mailcow 1>&2; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Postfix" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for Clamd.
  # Runs the clamd plugin against the clamd-mailcow container; its exit status
  # is added to err_count, a clean pass removes one error. Plugin output goes
  # to /tmp/clamd-mailcow, which is cut to its last 50 lines before every pass.
  # Globals: reads CLAMD_THRESHOLD; writes err_count, diff_c, THRESHOLD,
  #          host_ip, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  clamd_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${CLAMD_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/clamd-mailcow
      echo "$(tail -50 /tmp/clamd-mailcow)" > /tmp/clamd-mailcow
      host_ip=$(get_container_ip clamd-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_clamd -4 -H "${host_ip}" 2>> /tmp/clamd-mailcow 1>&2; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Clamd" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 120 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for Dovecot.
  # Probes the dovecot-mailcow container on ports 24 (SMTP-style dialogue),
  # 993 and 143 (IMAP), 10001 and 4190 (TCP banner checks); the plugins' exit
  # statuses are added to err_count, a clean pass removes one error. Plugin
  # output goes to /tmp/dovecot-mailcow, which is cut to its last 50 lines
  # before every pass.
  # Globals: reads DOVECOT_THRESHOLD; writes err_count, diff_c, THRESHOLD,
  #          host_ip, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  dovecot_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${DOVECOT_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/dovecot-mailcow
      echo "$(tail -50 /tmp/dovecot-mailcow)" > /tmp/dovecot-mailcow
      host_ip=$(get_container_ip dovecot-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -4 -H "${host_ip}" -p 24 -f "watchdog@invalid" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 993 -S -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_imap -4 -H "${host_ip}" -p 143 -e "OK " 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10001 -e "VERSION" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 4190 -e "Dovecot ready" 2>> /tmp/dovecot-mailcow 1>&2; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Dovecot" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for Dovecot replication.
  # Reads the Redis key DOVECOT_REPL_HEALTH on every pass; any value other
  # than "1" adds one error, a clean pass removes one.
  # Globals: reads DOVECOT_REPL_THRESHOLD; writes err_count, diff_c,
  #          THRESHOLD, D_REPL_STATUS, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  dovecot_repl_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${DOVECOT_REPL_THRESHOLD}
    # --raw, as in the loop below; "-r" is redis-cli's repeat-count option and
    # would have consumed "GET" as its argument.
    D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      D_REPL_STATUS=$(redis-cli --raw -h redis GET DOVECOT_REPL_HEALTH)
      if [[ "${D_REPL_STATUS}" != "1" ]]; then
        err_count=$(( err_count + 1 ))
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Dovecot replication" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for the primary certificate expiry check.
  # Asks the SMTP (port 589) and IMAP (port 993) TLS endpoints whether their
  # certificate is valid for at least 7 more days ("-D 7"); the plugins' exit
  # statuses are added to err_count, a clean pass removes one error. Plugin
  # output goes to /tmp/certcheck, which is cut to its last 50 lines before
  # every pass. The threshold is fixed at 7.
  # Globals: writes err_count, diff_c, THRESHOLD, host_ip_postfix,
  #          host_ip_dovecot, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  cert_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=7
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/certcheck
      echo "$(tail -50 /tmp/certcheck)" > /tmp/certcheck
      # Same service names as postfix_checks / dovecot_checks, so the lookup
      # also matches when containers are resolved by compose service label.
      host_ip_postfix=$(get_container_ip postfix-mailcow)
      host_ip_dovecot=$(get_container_ip dovecot-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -H "${host_ip_postfix}" -p 589 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_imap -H "${host_ip_dovecot}" -p 993 -4 -S -D 7 2>> /tmp/certcheck 1>&2; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Primary certificate expiry check" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      # Always sleep 5 minutes, mail notifications are limited
      sleep 300
    done
    return 1
  }
  # Health loop for PHP-FPM.
  # Checks that TCP ports 9001 and 9002 of the php-fpm-mailcow container
  # accept connections; the plugins' exit statuses are added to err_count, a
  # clean pass removes one error. Plugin output goes to /tmp/php-fpm-mailcow,
  # which is cut to its last 50 lines before every pass.
  # Globals: reads PHPFPM_THRESHOLD; writes err_count, diff_c, THRESHOLD,
  #          host_ip, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  phpfpm_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${PHPFPM_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/php-fpm-mailcow
      echo "$(tail -50 /tmp/php-fpm-mailcow)" > /tmp/php-fpm-mailcow
      host_ip=$(get_container_ip php-fpm-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9001 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -H "${host_ip}" -p 9002 2>> /tmp/php-fpm-mailcow 1>&2; err_count=$(( err_count + $? ))
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "PHP-FPM" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for applied ratelimits.
  # Compares the qid of the newest RL_LOG entry in Redis with the one seen on
  # the previous pass; a change adds one error and writes a report with the
  # latest entries to /tmp/ratelimit. A pass without a change removes one.
  # Globals: reads RATELIMIT_THRESHOLD; writes err_count, diff_c, THRESHOLD,
  #          RL_LOG_STATUS, RL_LOG_STATUS_PREV, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  ratelimit_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${RATELIMIT_THRESHOLD}
    RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      RL_LOG_STATUS_PREV=${RL_LOG_STATUS}
      RL_LOG_STATUS=$(redis-cli -h redis LRANGE RL_LOG 0 0 | jq .qid)
      # Right-hand side quoted: compare literally instead of as a glob pattern
      if [[ "${RL_LOG_STATUS_PREV}" != "${RL_LOG_STATUS}" ]]; then
        err_count=$(( err_count + 1 ))
        echo 'Last 10 applied ratelimits (may overlap with previous reports).' > /tmp/ratelimit
        echo 'Full ratelimit buckets can be emptied by deleting the ratelimit hash from within mailcow UI (see /debug -> Protocols -> Ratelimit):' >> /tmp/ratelimit
        echo >> /tmp/ratelimit
        # LRANGE bounds are inclusive, so 0..9 yields the announced 10 entries
        redis-cli --raw -h redis LRANGE RL_LOG 0 9 | jq . >> /tmp/ratelimit
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Ratelimit" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for the mail queue.
  # Counts the files below /var/spool/postfix/deferred; a count of MAILQ_CRIT
  # or more adds one error, a pass below the limit removes one. The count is
  # logged to /tmp/mail_queue_status, which is cut to its last 50 lines before
  # every pass.
  # Globals: reads MAILQ_THRESHOLD, MAILQ_CRIT; writes err_count, diff_c,
  #          THRESHOLD, MAILQ_LOG_STATUS, err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  mailq_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${MAILQ_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/mail_queue_status
      echo "$(tail -50 /tmp/mail_queue_status)" > /tmp/mail_queue_status
      MAILQ_LOG_STATUS=$(find /var/spool/postfix/deferred -type f | wc -l)
      echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
      err_c_cur=${err_count}
      if [ "${MAILQ_LOG_STATUS}" -ge "${MAILQ_CRIT}" ]; then
        err_count=$(( err_count + 1 ))
        # The status line is written a second time when the limit is reached
        echo "Mail queue contains ${MAILQ_LOG_STATUS} items (critical limit is ${MAILQ_CRIT}) at $(date)" >> /tmp/mail_queue_status
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Mail queue" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 60
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for Fail2ban.
  # Compares the keys of the Redis hash F2B_ACTIVE_BANS with those seen on the
  # previous pass; newly added keys add one error and are stored (filtered to
  # address characters) in the Redis key F2B_RES. A pass without new keys
  # removes one error.
  # Globals: reads FAIL2BAN_THRESHOLD, REDIS_CMDLINE; writes err_count,
  #          diff_c, THRESHOLD, F2B_LOG_STATUS, F2B_LOG_STATUS_PREV, F2B_RES,
  #          err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  fail2ban_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${FAIL2BAN_THRESHOLD}
    F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
    F2B_RES=
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      F2B_LOG_STATUS_PREV=("${F2B_LOG_STATUS[@]}")
      F2B_LOG_STATUS=($(${REDIS_CMDLINE} --raw HKEYS F2B_ACTIVE_BANS))
      array_diff F2B_RES F2B_LOG_STATUS F2B_LOG_STATUS_PREV
      if [[ -n "${F2B_RES}" ]]; then
        err_count=$(( err_count + 1 ))
        if ! echo -n "${F2B_RES[@]}" | tr -cd "[a-fA-F0-9.:/] " | timeout 3s ${REDIS_CMDLINE} -x SET F2B_RES > /dev/null; then
          # No -x here: with it, DEL would wait for an extra argument on stdin
          ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
        fi
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "Fail2ban" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Health loop for ACME.
  # Watches the Redis key ACME_FAIL_TIME (initialised to 0 when empty); every
  # change of its value between two passes adds one error, an unchanged value
  # removes one.
  # Globals: reads ACME_THRESHOLD, REDIS_CMDLINE; writes err_count, diff_c,
  #          THRESHOLD, ACME_LOG_STATUS, ACME_LOG_STATUS_PREV, ACME_LC,
  #          err_c_cur
  # Returns: 1 once err_count has reached THRESHOLD
  acme_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${ACME_THRESHOLD}
    ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME)
    if [[ -z "${ACME_LOG_STATUS}" ]]; then
      ${REDIS_CMDLINE} SET ACME_FAIL_TIME 0
      ACME_LOG_STATUS=0
    fi
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion until USR1 arrives; with double quotes
    # the handler was frozen with err_count=0 and never changed anything.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      ACME_LOG_STATUS_PREV=${ACME_LOG_STATUS}
      # Clear the cached value first; otherwise the retry loop below sees a
      # non-empty value, never runs, and a change can never be detected.
      ACME_LOG_STATUS=
      ACME_LC=0
      until [[ -n ${ACME_LOG_STATUS} ]] || [ "${ACME_LC}" -ge 3 ]; do
        ACME_LOG_STATUS=$(redis-cli -h redis GET ACME_FAIL_TIME 2> /dev/null)
        # Pause only when another attempt is needed
        [[ -z ${ACME_LOG_STATUS} ]] && sleep 3
        ACME_LC=$((ACME_LC+1))
      done
      # All reads failed: keep the previous value instead of counting a phantom change
      [[ -z ${ACME_LOG_STATUS} ]] && ACME_LOG_STATUS=${ACME_LOG_STATUS_PREV}
      if [[ "${ACME_LOG_STATUS_PREV}" != "${ACME_LOG_STATUS}" ]]; then
        err_count=$(( err_count + 1 ))
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      elif [ "${err_count}" -gt 0 ]; then
        # Clean pass: recover one point, never dropping below zero
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      progress "ACME" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # rspamd_checks
  # Runs two probes per round and appends their outcome to /tmp/rspamd-mailcow
  # (trimmed to its last 50 lines at the start of every round):
  #   1. posts a minimal message to the Rspamd scan endpoint over its unix
  #      socket and expects .default.required_score to be 9999;
  #   2. connects to port 9900 on the rspamd-mailcow container and treats only
  #      a curl timeout (exit status 28) as a failure.
  # Each failed probe adds one to err_count.
  # Globals read:    RSPAMD_THRESHOLD
  # Globals written: err_count, err_c_cur, diff_c, THRESHOLD, host_ip, SCORE
  # Returns: 1 once err_count has reached THRESHOLD (never returns 0).
  rspamd_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${RSPAMD_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion of err_count until the signal arrives.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/rspamd-mailcow; echo "$(tail -50 /tmp/rspamd-mailcow)" > /tmp/rspamd-mailcow
      host_ip=$(get_container_ip rspamd-mailcow)
      err_c_cur=${err_count}
      # curl is resolved through PATH like every other call in this file; the
      # former relative "usr/bin/curl" only worked from the root directory.
      SCORE=$(printf '%s\n' 'To: null@localhost' 'From: watchdog@localhost' 'Empty' '' \
        | curl --max-time 10 -s --data-binary @- --unix-socket /var/lib/rspamd/rspamd.sock http://rspamd/scan \
        | jq -rc .default.required_score)
      if [[ "${SCORE}" != "9999" ]]; then
        echo "Rspamd settings check failed, score returned: ${SCORE}" >> /tmp/rspamd-mailcow
        err_count=$(( err_count + 1 ))
      else
        echo "Rspamd settings check succeeded, score returned: ${SCORE}" >> /tmp/rspamd-mailcow
      fi
      # A dirty hack until a PING PONG event is implemented to worker proxy
      # We expect an empty response, not a timeout
      curl -s --max-time 10 "${host_ip}:9900" > /dev/null 2>&1
      if [ $? -eq 28 ]; then
        echo "Milter check failed" >> /tmp/rspamd-mailcow
        err_count=$(( err_count + 1 ))
      else
        echo "Milter check succeeded" >> /tmp/rspamd-mailcow
      fi
      # A round without a new error lowers the counter by one (never below 0).
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Rspamd" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      # Status 10 from progress shortens the pause before the next round.
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # olefy_checks
  # Probes TCP port 10055 on the olefy-mailcow container with the Nagios
  # check_tcp plugin, sending "PING\n". The plugin's exit status is added to
  # err_count, and its output is appended to /tmp/olefy-mailcow (trimmed to the
  # last 50 lines at the start of every round).
  # Globals read:    OLEFY_THRESHOLD
  # Globals written: err_count, err_c_cur, diff_c, THRESHOLD, host_ip
  # Returns: 1 once err_count has reached THRESHOLD (never returns 0).
  olefy_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=${OLEFY_THRESHOLD}
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes defer the expansion of err_count until the signal arrives;
    # double quotes would freeze it at 0 and make the handler a no-op.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      touch /tmp/olefy-mailcow; echo "$(tail -50 /tmp/olefy-mailcow)" > /tmp/olefy-mailcow
      host_ip=$(get_container_ip olefy-mailcow)
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_tcp -4 -H "${host_ip}" -p 10055 -s "PING\n" >> /tmp/olefy-mailcow 2>&1
      err_count=$(( err_count + $? ))
      # A round without a new error lowers the counter by one (never below 0).
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -ge 1 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      if [ "${err_c_cur}" -ne "${err_count}" ]; then
        diff_c=$(( err_c_cur - err_count ))
      fi
      progress "Olefy" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
      # Status 10 from progress shortens the pause before the next round.
      if [[ $? == 10 ]]; then
        diff_c=0
        sleep 1
      else
        diff_c=0
        sleep $(( ( RANDOM % 60 ) + 20 ))
      fi
    done
    return 1
  }
  # Announce by mail that monitoring has begun, provided a notification
  # address is configured.
  if [[ -n "${WATCHDOG_NOTIFY_EMAIL}" ]]; then
    mail_error "watchdog-mailcow" "Watchdog started monitoring mailcow."
  fi
  # Create watchdog agents
  #
  # spawn_watchdog_agent CHECK_FN LOG_TEXT PIPE_TOKEN
  # Starts a background subshell that calls CHECK_FN in an endless loop.
  # Whenever CHECK_FN returns non-zero, LOG_TEXT is logged via log_msg and
  # PIPE_TOKEN is written to /tmp/com_pipe, where the action loop reads it.
  # Globals written: PID (PID of the new subshell), BACKGROUND_TASKS (appended)
  spawn_watchdog_agent() {
    local check_fn=$1
    local log_text=$2
    local pipe_token=$3
    (
      while true; do
        if ! "${check_fn}"; then
          log_msg "${log_text}"
          echo "${pipe_token}" > /tmp/com_pipe
        fi
      done
    ) &
    PID=$!
    echo "Spawned ${check_fn} with PID ${PID}"
    BACKGROUND_TASKS+=(${PID})
  }

  spawn_watchdog_agent nginx_checks "Nginx hit error limit" nginx-mailcow

  # Optional agents are started only when their switch is set accordingly.
  if [[ ${WATCHDOG_EXTERNAL_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    spawn_watchdog_agent external_checks "External checks hit error limit" external_checks
  fi

  if [[ ${WATCHDOG_MYSQL_REPLICATION_CHECKS} =~ ^([yY][eE][sS]|[yY])+$ ]]; then
    spawn_watchdog_agent mysql_repl_checks "MySQL replication check hit error limit" mysql_repl_checks
  fi

  spawn_watchdog_agent mysql_checks "MySQL hit error limit" mysql-mailcow
  spawn_watchdog_agent redis_checks "Local Redis hit error limit" redis-mailcow
  spawn_watchdog_agent phpfpm_checks "PHP-FPM hit error limit" php-fpm-mailcow

  if [[ "${SKIP_SOGO}" =~ ^([nN][oO]|[nN])+$ ]]; then
    spawn_watchdog_agent sogo_checks "SOGo hit error limit" sogo-mailcow
  fi

  if [ "${CHECK_UNBOUND}" -eq 1 ]; then
    spawn_watchdog_agent unbound_checks "Unbound hit error limit" unbound-mailcow
  fi

  if [[ "${SKIP_CLAMD}" =~ ^([nN][oO]|[nN])+$ ]]; then
    spawn_watchdog_agent clamd_checks "Clamd hit error limit" clamd-mailcow
  fi

  spawn_watchdog_agent postfix_checks "Postfix hit error limit" postfix-mailcow
  spawn_watchdog_agent mailq_checks "Mail queue hit error limit" mail_queue_status
  spawn_watchdog_agent dovecot_checks "Dovecot hit error limit" dovecot-mailcow
  # Distinct text so this agent cannot be confused with dovecot_checks in the log.
  spawn_watchdog_agent dovecot_repl_checks "Dovecot replication hit error limit" dovecot_repl_checks
  spawn_watchdog_agent rspamd_checks "Rspamd hit error limit" rspamd-mailcow
  spawn_watchdog_agent ratelimit_checks "Ratelimit hit error limit" ratelimit
  spawn_watchdog_agent fail2ban_checks "Fail2ban hit error limit" fail2ban
  spawn_watchdog_agent cert_checks "Cert check hit error limit" certcheck
  spawn_watchdog_agent olefy_checks "Olefy hit error limit" olefy-mailcow
  spawn_watchdog_agent acme_checks "ACME client hit error limit" acme-mailcow
  # Supervise the spawned agents: as soon as one of them is gone, signal PID 1
  # with TERM so the whole watchdog stops and Docker can respawn it
  # (restart:always:n). Each PID probe is followed by a 10 second pause.
  (
    until false; do
      for worker_pid in ${BACKGROUND_TASKS[*]}; do
        # kill -0 sends no signal; it only reports whether the PID still exists.
        kill -0 ${worker_pid} 1>&2 || {
          log_msg "Worker ${worker_pid} died, stopping watchdog and waiting for respawn..."
          kill -TERM 1
        }
        sleep 10
      done
    done
  ) &
  # Watch the dockerapi service: while port 443 is unreachable, all agents are
  # frozen with STOP; once it answers again they are resumed with CONT and
  # then sent USR1.
  (
    while true; do
      # Phase 1: idle for as long as dockerapi keeps answering.
      until ! nc -z dockerapi 443; do
        sleep 3
      done
      log_msg "Cannot find dockerapi-mailcow, waiting to recover..."
      kill -STOP ${BACKGROUND_TASKS[*]}
      # Phase 2: hold the agents until dockerapi is back.
      while ! nc -z dockerapi 443; do
        sleep 3
      done
      kill -CONT ${BACKGROUND_TASKS[*]}
      kill -USR1 ${BACKGROUND_TASKS[*]}
    done
  ) &
  # React to threshold events: every line read from /tmp/com_pipe is a token
  # written by one of the agents. Tokens naming a check only log and (if
  # configured) mail; tokens matching "<something>-mailcow" additionally pause
  # the agents and ask dockerapi to restart the matching container.
  while true; do
    CONTAINER_ID=
    HAS_INITDB=
    read com_pipe_answer </tmp/com_pipe
    # Print the log collected for this token, if one exists and is non-empty.
    [ -s "/tmp/${com_pipe_answer}" ] && cat "/tmp/${com_pipe_answer}"
    # Order matters: acme-mailcow must be handled before the generic
    # *-mailcow arm, which would otherwise restart the container.
    case "${com_pipe_answer}" in
      ratelimit)
        log_msg "At least one ratelimit was applied"
        [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
        ;;
      mail_queue_status)
        log_msg "Mail queue status is critical"
        [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
        ;;
      external_checks)
        log_msg "Your mailcow is an open relay!"
        # A second argument to mail_error replaces the default message text.
        [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please stop mailcow now and check your network configuration!"
        ;;
      mysql_repl_checks)
        log_msg "MySQL replication is not working properly"
        # Custom text, throttled to one mail per 10 minutes.
        [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the SQL replication status" 600
        ;;
      dovecot_repl_checks)
        log_msg "Dovecot replication is not working properly"
        # Custom text, throttled to one mail per 10 minutes.
        [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check the Dovecot replicator status" 600
        ;;
      certcheck)
        log_msg "Certificates are about to expire"
        # Custom text, throttled to one mail per day.
        [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please renew your certificate" 86400
        ;;
      acme-mailcow)
        log_msg "acme-mailcow did not complete successfully"
        # A second argument to mail_error replaces the default message text.
        [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}" "Please check acme-mailcow for further information."
        ;;
      fail2ban)
        # Fetch the ban list published under the Redis key F2B_RES.
        F2B_RES=($(timeout 4s ${REDIS_CMDLINE} --raw GET F2B_RES 2> /dev/null))
        if [[ -n "${F2B_RES}" ]]; then
          ${REDIS_CMDLINE} DEL F2B_RES > /dev/null
          host=
          for host in "${F2B_RES[@]}"; do
            log_msg "Banned ${host}"
            # Refresh /tmp/fail2ban with whois data for this host.
            rm /tmp/fail2ban 2> /dev/null
            timeout 2s whois "${host}" > /tmp/fail2ban
            [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && [[ ${WATCHDOG_NOTIFY_BAN} =~ ^([yY][eE][sS]|[yY])+$ ]] && mail_error "${com_pipe_answer}" "IP ban: ${host}"
          done
        fi
        ;;
      ?*-mailcow*)
        # Freeze all agents while the container is inspected or restarted.
        kill -STOP ${BACKGROUND_TASKS[*]}
        sleep 10
        CONTAINER_ID=$(curl --silent --insecure https://dockerapi/containers/json | jq -r ".[] | {name: .Config.Labels[\"com.docker.compose.service\"], project: .Config.Labels[\"com.docker.compose.project\"], id: .Id}" | jq -rc "select( .name | tostring | contains(\"${com_pipe_answer}\")) | select( .project | tostring | contains(\"${COMPOSE_PROJECT_NAME,,}\")) | .id")
        if [[ -n ${CONTAINER_ID} ]]; then
          if [[ "${com_pipe_answer}" == "php-fpm-mailcow" ]]; then
            # Non-empty when the container's process list contains the init_db script.
            HAS_INITDB=$(curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/top | jq '.msg.Processes[] | contains(["php -c /usr/local/etc/php -f /web/inc/init_db.inc.php"])' | grep true)
          fi
          # Seconds elapsed since the container's reported start time.
          S_RUNNING=$(($(date +%s) - $(curl --silent --insecure https://dockerapi/containers/${CONTAINER_ID}/json | jq .State.StartedAt | xargs -n1 date +%s -d)))
          if [ ${S_RUNNING} -lt 360 ]; then
            log_msg "Container is running for less than 360 seconds, skipping action..."
          elif [[ -n ${HAS_INITDB} ]]; then
            log_msg "Database is being initialized by php-fpm-mailcow, not restarting but delaying checks for a minute..."
            sleep 60
          else
            log_msg "Sending restart command to ${CONTAINER_ID}..."
            curl --silent --insecure -XPOST https://dockerapi/containers/${CONTAINER_ID}/restart
            [[ -n ${WATCHDOG_NOTIFY_EMAIL} ]] && mail_error "${com_pipe_answer}"
            log_msg "Wait for restarted container to settle and continue watching..."
            sleep 35
          fi
        fi
        # Resume the agents, then signal USR1 to them.
        kill -CONT ${BACKGROUND_TASKS[*]}
        sleep 1
        kill -USR1 ${BACKGROUND_TASKS[*]}
        ;;
    esac
  done