watchdog.sh 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. #!/bin/bash
  # Remember the PID of the main shell; the agent monitor signals it when a
  # worker disappears.
  PARENT_PID=$$
  # PIDs of the background check agents, filled in as they are started.
  BACKGROUND_TASKS=()
  # INT/TERM become a regular exit so that the EXIT trap below always fires.
  trap 'exit' INT TERM
  # On exit, signal every process in the current process group.
  trap 'kill 0' EXIT
  # Watchdog switched off (USE_WATCHDOG=no, any case): idle instead of checking.
  if [[ "${USE_WATCHDOG}" =~ ^([nN][oO])+$ ]]; then
    echo "Skipping watchdog, sleeping..."
    sleep 365d
    exit 0
  fi
  # The check agents write the name of their container into this pipe.
  [[ -p /tmp/com_pipe ]] || mkfifo /tmp/com_pipe
  17. # Common functions
  # Print one health report line for a service: timestamp, a 50-character bar
  # ('>' for health, '.' for the rest), the percentage and the health trend.
  # Arguments:
  #   $1 - service name shown in the report
  #   $2 - total (value that corresponds to 100 %)
  #   $3 - current health value; values below 0 are shown as 0
  #   $4 - change since the last report (optional, defaults to 0)
  # Outputs: the report line on stdout.
  # Returns: 0. Prints nothing when total/current are missing, when total is
  #          not positive, or when current exceeds total.
  progress() {
    local service=${1}
    local total=${2}
    local current=${3}
    local diff=${4:-0}
    local percent completed remaining
    local bar_done bar_todo

    [[ -z ${total} || -z ${current} ]] && return 0
    # Guard the divisions below against a zero (or negative) total.
    (( total > 0 )) || return 0
    (( current > total )) && return 0
    (( current < 0 )) && current=0

    # Integer percentage, rounded to the nearest whole number.
    percent=$(( 200 * current / total % 2 + 100 * current / total ))
    completed=$(( percent / 2 ))
    remaining=$(( 50 - completed ))

    # Build both bar segments by padding to the wanted width; a width of 0
    # yields an empty segment (printf over $(seq 0) would still emit one char).
    printf -v bar_done '%*s' "${completed}" ''
    printf -v bar_todo '%*s' "${remaining}" ''
    bar_done=${bar_done// />}
    bar_todo=${bar_todo// /.}

    printf '%s Health level: [%s%s] %s%% - Service: %s, health trend: ' \
      "$(date)" "${bar_done}" "${bar_todo}" "${percent}" "${service}"
    # A diff starting with "-" and a non-zero digit is a negative trend.
    if [[ ${diff} =~ ^-[1-9] ]]; then
      printf '\e[31mnegative \e[0m'
    else
      printf '\e[32mpositive \e[0m'
    fi
    printf '(%s)\n' "${diff}"
  }
  38. # Check functions
  # Probe Nginx in a loop until the accumulated error count reaches THRESHOLD.
  # Every probe's exit status is added to err_count; a round without new errors
  # takes one point off again (never below zero). After each round the health
  # level is reported through progress().
  # Globals: err_count, err_c_cur, diff_c, THRESHOLD (written). They are kept
  #          global on purpose: the USR1 trap stays installed after this
  #          function returns and must still be able to see err_count.
  # Returns: 1 once err_count is no longer below THRESHOLD (never 0).
  nginx_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=16
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes are essential: the handler must read err_count when the
    # signal arrives, not the value it had when the trap was installed.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_ping -H nginx-mailcow -w 2000,10% -c 4000,100% -p2 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_http -H nginx-mailcow -u / -p 8081 1>&2; err_count=$(( err_count + $? ))
      # Random 10-39 s pause between rounds.
      sleep $(( ( RANDOM % 30 ) + 10 ))
      # Clean round: recover one point, but never drop below zero.
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      # Positive diff means the error count went down, negative that it grew.
      [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
      progress "Nginx" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    done
    return 1
  }
  # Probe MySQL/MariaDB in a loop until the accumulated error count reaches
  # THRESHOLD. Every probe's exit status is added to err_count; a round without
  # new errors takes one point off again (never below zero). After each round
  # the health level is reported through progress().
  # Globals: DBUSER, DBPASS, DBNAME (read); err_count, err_c_cur, diff_c,
  #          THRESHOLD (written). The counters are kept global on purpose: the
  #          USR1 trap stays installed after this function returns and must
  #          still be able to see err_count.
  # Returns: 1 once err_count is no longer below THRESHOLD (never 0).
  mysql_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=12
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes are essential: the handler must read err_count when the
    # signal arrives, not the value it had when the trap was installed.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      # Credentials are quoted so that spaces or glob characters in them
      # cannot split or expand into extra arguments.
      # NOTE(review): the password is still visible on the command line.
      /usr/lib/nagios/plugins/check_mysql -H mysql-mailcow -P 3306 -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_mysql_query -H mysql-mailcow -P 3306 -u "${DBUSER}" -p "${DBPASS}" -d "${DBNAME}" -q "SELECT COUNT(*) FROM information_schema.tables" 1>&2; err_count=$(( err_count + $? ))
      # Random 10-39 s pause between rounds.
      sleep $(( ( RANDOM % 30 ) + 10 ))
      # Clean round: recover one point, but never drop below zero.
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      # Positive diff means the error count went down, negative that it grew.
      [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
      progress "MySQL/MariaDB" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    done
    return 1
  }
  # Probe SOGo (directly and through Nginx) in a loop until the accumulated
  # error count reaches THRESHOLD. Every probe's exit status is added to
  # err_count; a round without new errors takes one point off again (never
  # below zero). After each round the health level is reported through
  # progress().
  # Globals: err_count, err_c_cur, diff_c, THRESHOLD (written). They are kept
  #          global on purpose: the USR1 trap stays installed after this
  #          function returns and must still be able to see err_count.
  # Returns: 1 once err_count is no longer below THRESHOLD (never 0).
  sogo_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=20
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes are essential: the handler must read err_count when the
    # signal arrives, not the value it had when the trap was installed.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_http -H sogo-mailcow -u /WebServerResources/css/theme-default.css -p 9192 -R md-default-theme 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_http -H sogo-mailcow -u /SOGo.index/ -p 20000 -R "SOGo\sGroupware" 1>&2; err_count=$(( err_count + $? ))
      # Through Nginx the page must NOT contain "Bad Gateway".
      /usr/lib/nagios/plugins/check_http -H nginx-mailcow -u /SOGo/ -p 443 --ssl -R "Bad Gateway" --invert-regex 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_http -H nginx-mailcow -u /SOGo/ -p 80 -R "Bad Gateway" --invert-regex 1>&2; err_count=$(( err_count + $? ))
      # Random 10-39 s pause between rounds.
      sleep $(( ( RANDOM % 30 ) + 10 ))
      # Clean round: recover one point, but never drop below zero.
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      # Positive diff means the error count went down, negative that it grew.
      [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
      progress "SOGo" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    done
    return 1
  }
  # Probe Postfix in a loop until the accumulated error count reaches
  # THRESHOLD. Every probe's exit status is added to err_count; a round without
  # new errors takes one point off again (never below zero). After each round
  # the health level is reported through progress().
  # Globals: err_count, err_c_cur, diff_c, THRESHOLD (written). They are kept
  #          global on purpose: the USR1 trap stays installed after this
  #          function returns and must still be able to see err_count.
  # Returns: 1 once err_count is no longer below THRESHOLD (never 0).
  postfix_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=16
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes are essential: the handler must read err_count when the
    # signal arrives, not the value it had when the trap was installed.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_smtp -H postfix-mailcow -p 25 1>&2; err_count=$(( err_count + $? ))
      # Full SMTP dialogue on port 588: sender "watchdog", RCPT, DATA, ".",
      # expecting a 250 reply.
      /usr/lib/nagios/plugins/check_smtp -H postfix-mailcow -p 588 -f watchdog -C "RCPT TO:null@localhost" -C DATA -C . -R 250 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_smtp -H postfix-mailcow -p 587 -S 1>&2; err_count=$(( err_count + $? ))
      # Random 10-39 s pause between rounds.
      sleep $(( ( RANDOM % 30 ) + 10 ))
      # Clean round: recover one point, but never drop below zero.
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      # Positive diff means the error count went down, negative that it grew.
      [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
      progress "Postfix" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    done
    return 1
  }
  # Probe Dovecot in a loop until the accumulated error count reaches
  # THRESHOLD. Every probe's exit status is added to err_count; a round without
  # new errors takes one point off again (never below zero). After each round
  # the health level is reported through progress().
  # Globals: err_count, err_c_cur, diff_c, THRESHOLD (written). They are kept
  #          global on purpose: the USR1 trap stays installed after this
  #          function returns and must still be able to see err_count.
  # Returns: 1 once err_count is no longer below THRESHOLD (never 0).
  dovecot_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=24
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes are essential: the handler must read err_count when the
    # signal arrives, not the value it had when the trap was installed.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      # Port 24: a delivery attempt to an invalid recipient must be answered
      # with "User doesn't exist".
      /usr/lib/nagios/plugins/check_smtp -H dovecot-mailcow -p 24 -f "watchdog" -C "RCPT TO:<watchdog@invalid>" -L -R "User doesn't exist" 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_imap -H dovecot-mailcow -p 993 -S -e "OK " 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_imap -H dovecot-mailcow -p 143 -e "OK " 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -H dovecot-mailcow -p 10001 -e "VERSION" 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_tcp -H dovecot-mailcow -p 4190 -e "Dovecot ready" 1>&2; err_count=$(( err_count + $? ))
      # Random 10-39 s pause between rounds.
      sleep $(( ( RANDOM % 30 ) + 10 ))
      # Clean round: recover one point, but never drop below zero.
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      # Positive diff means the error count went down, negative that it grew.
      [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
      progress "Dovecot" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    done
    return 1
  }
  # Probe PHP-FPM in a loop until the accumulated error count reaches
  # THRESHOLD. Probe exit statuses are added to err_count (the FastCGI and the
  # settings.php probes count double); a round without new errors takes one
  # point off again (never below zero). After each round the health level is
  # reported through progress().
  # Globals: err_count, err_c_cur, diff_c, THRESHOLD (written). They are kept
  #          global on purpose: the USR1 trap stays installed after this
  #          function returns and must still be able to see err_count.
  # Returns: 1 once err_count is no longer below THRESHOLD (never 0).
  phpfpm_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=12
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes are essential: the handler must read err_count when the
    # signal arrives, not the value it had when the trap was installed.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      # $? is grep's status here: non-zero when the reply does not mention PHP.
      cgi-fcgi -bind -connect php-fpm-mailcow:9000 | grep PHP 1>&2; err_count=$(( err_count + ($? * 2) ))
      /usr/lib/nagios/plugins/check_ping -H php-fpm-mailcow -w 2000,10% -c 4000,100% -p2 1>&2; err_count=$(( err_count + $? ))
      /usr/lib/nagios/plugins/check_http -H nginx-mailcow -u /settings.php -p 8081 -r "settings \{" 1>&2; err_count=$(( err_count + ($? * 2) ))
      # Random 10-39 s pause between rounds.
      sleep $(( ( RANDOM % 30 ) + 10 ))
      # Clean round: recover one point, but never drop below zero.
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      # Positive diff means the error count went down, negative that it grew.
      [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
      progress "PHP-FPM" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    done
    return 1
  }
  # Probe DNS resolution (default resolver, the unbound-mailcow A and AAAA
  # addresses, and a DNSSEC query) in a loop until the accumulated error count
  # reaches THRESHOLD. Every probe's exit status counts double; a round without
  # new errors takes one point off again (never below zero). After each round
  # the health level is reported through progress().
  # Globals: err_count, err_c_cur, diff_c, THRESHOLD (written). They are kept
  #          global on purpose: the USR1 trap stays installed after this
  #          function returns and must still be able to see err_count.
  # Returns: 1 once err_count is no longer below THRESHOLD (never 0).
  dns_checks() {
    err_count=0
    diff_c=0
    THRESHOLD=28
    # Reduce error count by 2 after restarting an unhealthy container.
    # Single quotes are essential: the handler must read err_count when the
    # signal arrives, not the value it had when the trap was installed.
    trap '[ "${err_count}" -gt 1 ] && err_count=$(( err_count - 2 ))' USR1
    while [ "${err_count}" -lt "${THRESHOLD}" ]; do
      err_c_cur=${err_count}
      /usr/lib/nagios/plugins/check_dns -H google.com 1>&2; err_count=$(( err_count + ($? * 2) ))
      # NOTE(review): the dig substitutions are left unquoted as before; an
      # empty answer makes -s swallow the following option - confirm that
      # counting this as a failed probe is the intended outcome.
      /usr/lib/nagios/plugins/check_dns -s $(dig unbound-mailcow +short A) -H google.com 1>&2; err_count=$(( err_count + ($? * 2) ))
      /usr/lib/nagios/plugins/check_dns -s $(dig unbound-mailcow +short AAAA) -H google.com 1>&2; err_count=$(( err_count + ($? * 2) ))
      # $? is grep's status: non-zero when the flags line carries no "ad".
      dig +dnssec org. @172.22.1.254 | grep -E 'flags:.+ad' 1>&2; err_count=$(( err_count + ($? * 2) ))
      # Random 10-39 s pause between rounds.
      sleep $(( ( RANDOM % 30 ) + 10 ))
      # Clean round: recover one point, but never drop below zero.
      if [ "${err_c_cur}" -eq "${err_count}" ] && [ "${err_count}" -gt 0 ]; then
        err_count=$(( err_count - 1 ))
        diff_c=1
      fi
      # Positive diff means the error count went down, negative that it grew.
      [ "${err_c_cur}" -ne "${err_count}" ] && diff_c=$(( err_c_cur - err_count ))
      progress "Unbound" "${THRESHOLD}" $(( THRESHOLD - err_count )) "${diff_c}"
    done
    return 1
  }
  # Create watchdog agents
  # Body of one agent: run a check function over and over; each time it gives
  # up (non-zero return), log the event and write the container name into the
  # pipe read by the restart loop.
  # Arguments:
  #   $1 - name of the check function to run
  #   $2 - service label used in the log line
  #   $3 - container name written to /tmp/com_pipe
  watch_service() {
    local check_fn=${1}
    local label=${2}
    local container=${3}
    while true; do
      if ! "${check_fn}"; then
        echo -e "\e[31m$(date) - ${label} hit error limit\e[0m"
        echo "${container}" > /tmp/com_pipe
      fi
    done
  }

  # One background agent per service; every PID is recorded so the agents can
  # be monitored and signalled later on.
  watch_service nginx_checks "Nginx" nginx-mailcow &
  BACKGROUND_TASKS+=($!)
  watch_service mysql_checks "MySQL" mysql-mailcow &
  BACKGROUND_TASKS+=($!)
  watch_service phpfpm_checks "PHP-FPM" php-fpm-mailcow &
  BACKGROUND_TASKS+=($!)
  watch_service sogo_checks "SOGo" sogo-mailcow &
  BACKGROUND_TASKS+=($!)
  watch_service postfix_checks "Postfix" postfix-mailcow &
  BACKGROUND_TASKS+=($!)
  watch_service dovecot_checks "Dovecot" dovecot-mailcow &
  BACKGROUND_TASKS+=($!)
  watch_service dns_checks "Unbound" unbound-mailcow &
  BACKGROUND_TASKS+=($!)
  # Monitor watchdog agents, stop the script when an agent fails and wait for
  # respawn by Docker (restart:always:n)
  (
    while true; do
      for bg_task in "${BACKGROUND_TASKS[@]}"; do
        # kill -0 sends no signal, it only tests whether the PID still exists.
        # Any output goes to stderr, like the other checks in this file
        # (the former "21>&2" was a typo that redirected an unused descriptor).
        if ! kill -0 "${bg_task}" 1>&2; then
          echo "Worker ${bg_task} died, stopping watchdog and waiting for respawn..."
          kill -TERM "${PARENT_PID}"
        fi
        # One agent is examined per second.
        sleep 1
      done
    done
  ) &
  # Restart container when threshold limit reached
  # Each pass blocks until an agent writes a container name into the pipe,
  # pauses all agents, restarts the matching container(s) through the Docker
  # socket, lets things settle and then resumes the agents.
  while true; do
    CONTAINER_ID=
    # -r keeps backslashes in the received line literal.
    read -r com_pipe_answer </tmp/com_pipe
    if [[ ${com_pipe_answer} =~ .+-mailcow ]]; then
      # Freeze the agents so they do not count errors caused by the restart.
      kill -STOP "${BACKGROUND_TASKS[@]}"
      sleep 3
      # The URL is quoted because "?" is a glob character; the name is handed
      # to jq as data (--arg) instead of being spliced into the jq program.
      CONTAINER_ID=$(curl --silent --unix-socket /var/run/docker.sock 'http/containers/json?all=1' \
        | jq -rc --arg name "${com_pipe_answer}" 'map(select(.Names[] | contains($name))) | .[] .Id')
      if [[ -n ${CONTAINER_ID} ]]; then
        # jq prints one id per line; restart each match on its own so several
        # ids cannot be word-split into a broken request.
        while IFS= read -r container_id; do
          echo "Sending restart command to ${container_id}..."
          curl --silent --unix-socket /var/run/docker.sock -XPOST "http/containers/${container_id}/restart"
        done <<< "${CONTAINER_ID}"
      fi
      echo "Wait for restarted container to settle and continue watching..."
      sleep 30s
      kill -CONT "${BACKGROUND_TASKS[@]}"
      # USR1 triggers the agents' trap that takes 2 off their error count.
      kill -USR1 "${BACKGROUND_TASKS[@]}"
    fi
  done