소스 검색

Monitor backups with PagerDuty hook integration (#245).

Dan Helfman 5 년 전
부모
커밋
bc02c123e6

+ 2 - 0
NEWS

@@ -1,4 +1,6 @@
 1.5.0
+ * #245: Monitor backups with PagerDuty hook integration. See the documentation for more
+   information: https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook
  * #255: Add per-action hooks: "before_prune", "after_prune", "before_check", and "after_check".
  * #274: Add ~/.config/borgmatic.d as another configuration directory default.
  * #277: Customize Healthchecks log level via borgmatic "--monitoring-verbosity" flag.

+ 1 - 0
README.md

@@ -66,6 +66,7 @@ borgmatic is powered by [Borg Backup](https://www.borgbackup.org/).
 <a href="https://healthchecks.io/"><img src="docs/static/healthchecks.png" alt="Healthchecks" height="60px" style="margin-bottom:20px;"></a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 <a href="https://cronitor.io/"><img src="docs/static/cronitor.png" alt="Cronitor" height="60px" style="margin-bottom:20px;"></a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 <a href="https://cronhub.io/"><img src="docs/static/cronhub.png" alt="Cronhub" height="60px" style="margin-bottom:20px;"></a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+<a href="https://www.pagerduty.com/"><img src="docs/static/pagerduty.png" alt="PagerDuty" height="60px" style="margin-bottom:20px;"></a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 <a href="https://www.rsync.net/cgi-bin/borg.cgi?campaign=borg&adgroup=borgmatic"><img src="docs/static/rsyncnet.png" alt="rsync.net" height="60px" style="margin-bottom:20px;"></a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 <a href="https://www.borgbase.com/?utm_source=borgmatic"><img src="docs/static/borgbase.png" alt="BorgBase" height="60px" style="margin-bottom:20px;"></a>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 

+ 9 - 0
borgmatic/config/schema.yaml

@@ -567,6 +567,15 @@ map:
                     for details.
                 example:
                     https://cronitor.link/d3x0c1
+            pagerduty:
+                type: str
+                desc: |
+                    PagerDuty integration key used to notify PagerDuty when a backup errors. Create
+                    an account at https://www.pagerduty.com/ if you'd like to use this service. See
+                    https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook
+                    for details.
+                example:
+                    a177cad45bd374409f78906a810a3074
             cronhub:
                 type: str
                 desc: |

+ 2 - 1
borgmatic/hooks/dispatch.py

@@ -1,6 +1,6 @@
 import logging
 
-from borgmatic.hooks import cronhub, cronitor, healthchecks, mysql, postgresql
+from borgmatic.hooks import cronhub, cronitor, healthchecks, mysql, pagerduty, postgresql
 
 logger = logging.getLogger(__name__)
 
@@ -8,6 +8,7 @@ HOOK_NAME_TO_MODULE = {
     'healthchecks': healthchecks,
     'cronitor': cronitor,
     'cronhub': cronhub,
+    'pagerduty': pagerduty,
     'postgresql_databases': postgresql,
     'mysql_databases': mysql,
 }

+ 1 - 1
borgmatic/hooks/monitor.py

@@ -1,6 +1,6 @@
 from enum import Enum
 
-MONITOR_HOOK_NAMES = ('healthchecks', 'cronitor', 'cronhub')
+MONITOR_HOOK_NAMES = ('healthchecks', 'cronitor', 'cronhub', 'pagerduty')
 
 
 class State(Enum):

+ 62 - 0
borgmatic/hooks/pagerduty.py

@@ -0,0 +1,62 @@
+import datetime
+import json
+import logging
+import platform
+
+import requests
+
+from borgmatic.hooks import monitor
+
+logger = logging.getLogger(__name__)
+
+EVENTS_API_URL = 'https://events.pagerduty.com/v2/enqueue'
+
+
+def ping_monitor(integration_key, config_filename, state, monitoring_log_level, dry_run):
+    '''
+    If this is an error state, create a PagerDuty event with the given integration key. Use the
+    given configuration filename in any log entries. If this is a dry run, then don't actually
+    create an event.
+    '''
+    if state != monitor.State.FAIL:
+        logger.debug(
+            '{}: Ignoring unsupported monitoring {} in PagerDuty hook'.format(
+                config_filename, state.name.lower()
+            )
+        )
+        return
+
+    dry_run_label = ' (dry run; not actually sending)' if dry_run else ''
+    logger.info('{}: Sending failure event to PagerDuty {}'.format(config_filename, dry_run_label))
+
+    if dry_run:
+        return
+
+    hostname = platform.node()
+    local_timestamp = (
+        datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).astimezone().isoformat()
+    )
+    payload = json.dumps(
+        {
+            'routing_key': integration_key,
+            'event_action': 'trigger',
+            'payload': {
+                'summary': 'backup failed on {}'.format(hostname),
+                'severity': 'error',
+                'source': hostname,
+                'timestamp': local_timestamp,
+                'component': 'borgmatic',
+                'group': 'backups',
+                'class': 'backup failure',
+                'custom_details': {
+                    'hostname': hostname,
+                    'configuration filename': config_filename,
+                    'server time': local_timestamp,
+                },
+            },
+        }
+    )
+    logger.debug('{}: Using PagerDuty payload: {}'.format(config_filename, payload))
+
+    logging.getLogger('urllib3').setLevel(logging.ERROR)
+    requests.post(EVENTS_API_URL, data=payload.encode('utf-8'))

+ 34 - 7
docs/how-to/monitor-your-backups.md

@@ -28,14 +28,15 @@ hooks](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#error-hoo
 below for how to configure this.
 4. **borgmatic monitoring hooks**: This feature integrates with monitoring
    services like [Healthchecks](https://healthchecks.io/),
-[Cronitor](https://cronitor.io), and [Cronhub](https://cronhub.io), and pings
-these services whenever borgmatic runs. That way, you'll receive an alert when
-something goes wrong or the service doesn't hear from borgmatic for a
-configured interval. See
-[Healthchecks
+[Cronitor](https://cronitor.io), [Cronhub](https://cronhub.io), and
+[PagerDuty](https://www.pagerduty.com/), and pings these services whenever
+borgmatic runs. That way, you'll receive an alert when something goes wrong or
+(for certain hooks) the service doesn't hear from borgmatic for a configured
+interval. See [Healthchecks
 hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook), [Cronitor
-hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronitor-hook), and [Cronhub
-hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronhub-hook)
+hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronitor-hook), [Cronhub
+hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#cronhub-hook), and
+[PagerDuty hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#pagerduty-hook)
 below for how to configure this.
 3. **Third-party monitoring software**: You can use traditional monitoring
 software to consume borgmatic JSON output and track when the last
@@ -200,6 +201,32 @@ mechanisms](https://docs.cronhub.io/integrations.html) when backups fail
 or it doesn't hear from borgmatic for a certain period of time.
 
 
+## PagerDuty hook
+
+[PagerDuty](https://www.pagerduty.com/) provides incident monitoring and alerting,
+and borgmatic has built-in integration with it. Once you create a PagerDuty
+account and <a
+href="https://support.pagerduty.com/docs/services-and-integrations">service</a>
+on their site, all you need to do is configure borgmatic with the unique
+"Integration Key" for your service. Here's an example:
+
+
+```yaml
+hooks:
+    pagerduty: a177cad45bd374409f78906a810a3074
+```
+
+With this hook in place, borgmatic creates a PagerDuty event for your service
+whenever backups fail. Specifically, if an error occurs during a `create`,
+`prune`, or `check` action, borgmatic sends an event to PagerDuty after the
+`on_error` hooks run. Note that borgmatic does not contact PagerDuty when a
+backup starts or ends without error.
+
+You can configure PagerDuty to notify you by a [variety of
+mechanisms](https://support.pagerduty.com/docs/notifications) when backups
+fail.
+
+
 ## Scripting borgmatic
 
 To consume the output of borgmatic in other software, you can include an

BIN
docs/static/pagerduty.png


+ 35 - 0
tests/unit/hooks/test_pagerduty.py

@@ -0,0 +1,35 @@
+from flexmock import flexmock
+
+from borgmatic.hooks import pagerduty as module
+
+
+def test_ping_monitor_ignores_start_state():
+    flexmock(module.requests).should_receive('post').never()
+
+    module.ping_monitor(
+        'abc123', 'config.yaml', module.monitor.State.START, monitoring_log_level=1, dry_run=False
+    )
+
+
+def test_ping_monitor_ignores_finish_state():
+    flexmock(module.requests).should_receive('post').never()
+
+    module.ping_monitor(
+        'abc123', 'config.yaml', module.monitor.State.FINISH, monitoring_log_level=1, dry_run=False
+    )
+
+
+def test_ping_monitor_calls_api_for_fail_state():
+    flexmock(module.requests).should_receive('post')
+
+    module.ping_monitor(
+        'abc123', 'config.yaml', module.monitor.State.FAIL, monitoring_log_level=1, dry_run=False
+    )
+
+
+def test_ping_monitor_dry_run_does_not_call_api():
+    flexmock(module.requests).should_receive('post').never()
+
+    module.ping_monitor(
+        'abc123', 'config.yaml', module.monitor.State.FAIL, monitoring_log_level=1, dry_run=True
+    )