Browse Source

Dead man's switch via healthchecks.io integration (#223) + new monitoring documentation.

Dan Helfman 6 years ago
parent
commit
128ebf04ce

+ 8 - 0
NEWS

@@ -1,3 +1,11 @@
+1.3.25
+ * #223: Dead man's switch to detect when backups start failing silently, implemented via
+   healthchecks.io hook integration. See the documentation for more information:
+   https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook
+ * Documentation on monitoring and alerting options for borgmatic backups:
+   https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/
+ * Automatically rewrite links when developing on documentation locally.
+
 1.3.24
  * #86: Add "borgmatic list --successful" flag to only list successful (non-checkpoint) archives.
  * Add a suggestion form to all documentation pages, so users can submit ideas for improving the

+ 1 - 5
README.md

@@ -63,6 +63,7 @@ href="https://asciinema.org/a/203761" target="_blank">screencast</a>.
  * [Make per-application backups](https://torsion.org/borgmatic/docs/how-to/make-per-application-backups/)
  * [Deal with very large backups](https://torsion.org/borgmatic/docs/how-to/deal-with-very-large-backups/)
  * [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups/)
+ * [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/)
  * [Restore a backup](https://torsion.org/borgmatic/docs/how-to/restore-a-backup/)
  * [Add preparation and cleanup steps to backups](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups/)
  * [Upgrade borgmatic](https://torsion.org/borgmatic/docs/how-to/upgrade/)
@@ -116,8 +117,3 @@ your thing. In general, contributions are very welcome. We don't bite!
 Also, please check out the [borgmatic development
 how-to](https://torsion.org/borgmatic/docs/how-to/develop-on-borgmatic/) for
 info on cloning source code, running tests, etc.
-
-<script>
-  var links = document.getElementsByClassName("referral");
-  links[Math.floor(Math.random() * links.length)].style.display = "none";
-</script>

+ 9 - 0
borgmatic/commands/borgmatic.py

@@ -60,6 +60,9 @@ def run_configuration(config_filename, config, arguments):
                 'pre-backup',
                 global_arguments.dry_run,
             )
+            hook.ping_healthchecks(
+                hooks.get('healthchecks'), config_filename, global_arguments.dry_run, 'start'
+            )
         except (OSError, CalledProcessError) as error:
             encountered_error = error
             yield from make_error_log_records(
@@ -95,6 +98,9 @@ def run_configuration(config_filename, config, arguments):
                 'post-backup',
                 global_arguments.dry_run,
             )
+            hook.ping_healthchecks(
+                hooks.get('healthchecks'), config_filename, global_arguments.dry_run
+            )
         except (OSError, CalledProcessError) as error:
             encountered_error = error
             yield from make_error_log_records(
@@ -113,6 +119,9 @@ def run_configuration(config_filename, config, arguments):
                 error=encountered_error,
                 output=getattr(encountered_error, 'output', ''),
             )
+            hook.ping_healthchecks(
+                hooks.get('healthchecks'), config_filename, global_arguments.dry_run, 'fail'
+            )
         except (OSError, CalledProcessError) as error:
             yield from make_error_log_records(
                 '{}: Error running on-error hook'.format(config_filename), error

+ 11 - 4
borgmatic/config/schema.yaml

@@ -337,8 +337,8 @@ map:
                 example: false
     hooks:
         desc: |
-            Shell commands or scripts to execute at various points during a borgmatic run.
-            IMPORTANT: All provided commands and scripts are executed with user permissions of
+            Shell commands, scripts, or integrations to execute at various points during a borgmatic
+            run. IMPORTANT: All provided commands and scripts are executed with user permissions of
             borgmatic. Do not forget to set secure permissions on this configuration file (chmod
             0600) as well as on any script called from a hook (chmod 0700) to prevent potential
             shell injection or privilege escalation.
@@ -363,10 +363,17 @@ map:
                 seq:
                     - type: str
                 desc: |
-                    List of one or more shell commands or scripts to execute when an exception occurs
-                    during a backup or when running a before_backup or after_backup hook.
+                    List of one or more shell commands or scripts to execute when an exception
+                    occurs during a backup or when running a before_backup or after_backup hook.
                 example:
                     - echo "Error while creating a backup or running a backup hook."
+            healthchecks:
+                type: str
+                desc: |
+                    Healthchecks ping URL or UUID to notify when a backup begins, ends, or errors.
+                    Create an account at https://healthchecks.io if you'd like to use this service.
+                example:
+                    https://hc-ping.com/your-uuid-here
             before_everything:
                 seq:
                     - type: str

+ 33 - 0
borgmatic/hook.py

@@ -1,6 +1,8 @@
 import logging
 import os
 
+import requests
+
 from borgmatic import execute
 
 logger = logging.getLogger(__name__)
@@ -69,3 +71,34 @@ def execute_hook(commands, umask, config_filename, description, dry_run, **conte
     finally:
         if original_umask:
             os.umask(original_umask)
+
+
+def ping_healthchecks(ping_url_or_uuid, config_filename, dry_run, append=None):
+    '''
+    Ping the given healthchecks.io URL or UUID, appending the append string if any. Use the given
+    configuration filename in any log entries. If this is a dry run, then log what would have been
+    pinged, but don't actually ping anything.
+
+    A value starting with "http" is used verbatim as the ping URL. Any other value is treated as a
+    check UUID and expanded to https://hc-ping.com/<uuid>. If no URL or UUID is given at all, log
+    that fact at debug level and return without doing anything else.
+    '''
+    if not ping_url_or_uuid:
+        logger.debug('{}: No healthchecks hook set'.format(config_filename))
+        return
+
+    ping_url = (
+        ping_url_or_uuid
+        if ping_url_or_uuid.startswith('http')
+        else 'https://hc-ping.com/{}'.format(ping_url_or_uuid)
+    )
+    dry_run_label = ' (dry run; not actually pinging)' if dry_run else ''
+
+    if append:
+        ping_url = '{}/{}'.format(ping_url, append)
+
+    logger.info(
+        '{}: Pinging healthchecks.io{}{}'.format(
+            config_filename, ' ' + append if append else '', dry_run_label
+        )
+    )
+    logger.debug('{}: Using healthchecks.io ping URL {}'.format(config_filename, ping_url))
+
+    # Honor the dry run: everything above only logs, so stop before touching the network.
+    if dry_run:
+        return
+
+    # Only let the urllib3 logger emit errors, keeping its lower-level records out of the output
+    # while the ping request is made.
+    logging.getLogger('urllib3').setLevel(logging.ERROR)
+    requests.get(ping_url)

+ 3 - 2
docs/how-to/add-preparation-and-cleanup-steps-to-backups.md

@@ -48,8 +48,8 @@ a backup or a backup hook, but not if an error occurs during a
 `before_everything` hook.
 
 borgmatic also runs `on_error` hooks if an error occurs, either when creating
-a backup or running a backup hook. See the [error alerting
-documentation](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md)
+a backup or running a backup hook. See the [monitoring and alerting
+documentation](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md)
 for more information.
 
 ## Hook output
@@ -73,3 +73,4 @@ invoked by hooks.
  * [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md)
  * [Make per-application backups](https://torsion.org/borgmatic/docs/how-to/make-per-application-backups.md)
  * [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md)
+ * [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md)

+ 2 - 70
docs/how-to/inspect-your-backups.md

@@ -22,7 +22,7 @@ borgmatic --verbosity 2
 
 ## Backup summary
 
-If you're less concerned with progress during a backup, and you just want to
+If you're less concerned with progress during a backup, and you only want to
 see the summary of archive statistics at the end, you can use the stats
 option when performing a backup:
 
@@ -83,78 +83,10 @@ Note that the [sample borgmatic systemd service
 file](https://torsion.org/borgmatic/docs/how-to/set-up-backups/#systemd)
 already has this rate limit disabled.
 
-## Error alerting
-
-When an error occurs during a backup, borgmatic can run configurable shell
-commands to fire off custom error notifications or take other actions, so you
-can get alerted as soon as something goes wrong. Here's a not-so-useful
-example:
-
-```yaml
-hooks:
-    on_error:
-        - echo "Error while creating a backup or running a backup hook."
-```
-
-The `on_error` hook supports interpolating particular runtime variables into
-the hook command. Here's an example that assumes you provide a separate shell
-script to handle the alerting:
-
-```yaml
-hooks:
-    on_error:
-        - send-text-message.sh "{configuration_filename}" "{repository}"
-```
-
-In this example, when the error occurs, borgmatic interpolates a few runtime
-values into the hook command: the borgmatic configuration filename, and the
-path of the repository. Here's the full set of supported variables you can use
-here:
-
- * `configuration_filename`: borgmatic configuration filename in which the
-   error occurred
- * `repository`: path of the repository in which the error occurred (may be
-   blank if the error occurs in a hook)
- * `error`: the error message itself
- * `output`: output of the command that failed (may be blank if an error
-   occurred without running a command)
-
-Note that borgmatic does not run `on_error` hooks if an error occurs within a
-`before_everything` or `after_everything` hook. For more about hooks, see the
-[borgmatic hooks
-documentation](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md),
-especially the security information.
-
-
-## Scripting borgmatic
-
-To consume the output of borgmatic in other software, you can include an
-optional `--json` flag with `create`, `list`, or `info` to get the output
-formatted as JSON.
-
-Note that when you specify the `--json` flag, Borg's other non-JSON output is
-suppressed so as not to interfere with the captured JSON. Also note that JSON
-output only shows up at the console, and not in syslog.
-
-### Successful backups
-
-`borgmatic list` includes support for a `--successful` flag that only lists
-successful (non-checkpoint) backups. Combined with a built-in Borg flag like
-`--last`, you can list the last successful backup for use in your monitoring
-scripts. Here's an example combined with `--json`:
-
-```bash
-borgmatic list --successful --last 1 --json
-```
-
-Note that this particular combination will only work if you've got a single
-backup "series" in your repository. If you're instead backing up, say, from
-multiple different hosts into a single repository, then you'll need to get
-fancier with your archive listing. See `borg list --help` for more flags.
-
 
 ## Related documentation
 
  * [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md)
+ * [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md)
  * [Add preparation and cleanup steps to backups](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md)
  * [Develop on borgmatic](https://torsion.org/borgmatic/docs/how-to/develop-on-borgmatic.md)

+ 152 - 0
docs/how-to/monitor-your-backups.md

@@ -0,0 +1,152 @@
+---
+title: How to monitor your backups
+---
+
+## Monitoring and alerting
+
+Having backups is great, but they won't do you a lot of good unless you have
+confidence that they're running on a regular basis. That's where monitoring
+and alerting comes in.
+
+There are several different ways you can monitor your backups and find out
+whether they're succeeding. Which of these you choose to do is up to you and
+your particular infrastructure:
+
+1. **Job runner alerts**: The easiest place to start is with failure alerts
+from the [scheduled job
+runner](https://torsion.org/borgmatic/docs/how-to/set-up-backups/#autopilot) (cron,
+systemd, etc.) that's running borgmatic. But note that if the job doesn't even
+get scheduled (e.g. due to the job runner not running), you probably won't get
+an alert at all! Still, this is a decent first line of defense, especially
+when combined with some of the other approaches below.
+2. **borgmatic error hooks**: The `on_error` hook allows you to run an arbitrary
+command or script when borgmatic itself encounters an error running your
+backups. So for instance, you can run a script to send yourself a text message
+alert. But note that if borgmatic doesn't actually run, this alert won't fire.
+See [error
+hooks](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#error-hooks)
+below for how to configure this.
+3. **borgmatic Healthchecks hook**: This feature integrates with the
+[Healthchecks](https://healthchecks.io/) service, and pings Healthchecks
+whenever borgmatic runs. That way, Healthchecks can alert you when something
+goes wrong or it doesn't hear from borgmatic for a configured interval. (See
+[Healthchecks
+hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook)
+below for how to configure this.)
+4. **Third-party monitoring software**: You can use traditional monitoring
+software to consume borgmatic JSON output and track when the last
+successful backup occurred. See [scripting
+borgmatic](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#scripting-borgmatic)
+below for how to configure this.
+5. **Borg hosting providers**: Most [Borg hosting
+providers](https://torsion.org/borgmatic/#hosting-providers) include
+monitoring and alerting as part of their offering. This gives you a dashboard
+to check on all of your backups, and can alert you if the service doesn't hear
+from borgmatic for a configured interval.
+6. **borgmatic consistency checks**: While not strictly part of monitoring, if you
+really want confidence that your backups are not only running but are
+restorable as well, you can configure particular [consistency
+checks](https://torsion.org/borgmatic/docs/how-to/deal-with-very-large-backups/#consistency-check-configuration)
+or even script full [restore
+tests](https://torsion.org/borgmatic/docs/how-to/restore-a-backup/).
+
+
+## Error hooks
+
+When an error occurs during a backup, borgmatic can run configurable shell
+commands to fire off custom error notifications or take other actions, so you
+can get alerted as soon as something goes wrong. Here's a not-so-useful
+example:
+
+```yaml
+hooks:
+    on_error:
+        - echo "Error while creating a backup or running a backup hook."
+```
+
+The `on_error` hook supports interpolating particular runtime variables into
+the hook command. Here's an example that assumes you provide a separate shell
+script to handle the alerting:
+
+```yaml
+hooks:
+    on_error:
+        - send-text-message.sh "{configuration_filename}" "{repository}"
+```
+
+In this example, when the error occurs, borgmatic interpolates a few runtime
+values into the hook command: the borgmatic configuration filename, and the
+path of the repository. Here's the full set of supported variables you can use
+here:
+
+ * `configuration_filename`: borgmatic configuration filename in which the
+   error occurred
+ * `repository`: path of the repository in which the error occurred (may be
+   blank if the error occurs in a hook)
+ * `error`: the error message itself
+ * `output`: output of the command that failed (may be blank if an error
+   occurred without running a command)
+
+Note that borgmatic does not run `on_error` hooks if an error occurs within a
+`before_everything` or `after_everything` hook. For more about hooks, see the
+[borgmatic hooks
+documentation](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md),
+especially the security information.
+
+
+## Healthchecks hook
+
+[Healthchecks](https://healthchecks.io/) is a service that provides "instant
+alerts when your cron jobs fail silently", and borgmatic has built-in
+integration with it. Once you create a Healthchecks account and project on
+their site, all you need to do is configure borgmatic with the unique "Ping
+URL" for your project. Here's an example:
+
+
+```yaml
+hooks:
+    healthchecks: https://hc-ping.com/addffa72-da17-40ae-be9c-ff591afb942a
+```
+
+With this hook in place, borgmatic will ping your Healthchecks project when a
+backup begins, ends, or errors. Then you can configure Healthchecks to notify
+you by a [variety of
+mechanisms](https://healthchecks.io/#welcome-integrations) when backups fail
+or it doesn't hear from borgmatic for a certain period of time.
+
+
+## Scripting borgmatic
+
+To consume the output of borgmatic in other software, you can include an
+optional `--json` flag with `create`, `list`, or `info` to get the output
+formatted as JSON.
+
+Note that when you specify the `--json` flag, Borg's other non-JSON output is
+suppressed so as not to interfere with the captured JSON. Also note that JSON
+output only shows up at the console, and not in syslog.
+
+
+### Successful backups
+
+`borgmatic list` includes support for a `--successful` flag that only lists
+successful (non-checkpoint) backups. Combined with a built-in Borg flag like
+`--last`, you can list the last successful backup for use in your monitoring
+scripts. Here's an example combined with `--json`:
+
+```bash
+borgmatic list --successful --last 1 --json
+```
+
+Note that this particular combination will only work if you've got a single
+backup "series" in your repository. If you're instead backing up, say, from
+multiple different hosts into a single repository, then you'll need to get
+fancier with your archive listing. See `borg list --help` for more flags.
+
+
+## Related documentation
+
+ * [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md)
+ * [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md)
+ * [Add preparation and cleanup steps to backups](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md)
+ * [Restore a backup](https://torsion.org/borgmatic/docs/how-to/restore-a-backup.md)
+ * [Develop on borgmatic](https://torsion.org/borgmatic/docs/how-to/develop-on-borgmatic.md)

+ 1 - 0
docs/how-to/restore-a-backup.md

@@ -65,3 +65,4 @@ Like a whole-archive restore, this also restores into the current directory.
 
  * [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md)
  * [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md)
+ * [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md)

+ 2 - 6
docs/how-to/set-up-backups.md

@@ -228,7 +228,7 @@ found character that cannot start any token
   in "config.yaml", line 230, column 1
 ```
 
-YAML does not allow tabs. So to fix this, simply replace any tabs in your
+YAML does not allow tabs. So to fix this, replace any tabs in your
 configuration file with the requisite number of spaces.
 
 ### libyaml compilation errors
@@ -247,10 +247,6 @@ it.
  * [Make per-application backups](https://torsion.org/borgmatic/docs/how-to/make-per-application-backups.md)
  * [Deal with very large backups](https://torsion.org/borgmatic/docs/how-to/deal-with-very-large-backups.md)
  * [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md)
+ * [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md)
  * [borgmatic configuration reference](https://torsion.org/borgmatic/docs/reference/configuration.md)
  * [borgmatic command-line reference](https://torsion.org/borgmatic/docs/reference/command-line.md)
-
-<script>
-  var links = document.getElementsByClassName("referral");
-  links[Math.floor(Math.random() * links.length)].style.display = "none";
-</script>

+ 2 - 1
setup.py

@@ -1,6 +1,6 @@
 from setuptools import find_packages, setup
 
-VERSION = '1.3.24'
+VERSION = '1.3.25'
 
 
 setup(
@@ -31,6 +31,7 @@ setup(
     obsoletes=['atticmatic'],
     install_requires=(
         'pykwalify>=1.6.0,<14.06',
+        'requests',
         'ruamel.yaml>0.15.0,<0.17.0',
         'setuptools',
         'colorama>=0.4.1,<0.5',

+ 1 - 0
test_requirements.txt

@@ -20,5 +20,6 @@ pytest==5.1.2
 pytest-cov==2.7.1
 python-dateutil==2.8.0
 PyYAML==5.1.2
+requests==2.22.0
 ruamel.yaml>0.15.0,<0.17.0
 toml==0.10.0

+ 30 - 0
tests/unit/test_hook.py

@@ -79,3 +79,33 @@ def test_execute_hook_on_error_logs_as_error():
     ).once()
 
     module.execute_hook([':'], None, 'config.yaml', 'on-error', dry_run=False)
+
+
+def test_ping_healthchecks_hits_ping_url():
+    ping_url = 'https://example.com'
+    # Require exactly one GET of the configured URL, so the test fails if nothing is pinged.
+    flexmock(module.requests).should_receive('get').with_args(ping_url).once()
+
+    module.ping_healthchecks(ping_url, 'config.yaml', dry_run=False)
+
+
+def test_ping_healthchecks_without_ping_url_does_not_raise():
+    # With no ping URL or UUID configured, no HTTP request may be made at all.
+    mocked_requests = flexmock(module.requests)
+    mocked_requests.should_receive('get').never()
+
+    module.ping_healthchecks(None, 'config.yaml', dry_run=False)
+
+
+def test_ping_healthchecks_with_ping_uuid_hits_corresponding_url():
+    ping_uuid = 'abcd-efgh-ijkl-mnop'
+    # A bare UUID must be expanded to the hc-ping.com URL and requested exactly once.
+    flexmock(module.requests).should_receive('get').with_args(
+        'https://hc-ping.com/{}'.format(ping_uuid)
+    ).once()
+
+    module.ping_healthchecks(ping_uuid, 'config.yaml', dry_run=False)
+
+
+def test_ping_healthchecks_hits_ping_url_with_append():
+    ping_url = 'https://example.com'
+    append = 'failed-so-hard'
+    # The append string must be joined onto the URL with a slash and requested exactly once.
+    flexmock(module.requests).should_receive('get').with_args(
+        '{}/{}'.format(ping_url, append)
+    ).once()
+
+    module.ping_healthchecks(ping_url, 'config.yaml', dry_run=False, append=append)