Add alerts for Alertmanager's health

These rules will be evaluated by a cluster-level Prometheus that monitors the Alertmanager itself.

Signed-off-by: Douglas Camata <[email protected]>
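For context, a PrometheusRule labeled this way is typically picked up by that cluster-level Prometheus through the Prometheus Operator's ruleSelector. A minimal sketch, assuming the operator is in use and the instance is named app-sre (both assumptions, not part of this commit):

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: app-sre                # hypothetical name for the cluster-level Prometheus
spec:
  ruleSelector:                # loads PrometheusRule objects carrying these labels,
    matchLabels:               # matching metadata.labels in the file added below
      prometheus: app-sre
      role: alert-rules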
Commit 017ef8b (1 parent: c8bb4a4), showing 3 changed files with 326 additions and 0 deletions.
.../observability/prometheusrules/observatorium-alertmanager-production.prometheusrules.yaml (159 additions, 0 deletions)
@@ -0,0 +1,159 @@
---
$schema: /openshift/prometheus-rule-1.yml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: app-sre
    role: alert-rules
  name: observatorium-alertmanager-production
spec:
  groups:
  - name: alertmanager.rules
    rules:
    - alert: AlertmanagerFailedReload
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/alertmanager.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: Configuration has failed to load for {{$labels.pod}}.
        message: Configuration has failed to load for {{$labels.pod}}.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#alertmanagerfailedreload
        summary: Reloading an Alertmanager configuration has failed.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(alertmanager_config_last_reload_successful{job="alertmanager"}[5m]) == 0
      for: 10m
      labels:
        service: telemeter
        severity: critical
    - alert: AlertmanagerMembersInconsistent
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/alertmanager.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: Alertmanager {{$labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
        message: Alertmanager {{$labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#alertmanagermembersinconsistent
        summary: A member of an Alertmanager cluster has not found all other cluster members.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m])
          < on (namespace,job) group_left
        count by (namespace,job) (max_over_time(alertmanager_cluster_members{job="alertmanager"}[5m]))
      for: 15m
      labels:
        service: telemeter
        severity: critical
    - alert: AlertmanagerFailedToSendAlerts
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/alertmanager.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: Alertmanager {{$labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
        message: Alertmanager {{$labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#alertmanagerfailedtosendalerts
        summary: An Alertmanager instance failed to send notifications.
      expr: |
        (
          rate(alertmanager_notifications_failed_total{job="alertmanager"}[5m])
        /
          rate(alertmanager_notifications_total{job="alertmanager"}[5m])
        )
        > 0.01
      for: 5m
      labels:
        service: telemeter
        severity: medium
    - alert: AlertmanagerClusterFailedToSendAlerts
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/alertmanager.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
        message: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#alertmanagerclusterfailedtosendalerts
        summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration.
      expr: |
        min by (namespace,job, integration) (
          rate(alertmanager_notifications_failed_total{job="alertmanager", integration=~`slack|pagerduty|email|webhook`}[5m])
        /
          rate(alertmanager_notifications_total{job="alertmanager", integration=~`slack|pagerduty|email|webhook`}[5m])
        )
        > 0.01
      for: 5m
      labels:
        service: telemeter
        severity: critical
    - alert: AlertmanagerClusterFailedToSendAlerts
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/alertmanager.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
        message: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#alertmanagerclusterfailedtosendalerts
        summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.
      expr: |
        min by (namespace,job, integration) (
          rate(alertmanager_notifications_failed_total{job="alertmanager", integration!~`slack|pagerduty|email|webhook`}[5m])
        /
          rate(alertmanager_notifications_total{job="alertmanager", integration!~`slack|pagerduty|email|webhook`}[5m])
        )
        > 0.01
      for: 5m
      labels:
        service: telemeter
        severity: medium
    - alert: AlertmanagerConfigInconsistent
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/alertmanager.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
        message: Alertmanager instances within the {{$labels.job}} cluster have different configurations.
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#alertmanagerconfiginconsistent
        summary: Alertmanager instances within the same cluster have different configurations.
      expr: |
        count by (namespace,job) (
          count_values by (namespace,job) ("config_hash", alertmanager_config_hash{job="alertmanager"})
        )
        != 1
      for: 20m
      labels:
        service: telemeter
        severity: critical
    - alert: AlertmanagerClusterDown
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/alertmanager.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
        message: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.'
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#alertmanagerclusterdown
        summary: Half or more of the Alertmanager instances within the same cluster are down.
      expr: |
        (
          count by (namespace,job) (
            avg_over_time(up{job="alertmanager"}[5m]) < 0.5
          )
        /
          count by (namespace,job) (
            up{job="alertmanager"}
          )
        )
        >= 0.5
      for: 5m
      labels:
        service: telemeter
        severity: critical
    - alert: AlertmanagerClusterCrashlooping
      annotations:
        dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/alertmanager.rules?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
        description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
        message: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
        runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#alertmanagerclustercrashlooping
        summary: Half or more of the Alertmanager instances within the same cluster are crashlooping.
      expr: |
        (
          count by (namespace,job) (
            changes(process_start_time_seconds{job="alertmanager"}[10m]) > 4
          )
        /
          count by (namespace,job) (
            up{job="alertmanager"}
          )
        )
        >= 0.5
      for: 5m
      labels:
        service: telemeter
        severity: critical
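Every expression above selects job="alertmanager", so the cluster-level Prometheus needs a scrape target whose job label resolves to that value. A minimal ServiceMonitor sketch, assuming the Alertmanager pods sit behind a Service named alertmanager with an HTTP port named web (illustrative names, not part of this commit):

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: alertmanager
  labels:
    prometheus: app-sre        # assumed to match the Prometheus serviceMonitorSelector
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: alertmanager   # illustrative Service label
  endpoints:
  - port: web                  # Alertmanager's HTTP port name (assumption)
    interval: 30s
  # With the operator's defaults the job label is taken from the Service name,
  # so a Service named "alertmanager" yields job="alertmanager" as the rules expect.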