Skip to content

Commit

Permalink
fix(charts): hermes monitoring and prometheus rules (#1564)
Browse files Browse the repository at this point in the history
## Summary
Fixes the Hermes monitoring service so that Prometheus can scrape it, and
adds a PrometheusRule template for alerts.
## Background
The Hermes monitoring service is misconfigured, and alerts are not yet
implemented.
## Changes
Adds `serviceMonitor` and `prometheusRule` templates, enabling metrics
scraping and alerting with Prometheus.
Also fixes the service port naming, the default image, and the labels.
## Testing
against dusk-10, locally

## Metrics
enables `hermes-relayer` metrics scraping
  • Loading branch information
quasystaty1 authored Oct 3, 2024
1 parent b54ccb9 commit 8e10279
Show file tree
Hide file tree
Showing 8 changed files with 102 additions and 8 deletions.
2 changes: 1 addition & 1 deletion charts/hermes/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.4.1
version: 0.4.2

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
Expand Down
4 changes: 2 additions & 2 deletions charts/hermes/files/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ enabled = {{ .Values.rest.enabled }}

# Specify the IPv4/6 host over which the built-in HTTP server will serve the RESTful
# API requests. Default: 127.0.0.1
host = '127.0.0.1'
host = '0.0.0.0'

# Specify the port over which the built-in HTTP server will serve the restful API
# requests. Default: 3000
Expand All @@ -35,7 +35,7 @@ tx_confirmation = {{ .Values.mode.packets.txConfirmation }}

[telemetry]
enabled = {{ .Values.telemetry.enabled }}
host = '127.0.0.1'
host = '0.0.0.0'
port = {{ .Values.ports.telemetry }}

[telemetry.buckets]
Expand Down
14 changes: 14 additions & 0 deletions charts/hermes/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,20 @@ We truncate at 63 chars because some Kubernetes name fields are limited to this
{{- end -}}
{{- end -}}

{{/*
Common labels.
Labels intended for resource metadata. At present this renders exactly what
"hermes.selectorLabels" renders and adds nothing on top of it.
*/}}
{{- define "hermes.labels" -}}
{{ include "hermes.selectorLabels" . }}
{{- end }}

{{/*
Selector labels.
Renders a single `app` label whose value is the chart fullname
("hermes.fullname").
*/}}
{{- define "hermes.selectorLabels" -}}
app: {{ include "hermes.fullname" . }}
{{- end }}

{{/*
Return if ingress is stable.
*/}}
Expand Down
6 changes: 3 additions & 3 deletions charts/hermes/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@ kind: Deployment
metadata:
name: {{ include "hermes.fullname" . }}
labels:
app: astria-dev-cluster
app: {{ include "hermes.fullname" . }}
namespace: {{ include "hermes.namespace" . }}
spec:
replicas: {{ .Values.global.replicaCount }}
selector:
matchLabels:
app: astria-dev-cluster
app: {{ include "hermes.fullname" . }}
template:
metadata:
name: {{ include "hermes.fullname" . }}
labels:
app: astria-dev-cluster
app: {{ include "hermes.fullname" . }}
spec:
{{- if .Values.createChannel.enabled }}
initContainers:
Expand Down
20 changes: 20 additions & 0 deletions charts/hermes/templates/prometheusrule.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{{- /*
PrometheusRule carrying the alert rules for hermes.

Rendered only when alerting is enabled, the `alerting.prometheusRule.enabled`
toggle is on, and at least one rule is configured. Guarding on the rules list
avoids emitting an object whose `spec` is empty.
*/ -}}
{{- if and .Values.alerting.enabled .Values.alerting.prometheusRule.enabled .Values.alerting.prometheusRule.rules -}}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "hermes.fullname" . }}
  {{- /* Optional: place the rule in a dedicated namespace (e.g. where Prometheus runs). */}}
  {{- with .Values.alerting.prometheusRule.namespace }}
  namespace: {{ . | quote }}
  {{- end }}
  labels:
    {{- include "hermes.labels" . | nindent 4 }}
    {{- /* Extra labels from values are appended after the chart labels. */}}
    {{- with .Values.alerting.prometheusRule.additionalLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
    - name: {{ include "hermes.fullname" . }}
      rules:
        {{- toYaml .Values.alerting.prometheusRule.rules | nindent 8 }}
{{- end }}
4 changes: 3 additions & 1 deletion charts/hermes/templates/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
kind: Service
apiVersion: v1
metadata:
labels:
app: {{ include "hermes.fullname" . }}
name: {{ include "hermes.fullname" . }}-service
namespace: {{ include "hermes.namespace" . }}
spec:
Expand All @@ -14,7 +16,7 @@ spec:
targetPort: rest
{{- end }}
{{- if .Values.telemetry.enabled }}
- name: telemetry-svc
- name: telemetry
port: {{ .Values.ports.telemetry }}
targetPort: telemetry
{{- end }}
Expand Down
28 changes: 28 additions & 0 deletions charts/hermes/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{{- /*
ServiceMonitor for the hermes telemetry endpoint.
Rendered only when both `serviceMonitor.enabled` and `telemetry.enabled` are set.
*/ -}}
{{- if and .Values.telemetry.enabled .Values.serviceMonitor.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: hermes-relayer-metrics
  labels:
    {{- include "hermes.selectorLabels" . | nindent 4 }}
    {{- if .Values.serviceMonitor.additionalLabels }}
    {{- toYaml .Values.serviceMonitor.additionalLabels | nindent 4 }}
    {{- end }}
spec:
  {{- /*
  NOTE(review): jobLabel names a label key to look up on the selected Service.
  The Service metadata visible in this chart only sets `app`, so this key
  presumably matches nothing - confirm the intended job name.
  */}}
  jobLabel: hermes-relayer-metric
  namespaceSelector:
    matchNames:
      - {{ include "hermes.namespace" . }}
  selector:
    matchLabels:
      {{- include "hermes.selectorLabels" . | nindent 6 }}
  endpoints:
    {{- /* `port` is given as a port name, not a number. */}}
    - port: telemetry
      path: /metrics
      {{- if .Values.serviceMonitor.interval }}
      interval: {{ .Values.serviceMonitor.interval }}
      {{- end }}
      {{- if .Values.serviceMonitor.scrapeTimeout }}
      scrapeTimeout: {{ .Values.serviceMonitor.scrapeTimeout }}
      {{- end }}
{{- end }}
32 changes: 31 additions & 1 deletion charts/hermes/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ global:
replicaCount: 1
logLevel: debug

image: ghcr.io/penumbra-zone/hermes:main
image: ghcr.io/astriaorg/hermes:sha-450f848
imagePullPolicy: IfNotPresent

fullnameOverride: ""
Expand Down Expand Up @@ -34,6 +34,13 @@ telemetry:
tracingServer:
enabled: false

# Prometheus Operator ServiceMonitor settings.
serviceMonitor:
  # Render the ServiceMonitor; the template additionally requires
  # telemetry.enabled to be true.
  enabled: false
  # NOTE(review): the ServiceMonitor template selects the endpoint by the port
  # name `telemetry` and does not appear to read this value - confirm.
  port: 26660
  # Extra labels merged into the ServiceMonitor's metadata labels.
  additionalLabels:
    release: 'kube-prometheus-stack'

mode:
clients:
enabled: true
Expand Down Expand Up @@ -188,3 +195,26 @@ ingress:
# - secretName: chart-example-tls
# hosts:
# - chart-example.local

# Alerting settings; the `prometheusRule` section feeds the PrometheusRule template.
alerting:
  enabled: false
  interval: ''
  additionalLabels:
    release: 'kube-prometheus-stack'
  annotations: {}
  # scrapeTimeout: 10s
  # path: /metrics
  prometheusRule:
    enabled: true
    # Extra labels merged into the PrometheusRule's metadata labels.
    additionalLabels:
      release: 'kube-prometheus-stack'
    # Namespace the PrometheusRule object is created in.
    namespace: 'monitoring'
    # Alert rules, copied as-is into the rule group. The `{{ ... }}` placeholders
    # below are quoted so they always stay plain strings in this file.
    rules:
      - alert: 'Chain_Node_Down'
        # Insert your query Expression
        expr: 'up{container="cometbft"} == 0'
        # Rough number but should be enough to init warn
        for: '1m'
        labels:
          severity: 'critical'
        annotations:
          summary: 'Chain Node is Down (instance {{ $labels.instance }})'
          description: "chain node '{{ $labels.namespace }}' has disappeared from Prometheus target discovery.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

0 comments on commit 8e10279

Please sign in to comment.