Skip to content

Commit

Permalink
clean jsonnet file
Browse files Browse the repository at this point in the history
Signed-off-by: Thibault Mange <[email protected]>
  • Loading branch information
thibaultmg committed Jul 26, 2023
1 parent c919cac commit e0d7db3
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 149 deletions.
118 changes: 40 additions & 78 deletions observability/dashboards/observatorium-gubernator.libsonnet
Original file line number Diff line number Diff line change
@@ -1,48 +1,7 @@
// local am = (import '../config.libsonnet').alertmanager;
// local utils = import 'github.com/thanos-io/thanos/mixin/lib/utils.libsonnet';
local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet';
local grafana = import 'grafonnet/grafana.libsonnet';
local template = import 'grafonnet/template.libsonnet';

function() {

local gubernator = self,
local intervalTemplate =
template.interval(
'interval',
'5m,10m,30m,1h,6h,12h,auto',
label='interval',
current='5m',
),
local namespaceTemplate =
template.new(
name='namespace',
datasource='$datasource',
query='label_values(thanos_status, namespace)',
label='namespace',
allValues='.+',
current='',
hide='',
refresh=2,
includeAll=true,
sort=1
),
local container = 'gubernator',
local pod = 'observatorium-gubernator',
local jobTemplate =
template.new(
name='job',
datasource='$datasource',
query='label_values(up{namespace="$namespace", job="observatorium-gubernator"}, job)',
label='job',
allValues='.+',
current='',
hide='',
refresh=2,
includeAll=true,
sort=1
),

local panel(title, description='', unit='reqps') =
g.panel(title) {
description: description,
Expand All @@ -54,41 +13,53 @@ function() {
span: 0,
} + g.stack,

local commonSelectors = 'namespace="$namespace", job="$job"',
local podSelector = 'observatorium-gubernator-.*',
local containerSelector = 'gubernator',

local datasourcesRegex = '/^rhobs.*|telemeter-prod-01-prometheus|app-sre-stage-01-prometheus/',
local labelMatchers = {
ns: 'namespace="$namespace"',
job: 'job="observatorium-gubernator"',
nsAndJob: std.join(', ', [self.ns, self.job]),
pod: 'pod=~"observatorium-gubernator.*"',
container: 'container="gubernator"',
},
local intervalTemplate =
template.interval(
'interval',
'5m,10m,30m,1h,6h,12h,auto',
label='interval',
current='5m',
),

dashboard:: {
data:
g.dashboard('Observatorium / Gubernator')
.addTemplate('namespace', 'gubernator_check_counter', 'namespace')
.addRow(
g.row('GetRateLimits API')
.addPanel(
panel('Requests', 'Rate of gRPC requests to the API per second', 'reqps') +
g.queryPanel(
'sum by (job, method) (rate(gubernator_grpc_request_counts{%s, method=~".*/GetRateLimits"}[$interval]))' % commonSelectors,
'sum by (job, method) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetRateLimits"}[$interval]))' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('Errors', 'Rate of failed gRPC requests to the API per second', 'reqps') +
g.queryPanel(
'sum by (job, method) (rate(gubernator_grpc_request_counts{%s, method=~".*/GetRateLimits", status="failed"}[$interval]))' % commonSelectors,
'sum by (job, method) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetRateLimits", status="failed"}[$interval]))' % labelMatchers,
'{{status}} {{job}}',
)
)
.addPanel(
panel('Latencies', 'Latency of gRPC requests to the API per percentiles', 'ms') +
g.queryPanel(
'avg by(quantile, job) (gubernator_grpc_request_duration{%s, method=~".*/GetRateLimits"}) * 1000' % commonSelectors,
'avg by(quantile, job) (gubernator_grpc_request_duration{%(nsAndJob)s, method=~".*/GetRateLimits"}) * 1000' % labelMatchers,
'{{quantile}}th percentile',
)
)
.addPanel(
panel('Over Limit requests rate', 'Rate of requests that resulted in rate limiting (over the limit) per second', 'reqps') +
g.queryPanel(
'sum by(job) (rate(gubernator_over_limit_counter{%s}[$interval]))' % commonSelectors,
'sum by(job) (rate(gubernator_over_limit_counter{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
Expand All @@ -98,21 +69,21 @@ function() {
.addPanel(
panel('Requests', 'Rate of gRPC requests to the API per second', 'reqps') +
g.queryPanel(
'sum by (job, method) (rate(gubernator_grpc_request_counts{%s, method=~".*/GetPeerRateLimits"}[$interval]))' % commonSelectors,
'sum by (job, method) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetPeerRateLimits"}[$interval]))' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('Errors', 'Rate of failed gRPC requests to the API per second', 'reqps') +
g.queryPanel(
'sum by (job, method) (rate(gubernator_grpc_request_counts{%s, method=~".*/GetPeerRateLimits", status="failed"}[$interval]))' % commonSelectors,
'sum by (job, method) (rate(gubernator_grpc_request_counts{%(nsAndJob)s, method=~".*/GetPeerRateLimits", status="failed"}[$interval]))' % labelMatchers,
'{{status}} {{job}}',
)
)
.addPanel(
panel('Latencies', 'Latency of gRPC requests to the API per percentiles', 'ms') +
g.queryPanel(
'avg by(quantile, job) (gubernator_grpc_request_duration{%s, method=~".*/GetPeerRateLimits"}) * 1000' % commonSelectors,
'avg by(quantile, job) (gubernator_grpc_request_duration{%(nsAndJob)s, method=~".*/GetPeerRateLimits"}) * 1000' % labelMatchers,
'{{quantile}}th percentile',
)
)
Expand All @@ -122,14 +93,14 @@ function() {
.addPanel(
panel('getRateLimitsBatch queue length', 'The getRateLimitsBatch() queue length in PeerClient. This represents rate checks queued by for batching to a remote peer.', '') +
g.queryPanel(
'sum by(job) (rate(gubernator_queue_length{%s}[$interval]))' % commonSelectors,
'sum by(job) (rate(gubernator_queue_length{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('GetRateLimit queue length', 'The number of GetRateLimit requests queued up in GubernatorPool workers.', '') +
g.queryPanel(
'sum by(job) (rate(gubernator_pool_queue_length{%s}[$interval]))' % commonSelectors,
'sum by(job) (rate(gubernator_pool_queue_length{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
Expand All @@ -139,28 +110,28 @@ function() {
.addPanel(
panel('Requests', 'Rate of cache requests per second', 'reqps') +
g.queryPanel(
'sum by(job) (rate(gubernator_cache_access_count{%s}[$interval]))' % commonSelectors,
'sum by(job) (rate(gubernator_cache_access_count{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('Misses', 'Rate of cache misses per second', 'reqps') +
g.queryPanel(
'sum by(job) (rate(gubernator_cache_access_count{%s, type="miss"}[$interval])) / sum by(job) (rate(gubernator_cache_access_count{%s}[$interval]))' % [commonSelectors, commonSelectors],
'sum by(job) (rate(gubernator_cache_access_count{%(nsAndJob)s, type="miss"}[$interval])) / sum by(job) (rate(gubernator_cache_access_count{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('Size', 'The number of items in LRU Cache which holds the rate limits.', '') +
g.queryPanel(
'sum by(job) (gubernator_cache_size{%s})' % commonSelectors,
'sum by(job) (gubernator_cache_size{%(nsAndJob)s})' % labelMatchers,
'{{job}}',
)
)
.addPanel(
panel('Unexpired evictions', 'Rate of cache items which were evicted while unexpired per second.', 'reqps') +
g.queryPanel(
'sum by(job) (rate(gubernator_unexpired_evictions_count{%s}[$interval]))' % commonSelectors,
'sum by(job) (rate(gubernator_unexpired_evictions_count{%(nsAndJob)s}[$interval]))' % labelMatchers,
'{{job}}',
)
)
Expand All @@ -170,21 +141,21 @@ function() {
.addPanel(
panel('Batch', 'Latency of batch send operations to a remote peer per percentiles', 'ms') +
g.queryPanel(
'avg by(quantile, job) (gubernator_batch_send_duration{%s}) * 1000' % commonSelectors,
'avg by(quantile, job) (gubernator_batch_send_duration{%(nsAndJob)s}) * 1000' % labelMatchers,
'{{quantile}}th percentile',
)
)
.addPanel(
panel('Broadcast', 'Latency of of GLOBAL broadcasts to peers per percentiles', 'ms') +
g.queryPanel(
'avg by(quantile, job) (gubernator_broadcast_durations{%s}) * 1000' % commonSelectors,
'avg by(quantile, job) (gubernator_broadcast_durations{%(nsAndJob)s}) * 1000' % labelMatchers,
'{{quantile}}th percentile {{job}}',
)
)
.addPanel(
panel('Async', 'Latency of of GLOBAL async sends per percentiles', 'ms') +
g.queryPanel(
'avg by(quantile, job) (gubernator_async_durations{%s}) * 1000' % commonSelectors,
'avg by(quantile, job) (gubernator_async_durations{%(nsAndJob)s}) * 1000' % labelMatchers,
'{{quantile}}th percentile {{job}}',
)
)
Expand All @@ -194,39 +165,30 @@ function() {
.addPanel(
panel('Memory Usage', 'Memory usage of the Gubernator process', 'MiB') +
g.queryPanel(
'(container_memory_working_set_bytes{container="%s", pod=~"%s", namespace="$namespace"}) / 1024^2' % [
containerSelector,
podSelector,
],
'container_memory_working_set_bytes{%(container)s, %(pod)s, %(ns)s} / 1024^2' % labelMatchers,
'memory usage system {{pod}}',
)
)
.addPanel(
panel('CPU Usage', 'CPU usage of the Gubernator process', 'percent') +
g.queryPanel(
'rate(process_cpu_seconds_total{container="%s", pod=~"%s", namespace="$namespace"}[$interval]) * 100' % [
containerSelector,
podSelector,
],
'rate(process_cpu_seconds_total{%(container)s, %(pod)s, %(ns)s}[$interval]) * 100' % labelMatchers,
'cpu usage system {{pod}}',
)
)
.addPanel(
panel('Pod/Container Restarts', 'Number of times the pod/container has restarted', '') +
g.queryPanel(
'sum by (pod) (kube_pod_container_status_restarts_total{container="%s", pod=~"%s", namespace="$namespace",})' % [
containerSelector,
podSelector,
],
'sum by (pod) (kube_pod_container_status_restarts_total{%(container)s, %(pod)s, %(ns)s})' % labelMatchers,
'pod restart count {{pod}}',
)
)
.addPanel(
panel('Network Usage', 'Network usage of the Gubernator process', 'binBps') +
g.queryPanel(
[
'sum by (pod) (rate(container_network_receive_bytes_total{pod=~"%s", namespace="$namespace"}[$interval]))' % podSelector,
'sum by (pod) (rate(container_network_transmit_bytes_total{pod=~"%s", namespace="$namespace"}[$interval]))' % podSelector,
'sum by (pod) (rate(container_network_receive_bytes_total{%(pod)s, %(ns)s}[$interval]))' % labelMatchers,
'sum by (pod) (rate(container_network_transmit_bytes_total{%(pod)s, %(ns)s}[$interval]))' % labelMatchers,
],
[
'network traffic in {{pod}}',
Expand All @@ -236,12 +198,12 @@ function() {
)
) + {
templating+: {
list+: [
list: [
if variable.name == 'datasource'
then variable { regex: '/^rhobs.*|telemeter-prod-01-prometheus|app-sre-stage-01-prometheus/' }
then variable { regex: datasourcesRegex }
else variable
for variable in super.list
] + [namespaceTemplate, jobTemplate, intervalTemplate],
] + [intervalTemplate],
},
},
},
Expand Down
Loading

0 comments on commit e0d7db3

Please sign in to comment.