Skip to content

Commit

Permalink
Adapt CPU/mem metrics
Browse files (browse the repository at this point in the history)
Signed-off-by: davidmirror-ops <[email protected]>
  • Loading branch information
davidmirror-ops committed Sep 24, 2024
1 parent 12ec36f commit 81ef23c
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 19 deletions.
20 changes: 10 additions & 10 deletions deployment/stats/prometheus/flyteuser-dashboard.json
Original file line number Diff line number Diff line change
Expand Up @@ -626,15 +626,15 @@
"targets": [
{
"datasource": null,
"expr": "avg((flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)",
"expr": "(avg(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000",
"format": "time_series",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"metric": "",
"query": "avg((flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)",
"query": "(avg(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000",
"refId": "A",
"step": 10,
"target": ""
Expand Down Expand Up @@ -734,15 +734,15 @@
"targets": [
{
"datasource": null,
"expr": "avg((flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)",
"expr": "(avg(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000",
"format": "time_series",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"metric": "",
"query": "avg((flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)",
"query": "(avg(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000",
"refId": "A",
"step": 10,
"target": ""
Expand Down Expand Up @@ -1270,15 +1270,15 @@
"targets": [
{
"datasource": null,
"expr": "(100 * max(container_memory_working_set_bytes * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=\"$project-$domain\",label_workflow_name=\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ",
"expr": "(100 * (max(container_memory_working_set_bytes{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0",
"format": "time_series",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"metric": "",
"query": "(100 * max(container_memory_working_set_bytes * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=\"$project-$domain\",label_workflow_name=\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ",
"query": "(100 * (max(container_memory_working_set_bytes{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0",
"refId": "A",
"step": 10,
"target": ""
Expand Down Expand Up @@ -1372,15 +1372,15 @@
"targets": [
{
"datasource": null,
"expr": "(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0",
"expr": "(100 * (sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[2m]) * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0",
"format": "time_series",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"metric": "",
"query": "(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0",
"query": "(100 * (sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[2m]) * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0",
"refId": "A",
"step": 10,
"target": ""
Expand Down Expand Up @@ -1490,7 +1490,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "User errors",
"title": "User error rate",
"tooltip": {
"msResolution": true,
"shared": true,
Expand Down Expand Up @@ -1618,7 +1618,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "System errors",
"title": "System error rate",
"tooltip": {
"msResolution": true,
"shared": true,
Expand Down
17 changes: 8 additions & 9 deletions stats/flyteuser.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def workflow_stats(collapse: bool) -> Row:
dataSource=DATASOURCE,
targets=[
Target(
expr='avg((flyte:propeller:all:workflow:success_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"})/1000) by(quantile)',
expr='(avg(flyte:propeller:all:workflow:success_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by(quantile))/1000',
refId='A',
),
],
Expand All @@ -82,7 +82,7 @@ def workflow_stats(collapse: bool) -> Row:
dataSource=DATASOURCE,
targets=[
Target(
expr='avg((flyte:propeller:all:workflow:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"})/1000) by(quantile)',
expr='(avg(flyte:propeller:all:workflow:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by(quantile))/1000',
refId='A',
),
],
Expand Down Expand Up @@ -156,19 +156,18 @@ def resource_stats(collapse: bool) -> Row:
dataSource=DATASOURCE,
targets=[
Target(
expr='(100 * max(container_memory_working_set_bytes * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace="$project-$domain",label_workflow_name="$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ',
expr='(100 * (max(container_memory_working_set_bytes{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0',
refId='A',
),

],
showValue='true',
),
showValue='true',
),
BarChart(
title="CPU Usage per Task(%)",
dataSource=DATASOURCE,
targets=[
Target(
expr='(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0',
expr='(100 * (sum(rate(container_cpu_usage_seconds_total{image!=""}[2m]) * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0',
refId='A',
),
],
Expand All @@ -183,7 +182,7 @@ def errors(collapse: bool) -> Row:
collapse=collapse,
panels=[
Graph(
title="User errors",
title="User error rate",
dataSource=DATASOURCE,
targets=[
Target(
Expand All @@ -194,7 +193,7 @@ def errors(collapse: bool) -> Row:
yAxes=single_y_axis(format=SHORT_FORMAT),
),
Graph(
title="System errors",
title="System error rate",
dataSource=DATASOURCE,
targets=[
Target(
Expand Down

0 comments on commit 81ef23c

Please sign in to comment.