Skip to content

Commit

Permalink
[SARC-331] Implémenter les alertes : GPU-util moyen d’un user sur une…
Browse files Browse the repository at this point in the history
… période X plus bas qu’un threshold X
  • Loading branch information
notoraptor committed Sep 16, 2024
1 parent a64fd9a commit d49b3a3
Show file tree
Hide file tree
Showing 2 changed files with 157 additions and 0 deletions.
77 changes: 77 additions & 0 deletions sarc/alerts/usage_alerts/gpu_util_per_user.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import logging
from datetime import datetime, timedelta
from typing import Optional

from sarc.config import MTL
from sarc.jobs.series import compute_cost_and_waste, load_job_series

# Module-level logger; check_gpu_util_per_user() reports under-utilization
# through it as warnings.
logger = logging.getLogger(__name__)


def check_gpu_util_per_user(
    threshold: timedelta,
    time_interval: Optional[timedelta] = timedelta(days=7),
    minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
) -> None:
    """
    Check if users have enough utilization of GPUs.

    Log a warning for each user whose average GPU-util over their jobs
    in the time interval is lower than the given threshold.

    For a given user job, GPU-util is computed as
        gpu_utilization * gpu_equivalent_cost
    (with gpu_equivalent_cost as elapsed_time * allocated.gres_gpu).

    A user whose average is NaN (e.g. no gpu_utilization recorded for
    any of their selected jobs) is never reported, because the comparison
    against the threshold is then false.

    Parameters
    ----------
    threshold: timedelta
        Minimum value for average GPU-util expected per user.
        We assume GPU-util is expressed in GPU-seconds,
        thus threshold can be expressed with a timedelta.
    time_interval
        If given, only jobs which ran in [now - time_interval, now]
        will be used for checking.
        Default is last 7 days.
        If None, all jobs are used.
    minimum_runtime
        If given, only jobs which ran at least for this minimum runtime
        will be used for checking.
        Default is 5 minutes.
        If None, set to 0.

    Returns
    -------
    None
        Results are reported only through warnings on the module logger.
    """
    # Resolve the time window. Without an interval, no bounds are passed
    # and job times are not clipped.
    if time_interval is None:
        start, end, clip_time = None, None, False
    else:
        end = datetime.now(tz=MTL)
        start = end - time_interval
        clip_time = True

    # Get data frame. We clip time if start and end are available,
    # so that minimum_runtime is compared to job running time in given interval.
    df = load_job_series(start=start, end=end, clip_time=clip_time)

    # Keep only GPU jobs which ran long enough. elapsed_time is compared
    # against a number of seconds.
    min_seconds = 0.0 if minimum_runtime is None else minimum_runtime.total_seconds()
    df = df[(df["elapsed_time"] >= min_seconds) & (df["allocated.gres_gpu"] > 0)]

    # Compute cost (provides the gpu_equivalent_cost column used below).
    df = compute_cost_and_waste(df)

    # Compute GPU-util for each job. assign() returns a new frame, so we never
    # write a column in place into a frame derived from a filtered selection.
    df = df.assign(gpu_util=df["gpu_utilization"] * df["gpu_equivalent_cost"])

    # Average GPU-util per user (groups are sorted by user name).
    avg_gpu_util = df.groupby("user")["gpu_util"].mean()

    # Now we can check. The threshold in seconds is loop-invariant.
    threshold_seconds = threshold.total_seconds()
    for user, gpu_util in avg_gpu_util.items():
        if gpu_util < threshold_seconds:
            # Lazy %-style arguments: same rendered message as before,
            # formatted only if the record is actually emitted.
            logger.warning(
                "[%s] insufficient average gpu_util: %s GPU-seconds; "
                "minimum required: %s (%s GPU-seconds)",
                user,
                gpu_util,
                threshold,
                threshold_seconds,
            )
80 changes: 80 additions & 0 deletions tests/functional/usage_alerts/test_alert_gpu_util_per_user.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import functools
from datetime import timedelta

import pytest

from sarc.alerts.usage_alerts.gpu_util_per_user import check_gpu_util_per_user
from sarc.client import get_jobs
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

from ..jobs.test_func_job_statistics import generate_fake_timeseries
from .common import _get_warnings

# Pre-bind the `module` argument of the shared _get_warnings helper to the
# alert module under test, so the test below only has to pass the captured
# log text. (Presumably the helper extracts the warning messages logged by
# that module -- see tests/functional/usage_alerts/common.py to confirm.)
get_warnings = functools.partial(
    _get_warnings,
    module="sarc.alerts.usage_alerts.gpu_util_per_user:gpu_util_per_user.py",
)


@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize(
    ("params", "expected"),
    [
        # Default params. In the last 7 days from now (mock time: 2023-11-22),
        # there are only 2 jobs, both with no gpu_utilization, so no warnings.
        ({"threshold": timedelta()}, []),
        # No time_interval and a 7-hour threshold.
        (
            {"threshold": timedelta(hours=7), "time_interval": None},
            [
                "[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)",
                "[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)",
                "[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)",
                "[petitbonhomme] insufficient average gpu_util: 22784.166666666668 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)",
            ],
        ),
        # No time_interval and a 6-hour threshold: petitbonhomme (average
        # 22784.17 GPU-seconds in the previous case) is now above the
        # threshold and is not reported.
        (
            {"threshold": timedelta(hours=6), "time_interval": None},
            [
                "[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)",
                "[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)",
                "[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)",
            ],
        ),
        # With an explicit time_interval.
        (
            {"threshold": timedelta(hours=8), "time_interval": timedelta(days=276)},
            [
                "[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
                "[grosbonhomme] insufficient average gpu_util: 9023.729166666666 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
                "[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
            ],
        ),
        # With all params, including minimum_runtime: compared to the previous
        # case, grosbonhomme is no longer reported.
        (
            {
                "threshold": timedelta(hours=8),
                "time_interval": timedelta(days=276),
                "minimum_runtime": timedelta(seconds=39000),
            },
            [
                "[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
                "[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
            ],
        ),
    ],
)
def test_alert_gpu_util_per_user(params, expected, caplog, monkeypatch):
    """Run check_gpu_util_per_user with `params` and compare logged warnings to `expected`."""
    # Replace the time-series getter with the fake generator used by the tests.
    monkeypatch.setattr(
        "sarc.jobs.series.get_job_time_series", generate_fake_timeseries
    )

    # Compute and save statistics for every job before running the check.
    for job in get_jobs():
        job.statistics(save=True)

    check_gpu_util_per_user(**params)

    assert get_warnings(caplog.text) == expected

0 comments on commit d49b3a3

Please sign in to comment.