Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SARC-331] Implémenter les alertes : GPU-util moyen d’un user sur une période X plus bas qu’un threshold X #133

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ disable = [
"line-too-long", # Black takes care of line length.
"logging-fstring-interpolation",
"duplicate-code",
"too-many-positional-arguments",
]
extension-pkg-whitelist = "pydantic"

Expand Down
77 changes: 77 additions & 0 deletions sarc/alerts/usage_alerts/gpu_util_per_user.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import logging
from datetime import datetime, timedelta
from typing import Optional

from sarc.config import MTL
from sarc.jobs.series import compute_cost_and_waste, load_job_series

logger = logging.getLogger(__name__)


def check_gpu_util_per_user(
    threshold: timedelta,
    time_interval: Optional[timedelta] = timedelta(days=7),
    minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
) -> None:
    """
    Check if users have enough utilization of GPUs.
    Log a warning for each user if average GPU-util of user jobs
    in time interval is lower than a given threshold.

    For a given user job, GPU-util is computed as
    gpu_utilization * gpu_equivalent_cost
    (with gpu_equivalent_cost as elapsed_time * allocated.gres_gpu).

    Parameters
    ----------
    threshold: timedelta
        Minimum value for average GPU-util expected per user.
        We assume GPU-util is expressed in GPU-seconds,
        thus threshold can be expressed with a timedelta.
    time_interval
        If given, only jobs which ran in [now - time_interval, now]
        will be used for checking.
        Default is last 7 days.
        If None, all jobs are used.
    minimum_runtime
        If given, only jobs which ran at least for this minimum runtime
        will be used for checking.
        Default is 5 minutes.
        If None, set to 0.

    Returns
    -------
    None
        Results are reported only through warnings emitted on this
        module's logger, one per user below the threshold.
    """
    # Parse time_interval
    start, end, clip_time = None, None, False
    if time_interval is not None:
        end = datetime.now(tz=MTL)
        start = end - time_interval
        clip_time = True

    # Get data frame. We clip time if start and end are available,
    # so that minimum_runtime is compared to job running time in given interval.
    df = load_job_series(start=start, end=end, clip_time=clip_time)

    # Parse minimum_runtime, and select only jobs where
    # elapsed time >= minimum runtime and allocated.gres_gpu > 0
    if minimum_runtime is None:
        minimum_runtime = timedelta(seconds=0)
    selected = (df["elapsed_time"] >= minimum_runtime.total_seconds()) & (
        df["allocated.gres_gpu"] > 0
    )
    # Take an explicit copy: columns are added to the selection below, and
    # assigning into the result of a boolean-mask selection may otherwise
    # target a view of the loaded frame (pandas SettingWithCopyWarning).
    df = df[selected].copy()

    # Compute cost
    df = compute_cost_and_waste(df)

    # Compute GPU-util for each job
    df["gpu_util"] = df["gpu_utilization"] * df["gpu_equivalent_cost"]

    # Compute average GPU-util per user
    f_stats = df.groupby(["user"])[["gpu_util"]].mean()

    # Now we can check. The threshold is constant, so convert it only once.
    threshold_seconds = threshold.total_seconds()
    for row in f_stats.itertuples():
        user = row.Index
        gpu_util = row.gpu_util
        if gpu_util < threshold_seconds:
            logger.warning(
                f"[{user}] insufficient average gpu_util: {gpu_util} GPU-seconds; "
                f"minimum required: {threshold} ({threshold_seconds} GPU-seconds)"
            )
51 changes: 51 additions & 0 deletions tests/functional/usage_alerts/test_alert_gpu_util_per_user.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import functools
import re
from datetime import timedelta

import pytest

from sarc.alerts.usage_alerts.gpu_util_per_user import check_gpu_util_per_user
from sarc.client import get_jobs
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

from ..jobs.test_func_job_statistics import generate_fake_timeseries


@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize(
    "params",
    [
        # Check with default params. In last 7 days from now (mock time: 2023-11-22),
        # there are only 2 jobs, both with no gpu_utilization, so, no warnings.
        dict(threshold=timedelta()),
        # Check with no time_interval and a threshold of 7 hours
        dict(threshold=timedelta(hours=7), time_interval=None),
        # Check with no time_interval and a threshold of 6 hours
        dict(threshold=timedelta(hours=6), time_interval=None),
        # Check with a valid time_interval
        dict(threshold=timedelta(hours=8), time_interval=timedelta(days=276)),
        # Check with all params, including minimum_runtime
        dict(
            threshold=timedelta(hours=8),
            time_interval=timedelta(days=276),
            minimum_runtime=timedelta(seconds=39000),
        ),
    ],
)
def test_alert_gpu_util_per_user(params, caplog, monkeypatch, file_regression):
    """
    Run check_gpu_util_per_user with each parameter set and compare
    the captured warnings against the stored regression file.
    """
    # Replace the time series getter used by sarc.jobs.series with the fake
    # generator, so that statistics below are computed from generated data.
    monkeypatch.setattr(
        "sarc.jobs.series.get_job_time_series", generate_fake_timeseries
    )

    # Compute and save statistics for every job before running the check.
    for job in get_jobs():
        job.statistics(save=True)

    check_gpu_util_per_user(**params)
    # Strip the "WARNING <logger>:<file>:<line>" prefix from each captured
    # record, so the regression file does not depend on source line numbers.
    file_regression.check(
        re.sub(
            r"WARNING +sarc\.alerts\.usage_alerts\.gpu_util_per_user:gpu_util_per_user.py:[0-9]+ +",
            "",
            caplog.text,
        )
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)
[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)
[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)
[petitbonhomme] insufficient average gpu_util: 22784.166666666668 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)
[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)
[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)
[grosbonhomme] insufficient average gpu_util: 9023.729166666666 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)
[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)
[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)
Loading