def check_gpu_util_per_user(
    threshold: timedelta,
    time_interval: Optional[timedelta] = timedelta(days=7),
    minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
):
    """
    Check if users have enough utilization of GPUs.
    Log a warning for each user if average GPU-util of user jobs
    in time interval is lower than a given threshold.

    For a given user job, GPU-util is computed as
    gpu_utilization * gpu_equivalent_cost
    (with gpu_equivalent_cost as elapsed_time * allocated.gres_gpu).

    Parameters
    ----------
    threshold: timedelta
        Minimum value for average GPU-util expected per user.
        We assume GPU-util is expressed in GPU-seconds,
        thus threshold can be expressed with a timedelta.
    time_interval
        If given, only jobs which ran in [now - time_interval, now] will be used for checking.
        Default is last 7 days.
        If None, all jobs are used.
    minimum_runtime
        If given, only jobs which ran at least for this minimum runtime will be used for checking.
        Default is 5 minutes.
        If None, set to 0.

    Returns
    -------
    None
        Results are reported only through warnings on the module logger.
    """
    # Parse time_interval: without an interval, no bounds are passed
    # and job times are not clipped.
    start, end, clip_time = None, None, False
    if time_interval is not None:
        end = datetime.now(tz=MTL)
        start = end - time_interval
        clip_time = True

    # Get data frame. We clip time if start and end are available,
    # so that minimum_runtime is compared to job running time in given interval.
    df = load_job_series(start=start, end=end, clip_time=clip_time)

    # Select only jobs where elapsed time >= minimum runtime
    # and allocated.gres_gpu > 0. A missing minimum_runtime means 0 seconds.
    min_seconds = (
        timedelta(seconds=0) if minimum_runtime is None else minimum_runtime
    ).total_seconds()
    df = df[(df["elapsed_time"] >= min_seconds) & (df["allocated.gres_gpu"] > 0)]

    # Compute cost
    df = compute_cost_and_waste(df)

    # Compute GPU-util for each job. `assign` returns a new frame, so we never
    # write a column into a frame that may be a view of a filtered selection.
    df = df.assign(gpu_util=df["gpu_utilization"] * df["gpu_equivalent_cost"])

    # Compute average GPU-util per user
    f_stats = df.groupby(["user"])[["gpu_util"]].mean()

    # Now we can check. The threshold is loop-invariant, convert it once.
    threshold_seconds = threshold.total_seconds()
    for row in f_stats.itertuples():
        user = row.Index
        gpu_util = row.gpu_util
        if gpu_util < threshold_seconds:
            logger.warning(
                f"[{user}] insufficient average gpu_util: {gpu_util} GPU-seconds; "
                f"minimum required: {threshold} ({threshold_seconds} GPU-seconds)"
            )
@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize(
    "params",
    [
        # Default params. In the last 7 days from now (mock time: 2023-11-22),
        # there are only 2 jobs, both with no gpu_utilization, so no warnings.
        {"threshold": timedelta()},
        # No time_interval, threshold of 7 hours.
        {"threshold": timedelta(hours=7), "time_interval": None},
        # No time_interval, threshold of 6 hours.
        {"threshold": timedelta(hours=6), "time_interval": None},
        # Explicit time_interval.
        {"threshold": timedelta(hours=8), "time_interval": timedelta(days=276)},
        # All params, including minimum_runtime.
        {
            "threshold": timedelta(hours=8),
            "time_interval": timedelta(days=276),
            "minimum_runtime": timedelta(seconds=39000),
        },
    ],
)
def test_alert_gpu_util_per_user(params, caplog, monkeypatch, file_regression):
    """
    Run the GPU-util alert with each parameter set and compare the captured
    warnings, with the logger prefix stripped, against the regression file.
    """
    # Replace the time series getter used by sarc.jobs.series with the fake generator.
    monkeypatch.setattr(
        "sarc.jobs.series.get_job_time_series", generate_fake_timeseries
    )

    # Compute and save statistics for every job before running the check.
    for job in get_jobs():
        job.statistics(save=True)

    check_gpu_util_per_user(**params)

    # Drop the "WARNING <logger>:<file>:<line>" prefix so the regression file
    # holds only the message text and does not depend on source line numbers.
    log_prefix = r"WARNING +sarc\.alerts\.usage_alerts\.gpu_util_per_user:gpu_util_per_user.py:[0-9]+ +"
    messages = re.sub(log_prefix, "", caplog.text)
    file_regression.check(messages)
21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds) +[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds) +[petitbonhomme] insufficient average gpu_util: 22784.166666666668 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds) diff --git a/tests/functional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params2_.txt b/tests/functional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params2_.txt new file mode 100644 index 00000000..8870360a --- /dev/null +++ b/tests/functional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params2_.txt @@ -0,0 +1,3 @@ +[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds) +[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds) +[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds) diff --git a/tests/functional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params3_.txt b/tests/functional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params3_.txt new file mode 100644 index 00000000..0045c641 --- /dev/null +++ b/tests/functional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params3_.txt @@ -0,0 +1,3 @@ +[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds) +[grosbonhomme] insufficient average gpu_util: 9023.729166666666 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds) +[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds) diff --git a/tests/functional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params4_.txt 
b/tests/functional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params4_.txt new file mode 100644 index 00000000..42a8b9d1 --- /dev/null +++ b/tests/functional/usage_alerts/test_alert_gpu_util_per_user/test_alert_gpu_util_per_user_params4_.txt @@ -0,0 +1,2 @@ +[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds) +[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)