[SARC-331] Implémenter les alertes : GPU-util moyen d’un user sur une…

… période X plus bas qu’un threshold X
mila-iqia · Sep 16, 2024 · d49b3a3 · d49b3a3
1 parent a64fd9a
commit d49b3a3
Show file tree

Hide file tree

Showing 2 changed files with 157 additions and 0 deletions.
diff --git a/sarc/alerts/usage_alerts/gpu_util_per_user.py b/sarc/alerts/usage_alerts/gpu_util_per_user.py
@@ -0,0 +1,77 @@
+import logging
+from datetime import datetime, timedelta
+from typing import Optional
+
+from sarc.config import MTL
+from sarc.jobs.series import compute_cost_and_waste, load_job_series
+
+logger = logging.getLogger(__name__)
+
+
+def check_gpu_util_per_user(
+    threshold: timedelta,
+    time_interval: Optional[timedelta] = timedelta(days=7),
+    minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
+):
+    """
+    Check if users have enough utilization of GPUs.
+    Log a warning for each user if average GPU-util of user jobs
+    in time interval is lower than a given threshold.
+
+    For a given user job, GPU-util is computed as
+    gpu_utilization * gpu_equivalent_cost
+    (with gpu_equivalent_cost as elapsed_time * allocated.gres_gpu).
+
+    Parameters
+    ----------
+    threshold: timedelta
+        Minimum value for average GPU-util expected per user.
+        We assume GPU-util is expressed in GPU-seconds,
+        thus threshold can be expressed with a timedelta.
+    time_interval
+        If given, only jobs which ran in [now - time_interval, now] will be used for checking.
+        Default is last 7 days.
+        If None, all jobs are used.
+    minimum_runtime
+        If given, only jobs which ran at least for this minimum runtime will be used for checking.
+        Default is 5 minutes.
+        If None, no minimum is applied (it is treated as 0 seconds).
+    """
+    # Turn time_interval into an explicit [start, end] window ending now (MTL timezone)
+    start, end, clip_time = None, None, False
+    if time_interval is not None:
+        end = datetime.now(tz=MTL)
+        start = end - time_interval
+        clip_time = True
+
+    # Get data frame. We clip time if start and end are available,
+    # so that minimum_runtime is compared to job running time in given interval.
+    df = load_job_series(start=start, end=end, clip_time=clip_time)
+
+    # Parse minimum_runtime, and select only jobs where
+    # elapsed time >= minimum runtime and allocated.gres_gpu > 0
+    if minimum_runtime is None:
+        minimum_runtime = timedelta(seconds=0)
+    df = df[
+        (df["elapsed_time"] >= minimum_runtime.total_seconds())
+        & (df["allocated.gres_gpu"] > 0)
+    ]
+
+    # Compute cost
+    df = compute_cost_and_waste(df)
+
+    # Compute GPU-util for each job
+    df["gpu_util"] = df["gpu_utilization"] * df["gpu_equivalent_cost"]
+
+    # Compute average GPU-util per user
+    f_stats = df.groupby(["user"])[["gpu_util"]].mean()
+
+    # Log a warning for each user whose average GPU-util is strictly below the threshold
+    for row in f_stats.itertuples():
+        user = row.Index
+        gpu_util = row.gpu_util
+        if gpu_util < threshold.total_seconds():
+            logger.warning(
+                f"[{user}] insufficient average gpu_util: {gpu_util} GPU-seconds; "
+                f"minimum required: {threshold} ({threshold.total_seconds()} GPU-seconds)"
+            )
diff --git a/tests/functional/usage_alerts/test_alert_gpu_util_per_user.py b/tests/functional/usage_alerts/test_alert_gpu_util_per_user.py
@@ -0,0 +1,80 @@
+import functools
+from datetime import timedelta
+
+import pytest
+
+from sarc.alerts.usage_alerts.gpu_util_per_user import check_gpu_util_per_user
+from sarc.client import get_jobs
+from tests.functional.jobs.test_func_load_job_series import MOCK_TIME
+
+from ..jobs.test_func_job_statistics import generate_fake_timeseries
+from .common import _get_warnings
+
+get_warnings = functools.partial(
+    _get_warnings,
+    module="sarc.alerts.usage_alerts.gpu_util_per_user:gpu_util_per_user.py",
+)
+
+
+@pytest.mark.freeze_time(MOCK_TIME)
+@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
+@pytest.mark.parametrize(
+    "params,expected",
+    [
+        # Check with default params. In last 7 days from now (mock time: 2023-11-22),
+        # there are only 2 jobs, both with no gpu_utilization, so, no warnings.
+        (dict(threshold=timedelta()), []),
+        # Check with no time_interval and a threshold of 7 hours
+        (
+            dict(threshold=timedelta(hours=7), time_interval=None),
+            [
+                "[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)",
+                "[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)",
+                "[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)",
+                "[petitbonhomme] insufficient average gpu_util: 22784.166666666668 GPU-seconds; minimum required: 7:00:00 (25200.0 GPU-seconds)",
+            ],
+        ),
+        # Check with no time_interval and a threshold of 6 hours
+        (
+            dict(threshold=timedelta(hours=6), time_interval=None),
+            [
+                "[beaubonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)",
+                "[bonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)",
+                "[grosbonhomme] insufficient average gpu_util: 21585.0 GPU-seconds; minimum required: 6:00:00 (21600.0 GPU-seconds)",
+                # No warning for petitbonhomme: its average (22784.166666666668) is above 21600.0
+            ],
+        ),
+        # Check with a valid time_interval
+        (
+            dict(threshold=timedelta(hours=8), time_interval=timedelta(days=276)),
+            [
+                "[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
+                "[grosbonhomme] insufficient average gpu_util: 9023.729166666666 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
+                "[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
+            ],
+        ),
+        # Check with all params, including minimum_runtime
+        (
+            dict(
+                threshold=timedelta(hours=8),
+                time_interval=timedelta(days=276),
+                minimum_runtime=timedelta(seconds=39000),
+            ),
+            [
+                "[beaubonhomme] insufficient average gpu_util: 19816.229166666668 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
+                # "[grosbonhomme] insufficient average gpu_util: 9023.729166666666 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
+                "[petitbonhomme] insufficient average gpu_util: 28780.0 GPU-seconds; minimum required: 8:00:00 (28800.0 GPU-seconds)",
+            ],
+        ),
+    ],
+)
+def test_alert_gpu_util_per_user(params, expected, caplog, monkeypatch):
+    monkeypatch.setattr(
+        "sarc.jobs.series.get_job_time_series", generate_fake_timeseries
+    )
+
+    for job in get_jobs():
+        job.statistics(save=True)
+
+    check_gpu_util_per_user(**params)
+    assert get_warnings(caplog.text) == expected