SARC-271: Update data frame with RGU data (#113)

* SARC-271: Update data frame with RGU data * Log warnings if RGU start date or RGU/GPU ratios are not available. * Rename function to update_cluster_job_series_rgu and make sure only jobs related to given cluster name are updated. Add new function update_job_series_rgu to apply changes on data frame for all clusters. * Log a warning if RGU file is not specified and another warning if RGU file is not found. --------- Co-authored-by: Bruno Carrez <[email protected]>
mila-iqia · Apr 5, 2024 · 44eabe9 · 44eabe9
1 parent 59e47b7
commit 44eabe9
Show file tree

Hide file tree

Showing 11 changed files with 631 additions and 14 deletions.
diff --git a/config/sarc-dev.json b/config/sarc-dev.json
@@ -40,7 +40,9 @@
             "diskusage_report_command": "diskusage_report --project --all_users",
             "prometheus_url": "https://mila-thanos.calculquebec.ca",
             "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-            "start_date": "2022-04-01"
+            "start_date": "2022-04-01",
+            "rgu_start_date": "2023-11-28",
+            "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
         },
         "beluga": {
             "host": "beluga.computecanada.ca",
@@ -52,7 +54,9 @@
             "diskusage_report_command": "diskusage_report --project --all_users",
             "prometheus_url": "https://mila-thanos.calculquebec.ca",
             "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-            "start_date": "2022-04-01"
+            "start_date": "2022-04-01",
+            "rgu_start_date": "2024-04-03",
+            "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
         },
         "graham": {
             "host": "graham.computecanada.ca",
@@ -65,7 +69,9 @@
             "prometheus_url": null,
             "prometheus_headers_file": null,
             "start_date": "2022-04-01",
-            "nodes_info_file": "secrets/nodes_graham.txt"
+            "nodes_info_file": "secrets/nodes_graham.txt",
+            "rgu_start_date": "2024-04-03",
+            "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
         },
         "cedar": {
             "host": "cedar.computecanada.ca",
@@ -78,7 +84,9 @@
             "prometheus_url": null,
             "prometheus_headers_file": null,
             "start_date": "2022-04-01",
-            "nodes_info_file": "secrets/nodes_cedar.txt"
+            "nodes_info_file": "secrets/nodes_cedar.txt",
+            "rgu_start_date": "2024-04-03",
+            "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
         }
     }
 }
diff --git a/config/sarc-prod.json b/config/sarc-prod.json
@@ -40,7 +40,9 @@
             "diskusage_report_command": "diskusage_report --project --all_users",
             "prometheus_url": "https://mila-thanos.calculquebec.ca",
             "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-            "start_date": "2022-04-01"
+            "start_date": "2022-04-01",
+            "rgu_start_date": "2023-11-28",
+            "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
         },
         "beluga": {
             "host": "beluga.computecanada.ca",
@@ -52,7 +54,9 @@
             "diskusage_report_command": "diskusage_report --project --all_users",
             "prometheus_url": "https://mila-thanos.calculquebec.ca",
             "prometheus_headers_file": "secrets/drac_prometheus/headers.json",
-            "start_date": "2022-04-01"
+            "start_date": "2022-04-01",
+            "rgu_start_date": "2024-04-03",
+            "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
         },
         "graham": {
             "host": "graham.computecanada.ca",
@@ -65,7 +69,9 @@
             "prometheus_url": null,
             "prometheus_headers_file": null,
             "start_date": "2022-04-01",
-            "nodes_info_file": "secrets/nodes_graham.txt"
+            "nodes_info_file": "secrets/nodes_graham.txt",
+            "rgu_start_date": "2024-04-03",
+            "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
         },
         "cedar": {
             "host": "cedar.computecanada.ca",
@@ -78,7 +84,9 @@
             "prometheus_url": null,
             "prometheus_headers_file": null,
             "start_date": "2022-04-01",
-            "nodes_info_file": "secrets/nodes_cedar.txt"
+            "nodes_info_file": "secrets/nodes_cedar.txt",
+            "rgu_start_date": "2024-04-03",
+            "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
         }
     }
 }
diff --git a/sarc/config.py b/sarc/config.py
@@ -83,6 +83,8 @@ class ClusterConfig(BaseModel):
     duc_storage_command: str = None
     diskusage_report_command: str = None
     start_date: str = "2022-04-01"
+    rgu_start_date: str = None
+    gpu_to_rgu_billing: Path = None
 
     @validator("timezone")
     def _timezone(cls, value):

diff --git a/sarc/jobs/series.py b/sarc/jobs/series.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+import json
+import logging
+import os.path
 from datetime import datetime, timedelta
 from typing import TYPE_CHECKING, Callable
 
@@ -9,7 +12,7 @@
 from prometheus_api_client import MetricRangeDataFrame
 from tqdm import tqdm
 
-from sarc.config import MTL, UTC
+from sarc.config import MTL, UTC, ClusterConfig, config
 from sarc.jobs.job import JobStatistics, Statistics, count_jobs, get_jobs
 from sarc.traces import trace_decorator
 
@@ -401,6 +404,174 @@ def load_job_series(
     return pandas.DataFrame(rows)
 
 
+def update_cluster_job_series_rgu(
+    df: pandas.DataFrame, cluster_config: ClusterConfig
+) -> pandas.DataFrame:
+    """
+    Compute RGU information for jobs related to given cluster config in a data frame.
+
+    Parameters
+    ----------
+    df: DataFrame
+        Data frame to update, typically returned by `load_job_series`.
+        Should contain fields:
+        "cluster_name", "start_time", "allocated.gpu_type", "allocated.gres_gpu".
+    cluster_config: ClusterConfig
+        Configuration of cluster to which jobs to update belong.
+        Should define following config:
+        "rgu_start_date": date since when billing is given as RGU.
+        "gpu_to_rgu_billing": path to a JSON file containing a dict which maps
+        GPU type to RGU cost per GPU.
+
+    Returns
+    -------
+    DataFrame
+        Input data frame with:
+        - column `allocated.gres_gpu` updated if necessary.
+        - column `allocated.gres_rgu` added or updated to contain RGU billing.
+          Set to NaN (or unchanged if already present) for jobs from other clusters.
+        - column `gpu_type_rgu` added or updated to contain RGU cost per GPU (RGU/GPU ratio).
+          Set to NaN (or unchanged if already present) for jobs from other clusters.
+
+    Pseudocode describing how we update data frame:
+    for each job: if job.cluster_name == cluster_config.name:
+        if start_time < cluster_config.rgu_start_date:
+            # We are BEFORE transition to RGU
+            if allocated.gpu_type in gpu_to_rgu_billing:
+                # compute rgu columns
+                allocated.gres_rgu = allocated.gres_gpu * gpu_to_rgu_billing[allocated.gpu_type]
+                allocated.gpu_type_rgu = gpu_to_rgu_billing[allocated.gpu_type]
+            else:
+                # set rgu columns to nan
+                allocated.gres_rgu = nan
+                allocated.gpu_type_rgu = nan
+        else:
+            # We are AFTER transition to RGU
+            # Anyway, we assume gres_rgu is current gres_gpu
+            allocated.gres_rgu = allocated.gres_gpu
+
+            if allocated.gpu_type in gpu_to_rgu_billing:
+                # we fix gres_gpu by dividing it with RGU/GPU ratio
+                allocated.gres_gpu = allocated.gres_gpu / gpu_to_rgu_billing[allocated.gpu_type]
+                # we save RGU/GPU ratio
+                allocated.gpu_type_rgu = gpu_to_rgu_billing[allocated.gpu_type]
+            else:
+                # we cannot fix gres_gpu, so we set it to nan
+                allocated.gres_gpu = nan
+                # we cannot get RGU/GPU ratio, so we set it to nan
+                allocated.gpu_type_rgu = nan
+    """
+
+    # Make sure frame will have new RGU columns anyway, with NaN as default value.
+    if "allocated.gres_rgu" not in df.columns:
+        df["allocated.gres_rgu"] = np.nan
+    if "allocated.gpu_type_rgu" not in df.columns:
+        df["allocated.gpu_type_rgu"] = np.nan
+
+    if cluster_config.rgu_start_date is None:
+        logging.warning(
+            f"RGU update: no RGU start date for cluster {cluster_config.name}"
+        )
+        return df
+
+    if cluster_config.gpu_to_rgu_billing is None:
+        logging.warning(
+            f"RGU update: no RGU/GPU JSON path for cluster {cluster_config.name}"
+        )
+        return df
+
+    if not os.path.isfile(cluster_config.gpu_to_rgu_billing):
+        logging.warning(
+            f"RGU update: RGU/GPU JSON file not found for cluster {cluster_config.name} "
+            f"at: {cluster_config.gpu_to_rgu_billing}"
+        )
+        return df
+
+    # Otherwise, parse RGU start date.
+    rgu_start_date = datetime.fromisoformat(cluster_config.rgu_start_date).astimezone(
+        MTL
+    )
+
+    # Get RGU/GPU ratios.
+    with open(cluster_config.gpu_to_rgu_billing, "r", encoding="utf-8") as file:
+        gpu_to_rgu_billing = json.load(file)
+        assert isinstance(gpu_to_rgu_billing, dict)
+    if not gpu_to_rgu_billing:
+        logging.warning(
+            f"RGU update: no RGU/GPU available for cluster {cluster_config.name}"
+        )
+        return df
+
+    # We have now both RGU stare date and RGU/GPU ratios. We can update columns.
+
+    # Compute column allocated.gpu_type_rgu
+    # If a GPU type is not found in RGU/GPU ratios,
+    # then ratio will be set to NaN in output column.
+    col_ratio_rgu_by_gpu = df["allocated.gpu_type"].map(gpu_to_rgu_billing)
+
+    # Compute slices for both before and since RGU start date.
+    slice_before_rgu_time = (df["cluster_name"] == cluster_config.name) & (
+        df["start_time"] < rgu_start_date
+    )
+    slice_after_rgu_time = (df["cluster_name"] == cluster_config.name) & (
+        df["start_time"] >= rgu_start_date
+    )
+
+    # We can already set column allocated.gpu_type_rgu anyway.
+    df.loc[slice_before_rgu_time, "allocated.gpu_type_rgu"] = col_ratio_rgu_by_gpu[
+        slice_before_rgu_time
+    ]
+    df.loc[slice_after_rgu_time, "allocated.gpu_type_rgu"] = col_ratio_rgu_by_gpu[
+        slice_after_rgu_time
+    ]
+
+    # Compute allocated.gres_rgu where job started before RGU time.
+    df.loc[slice_before_rgu_time, "allocated.gres_rgu"] = (
+        df["allocated.gres_gpu"][slice_before_rgu_time]
+        * col_ratio_rgu_by_gpu[slice_before_rgu_time]
+    )
+
+    # Set allocated.gres_rgu with previous allocated.gres_gpu where job started after RGU time.
+    df.loc[slice_after_rgu_time, "allocated.gres_rgu"] = df["allocated.gres_gpu"][
+        slice_after_rgu_time
+    ]
+    # Then update allocated.gres_gpu where job started after RGU time.
+    df.loc[slice_after_rgu_time, "allocated.gres_gpu"] = (
+        df["allocated.gres_gpu"][slice_after_rgu_time]
+        / col_ratio_rgu_by_gpu[slice_after_rgu_time]
+    )
+
+    return df
+
+
+def update_job_series_rgu(df: DataFrame):
+    """
+    Compute RGU information for jobs in given data frame.
+
+    Parameters
+    ----------
+    df: DataFrame
+        Data frame to update, typically returned by `load_job_series`.
+        Should contain fields:
+         "cluster_name", "start_time", "allocated.gpu_type", "allocated.gres_gpu".
+
+    Returns
+    -------
+    DataFrame
+        Input data frame with:
+        - column `allocated.gres_gpu` updated if necessary.
+        - column `allocated.gres_rgu` added or updated to contain RGU billing.
+          Set to NaN (or unchanged if already present) for jobs from clusters without RGU.
+        - column `gpu_type_rgu` added or updated to contain RGU cost per GPU (RGU/GPU ratio).
+          Set to NaN (or unchanged if already present) for jobs from clusters without RGU.
+
+    For more details about implementation, see function `update_cluster_job_series_rgu`
+    """
+    for cluster_config in config().clusters.values():
+        update_cluster_job_series_rgu(df, cluster_config)
+    return df
+
+
 def _select_stat(name, dist):
     if not dist:
         return np.nan