From 44eabe92e6f4c693266f66f891a7e5bcb88dcd35 Mon Sep 17 00:00:00 2001 From: notoraptor Date: Fri, 5 Apr 2024 13:56:21 -0400 Subject: [PATCH] SARC-271: Update data frame with RGU data (#113) * SARC-271: Update data frame with RGU data * Log warnings if RGU start date or RGU/GPU ratios are not available. * Rename function to update_cluster_job_series_rgu and make sure only jobs related to given cluster name are updated. Add new function update_job_series_rgu to apply changes on data frame for all clusters. * Log a warning if RGU file is not specified and another warning if RGU file is not found. --------- Co-authored-by: Bruno Carrez --- config/sarc-dev.json | 16 +- config/sarc-prod.json | 16 +- sarc/config.py | 2 + sarc/jobs/series.py | 173 ++++++++- .../jobs/test_func_update_job_series_rgu.py | 360 ++++++++++++++++++ ...ate_job_series_rgu_with_real_test_data.txt | 41 ++ .../gpu_to_rgu_billing.json | 4 + .../gpu_to_rgu_billing_empty.json | 1 + .../patate_prometheus/gpu_to_rgu_billing.json | 3 + .../raisin_prometheus/gpu_to_rgu_billing.json | 11 + tests/sarc-test.json | 18 +- 11 files changed, 631 insertions(+), 14 deletions(-) create mode 100644 tests/functional/jobs/test_func_update_job_series_rgu.py create mode 100644 tests/functional/jobs/test_func_update_job_series_rgu/test_update_job_series_rgu_with_real_test_data.txt create mode 100644 tests/not-so-secrets/fromage_prometheus/gpu_to_rgu_billing.json create mode 100644 tests/not-so-secrets/gerudo_prometheus/gpu_to_rgu_billing_empty.json create mode 100644 tests/not-so-secrets/patate_prometheus/gpu_to_rgu_billing.json create mode 100644 tests/not-so-secrets/raisin_prometheus/gpu_to_rgu_billing.json diff --git a/config/sarc-dev.json b/config/sarc-dev.json index 350f44c6..bde70c31 100644 --- a/config/sarc-dev.json +++ b/config/sarc-dev.json @@ -40,7 +40,9 @@ "diskusage_report_command": "diskusage_report --project --all_users", "prometheus_url": "https://mila-thanos.calculquebec.ca", "prometheus_headers_file": "secrets/drac_prometheus/headers.json", - "start_date": "2022-04-01" + "start_date": "2022-04-01", + "rgu_start_date": "2023-11-28", + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json" }, "beluga": { "host": "beluga.computecanada.ca", @@ -52,7 +54,9 @@ "diskusage_report_command": "diskusage_report --project --all_users", "prometheus_url": "https://mila-thanos.calculquebec.ca", "prometheus_headers_file": "secrets/drac_prometheus/headers.json", - "start_date": "2022-04-01" + "start_date": "2022-04-01", + "rgu_start_date": "2024-04-03", + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json" }, "graham": { "host": "graham.computecanada.ca", @@ -65,7 +69,9 @@ "prometheus_url": null, "prometheus_headers_file": null, "start_date": "2022-04-01", - "nodes_info_file": "secrets/nodes_graham.txt" + "nodes_info_file": "secrets/nodes_graham.txt", + "rgu_start_date": "2024-04-03", + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json" }, "cedar": { "host": "cedar.computecanada.ca", @@ -78,7 +84,9 @@ "prometheus_url": null, "prometheus_headers_file": null, "start_date": "2022-04-01", - "nodes_info_file": "secrets/nodes_cedar.txt" + "nodes_info_file": "secrets/nodes_cedar.txt", + "rgu_start_date": "2024-04-03", + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json" } } } diff --git a/config/sarc-prod.json b/config/sarc-prod.json index aeb952c6..543b3bb6 100644 --- a/config/sarc-prod.json +++ b/config/sarc-prod.json @@ -40,7 +40,9 @@ "diskusage_report_command": "diskusage_report --project --all_users", 
"prometheus_url": "https://mila-thanos.calculquebec.ca", "prometheus_headers_file": "secrets/drac_prometheus/headers.json", - "start_date": "2022-04-01" + "start_date": "2022-04-01", + "rgu_start_date": "2023-11-28", + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json" }, "beluga": { "host": "beluga.computecanada.ca", @@ -52,7 +54,9 @@ "diskusage_report_command": "diskusage_report --project --all_users", "prometheus_url": "https://mila-thanos.calculquebec.ca", "prometheus_headers_file": "secrets/drac_prometheus/headers.json", - "start_date": "2022-04-01" + "start_date": "2022-04-01", + "rgu_start_date": "2024-04-03", + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json" }, "graham": { "host": "graham.computecanada.ca", @@ -65,7 +69,9 @@ "prometheus_url": null, "prometheus_headers_file": null, "start_date": "2022-04-01", - "nodes_info_file": "secrets/nodes_graham.txt" + "nodes_info_file": "secrets/nodes_graham.txt", + "rgu_start_date": "2024-04-03", + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json" }, "cedar": { "host": "cedar.computecanada.ca", @@ -78,7 +84,9 @@ "prometheus_url": null, "prometheus_headers_file": null, "start_date": "2022-04-01", - "nodes_info_file": "secrets/nodes_cedar.txt" + "nodes_info_file": "secrets/nodes_cedar.txt", + "rgu_start_date": "2024-04-03", + "gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json" } } } diff --git a/sarc/config.py b/sarc/config.py index 8562cc5d..71458c97 100644 --- a/sarc/config.py +++ b/sarc/config.py @@ -83,6 +83,8 @@ class ClusterConfig(BaseModel): duc_storage_command: str = None diskusage_report_command: str = None start_date: str = "2022-04-01" + rgu_start_date: str = None + gpu_to_rgu_billing: Path = None @validator("timezone") def _timezone(cls, value): diff --git a/sarc/jobs/series.py b/sarc/jobs/series.py index f47c408a..7b083491 100644 --- a/sarc/jobs/series.py +++ b/sarc/jobs/series.py @@ -1,5 +1,8 @@ from __future__ import annotations +import json +import logging +import os.path from datetime import datetime, timedelta from typing import TYPE_CHECKING, Callable @@ -9,7 +12,7 @@ from prometheus_api_client import MetricRangeDataFrame from tqdm import tqdm -from sarc.config import MTL, UTC +from sarc.config import MTL, UTC, ClusterConfig, config from sarc.jobs.job import JobStatistics, Statistics, count_jobs, get_jobs from sarc.traces import trace_decorator @@ -401,6 +404,174 @@ def load_job_series( return pandas.DataFrame(rows) +def update_cluster_job_series_rgu( + df: pandas.DataFrame, cluster_config: ClusterConfig +) -> pandas.DataFrame: + """ + Compute RGU information for jobs related to given cluster config in a data frame. + + Parameters + ---------- + df: DataFrame + Data frame to update, typically returned by `load_job_series`. + Should contain fields: + "cluster_name", "start_time", "allocated.gpu_type", "allocated.gres_gpu". + cluster_config: ClusterConfig + Configuration of cluster to which jobs to update belong. + Should define following config: + "rgu_start_date": date since when billing is given as RGU. + "gpu_to_rgu_billing": path to a JSON file containing a dict which maps + GPU type to RGU cost per GPU. + + Returns + ------- + DataFrame + Input data frame with: + - column `allocated.gres_gpu` updated if necessary. + - column `allocated.gres_rgu` added or updated to contain RGU billing. + Set to NaN (or unchanged if already present) for jobs from other clusters. + - column `gpu_type_rgu` added or updated to contain RGU cost per GPU (RGU/GPU ratio). 
+          Set to NaN (or unchanged if already present) for jobs from other clusters.
+
+    Pseudocode describing how we update the data frame:
+    for each job: if job.cluster_name == cluster_config.name:
+        if start_time < cluster_config.rgu_start_date:
+            # We are BEFORE the transition to RGU
+            if allocated.gpu_type in gpu_to_rgu_billing:
+                # compute rgu columns
+                allocated.gres_rgu = allocated.gres_gpu * gpu_to_rgu_billing[allocated.gpu_type]
+                allocated.gpu_type_rgu = gpu_to_rgu_billing[allocated.gpu_type]
+            else:
+                # set rgu columns to nan
+                allocated.gres_rgu = nan
+                allocated.gpu_type_rgu = nan
+        else:
+            # We are AFTER the transition to RGU
+            # Either way, we assume gres_rgu is the current gres_gpu
+            allocated.gres_rgu = allocated.gres_gpu
+
+            if allocated.gpu_type in gpu_to_rgu_billing:
+                # we fix gres_gpu by dividing it by the RGU/GPU ratio
+                allocated.gres_gpu = allocated.gres_gpu / gpu_to_rgu_billing[allocated.gpu_type]
+                # we save the RGU/GPU ratio
+                allocated.gpu_type_rgu = gpu_to_rgu_billing[allocated.gpu_type]
+            else:
+                # we cannot fix gres_gpu, so we set it to nan
+                allocated.gres_gpu = nan
+                # we cannot get the RGU/GPU ratio, so we set it to nan
+                allocated.gpu_type_rgu = nan
+    """
+
+    # Make sure the frame gets the new RGU columns in any case, with NaN as default value.
+    if "allocated.gres_rgu" not in df.columns:
+        df["allocated.gres_rgu"] = np.nan
+    if "allocated.gpu_type_rgu" not in df.columns:
+        df["allocated.gpu_type_rgu"] = np.nan
+
+    if cluster_config.rgu_start_date is None:
+        logging.warning(
+            f"RGU update: no RGU start date for cluster {cluster_config.name}"
+        )
+        return df
+
+    if cluster_config.gpu_to_rgu_billing is None:
+        logging.warning(
+            f"RGU update: no RGU/GPU JSON path for cluster {cluster_config.name}"
+        )
+        return df
+
+    if not os.path.isfile(cluster_config.gpu_to_rgu_billing):
+        logging.warning(
+            f"RGU update: RGU/GPU JSON file not found for cluster {cluster_config.name} "
+            f"at: {cluster_config.gpu_to_rgu_billing}"
+        )
+        return df
+
+    # Otherwise, parse RGU start date.
+    rgu_start_date = datetime.fromisoformat(cluster_config.rgu_start_date).astimezone(
+        MTL
+    )
+
+    # Get RGU/GPU ratios.
+    with open(cluster_config.gpu_to_rgu_billing, "r", encoding="utf-8") as file:
+        gpu_to_rgu_billing = json.load(file)
+        assert isinstance(gpu_to_rgu_billing, dict)
+    if not gpu_to_rgu_billing:
+        logging.warning(
+            f"RGU update: no RGU/GPU ratios available for cluster {cluster_config.name}"
+        )
+        return df
+
+    # We now have both the RGU start date and the RGU/GPU ratios. We can update the columns.
+
+    # Compute column allocated.gpu_type_rgu.
+    # If a GPU type is not found in the RGU/GPU ratios,
+    # then the ratio will be set to NaN in the output column.
+    col_ratio_rgu_by_gpu = df["allocated.gpu_type"].map(gpu_to_rgu_billing)
+
+    # Compute slices for jobs started before and since the RGU start date.
+    slice_before_rgu_time = (df["cluster_name"] == cluster_config.name) & (
+        df["start_time"] < rgu_start_date
+    )
+    slice_after_rgu_time = (df["cluster_name"] == cluster_config.name) & (
+        df["start_time"] >= rgu_start_date
+    )
+
+    # Column allocated.gpu_type_rgu is set the same way in both cases.
+    df.loc[slice_before_rgu_time, "allocated.gpu_type_rgu"] = col_ratio_rgu_by_gpu[
+        slice_before_rgu_time
+    ]
+    df.loc[slice_after_rgu_time, "allocated.gpu_type_rgu"] = col_ratio_rgu_by_gpu[
+        slice_after_rgu_time
+    ]
+
+    # Compute allocated.gres_rgu where the job started before RGU time.
+ df.loc[slice_before_rgu_time, "allocated.gres_rgu"] = ( + df["allocated.gres_gpu"][slice_before_rgu_time] + * col_ratio_rgu_by_gpu[slice_before_rgu_time] + ) + + # Set allocated.gres_rgu with previous allocated.gres_gpu where job started after RGU time. + df.loc[slice_after_rgu_time, "allocated.gres_rgu"] = df["allocated.gres_gpu"][ + slice_after_rgu_time + ] + # Then update allocated.gres_gpu where job started after RGU time. + df.loc[slice_after_rgu_time, "allocated.gres_gpu"] = ( + df["allocated.gres_gpu"][slice_after_rgu_time] + / col_ratio_rgu_by_gpu[slice_after_rgu_time] + ) + + return df + + +def update_job_series_rgu(df: DataFrame): + """ + Compute RGU information for jobs in given data frame. + + Parameters + ---------- + df: DataFrame + Data frame to update, typically returned by `load_job_series`. + Should contain fields: + "cluster_name", "start_time", "allocated.gpu_type", "allocated.gres_gpu". + + Returns + ------- + DataFrame + Input data frame with: + - column `allocated.gres_gpu` updated if necessary. + - column `allocated.gres_rgu` added or updated to contain RGU billing. + Set to NaN (or unchanged if already present) for jobs from clusters without RGU. + - column `gpu_type_rgu` added or updated to contain RGU cost per GPU (RGU/GPU ratio). + Set to NaN (or unchanged if already present) for jobs from clusters without RGU. + + For more details about implementation, see function `update_cluster_job_series_rgu` + """ + for cluster_config in config().clusters.values(): + update_cluster_job_series_rgu(df, cluster_config) + return df + + def _select_stat(name, dist): if not dist: return np.nan diff --git a/tests/functional/jobs/test_func_update_job_series_rgu.py b/tests/functional/jobs/test_func_update_job_series_rgu.py new file mode 100644 index 00000000..c38ac58e --- /dev/null +++ b/tests/functional/jobs/test_func_update_job_series_rgu.py @@ -0,0 +1,360 @@ +import json +from datetime import datetime +from pprint import pformat +from typing import Dict + +import numpy as np +import pandas +import pytest + +from sarc.config import MTL, ClusterConfig, config +from sarc.jobs.series import ( + load_job_series, + update_cluster_job_series_rgu, + update_job_series_rgu, +) + +from .test_func_load_job_series import MOCK_TIME + + +def _gen_data_frame( + cluster_names: list, start_times=[], gres_gpu: list = [], gpu_type: list = [] +): + """Generate a data frame suited for RGU tests.""" + assert len(cluster_names) == len(start_times) == len(gres_gpu) == len(gpu_type) + rows = [ + { + "cluster_name": cluster_name, + "start_time": start_time, + "allocated.gres_gpu": gres_gpu, + "allocated.gpu_type": gpu_type, + } + for cluster_name, start_time, gres_gpu, gpu_type in zip( + cluster_names, start_times, gres_gpu, gpu_type + ) + ] + frame = pandas.DataFrame(rows) + assert frame.shape == (len(gres_gpu), 4 if len(gres_gpu) else 0) + return frame + + +def _read_json(filename): + with open(filename, "r", encoding="utf-8") as file: + return json.load(file) + + +# Below, we generate fixtures for cluster configs used in these tests. +# There are 5 clusters: +# - no rgu date, no RGU mapping +# - no rgu date, only RGU mapping +# - only rgu date, no RGU mapping +# - rgu date, empty RGU mapping +# - rgu date, RGU mapping +# With 4 first configs, frame should not be updated, +# as either rgu date is missing or RGU mapping is missing or empty. +# With 5th config, frame should be updated, as all required data are available. 
+ + +@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl") +@pytest.fixture +def clusters_config(): + clusters: Dict[str, ClusterConfig] = config().clusters + return clusters + + +@pytest.fixture +def cluster_no_rgu(clusters_config): + return clusters_config["hyrule"] + + +@pytest.fixture +def cluster_only_rgu_start_date(clusters_config): + return clusters_config["local"] + + +@pytest.fixture +def cluster_only_rgu_billing(clusters_config): + return clusters_config["patate"] + + +@pytest.fixture +def cluster_full_rgu_empty_billing(clusters_config): + return clusters_config["gerudo"] + + +@pytest.fixture +def cluster_full_rgu(clusters_config): + return clusters_config["raisin"] + + +@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl") +def test_clusters_rgu_config( + cluster_no_rgu, + cluster_only_rgu_start_date, + cluster_only_rgu_billing, + cluster_full_rgu_empty_billing, + cluster_full_rgu, +): + """Just check clusters config.""" + assert cluster_no_rgu.rgu_start_date is None + assert cluster_no_rgu.gpu_to_rgu_billing is None + + assert cluster_only_rgu_start_date.rgu_start_date is not None + assert cluster_only_rgu_start_date.gpu_to_rgu_billing is None + + assert cluster_only_rgu_billing.rgu_start_date is None + assert cluster_only_rgu_billing.gpu_to_rgu_billing is not None + + assert cluster_full_rgu_empty_billing.rgu_start_date is not None + assert cluster_full_rgu_empty_billing.gpu_to_rgu_billing is not None + assert _read_json(cluster_full_rgu_empty_billing.gpu_to_rgu_billing) == {} + + assert cluster_full_rgu.rgu_start_date is not None + assert cluster_full_rgu.gpu_to_rgu_billing is not None + gpu_to_rgu_billing = _read_json(cluster_full_rgu.gpu_to_rgu_billing) + assert isinstance(gpu_to_rgu_billing, dict) + assert len(gpu_to_rgu_billing) + + +@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl") +def test_data_frame_output_size( + cluster_no_rgu, + cluster_only_rgu_start_date, + cluster_only_rgu_billing, + cluster_full_rgu_empty_billing, + cluster_full_rgu, +): + """ + Check that nothing is computed if cluster does not have both + RGU start time and non-empty RGU/GPU ratio JSON file. 
+ """ + cluster_names = ["raisin"] * 5 + start_times = [ + datetime.strptime(date, "%Y-%m-%d").astimezone(MTL) + for date in ( + "2023-02-14", + "2023-02-15", + "2023-02-16", + "2023-02-17", + "2023-02-18", + ) + ] + gres_gpu = [1, 2, 3, 4, 5] + gpu_type = [ + "raisin_gpu_1", + "raisin_gpu_2", + "raisin_gpu_3", + "raisin_gpu_4", + "raisin_gpu_5", + ] + + nans = pandas.Series([np.nan] * 5) + + frame = _gen_data_frame(cluster_names, start_times, gres_gpu, gpu_type) + assert frame.shape == (5, 4) + assert "allocated.gres_rgu" not in frame.columns + assert "allocated.gpu_type_rgu" not in frame.columns + + update_cluster_job_series_rgu(frame, cluster_no_rgu) + assert frame.shape == (5, 6) + assert frame["allocated.gres_rgu"].equals(nans) + assert frame["allocated.gpu_type_rgu"].equals(nans) + + update_cluster_job_series_rgu(frame, cluster_only_rgu_start_date) + assert frame.shape == (5, 6) + assert frame["allocated.gres_rgu"].equals(nans) + assert frame["allocated.gpu_type_rgu"].equals(nans) + + update_cluster_job_series_rgu(frame, cluster_only_rgu_billing) + assert frame.shape == (5, 6) + assert frame["allocated.gres_rgu"].equals(nans) + assert frame["allocated.gpu_type_rgu"].equals(nans) + + update_cluster_job_series_rgu(frame, cluster_full_rgu_empty_billing) + assert frame.shape == (5, 6) + assert frame["allocated.gres_rgu"].equals(nans) + assert frame["allocated.gpu_type_rgu"].equals(nans) + + # Then, with full config, we should have updates. + update_cluster_job_series_rgu(frame, cluster_full_rgu) + assert frame.shape == (5, 6) + assert not frame["allocated.gres_rgu"].equals(nans) + assert not frame["allocated.gpu_type_rgu"].equals(nans) + + +def _gen_complex_data_frame(): + cluster_names = (["raisin"] * 9) + ["fromage", "patate", "fromage"] + start_times = [ + datetime.strptime(date, "%Y-%m-%d").astimezone(MTL) + for date in ( + "2023-02-12", + "2023-02-13", + "2023-02-14", + "2023-02-15", + "2023-02-16", + "2023-02-17", + "2023-02-18", + "2023-02-19", + "2023-02-20", + "2023-02-21", # job belongs to cluster fromage + "2023-02-21", # job belongs to cluster patate + "2023-02-22", # job belongs to cluster fromage + ) + ] + gres_gpu = [1, 2, 3, 4, 5000, 6000, 7000, 8000, 9000, 123, 5678, 91011] + gpu_type = [ + "raisin_gpu_unknown_1", + "raisin_gpu_unknown_2", + "raisin_gpu_3", + "raisin_gpu_4", + "raisin_gpu_5", + "raisin_gpu_unknown_6", + "A100", + "raisin_gpu_unknown_8", + "raisin_gpu_unknown_9", + "fromage_gpu_1", # job belongs to cluster fromage + "patate_gpu_9", # job belongs to cluster patate + "fromage_gpu_2", # job belongs to cluster fromage + ] + return _gen_data_frame(cluster_names, start_times, gres_gpu, gpu_type) + + +def _get_expected_columns_with_cluster_raisin(): + """ + Return expected columns when complex data frame is updated using only cluster raisin. 
+ """ + expected_gres_gpu = [ + 1.0, # before 2023-02-16, should not change (even if GPU type is unknown) + 2.0, # before 2023-02-16, should not change (even if GPU type is unknown) + 3.0, # before 2023-02-16, should not change + 4.0, # before 2023-02-16, should not change + 5000 / 500, # from 2023-12-16, should be divided by RGU/GPU ratio + np.nan, # from 2023-12-16, unknown GPU type, should be nan + 7000 / 700, # from 2023-12-16, should be divided by RGU/GPU ratio + np.nan, # from 2023-12-16, unknown GPU type, should be nan + np.nan, # from 2023-12-16, unknown GPU type, should be nan + 123, # job does not belong to cluster raisin, then should not change + 5678, # job does not belong to cluster raisin, then should not change + 91011, # job does not belong to cluster raisin, then should not change + ] + expected_gres_rgu = [ + np.nan, # before 2023-12-16, unknown GPU type, should be nan + np.nan, # before 2023-12-16, unknown GPU type, should be nan + 3 * 300.0, # before 2023-12-16, should be gres_gpu * RGU/GPU ratio + 4 * 400.0, # before 2023-12-16, should be gres_gpu * RGU/GPU ratio + 5000.0, # from 2023-12-16, should be gres_gpu + 6000.0, # from 2023-12-16, should be gres_gpu (even if GPU type is unknown) + 7000.0, # from 2023-12-16, should be gres_gpu + 8000.0, # from 2023-12-16, should be gres_gpu (even if GPU type is unknown) + 9000.0, # from 2023-12-16, should be gres_gpu (even if GPU type is unknown) + np.nan, # job does not belong to cluster raisin, then should have nan here + np.nan, # job does not belong to cluster raisin, then should have nan here + np.nan, # job does not belong to cluster raisin, then should have nan here + ] + expected_gpu_type_rgu = [ + np.nan, # GPU type unknown, should be nan + np.nan, # GPU type unknown, should be nan + 300, # GPU type exists in RGU map, should be copied here + 400, # GPU type exists in RGU map, should be copied here + 500, # GPU type exists in RGU map, should be copied here + np.nan, # GPU type unknown, should be nan + 700, # GPU type exists in RGU map, should be copied here + np.nan, # GPU type unknown, should be nan + np.nan, # GPU type unknown, should be nan + np.nan, # job does not belong to cluster raisin, then should have nan here + np.nan, # job does not belong to cluster raisin, then should have nan here + np.nan, # job does not belong to cluster raisin, then should have nan here + ] + + return expected_gres_gpu, expected_gres_rgu, expected_gpu_type_rgu + + +@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl") +def test_update_cluster_job_series_rgu(cluster_full_rgu): + """Concrete test for 1 cluster with a generated frame.""" + assert cluster_full_rgu.rgu_start_date == "2023-02-16" + frame = _gen_complex_data_frame() + assert frame.shape == (12, 4) + assert "allocated.gres_rgu" not in frame.columns + assert "allocated.gpu_type_rgu" not in frame.columns + + returned_frame = update_cluster_job_series_rgu(frame, cluster_full_rgu) + assert frame is returned_frame + assert frame.shape == (12, 6) + assert "allocated.gres_rgu" in frame.columns + assert "allocated.gpu_type_rgu" in frame.columns + + ( + expected_gres_gpu, + expected_gres_rgu, + expected_gpu_type_rgu, + ) = _get_expected_columns_with_cluster_raisin() + assert frame["allocated.gres_gpu"].equals(pandas.Series(expected_gres_gpu)) + assert frame["allocated.gres_rgu"].equals(pandas.Series(expected_gres_rgu)) + assert frame["allocated.gpu_type_rgu"].equals(pandas.Series(expected_gpu_type_rgu)) + + +@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl") +def 
test_update_job_series_rgu():
+    """Concrete test for all clusters with a generated frame."""
+    frame = _gen_complex_data_frame()
+    assert frame.shape == (12, 4)
+    assert "allocated.gres_rgu" not in frame.columns
+    assert "allocated.gpu_type_rgu" not in frame.columns
+
+    returned_frame = update_job_series_rgu(frame)
+    assert frame is returned_frame
+    assert frame.shape == (12, 6)
+    assert "allocated.gres_rgu" in frame.columns
+    assert "allocated.gpu_type_rgu" in frame.columns
+
+    (
+        expected_gres_gpu,
+        expected_gres_rgu,
+        expected_gpu_type_rgu,
+    ) = _get_expected_columns_with_cluster_raisin()
+    expected_gres_gpu[-3:] = [
+        123.0,  # job belongs to cluster fromage before RGU, should not change
+        5678.0,  # job belongs to cluster patate, no RGU, so no change
+        91011 / 200,  # job belongs to cluster fromage after RGU, divided by RGU/GPU
+    ]
+    expected_gres_rgu[-3:] = [
+        123 * 100.0,  # job from cluster fromage before RGU: gres_gpu * RGU/GPU ratio
+        np.nan,  # job belongs to cluster patate, no RGU, so it should be nan here
+        91011.0,  # job belongs to cluster fromage after RGU, should be gres_gpu
+    ]
+    expected_gpu_type_rgu[-3:] = [
+        100.0,  # job belongs to cluster fromage, GPU type should be copied here
+        np.nan,  # job belongs to cluster patate, no RGU, so it should be nan here
+        200.0,  # job belongs to cluster fromage, GPU type should be copied here
+    ]
+    assert frame["allocated.gres_gpu"].equals(pandas.Series(expected_gres_gpu))
+    assert frame["allocated.gres_rgu"].equals(pandas.Series(expected_gres_rgu))
+    assert frame["allocated.gpu_type_rgu"].equals(pandas.Series(expected_gpu_type_rgu))
+
+
+@pytest.mark.freeze_time(MOCK_TIME)
+@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
+def test_update_job_series_rgu_with_real_test_data(cluster_full_rgu, file_regression):
+    """Concrete tests with jobs from read_only_db"""
+    frame = load_job_series()
+    update_cluster_job_series_rgu(frame, cluster_full_rgu)
+
+    def _df_to_pretty_str(df: pandas.DataFrame) -> str:
+        fields = [
+            "job_id",
+            "cluster_name",
+            "start_time",
+            "allocated.gpu_type",
+            "allocated.gres_gpu",
+            "allocated.gres_rgu",
+            "allocated.gpu_type_rgu",
+        ]
+        return df[fields].to_markdown()
+
+    file_regression.check(
+        f"Update job series RGU for {frame.shape[0]} job(s):\n\n"
+        f"RGU start date: {cluster_full_rgu.rgu_start_date}\n\n"
+        f"gpu_to_rgu_billing:\n{pformat(_read_json(cluster_full_rgu.gpu_to_rgu_billing))}\n\n"
+        f"{_df_to_pretty_str(frame)}"
+    )
diff --git a/tests/functional/jobs/test_func_update_job_series_rgu/test_update_job_series_rgu_with_real_test_data.txt b/tests/functional/jobs/test_func_update_job_series_rgu/test_update_job_series_rgu_with_real_test_data.txt
new file mode 100644
index 00000000..b4574e6c
--- /dev/null
+++ b/tests/functional/jobs/test_func_update_job_series_rgu/test_update_job_series_rgu_with_real_test_data.txt
@@ -0,0 +1,41 @@
+Update job series RGU for 24 job(s):
+
+RGU start date: 2023-02-16
+
+gpu_to_rgu_billing:
+{'A100': 700,
+ 'raisin_gpu_1': 100,
+ 'raisin_gpu_2': 200,
+ 'raisin_gpu_3': 300,
+ 'raisin_gpu_4': 400,
+ 'raisin_gpu_5': 500,
+ 'raisin_gpu_6': 600,
+ 'raisin_gpu_8': 800,
+ 'raisin_gpu_9': 900}
+
+| | job_id | cluster_name | start_time | allocated.gpu_type | allocated.gres_gpu | allocated.gres_rgu | allocated.gpu_type_rgu |
+|---:|----------:|:---------------|:--------------------------|:---------------------|---------------------:|---------------------:|-------------------------:|
+| 0 | 1 | raisin | 2023-02-14 00:01:00-05:00 | | 1 | nan | nan |
+| 1 | 2 | raisin | 
2023-02-14 06:01:00-05:00 | | 1 | nan | nan | +| 2 | 3 | raisin | 2023-02-14 12:01:00-05:00 | | 1 | nan | nan | +| 3 | 4 | raisin | 2023-02-14 18:01:00-05:00 | | 1 | nan | nan | +| 4 | 5 | raisin | 2023-02-15 00:01:00-05:00 | | 1 | nan | nan | +| 5 | 6 | raisin | 2023-02-15 06:01:00-05:00 | | 1 | nan | nan | +| 6 | 7 | raisin | 2023-11-21 07:00:00-05:00 | | nan | 1 | nan | +| 7 | 8 | raisin | 2023-11-21 07:00:00-05:00 | | nan | 1 | nan | +| 8 | 9 | raisin | 2023-02-16 00:01:00-05:00 | | nan | 1 | nan | +| 9 | 10 | raisin | 2023-02-16 00:01:00-05:00 | | nan | 1 | nan | +| 10 | 11 | raisin | 2023-02-16 00:01:00-05:00 | | nan | 1 | nan | +| 11 | 12 | raisin | 2023-02-16 18:01:00-05:00 | | nan | 1 | nan | +| 12 | 13 | raisin | 2023-02-17 00:01:00-05:00 | | nan | 1 | nan | +| 13 | 14 | raisin | 2023-02-17 06:01:00-05:00 | | nan | 1 | nan | +| 14 | 15 | fromage | 2023-02-17 12:01:00-05:00 | | 1 | nan | nan | +| 15 | 16 | patate | 2023-02-17 18:01:00-05:00 | | 1 | nan | nan | +| 16 | 17 | raisin | 2023-02-18 00:01:00-05:00 | | nan | 1 | nan | +| 17 | 18 | raisin | 2023-02-18 06:01:00-05:00 | | nan | 1 | nan | +| 18 | 19 | raisin | 2023-02-18 12:01:00-05:00 | | nan | 1 | nan | +| 19 | 20 | raisin | 2023-02-18 18:01:00-05:00 | | nan | 1 | nan | +| 20 | 1000000 | raisin | 2023-02-19 00:01:00-05:00 | | nan | 1 | nan | +| 21 | 1000000 | raisin | 2023-02-19 06:01:00-05:00 | | nan | 1 | nan | +| 22 | 23 | raisin | 2023-02-19 12:01:00-05:00 | A100 | 0.00285714 | 2 | 700 | +| 23 | 999999999 | raisin | 2023-02-19 18:01:00-05:00 | | nan | 0 | nan | \ No newline at end of file diff --git a/tests/not-so-secrets/fromage_prometheus/gpu_to_rgu_billing.json b/tests/not-so-secrets/fromage_prometheus/gpu_to_rgu_billing.json new file mode 100644 index 00000000..8a58e422 --- /dev/null +++ b/tests/not-so-secrets/fromage_prometheus/gpu_to_rgu_billing.json @@ -0,0 +1,4 @@ +{ + "fromage_gpu_1": 100, + "fromage_gpu_2": 200 +} diff --git a/tests/not-so-secrets/gerudo_prometheus/gpu_to_rgu_billing_empty.json b/tests/not-so-secrets/gerudo_prometheus/gpu_to_rgu_billing_empty.json new file mode 100644 index 00000000..0967ef42 --- /dev/null +++ b/tests/not-so-secrets/gerudo_prometheus/gpu_to_rgu_billing_empty.json @@ -0,0 +1 @@ +{} diff --git a/tests/not-so-secrets/patate_prometheus/gpu_to_rgu_billing.json b/tests/not-so-secrets/patate_prometheus/gpu_to_rgu_billing.json new file mode 100644 index 00000000..c021e828 --- /dev/null +++ b/tests/not-so-secrets/patate_prometheus/gpu_to_rgu_billing.json @@ -0,0 +1,3 @@ +{ + "patate_gpu_1": 700 +} diff --git a/tests/not-so-secrets/raisin_prometheus/gpu_to_rgu_billing.json b/tests/not-so-secrets/raisin_prometheus/gpu_to_rgu_billing.json new file mode 100644 index 00000000..b49f1135 --- /dev/null +++ b/tests/not-so-secrets/raisin_prometheus/gpu_to_rgu_billing.json @@ -0,0 +1,11 @@ +{ + "raisin_gpu_1": 100, + "raisin_gpu_2": 200, + "raisin_gpu_3": 300, + "raisin_gpu_4": 400, + "raisin_gpu_5": 500, + "raisin_gpu_6": 600, + "A100": 700, + "raisin_gpu_8": 800, + "raisin_gpu_9": 900 +} diff --git a/tests/sarc-test.json b/tests/sarc-test.json index c1355e4a..e6f286f2 100644 --- a/tests/sarc-test.json +++ b/tests/sarc-test.json @@ -29,7 +29,9 @@ "duc_inodes_command": null, "duc_storage_command": null, "diskusage_report_command": null, - "prometheus_url": "http://monitoring.server.raisin.quebec:9090/" + "prometheus_url": "http://monitoring.server.raisin.quebec:9090/", + "rgu_start_date": "2023-02-16", + "gpu_to_rgu_billing": "tests/not-so-secrets/raisin_prometheus/gpu_to_rgu_billing.json" }, 
"raisin_no_prometheus": { "host": "raisin_no_prometheus", @@ -51,7 +53,9 @@ "duc_storage_command": "duc ls -d /project/.duc_databases/rrg-bonhomme-ad.sqlite /project/rrg-bonhomme-ad", "diskusage_report_command": "diskusage_report --project --all_users", "prometheus_url": "https://fromage-thanos.calcul.ca", - "prometheus_headers_file": "tests/not-so-secrets/fromage_prometheus/headers.json" + "prometheus_headers_file": "tests/not-so-secrets/fromage_prometheus/headers.json", + "rgu_start_date": "2023-02-22", + "gpu_to_rgu_billing": "tests/not-so-secrets/fromage_prometheus/gpu_to_rgu_billing.json" }, "patate": { "host": "patate", @@ -62,7 +66,8 @@ "duc_storage_command": "duc ls -d /project/.duc_databases/rrg-bonhomme-ad.sqlite /project/rrg-bonhomme-ad", "diskusage_report_command": "diskusage_report --project --all_users", "prometheus_url": "https://fromage-thanos.calcul.ca", - "prometheus_headers_file": "tests/not-so-secrets/patate_prometheus/headers.json" + "prometheus_headers_file": "tests/not-so-secrets/patate_prometheus/headers.json", + "gpu_to_rgu_billing": "tests/not-so-secrets/patate_prometheus/gpu_to_rgu_billing.json" }, "gerudo": { "host": "gerudo", @@ -73,7 +78,9 @@ "duc_storage_command": "duc ls -d /project/.duc_databases/rrg-bonhomme-ad.sqlite /project/rrg-bonhomme-ad", "diskusage_report_command": "diskusage_report --project --all_users", "prometheus_url": "https://gerudo-thanos.calcul.ca", - "prometheus_headers_file": "tests/not-so-secrets/patate_prometheus/headers.json" + "prometheus_headers_file": "tests/not-so-secrets/patate_prometheus/headers.json", + "rgu_start_date": "2023-02-16", + "gpu_to_rgu_billing": "tests/not-so-secrets/gerudo_prometheus/gpu_to_rgu_billing_empty.json" }, "hyrule": { "host": "hyrule", @@ -106,7 +113,8 @@ "duc_inodes_command": null, "duc_storage_command": null, "diskusage_report_command": null, - "prometheus_url": null + "prometheus_url": null, + "rgu_start_date": "2023-02-16" } } }