Skip to content

Commit

Permalink
SARC-271: Update data frame with RGU data (#113)
Browse files Browse the repository at this point in the history
* SARC-271: Update data frame with RGU data

* Log warnings if RGU start date or RGU/GPU ratios are not available.

* Rename function to update_cluster_job_series_rgu and make sure only jobs related to given cluster name are updated.

Add new function update_job_series_rgu to apply changes on data frame for all clusters.

* Log a warning if RGU file is not specified and another warning if RGU file is not found.

---------

Co-authored-by: Bruno Carrez <[email protected]>
  • Loading branch information
notoraptor and nurbal authored Apr 5, 2024
1 parent 59e47b7 commit 44eabe9
Show file tree
Hide file tree
Showing 11 changed files with 631 additions and 14 deletions.
16 changes: 12 additions & 4 deletions config/sarc-dev.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2023-11-28",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
},
"beluga": {
"host": "beluga.computecanada.ca",
Expand All @@ -52,7 +54,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
},
"graham": {
"host": "graham.computecanada.ca",
Expand All @@ -65,7 +69,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_graham.txt"
"nodes_info_file": "secrets/nodes_graham.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
},
"cedar": {
"host": "cedar.computecanada.ca",
Expand All @@ -78,7 +84,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_cedar.txt"
"nodes_info_file": "secrets/nodes_cedar.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
}
}
}
16 changes: 12 additions & 4 deletions config/sarc-prod.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2023-11-28",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
},
"beluga": {
"host": "beluga.computecanada.ca",
Expand All @@ -52,7 +54,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
},
"graham": {
"host": "graham.computecanada.ca",
Expand All @@ -65,7 +69,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_graham.txt"
"nodes_info_file": "secrets/nodes_graham.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
},
"cedar": {
"host": "cedar.computecanada.ca",
Expand All @@ -78,7 +84,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_cedar.txt"
"nodes_info_file": "secrets/nodes_cedar.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
}
}
}
2 changes: 2 additions & 0 deletions sarc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ class ClusterConfig(BaseModel):
duc_storage_command: str = None
diskusage_report_command: str = None
start_date: str = "2022-04-01"
rgu_start_date: str = None
gpu_to_rgu_billing: Path = None

@validator("timezone")
def _timezone(cls, value):
Expand Down
173 changes: 172 additions & 1 deletion sarc/jobs/series.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from __future__ import annotations

import json
import logging
import os.path
from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Callable

Expand All @@ -9,7 +12,7 @@
from prometheus_api_client import MetricRangeDataFrame
from tqdm import tqdm

from sarc.config import MTL, UTC
from sarc.config import MTL, UTC, ClusterConfig, config
from sarc.jobs.job import JobStatistics, Statistics, count_jobs, get_jobs
from sarc.traces import trace_decorator

Expand Down Expand Up @@ -401,6 +404,174 @@ def load_job_series(
return pandas.DataFrame(rows)


def update_cluster_job_series_rgu(
    df: pandas.DataFrame, cluster_config: ClusterConfig
) -> pandas.DataFrame:
    """
    Compute RGU information for jobs related to given cluster config in a data frame.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df: DataFrame
        Data frame to update, typically returned by `load_job_series`.
        Should contain fields:
        "cluster_name", "start_time", "allocated.gpu_type", "allocated.gres_gpu".
    cluster_config: ClusterConfig
        Configuration of cluster to which jobs to update belong.
        Should define following config:
        "rgu_start_date": date since when billing is given as RGU.
        "gpu_to_rgu_billing": path to a JSON file containing a dict which maps
        GPU type to RGU cost per GPU.

    Returns
    -------
    DataFrame
        Input data frame with:
        - column `allocated.gres_gpu` updated if necessary.
        - column `allocated.gres_rgu` added or updated to contain RGU billing.
          Set to NaN (or unchanged if already present) for jobs from other clusters.
        - column `allocated.gpu_type_rgu` added or updated to contain RGU cost
          per GPU (RGU/GPU ratio).
          Set to NaN (or unchanged if already present) for jobs from other clusters.

    Raises
    ------
    TypeError
        If the RGU/GPU JSON file does not contain a JSON object (dict).

    Notes
    -----
    If the RGU start date, the JSON path, the JSON file or the ratios themselves
    are missing, a warning is logged and the frame is returned with only the
    two RGU columns guaranteed to exist.

    This function is not idempotent: for jobs started since the RGU start date,
    `allocated.gres_gpu` is divided by the RGU/GPU ratio on every call, so it
    should be applied only once to a given frame.

    Pseudocode describing how we update data frame:

        for each job: if job.cluster_name == cluster_config.name:
            if start_time < cluster_config.rgu_start_date:
                # We are BEFORE transition to RGU
                if allocated.gpu_type in gpu_to_rgu_billing:
                    # compute rgu columns
                    allocated.gres_rgu = allocated.gres_gpu * gpu_to_rgu_billing[allocated.gpu_type]
                    allocated.gpu_type_rgu = gpu_to_rgu_billing[allocated.gpu_type]
                else:
                    # set rgu columns to nan
                    allocated.gres_rgu = nan
                    allocated.gpu_type_rgu = nan
            else:
                # We are AFTER transition to RGU
                # Anyway, we assume gres_rgu is current gres_gpu
                allocated.gres_rgu = allocated.gres_gpu
                if allocated.gpu_type in gpu_to_rgu_billing:
                    # we fix gres_gpu by dividing it with RGU/GPU ratio
                    allocated.gres_gpu = allocated.gres_gpu / gpu_to_rgu_billing[allocated.gpu_type]
                    # we save RGU/GPU ratio
                    allocated.gpu_type_rgu = gpu_to_rgu_billing[allocated.gpu_type]
                else:
                    # we cannot fix gres_gpu, so we set it to nan
                    allocated.gres_gpu = nan
                    # we cannot get RGU/GPU ratio, so we set it to nan
                    allocated.gpu_type_rgu = nan
    """

    # Make sure frame will have new RGU columns anyway, with NaN as default value.
    for rgu_column in ("allocated.gres_rgu", "allocated.gpu_type_rgu"):
        if rgu_column not in df.columns:
            df[rgu_column] = np.nan

    if cluster_config.rgu_start_date is None:
        logging.warning(
            f"RGU update: no RGU start date for cluster {cluster_config.name}"
        )
        return df

    if cluster_config.gpu_to_rgu_billing is None:
        logging.warning(
            f"RGU update: no RGU/GPU JSON path for cluster {cluster_config.name}"
        )
        return df

    if not os.path.isfile(cluster_config.gpu_to_rgu_billing):
        logging.warning(
            f"RGU update: RGU/GPU JSON file not found for cluster {cluster_config.name} "
            f"at: {cluster_config.gpu_to_rgu_billing}"
        )
        return df

    # Otherwise, parse RGU start date.
    # NOTE(review): when the configured date carries no UTC offset, `astimezone`
    # interprets it in the host's local timezone before converting to MTL.
    # Confirm this is intended for hosts not running in Montreal time.
    rgu_start_date = datetime.fromisoformat(cluster_config.rgu_start_date).astimezone(
        MTL
    )

    # Get RGU/GPU ratios.
    with open(cluster_config.gpu_to_rgu_billing, "r", encoding="utf-8") as file:
        gpu_to_rgu_billing = json.load(file)

    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and must not be relied upon to validate the content of an external file.
    if not isinstance(gpu_to_rgu_billing, dict):
        raise TypeError(
            f"RGU update: expected a JSON object mapping GPU type to RGU/GPU ratio "
            f"for cluster {cluster_config.name} in {cluster_config.gpu_to_rgu_billing}, "
            f"got {type(gpu_to_rgu_billing).__name__}"
        )

    if not gpu_to_rgu_billing:
        logging.warning(
            f"RGU update: no RGU/GPU available for cluster {cluster_config.name}"
        )
        return df

    # We have now both RGU start date and RGU/GPU ratios. We can update columns.

    # RGU/GPU ratio for every row of the frame.
    # If a GPU type is not found in RGU/GPU ratios, its ratio is NaN.
    col_ratio_rgu_by_gpu = df["allocated.gpu_type"].map(gpu_to_rgu_billing)

    # Row masks for this cluster's jobs, before and since RGU start date.
    # Rows whose start_time cannot be compared (e.g. missing) fall in neither
    # mask and are left untouched.
    in_cluster = df["cluster_name"] == cluster_config.name
    slice_before_rgu_time = in_cluster & (df["start_time"] < rgu_start_date)
    slice_after_rgu_time = in_cluster & (df["start_time"] >= rgu_start_date)
    slice_any_rgu_time = slice_before_rgu_time | slice_after_rgu_time

    # The ratio column is set the same way on both sides of the RGU start date.
    df.loc[slice_any_rgu_time, "allocated.gpu_type_rgu"] = col_ratio_rgu_by_gpu[
        slice_any_rgu_time
    ]

    # Before RGU time: gres_gpu is a GPU count, so RGU billing is count * ratio.
    df.loc[slice_before_rgu_time, "allocated.gres_rgu"] = (
        df.loc[slice_before_rgu_time, "allocated.gres_gpu"]
        * col_ratio_rgu_by_gpu[slice_before_rgu_time]
    )

    # Since RGU time: gres_gpu currently holds the RGU billing, so save it first...
    df.loc[slice_after_rgu_time, "allocated.gres_rgu"] = df.loc[
        slice_after_rgu_time, "allocated.gres_gpu"
    ]
    # ... then turn gres_gpu back into a GPU count (NaN when ratio is unknown).
    df.loc[slice_after_rgu_time, "allocated.gres_gpu"] = (
        df.loc[slice_after_rgu_time, "allocated.gres_gpu"]
        / col_ratio_rgu_by_gpu[slice_after_rgu_time]
    )

    return df


def update_job_series_rgu(df: pandas.DataFrame) -> pandas.DataFrame:
    """
    Compute RGU information for jobs in given data frame.

    Applies `update_cluster_job_series_rgu` once for every cluster found in
    the current configuration. The frame is modified in place and also returned.

    Parameters
    ----------
    df: DataFrame
        Data frame to update, typically returned by `load_job_series`.
        Should contain fields:
        "cluster_name", "start_time", "allocated.gpu_type", "allocated.gres_gpu".

    Returns
    -------
    DataFrame
        Input data frame with:
        - column `allocated.gres_gpu` updated if necessary.
        - column `allocated.gres_rgu` added or updated to contain RGU billing.
          Set to NaN (or unchanged if already present) for jobs from clusters without RGU.
        - column `allocated.gpu_type_rgu` added or updated to contain RGU cost
          per GPU (RGU/GPU ratio).
          Set to NaN (or unchanged if already present) for jobs from clusters without RGU.

    For more details about implementation, see function `update_cluster_job_series_rgu`
    """
    for cluster_config in config().clusters.values():
        # The per-cluster update mutates `df` in place, so its return value
        # (the same frame) does not need to be captured here.
        update_cluster_job_series_rgu(df, cluster_config)
    return df


def _select_stat(name, dist):
if not dist:
return np.nan
Expand Down
Loading

0 comments on commit 44eabe9

Please sign in to comment.