Skip to content

Commit

Permalink
Merge pull request #14 from Taichi-Ibi/main
Browse files Browse the repository at this point in the history
Update source code from Taichi-Ibi/gpu-dashboard
  • Loading branch information
olachinkei authored Jul 23, 2024
2 parents 9298b4e + 7cc1b38 commit b5a7f35
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 17 deletions.
2 changes: 1 addition & 1 deletion gpu-dashboard/GpuUsage.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def wrapper(*args, **kwargs):

return wrapper


@error_handler
def handler(event: dict[str, str], context: object) -> None:
# -------------------- 準備 -------------------- #
# Set WANDB environment
Expand Down
10 changes: 8 additions & 2 deletions gpu-dashboard/config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
testmode: false
enable_alert: true
ignore_tag: other_gpu # 増えるようだったらfnmatchで対応する
ignore_tag: ["other_gpu", "others_gpu"] # 増えるようだったらfnmatchで対応する
wandb_dir: /tmp

dashboard:
Expand Down Expand Up @@ -38,6 +38,8 @@ companies:
schedule:
- date: "2024-02-15"
assigned_gpu_node: 50
- date: "2024-07-16"
assigned_gpu_node: 52

### Stockmark ###
- company: stockmark-geniac
Expand All @@ -59,6 +61,7 @@ companies:
assigned_gpu_node: 4
- date: "2024-06-15"
assigned_gpu_node: 35
distributed_learning: true

### NII ###
- company: nii-geniac
Expand Down Expand Up @@ -100,6 +103,7 @@ companies:
schedule:
- date: "2024-05-24"
assigned_gpu_node: 36
distributed_learning: true

### Kotoba Technologies ###
- company: kotoba-geniac
Expand All @@ -108,11 +112,13 @@ companies:
schedule:
- date: "2024-05-24"
assigned_gpu_node: 8
distributed_learning: true

### 富士通 ###
- company: fujitsu-geniac
teams:
- fujitsu-geniac
schedule:
- date: "2024-05-24"
assigned_gpu_node: 16
assigned_gpu_node: 16
distributed_learning: true
42 changes: 33 additions & 9 deletions gpu-dashboard/fetch_runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,9 +241,14 @@ def query_runs(
if createdAt.timestamp() == updatedAt.timestamp(): # 即終了したもの
continue
if not make_blacklist:
if CONFIG.ignore_tag in [
if not set(CONFIG.ignore_tag).isdisjoint([
t.lower() for t in node.tags
]: # 特定のtagをスキップ
]): # 特定のtagをスキップ
continue
else:
if set(CONFIG.ignore_tag).isdisjoint([
t.lower() for t in node.tags
]): # 特定のtag以外をスキップ
continue
if target_date is not None:
if target_date > updatedAt.date(): # 昨日以前に終了したものはスキップ
Expand Down Expand Up @@ -314,7 +319,11 @@ def get_metrics(
) -> pl.DataFrame:
# raw data
api = wandb.Api()
run = api.run(path=run_path)
try:
run = api.run(path=run_path)
except wandb.errors.CommError as e:
print(f"Error: Could not find run {run_path}. Exception: {e}")
return pl.DataFrame()
metrics_df = pl.from_dataframe(run.history(stream="events", samples=100))
# filter
if len(metrics_df) <= 1:
Expand All @@ -334,17 +343,23 @@ def get_metrics(
)
if metrics_df_with_datetime.is_empty():
return pl.DataFrame()
# process
daily_metrics_df = (
metrics_df_with_datetime.lazy()
# 縦持ちに変換
metrics_df_small_width = (
metrics_df_with_datetime
# カラム抽出
.select(
"datetime",
"_timestamp",
gpu_ptn := ("^system\.gpu\.\d+\.gpu$"),
memory_ptn := ("^system\.gpu\.\d+\.memory$"),
)
)
if metrics_df_small_width.width == 2:
return pl.DataFrame()
# process
daily_metrics_df = (
metrics_df_small_width
.with_columns(pl.col("datetime").cast(pl.Date).alias("date"))
# 縦持ちに変換
.melt(
id_vars=["date", "datetime", "_timestamp"],
value_vars=[c for c in metrics_df.columns if re.findall(gpu_ptn, c)]
Expand All @@ -364,7 +379,6 @@ def get_metrics(
) # seconds * 60 * 60 = hours
.alias("metrics_hours"),
)
.collect()
# 横持ちに変換
.pivot(index="date", columns="gpu", values=["average", "max"])
.rename(
Expand Down Expand Up @@ -472,5 +486,15 @@ def get_world_size(run_path: str) -> int:
api = wandb.Api()
run = api.run(run_path)
config = run.config
world_size = config.get("world_size", 0)
entity = run_path.split("/")[0]
num_nodes = config.get("num_nodes", 0)
num_gpus = config.get("num_gpus", 0)

if entity == "kotoba-geniac":
world_size = num_nodes * num_gpus
elif entity == "fujitsu-geniac":
world_size = config.get("SLURM_NTASKS", 0)
else:
world_size = config.get("world_size", 0)

return world_size
6 changes: 5 additions & 1 deletion gpu-dashboard/update_blacklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def create_blacklist() -> list[BlacklistRow]:
project=project.project,
target_date=None,
make_blacklist=True,
distributed_learning=tree.distributed_learning,
)
project.runs = runs

Expand All @@ -45,7 +46,7 @@ def create_blacklist() -> list[BlacklistRow]:
for tree in trees:
for project in tree.projects:
for run in project.runs:
if CONFIG.ignore_tag in [t.lower() for t in run.tags]:
if not set(CONFIG.ignore_tag).isdisjoint([t.lower() for t in run.tags]):
blacklist.append(BlacklistRow(run_path=run.run_path, tags=run.tags))

return blacklist
Expand Down Expand Up @@ -78,3 +79,6 @@ def upload_blacklist(blacklist: list[BlacklistRow]) -> None:
run.log_artifact(artifact)

return None

if __name__ == "__main__":
update_blacklist()
11 changes: 7 additions & 4 deletions gpu-dashboard/update_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,12 +282,15 @@ def update_companies(
),
}
)
if CONFIG.enable_alert & (
gpu_daily_company_table["GPU稼働率(%)"].to_list()[0] < 10
):
if CONFIG.enable_alert:
latest_row_dict = gpu_daily_company_table.to_pandas().to_dict(
orient="records"
)[0]
wandb.alert(title="Too low utilization rate found.", text=str(latest_row_dict))
threshold = 10
if latest_row_dict["GPU稼働率(%)"] < threshold:
wandb.alert(
title="Too low utilization rate found.",
text=company,
)

return None

0 comments on commit b5a7f35

Please sign in to comment.