Merge pull request #14 from Taichi-Ibi/main

Update source code from Taichi-Ibi/gpu-dashboard
wandb · Jul 23, 2024 · b5a7f35 · b5a7f35
2 parents 9298b4e + 7cc1b38
commit b5a7f35
Show file tree

Hide file tree

Showing 5 changed files with 54 additions and 17 deletions.
diff --git a/gpu-dashboard/GpuUsage.py b/gpu-dashboard/GpuUsage.py
@@ -38,7 +38,7 @@ def wrapper(*args, **kwargs):
 
     return wrapper
 
-
+@error_handler
 def handler(event: dict[str, str], context: object) -> None:
     # -------------------- 準備 -------------------- #
     # Set WANDB environment

diff --git a/gpu-dashboard/config.yaml b/gpu-dashboard/config.yaml
@@ -1,6 +1,6 @@
 testmode: false
 enable_alert: true
-ignore_tag: other_gpu  # 増えるようだったらfnmatchで対応する
+ignore_tag: ["other_gpu", "others_gpu"]  # 増えるようだったらfnmatchで対応する
 wandb_dir: /tmp
 
 dashboard:
@@ -38,6 +38,8 @@ companies:
     schedule:
       - date: "2024-02-15"
         assigned_gpu_node: 50
+      - date: "2024-07-16"
+        assigned_gpu_node: 52
 
   ### Stockmark ###
   - company: stockmark-geniac 
@@ -59,6 +61,7 @@ companies:
         assigned_gpu_node: 4
       - date: "2024-06-15"
         assigned_gpu_node: 35
+    distributed_learning: true
 
   ### NII ###
   - company: nii-geniac 
@@ -100,6 +103,7 @@ companies:
     schedule:
       - date: "2024-05-24"
         assigned_gpu_node: 36
+    distributed_learning: true
 
   ### Kotoba Technologies ###
   - company: kotoba-geniac
@@ -108,11 +112,13 @@ companies:
     schedule:
       - date: "2024-05-24"
         assigned_gpu_node: 8
+    distributed_learning: true
 
   ### 富士通 ###
   - company: fujitsu-geniac
     teams:
       - fujitsu-geniac
     schedule:
       - date: "2024-05-24"
-        assigned_gpu_node: 16
+        assigned_gpu_node: 16
+    distributed_learning: true
diff --git a/gpu-dashboard/fetch_runs.py b/gpu-dashboard/fetch_runs.py
@@ -241,9 +241,14 @@ def query_runs(
         if createdAt.timestamp() == updatedAt.timestamp():  # 即終了したもの
             continue
         if not make_blacklist:
-            if CONFIG.ignore_tag in [
+            if not set(CONFIG.ignore_tag).isdisjoint([
                 t.lower() for t in node.tags
-            ]:  # 特定のtagをスキップ
+            ]): # 特定のtagをスキップ
+                continue
+        else:
+            if set(CONFIG.ignore_tag).isdisjoint([
+                t.lower() for t in node.tags
+            ]): # 特定のtag以外をスキップ
                 continue
         if target_date is not None:
             if target_date > updatedAt.date():  # 昨日以前に終了したものはスキップ
@@ -314,7 +319,11 @@ def get_metrics(
 ) -> pl.DataFrame:
     # raw data
     api = wandb.Api()
-    run = api.run(path=run_path)
+    try:
+        run = api.run(path=run_path)
+    except wandb.errors.CommError as e:
+        print(f"Error: Could not find run {run_path}. Exception: {e}")
+        return pl.DataFrame()
     metrics_df = pl.from_dataframe(run.history(stream="events", samples=100))
     # filter
     if len(metrics_df) <= 1:
@@ -334,17 +343,23 @@ def get_metrics(
     )
     if metrics_df_with_datetime.is_empty():
         return pl.DataFrame()
-    # process
-    daily_metrics_df = (
-        metrics_df_with_datetime.lazy()
-        # 縦持ちに変換
+    metrics_df_small_width = (
+        metrics_df_with_datetime
+        # カラム抽出
         .select(
             "datetime",
             "_timestamp",
             gpu_ptn := ("^system\.gpu\.\d+\.gpu$"),
             memory_ptn := ("^system\.gpu\.\d+\.memory$"),
         )
+    )
+    if metrics_df_small_width.width == 2:
+        return pl.DataFrame()
+    # process
+    daily_metrics_df = (
+        metrics_df_small_width
         .with_columns(pl.col("datetime").cast(pl.Date).alias("date"))
+        # 縦持ちに変換
         .melt(
             id_vars=["date", "datetime", "_timestamp"],
             value_vars=[c for c in metrics_df.columns if re.findall(gpu_ptn, c)]
@@ -364,7 +379,6 @@ def get_metrics(
             )  # seconds * 60 * 60 = hours
             .alias("metrics_hours"),
         )
-        .collect()
         # 横持ちに変換
         .pivot(index="date", columns="gpu", values=["average", "max"])
         .rename(
@@ -472,5 +486,15 @@ def get_world_size(run_path: str) -> int:
     api = wandb.Api()
     run = api.run(run_path)
     config = run.config
-    world_size = config.get("world_size", 0)
+    entity = run_path.split("/")[0]
+    num_nodes = config.get("num_nodes", 0)
+    num_gpus = config.get("num_gpus", 0)
+
+    if entity == "kotoba-geniac":
+        world_size = num_nodes * num_gpus
+    elif entity == "fujitsu-geniac":
+        world_size = config.get("SLURM_NTASKS", 0)
+    else:
+        world_size = config.get("world_size", 0)
+
     return world_size
diff --git a/gpu-dashboard/update_blacklist.py b/gpu-dashboard/update_blacklist.py
@@ -37,6 +37,7 @@ def create_blacklist() -> list[BlacklistRow]:
                 project=project.project,
                 target_date=None,
                 make_blacklist=True,
+                distributed_learning=tree.distributed_learning,
             )
             project.runs = runs
 
@@ -45,7 +46,7 @@ def create_blacklist() -> list[BlacklistRow]:
     for tree in trees:
         for project in tree.projects:
             for run in project.runs:
-                if CONFIG.ignore_tag in [t.lower() for t in run.tags]:
+                if not set(CONFIG.ignore_tag).isdisjoint([t.lower() for t in run.tags]):
                     blacklist.append(BlacklistRow(run_path=run.run_path, tags=run.tags))
 
     return blacklist
@@ -78,3 +79,6 @@ def upload_blacklist(blacklist: list[BlacklistRow]) -> None:
         run.log_artifact(artifact)
 
     return None
+
+if __name__ == "__main__":
+    update_blacklist()
diff --git a/gpu-dashboard/update_tables.py b/gpu-dashboard/update_tables.py
@@ -282,12 +282,15 @@ def update_companies(
                     ),
                 }
             )
-            if CONFIG.enable_alert & (
-                gpu_daily_company_table["GPU稼働率(%)"].to_list()[0] < 10
-            ):
+            if CONFIG.enable_alert:
                 latest_row_dict = gpu_daily_company_table.to_pandas().to_dict(
                     orient="records"
                 )[0]
-                wandb.alert(title="Too low utilization rate found.", text=str(latest_row_dict))
+                threshold = 10
+                if latest_row_dict["GPU稼働率(%)"] < threshold:
+                    wandb.alert(
+                        title="Too low utilization rate found.",
+                        text=company,
+                    )
 
     return None