
Commit

Merge branch 'master' into users_exceptions
nurbal committed Apr 11, 2024
2 parents 7469517 + 44eabe9 · commit a95b9e7
Showing 42 changed files with 2,662 additions and 186 deletions.
8 changes: 7 additions & 1 deletion config/sarc-client.json
@@ -3,5 +3,11 @@
"connection_string": "mongodb://readuser:readpwd@localhost:8123/sarc",
"database_name": "sarc"
},
"cache": "./sarc-cache"
"cache": "./sarc-cache",
"loki": {
"uri": "https://localhost/loki"
},
"tempo": {
"uri": "https://localhost/tempo"
}
}
16 changes: 12 additions & 4 deletions config/sarc-dev.json
@@ -40,7 +40,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2023-11-28",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
},
"beluga": {
"host": "beluga.computecanada.ca",
@@ -52,7 +54,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
},
"graham": {
"host": "graham.computecanada.ca",
@@ -65,7 +69,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_graham.txt"
"nodes_info_file": "secrets/nodes_graham.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
},
"cedar": {
"host": "cedar.computecanada.ca",
@@ -78,7 +84,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_cedar.txt"
"nodes_info_file": "secrets/nodes_cedar.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
}
}
}
16 changes: 12 additions & 4 deletions config/sarc-prod.json
@@ -40,7 +40,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2023-11-28",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
},
"beluga": {
"host": "beluga.computecanada.ca",
@@ -52,7 +54,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
},
"graham": {
"host": "graham.computecanada.ca",
@@ -65,7 +69,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_graham.txt"
"nodes_info_file": "secrets/nodes_graham.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
},
"cedar": {
"host": "cedar.computecanada.ca",
@@ -78,7 +84,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_cedar.txt"
"nodes_info_file": "secrets/nodes_cedar.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
}
}
}
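
The gpu_to_rgu_billing files referenced in the cluster entries above live under secrets/ and are not part of this commit, so their schema is not visible here. Purely as a hypothetical illustration, if each file maps GPU type names to RGU billing weights, it could be consumed along these lines (file contents and keys are assumptions):

import json
from pathlib import Path

# Hypothetical: assumes a flat {"<gpu type>": <rgu weight>} mapping; the real
# secrets/gpu_to_rgu_billing_*.json files are not included in this commit.
mapping = json.loads(Path("secrets/gpu_to_rgu_billing_narval.json").read_text())
rgu_weight = mapping.get("a100", 1.0)  # illustrative key only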
93 changes: 90 additions & 3 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion pyproject.toml
@@ -23,6 +23,9 @@ sphinx = "^6.2.1"
myst-parser = "^2.0.0"
sphinx-rtd-theme = "^1.2.2"
nbsphinx = "^0.9.3"
opentelemetry-api = "^1.23.0"
opentelemetry-sdk = "^1.23.0"
requests-mock = "^1.11.0"

[tool.poetry.group.dev.dependencies]
black = ">= 22.12.0"
@@ -68,7 +71,8 @@ disable = [
"missing-function-docstring",
"invalid-name",
"no-else-return", # Bad rule IMO (- OB)
"line-too-long" # Black takes care of line length.
"line-too-long", # Black takes care of line length.
"logging-fstring-interpolation"
]
extension-pkg-whitelist = "pydantic"

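The new opentelemetry-api and opentelemetry-sdk dependencies back the tracing helpers (sarc.traces) used in the source changes below, while requests-mock presumably supports tests that stub HTTP endpoints such as the new Loki/Tempo URIs. A minimal, self-contained sketch of span creation with these libraries, independent of SARC's own wrappers (the console exporter is only for illustration):

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

# Illustrative setup: print finished spans to stdout instead of shipping them to Tempo.
provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)

tracer = trace.get_tracer("AcquireJobs")
with tracer.start_as_current_span("acquire_cluster_data") as span:
    span.set_attribute("cluster_name", "narval")  # same attribute name as in sarc/cli/acquire/jobs.py below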
23 changes: 12 additions & 11 deletions sarc/account_matching/make_matches.py
@@ -16,6 +16,7 @@
import copy
import csv
import json
import logging
from pathlib import PosixPath

from sarc.account_matching import name_distances
@@ -136,7 +137,7 @@ def perform_matching(
if drac_source not in DLD_data:
# we might not have all three source files
if verbose:
print(f"{drac_source} file missing !")
logging.warning(f"{drac_source} file missing !")
continue
LD_members = _how_many_drac_accounts_with_mila_emails(
DLD_data, drac_source, verbose=verbose
@@ -145,12 +146,12 @@
assert D_member["email"].endswith("@mila.quebec")
if D_member["email"] in S_mila_emails_to_ignore:
if verbose:
print(f'Ignoring phantom {D_member["email"]} (ignore list).')
logging.info(f'Ignoring phantom {D_member["email"]} (ignore list).')
continue
if D_member["email"] not in DD_persons:
# we WANT to create an entry in DD_persons with the mila username, and the name from the cc_source !
if verbose:
print(
logging.info(
f'Creating phantom profile for {D_member["email"]} (automatic).'
)
DD_persons[D_member["email"]] = {}
@@ -303,9 +304,9 @@ def _prompt_manual_match(mila_display_name, cc_source, best_matches):
# Re-prompt.

if cc_match:
print(mila_display_name, "(matched with)", cc_match)
logging.info(f"[prompt] {mila_display_name} (matched with) {cc_match}")
else:
print("(ignored)")
logging.info(f"[prompt] {mila_display_name} (ignored)")

return cc_match

@@ -373,10 +374,10 @@ def _make_matches_status_report(DLD_data, DD_persons):
else:
disabled_count += 1

print(
logging.info(
f"We have {enabled_count} enabled accounts and {disabled_count} disabled accounts."
)
print(
logging.info(
f"Out of those enabled accounts, there are {good_count} successful matches "
f"and {bad_count} failed matches."
)
@@ -391,7 +392,7 @@ def _make_matches_status_report(DLD_data, DD_persons):
if D["activation_status"] in ["activated"]
]
)
print(f"We have {count_drac_members_activated} activated drac_members.")
logging.info(f"We have {count_drac_members_activated} activated drac_members.")

# let's try to be more precise about things to find the missing accounts
set_A = {
@@ -404,7 +405,7 @@
for D_person in DD_persons.values()
if D_person.get("drac_members", None) is not None
}
print(
logging.info(
"We could not find matches in the Mila LDAP for the CC accounts "
f"associated with the following emails: {set_A.difference(set_B)}."
)
@@ -415,7 +416,7 @@ def _make_matches_status_report(DLD_data, DD_persons):
count_drac_roles_activated = len(
[D for D in DLD_data["drac_roles"] if D["status"].lower() in ["activated"]]
)
print(f"We have {count_drac_roles_activated} activated drac_roles.")
logging.info(f"We have {count_drac_roles_activated} activated drac_roles.")


def _how_many_drac_accounts_with_mila_emails(
@@ -429,7 +430,7 @@ def _how_many_drac_accounts_with_mila_emails(
]

if verbose:
print(
logging.info(
f"We have {len(LD_members)} {drac_source} accounts with @mila.quebec, "
f"out of {len(data['drac_members'])}."
)
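Because these reports now go through the logging module instead of print, they only appear when the root logger is configured at INFO level or below; a minimal sketch (this configuration is not part of the commit):

import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")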
52 changes: 34 additions & 18 deletions sarc/cli/acquire/jobs.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import logging
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Generator
@@ -9,6 +10,7 @@
from sarc.config import config
from sarc.errors import ClusterNotFound
from sarc.jobs.sacct import sacct_mongodb_import
from sarc.traces import using_trace


def _str_to_dt(dt_str: str) -> datetime:
@@ -56,17 +58,17 @@ def _dates_auto_first_date(cluster_name: str) -> datetime:
if cluster is None:
raise ClusterNotFound(f"Cluster {cluster_name} not found in database")
start_date = cluster["start_date"]
print(f"start_date={start_date}")
logging.info(f"start_date={start_date}")
end_date = cluster["end_date"]
print(f"end_date={end_date}")
logging.info(f"end_date={end_date}")
if end_date is None:
return _str_to_dt(start_date)
return _str_to_dt(end_date) + timedelta(days=1)


def _dates_set_last_date(cluster_name: str, date: datetime) -> None:
# set the last valid date in the database for the cluster
print(f"set last successful date for cluster {cluster_name} to {date}")
logging.info(f"set last successful date for cluster {cluster_name} to {date}")
db = config().mongo.database_instance
db_collection = db.clusters
db_collection.update_one(
@@ -93,19 +95,33 @@ def execute(self) -> int:
clusters_configs = cfg.clusters

for cluster_name in self.cluster_names:
for date, is_auto in parse_dates(self.dates, cluster_name):
try:
print(
f"Acquire data on {cluster_name} for date: {date} (is_auto={is_auto})"
)

sacct_mongodb_import(
clusters_configs[cluster_name], date, self.no_prometheus
)
if is_auto:
_dates_set_last_date(cluster_name, date)
# pylint: disable=broad-exception-caught
except Exception as e:
print(f"Failed to acquire data for {cluster_name} on {date}: {e}")
return 1
try:
for date, is_auto in parse_dates(self.dates, cluster_name):
with using_trace(
"AcquireJobs", "acquire_cluster_data", exception_types=()
) as span:
span.set_attribute("cluster_name", cluster_name)
span.set_attribute("date", str(date))
span.set_attribute("is_auto", is_auto)
try:
logging.info(
f"Acquire data on {cluster_name} for date: {date} (is_auto={is_auto})"
)

sacct_mongodb_import(
clusters_configs[cluster_name], date, self.no_prometheus
)
if is_auto:
_dates_set_last_date(cluster_name, date)
# pylint: disable=broad-exception-caught
except Exception as e:
logging.error(
f"Failed to acquire data for {cluster_name} on {date}: {e}"
)
raise e
# pylint: disable=broad-exception-caught
except Exception:
# Error while acquiring data on a cluster from given dates.
# Continue to next cluster.
continue
return 0
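
sarc/traces.py itself is not rendered in this view, so the exact behaviour of using_trace is not visible here. Judging from the call above, a plausible sketch is a context manager that opens an OpenTelemetry span and swallows only the exception types it is given; with exception_types=() every exception propagates, which is what the outer try/except in execute() relies on to skip to the next cluster:

from contextlib import contextmanager

from opentelemetry import trace


@contextmanager
def using_trace(tracer_name, span_name, exception_types=(Exception,)):
    # Hypothetical reconstruction; the real helper lives in sarc/traces.py.
    tracer = trace.get_tracer(tracer_name)
    with tracer.start_as_current_span(span_name) as span:
        try:
            yield span
        except exception_types as err:
            # Record and swallow only the configured exception types;
            # an empty tuple means nothing is caught here.
            span.record_exception(err)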
14 changes: 13 additions & 1 deletion sarc/config.py
@@ -83,6 +83,8 @@ class ClusterConfig(BaseModel):
duc_storage_command: str = None
diskusage_report_command: str = None
start_date: str = "2022-04-01"
rgu_start_date: str = None
gpu_to_rgu_billing: Path = None

@validator("timezone")
def _timezone(cls, value):
@@ -101,7 +103,7 @@ def ssh(self):
fconfig = FabricConfig()
else:
fconfig = FabricConfig(ssh_config=SSHConfig.from_path(self.sshconfig))
fconfig["run"]["pty"] = True
fconfig["run"]["pty"] = False
fconfig["run"]["in_stream"] = False
return Connection(self.host, config=fconfig)

@@ -167,6 +169,14 @@ def _relative_exception(cls, value):
return relative_filepath(value)


class LokiConfig(BaseModel):
uri: str


class TempoConfig(BaseModel):
uri: str


class MyMilaConfig(BaseModel):
tmp_json_path: str

@@ -189,6 +199,8 @@
class Config(BaseModel):
mongo: MongoConfig
cache: Path = None
loki: LokiConfig = None
tempo: TempoConfig = None

_abs_path = validator("cache", allow_reuse=True)(_absolute_path)

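With LokiConfig and TempoConfig wired into Config, the endpoints declared in config/sarc-client.json can be read directly off the parsed configuration. A small sketch, assuming config() has loaded a file that defines both sections (both fields default to None otherwise):

from sarc.config import config

cfg = config()
# Both fields default to None when the "loki"/"tempo" sections are absent.
if cfg.loki is not None:
    print(cfg.loki.uri)   # e.g. "https://localhost/loki"
if cfg.tempo is not None:
    print(cfg.tempo.uri)  # e.g. "https://localhost/tempo"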
3 changes: 3 additions & 0 deletions sarc/jobs/job.py
@@ -8,6 +8,8 @@
from pydantic import validator
from pydantic_mongo import AbstractRepository, ObjectIdField

from sarc.traces import trace_decorator

from ..config import MTL, TZLOCAL, UTC, BaseModel, ClusterConfig, config


@@ -163,6 +165,7 @@ def series(self, **kwargs):

return get_job_time_series(job=self, **kwargs)

@trace_decorator()
def statistics(self, recompute=False, save=True, overwrite_when_empty=False):
from .series import compute_job_statistics # pylint: disable=cyclic-import

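As with using_trace, the trace_decorator implementation is not shown in this diff. Judging from its use on statistics(), it is presumably a decorator that runs the wrapped function inside a span; a hedged sketch (name and default arguments are assumptions):

import functools

from opentelemetry import trace


def trace_decorator(tracer_name="sarc"):
    # Hypothetical reconstruction of sarc.traces.trace_decorator.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            tracer = trace.get_tracer(tracer_name)
            # Name the span after the wrapped function's qualified name.
            with tracer.start_as_current_span(func.__qualname__):
                return func(*args, **kwargs)

        return wrapper

    return decorator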
(The remaining changed files are not rendered here.)

0 comments on commit a95b9e7
