
Commit

Merge branch 'master' into users_exceptions
nurbal committed Apr 11, 2024
2 parents 7469517 + 44eabe9 · commit a95b9e7
Showing 42 changed files with 2,662 additions and 186 deletions.
8 changes: 7 additions & 1 deletion config/sarc-client.json
@@ -3,5 +3,11 @@
"connection_string": "mongodb://readuser:readpwd@localhost:8123/sarc",
"database_name": "sarc"
},
"cache": "./sarc-cache"
"cache": "./sarc-cache",
"loki": {
"uri": "https://localhost/loki"
},
"tempo": {
"uri": "https://localhost/tempo"
}
}
16 changes: 12 additions & 4 deletions config/sarc-dev.json
@@ -40,7 +40,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2023-11-28",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
},
"beluga": {
"host": "beluga.computecanada.ca",
@@ -52,7 +54,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
},
"graham": {
"host": "graham.computecanada.ca",
@@ -65,7 +69,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_graham.txt"
"nodes_info_file": "secrets/nodes_graham.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
},
"cedar": {
"host": "cedar.computecanada.ca",
@@ -78,7 +84,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_cedar.txt"
"nodes_info_file": "secrets/nodes_cedar.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
}
}
}
16 changes: 12 additions & 4 deletions config/sarc-prod.json
@@ -40,7 +40,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2023-11-28",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_narval.json"
},
"beluga": {
"host": "beluga.computecanada.ca",
@@ -52,7 +54,9 @@
"diskusage_report_command": "diskusage_report --project --all_users",
"prometheus_url": "https://mila-thanos.calculquebec.ca",
"prometheus_headers_file": "secrets/drac_prometheus/headers.json",
"start_date": "2022-04-01"
"start_date": "2022-04-01",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_beluga.json"
},
"graham": {
"host": "graham.computecanada.ca",
@@ -65,7 +69,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_graham.txt"
"nodes_info_file": "secrets/nodes_graham.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_graham.json"
},
"cedar": {
"host": "cedar.computecanada.ca",
@@ -78,7 +84,9 @@
"prometheus_url": null,
"prometheus_headers_file": null,
"start_date": "2022-04-01",
"nodes_info_file": "secrets/nodes_cedar.txt"
"nodes_info_file": "secrets/nodes_cedar.txt",
"rgu_start_date": "2024-04-03",
"gpu_to_rgu_billing": "secrets/gpu_to_rgu_billing_cedar.json"
}
}
}
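
The gpu_to_rgu_billing files referenced in the cluster entries above live under secrets/ and are not part of this commit, so their schema is not visible here. Purely as a hypothetical illustration, if each file maps GPU type names to RGU billing weights, it could be consumed along these lines (file contents and keys are assumptions):

import json
from pathlib import Path

# Hypothetical: assumes a flat {"<gpu type>": <rgu weight>} mapping; the real
# secrets/gpu_to_rgu_billing_*.json files are not included in this commit.
mapping = json.loads(Path("secrets/gpu_to_rgu_billing_narval.json").read_text())
rgu_weight = mapping.get("a100", 1.0)  # illustrative key only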
93 changes: 90 additions & 3 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion pyproject.toml
@@ -23,6 +23,9 @@ sphinx = "^6.2.1"
myst-parser = "^2.0.0"
sphinx-rtd-theme = "^1.2.2"
nbsphinx = "^0.9.3"
opentelemetry-api = "^1.23.0"
opentelemetry-sdk = "^1.23.0"
requests-mock = "^1.11.0"

[tool.poetry.group.dev.dependencies]
black = ">= 22.12.0"
@@ -68,7 +71,8 @@ disable = [
"missing-function-docstring",
"invalid-name",
"no-else-return", # Bad rule IMO (- OB)
"line-too-long" # Black takes care of line length.
"line-too-long", # Black takes care of line length.
"logging-fstring-interpolation"
]
extension-pkg-whitelist = "pydantic"

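The new opentelemetry-api and opentelemetry-sdk dependencies back the tracing helpers (sarc.traces) used in the source changes below, while requests-mock presumably supports tests that stub HTTP endpoints such as the new Loki/Tempo URIs. A minimal, self-contained sketch of span creation with these libraries, independent of SARC's own wrappers (the console exporter is only for illustration):

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import ConsoleSpanExporter, SimpleSpanProcessor

# Illustrative setup: print finished spans to stdout instead of shipping them to Tempo.
provider = TracerProvider()
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
trace.set_tracer_provider(provider)

tracer = trace.get_tracer("AcquireJobs")
with tracer.start_as_current_span("acquire_cluster_data") as span:
    span.set_attribute("cluster_name", "narval")  # same attribute name as in sarc/cli/acquire/jobs.py below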
23 changes: 12 additions & 11 deletions sarc/account_matching/make_matches.py
@@ -16,6 +16,7 @@
import copy
import csv
import json
import logging
from pathlib import PosixPath

from sarc.account_matching import name_distances
@@ -136,7 +137,7 @@ def perform_matching(
if drac_source not in DLD_data:
# we might not have all three source files
if verbose:
print(f"{drac_source} file missing !")
logging.warning(f"{drac_source} file missing !")
continue
LD_members = _how_many_drac_accounts_with_mila_emails(
DLD_data, drac_source, verbose=verbose
@@ -145,12 +146,12 @@
assert D_member["email"].endswith("@mila.quebec")
if D_member["email"] in S_mila_emails_to_ignore:
if verbose:
print(f'Ignoring phantom {D_member["email"]} (ignore list).')
logging.info(f'Ignoring phantom {D_member["email"]} (ignore list).')
continue
if D_member["email"] not in DD_persons:
# we WANT to create an entry in DD_persons with the mila username, and the name from the cc_source !
if verbose:
print(
logging.info(
f'Creating phantom profile for {D_member["email"]} (automatic).'
)
DD_persons[D_member["email"]] = {}
@@ -303,9 +304,9 @@ def _prompt_manual_match(mila_display_name, cc_source, best_matches):
# Re-prompt.

if cc_match:
print(mila_display_name, "(matched with)", cc_match)
logging.info(f"[prompt] {mila_display_name} (matched with) {cc_match}")
else:
print("(ignored)")
logging.info(f"[prompt] {mila_display_name} (ignored)")

return cc_match

@@ -373,10 +374,10 @@ def _make_matches_status_report(DLD_data, DD_persons):
else:
disabled_count += 1

print(
logging.info(
f"We have {enabled_count} enabled accounts and {disabled_count} disabled accounts."
)
print(
logging.info(
f"Out of those enabled accounts, there are {good_count} successful matches "
f"and {bad_count} failed matches."
)
@@ -391,7 +392,7 @@ def _make_matches_status_report(DLD_data, DD_persons):
if D["activation_status"] in ["activated"]
]
)
print(f"We have {count_drac_members_activated} activated drac_members.")
logging.info(f"We have {count_drac_members_activated} activated drac_members.")

# let's try to be more precise about things to find the missing accounts
set_A = {
@@ -404,7 +405,7 @@
for D_person in DD_persons.values()
if D_person.get("drac_members", None) is not None
}
print(
logging.info(
"We could not find matches in the Mila LDAP for the CC accounts "
f"associated with the following emails: {set_A.difference(set_B)}."
)
@@ -415,7 +416,7 @@ def _make_matches_status_report(DLD_data, DD_persons):
count_drac_roles_activated = len(
[D for D in DLD_data["drac_roles"] if D["status"].lower() in ["activated"]]
)
print(f"We have {count_drac_roles_activated} activated drac_roles.")
logging.info(f"We have {count_drac_roles_activated} activated drac_roles.")


def _how_many_drac_accounts_with_mila_emails(
@@ -429,7 +430,7 @@ def _how_many_drac_accounts_with_mila_emails(
]

if verbose:
print(
logging.info(
f"We have {len(LD_members)} {drac_source} accounts with @mila.quebec, "
f"out of {len(data['drac_members'])}."
)
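Because these reports now go through the logging module instead of print, they only appear when the root logger is configured at INFO level or below; a minimal sketch (this configuration is not part of the commit):

import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")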
52 changes: 34 additions & 18 deletions sarc/cli/acquire/jobs.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import logging
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Generator
@@ -9,6 +10,7 @@
from sarc.config import config
from sarc.errors import ClusterNotFound
from sarc.jobs.sacct import sacct_mongodb_import
from sarc.traces import using_trace


def _str_to_dt(dt_str: str) -> datetime:
@@ -56,17 +58,17 @@ def _dates_auto_first_date(cluster_name: str) -> datetime:
if cluster is None:
raise ClusterNotFound(f"Cluster {cluster_name} not found in database")
start_date = cluster["start_date"]
print(f"start_date={start_date}")
logging.info(f"start_date={start_date}")
end_date = cluster["end_date"]
print(f"end_date={end_date}")
logging.info(f"end_date={end_date}")
if end_date is None:
return _str_to_dt(start_date)
return _str_to_dt(end_date) + timedelta(days=1)


def _dates_set_last_date(cluster_name: str, date: datetime) -> None:
# set the last valid date in the database for the cluster
print(f"set last successful date for cluster {cluster_name} to {date}")
logging.info(f"set last successful date for cluster {cluster_name} to {date}")
db = config().mongo.database_instance
db_collection = db.clusters
db_collection.update_one(
@@ -93,19 +95,33 @@ def execute(self) -> int:
clusters_configs = cfg.clusters

for cluster_name in self.cluster_names:
for date, is_auto in parse_dates(self.dates, cluster_name):
try:
print(
f"Acquire data on {cluster_name} for date: {date} (is_auto={is_auto})"
)

sacct_mongodb_import(
clusters_configs[cluster_name], date, self.no_prometheus
)
if is_auto:
_dates_set_last_date(cluster_name, date)
# pylint: disable=broad-exception-caught
except Exception as e:
print(f"Failed to acquire data for {cluster_name} on {date}: {e}")
return 1
try:
for date, is_auto in parse_dates(self.dates, cluster_name):
with using_trace(
"AcquireJobs", "acquire_cluster_data", exception_types=()
) as span:
span.set_attribute("cluster_name", cluster_name)
span.set_attribute("date", str(date))
span.set_attribute("is_auto", is_auto)
try:
logging.info(
f"Acquire data on {cluster_name} for date: {date} (is_auto={is_auto})"
)

sacct_mongodb_import(
clusters_configs[cluster_name], date, self.no_prometheus
)
if is_auto:
_dates_set_last_date(cluster_name, date)
# pylint: disable=broad-exception-caught
except Exception as e:
logging.error(
f"Failed to acquire data for {cluster_name} on {date}: {e}"
)
raise e
# pylint: disable=broad-exception-caught
except Exception:
# Error while acquiring data on a cluster from given dates.
# Continue to next cluster.
continue
return 0
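
sarc/traces.py itself is not rendered in this view, so the exact behaviour of using_trace is not visible here. Judging from the call above, a plausible sketch is a context manager that opens an OpenTelemetry span and swallows only the exception types it is given; with exception_types=() every exception propagates, which is what the outer try/except in execute() relies on to skip to the next cluster:

from contextlib import contextmanager

from opentelemetry import trace


@contextmanager
def using_trace(tracer_name, span_name, exception_types=(Exception,)):
    # Hypothetical reconstruction; the real helper lives in sarc/traces.py.
    tracer = trace.get_tracer(tracer_name)
    with tracer.start_as_current_span(span_name) as span:
        try:
            yield span
        except exception_types as err:
            # Record and swallow only the configured exception types;
            # an empty tuple means nothing is caught here.
            span.record_exception(err)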
14 changes: 13 additions & 1 deletion sarc/config.py
@@ -83,6 +83,8 @@ class ClusterConfig(BaseModel):
duc_storage_command: str = None
diskusage_report_command: str = None
start_date: str = "2022-04-01"
rgu_start_date: str = None
gpu_to_rgu_billing: Path = None

@validator("timezone")
def _timezone(cls, value):
@@ -101,7 +103,7 @@ def ssh(self):
fconfig = FabricConfig()
else:
fconfig = FabricConfig(ssh_config=SSHConfig.from_path(self.sshconfig))
fconfig["run"]["pty"] = True
fconfig["run"]["pty"] = False
fconfig["run"]["in_stream"] = False
return Connection(self.host, config=fconfig)

@@ -167,6 +169,14 @@ def _relative_exception(cls, value):
return relative_filepath(value)


class LokiConfig(BaseModel):
uri: str


class TempoConfig(BaseModel):
uri: str


class MyMilaConfig(BaseModel):
tmp_json_path: str

@@ -189,6 +199,8 @@
class Config(BaseModel):
mongo: MongoConfig
cache: Path = None
loki: LokiConfig = None
tempo: TempoConfig = None

_abs_path = validator("cache", allow_reuse=True)(_absolute_path)

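With LokiConfig and TempoConfig wired into Config, the endpoints declared in config/sarc-client.json can be read directly off the parsed configuration. A small sketch, assuming config() has loaded a file that defines both sections (both fields default to None otherwise):

from sarc.config import config

cfg = config()
# Both fields default to None when the "loki"/"tempo" sections are absent.
if cfg.loki is not None:
    print(cfg.loki.uri)   # e.g. "https://localhost/loki"
if cfg.tempo is not None:
    print(cfg.tempo.uri)  # e.g. "https://localhost/tempo"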
3 changes: 3 additions & 0 deletions sarc/jobs/job.py
@@ -8,6 +8,8 @@
from pydantic import validator
from pydantic_mongo import AbstractRepository, ObjectIdField

from sarc.traces import trace_decorator

from ..config import MTL, TZLOCAL, UTC, BaseModel, ClusterConfig, config


@@ -163,6 +165,7 @@ def series(self, **kwargs):

return get_job_time_series(job=self, **kwargs)

@trace_decorator()
def statistics(self, recompute=False, save=True, overwrite_when_empty=False):
from .series import compute_job_statistics # pylint: disable=cyclic-import

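As with using_trace, the trace_decorator implementation is not shown in this diff. Judging from its use on statistics(), it is presumably a decorator that runs the wrapped function inside a span; a hedged sketch (name and default arguments are assumptions):

import functools

from opentelemetry import trace


def trace_decorator(tracer_name="sarc"):
    # Hypothetical reconstruction of sarc.traces.trace_decorator.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            tracer = trace.get_tracer(tracer_name)
            # Name the span after the wrapped function's qualified name.
            with tracer.start_as_current_span(func.__qualname__):
                return func(*args, **kwargs)

        return wrapper

    return decorator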
(The remaining changed files are not rendered here.)

0 comments on commit a95b9e7
