Skip to content

Commit

Permalink
Merge pull request #5162 from lindig/private/christianlin/CP-41126
Browse files Browse the repository at this point in the history
CP-33044 replace gpumon shutdown with NVML detach/attach
  • Loading branch information
robhoes authored Aug 31, 2023
2 parents 795298d + 0ae41a2 commit ca6a369
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 1 deletion.
9 changes: 9 additions & 0 deletions ocaml/xapi/xapi_globs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1159,6 +1159,10 @@ type nvidia_t4_sriov = Nvidia_T4_SRIOV | Nvidia_LEGACY | Nvidia_DEFAULT

(** Mutable global holding one of the [nvidia_t4_sriov] constructors
    ([Nvidia_T4_SRIOV], [Nvidia_LEGACY] or [Nvidia_DEFAULT]); it starts out
    as [Nvidia_DEFAULT]. *)
let nvidia_t4_sriov = ref Nvidia_DEFAULT

(** CP-41126. Selects what [with_gpumon_stopped] in xapi_gpumon.ml does:
    - [true]: detach the NVML library inside gpumon (and re-attach it
      afterwards) instead of stopping the daemon;
    - [false] (the initial value): stop gpumon.

    Set to [true] by the ["nvidia-gpumon-detach"] entry in [other_options]. *)
let nvidia_gpumon_detach = ref false

(* Mutable global, initially 3600.
   NOTE(review): going by the name this is the interval between failed-login
   alerts, presumably in seconds (3600 = one hour) - confirm against the
   code that reads it. *)
let failed_login_alert_freq = ref 3600

let other_options =
Expand Down Expand Up @@ -1470,6 +1474,11 @@ let other_options =
, (fun () -> string_of_int !max_observer_file_size)
, "The maximum size of log files for saving spans"
)
; ( "nvidia-gpumon-detach"
, Arg.Set nvidia_gpumon_detach
, (fun () -> string_of_bool !nvidia_gpumon_detach)
, "On VM start, detach the NVML library rather than stopping gpumon"
)
]

(* The options can be set with the variable xapiflags in /etc/sysconfig/xapi.
Expand Down
33 changes: 32 additions & 1 deletion ocaml/xapi/xapi_gpumon.ml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ open D

(** Name of the gpumon service, as handed to [Xapi_systemctl] by the
    [Gpumon] daemon manager defined below. *)
let gpumon = "xcp-rrdd-gpumon"

(** Local alias for [Xapi_stdext_threads.Threadext.Mutex.execute]; used as
    [with_lock mutex f] to run [f] under [mutex]. *)
let with_lock = Xapi_stdext_threads.Threadext.Mutex.execute

module Gpumon = Daemon_manager.Make (struct
let check =
Daemon_manager.Function
Expand All @@ -40,7 +42,36 @@ module Gpumon = Daemon_manager.Make (struct
Xapi_systemctl.stop ~wait_until_success:false gpumon
end)

let with_gpumon_stopped = Gpumon.with_daemon_stopped
(** Mutex taken (through [with_lock]) by [with_gpumon_stopped] when
    [Xapi_globs.nvidia_gpumon_detach] is set; it wraps the whole
    "is attached? / detach / run / attach" sequence. *)
let gpumon_m = Mutex.create ()

(** [with_gpumon_stopped ?timeout f] runs [f ()] with gpumon either stopped
    or detached from NVML, depending on [!Xapi_globs.nvidia_gpumon_detach].

    - Flag unset: delegates to [Gpumon.with_daemon_stopped ~timeout f].
    - Flag set: the work is done under [gpumon_m]. If NVML is currently
      attached it is detached, [f ()] is run, and NVML is attached again
      whether or not the detach or [f] raised. If NVML is already detached,
      [f ()] is simply run.

    @param timeout forwarded to [Gpumon.with_daemon_stopped] only; it is not
    consulted in detach mode. Defaults to [30.0].
    @return in detach mode, the result of [f ()].
    @raise Fun.Finally_raised if the re-attach performed in the [~finally]
    handler itself raises (standard [Fun.protect] behaviour). *)
let with_gpumon_stopped ?(timeout = 30.0) f =
  if not !Xapi_globs.nvidia_gpumon_detach then
    Gpumon.with_daemon_stopped ~timeout f
  else (
    debug "%s: about to acquire lock" __FUNCTION__ ;
    with_lock gpumon_m (fun () ->
        let module GPU = Gpumon_client.Client.Nvidia in
        if GPU.nvml_is_attached __FUNCTION__ then
          (* Detach, run f, and always re-attach. Be aware that both xenopsd
             and xapi call /usr/lib/nvidia/sriov-manage, which may stop
             gpumon. *)
          Fun.protect
            ~finally:(fun () ->
              debug "%s: about to attach NVML" __FUNCTION__ ;
              GPU.nvml_attach __FUNCTION__
            )
            (fun () ->
              debug "%s: about to detach NVML" __FUNCTION__ ;
              GPU.nvml_detach __FUNCTION__ ;
              f ()
            )
        else (
          (* NVML is not attached, so there is nothing to detach or restore. *)
          debug "%s: NVML is detached; nothing to do" __FUNCTION__ ;
          f ()
        )
    )
  )

module Nvidia = struct
let key = "nvidia"
Expand Down

0 comments on commit ca6a369

Please sign in to comment.