From 3702bb56f87a3f6bb8d6a08d8a94e4959df26795 Mon Sep 17 00:00:00 2001 From: val06 Date: Fri, 18 Oct 2024 18:46:42 +0300 Subject: [PATCH] [EBPF] GPU-monitoring: added nvml lib path config knob (#30263) --- cmd/system-probe/modules/gpu.go | 10 +++++++--- pkg/config/setup/system_probe.go | 7 +++++-- pkg/gpu/config.go | 22 +++++++++++++++------- pkg/gpu/probe.go | 2 +- 4 files changed, 28 insertions(+), 13 deletions(-) diff --git a/cmd/system-probe/modules/gpu.go b/cmd/system-probe/modules/gpu.go index afedbca65ca05..3cd54e27293e5 100644 --- a/cmd/system-probe/modules/gpu.go +++ b/cmd/system-probe/modules/gpu.go @@ -24,16 +24,20 @@ import ( ) var _ module.Module = &GPUMonitoringModule{} -var gpuMonitoringConfigNamespaces = []string{gpu.GPUConfigNS} +var gpuMonitoringConfigNamespaces = []string{gpu.GPUNS} // GPUMonitoring Factory var GPUMonitoring = module.Factory{ Name: config.GPUMonitoringModule, ConfigNamespaces: gpuMonitoringConfigNamespaces, Fn: func(_ *sysconfigtypes.Config, deps module.FactoryDependencies) (module.Module, error) { + + c := gpu.NewConfig() probeDeps := gpu.ProbeDependencies{ Telemetry: deps.Telemetry, - NvmlLib: nvml.New(), + //if the config parameter doesn't exist or is empty string, the default value is used as defined in go-nvml library + //(https://github.com/NVIDIA/go-nvml/blob/main/pkg/nvml/lib.go#L30) + NvmlLib: nvml.New(nvml.WithLibraryPath(c.NVMLLibraryPath)), } ret := probeDeps.NvmlLib.Init() @@ -41,7 +45,7 @@ var GPUMonitoring = module.Factory{ return nil, fmt.Errorf("unable to initialize NVML library: %v", ret) } - t, err := gpu.NewProbe(gpu.NewConfig(), probeDeps) + t, err := gpu.NewProbe(c, probeDeps) if err != nil { return nil, fmt.Errorf("unable to start GPU monitoring: %w", err) } diff --git a/pkg/config/setup/system_probe.go b/pkg/config/setup/system_probe.go index 66762a7d98fae..97d0f0bf85856 100644 --- a/pkg/config/setup/system_probe.go +++ b/pkg/config/setup/system_probe.go @@ -31,7 +31,7 @@ const ( pngNS = "ping" tracerouteNS = "traceroute" discoveryNS = "discovery" - gpuMonitoringNS = "gpu_monitoring" + gpuNS = "gpu_monitoring" defaultConnsMessageBatchSize = 600 // defaultServiceMonitoringJavaAgentArgs is default arguments that are passing to the injected java USM agent @@ -408,7 +408,10 @@ func InitSystemProbeConfig(cfg pkgconfigmodel.Config) { cfg.BindEnv("fleet_policies_dir") // GPU monitoring - cfg.BindEnvAndSetDefault(join(gpuMonitoringNS, "enabled"), false) + cfg.BindEnvAndSetDefault(join(gpuNS, "enabled"), false) + cfg.BindEnv(join(gpuNS, "nvml_lib_path")) + cfg.BindEnvAndSetDefault(join(gpuNS, "process_scan_interval_seconds"), 5) + cfg.BindEnvAndSetDefault(join(gpuNS, "initial_process_sync"), true) initCWSSystemProbeConfig(cfg) } diff --git a/pkg/gpu/config.go b/pkg/gpu/config.go index 995fbde58cbd9..d8beee7591dff 100644 --- a/pkg/gpu/config.go +++ b/pkg/gpu/config.go @@ -7,26 +7,34 @@ package gpu import ( + sysconfig "github.com/DataDog/datadog-agent/cmd/system-probe/config" + pkgconfigsetup "github.com/DataDog/datadog-agent/pkg/config/setup" "time" "github.com/DataDog/datadog-agent/pkg/ebpf" ) -// GPUConfigNS is the namespace for the GPU monitoring probe. -const GPUConfigNS = "gpu_monitoring" +// GPUNS is the namespace for the GPU monitoring probe. +const GPUNS = "gpu_monitoring" // Config holds the configuration for the GPU monitoring probe. type Config struct { - *ebpf.Config + ebpf.Config + // ScanTerminatedProcessesInterval is the interval at which the probe scans for terminated processes. ScanTerminatedProcessesInterval time.Duration - InitialProcessSync bool + // InitialProcessSync indicates whether the probe should sync the process list on startup. + InitialProcessSync bool + // NVMLLibraryPath is the path of the native libnvidia-ml.so library + NVMLLibraryPath string } // NewConfig generates a new configuration for the GPU monitoring probe. func NewConfig() *Config { + spCfg := pkgconfigsetup.SystemProbe() return &Config{ - Config: ebpf.NewConfig(), - ScanTerminatedProcessesInterval: 5 * time.Second, - InitialProcessSync: true, + Config: *ebpf.NewConfig(), + ScanTerminatedProcessesInterval: time.Duration(spCfg.GetInt(sysconfig.FullKeyPath(GPUNS, "process_scan_interval_seconds"))) * time.Second, + InitialProcessSync: spCfg.GetBool(sysconfig.FullKeyPath(GPUNS, "initial_process_sync")), + NVMLLibraryPath: spCfg.GetString(sysconfig.FullKeyPath(GPUNS, "nvml_lib_path")), } } diff --git a/pkg/gpu/probe.go b/pkg/gpu/probe.go index fa1db4aac6c6c..905120afc82a0 100644 --- a/pkg/gpu/probe.go +++ b/pkg/gpu/probe.go @@ -133,7 +133,7 @@ func startGPUProbe(buf bytecode.AssetReader, opts manager.Options, deps ProbeDep }, }, }, - EbpfConfig: cfg.Config, + EbpfConfig: &cfg.Config, PerformInitialScan: cfg.InitialProcessSync, }