From 2c4b4e906bd271b7b34a6df5887df8ac5ebd99dc Mon Sep 17 00:00:00 2001 From: Valeri Pliskin Date: Fri, 18 Oct 2024 14:18:46 +0100 Subject: [PATCH 1/4] - added nvmlLibraryPath config value to gpu module. - switched the process scan interval to a config flag with default value --- cmd/system-probe/modules/gpu.go | 8 +++++--- pkg/config/setup/system_probe.go | 6 ++++-- pkg/gpu/config.go | 20 ++++++++++++++------ 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/cmd/system-probe/modules/gpu.go b/cmd/system-probe/modules/gpu.go index afedbca65ca05..1a74d65abcb4f 100644 --- a/cmd/system-probe/modules/gpu.go +++ b/cmd/system-probe/modules/gpu.go @@ -24,16 +24,18 @@ import ( ) var _ module.Module = &GPUMonitoringModule{} -var gpuMonitoringConfigNamespaces = []string{gpu.GPUConfigNS} +var gpuMonitoringConfigNamespaces = []string{gpu.GPUNS} // GPUMonitoring Factory var GPUMonitoring = module.Factory{ Name: config.GPUMonitoringModule, ConfigNamespaces: gpuMonitoringConfigNamespaces, Fn: func(_ *sysconfigtypes.Config, deps module.FactoryDependencies) (module.Module, error) { + + c := gpu.NewConfig() probeDeps := gpu.ProbeDependencies{ Telemetry: deps.Telemetry, - NvmlLib: nvml.New(), + NvmlLib: nvml.New(nvml.WithLibraryPath(c.NVMLLibraryPath)), } ret := probeDeps.NvmlLib.Init() @@ -41,7 +43,7 @@ var GPUMonitoring = module.Factory{ return nil, fmt.Errorf("unable to initialize NVML library: %v", ret) } - t, err := gpu.NewProbe(gpu.NewConfig(), probeDeps) + t, err := gpu.NewProbe(c, probeDeps) if err != nil { return nil, fmt.Errorf("unable to start GPU monitoring: %w", err) } diff --git a/pkg/config/setup/system_probe.go b/pkg/config/setup/system_probe.go index 66762a7d98fae..c2ffdb43a75ed 100644 --- a/pkg/config/setup/system_probe.go +++ b/pkg/config/setup/system_probe.go @@ -31,7 +31,7 @@ const ( pngNS = "ping" tracerouteNS = "traceroute" discoveryNS = "discovery" - gpuMonitoringNS = "gpu_monitoring" + gpuNS = "gpu_monitoring" defaultConnsMessageBatchSize = 600 // defaultServiceMonitoringJavaAgentArgs is default arguments that are passing to the injected java USM agent @@ -408,7 +408,9 @@ func InitSystemProbeConfig(cfg pkgconfigmodel.Config) { cfg.BindEnv("fleet_policies_dir") // GPU monitoring - cfg.BindEnvAndSetDefault(join(gpuMonitoringNS, "enabled"), false) + cfg.BindEnvAndSetDefault(join(gpuNS, "enabled"), false) + cfg.BindEnv(join(gpuNS, "nvml_library_path")) + cfg.BindEnvAndSetDefault(join(gpuNS, "process_scan_interval_seconds"), 5) initCWSSystemProbeConfig(cfg) } diff --git a/pkg/gpu/config.go b/pkg/gpu/config.go index 995fbde58cbd9..47eb3844c1a64 100644 --- a/pkg/gpu/config.go +++ b/pkg/gpu/config.go @@ -7,26 +7,34 @@ package gpu import ( + sysconfig "github.com/DataDog/datadog-agent/cmd/system-probe/config" + pkgconfigsetup "github.com/DataDog/datadog-agent/pkg/config/setup" "time" "github.com/DataDog/datadog-agent/pkg/ebpf" ) -// GPUConfigNS is the namespace for the GPU monitoring probe. -const GPUConfigNS = "gpu_monitoring" +// GPUNS is the namespace for the GPU monitoring probe. +const GPUNS = "gpu_monitoring" // Config holds the configuration for the GPU monitoring probe. type Config struct { - *ebpf.Config + ebpf.Config + // ScanTerminatedProcessesInterval is the interval at which the probe scans for terminated processes. ScanTerminatedProcessesInterval time.Duration - InitialProcessSync bool + // InitialProcessSync indicates whether the probe should sync the process list on startup. + InitialProcessSync bool + // NVMLLibraryPath is the path of the native libnvidia-ml.so library + NVMLLibraryPath string } // NewConfig generates a new configuration for the GPU monitoring probe. func NewConfig() *Config { + spCfg := pkgconfigsetup.SystemProbe() return &Config{ - Config: ebpf.NewConfig(), - ScanTerminatedProcessesInterval: 5 * time.Second, + Config: *ebpf.NewConfig(), + ScanTerminatedProcessesInterval: time.Duration(spCfg.GetInt(sysconfig.FullKeyPath(GPUNS, "process_scan_interval_seconds"))) * time.Second, InitialProcessSync: true, + NVMLLibraryPath: spCfg.GetString(sysconfig.FullKeyPath(GPUNS, "nvml_library_path")), } } From 6c4e186d35bd7b58d36c1ab78fc14fdb4d140454 Mon Sep 17 00:00:00 2001 From: Valeri Pliskin Date: Fri, 18 Oct 2024 14:45:17 +0100 Subject: [PATCH 2/4] - shortened the config parameter name - added a comment --- cmd/system-probe/modules/gpu.go | 4 +++- pkg/config/setup/system_probe.go | 2 +- pkg/gpu/config.go | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cmd/system-probe/modules/gpu.go b/cmd/system-probe/modules/gpu.go index 1a74d65abcb4f..3cd54e27293e5 100644 --- a/cmd/system-probe/modules/gpu.go +++ b/cmd/system-probe/modules/gpu.go @@ -35,7 +35,9 @@ var GPUMonitoring = module.Factory{ c := gpu.NewConfig() probeDeps := gpu.ProbeDependencies{ Telemetry: deps.Telemetry, - NvmlLib: nvml.New(nvml.WithLibraryPath(c.NVMLLibraryPath)), + //if the config parameter doesn't exist or is empty string, the default value is used as defined in go-nvml library + //(https://github.com/NVIDIA/go-nvml/blob/main/pkg/nvml/lib.go#L30) + NvmlLib: nvml.New(nvml.WithLibraryPath(c.NVMLLibraryPath)), } ret := probeDeps.NvmlLib.Init() diff --git a/pkg/config/setup/system_probe.go b/pkg/config/setup/system_probe.go index c2ffdb43a75ed..5fb5ce611c69c 100644 --- a/pkg/config/setup/system_probe.go +++ b/pkg/config/setup/system_probe.go @@ -409,7 +409,7 @@ func InitSystemProbeConfig(cfg pkgconfigmodel.Config) { // GPU monitoring cfg.BindEnvAndSetDefault(join(gpuNS, "enabled"), false) - cfg.BindEnv(join(gpuNS, "nvml_library_path")) + cfg.BindEnv(join(gpuNS, "nvml_lib_path")) cfg.BindEnvAndSetDefault(join(gpuNS, "process_scan_interval_seconds"), 5) initCWSSystemProbeConfig(cfg) diff --git a/pkg/gpu/config.go b/pkg/gpu/config.go index 47eb3844c1a64..88ca0cc5388fa 100644 --- a/pkg/gpu/config.go +++ b/pkg/gpu/config.go @@ -35,6 +35,6 @@ func NewConfig() *Config { Config: *ebpf.NewConfig(), ScanTerminatedProcessesInterval: time.Duration(spCfg.GetInt(sysconfig.FullKeyPath(GPUNS, "process_scan_interval_seconds"))) * time.Second, InitialProcessSync: true, - NVMLLibraryPath: spCfg.GetString(sysconfig.FullKeyPath(GPUNS, "nvml_library_path")), + NVMLLibraryPath: spCfg.GetString(sysconfig.FullKeyPath(GPUNS, "nvml_lib_path")), } } From 5771755a386360336efa8733aae1d2c8e4e2b67b Mon Sep 17 00:00:00 2001 From: Valeri Pliskin Date: Fri, 18 Oct 2024 14:50:46 +0100 Subject: [PATCH 3/4] fixed compilation error --- pkg/gpu/probe.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/gpu/probe.go b/pkg/gpu/probe.go index fa1db4aac6c6c..905120afc82a0 100644 --- a/pkg/gpu/probe.go +++ b/pkg/gpu/probe.go @@ -133,7 +133,7 @@ func startGPUProbe(buf bytecode.AssetReader, opts manager.Options, deps ProbeDep }, }, }, - EbpfConfig: cfg.Config, + EbpfConfig: &cfg.Config, PerformInitialScan: cfg.InitialProcessSync, } From d09c912a29e2576e29e3306877ae860f44fec727 Mon Sep 17 00:00:00 2001 From: Valeri Pliskin Date: Fri, 18 Oct 2024 14:59:41 +0100 Subject: [PATCH 4/4] - changed initial_process_sync to use config flag as well --- pkg/config/setup/system_probe.go | 1 + pkg/gpu/config.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/config/setup/system_probe.go b/pkg/config/setup/system_probe.go index 5fb5ce611c69c..97d0f0bf85856 100644 --- a/pkg/config/setup/system_probe.go +++ b/pkg/config/setup/system_probe.go @@ -411,6 +411,7 @@ func InitSystemProbeConfig(cfg pkgconfigmodel.Config) { cfg.BindEnvAndSetDefault(join(gpuNS, "enabled"), false) cfg.BindEnv(join(gpuNS, "nvml_lib_path")) cfg.BindEnvAndSetDefault(join(gpuNS, "process_scan_interval_seconds"), 5) + cfg.BindEnvAndSetDefault(join(gpuNS, "initial_process_sync"), true) initCWSSystemProbeConfig(cfg) } diff --git a/pkg/gpu/config.go b/pkg/gpu/config.go index 88ca0cc5388fa..d8beee7591dff 100644 --- a/pkg/gpu/config.go +++ b/pkg/gpu/config.go @@ -34,7 +34,7 @@ func NewConfig() *Config { return &Config{ Config: *ebpf.NewConfig(), ScanTerminatedProcessesInterval: time.Duration(spCfg.GetInt(sysconfig.FullKeyPath(GPUNS, "process_scan_interval_seconds"))) * time.Second, - InitialProcessSync: true, + InitialProcessSync: spCfg.GetBool(sysconfig.FullKeyPath(GPUNS, "initial_process_sync")), NVMLLibraryPath: spCfg.GetString(sysconfig.FullKeyPath(GPUNS, "nvml_lib_path")), } }