Skip to content

Commit

Permalink
[EBPF] GPU-monitoring: added nvml lib path config knob (#30263)
Browse files Browse the repository at this point in the history
  • Loading branch information
val06 authored Oct 18, 2024
1 parent ec63d4e commit 3702bb5
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 13 deletions.
10 changes: 7 additions & 3 deletions cmd/system-probe/modules/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,28 @@ import (
)

var _ module.Module = &GPUMonitoringModule{}
var gpuMonitoringConfigNamespaces = []string{gpu.GPUConfigNS}
var gpuMonitoringConfigNamespaces = []string{gpu.GPUNS}

// GPUMonitoring Factory
var GPUMonitoring = module.Factory{
Name: config.GPUMonitoringModule,
ConfigNamespaces: gpuMonitoringConfigNamespaces,
Fn: func(_ *sysconfigtypes.Config, deps module.FactoryDependencies) (module.Module, error) {

c := gpu.NewConfig()
probeDeps := gpu.ProbeDependencies{
Telemetry: deps.Telemetry,
NvmlLib: nvml.New(),
// If the config parameter doesn't exist or is an empty string, the default value defined in the go-nvml library is used
//(https://github.com/NVIDIA/go-nvml/blob/main/pkg/nvml/lib.go#L30)
NvmlLib: nvml.New(nvml.WithLibraryPath(c.NVMLLibraryPath)),
}

ret := probeDeps.NvmlLib.Init()
if ret != nvml.SUCCESS && ret != nvml.ERROR_ALREADY_INITIALIZED {
return nil, fmt.Errorf("unable to initialize NVML library: %v", ret)
}

t, err := gpu.NewProbe(gpu.NewConfig(), probeDeps)
t, err := gpu.NewProbe(c, probeDeps)
if err != nil {
return nil, fmt.Errorf("unable to start GPU monitoring: %w", err)
}
Expand Down
7 changes: 5 additions & 2 deletions pkg/config/setup/system_probe.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ const (
pngNS = "ping"
tracerouteNS = "traceroute"
discoveryNS = "discovery"
gpuMonitoringNS = "gpu_monitoring"
gpuNS = "gpu_monitoring"
defaultConnsMessageBatchSize = 600

// defaultServiceMonitoringJavaAgentArgs is the default set of arguments passed to the injected Java USM agent
Expand Down Expand Up @@ -408,7 +408,10 @@ func InitSystemProbeConfig(cfg pkgconfigmodel.Config) {
cfg.BindEnv("fleet_policies_dir")

// GPU monitoring
cfg.BindEnvAndSetDefault(join(gpuMonitoringNS, "enabled"), false)
cfg.BindEnvAndSetDefault(join(gpuNS, "enabled"), false)
cfg.BindEnv(join(gpuNS, "nvml_lib_path"))
cfg.BindEnvAndSetDefault(join(gpuNS, "process_scan_interval_seconds"), 5)
cfg.BindEnvAndSetDefault(join(gpuNS, "initial_process_sync"), true)

initCWSSystemProbeConfig(cfg)
}
Expand Down
22 changes: 15 additions & 7 deletions pkg/gpu/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,26 +7,34 @@
package gpu

import (
sysconfig "github.com/DataDog/datadog-agent/cmd/system-probe/config"
pkgconfigsetup "github.com/DataDog/datadog-agent/pkg/config/setup"
"time"

"github.com/DataDog/datadog-agent/pkg/ebpf"
)

// GPUConfigNS is the namespace for the GPU monitoring probe.
const GPUConfigNS = "gpu_monitoring"
// GPUNS is the namespace for the GPU monitoring probe.
const GPUNS = "gpu_monitoring"

// Config holds the configuration for the GPU monitoring probe.
type Config struct {
*ebpf.Config
ebpf.Config
// ScanTerminatedProcessesInterval is the interval at which the probe scans for terminated processes.
ScanTerminatedProcessesInterval time.Duration
InitialProcessSync bool
// InitialProcessSync indicates whether the probe should sync the process list on startup.
InitialProcessSync bool
// NVMLLibraryPath is the path to the native libnvidia-ml.so library.
NVMLLibraryPath string
}

// NewConfig generates a new configuration for the GPU monitoring probe.
func NewConfig() *Config {
spCfg := pkgconfigsetup.SystemProbe()
return &Config{
Config: ebpf.NewConfig(),
ScanTerminatedProcessesInterval: 5 * time.Second,
InitialProcessSync: true,
Config: *ebpf.NewConfig(),
ScanTerminatedProcessesInterval: time.Duration(spCfg.GetInt(sysconfig.FullKeyPath(GPUNS, "process_scan_interval_seconds"))) * time.Second,
InitialProcessSync: spCfg.GetBool(sysconfig.FullKeyPath(GPUNS, "initial_process_sync")),
NVMLLibraryPath: spCfg.GetString(sysconfig.FullKeyPath(GPUNS, "nvml_lib_path")),
}
}
2 changes: 1 addition & 1 deletion pkg/gpu/probe.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ func startGPUProbe(buf bytecode.AssetReader, opts manager.Options, deps ProbeDep
},
},
},
EbpfConfig: cfg.Config,
EbpfConfig: &cfg.Config,
PerformInitialScan: cfg.InitialProcessSync,
}

Expand Down

0 comments on commit 3702bb5

Please sign in to comment.