Skip to content

Commit

Permalink
feat(inputs.nvidia-smi): Add probe_on_startup option
Browse files Browse the repository at this point in the history
There are some cases where the nvidia-smi binary might be found in PATH and be executable, but always returns a non-zero exit code when run. For various reasons, this might be expected in the environment I work in. It's thus disruptive for system logs to be polluted with an endless stream of error messages. In this situation it's preferable to check whether nvidia-smi returns a good result on plugin startup and, if not, allow the error to bubble up and be handled according to startup_error_behavior.
  • Loading branch information
LandonTClipp committed Oct 2, 2024
1 parent 640eda0 commit cd27c4a
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 9 deletions.
6 changes: 6 additions & 0 deletions plugins/inputs/nvidia_smi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ using the `startup_error_behavior` setting. Available values are:

## Optional: timeout for GPU polling
# timeout = "5s"

## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a non-zero
## exit code, the plugin will return an error. This is particularly useful
## if used in conjunction with `startup_error_behavior` to allow the plugin to be
## disabled if nvidia-smi cannot run successfully.
# probe_on_startup = false
```

### Linux
Expand Down
26 changes: 17 additions & 9 deletions plugins/inputs/nvidia_smi/nvidia_smi.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@ var sampleConfig string

// NvidiaSMI holds the methods for this plugin
// together with its TOML-configurable settings and internal state.
//
// NOTE(review): the field list below appears to interleave the pre-change and
// post-change lines of a diff, which is why several fields show up twice.
type NvidiaSMI struct {
BinPath string `toml:"bin_path"`
Timeout config.Duration `toml:"timeout"`
Log telegraf.Logger `toml:"-"`

ignorePlugin bool
once sync.Once
// BinPath is the path of the binary that Start and Gather execute.
BinPath string `toml:"bin_path"`
// Timeout bounds each execution of the binary.
Timeout config.Duration `toml:"timeout"`
// ProbeOnStartup makes Start run the binary once and return a startup
// error if that execution fails.
ProbeOnStartup bool `toml:"probe_on_startup"`
// Log is excluded from the TOML configuration.
Log telegraf.Logger `toml:"-"`

// NOTE(review): how ignorePlugin and once are used is not visible in this
// excerpt; confirm against the full Gather implementation.
ignorePlugin bool
once sync.Once
// nvidiaSMIArgs are the command-line arguments passed to the binary; keeping
// them in a field lets tests substitute a different command line.
nvidiaSMIArgs []string
}

func (*NvidiaSMI) SampleConfig() string {
Expand All @@ -47,6 +49,11 @@ func (smi *NvidiaSMI) Start(telegraf.Accumulator) error {
}
smi.BinPath = binPath
}
if smi.ProbeOnStartup {
if _, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout)); err != nil {
return &internal.StartupError{Err: err}
}
}

return nil
}
Expand All @@ -60,7 +67,7 @@ func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
}

// Construct and execute metrics query
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), time.Duration(smi.Timeout))
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout))
if err != nil {
return fmt.Errorf("calling %q failed: %w", smi.BinPath, err)
}
Expand Down Expand Up @@ -119,8 +126,9 @@ func (smi *NvidiaSMI) parse(acc telegraf.Accumulator, data []byte) error {
// init registers the plugin factory under the name "nvidia_smi". Every new
// instance starts with a default binary path, a five second timeout and the
// "-q", "-x" query arguments.
//
// NOTE(review): the literal below appears to interleave the pre-change and
// post-change lines of a diff, which is why BinPath and Timeout show up twice.
func init() {
inputs.Add("nvidia_smi", func() telegraf.Input {
return &NvidiaSMI{
BinPath: "/usr/bin/nvidia-smi",
Timeout: config.Duration(5 * time.Second),
BinPath: "/usr/bin/nvidia-smi",
Timeout: config.Duration(5 * time.Second),
// Default arguments used by both the startup probe and Gather.
nvidiaSMIArgs: []string{"-q", "-x"},
}
})
}
38 changes: 38 additions & 0 deletions plugins/inputs/nvidia_smi/nvidia_smi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,50 @@ import (
"time"

"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/models"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/require"
)

// TestOnStartupError checks how the plugin behaves when the configured binary
// exits with a non-zero code, both with and without probe_on_startup enabled.
//
// Each case runs as a named subtest so that a failure identifies the affected
// configuration and does not prevent the remaining cases from running.
func TestOnStartupError(t *testing.T) {
	tests := []struct {
		name           string
		probeOnStartup bool
	}{
		{name: "probe enabled", probeOnStartup: true},
		{name: "probe disabled", probeOnStartup: false},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// A shell that exits with a non-zero code stands in for a failing
			// nvidia-smi binary.
			plugin := &NvidiaSMI{
				BinPath:        "/bin/bash",
				ProbeOnStartup: tt.probeOnStartup,
				Timeout:        config.Duration(time.Second),
				Log:            &testutil.Logger{},
				nvidiaSMIArgs:  []string{"-c", "exit 9"},
			}
			model := models.NewRunningInput(plugin, &models.InputConfig{
				Name: "nvidia_smi",
			})
			require.NoError(t, model.Init())

			var acc testutil.Accumulator
			err := model.Start(&acc)

			// Without the probe, Start never executes the binary and must succeed.
			if !tt.probeOnStartup {
				require.NoError(t, err)
				return
			}

			// With the probe, the failure must not surface as a fatal error, and
			// a subsequent Gather must report that the plugin is not connected.
			var ferr *internal.FatalError
			require.False(t, errors.As(err, &ferr))
			require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected)
		})
	}
}

func TestErrorBehaviorDefault(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
Expand Down
6 changes: 6 additions & 0 deletions plugins/inputs/nvidia_smi/sample.conf
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@

## Optional: timeout for GPU polling
# timeout = "5s"

## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a non-zero
## exit code, the plugin will return an error. This is particularly useful
## if used in conjunction with `startup_error_behavior` to allow the plugin to be
## disabled if nvidia-smi cannot run successfully.
# probe_on_startup = false

0 comments on commit cd27c4a

Please sign in to comment.