Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Export min and max power readings from ipmi #21

Merged
merged 4 commits into from
Dec 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,15 @@ batchjob_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="
batchjob_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build.
# TYPE batchjob_exporter_build_info gauge
# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_watts_total counter
batchjob_ipmi_dcmi_watts_total{hostname=""} 332
# HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_current_watts_total counter
batchjob_ipmi_dcmi_current_watts_total{hostname=""} 332
# HELP batchjob_ipmi_dcmi_max_watts_total Maximum Power consumption in watts
# TYPE batchjob_ipmi_dcmi_max_watts_total counter
batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504
# HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts
# TYPE batchjob_ipmi_dcmi_min_watts_total counter
batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68
# HELP batchjob_memory_cache_bytes Memory cache used in bytes
# TYPE batchjob_memory_cache_bytes gauge
batchjob_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.1086208e+07
Expand Down
12 changes: 9 additions & 3 deletions pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,15 @@ batchjob_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="
batchjob_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2
# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build.
# TYPE batchjob_exporter_build_info gauge
# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_watts_total counter
batchjob_ipmi_dcmi_watts_total{hostname=""} 332
# HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_current_watts_total counter
batchjob_ipmi_dcmi_current_watts_total{hostname=""} 332
# HELP batchjob_ipmi_dcmi_max_watts_total Maximum Power consumption in watts
# TYPE batchjob_ipmi_dcmi_max_watts_total counter
batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504
# HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts
# TYPE batchjob_ipmi_dcmi_min_watts_total counter
batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68
# HELP batchjob_memory_cache_bytes Memory cache used in bytes
# TYPE batchjob_memory_cache_bytes gauge
batchjob_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
Expand Down
91 changes: 67 additions & 24 deletions pkg/collector/ipmi.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,22 @@ import (
const ipmiCollectorSubsystem = "ipmi_dcmi"

type impiCollector struct {
logger log.Logger
hostname string
execMode string
wattsMetricDesc *prometheus.Desc
logger log.Logger
hostname string
execMode string
metricDesc map[string]*prometheus.Desc
}

// Expected output from DCMI spec
// Ref: https://www.intel.com/content/dam/www/public/us/en/documents/technical-specifications/dcmi-v1-5-rev-spec.pdf
// Current Power : 164 Watts
// Minimum Power over sampling duration : 48 watts
// Maximum Power over sampling duration : 361 watts
// Average Power over sampling duration : 157 watts
// Time Stamp : 12/29/2023 - 08:58:00
// Statistics reporting time period : 1473439000 milliseconds
// Power Measurement : Active

var (
ipmiDcmiCmd = BatchJobExporterApp.Flag(
"collector.ipmi.dcmi.cmd",
Expand All @@ -36,9 +46,17 @@ var (
ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(
`^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`,
)
ipmiDCMICurrentPowerRegex = regexp.MustCompile(
`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*`,
)
// ipmiDCMIPowerReadingRegexMap holds one pattern per power reading reported
// in the IPMI DCMI output, keyed by reading type ("current", "min", "max").
// Each pattern captures the numeric text in the named group "value".
//
// The unit is matched with the character class [wW] so that both "Watts" and
// "watts" are accepted. Inside a character class "|" is a literal character,
// not alternation, so the class must not contain it.
ipmiDCMIPowerReadingRegexMap = map[string]*regexp.Regexp{
	"current": regexp.MustCompile(
		`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*[wW]atts.*`,
	),
	"min": regexp.MustCompile(
		`^Minimum Power over sampling duration\s*:\s*(?P<value>[0-9.]*)\s*[wW]atts.*`,
	),
	"max": regexp.MustCompile(
		`^Maximum Power over sampling duration\s*:\s*(?P<value>[0-9.]*)\s*[wW]atts.*`,
	),
}
)

func init() {
Expand All @@ -59,10 +77,20 @@ func NewIPMICollector(logger log.Logger) (Collector, error) {
}
}

wattsMetricDesc := prometheus.NewDesc(
prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "watts_total"),
// Initialize metricDesc map
var metricDesc = make(map[string]*prometheus.Desc, 3)
metricDesc["current"] = prometheus.NewDesc(
prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "current_watts_total"),
"Current Power consumption in watts", []string{"hostname"}, nil,
)
metricDesc["min"] = prometheus.NewDesc(
prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "min_watts_total"),
"Minimum Power consumption in watts", []string{"hostname"}, nil,
)
metricDesc["max"] = prometheus.NewDesc(
prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "max_watts_total"),
"Maximum Power consumption in watts", []string{"hostname"}, nil,
)

// Split command
cmdSlice := strings.Split(*ipmiDcmiCmd, " ")
Expand Down Expand Up @@ -91,10 +119,10 @@ func NewIPMICollector(logger log.Logger) (Collector, error) {

outside:
collector := impiCollector{
logger: logger,
hostname: hostname,
execMode: execMode,
wattsMetricDesc: wattsMetricDesc,
logger: logger,
hostname: hostname,
execMode: execMode,
metricDesc: metricDesc,
}
return &collector, nil
}
Expand Down Expand Up @@ -138,34 +166,49 @@ func (c *impiCollector) Update(ch chan<- prometheus.Metric) error {
}

// Parse power consumption from output
currentPowerConsumption, err := c.getCurrentPowerConsumption(stdOut)
powerReadings, err := c.getPowerReadings(stdOut)
if err != nil {
level.Error(c.logger).Log("msg", "Failed to parse IPMI DCMI command output", "error", err)
return err
}

// A negative reading means the power measurement is not available
if currentPowerConsumption > -1 {
ch <- prometheus.MustNewConstMetric(c.wattsMetricDesc, prometheus.CounterValue, float64(currentPowerConsumption), c.hostname)
if len(powerReadings) > 1 {
for rType, rValue := range powerReadings {
if rValue > -1 {
ch <- prometheus.MustNewConstMetric(c.metricDesc[rType], prometheus.CounterValue, float64(rValue), c.hostname)
}
}
}
return nil
}

// Get current power consumption
func (c *impiCollector) getCurrentPowerConsumption(ipmiOutput []byte) (float64, error) {
// Get current, min and max power readings
func (c *impiCollector) getPowerReadings(ipmiOutput []byte) (map[string]float64, error) {
// Check whether power measurements are available
value, err := getValue(ipmiOutput, ipmiDCMIPowerMeasurementRegex)
if err != nil {
return -1, err
return nil, err
}

// When Power Measurement in 'Active' state - we can get watts
var powerReadings = make(map[string]float64, 3)
if value == "Active" {
value, err := getValue(ipmiOutput, ipmiDCMICurrentPowerRegex)
if err != nil {
return -1, err
// Get power readings
for rType, regex := range ipmiDCMIPowerReadingRegexMap {
reading, err := getValue(ipmiOutput, regex)
if err != nil {
powerReadings[rType] = float64(-1)
continue
}
readingValue, err := strconv.ParseFloat(reading, 64)
if err != nil {
powerReadings[rType] = float64(-1)
continue
}
powerReadings[rType] = readingValue
}
return strconv.ParseFloat(value, 64)
return powerReadings, nil
}
return -1, nil
return nil, fmt.Errorf("IPMI Power readings not Active")
}
45 changes: 40 additions & 5 deletions pkg/collector/ipmi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package collector

import (
"reflect"
"testing"

"github.com/go-kit/log"
Expand All @@ -18,18 +19,52 @@ Average Power over sampling duration : 348 watts
Time Stamp : 11/03/2023 - 08:36:29
Statistics reporting time period : 2685198000 milliseconds
Power Measurement : Active

`
expectedPower = float64(332)
ipmidcmiStdoutAlt = `
Current Power : 332 watts
Minimum Power over sampling duration : 68 Watts
Maximum Power over sampling duration : 504 Watts
Average Power over sampling duration : 348 Watts
Time Stamp : 11/03/2023 - 08:36:29
Statistics reporting time period : 2685198000 milliseconds
Power Measurement : Active
`
ipmidcmiStdoutDisactive = `
Power Measurement : Disable
`
expectedPower = map[string]float64{
"current": 332,
"min": 68,
"max": 504,
}
)

func TestIpmiMetrics(t *testing.T) {
c := impiCollector{logger: log.NewNopLogger()}
value, err := c.getCurrentPowerConsumption([]byte(ipmidcmiStdout))
value, err := c.getPowerReadings([]byte(ipmidcmiStdout))
if err != nil {
t.Errorf("failed to parse IPMI DCMI output: %v", err)
}
if !reflect.DeepEqual(value, expectedPower) {
t.Fatalf("expected power %v. Got %v", expectedPower, value)
}
}

func TestIpmiMetricsAlt(t *testing.T) {
c := impiCollector{logger: log.NewNopLogger()}
value, err := c.getPowerReadings([]byte(ipmidcmiStdoutAlt))
if err != nil {
t.Errorf("failed to parse IPMI DCMI output: %v", err)
}
if value != expectedPower {
t.Fatalf("expected power %f. Got %f", expectedPower, value)
if !reflect.DeepEqual(value, expectedPower) {
t.Fatalf("expected power %v. Got %v", expectedPower, value)
}
}

// TestIpmiMetricsDisactive feeds getPowerReadings an output whose
// "Power Measurement" line is not "Active" and checks that it yields no
// readings and reports an error instead of silently succeeding.
func TestIpmiMetricsDisactive(t *testing.T) {
	c := impiCollector{logger: log.NewNopLogger()}
	value, err := c.getPowerReadings([]byte(ipmidcmiStdoutDisactive))
	// Every non-Active path returns a nil map together with an error, so a
	// nil error here would mean the inactive state went undetected.
	if err == nil {
		t.Errorf("Expected an error for inactive power measurement. Got nil")
	}
	if value != nil {
		t.Errorf("Expected nil output. Got %v", value)
	}
}