Skip to content

Commit

Permalink
feat: Export min and max power readings
Browse files Browse the repository at this point in the history
* IPMI exposes min, max and current power readings

* Previously only the current reading was exported. The min and max readings are now exported as well

* According to the DCMI spec, min and max are the minimum and maximum power values recorded since the last boot

Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri committed Dec 29, 2023
1 parent ff037cb commit c4d1640
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 35 deletions.
12 changes: 9 additions & 3 deletions pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,15 @@ batchjob_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="
batchjob_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build.
# TYPE batchjob_exporter_build_info gauge
# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_watts_total counter
batchjob_ipmi_dcmi_watts_total{hostname=""} 332
# HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_current_watts_total counter
batchjob_ipmi_dcmi_current_watts_total{hostname=""} 332
# HELP batchjob_ipmi_dcmi_max_watts_total Maximum Power consumption in watts
# TYPE batchjob_ipmi_dcmi_max_watts_total counter
batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504
# HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts
# TYPE batchjob_ipmi_dcmi_min_watts_total counter
batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68
# HELP batchjob_memory_cache_bytes Memory cache used in bytes
# TYPE batchjob_memory_cache_bytes gauge
batchjob_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.1086208e+07
Expand Down
12 changes: 9 additions & 3 deletions pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,15 @@ batchjob_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="
batchjob_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2
# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build.
# TYPE batchjob_exporter_build_info gauge
# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_watts_total counter
batchjob_ipmi_dcmi_watts_total{hostname=""} 332
# HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts
# TYPE batchjob_ipmi_dcmi_current_watts_total counter
batchjob_ipmi_dcmi_current_watts_total{hostname=""} 332
# HELP batchjob_ipmi_dcmi_max_watts_total Maximum Power consumption in watts
# TYPE batchjob_ipmi_dcmi_max_watts_total counter
batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504
# HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts
# TYPE batchjob_ipmi_dcmi_min_watts_total counter
batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68
# HELP batchjob_memory_cache_bytes Memory cache used in bytes
# TYPE batchjob_memory_cache_bytes gauge
batchjob_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0
Expand Down
91 changes: 67 additions & 24 deletions pkg/collector/ipmi.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,22 @@ import (
const ipmiCollectorSubsystem = "ipmi_dcmi"

type impiCollector struct {
logger log.Logger
hostname string
execMode string
wattsMetricDesc *prometheus.Desc
logger log.Logger
hostname string
execMode string
metricDesc map[string]*prometheus.Desc
}

// Expected output from DCMI spec
// Ref: https://www.intel.com/content/dam/www/public/us/en/documents/technical-specifications/dcmi-v1-5-rev-spec.pdf
// Current Power : 164 Watts
// Minimum Power over sampling duration : 48 watts
// Maximum Power over sampling duration : 361 watts
// Average Power over sampling duration : 157 watts
// Time Stamp : 12/29/2023 - 08:58:00
// Statistics reporting time period : 1473439000 milliseconds
// Power Measurement : Active

var (
ipmiDcmiCmd = BatchJobExporterApp.Flag(
"collector.ipmi.dcmi.cmd",
Expand All @@ -36,9 +46,17 @@ var (
ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(
`^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`,
)
ipmiDCMICurrentPowerRegex = regexp.MustCompile(
`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*`,
)
// ipmiDCMIPowerReadingRegexMap maps a reading type ("current", "min", "max")
// to the regex whose named group "value" captures that reading from the
// IPMI DCMI power reading output. The unit may be spelled "watts" or "Watts".
// The character class is [wW]; the earlier [w|W] also accepted a literal '|'.
ipmiDCMIPowerReadingRegexMap = map[string]*regexp.Regexp{
	"current": regexp.MustCompile(
		`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*[wW]atts.*`,
	),
	"min": regexp.MustCompile(
		`^Minimum Power over sampling duration\s*:\s*(?P<value>[0-9.]*)\s*[wW]atts.*`,
	),
	"max": regexp.MustCompile(
		`^Maximum Power over sampling duration\s*:\s*(?P<value>[0-9.]*)\s*[wW]atts.*`,
	),
}
)

func init() {
Expand All @@ -59,10 +77,20 @@ func NewIPMICollector(logger log.Logger) (Collector, error) {
}
}

wattsMetricDesc := prometheus.NewDesc(
prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "watts_total"),
// Initialize metricDesc map
var metricDesc = make(map[string]*prometheus.Desc, 3)
metricDesc["current"] = prometheus.NewDesc(
prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "current_watts_total"),
"Current Power consumption in watts", []string{"hostname"}, nil,
)
metricDesc["min"] = prometheus.NewDesc(
prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "min_watts_total"),
"Minimum Power consumption in watts", []string{"hostname"}, nil,
)
metricDesc["max"] = prometheus.NewDesc(
prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "max_watts_total"),
"Maximum Power consumption in watts", []string{"hostname"}, nil,
)

// Split command
cmdSlice := strings.Split(*ipmiDcmiCmd, " ")
Expand Down Expand Up @@ -91,10 +119,10 @@ func NewIPMICollector(logger log.Logger) (Collector, error) {

outside:
collector := impiCollector{
logger: logger,
hostname: hostname,
execMode: execMode,
wattsMetricDesc: wattsMetricDesc,
logger: logger,
hostname: hostname,
execMode: execMode,
metricDesc: metricDesc,
}
return &collector, nil
}
Expand Down Expand Up @@ -138,34 +166,49 @@ func (c *impiCollector) Update(ch chan<- prometheus.Metric) error {
}

// Parse power consumption from output
currentPowerConsumption, err := c.getCurrentPowerConsumption(stdOut)
powerReadings, err := c.getPowerReadings(stdOut)
if err != nil {
level.Error(c.logger).Log("msg", "Failed to parse IPMI DCMI command output", "error", err)
return err
}

// A negative returned value means the power measurement is not available
if currentPowerConsumption > -1 {
ch <- prometheus.MustNewConstMetric(c.wattsMetricDesc, prometheus.CounterValue, float64(currentPowerConsumption), c.hostname)
if len(powerReadings) > 1 {
for rType, rValue := range powerReadings {
if rValue > -1 {
ch <- prometheus.MustNewConstMetric(c.metricDesc[rType], prometheus.CounterValue, float64(rValue), c.hostname)
}
}
}
return nil
}

// Get current power consumption
func (c *impiCollector) getCurrentPowerConsumption(ipmiOutput []byte) (float64, error) {
// Get current, min and max power readings
func (c *impiCollector) getPowerReadings(ipmiOutput []byte) (map[string]float64, error) {
// Check whether Power Measurement is available
value, err := getValue(ipmiOutput, ipmiDCMIPowerMeasurementRegex)
if err != nil {
return -1, err
return nil, err
}

// When Power Measurement in 'Active' state - we can get watts
var powerReadings = make(map[string]float64, 3)
if value == "Active" {
value, err := getValue(ipmiOutput, ipmiDCMICurrentPowerRegex)
if err != nil {
return -1, err
// Get power readings
for rType, regex := range ipmiDCMIPowerReadingRegexMap {
reading, err := getValue(ipmiOutput, regex)
if err != nil {
powerReadings[rType] = float64(-1)
continue
}
readingValue, err := strconv.ParseFloat(reading, 64)
if err != nil {
powerReadings[rType] = float64(-1)
continue
}
powerReadings[rType] = readingValue
}
return strconv.ParseFloat(value, 64)
return powerReadings, nil
}
return -1, nil
return nil, fmt.Errorf("IPMI Power readings not Active")
}
45 changes: 40 additions & 5 deletions pkg/collector/ipmi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
package collector

import (
"reflect"
"testing"

"github.com/go-kit/log"
Expand All @@ -18,18 +19,52 @@ Average Power over sampling duration : 348 watts
Time Stamp : 11/03/2023 - 08:36:29
Statistics reporting time period : 2685198000 milliseconds
Power Measurement : Active
`
expectedPower = float64(332)
ipmidcmiStdoutAlt = `
Current Power : 332 watts
Minimum Power over sampling duration : 68 Watts
Maximum Power over sampling duration : 504 Watts
Average Power over sampling duration : 348 Watts
Time Stamp : 11/03/2023 - 08:36:29
Statistics reporting time period : 2685198000 milliseconds
Power Measurement : Active
`
ipmidcmiStdoutDisactive = `
Power Measurement : Disable
`
expectedPower = map[string]float64{
"current": 332,
"min": 68,
"max": 504,
}
)

func TestIpmiMetrics(t *testing.T) {
c := impiCollector{logger: log.NewNopLogger()}
value, err := c.getCurrentPowerConsumption([]byte(ipmidcmiStdout))
value, err := c.getPowerReadings([]byte(ipmidcmiStdout))
if err != nil {
t.Errorf("failed to parse IPMI DCMI output: %v", err)
}
if !reflect.DeepEqual(value, expectedPower) {
t.Fatalf("expected power %v. Got %v", expectedPower, value)
}
}

func TestIpmiMetricsAlt(t *testing.T) {
c := impiCollector{logger: log.NewNopLogger()}
value, err := c.getPowerReadings([]byte(ipmidcmiStdoutAlt))
if err != nil {
t.Errorf("failed to parse IPMI DCMI output: %v", err)
}
if value != expectedPower {
t.Fatalf("expected power %f. Got %f", expectedPower, value)
if !reflect.DeepEqual(value, expectedPower) {
t.Fatalf("expected power %v. Got %v", expectedPower, value)
}
}

// TestIpmiMetricsDisactive checks that getPowerReadings returns no readings
// and reports an error when the "Power Measurement" line of the DCMI output
// is not in the Active state.
func TestIpmiMetricsDisactive(t *testing.T) {
	c := impiCollector{logger: log.NewNopLogger()}
	value, err := c.getPowerReadings([]byte(ipmidcmiStdoutDisactive))
	// Every non-Active path in getPowerReadings returns a non-nil error, so
	// a nil error here means the inactive state went undetected.
	if err == nil {
		t.Errorf("Expected an error for inactive power measurement. Got nil")
	}
	if value != nil {
		t.Errorf("Expected nil output. Got %v", value)
	}
}

0 comments on commit c4d1640

Please sign in to comment.