diff --git a/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt b/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt index ae630621..c4cbc166 100644 --- a/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt +++ b/pkg/collector/fixtures/e2e-test-cgroupsv1-output.txt @@ -12,9 +12,15 @@ batchjob_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid=" batchjob_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. # TYPE batchjob_exporter_build_info gauge -# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts -# TYPE batchjob_ipmi_dcmi_watts_total counter -batchjob_ipmi_dcmi_watts_total{hostname=""} 332 +# HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts +# TYPE batchjob_ipmi_dcmi_current_watts_total counter +batchjob_ipmi_dcmi_current_watts_total{hostname=""} 332 +# HELP batchjob_ipmi_dcmi_max_watts_total Maximum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_max_watts_total counter +batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504 +# HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_min_watts_total counter +batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68 # HELP batchjob_memory_cache_bytes Memory cache used in bytes # TYPE batchjob_memory_cache_bytes gauge batchjob_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2.1086208e+07 diff --git a/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt b/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt index a4113b05..89de42b5 100644 --- a/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt +++ 
b/pkg/collector/fixtures/e2e-test-cgroupsv2-output.txt @@ -12,9 +12,15 @@ batchjob_cpu_user_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid=" batchjob_cpus{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 2 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. # TYPE batchjob_exporter_build_info gauge -# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts -# TYPE batchjob_ipmi_dcmi_watts_total counter -batchjob_ipmi_dcmi_watts_total{hostname=""} 332 +# HELP batchjob_ipmi_dcmi_current_watts_total Current Power consumption in watts +# TYPE batchjob_ipmi_dcmi_current_watts_total counter +batchjob_ipmi_dcmi_current_watts_total{hostname=""} 332 +# HELP batchjob_ipmi_dcmi_max_watts_total Maximum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_max_watts_total counter +batchjob_ipmi_dcmi_max_watts_total{hostname=""} 504 +# HELP batchjob_ipmi_dcmi_min_watts_total Minimum Power consumption in watts +# TYPE batchjob_ipmi_dcmi_min_watts_total counter +batchjob_ipmi_dcmi_min_watts_total{hostname=""} 68 # HELP batchjob_memory_cache_bytes Memory cache used in bytes # TYPE batchjob_memory_cache_bytes gauge batchjob_memory_cache_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuuid="ac28caf5-ce6c-35f6-73fb-47d9d43f7780",step="",task=""} 0 diff --git a/pkg/collector/ipmi.go b/pkg/collector/ipmi.go index 6be7f57e..91b700bd 100644 --- a/pkg/collector/ipmi.go +++ b/pkg/collector/ipmi.go @@ -22,12 +22,22 @@ import ( const ipmiCollectorSubsystem = "ipmi_dcmi" type impiCollector struct { - logger log.Logger - hostname string - execMode string - wattsMetricDesc *prometheus.Desc + logger log.Logger + hostname string + execMode string + metricDesc map[string]*prometheus.Desc } +// Expected output from DCMI spec 
+// Ref: https://www.intel.com/content/dam/www/public/us/en/documents/technical-specifications/dcmi-v1-5-rev-spec.pdf +// Current Power : 164 Watts +// Minimum Power over sampling duration : 48 watts +// Maximum Power over sampling duration : 361 watts +// Average Power over sampling duration : 157 watts +// Time Stamp : 12/29/2023 - 08:58:00 +// Statistics reporting time period : 1473439000 milliseconds +// Power Measurement : Active + var ( ipmiDcmiCmd = BatchJobExporterApp.Flag( "collector.ipmi.dcmi.cmd", @@ -36,9 +46,17 @@ var ( ipmiDCMIPowerMeasurementRegex = regexp.MustCompile( `^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`, ) - ipmiDCMICurrentPowerRegex = regexp.MustCompile( - `^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*`, - ) + ipmiDCMIPowerReadingRegexMap = map[string]*regexp.Regexp{ + "current": regexp.MustCompile( + `^Current Power\s*:\s*(?P<value>[0-9.]*)\s*[w|W]atts.*`, + ), + "min": regexp.MustCompile( + `^Minimum Power over sampling duration\s*:\s*(?P<value>[0-9.]*)\s*[w|W]atts.*`, + ), + "max": regexp.MustCompile( + `^Maximum Power over sampling duration\s*:\s*(?P<value>[0-9.]*)\s*[w|W]atts.*`, + ), + } ) func init() { @@ -59,10 +77,20 @@ func NewIPMICollector(logger log.Logger) (Collector, error) { } } - wattsMetricDesc := prometheus.NewDesc( - prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "watts_total"), + // Initialize metricDesc map + var metricDesc = make(map[string]*prometheus.Desc, 3) + metricDesc["current"] = prometheus.NewDesc( + prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "current_watts_total"), "Current Power consumption in watts", []string{"hostname"}, nil, ) + metricDesc["min"] = prometheus.NewDesc( + prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "min_watts_total"), + "Minimum Power consumption in watts", []string{"hostname"}, nil, + ) + metricDesc["max"] = prometheus.NewDesc( + prometheus.BuildFQName(Namespace, ipmiCollectorSubsystem, "max_watts_total"), + "Maximum Power consumption in watts", 
[]string{"hostname"}, nil, + ) // Split command cmdSlice := strings.Split(*ipmiDcmiCmd, " ") @@ -91,10 +119,10 @@ func NewIPMICollector(logger log.Logger) (Collector, error) { outside: collector := impiCollector{ - logger: logger, - hostname: hostname, - execMode: execMode, - wattsMetricDesc: wattsMetricDesc, + logger: logger, + hostname: hostname, + execMode: execMode, + metricDesc: metricDesc, } return &collector, nil } @@ -138,34 +166,49 @@ func (c *impiCollector) Update(ch chan<- prometheus.Metric) error { } // Parse power consumption from output - currentPowerConsumption, err := c.getCurrentPowerConsumption(stdOut) + powerReadings, err := c.getPowerReadings(stdOut) if err != nil { level.Error(c.logger).Log("msg", "Failed to parse IPMI DCMI command output", "error", err) return err } // Returned value negative == Power Measurement is not avail - if currentPowerConsumption > -1 { - ch <- prometheus.MustNewConstMetric(c.wattsMetricDesc, prometheus.CounterValue, float64(currentPowerConsumption), c.hostname) + if len(powerReadings) > 1 { + for rType, rValue := range powerReadings { + if rValue > -1 { + ch <- prometheus.MustNewConstMetric(c.metricDesc[rType], prometheus.CounterValue, float64(rValue), c.hostname) + } + } } return nil } -// Get current power consumption -func (c *impiCollector) getCurrentPowerConsumption(ipmiOutput []byte) (float64, error) { +// Get current, min and max power readings +func (c *impiCollector) getPowerReadings(ipmiOutput []byte) (map[string]float64, error) { // Check for Power Measurement are avail value, err := getValue(ipmiOutput, ipmiDCMIPowerMeasurementRegex) if err != nil { - return -1, err + return nil, err } // When Power Measurement in 'Active' state - we can get watts + var powerReadings = make(map[string]float64, 3) if value == "Active" { - value, err := getValue(ipmiOutput, ipmiDCMICurrentPowerRegex) - if err != nil { - return -1, err + // Get power readings + for rType, regex := range ipmiDCMIPowerReadingRegexMap { + 
reading, err := getValue(ipmiOutput, regex) + if err != nil { + powerReadings[rType] = float64(-1) + continue + } + readingValue, err := strconv.ParseFloat(reading, 64) + if err != nil { + powerReadings[rType] = float64(-1) + continue + } + powerReadings[rType] = readingValue } - return strconv.ParseFloat(value, 64) + return powerReadings, nil } - return -1, nil + return nil, fmt.Errorf("IPMI Power readings not Active") } diff --git a/pkg/collector/ipmi_test.go b/pkg/collector/ipmi_test.go index c7996ebf..bfce6d17 100644 --- a/pkg/collector/ipmi_test.go +++ b/pkg/collector/ipmi_test.go @@ -4,6 +4,7 @@ package collector import ( + "reflect" "testing" "github.com/go-kit/log" @@ -18,18 +19,52 @@ Average Power over sampling duration : 348 watts Time Stamp : 11/03/2023 - 08:36:29 Statistics reporting time period : 2685198000 milliseconds Power Measurement : Active - ` - expectedPower = float64(332) + ipmidcmiStdoutAlt = ` +Current Power : 332 watts +Minimum Power over sampling duration : 68 Watts +Maximum Power over sampling duration : 504 Watts +Average Power over sampling duration : 348 Watts +Time Stamp : 11/03/2023 - 08:36:29 +Statistics reporting time period : 2685198000 milliseconds +Power Measurement : Active +` + ipmidcmiStdoutDisactive = ` +Power Measurement : Disable +` + expectedPower = map[string]float64{ + "current": 332, + "min": 68, + "max": 504, + } ) func TestIpmiMetrics(t *testing.T) { c := impiCollector{logger: log.NewNopLogger()} - value, err := c.getCurrentPowerConsumption([]byte(ipmidcmiStdout)) + value, err := c.getPowerReadings([]byte(ipmidcmiStdout)) + if err != nil { + t.Errorf("failed to parse IPMI DCMI output: %v", err) + } + if !reflect.DeepEqual(value, expectedPower) { + t.Fatalf("expected power %v. 
Got %v", expectedPower, value) + } +} + +func TestIpmiMetricsAlt(t *testing.T) { + c := impiCollector{logger: log.NewNopLogger()} + value, err := c.getPowerReadings([]byte(ipmidcmiStdoutAlt)) if err != nil { t.Errorf("failed to parse IPMI DCMI output: %v", err) } - if value != expectedPower { - t.Fatalf("expected power %f. Got %f", expectedPower, value) + if !reflect.DeepEqual(value, expectedPower) { + t.Fatalf("expected power %v. Got %v", expectedPower, value) + } +} + +func TestIpmiMetricsDisactive(t *testing.T) { + c := impiCollector{logger: log.NewNopLogger()} + value, _ := c.getPowerReadings([]byte(ipmidcmiStdoutDisactive)) + if value != nil { + t.Errorf("Expected nil output. Got %v", value) } }