Skip to content

Commit

Permalink
feat: Export total available system memory as a new time series
Browse files Browse the repository at this point in the history
* It will be used to estimate job-level DRAM energy usage

* Update test fixtures

Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri committed Jan 6, 2024
1 parent 3c1209e commit 3cd0284
Show file tree
Hide file tree
Showing 8 changed files with 36 additions and 8 deletions.
3 changes: 3 additions & 0 deletions pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="test
# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes
# TYPE batchjob_slurm_job_memory_used_bytes gauge
batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.0194048e+07
# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes
# TYPE batchjob_slurm_job_system_memory_total_bytes gauge
batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6042172416e+10
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ batchjob_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/fix
batchjob_scrape_collector_success{collector="ipmi_dcmi"} 1
batchjob_scrape_collector_success{collector="rapl"} 1
batchjob_scrape_collector_success{collector="slurm_job"} 1
# HELP batchjob_slurm_job_cpu_psi_seconds Cumulative CPU PSI seconds
# HELP batchjob_slurm_job_cpu_psi_seconds Cumulative CPU PSI in seconds
# TYPE batchjob_slurm_job_cpu_psi_seconds gauge
batchjob_slurm_job_cpu_psi_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0
# HELP batchjob_slurm_job_cpu_system_seconds Cumulative CPU system seconds
Expand All @@ -45,7 +45,7 @@ batchjob_slurm_job_memory_cache_bytes{batch="slurm",hostname="",jobaccount="test
# HELP batchjob_slurm_job_memory_fail_count Memory fail count
# TYPE batchjob_slurm_job_memory_fail_count gauge
batchjob_slurm_job_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0
# HELP batchjob_slurm_job_memory_psi_seconds Cumulative memory PSI seconds
# HELP batchjob_slurm_job_memory_psi_seconds Cumulative memory PSI in seconds
# TYPE batchjob_slurm_job_memory_psi_seconds gauge
batchjob_slurm_job_memory_psi_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0
# HELP batchjob_slurm_job_memory_rss_bytes Memory RSS used in bytes
Expand All @@ -62,10 +62,13 @@ batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testa
batchjob_slurm_job_memsw_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0
# HELP batchjob_slurm_job_memsw_total_bytes Swap total in bytes
# TYPE batchjob_slurm_job_memsw_total_bytes gauge
batchjob_slurm_job_memsw_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 1.8446744073709552e+19
batchjob_slurm_job_memsw_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 1.6378171392e+10
# HELP batchjob_slurm_job_memsw_used_bytes Swap used in bytes
# TYPE batchjob_slurm_job_memsw_used_bytes gauge
batchjob_slurm_job_memsw_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0
# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes
# TYPE batchjob_slurm_job_system_memory_total_bytes gauge
batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6378171392e+10
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="test
# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes
# TYPE batchjob_slurm_job_memory_used_bytes gauge
batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.111491072e+09
# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes
# TYPE batchjob_slurm_job_system_memory_total_bytes gauge
batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6378171392e+10
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="test
# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes
# TYPE batchjob_slurm_job_memory_used_bytes gauge
batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.111491072e+09
# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes
# TYPE batchjob_slurm_job_system_memory_total_bytes gauge
batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6378171392e+10
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="test
# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes
# TYPE batchjob_slurm_job_memory_used_bytes gauge
batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.111491072e+09
# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes
# TYPE batchjob_slurm_job_system_memory_total_bytes gauge
batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6378171392e+10
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@ batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="test
# HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes
# TYPE batchjob_slurm_job_memory_used_bytes gauge
batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.111491072e+09
# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes
# TYPE batchjob_slurm_job_system_memory_total_bytes gauge
batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6042172416e+10
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
# HELP go_goroutines Number of goroutines that currently exist.
Expand Down
19 changes: 14 additions & 5 deletions pkg/collector/slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ type slurmCollector struct {
memswTotal *prometheus.Desc
memswFailCount *prometheus.Desc
memoryPressure *prometheus.Desc
memoryAvailable *prometheus.Desc
gpuJobFlag *prometheus.Desc
collectError *prometheus.Desc
logger log.Logger
Expand Down Expand Up @@ -174,9 +175,10 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) {
// Get total memory of instance
var memTotal float64
memInfo, err := GetMemInfo()
if err != nil {
level.Info(logger).Log("msg", "Failed to get total memory of the host", "err", err)
memTotal = memInfo["MemTotal"]
if err == nil {
memTotal = memInfo["MemTotal_bytes"]
} else {
level.Error(logger).Log("msg", "Failed to get total memory of the host", "err", err)
}
return &slurmCollector{
cgroups: cgroupsVersion,
Expand Down Expand Up @@ -211,7 +213,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) {
),
cpuPressure: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "cpu_psi_seconds"),
"Cumulative CPU PSI seconds",
"Cumulative CPU PSI in seconds",
[]string{"batch", "hostname", "jobid", "jobuser", "jobaccount", "jobuuid", "step", "task"},
nil,
),
Expand Down Expand Up @@ -265,10 +267,16 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) {
),
memoryPressure: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memory_psi_seconds"),
"Cumulative memory PSI seconds",
"Cumulative memory PSI in seconds",
[]string{"batch", "hostname", "jobid", "jobuser", "jobaccount", "jobuuid", "step", "task"},
nil,
),
memoryAvailable: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "system_memory_total_bytes"),
"Available total system memory in bytes",
[]string{"batch", "hostname"},
nil,
),
collectError: prometheus.NewDesc(
prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "collect_error"),
"Indicates collection error, 0=no error, 1=error",
Expand Down Expand Up @@ -341,6 +349,7 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error {
ch <- prometheus.MustNewConstMetric(c.gpuJobFlag, prometheus.GaugeValue, float64(1), m.batch, c.hostname, m.jobid, m.jobuser, m.jobaccount, m.jobuuid, gpuOrdinal, uuid, uuid)
}
}
ch <- prometheus.MustNewConstMetric(c.memoryAvailable, prometheus.GaugeValue, float64(c.memTotal), "slurm", c.hostname)
return nil
}

Expand Down
1 change: 1 addition & 0 deletions scripts/e2e-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ then
./bin/batchjob_exporter \
--path.sysfs="pkg/collector/fixtures/sys" \
--path.cgroupfs="pkg/collector/fixtures/sys/fs/cgroup" \
--path.procfs="pkg/collector/fixtures/proc" \
--collector.slurm.create.unique.jobids \
--collector.slurm.job.props.path="pkg/collector/fixtures/slurmjobprops" \
--collector.slurm.gpu.type="nvidia" \
Expand Down

0 comments on commit 3cd0284

Please sign in to comment.