diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt index 535c9a76..df952772 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv1-output.txt @@ -51,6 +51,9 @@ batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="test # HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes # TYPE batchjob_slurm_job_memory_used_bytes gauge batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.0194048e+07 +# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes +# TYPE batchjob_slurm_job_system_memory_total_bytes gauge +batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6042172416e+10 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt index 8bd1292e..8c817e7a 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-all-metrics-output.txt @@ -20,7 +20,7 @@ batchjob_rapl_package_joules_total{hostname="",index="1",path="pkg/collector/fix batchjob_scrape_collector_success{collector="ipmi_dcmi"} 1 batchjob_scrape_collector_success{collector="rapl"} 1 batchjob_scrape_collector_success{collector="slurm_job"} 1 -# HELP batchjob_slurm_job_cpu_psi_seconds Cumulative CPU PSI seconds +# HELP batchjob_slurm_job_cpu_psi_seconds Cumulative CPU PSI in seconds # TYPE batchjob_slurm_job_cpu_psi_seconds gauge batchjob_slurm_job_cpu_psi_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0 # HELP batchjob_slurm_job_cpu_system_seconds Cumulative CPU system seconds @@ -45,7 +45,7 @@ batchjob_slurm_job_memory_cache_bytes{batch="slurm",hostname="",jobaccount="test # HELP batchjob_slurm_job_memory_fail_count Memory fail count # TYPE batchjob_slurm_job_memory_fail_count gauge batchjob_slurm_job_memory_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0 -# HELP batchjob_slurm_job_memory_psi_seconds Cumulative memory PSI seconds +# HELP batchjob_slurm_job_memory_psi_seconds Cumulative memory PSI in seconds # TYPE batchjob_slurm_job_memory_psi_seconds gauge batchjob_slurm_job_memory_psi_seconds{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0 # HELP batchjob_slurm_job_memory_rss_bytes Memory RSS used in bytes @@ -62,10 +62,13 @@ batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testa batchjob_slurm_job_memsw_fail_count{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0 # HELP batchjob_slurm_job_memsw_total_bytes Swap total in bytes # TYPE batchjob_slurm_job_memsw_total_bytes gauge -batchjob_slurm_job_memsw_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 1.8446744073709552e+19 +batchjob_slurm_job_memsw_total_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 1.6378171392e+10 # HELP batchjob_slurm_job_memsw_used_bytes Swap used in bytes # TYPE batchjob_slurm_job_memsw_used_bytes gauge batchjob_slurm_job_memsw_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 0 +# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes +# TYPE batchjob_slurm_job_system_memory_total_bytes gauge +batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6378171392e+10 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-amd-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-amd-output.txt index f3a7a6e7..9d182bd6 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-amd-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-amd-output.txt @@ -51,6 +51,9 @@ batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="test # HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes # TYPE batchjob_slurm_job_memory_used_bytes gauge batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.111491072e+09 +# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes +# TYPE batchjob_slurm_job_system_memory_total_bytes gauge +batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6378171392e+10 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt index b2d413f7..2ff48116 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nogpu-output.txt @@ -47,6 +47,9 @@ batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="test # HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes # TYPE batchjob_slurm_job_memory_used_bytes gauge batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.111491072e+09 +# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes +# TYPE batchjob_slurm_job_system_memory_total_bytes gauge +batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6378171392e+10 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nvidia-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nvidia-output.txt index 328c1f0a..a8121d04 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nvidia-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-nvidia-output.txt @@ -51,6 +51,9 @@ batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="test # HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes # TYPE batchjob_slurm_job_memory_used_bytes gauge batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.111491072e+09 +# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes +# TYPE batchjob_slurm_job_system_memory_total_bytes gauge +batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6378171392e+10 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt index 328c1f0a..0acd8ce9 100644 --- a/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt +++ b/pkg/collector/fixtures/output/e2e-test-cgroupsv2-procfs-output.txt @@ -51,6 +51,9 @@ batchjob_slurm_job_memory_total_bytes{batch="slurm",hostname="",jobaccount="test # HELP batchjob_slurm_job_memory_used_bytes Memory used in bytes # TYPE batchjob_slurm_job_memory_used_bytes gauge batchjob_slurm_job_memory_used_bytes{batch="slurm",hostname="",jobaccount="testacc",jobid="1009248",jobuser="testusr",jobuuid="0f0ac288-dbd4-a9a3-df3a-ab14ef9d51d5",step="",task=""} 4.111491072e+09 +# HELP batchjob_slurm_job_system_memory_total_bytes Available total system memory in bytes +# TYPE batchjob_slurm_job_system_memory_total_bytes gauge +batchjob_slurm_job_system_memory_total_bytes{batch="slurm",hostname=""} 1.6042172416e+10 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary # HELP go_goroutines Number of goroutines that currently exist. diff --git a/pkg/collector/slurm.go b/pkg/collector/slurm.go index cc807590..34148274 100644 --- a/pkg/collector/slurm.go +++ b/pkg/collector/slurm.go @@ -124,6 +124,7 @@ type slurmCollector struct { memswTotal *prometheus.Desc memswFailCount *prometheus.Desc memoryPressure *prometheus.Desc + memoryAvailable *prometheus.Desc gpuJobFlag *prometheus.Desc collectError *prometheus.Desc logger log.Logger @@ -174,9 +175,10 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { // Get total memory of instance var memTotal float64 memInfo, err := GetMemInfo() - if err != nil { - level.Info(logger).Log("msg", "Failed to get total memory of the host", "err", err) - memTotal = memInfo["MemTotal"] + if err == nil { + memTotal = memInfo["MemTotal_bytes"] + } else { + level.Error(logger).Log("msg", "Failed to get total memory of the host", "err", err) } return &slurmCollector{ cgroups: cgroupsVersion, @@ -211,7 +213,7 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { ), cpuPressure: prometheus.NewDesc( prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "cpu_psi_seconds"), - "Cumulative CPU PSI seconds", + "Cumulative CPU PSI in seconds", []string{"batch", "hostname", "jobid", "jobuser", "jobaccount", "jobuuid", "step", "task"}, nil, ), @@ -265,10 +267,16 @@ func NewSlurmCollector(logger log.Logger) (Collector, error) { ), memoryPressure: prometheus.NewDesc( prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "memory_psi_seconds"), - "Cumulative memory PSI seconds", + "Cumulative memory PSI in seconds", []string{"batch", "hostname", "jobid", "jobuser", "jobaccount", "jobuuid", "step", "task"}, nil, ), + memoryAvailable: prometheus.NewDesc( + prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "system_memory_total_bytes"), + "Available total system memory in bytes", + []string{"batch", "hostname"}, + nil, + ), collectError: prometheus.NewDesc( prometheus.BuildFQName(Namespace, slurmCollectorSubsystem, "collect_error"), "Indicates collection error, 0=no error, 1=error", @@ -341,6 +349,7 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error { ch <- prometheus.MustNewConstMetric(c.gpuJobFlag, prometheus.GaugeValue, float64(1), m.batch, c.hostname, m.jobid, m.jobuser, m.jobaccount, m.jobuuid, gpuOrdinal, uuid, uuid) } } + ch <- prometheus.MustNewConstMetric(c.memoryAvailable, prometheus.GaugeValue, float64(c.memTotal), "slurm", c.hostname) return nil } diff --git a/scripts/e2e-test.sh b/scripts/e2e-test.sh index 2206c810..f3c88bf1 100755 --- a/scripts/e2e-test.sh +++ b/scripts/e2e-test.sh @@ -178,6 +178,7 @@ then ./bin/batchjob_exporter \ --path.sysfs="pkg/collector/fixtures/sys" \ --path.cgroupfs="pkg/collector/fixtures/sys/fs/cgroup" \ + --path.procfs="pkg/collector/fixtures/proc" \ --collector.slurm.create.unique.jobids \ --collector.slurm.job.props.path="pkg/collector/fixtures/slurmjobprops" \ --collector.slurm.gpu.type="nvidia" \