Skip to content

Commit

Permalink
chore: Use int indices for devs map
Browse files Browse the repository at this point in the history
* Iteration over map is undefined in go and not reproducible

* To ensure we always have same behaviour we use int as map index and iterate over range

* This is done to avoid unit test failures as order in slice gpuOrdinals is important in cmp

Signed-off-by: Mahendra Paipuri <[email protected]>
  • Loading branch information
mahendrapaipuri committed Jan 1, 2024
1 parent 684fa4f commit 7e5ab16
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 10 deletions.
8 changes: 5 additions & 3 deletions pkg/collector/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ func LoadCgroupsV2Metrics(
// exporter simple.
//
// NOTE: Hoping this command returns MIG devices too
func GetNvidiaGPUDevices(nvidiaSmiPath string, logger log.Logger) (map[string]Device, error) {
func GetNvidiaGPUDevices(nvidiaSmiPath string, logger log.Logger) (map[int]Device, error) {
// Check if nvidia-smi binary exists
if _, err := os.Stat(nvidiaSmiPath); err != nil {
level.Error(logger).Log("msg", "Failed to open nvidia-smi executable", "path", nvidiaSmiPath, "err", err)
Expand All @@ -116,7 +116,8 @@ func GetNvidiaGPUDevices(nvidiaSmiPath string, logger log.Logger) (map[string]De
}

// Get all devices
gpuDevices := map[string]Device{}
gpuDevices := map[int]Device{}
devIndxInt := 0
for _, line := range strings.Split(string(nvidiaSmiOutput), "\n") {
// Header line, empty line and newlines are ignored
if line == "" || line == "\n" || strings.HasPrefix(line, "index") {
Expand All @@ -143,7 +144,8 @@ func GetNvidiaGPUDevices(nvidiaSmiPath string, logger log.Logger) (map[string]De
level.Debug(logger).
Log("msg", "Found nVIDIA GPU", "name", devName, "UUID", devUuid, "isMig:", isMig)

gpuDevices[devIndx] = Device{index: devIndx, name: devName, uuid: devUuid, isMig: isMig}
gpuDevices[devIndxInt] = Device{index: devIndx, name: devName, uuid: devUuid, isMig: isMig}
devIndxInt++
}
return gpuDevices, nil
}
23 changes: 19 additions & 4 deletions pkg/collector/slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ type slurmCollector struct {
cgroupsRootPath string
slurmCgroupsPath string
hostname string
nvidiaGPUDevs map[string]Device
nvidiaGPUDevs map[int]Device
cpuUser *prometheus.Desc
cpuSystem *prometheus.Desc
cpuTotal *prometheus.Desc
Expand Down Expand Up @@ -326,8 +326,12 @@ func (c *slurmCollector) Update(ch chan<- prometheus.Metric) error {
}
for _, gpuOrdinal := range m.jobGpuOrdinals {
var uuid string
if dev, ok := c.nvidiaGPUDevs[gpuOrdinal]; ok {
uuid = dev.uuid
// Check the int index of devices where gpuOrdinal == dev.index
for _, dev := range c.nvidiaGPUDevs {
if gpuOrdinal == dev.index {
uuid = dev.uuid
break
}
}
ch <- prometheus.MustNewConstMetric(c.gpuJobMap, prometheus.GaugeValue, float64(jid), m.batch, c.hostname, gpuOrdinal, uuid, uuid)
}
Expand Down Expand Up @@ -472,7 +476,18 @@ func (c *slurmCollector) getJobProperties(metric *CgroupMetric, pids []uint64) {
}

// If there are no GPUs this loop will be skipped anyways
for _, dev := range c.nvidiaGPUDevs {
// NOTE: In go loop over map is not reproducible. The order is undefined and thus
// we might end up with a situation where jobGpuOrdinals will [1 2] or [2 1] if
// current Job has two GPUs. This will fail unit tests as order in Slice is important
// in Go
//
// So we use map[int]Device to have int indices for devices which we use internally
// We are not using device index as it might be a non-integer. We are not sure about
// it but just to be safe. This will have a small overhead as we need to check the
// correct integer index for each device index. We can live with it as there are
// typically 2/4/8 GPUs per node.
for i := 0; i <= len(c.nvidiaGPUDevs); i++ {
dev := c.nvidiaGPUDevs[i]
gpuJobMapInfo := fmt.Sprintf("%s/%s", *gpuStatPath, dev.index)

// NOTE: Look for file name with UUID as it will be more appropriate with
Expand Down
6 changes: 3 additions & 3 deletions pkg/collector/slurm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ import (

var expectedSlurmMetrics CgroupMetric

func mockGPUDevices() map[string]Device {
var devs = make(map[string]Device, 4)
func mockGPUDevices() map[int]Device {
var devs = make(map[int]Device, 4)
for i := 0; i <= 4; i++ {
idxString := strconv.Itoa(i)
devs[idxString] = Device{index: idxString, uuid: fmt.Sprintf("GPU-%d", i)}
devs[i] = Device{index: idxString, uuid: fmt.Sprintf("GPU-%d", i)}
}
return devs
}
Expand Down

0 comments on commit 7e5ab16

Please sign in to comment.