From 24f79d0b5ab6bf6a2a38a51c6be929852133d246 Mon Sep 17 00:00:00 2001
From: Mahendra Paipuri <44365948+mahendrapaipuri@users.noreply.github.com>
Date: Wed, 15 Nov 2023 16:16:33 +0100
Subject: [PATCH 01/14] CircleCI Commit

---
 .circleci/config.yml | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 .circleci/config.yml

diff --git a/.circleci/config.yml b/.circleci/config.yml
new file mode 100644
index 00000000..4175da6c
--- /dev/null
+++ b/.circleci/config.yml
@@ -0,0 +1,26 @@
+# Use the latest 2.1 version of CircleCI pipeline process engine.
+# See: https://circleci.com/docs/configuration-reference
+version: 2.1
+
+# Define a job to be invoked later in a workflow.
+# See: https://circleci.com/docs/configuration-reference/#jobs
+jobs:
+  say-hello:
+    # Specify the execution environment. You can specify an image from Docker Hub or use one of our convenience images from CircleCI's Developer Hub.
+    # See: https://circleci.com/docs/configuration-reference/#executor-job
+    docker:
+      - image: cimg/base:stable
+    # Add steps to the job
+    # See: https://circleci.com/docs/configuration-reference/#steps
+    steps:
+      - checkout
+      - run:
+          name: "Say hello"
+          command: "echo Hello, World!"
+
+# Orchestrate jobs using workflows
+# See: https://circleci.com/docs/configuration-reference/#workflows
+workflows:
+  say-hello-workflow:
+    jobs:
+      - say-hello

From 0e762b2b47db6fb5ff8bad3904e0ab2b72ad7569 Mon Sep 17 00:00:00 2001
From: mahendrapaipuri
Date: Wed, 15 Nov 2023 16:32:09 +0100
Subject: [PATCH 02/14] feat: Add dockerfile

---
 Dockerfile | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..02bd26f0
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,12 @@
+ARG ARCH="amd64"
+ARG OS="linux"
+FROM quay.io/prometheus/busybox-${OS}-${ARCH}:latest
+LABEL maintainer="Mahendra Paipuri "
+
+ARG ARCH="amd64"
+ARG OS="linux"
+COPY .build/${OS}-${ARCH}/batchjob_exporter /bin/batchjob_exporter
+
+EXPOSE 9100
+USER nobody
+ENTRYPOINT [ "/bin/batchjob_exporter" ]

From 0b2880d3d3ae5681b153cd001eaf3755ba609c27 Mon Sep 17 00:00:00 2001
From: mahendrapaipuri
Date: Wed, 15 Nov 2023 16:32:24 +0100
Subject: [PATCH 03/14] ci: Use node exporter circle CI config

---
 .circleci/config.yml | 122 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 104 insertions(+), 18 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 4175da6c..67cdde1c 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,26 +1,112 @@
-# Use the latest 2.1 version of CircleCI pipeline process engine.
-# See: https://circleci.com/docs/configuration-reference
 version: 2.1
+orbs:
+  prometheus: prometheus/prometheus@0.17.1
+executors:
+  # Whenever the Go version is updated here, .promu.yml and .promu-cgo.yml
+  # should also be updated.
+  golang:
+    docker:
+      - image: cimg/go:1.21
+  arm:
+    machine:
+      image: ubuntu-2204:current
+    resource_class: arm.medium
 
-# Define a job to be invoked later in a workflow.
-# See: https://circleci.com/docs/configuration-reference/#jobs
 jobs:
-  say-hello:
-    # Specify the execution environment. You can specify an image from Docker Hub or use one of our convenience images from CircleCI's Developer Hub.
-    # See: https://circleci.com/docs/configuration-reference/#executor-job
-    docker:
-      - image: cimg/base:stable
-    # Add steps to the job
-    # See: https://circleci.com/docs/configuration-reference/#steps
+  test:
+    executor: golang
+    steps:
+      - prometheus/setup_environment
+      - run: go mod download
+      - run: make
+      - prometheus/store_artifact:
+          file: node_exporter
+  test-arm:
+    executor: arm
     steps:
-      - checkout
+      - run: uname -a
+      - run: make test-e2e
+  build:
+    machine:
+      image: ubuntu-2204:current
+    parallelism: 3
+    steps:
+      - prometheus/setup_environment
+      - run: docker run --privileged linuxkit/binfmt:af88a591f9cc896a52ce596b9cf7ca26a061ef97
+      - run: promu crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
+      - run: promu --config .promu-cgo.yml crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
+      - persist_to_workspace:
+          root: .
+          paths:
+            - .build
+      - store_artifacts:
+          path: .build
+          destination: /build
+  test_docker:
+    machine:
+      image: ubuntu-2204:current
+    environment:
+      DOCKER_TEST_IMAGE_NAME: quay.io/prometheus/golang-builder:1.18-base
+      REPO_PATH: github.com/prometheus/node_exporter
+    steps:
+      - prometheus/setup_environment
+      - attach_workspace:
+          at: .
       - run:
-          name: "Say hello"
-          command: "echo Hello, World!"
-
-# Orchestrate jobs using workflows
-# See: https://circleci.com/docs/configuration-reference/#workflows
+          command: |
+            if [ -n "$CIRCLE_TAG" ]; then
+              make docker DOCKER_IMAGE_TAG=$CIRCLE_TAG
+            else
+              make docker
+            fi
+      - run: docker images
+      - run: docker run --rm -t -v "$(pwd):/app" "${DOCKER_TEST_IMAGE_NAME}" -i "${REPO_PATH}" -T
+      - run:
+          command: |
+            if [ -n "$CIRCLE_TAG" ]; then
+              make test-docker DOCKER_IMAGE_TAG=$CIRCLE_TAG
+            else
+              make test-docker
+            fi
 workflows:
-  say-hello-workflow:
+  version: 2
+  node_exporter:
     jobs:
-      - say-hello
+      - test:
+          filters:
+            tags:
+              only: /.*/
+      - test-arm:
+          filters:
+            tags:
+              only: /.*/
+      - build:
+          filters:
+            tags:
+              only: /.*/
+      - test_docker:
+          requires:
+            - test
+            - build
+          filters:
+            tags:
+              only: /.*/
+      - prometheus/publish_master:
+          context: org-context
+          requires:
+            - test
+            - build
+          filters:
+            branches:
+              only: master
+      - prometheus/publish_release:
+          context: org-context
+          requires:
+            - test
+            - build
+          filters:
+            tags:
+              only: /^v.*/
+            branches:
+              ignore: /.*/

From b19d9556945473f52b93f7b929e175304d8a8a46 Mon Sep 17 00:00:00 2001
From: mahendrapaipuri
Date: Wed, 15 Nov 2023 16:32:48 +0100
Subject: [PATCH 04/14] chore: Fix docker repo name

---
 .gitignore      | 8 +++++++-
 Makefile.common | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index c8594976..abe603d5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,7 @@
 *.out
 
 # Dependency directories (remove the comment below to include it)
-# vendor/
+vendor/
 
 # Go workspace file
 go.work
@@ -28,3 +28,9 @@ collector/fixtures/sys
 
 # Ignore scripts
 run.sh
+
+# Ignore artifacts
+/.build
+/.deps
+/.release
+/.tarballs
diff --git a/Makefile.common b/Makefile.common
index 3761597b..7c3c140b 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -69,7 +69,7 @@ BIN_DIR ?= $(shell pwd)
 DOCKER_IMAGE_TAG    ?= $(subst /,-,$(shell git rev-parse --abbrev-ref HEAD))
 DOCKERFILE_PATH     ?= ./Dockerfile
 DOCKERBUILD_CONTEXT ?= ./
-DOCKER_REPO         ?= prom
+DOCKER_REPO         ?= mahendrapaipuri
 
 DOCKER_ARCHS ?= amd64

From ac37ad8428debbd6d933f461860e3767d1c86101 Mon Sep 17 00:00:00 2001
From: mahendrapaipuri
Date: Wed, 15 Nov 2023 16:37:10 +0100
Subject: [PATCH 05/14] ci: Don't test docker in CI

---
 .circleci/config.yml | 14 +++++++-------
 test_image.sh        | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 7 deletions(-)
 create mode 100755 test_image.sh

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 67cdde1c..24838b8c 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -85,13 +85,13 @@ workflows:
           filters:
             tags:
               only: /.*/
-      - test_docker:
-          requires:
-            - test
-            - build
-          filters:
-            tags:
-              only: /.*/
+      # - test_docker:
+      #     requires:
+      #       - test
+      #       - build
+      #     filters:
+      #       tags:
+      #         only: /.*/
       - prometheus/publish_master:
           context: org-context
           requires:
diff --git a/test_image.sh b/test_image.sh
new file mode 100755
index 00000000..5c7078ce
--- /dev/null
+++ b/test_image.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -exo pipefail
+
+docker_image=$1
+port=$2
+
+container_id=''
+
+wait_start() {
+  for i in {1..10}; do
+    if /usr/bin/curl -s -m 5 -f "http://localhost:${port}/metrics" > /dev/null; then
+      docker_cleanup
+      exit 0
+    else
+      sleep 1
+    fi
+  done
+
+  exit 1
+}
+
+docker_start() {
+  container_id=$(docker run -d -p "${port}":"${port}" "${docker_image}")
+}
+
+docker_cleanup() {
+  docker kill "${container_id}"
+}
+
+if [[ "$#" -ne 2 ]] ; then
+  echo "Usage: $0 quay.io/prometheus/batchjob-exporter:v0.1.0 9010" >&2
+  exit 1
+fi
+
+docker_start
+wait_start

From a0251830d199000e3294f214201bf46b377ac635 Mon Sep 17 00:00:00 2001
From: mahendrapaipuri
Date: Wed, 15 Nov 2023 17:04:54 +0100
Subject: [PATCH 06/14] style: Format code

---
 batchjob_exporter_test.go     |  2 +-
 collector/helper.go           |  2 +-
 collector/ipmi.go             |  8 ++++----
 collector/nvidia_gpus.go      | 18 +++++++++---------
 collector/nvidia_gpus_test.go | 10 +++++-----
 collector/rapl_test.go        |  6 +++---
 collector/slurm.go            |  8 ++++----
 collector/slurm_test.go       | 34 +++++++++++++++++-----------------
 8 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/batchjob_exporter_test.go b/batchjob_exporter_test.go
index 3457b20a..a34f0a08 100644
--- a/batchjob_exporter_test.go
+++ b/batchjob_exporter_test.go
@@ -109,4 +109,4 @@ func runCommandAndTests(cmd *exec.Cmd, address string, fn func(pid int) error) e
 		cmd.Process.Kill()
 	}
 	return err
-}
\ No newline at end of file
+}
diff --git a/collector/helper.go b/collector/helper.go
index 2c1cf9e8..1b601591 100644
--- a/collector/helper.go
+++ b/collector/helper.go
@@ -46,4 +46,4 @@ func Execute(cmd string, args []string, logger log.Logger) ([]byte, error) {
 		err = fmt.Errorf("error running %s: %s", cmd, err)
 	}
 	return out, err
-}
\ No newline at end of file
+}
diff --git a/collector/ipmi.go b/collector/ipmi.go
index 0ec78231..c33aaf59 100644
--- a/collector/ipmi.go
+++ b/collector/ipmi.go
@@ -27,9 +27,9 @@ type impiCollector struct {
 }
 
 var (
-	ipmiDcmiWrapperExec = kingpin.Flag("collector.ipmi.dcmi.wrapper.path", "Path to IPMI DCMI executable wrapper.").Default("ipmi-dcmi-wrapper").String()
-	ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(`^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`)
-	ipmiDCMICurrentPowerRegex = regexp.MustCompile(`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*`)
+	ipmiDcmiWrapperExec           = kingpin.Flag("collector.ipmi.dcmi.wrapper.path", "Path to IPMI DCMI executable wrapper.").Default("ipmi-dcmi-wrapper").String()
+	ipmiDCMIPowerMeasurementRegex = regexp.MustCompile(`^Power Measurement\s*:\s*(?P<value>Active|Not\sAvailable).*`)
+	ipmiDCMICurrentPowerRegex     = regexp.MustCompile(`^Current Power\s*:\s*(?P<value>[0-9.]*)\s*Watts.*`)
 )
 
 func init() {
@@ -45,7 +45,7 @@ func NewIpmiCollector(logger log.Logger) (Collector, error) {
 	)
 
 	collector := impiCollector{
-		logger: logger,
logger: logger, wattsMetricDesc: wattsMetricDesc, } return &collector, nil diff --git a/collector/nvidia_gpus.go b/collector/nvidia_gpus.go index ce90b26f..20aca912 100644 --- a/collector/nvidia_gpus.go +++ b/collector/nvidia_gpus.go @@ -21,15 +21,15 @@ var ( ) type Device struct { - name string - uuid string - isMig bool + name string + uuid string + isMig bool } type nvidiaGpuJobMapCollector struct { - devices []Device - logger log.Logger - gpuJobMapDesc *prometheus.Desc + devices []Device + logger log.Logger + gpuJobMapDesc *prometheus.Desc } func init() { @@ -87,9 +87,9 @@ func NewNvidiaGpuJobMapCollector(logger log.Logger) (Collector, error) { ) collector := nvidiaGpuJobMapCollector{ - devices: allDevices, - logger: logger, - gpuJobMapDesc: gpuJobMapDesc, + devices: allDevices, + logger: logger, + gpuJobMapDesc: gpuJobMapDesc, } return &collector, nil } diff --git a/collector/nvidia_gpus_test.go b/collector/nvidia_gpus_test.go index 0bce4bb6..0fb47793 100644 --- a/collector/nvidia_gpus_test.go +++ b/collector/nvidia_gpus_test.go @@ -11,11 +11,11 @@ import ( ) var ( - devices = []Device{{name: "fakeGpu1", - uuid: "GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e", - isMig: false}, {name: "fakeGpu2", - uuid: "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3", - isMig: false}} + devices = []Device{{name: "fakeGpu1", + uuid: "GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e", + isMig: false}, {name: "fakeGpu2", + uuid: "GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3", + isMig: false}} ) func TestNvidiaJobGpuMap(t *testing.T) { diff --git a/collector/rapl_test.go b/collector/rapl_test.go index ffbe7b5c..375580c1 100644 --- a/collector/rapl_test.go +++ b/collector/rapl_test.go @@ -11,7 +11,7 @@ import ( "github.com/prometheus/procfs/sysfs" ) -var expectedEnergyMetrics = []int{258218293244, 130570505826} +var expectedEnergyMetrics = []float64{258218293244, 130570505826} func TestRaplMetrics(t *testing.T) { if _, err := kingpin.CommandLine.Parse([]string{"--path.sysfs", "fixtures/sys"}); err != nil { @@ -31,8 +31,8 @@ func TestRaplMetrics(t *testing.T) { if err != nil { t.Fatalf("Cannot retrieve energy data from GetEnergyMicrojoules function: %v ", err) } - if expectedEnergyMetrics[iz] != int(microJoules) { - t.Fatalf("Expected energy value %d: Got: %d ", expectedEnergyMetrics[iz], microJoules) + if expectedEnergyMetrics[iz] != float64(microJoules) { + t.Fatalf("Expected energy value %f: Got: %f ", expectedEnergyMetrics[iz], float64(microJoules)) } } } diff --git a/collector/slurm.go b/collector/slurm.go index 3fff1d9b..8dfde3ff 100644 --- a/collector/slurm.go +++ b/collector/slurm.go @@ -23,9 +23,9 @@ import ( const slurmCollectorSubsystem = "slurm_job" var ( - cgroupV2 = false - metricLock = sync.RWMutex{} - collectJobSteps = kingpin.Flag("collector.slurm.jobsteps.metrics", "Whether to collect metrics of all slurm job steps and tasks [WARNING: This option can result in very high cardinality of metrics].").Default("false").Bool() + cgroupV2 = false + metricLock = sync.RWMutex{} + collectJobSteps = kingpin.Flag("collector.slurm.jobsteps.metrics", "Whether to collect metrics of all slurm job steps and tasks [WARNING: This option can result in very high cardinality of metrics].").Default("false").Bool() ) type CgroupMetric struct { @@ -432,4 +432,4 @@ func (c *slurmCollector) getCgroupsV2Metrics(name string) (CgroupMetric, error) } c.getInfoV2(name, &metric) return metric, nil -} \ No newline at end of file +} diff --git a/collector/slurm_test.go b/collector/slurm_test.go index 5abb061d..e4976fe3 100644 --- 
a/collector/slurm_test.go +++ b/collector/slurm_test.go @@ -20,24 +20,24 @@ func TestSlurmJobMetrics(t *testing.T) { c := slurmCollector{cgroupV2: true, logger: log.NewNopLogger()} metrics, err := c.getJobsMetrics() expectedSlurmMetrics["/system.slice/slurmstepd.scope/job_1009248"] = CgroupMetric{ - name: "/system.slice/slurmstepd.scope/job_1009248", - cpuUser: 60375.292848, - cpuSystem: 115.777502, - cpuTotal: 60491.070351, - cpus: 2, - memoryRSS: 4.098592768e+09, - memoryCache: 0, - memoryUsed: 4.111491072e+09, - memoryTotal: 4.294967296e+09, + name: "/system.slice/slurmstepd.scope/job_1009248", + cpuUser: 60375.292848, + cpuSystem: 115.777502, + cpuTotal: 60491.070351, + cpus: 2, + memoryRSS: 4.098592768e+09, + memoryCache: 0, + memoryUsed: 4.111491072e+09, + memoryTotal: 4.294967296e+09, memoryFailCount: 0, - memswUsed: 0, - memswTotal: 0, - memswFailCount: 0, - userslice: false, - uid: -1, - jobid: "1009248", - batch: "slurm", - err: false} + memswUsed: 0, + memswTotal: 0, + memswFailCount: 0, + userslice: false, + uid: -1, + jobid: "1009248", + batch: "slurm", + err: false} if err != nil { t.Fatalf("Cannot retrieve data from getJobsMetrics function: %v ", err) } From 67205d8aff948a26482e7e453006cc7e1a57be33 Mon Sep 17 00:00:00 2001 From: mahendrapaipuri Date: Wed, 15 Nov 2023 17:05:12 +0100 Subject: [PATCH 07/14] build: Ignore docker tests --- Makefile | 20 ++++++-------------- Makefile.common | 2 +- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index 43021db1..2dc700ef 100644 --- a/Makefile +++ b/Makefile @@ -49,14 +49,7 @@ endif PROMU := $(FIRST_GOPATH)/bin/promu --config $(PROMU_CONF) -e2e-out-64k-page = collector/fixtures/e2e-64k-page-output.txt -e2e-out = collector/fixtures/e2e-output.txt -ifeq ($(MACH), ppc64le) - e2e-out = $(e2e-out-64k-page) -endif -ifeq ($(MACH), aarch64) - e2e-out = $(e2e-out-64k-page) -endif +e2e-out = collector/fixtures/e2e-test-output.txt # 64bit -> 32bit mapping for cross-checking. At least for amd64/386, the 64bit CPU can execute 32bit code but not the other way around, so we don't support cross-testing upwards. cross-test = skip-test-32bit @@ -104,7 +97,7 @@ update_fixtures: .PHONY: test-e2e test-e2e: build collector/fixtures/sys/.unpacked @echo ">> running end-to-end tests" - ./end-to-end-test.sh + ./e2e-test.sh .PHONY: skip-test-e2e skip-test-e2e: @@ -114,17 +107,16 @@ skip-test-e2e: checkmetrics: $(PROMTOOL) @echo ">> checking metrics for correctness" ./checkmetrics.sh $(PROMTOOL) $(e2e-out) - ./checkmetrics.sh $(PROMTOOL) $(e2e-out-64k-page) .PHONY: checkrules checkrules: $(PROMTOOL) @echo ">> checking rules for correctness" find . 
-name "*rules*.yml" | xargs -I {} $(PROMTOOL) check rules {} -.PHONY: test-docker -test-docker: - @echo ">> testing docker image" - ./test_image.sh "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-amd64:$(DOCKER_IMAGE_TAG)" 9100 +# .PHONY: test-docker +# test-docker: +# @echo ">> testing docker image" +# ./test_image.sh "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-amd64:$(DOCKER_IMAGE_TAG)" 9100 .PHONY: promtool promtool: $(PROMTOOL) diff --git a/Makefile.common b/Makefile.common index 7c3c140b..4e536cad 100644 --- a/Makefile.common +++ b/Makefile.common @@ -92,7 +92,7 @@ endif %: common-% ; .PHONY: common-all -common-all: precheck style check_license lint yamllint unused build test +common-all: precheck style lint yamllint unused build test .PHONY: common-style common-style: From bc6358bf66521e6a3b03122b9f23f26982927d07 Mon Sep 17 00:00:00 2001 From: mahendrapaipuri Date: Wed, 15 Nov 2023 17:05:35 +0100 Subject: [PATCH 08/14] ci: Correct project name in CI config --- .circleci/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 24838b8c..c65b695e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -20,7 +20,7 @@ jobs: - run: go mod download - run: make - prometheus/store_artifact: - file: node_exporter + file: batchjob_exporter test-arm: executor: arm steps: @@ -48,7 +48,7 @@ jobs: image: ubuntu-2204:current environment: DOCKER_TEST_IMAGE_NAME: quay.io/prometheus/golang-builder:1.18-base - REPO_PATH: github.com/prometheus/node_exporter + REPO_PATH: github.com/prometheus/batchjob_exporter steps: - prometheus/setup_environment - attach_workspace: @@ -71,7 +71,7 @@ jobs: fi workflows: version: 2 - node_exporter: + batchjob_exporter: jobs: - test: filters: From 889db7a64ff6ecc9700e1468aba16983d6bb6fd2 Mon Sep 17 00:00:00 2001 From: mahendrapaipuri Date: Wed, 15 Nov 2023 17:05:45 +0100 Subject: [PATCH 09/14] ci: Check metrics names --- checkmetrics.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100755 checkmetrics.sh diff --git a/checkmetrics.sh b/checkmetrics.sh new file mode 100755 index 00000000..082a7a38 --- /dev/null +++ b/checkmetrics.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +if [[ ( -z "$1" ) || ( -z "$2" ) ]]; then + echo "usage: ./checkmetrics.sh /usr/bin/promtool e2e-test-output.txt" + exit 1 +fi + +# Ignore known issues in auto-generated and network specific collectors. 
+lint=$($1 check metrics < "$2" 2>&1 | grep -v -E "^batchjob_(memory_fail_count|memsw_fail_count)") + +if [[ -n $lint ]]; then + echo -e "Some Prometheus metrics do not follow best practices:\n" + echo "$lint" + + exit 1 +fi From 22f8136f397bf1582b54ac3ab655c181896b959c Mon Sep 17 00:00:00 2001 From: mahendrapaipuri Date: Thu, 16 Nov 2023 09:59:03 +0100 Subject: [PATCH 10/14] build: Remove darwin platform from crossbuild * Does not make sense as exporter is targeting HPC platforms --- .promu-cgo.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.promu-cgo.yml b/.promu-cgo.yml index 5bf8c819..263fd956 100644 --- a/.promu-cgo.yml +++ b/.promu-cgo.yml @@ -21,7 +21,5 @@ tarball: - NOTICE crossbuild: platforms: - - darwin/amd64 - - darwin/arm64 - netbsd/amd64 - netbsd/386 From d35beca3d9f371cb3d6755923ebb586da57fb135 Mon Sep 17 00:00:00 2001 From: mahendrapaipuri Date: Thu, 16 Nov 2023 09:59:24 +0100 Subject: [PATCH 11/14] refactor: Add log line for each device found --- collector/nvidia_gpus.go | 1 + 1 file changed, 1 insertion(+) diff --git a/collector/nvidia_gpus.go b/collector/nvidia_gpus.go index 20aca912..639622de 100644 --- a/collector/nvidia_gpus.go +++ b/collector/nvidia_gpus.go @@ -72,6 +72,7 @@ func getAllDevices(logger log.Logger) ([]Device, error) { if strings.HasPrefix(devUuid, "MIG") { isMig = true } + level.Debug(logger).Log("msg", "Found nVIDIA GPU", devName, "with UUID", devUuid, "and isMig:", isMig) allDevices = append(allDevices, Device{name: devName, uuid: devUuid, isMig: isMig}) } return allDevices, nil From 1296f6dbba6dcbe0d2fd79ab030d49bacb370540 Mon Sep 17 00:00:00 2001 From: mahendrapaipuri Date: Thu, 16 Nov 2023 10:01:47 +0100 Subject: [PATCH 12/14] test: Support testing cgroups v1 and v2 - We added two test cases for both v1 and v2 - sys.ttar file contains control groups for two cases - We check cgroups mode in e2e-test.sh and compare output with correct file --- Makefile | 32 +- .../fixtures/e2e-test-cgroupsv1-output.txt | 173 +++++ ...tput.txt => e2e-test-cgroupsv2-output.txt} | 0 collector/fixtures/sys.ttar | 726 +++++++++++++++++- collector/slurm_test.go | 43 +- e2e-test.sh | 9 +- 6 files changed, 967 insertions(+), 16 deletions(-) create mode 100644 collector/fixtures/e2e-test-cgroupsv1-output.txt rename collector/fixtures/{e2e-test-output.txt => e2e-test-cgroupsv2-output.txt} (100%) diff --git a/Makefile b/Makefile index 2dc700ef..27fe0eae 100644 --- a/Makefile +++ b/Makefile @@ -12,8 +12,10 @@ PROMTOOL_VERSION ?= 2.30.0 PROMTOOL_URL ?= https://github.com/prometheus/prometheus/releases/download/v$(PROMTOOL_VERSION)/prometheus-$(PROMTOOL_VERSION).$(GO_BUILD_PLATFORM).tar.gz PROMTOOL ?= $(FIRST_GOPATH)/bin/promtool +TEST_DOCKER ?= false DOCKER_IMAGE_NAME ?= batchjob-exporter MACH ?= $(shell uname -m) +CGROUPS_MODE ?= $([ $(stat -fc %T /sys/fs/cgroup/) = "cgroup2fs" ] && echo "unified" || ( [ -e /sys/fs/cgroup/unified/ ] && echo "hybrid" || echo "legacy")) STATICCHECK_IGNORE = @@ -23,6 +25,12 @@ else test-e2e := skip-test-e2e endif +ifeq ($(TEST_DOCKER), false) + test-docker := skip-test-docker +else + test-docker := test-docker +endif + # Use CGO for non-Linux builds. 
ifeq ($(GOOS), linux) PROMU_CONF ?= .promu.yml @@ -49,7 +57,14 @@ endif PROMU := $(FIRST_GOPATH)/bin/promu --config $(PROMU_CONF) -e2e-out = collector/fixtures/e2e-test-output.txt +e2e-cgroupsv2-out = collector/fixtures/e2e-test-cgroupsv2-output.txt +e2e-cgroupsv1-out = collector/fixtures/e2e-test-cgroupsv1-output.txt + +ifeq ($(CGROUPS_MODE), unified) + e2e-out = $(e2e-cgroupsv2-out) +else + e2e-out = $(e2e-cgroupsv1-out) +endif # 64bit -> 32bit mapping for cross-checking. At least for amd64/386, the 64bit CPU can execute 32bit code but not the other way around, so we don't support cross-testing upwards. cross-test = skip-test-32bit @@ -67,7 +82,7 @@ $(eval $(call goarch_pair,amd64,386)) $(eval $(call goarch_pair,mips64,mips)) $(eval $(call goarch_pair,mips64el,mipsel)) -all:: vet checkmetrics checkrules common-all $(cross-test) $(test-e2e) +all:: vet checkmetrics checkrules common-all $(cross-test) $(test-docker) $(test-e2e) .PHONY: test test: collector/fixtures/sys/.unpacked @@ -93,7 +108,6 @@ update_fixtures: rm -vf collector/fixtures/sys/.unpacked ./ttar -C collector/fixtures -c -f collector/fixtures/sys.ttar sys - .PHONY: test-e2e test-e2e: build collector/fixtures/sys/.unpacked @echo ">> running end-to-end tests" @@ -113,10 +127,14 @@ checkrules: $(PROMTOOL) @echo ">> checking rules for correctness" find . -name "*rules*.yml" | xargs -I {} $(PROMTOOL) check rules {} -# .PHONY: test-docker -# test-docker: -# @echo ">> testing docker image" -# ./test_image.sh "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-amd64:$(DOCKER_IMAGE_TAG)" 9100 +.PHONY: test-docker +test-docker: + @echo ">> testing docker image" + ./test_image.sh "$(DOCKER_REPO)/$(DOCKER_IMAGE_NAME)-linux-amd64:$(DOCKER_IMAGE_TAG)" 9010 + +.PHONY: skip-test-docker +skip-test-docker: + @echo ">> SKIP running docker tests" .PHONY: promtool promtool: $(PROMTOOL) diff --git a/collector/fixtures/e2e-test-cgroupsv1-output.txt b/collector/fixtures/e2e-test-cgroupsv1-output.txt new file mode 100644 index 00000000..2b959dd6 --- /dev/null +++ b/collector/fixtures/e2e-test-cgroupsv1-output.txt @@ -0,0 +1,173 @@ +# HELP batchjob_cpu_system_seconds Cumulative CPU system seconds for jobid +# TYPE batchjob_cpu_system_seconds gauge +batchjob_cpu_system_seconds{batch="slurm",jobid="1009248",step="",task=""} 0.45 +# HELP batchjob_cpu_total_seconds Cumulative CPU total seconds for jobid +# TYPE batchjob_cpu_total_seconds gauge +batchjob_cpu_total_seconds{batch="slurm",jobid="1009248",step="",task=""} 1.012410966 +# HELP batchjob_cpu_user_seconds Cumulative CPU user seconds for jobid +# TYPE batchjob_cpu_user_seconds gauge +batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task=""} 0.39 +# HELP batchjob_cpus Number of CPUs in the jobid +# TYPE batchjob_cpus gauge +batchjob_cpus{batch="slurm",jobid="1009248",step="",task=""} 0 +# HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build. 
+# TYPE batchjob_exporter_build_info gauge +batchjob_exporter_build_info{branch="",goarch="amd64",goos="linux",goversion="go1.21.4",revision="unknown",tags="unknown",version=""} 1 +# HELP batchjob_memory_cache_bytes Memory cache used in bytes +# TYPE batchjob_memory_cache_bytes gauge +batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task=""} 2.1086208e+07 +# HELP batchjob_memory_fail_count Memory fail count +# TYPE batchjob_memory_fail_count gauge +batchjob_memory_fail_count{batch="slurm",jobid="1009248",step="",task=""} 0 +# HELP batchjob_memory_rss_bytes Memory RSS used in bytes +# TYPE batchjob_memory_rss_bytes gauge +batchjob_memory_rss_bytes{batch="slurm",jobid="1009248",step="",task=""} 1.0407936e+07 +# HELP batchjob_memory_total_bytes Memory total given to jobid in bytes +# TYPE batchjob_memory_total_bytes gauge +batchjob_memory_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 2.01362030592e+11 +# HELP batchjob_memory_used_bytes Memory used in bytes +# TYPE batchjob_memory_used_bytes gauge +batchjob_memory_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.0194048e+07 +# HELP batchjob_memsw_fail_count Swap fail count +# TYPE batchjob_memsw_fail_count gauge +batchjob_memsw_fail_count{batch="slurm",jobid="1009248",step="",task=""} 0 +# HELP batchjob_memsw_total_bytes Swap total given to jobid in bytes +# TYPE batchjob_memsw_total_bytes gauge +batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 9.223372036854772e+18 +# HELP batchjob_memsw_used_bytes Swap used in bytes +# TYPE batchjob_memsw_used_bytes gauge +batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.032512e+07 +# HELP batchjob_rapl_package_joules_total Current RAPL package value in joules +# TYPE batchjob_rapl_package_joules_total counter +batchjob_rapl_package_joules_total{index="0",path="collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244 +batchjob_rapl_package_joules_total{index="1",path="collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826 +# HELP batchjob_scrape_collector_duration_seconds batchjob_exporter: Duration of a collector scrape. +# TYPE batchjob_scrape_collector_duration_seconds gauge +batchjob_scrape_collector_duration_seconds{collector="ipmi_dcmi"} 0.000502612 +batchjob_scrape_collector_duration_seconds{collector="rapl"} 0.002879363 +batchjob_scrape_collector_duration_seconds{collector="slurm_job"} 0.003768202 +# HELP batchjob_scrape_collector_success batchjob_exporter: Whether a collector succeeded. +# TYPE batchjob_scrape_collector_success gauge +batchjob_scrape_collector_success{collector="ipmi_dcmi"} 0 +batchjob_scrape_collector_success{collector="rapl"} 1 +batchjob_scrape_collector_success{collector="slurm_job"} 1 +# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. +# TYPE go_gc_duration_seconds summary +go_gc_duration_seconds{quantile="0"} 0 +go_gc_duration_seconds{quantile="0.25"} 0 +go_gc_duration_seconds{quantile="0.5"} 0 +go_gc_duration_seconds{quantile="0.75"} 0 +go_gc_duration_seconds{quantile="1"} 0 +go_gc_duration_seconds_sum 0 +go_gc_duration_seconds_count 0 +# HELP go_goroutines Number of goroutines that currently exist. +# TYPE go_goroutines gauge +go_goroutines 7 +# HELP go_info Information about the Go environment. +# TYPE go_info gauge +go_info{version="go1.21.4"} 1 +# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use. 
+# TYPE go_memstats_alloc_bytes gauge +go_memstats_alloc_bytes 681656 +# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed. +# TYPE go_memstats_alloc_bytes_total counter +go_memstats_alloc_bytes_total 681656 +# HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table. +# TYPE go_memstats_buck_hash_sys_bytes gauge +go_memstats_buck_hash_sys_bytes 1.445185e+06 +# HELP go_memstats_frees_total Total number of frees. +# TYPE go_memstats_frees_total counter +go_memstats_frees_total 246 +# HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata. +# TYPE go_memstats_gc_sys_bytes gauge +go_memstats_gc_sys_bytes 2.915544e+06 +# HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use. +# TYPE go_memstats_heap_alloc_bytes gauge +go_memstats_heap_alloc_bytes 681656 +# HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used. +# TYPE go_memstats_heap_idle_bytes gauge +go_memstats_heap_idle_bytes 1.982464e+06 +# HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use. +# TYPE go_memstats_heap_inuse_bytes gauge +go_memstats_heap_inuse_bytes 1.851392e+06 +# HELP go_memstats_heap_objects Number of allocated objects. +# TYPE go_memstats_heap_objects gauge +go_memstats_heap_objects 3643 +# HELP go_memstats_heap_released_bytes Number of heap bytes released to OS. +# TYPE go_memstats_heap_released_bytes gauge +go_memstats_heap_released_bytes 1.949696e+06 +# HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system. +# TYPE go_memstats_heap_sys_bytes gauge +go_memstats_heap_sys_bytes 3.833856e+06 +# HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection. +# TYPE go_memstats_last_gc_time_seconds gauge +go_memstats_last_gc_time_seconds 0 +# HELP go_memstats_lookups_total Total number of pointer lookups. +# TYPE go_memstats_lookups_total counter +go_memstats_lookups_total 0 +# HELP go_memstats_mallocs_total Total number of mallocs. +# TYPE go_memstats_mallocs_total counter +go_memstats_mallocs_total 3889 +# HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures. +# TYPE go_memstats_mcache_inuse_bytes gauge +go_memstats_mcache_inuse_bytes 1200 +# HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system. +# TYPE go_memstats_mcache_sys_bytes gauge +go_memstats_mcache_sys_bytes 15600 +# HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures. +# TYPE go_memstats_mspan_inuse_bytes gauge +go_memstats_mspan_inuse_bytes 33768 +# HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system. +# TYPE go_memstats_mspan_sys_bytes gauge +go_memstats_mspan_sys_bytes 65184 +# HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place. +# TYPE go_memstats_next_gc_bytes gauge +go_memstats_next_gc_bytes 4.194304e+06 +# HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations. +# TYPE go_memstats_other_sys_bytes gauge +go_memstats_other_sys_bytes 511839 +# HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator. +# TYPE go_memstats_stack_inuse_bytes gauge +go_memstats_stack_inuse_bytes 360448 +# HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator. 
+# TYPE go_memstats_stack_sys_bytes gauge +go_memstats_stack_sys_bytes 360448 +# HELP go_memstats_sys_bytes Number of bytes obtained from system. +# TYPE go_memstats_sys_bytes gauge +go_memstats_sys_bytes 9.147656e+06 +# HELP go_threads Number of OS threads created. +# TYPE go_threads gauge +go_threads 7 +# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds. +# TYPE process_cpu_seconds_total counter +process_cpu_seconds_total 0 +# HELP process_max_fds Maximum number of open file descriptors. +# TYPE process_max_fds gauge +process_max_fds 1.048576e+06 +# HELP process_open_fds Number of open file descriptors. +# TYPE process_open_fds gauge +process_open_fds 8 +# HELP process_resident_memory_bytes Resident memory size in bytes. +# TYPE process_resident_memory_bytes gauge +process_resident_memory_bytes 1.376256e+07 +# HELP process_start_time_seconds Start time of the process since unix epoch in seconds. +# TYPE process_start_time_seconds gauge +process_start_time_seconds 1.70006874948e+09 +# HELP process_virtual_memory_bytes Virtual memory size in bytes. +# TYPE process_virtual_memory_bytes gauge +process_virtual_memory_bytes 1.647849472e+09 +# HELP process_virtual_memory_max_bytes Maximum amount of virtual memory available in bytes. +# TYPE process_virtual_memory_max_bytes gauge +process_virtual_memory_max_bytes 1.8446744073709552e+19 +# HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler. +# TYPE promhttp_metric_handler_errors_total counter +promhttp_metric_handler_errors_total{cause="encoding"} 0 +promhttp_metric_handler_errors_total{cause="gathering"} 0 +# HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served. +# TYPE promhttp_metric_handler_requests_in_flight gauge +promhttp_metric_handler_requests_in_flight 1 +# HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code. 
+# TYPE promhttp_metric_handler_requests_total counter +promhttp_metric_handler_requests_total{code="200"} 0 +promhttp_metric_handler_requests_total{code="500"} 0 +promhttp_metric_handler_requests_total{code="503"} 0 diff --git a/collector/fixtures/e2e-test-output.txt b/collector/fixtures/e2e-test-cgroupsv2-output.txt similarity index 100% rename from collector/fixtures/e2e-test-output.txt rename to collector/fixtures/e2e-test-cgroupsv2-output.txt diff --git a/collector/fixtures/sys.ttar b/collector/fixtures/sys.ttar index 7f24bb27..78daa714 100644 --- a/collector/fixtures/sys.ttar +++ b/collector/fixtures/sys.ttar @@ -413,6 +413,728 @@ Mode: 775 Directory: sys/fs/cgroup Mode: 775 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/cpuacct +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/cpuacct/slurm +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/cpuacct/slurm/uid_1000 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cgroup.clone_children +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cgroup.procs +Lines: 5 +9544 +9562 +9563 +9616 +9870 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpu.cfs_period_us +Lines: 1 +100000 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpu.cfs_quota_us +Lines: 1 +-1 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpu.shares +Lines: 1 +1024 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpu.stat +Lines: 3 +nr_periods 0 +nr_throttled 0 +throttled_time 0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpuacct.stat +Lines: 2 +user 39 +system 45 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpuacct.usage +Lines: 1 +1012410966 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpuacct.usage_all +Lines: 65 +cpu user system +0 1196678 71229 +1 15514404 54098 +2 4542209 0 +3 34311942 18654226 +4 1694579 0 +5 25310180 69879 +6 9831958 10304749 +7 15456610 51624 +8 9536685 0 +9 36102975 135135 +10 7936161 508681 +11 9247682 14142 +12 4834097 802504 +13 27902500 1695238 +14 0 0 +15 12947096 537550 +16 6216078 72385 +17 5460476 337738 +18 0 0 +19 1773846 206981 +20 4300098 0 +21 996060 0 +22 6086470 28544 +23 1450661 0 +24 9226052 8540577 +25 626699 0 +26 3095099 0 +27 20635910 1528216 +28 16708670 11599918 +29 2364270 0 +30 1218227 0 +31 15519923 858952 +32 1351546 0 +33 45599413 596696 +34 8443048 330679 +35 13830826 0 +36 3206203 330195 +37 
3473800 69381 +38 41808354 1361643 +39 3060034 0 +40 14823758 9284885 +41 123661669 5981166 +42 0 0 +43 0 0 +44 3572054 780589 +45 255415820 2411920 +46 0 0 +47 8187034 50930 +48 0 0 +49 1360213 0 +50 0 0 +51 23418158 63506 +52 0 0 +53 14814933 39230 +54 0 0 +55 8628984 0 +56 0 0 +57 16282353 18125 +58 0 0 +59 2816371 72505 +60 980429 0 +61 28250255 38016 +62 0 0 +63 564950 0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpuacct.usage_percpu +Lines: 1 +1267907 15568502 4542209 52966168 1694579 25380059 20136707 15508234 9536685 36238110 8444842 9261824 5636601 29597738 0 13484646 6288463 5798214 0 1980827 4300098 996060 6115014 1450661 17766629 626699 3095099 22164126 28308588 2364270 1218227 16378875 1351546 46196109 8773727 13830826 3536398 3543181 43169997 3060034 24108643 129642835 0 0 4352643 257827740 0 8237964 0 1360213 0 23481664 0 14854163 0 8628984 0 16300478 0 2888876 980429 27761417 0 564950 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpuacct.usage_percpu_sys +Lines: 1 +71229 54098 0 18654226 0 69879 10304749 51624 0 135135 508681 14142 802504 1695238 0 537550 72385 337738 0 206981 0 0 28544 0 8540577 0 0 1528216 11599918 0 0 858952 0 596696 330679 0 330195 69381 1361643 0 9284885 5981166 0 0 780589 2411920 0 50930 0 0 0 63506 0 39230 0 0 0 18125 0 72505 0 38016 0 0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpuacct.usage_percpu_user +Lines: 1 +1196678 15514404 4542209 34311942 1694579 25310180 9831958 15456610 9536685 36102975 7936161 9247682 4834097 27902500 0 12947096 6216078 5460476 0 1773846 4300098 996060 6086470 1450661 9226052 626699 3095099 20635910 16708670 2364270 1218227 15519923 1351546 45599413 8443048 13830826 3206203 3473800 41808354 3060034 14823758 123661669 0 0 3572054 255415820 0 8187034 0 1360213 0 23418158 0 14814933 0 8628984 0 16282353 0 2816371 980429 27908262 0 564950 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpuacct.usage_sys +Lines: 1 +77501832 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/cpuacct.usage_user +Lines: 1 +934961699 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/notify_on_release +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cgroup.clone_children +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cgroup.procs +Lines: 5 +9544 +9562 +9563 +9616 +9870 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpu.cfs_period_us +Lines: 1 +100000 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: 
sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpu.cfs_quota_us +Lines: 1 +-1 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpu.shares +Lines: 1 +1024 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpu.stat +Lines: 3 +nr_periods 0 +nr_throttled 0 +throttled_time 0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpuacct.stat +Lines: 2 +user 39 +system 45 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpuacct.usage +Lines: 1 +1012410966 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpuacct.usage_all +Lines: 65 +cpu user system +0 1196678 71229 +1 15514404 54098 +2 4542209 0 +3 34311942 18654226 +4 1694579 0 +5 25310180 69879 +6 9831958 10304749 +7 15456610 51624 +8 9536685 0 +9 36102975 135135 +10 7936161 508681 +11 9247682 14142 +12 4834097 802504 +13 27902500 1695238 +14 0 0 +15 12947096 537550 +16 6216078 72385 +17 5460476 337738 +18 0 0 +19 1773846 206981 +20 4300098 0 +21 996060 0 +22 6086470 28544 +23 1450661 0 +24 9226052 8540577 +25 626699 0 +26 3095099 0 +27 20635910 1528216 +28 16708670 11599918 +29 2364270 0 +30 1218227 0 +31 15519923 858952 +32 1351546 0 +33 45599413 596696 +34 8443048 330679 +35 13830826 0 +36 3206203 330195 +37 3473800 69381 +38 41808354 1361643 +39 3060034 0 +40 14823758 9284885 +41 123661669 5981166 +42 0 0 +43 0 0 +44 3572054 780589 +45 255415820 2411920 +46 0 0 +47 8187034 50930 +48 0 0 +49 1360213 0 +50 0 0 +51 23418158 63506 +52 0 0 +53 14814933 39230 +54 0 0 +55 8628984 0 +56 0 0 +57 16282353 18125 +58 0 0 +59 2816371 72505 +60 980429 0 +61 28250255 38016 +62 0 0 +63 564950 0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpuacct.usage_percpu +Lines: 1 +1267907 15568502 4542209 52966168 1694579 25380059 20136707 15508234 9536685 36238110 8444842 9261824 5636601 29597738 0 13484646 6288463 5798214 0 1980827 4300098 996060 6115014 1450661 17766629 626699 3095099 22164126 28308588 2364270 1218227 16378875 1351546 46196109 8773727 13830826 3536398 3543181 43169997 3060034 24108643 129642835 0 0 4352643 257827740 0 8237964 0 1360213 0 23481664 0 14854163 0 8628984 0 16300478 0 2888876 980429 27761417 0 564950 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpuacct.usage_percpu_sys +Lines: 1 +71229 54098 0 18654226 0 69879 10304749 51624 0 135135 508681 14142 802504 1695238 0 537550 72385 337738 0 206981 0 0 28544 0 8540577 0 0 1528216 11599918 0 0 858952 0 596696 330679 0 330195 69381 1361643 0 9284885 5981166 0 0 780589 2411920 0 50930 0 0 0 63506 0 39230 0 0 0 18125 0 72505 0 38016 0 0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpuacct.usage_percpu_user +Lines: 1 +1196678 15514404 4542209 34311942 1694579 25310180 9831958 15456610 9536685 36102975 7936161 9247682 4834097 27902500 0 12947096 6216078 
5460476 0 1773846 4300098 996060 6086470 1450661 9226052 626699 3095099 20635910 16708670 2364270 1218227 15519923 1351546 45599413 8443048 13830826 3206203 3473800 41808354 3060034 14823758 123661669 0 0 3572054 255415820 0 8187034 0 1360213 0 23418158 0 14814933 0 8628984 0 16282353 0 2816371 980429 27908262 0 564950 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpuacct.usage_sys +Lines: 1 +77501832 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/cpuacct.usage_user +Lines: 1 +934961699 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/notify_on_release +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/step_0/tasks +Lines: 5 +9544 +9562 +9563 +9616 +9870 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/cpuacct/slurm/uid_1000/job_1009248/tasks +Lines: 5 +9544 +9562 +9563 +9616 +9870 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/memory +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/memory/slurm +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/memory/slurm/uid_1000 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/cgroup.clone_children +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/cgroup.procs +Lines: 5 +9544 +9562 +9563 +9616 +9870 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.failcnt +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.kmem.failcnt +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.kmem.limit_in_bytes +Lines: 1 +9223372036854771712 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.kmem.max_usage_in_bytes +Lines: 1 +7733248 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.kmem.slabinfo +Lines: 0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.kmem.tcp.failcnt +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.kmem.tcp.limit_in_bytes +Lines: 1 +9223372036854771712 +Mode: 644 +# ttar - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.kmem.tcp.max_usage_in_bytes +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.kmem.tcp.usage_in_bytes +Lines: 1 +0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.kmem.usage_in_bytes +Lines: 1 +7725056 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.limit_in_bytes +Lines: 1 +201362030592 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.max_usage_in_bytes +Lines: 1 +55246848 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.memsw.failcnt +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.memsw.limit_in_bytes +Lines: 1 +9223372036854771712 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.memsw.max_usage_in_bytes +Lines: 1 +55246848 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.memsw.usage_in_bytes +Lines: 1 +40325120 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.move_charge_at_immigrate +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.numa_stat +Lines: 8 +total=7854 N0=1647 N1=6050 +file=5214 N0=74 N1=5405 +anon=2640 N0=1573 N1=645 +unevictable=0 N0=0 N1=0 +hierarchical_total=7854 N0=1848 N1=6105 +hierarchical_file=5214 N0=33 N1=5181 +hierarchical_anon=2640 N0=1815 N1=924 +hierarchical_unevictable=0 N0=0 N1=0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.oom_control +Lines: 3 +oom_kill_disable 0 +under_oom 0 +oom_kill 0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.soft_limit_in_bytes +Lines: 1 +9223372036854771712 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.stat +Lines: 36 +cache 21086208 +rss 10407936 +rss_huge 0 +shmem 0 +mapped_file 0 +dirty 0 +writeback 0 +swap 0 +pgpgin 24981 +pgpgout 17590 +pgfault 43296 +pgmajfault 33 +inactive_anon 10813440 +active_anon 0 +inactive_file 20275200 +active_file 946176 +unevictable 0 +hierarchical_memory_limit 201362030592 +hierarchical_memsw_limit 9223372036854771712 +total_cache 21086208 +total_rss 10407936 +total_rss_huge 0 +total_shmem 0 +total_mapped_file 0 +total_dirty 0 +total_writeback 0 +total_swap 0 +total_pgpgin 24981 +total_pgpgout 17590 +total_pgfault 43296 +total_pgmajfault 33 +total_inactive_anon 10813440 +total_active_anon 0 
+total_inactive_file 20275200 +total_active_file 946176 +total_unevictable 0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.swappiness +Lines: 1 +60 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.usage_in_bytes +Lines: 1 +40194048 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/memory.use_hierarchy +Lines: 1 +1 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/notify_on_release +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Directory: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0 +Mode: 775 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/cgroup.clone_children +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/cgroup.procs +Lines: 5 +9544 +9562 +9563 +9616 +9870 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.failcnt +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.kmem.failcnt +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.kmem.limit_in_bytes +Lines: 1 +9223372036854771712 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.kmem.max_usage_in_bytes +Lines: 1 +7733248 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.kmem.slabinfo +Lines: 0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.kmem.tcp.failcnt +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.kmem.tcp.limit_in_bytes +Lines: 1 +9223372036854771712 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.kmem.tcp.max_usage_in_bytes +Lines: 1 +0 +Mode: 644 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.kmem.tcp.usage_in_bytes +Lines: 1 +0 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.kmem.usage_in_bytes +Lines: 1 +7725056 +Mode: 444 +# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.limit_in_bytes +Lines: 1 +201362030592 +Mode: 644 +# ttar - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.max_usage_in_bytes
+Lines: 1
+55246848
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.memsw.failcnt
+Lines: 1
+0
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.memsw.limit_in_bytes
+Lines: 1
+9223372036854771712
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.memsw.max_usage_in_bytes
+Lines: 1
+55246848
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.memsw.usage_in_bytes
+Lines: 1
+40325120
+Mode: 444
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.move_charge_at_immigrate
+Lines: 1
+0
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.numa_stat
+Lines: 8
+total=7854 N0=1647 N1=6050
+file=5214 N0=74 N1=5405
+anon=2640 N0=1573 N1=645
+unevictable=0 N0=0 N1=0
+hierarchical_total=7854 N0=1848 N1=6105
+hierarchical_file=5214 N0=33 N1=5181
+hierarchical_anon=2640 N0=1815 N1=924
+hierarchical_unevictable=0 N0=0 N1=0
+Mode: 444
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.oom_control
+Lines: 3
+oom_kill_disable 0
+under_oom 0
+oom_kill 0
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.soft_limit_in_bytes
+Lines: 1
+9223372036854771712
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.stat
+Lines: 36
+cache 21086208
+rss 10407936
+rss_huge 0
+shmem 0
+mapped_file 0
+dirty 0
+writeback 0
+swap 0
+pgpgin 24981
+pgpgout 17590
+pgfault 43296
+pgmajfault 33
+inactive_anon 10813440
+active_anon 0
+inactive_file 20275200
+active_file 946176
+unevictable 0
+hierarchical_memory_limit 201362030592
+hierarchical_memsw_limit 9223372036854771712
+total_cache 21086208
+total_rss 10407936
+total_rss_huge 0
+total_shmem 0
+total_mapped_file 0
+total_dirty 0
+total_writeback 0
+total_swap 0
+total_pgpgin 24981
+total_pgpgout 17590
+total_pgfault 43296
+total_pgmajfault 33
+total_inactive_anon 10813440
+total_active_anon 0
+total_inactive_file 20275200
+total_active_file 946176
+total_unevictable 0
+Mode: 444
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.swappiness
+Lines: 1
+60
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.usage_in_bytes
+Lines: 1
+40194048
+Mode: 444
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/memory.use_hierarchy
+Lines: 1
+1
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/notify_on_release
+Lines: 1
+0
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/step_0/tasks
+Lines: 5
+9544
+9562
+9563
+9616
+9870
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+Path: sys/fs/cgroup/memory/slurm/uid_1000/job_1009248/tasks
+Lines: 5
+9544
+9562
+9563
+9616
+9870
+Mode: 644
+# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 Directory: sys/fs/cgroup/system.slice
 Mode: 775
 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -4361,7 +5083,3 @@ Lines: 1
 max
 Mode: 640
 # ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-Path: sys/.unpacked
-Lines: 0
-Mode: 664
-# ttar - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
diff --git a/collector/slurm_test.go b/collector/slurm_test.go
index e4976fe3..64341ace 100644
--- a/collector/slurm_test.go
+++ b/collector/slurm_test.go
@@ -11,15 +11,15 @@ import (
 	"github.com/go-kit/log"
 )
 
-var expectedSlurmMetrics = make(map[string]CgroupMetric)
+var expectedSlurmMetrics CgroupMetric
 
-func TestSlurmJobMetrics(t *testing.T) {
+func TestCgroupsV2SlurmJobMetrics(t *testing.T) {
 	if _, err := kingpin.CommandLine.Parse([]string{"--path.cgroupfs", "fixtures/sys/fs/cgroup"}); err != nil {
 		t.Fatal(err)
 	}
 	c := slurmCollector{cgroupV2: true, logger: log.NewNopLogger()}
 	metrics, err := c.getJobsMetrics()
-	expectedSlurmMetrics["/system.slice/slurmstepd.scope/job_1009248"] = CgroupMetric{
+	expectedSlurmMetrics = CgroupMetric{
 		name: "/system.slice/slurmstepd.scope/job_1009248",
 		cpuUser: 60375.292848,
 		cpuSystem: 115.777502,
@@ -41,7 +41,42 @@
 	if err != nil {
 		t.Fatalf("Cannot retrieve data from getJobsMetrics function: %v ", err)
 	}
-	if !reflect.DeepEqual(metrics, expectedSlurmMetrics) {
+	if !reflect.DeepEqual(metrics["/system.slice/slurmstepd.scope/job_1009248"], expectedSlurmMetrics) {
+		t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics)
+	}
+}
+
+func TestCgroupsV1SlurmJobMetrics(t *testing.T) {
+	if _, err := kingpin.CommandLine.Parse([]string{"--path.cgroupfs", "fixtures/sys/fs/cgroup"}); err != nil {
+		t.Fatal(err)
+	}
+	c := slurmCollector{cgroupV2: false, logger: log.NewNopLogger()}
+	metrics, err := c.getJobsMetrics()
+	expectedSlurmMetrics = CgroupMetric{
+		name: "/slurm/uid_1000/job_1009248",
+		cpuUser: 0.39,
+		cpuSystem: 0.45,
+		cpuTotal: 1.012410966,
+		cpus: 0,
+		memoryRSS: 1.0407936e+07,
+		memoryCache: 2.1086208e+07,
+		memoryUsed: 4.0194048e+07,
+		memoryTotal: 2.01362030592e+11,
+		memoryFailCount: 0,
+		memswUsed: 4.032512e+07,
+		memswTotal: 9.223372036854772e+18,
+		memswFailCount: 0,
+		userslice: false,
+		uid: 1000,
+		jobid: "1009248",
+		step: "",
+		task: "",
+		batch: "slurm",
+		err: false}
+	if err != nil {
+		t.Fatalf("Cannot retrieve data from getJobsMetrics function: %v ", err)
+	}
+	if !reflect.DeepEqual(metrics["/slurm/uid_1000/job_1009248"], expectedSlurmMetrics) {
 		t.Fatalf("Expected metrics data is %+v: \nGot %+v", expectedSlurmMetrics, metrics)
 	}
 }
diff --git a/e2e-test.sh b/e2e-test.sh
index 5b714e2a..f30b332e 100755
--- a/e2e-test.sh
+++ b/e2e-test.sh
@@ -11,7 +11,14 @@ skip_re="^(go_|batchjob_exporter_build_info|batchjob_scrape_collector_duration_s
 
 arch="$(uname -m)"
 
-fixture='collector/fixtures/e2e-test-output.txt'
+cgroups_mode=$([ $(stat -fc %T /sys/fs/cgroup/) = "cgroup2fs" ] && echo "unified" || ( [ -e /sys/fs/cgroup/unified/ ] && echo "hybrid" || echo "legacy"))
+
+echo "cgroups mode detected is ${cgroups_mode}"
+
+case "${cgroups_mode}" in
+  legacy|hybrid) fixture='collector/fixtures/e2e-test-cgroupsv1-output.txt' ;;
+  *) fixture='collector/fixtures/e2e-test-cgroupsv2-output.txt' ;;
+esac
 
 keep=0; update=0; verbose=0
 while getopts 'hkuv' opt

From 7943a8052f8116931933821465bfc8cf33f82a4b Mon Sep 17 00:00:00 2001
From: mahendrapaipuri
Date: Thu, 16 Nov 2023 10:15:40 +0100
Subject: [PATCH 13/14] test: Update expected output for cgroup v1

---
 .../fixtures/e2e-test-cgroupsv1-output.txt | 54 ++++---------
 1 file changed, 9 insertions(+), 45 deletions(-)

diff --git a/collector/fixtures/e2e-test-cgroupsv1-output.txt b/collector/fixtures/e2e-test-cgroupsv1-output.txt
index 2b959dd6..07037c65 100644
--- a/collector/fixtures/e2e-test-cgroupsv1-output.txt
+++ b/collector/fixtures/e2e-test-cgroupsv1-output.txt
@@ -12,7 +12,9 @@ batchjob_cpu_user_seconds{batch="slurm",jobid="1009248",step="",task=""} 0.39
 batchjob_cpus{batch="slurm",jobid="1009248",step="",task=""} 0
 # HELP batchjob_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, goversion from which batchjob_exporter was built, and the goos and goarch for the build.
 # TYPE batchjob_exporter_build_info gauge
-batchjob_exporter_build_info{branch="",goarch="amd64",goos="linux",goversion="go1.21.4",revision="unknown",tags="unknown",version=""} 1
+# HELP batchjob_ipmi_dcmi_watts_total Current Power consumption in watts
+# TYPE batchjob_ipmi_dcmi_watts_total counter
+batchjob_ipmi_dcmi_watts_total 332
 # HELP batchjob_memory_cache_bytes Memory cache used in bytes
 # TYPE batchjob_memory_cache_bytes gauge
 batchjob_memory_cache_bytes{batch="slurm",jobid="1009248",step="",task=""} 2.1086208e+07
@@ -37,128 +39,90 @@ batchjob_memsw_total_bytes{batch="slurm",jobid="1009248",step="",task=""} 9.2233
 # HELP batchjob_memsw_used_bytes Swap used in bytes
 # TYPE batchjob_memsw_used_bytes gauge
 batchjob_memsw_used_bytes{batch="slurm",jobid="1009248",step="",task=""} 4.032512e+07
+# HELP batchjob_nvidia_gpu_jobid Batch Job ID of current nVIDIA GPU
+# TYPE batchjob_nvidia_gpu_jobid gauge
+batchjob_nvidia_gpu_jobid{uuid="GPU-61a65011-6571-a6d2-5ab8-66cbb6f7f9c3"} 11000
+batchjob_nvidia_gpu_jobid{uuid="GPU-f124aa59-d406-d45b-9481-8fcd694e6c9e"} 10000
 # HELP batchjob_rapl_package_joules_total Current RAPL package value in joules
 # TYPE batchjob_rapl_package_joules_total counter
 batchjob_rapl_package_joules_total{index="0",path="collector/fixtures/sys/class/powercap/intel-rapl:0"} 258218.293244
 batchjob_rapl_package_joules_total{index="1",path="collector/fixtures/sys/class/powercap/intel-rapl:1"} 130570.505826
 # HELP batchjob_scrape_collector_duration_seconds batchjob_exporter: Duration of a collector scrape.
 # TYPE batchjob_scrape_collector_duration_seconds gauge
-batchjob_scrape_collector_duration_seconds{collector="ipmi_dcmi"} 0.000502612
-batchjob_scrape_collector_duration_seconds{collector="rapl"} 0.002879363
-batchjob_scrape_collector_duration_seconds{collector="slurm_job"} 0.003768202
 # HELP batchjob_scrape_collector_success batchjob_exporter: Whether a collector succeeded.
# TYPE batchjob_scrape_collector_success gauge -batchjob_scrape_collector_success{collector="ipmi_dcmi"} 0 +batchjob_scrape_collector_success{collector="ipmi_dcmi"} 1 +batchjob_scrape_collector_success{collector="nvidia_gpu"} 1 batchjob_scrape_collector_success{collector="rapl"} 1 batchjob_scrape_collector_success{collector="slurm_job"} 1 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary -go_gc_duration_seconds{quantile="0"} 0 -go_gc_duration_seconds{quantile="0.25"} 0 -go_gc_duration_seconds{quantile="0.5"} 0 -go_gc_duration_seconds{quantile="0.75"} 0 -go_gc_duration_seconds{quantile="1"} 0 -go_gc_duration_seconds_sum 0 -go_gc_duration_seconds_count 0 # HELP go_goroutines Number of goroutines that currently exist. # TYPE go_goroutines gauge -go_goroutines 7 # HELP go_info Information about the Go environment. # TYPE go_info gauge -go_info{version="go1.21.4"} 1 # HELP go_memstats_alloc_bytes Number of bytes allocated and still in use. # TYPE go_memstats_alloc_bytes gauge -go_memstats_alloc_bytes 681656 # HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed. # TYPE go_memstats_alloc_bytes_total counter -go_memstats_alloc_bytes_total 681656 # HELP go_memstats_buck_hash_sys_bytes Number of bytes used by the profiling bucket hash table. # TYPE go_memstats_buck_hash_sys_bytes gauge -go_memstats_buck_hash_sys_bytes 1.445185e+06 # HELP go_memstats_frees_total Total number of frees. # TYPE go_memstats_frees_total counter -go_memstats_frees_total 246 # HELP go_memstats_gc_sys_bytes Number of bytes used for garbage collection system metadata. # TYPE go_memstats_gc_sys_bytes gauge -go_memstats_gc_sys_bytes 2.915544e+06 # HELP go_memstats_heap_alloc_bytes Number of heap bytes allocated and still in use. # TYPE go_memstats_heap_alloc_bytes gauge -go_memstats_heap_alloc_bytes 681656 # HELP go_memstats_heap_idle_bytes Number of heap bytes waiting to be used. # TYPE go_memstats_heap_idle_bytes gauge -go_memstats_heap_idle_bytes 1.982464e+06 # HELP go_memstats_heap_inuse_bytes Number of heap bytes that are in use. # TYPE go_memstats_heap_inuse_bytes gauge -go_memstats_heap_inuse_bytes 1.851392e+06 # HELP go_memstats_heap_objects Number of allocated objects. # TYPE go_memstats_heap_objects gauge -go_memstats_heap_objects 3643 # HELP go_memstats_heap_released_bytes Number of heap bytes released to OS. # TYPE go_memstats_heap_released_bytes gauge -go_memstats_heap_released_bytes 1.949696e+06 # HELP go_memstats_heap_sys_bytes Number of heap bytes obtained from system. # TYPE go_memstats_heap_sys_bytes gauge -go_memstats_heap_sys_bytes 3.833856e+06 # HELP go_memstats_last_gc_time_seconds Number of seconds since 1970 of last garbage collection. # TYPE go_memstats_last_gc_time_seconds gauge -go_memstats_last_gc_time_seconds 0 # HELP go_memstats_lookups_total Total number of pointer lookups. # TYPE go_memstats_lookups_total counter -go_memstats_lookups_total 0 # HELP go_memstats_mallocs_total Total number of mallocs. # TYPE go_memstats_mallocs_total counter -go_memstats_mallocs_total 3889 # HELP go_memstats_mcache_inuse_bytes Number of bytes in use by mcache structures. # TYPE go_memstats_mcache_inuse_bytes gauge -go_memstats_mcache_inuse_bytes 1200 # HELP go_memstats_mcache_sys_bytes Number of bytes used for mcache structures obtained from system. # TYPE go_memstats_mcache_sys_bytes gauge -go_memstats_mcache_sys_bytes 15600 # HELP go_memstats_mspan_inuse_bytes Number of bytes in use by mspan structures. 
 # TYPE go_memstats_mspan_inuse_bytes gauge
-go_memstats_mspan_inuse_bytes 33768
 # HELP go_memstats_mspan_sys_bytes Number of bytes used for mspan structures obtained from system.
 # TYPE go_memstats_mspan_sys_bytes gauge
-go_memstats_mspan_sys_bytes 65184
 # HELP go_memstats_next_gc_bytes Number of heap bytes when next garbage collection will take place.
 # TYPE go_memstats_next_gc_bytes gauge
-go_memstats_next_gc_bytes 4.194304e+06
 # HELP go_memstats_other_sys_bytes Number of bytes used for other system allocations.
 # TYPE go_memstats_other_sys_bytes gauge
-go_memstats_other_sys_bytes 511839
 # HELP go_memstats_stack_inuse_bytes Number of bytes in use by the stack allocator.
 # TYPE go_memstats_stack_inuse_bytes gauge
-go_memstats_stack_inuse_bytes 360448
 # HELP go_memstats_stack_sys_bytes Number of bytes obtained from system for stack allocator.
 # TYPE go_memstats_stack_sys_bytes gauge
-go_memstats_stack_sys_bytes 360448
 # HELP go_memstats_sys_bytes Number of bytes obtained from system.
 # TYPE go_memstats_sys_bytes gauge
-go_memstats_sys_bytes 9.147656e+06
 # HELP go_threads Number of OS threads created.
 # TYPE go_threads gauge
-go_threads 7
 # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
 # TYPE process_cpu_seconds_total counter
-process_cpu_seconds_total 0
 # HELP process_max_fds Maximum number of open file descriptors.
 # TYPE process_max_fds gauge
-process_max_fds 1.048576e+06
 # HELP process_open_fds Number of open file descriptors.
 # TYPE process_open_fds gauge
-process_open_fds 8
 # HELP process_resident_memory_bytes Resident memory size in bytes.
 # TYPE process_resident_memory_bytes gauge
-process_resident_memory_bytes 1.376256e+07
 # HELP process_start_time_seconds Start time of the process since unix epoch in seconds.
 # TYPE process_start_time_seconds gauge
-process_start_time_seconds 1.70006874948e+09
 # HELP process_virtual_memory_bytes Virtual memory size in bytes.
 # TYPE process_virtual_memory_bytes gauge
-process_virtual_memory_bytes 1.647849472e+09
 # HELP process_virtual_memory_max_bytes Maximum amount of virtual memory available in bytes.
 # TYPE process_virtual_memory_max_bytes gauge
-process_virtual_memory_max_bytes 1.8446744073709552e+19
 # HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler.
 # TYPE promhttp_metric_handler_errors_total counter
 promhttp_metric_handler_errors_total{cause="encoding"} 0

From 9bebb96b370796dd0b3b08bea259cc2dd4d197bf Mon Sep 17 00:00:00 2001
From: mahendrapaipuri
Date: Thu, 16 Nov 2023 11:27:30 +0100
Subject: [PATCH 14/14] build: Only test for different archs on Linux

---
 .circleci/config.yml |  2 +-
 .promu-cgo.yml       |  2 ++
 .promu.yml           | 10 ++++++++--
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index c65b695e..9f7dac6d 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -35,7 +35,7 @@ jobs:
       - prometheus/setup_environment
       - run: docker run --privileged linuxkit/binfmt:af88a591f9cc896a52ce596b9cf7ca26a061ef97
       - run: promu crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
-      - run: promu --config .promu-cgo.yml crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
+      # - run: promu --config .promu-cgo.yml crossbuild -v --parallelism $CIRCLE_NODE_TOTAL --parallelism-thread $CIRCLE_NODE_INDEX
       - persist_to_workspace:
           root: .
          paths:
diff --git a/.promu-cgo.yml b/.promu-cgo.yml
index 263fd956..6285cce9 100644
--- a/.promu-cgo.yml
+++ b/.promu-cgo.yml
@@ -21,5 +21,7 @@ tarball:
         - NOTICE
 crossbuild:
     platforms:
+        # We are not sure whether our exporter will work on NetBSD, so don't
+        # bother cross-compiling for it for now.
         - netbsd/amd64
         - netbsd/386
diff --git a/.promu.yml b/.promu.yml
index 275cfa86..ffca1376 100644
--- a/.promu.yml
+++ b/.promu.yml
@@ -20,5 +20,11 @@ tarball:
         - NOTICE
 crossbuild:
     platforms:
-        - linux
-        - openbsd/amd64
+        - linux/amd64
+        - linux/386
+        - linux/arm64
+        - linux/mips
+        - linux/mipsle
+        - linux/mips64
+        - linux/mips64le
+        - linux/ppc64le