diff --git a/resources/benchmarks/README.md b/resources/benchmarks/README.md new file mode 100644 index 0000000..1795937 --- /dev/null +++ b/resources/benchmarks/README.md @@ -0,0 +1,53 @@ +# Benchmark Tests + +This directory contains benchmark tests for the following workload managers and schedulers: + +- Kueue +- Volcano +- Yunikorn +- Run:ai + +The benchmark tests involve submitting workloads intended to evaluate the scheduler's performance under specific scenarios. + +These workloads are designed to fully utilize the cluster under optimal scheduling conditions. + +One approach to benchmarking is to run these workloads on clusters with different schedulers and then compare the average GPU occupancy of the nodes. + +For all workload managers except Run:ai, the benchmark test involves two sequential workflows. The first workflow registers the CRDs, and the second workflow runs the common part of the test. +Run:ai requires additional customization and thus has a separate workflow. + +## Gang Scheduling Benchmark Test + +The gang-scheduling benchmark workflow operates on 32 virtual GPU nodes, submitting a burst of 53 jobs with replica counts ranging from 1 to 32 in a [predetermined order](gang-scheduling/workflows/run-test-common.yml). + +### Example + +To run the benchmark test for Kueue: + +```bash +./bin/knavigator -workflow 'resources/benchmarks/gang-scheduling/workflows/{config-kueue.yml,run-test-common.yml}' +``` + +### Run:ai + +```bash +./bin/knavigator -workflow resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml +``` + +## Scaling Benchmark Test + +The scaling benchmark workflow operates on 500 virtual GPU nodes, submitting [two workloads](scaling/workflows/run-test-common.yml) one after another. The first workload is 500 single-node jobs submitted simultaneously; the second is a single job with 500 replicas. + +### Example + +To run the benchmark test for Volcano: + +```bash +./bin/knavigator -workflow 'resources/benchmarks/scaling/workflows/{config-volcano.yml,run-test-common.yml}' +``` + +### Run:ai + +```bash +./bin/knavigator -workflow resources/benchmarks/scaling/workflows/run-test-runai.yml +``` diff --git a/resources/benchmarks/gang-scheduling/README.md b/resources/benchmarks/gang-scheduling/README.md deleted file mode 100644 index b12b7eb..0000000 --- a/resources/benchmarks/gang-scheduling/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Gang Scheduling Benchmark Test - -This directory contains gang scheduling benchmark tests for the following workload managers and schedulers: - -- Jobset -- Kueue -- Volcano -- Yunikorn -- Run:ai - -The gang-scheduling benchmark workflow operates on 32 virtual GPU nodes, submitting a burst of 53 jobs with replica numbers ranging from 1 to 32 in a [predetermined order](workflows/run-test-common.yml). - -The workload is designed to fully utilize the cluster under optimal scheduling conditions. - -One method to perform benchmarking is to input this workload into clusters that use different schedulers and then compare the average GPU occupancy of the nodes. - -## Usage - -For all workload managers except Run:ai, the benchmark test involves two sequential workflows. The first workflow registers the CRDs, and the second workflow runs the common part of the test. 
- -### Example - -To run the benchmark test for Kueue: - -```bash -./bin/knavigator -workflow resources/benchmarks/gang-scheduling/workflows/config-kueue.yml,resources/benchmarks/gang-scheduling/workflows/run-test-common.yml -``` - -### Run:ai - -Run:ai requires additional customization and thus has a separate workflow: - -```bash -./bin/knavigator -workflow resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml -``` diff --git a/resources/benchmarks/gang-scheduling/workflows/config-jobset.yml b/resources/benchmarks/gang-scheduling/workflows/config-jobset.yml deleted file mode 100644 index 610b689..0000000 --- a/resources/benchmarks/gang-scheduling/workflows/config-jobset.yml +++ /dev/null @@ -1,9 +0,0 @@ -name: config-jobset -tasks: -- id: register - type: RegisterObj - params: - template: "resources/benchmarks/templates/k8s/jobset.yml" - nameFormat: "jobset{{._ENUM_}}" - podNameFormat: "{{._NAME_}}-workers-[0-9]+-[0-9]+-.+" - podCount: "{{.replicas}}" diff --git a/resources/benchmarks/gang-scheduling/workflows/config-kueue.yml b/resources/benchmarks/gang-scheduling/workflows/config-kueue.yml index 03cbc14..11d1189 100644 --- a/resources/benchmarks/gang-scheduling/workflows/config-kueue.yml +++ b/resources/benchmarks/gang-scheduling/workflows/config-kueue.yml @@ -38,6 +38,7 @@ tasks: flavor: gpu-node cpu: 8 memory: 36Gi + pods: 32 gpu: 256 - id: create-local-queue type: SubmitObj @@ -48,3 +49,80 @@ tasks: name: team-queue namespace: default clusterQueue: team +- id: configure + type: Configure + params: + configmaps: + - name: kueue-manager-config + namespace: kueue-system + op: create + data: + controller_manager_config.yaml: | + apiVersion: config.kueue.x-k8s.io/v1beta1 + kind: Configuration + health: + healthProbeBindAddress: :8081 + metrics: + bindAddress: :8080 + # enableClusterQueueResources: true + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io + controller: + groupKindConcurrency: + Job.batch: 5 + Pod: 5 + Workload.kueue.x-k8s.io: 5 + LocalQueue.kueue.x-k8s.io: 1 + ClusterQueue.kueue.x-k8s.io: 1 + ResourceFlavor.kueue.x-k8s.io: 1 + clientConnection: + qps: 50 + burst: 100 + #pprofBindAddress: :8083 + waitForPodsReady: + enable: true + timeout: 5m + blockAdmission: true + requeuingStrategy: + timestamp: Eviction + backoffLimitCount: null # null indicates infinite requeuing + backoffBaseSeconds: 60 + backoffMaxSeconds: 3600 + #manageJobsWithoutQueueName: true + #internalCertManagement: + # enable: false + # webhookServiceName: "" + # webhookSecretName: "" + integrations: + frameworks: + - "batch/job" + - "kubeflow.org/mpijob" + - "ray.io/rayjob" + - "ray.io/raycluster" + - "jobset.x-k8s.io/jobset" + - "kubeflow.org/mxjob" + - "kubeflow.org/paddlejob" + - "kubeflow.org/pytorchjob" + - "kubeflow.org/tfjob" + - "kubeflow.org/xgboostjob" + # - "pod" + # externalFrameworks: + # - "Foo.v1.example.com" + # podOptions: + # namespaceSelector: + # matchExpressions: + # - key: kubernetes.io/metadata.name + # operator: NotIn + # values: [ kube-system, kueue-system ] + #fairSharing: + # enable: true + # preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] + #resources: + # excludeResourcePrefixes: [] + deploymentRestarts: + - namespace: kueue-system + name: kueue-controller-manager + timeout: 1m diff --git a/resources/benchmarks/scaling/workflows/config-kueue.yml b/resources/benchmarks/scaling/workflows/config-kueue.yml new file mode 100644 index 0000000..4b70dce --- /dev/null +++ 
b/resources/benchmarks/scaling/workflows/config-kueue.yml @@ -0,0 +1,128 @@ +name: config-kueue +tasks: +- id: register-cluster-queue + type: RegisterObj + params: + template: "resources/templates/kueue/cluster-queue.yml" +- id: register-local-queue + type: RegisterObj + params: + template: "resources/templates/kueue/local-queue.yml" +- id: register-resource-flavor + type: RegisterObj + params: + template: "resources/templates/kueue/resource-flavor.yml" +- id: register + type: RegisterObj + params: + template: "resources/benchmarks/templates/kueue/job.yml" + nameFormat: "job{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-[0-9]-.*" + podCount: "{{.replicas}}" +- id: create-resource-flavor + type: SubmitObj + params: + refTaskId: register-resource-flavor + canExist: true + params: + name: "gpu-node" + nodeLabels: + nvidia.com/gpu.count: "8" +- id: create-cluster-queue + type: SubmitObj + params: + refTaskId: register-cluster-queue + canExist: true + params: + name: team + flavor: gpu-node + cpu: 50 + memory: 360Gi + pods: 500 + gpu: 4000 +- id: create-local-queue + type: SubmitObj + params: + refTaskId: register-local-queue + canExist: true + params: + name: team-queue + namespace: default + clusterQueue: team +- id: configure + type: Configure + params: + configmaps: + - name: kueue-manager-config + namespace: kueue-system + op: create + data: + controller_manager_config.yaml: | + apiVersion: config.kueue.x-k8s.io/v1beta1 + kind: Configuration + health: + healthProbeBindAddress: :8081 + metrics: + bindAddress: :8080 + # enableClusterQueueResources: true + webhook: + port: 9443 + leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io + controller: + groupKindConcurrency: + Job.batch: 5 + Pod: 5 + Workload.kueue.x-k8s.io: 5 + LocalQueue.kueue.x-k8s.io: 1 + ClusterQueue.kueue.x-k8s.io: 1 + ResourceFlavor.kueue.x-k8s.io: 1 + clientConnection: + qps: 50 + burst: 100 + #pprofBindAddress: :8083 + waitForPodsReady: + enable: true + timeout: 5m + blockAdmission: true + requeuingStrategy: + timestamp: Eviction + backoffLimitCount: null # null indicates infinite requeuing + backoffBaseSeconds: 60 + backoffMaxSeconds: 3600 + #manageJobsWithoutQueueName: true + #internalCertManagement: + # enable: false + # webhookServiceName: "" + # webhookSecretName: "" + integrations: + frameworks: + - "batch/job" + - "kubeflow.org/mpijob" + - "ray.io/rayjob" + - "ray.io/raycluster" + - "jobset.x-k8s.io/jobset" + - "kubeflow.org/mxjob" + - "kubeflow.org/paddlejob" + - "kubeflow.org/pytorchjob" + - "kubeflow.org/tfjob" + - "kubeflow.org/xgboostjob" + # - "pod" + # externalFrameworks: + # - "Foo.v1.example.com" + # podOptions: + # namespaceSelector: + # matchExpressions: + # - key: kubernetes.io/metadata.name + # operator: NotIn + # values: [ kube-system, kueue-system ] + #fairSharing: + # enable: true + # preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] + #resources: + # excludeResourcePrefixes: [] + deploymentRestarts: + - namespace: kueue-system + name: kueue-controller-manager + timeout: 1m diff --git a/resources/benchmarks/scaling/workflows/config-volcano.yml b/resources/benchmarks/scaling/workflows/config-volcano.yml new file mode 100644 index 0000000..53673af --- /dev/null +++ b/resources/benchmarks/scaling/workflows/config-volcano.yml @@ -0,0 +1,31 @@ +name: config-volcano +tasks: +- id: register + type: RegisterObj + params: + template: "resources/benchmarks/templates/volcano/job.yml" + nameFormat: "j{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-test-[0-9]+" + podCount: 
"{{.replicas}}" +- id: configure + type: Configure + params: + configmaps: + - name: volcano-scheduler-configmap + namespace: volcano-system + op: create + data: + volcano-scheduler.conf: | + actions: "enqueue, allocate, backfill" + tiers: + - plugins: + - name: priority + - name: gang + - name: conformance + - plugins: + - name: drf + - name: predicates + - name: proportion + - name: nodeorder + - name: binpack + timeout: 1m diff --git a/resources/benchmarks/scaling/workflows/config-yunikorn.yml b/resources/benchmarks/scaling/workflows/config-yunikorn.yml new file mode 100644 index 0000000..aee3fb9 --- /dev/null +++ b/resources/benchmarks/scaling/workflows/config-yunikorn.yml @@ -0,0 +1,29 @@ +name: config-yunikorn +tasks: +- id: register + type: RegisterObj + params: + template: "resources/benchmarks/templates/yunikorn/job.yml" + nameFormat: "job{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-.*" + podCount: "{{.replicas}}" +- id: configure + type: Configure + params: + configmaps: + - name: yunikorn-configs + namespace: yunikorn + op: create + data: + queues.yaml: | + partitions: + - name: default + queues: + - name: root + queues: + - name: sandbox + submitacl: '*' + resources: + max: + {memory: 360Gi, vcore: 50000m, nvidia.com/gpu: 4000} + timeout: 1m diff --git a/resources/benchmarks/scaling/workflows/run-test-common.yml b/resources/benchmarks/scaling/workflows/run-test-common.yml new file mode 100644 index 0000000..583c68e --- /dev/null +++ b/resources/benchmarks/scaling/workflows/run-test-common.yml @@ -0,0 +1,31 @@ +name: test-scaling +tasks: +- id: configure + type: Configure + params: + nodes: + - type: dgxa100.80g + count: 500 + labels: + nvidia.com/gpu.count: "8" + timeout: 5m +- id: sleep + type: Sleep + params: + timeout: 5s +- id: job1 + type: SubmitObj + params: + refTaskId: register + count: 500 + params: + replicas: 1 + ttl: 2m +- id: job2 + type: SubmitObj + params: + refTaskId: register + count: 1 + params: + replicas: 500 + ttl: 2m diff --git a/resources/benchmarks/scaling/workflows/run-test-runai.yml b/resources/benchmarks/scaling/workflows/run-test-runai.yml new file mode 100644 index 0000000..4eed902 --- /dev/null +++ b/resources/benchmarks/scaling/workflows/run-test-runai.yml @@ -0,0 +1,47 @@ +name: test-scaling +tasks: +- id: configure + type: Configure + params: + nodes: + - type: dgxa100.80g + count: 500 + labels: + nvidia.com/gpu.count: "8" + timeout: 5m +- id: sleep + type: Sleep + params: + timeout: 5s +- id: register-trainingworkload + type: RegisterObj + params: + template: "resources/benchmarks/templates/runai/trainingworkload.yml" + nameFormat: "twl{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-0-0" + podCount: 1 +- id: register-distributedworkload + type: RegisterObj + params: + template: "resources/benchmarks/templates/runai/distributedworkload.yml" + nameFormat: "dwl{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)" + podCount: "{{.workers}} + 1" +# +### Benchmark test +# +- id: job1 + type: SubmitObj + params: + refTaskId: register-trainingworkload + count: 500 + params: + ttl: 2m +- id: job2 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 1 + params: + workers: 499 + ttl: 2m diff --git a/resources/benchmarks/templates/k8s/job.yml b/resources/benchmarks/templates/k8s/job.yml deleted file mode 100644 index 210207a..0000000 --- a/resources/benchmarks/templates/k8s/job.yml +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: batch/v1 -kind: Job -metadata: - name: "{{._NAME_}}" - namespace: default -spec: - backoffLimit: 0 - completions: {{.replicas}} - parallelism: {{.replicas}} - completionMode: Indexed - template: - metadata: - annotations: - pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} - pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} - spec: - schedulerName: default-scheduler - containers: - - name: test - image: ubuntu - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: 100m - memory: 256M - nvidia.com/gpu: "8" - requests: - cpu: 100m - memory: 256M - nvidia.com/gpu: "8" - restartPolicy: OnFailure diff --git a/resources/benchmarks/templates/k8s/jobset.yml b/resources/benchmarks/templates/k8s/jobset.yml deleted file mode 100644 index 14e9144..0000000 --- a/resources/benchmarks/templates/k8s/jobset.yml +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -apiVersion: jobset.x-k8s.io/v1alpha2 -kind: JobSet -metadata: - name: "{{._NAME_}}" - namespace: default -spec: - # We want to declare our JobSet successful if workers finish. - # If workers finish we should clean up the remaining replicatedJobs. 
- successPolicy: - operator: All - targetReplicatedJobs: - - workers - replicatedJobs: - - name: workers - replicas: {{.replicas}} - template: - spec: - backoffLimit: 0 - completions: 1 - parallelism: 1 - completionMode: Indexed - template: - metadata: - annotations: - pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} - pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} - spec: - schedulerName: default-scheduler - containers: - - name: test - image: ubuntu - imagePullPolicy: IfNotPresent - resources: - limits: - cpu: 100m - memory: 256M - nvidia.com/gpu: "8" - requests: - cpu: 100m - memory: 256M - nvidia.com/gpu: "8" diff --git a/resources/benchmarks/templates/kueue/job.yml b/resources/benchmarks/templates/kueue/job.yml index 84ef408..011d829 100644 --- a/resources/benchmarks/templates/kueue/job.yml +++ b/resources/benchmarks/templates/kueue/job.yml @@ -25,7 +25,7 @@ spec: suspend: true completions: {{.replicas}} parallelism: {{.replicas}} - completionMode: Indexed + completionMode: NonIndexed template: metadata: annotations: diff --git a/resources/benchmarks/templates/runai/distributedworkload.yml b/resources/benchmarks/templates/runai/distributedworkload.yml index c02b1b4..1131ba8 100644 --- a/resources/benchmarks/templates/runai/distributedworkload.yml +++ b/resources/benchmarks/templates/runai/distributedworkload.yml @@ -17,7 +17,7 @@ masterSpec: cpu: value: 100m memory: - value: 256M + value: 250M gpuDevices: value: 8 largeShm: @@ -48,7 +48,7 @@ spec: cpu: value: 100m memory: - value: 256M + value: 250M gpuDevices: value: 8 workers: diff --git a/resources/benchmarks/templates/runai/trainingworkload.yml b/resources/benchmarks/templates/runai/trainingworkload.yml index 17bf859..4753cc5 100644 --- a/resources/benchmarks/templates/runai/trainingworkload.yml +++ b/resources/benchmarks/templates/runai/trainingworkload.yml @@ -27,7 +27,7 @@ spec: cpu: value: 100m memory: - value: 256M + value: 250M gpuDevices: value: 8 nodePools: diff --git a/resources/benchmarks/templates/volcano/job.yml b/resources/benchmarks/templates/volcano/job.yml index b8c56cc..4c656f0 100644 --- a/resources/benchmarks/templates/volcano/job.yml +++ b/resources/benchmarks/templates/volcano/job.yml @@ -46,9 +46,9 @@ spec: resources: limits: cpu: 100m - memory: 256M + memory: 250M nvidia.com/gpu: "8" requests: cpu: 100m - memory: 256M + memory: 250M nvidia.com/gpu: "8" diff --git a/resources/benchmarks/templates/yunikorn/job.yml b/resources/benchmarks/templates/yunikorn/job.yml index ee96e08..734cc65 100644 --- a/resources/benchmarks/templates/yunikorn/job.yml +++ b/resources/benchmarks/templates/yunikorn/job.yml @@ -33,8 +33,8 @@ spec: "name": "group-{{._NAME_}}", "minMember": {{.replicas}}, "minResource": { - "cpu": "1", - "memory": "500M", + "cpu": "100m", + "memory": "250M", "nvidia.com/gpu": "8" }, "nodeSelector": {}, @@ -52,10 +52,10 @@ spec: resources: limits: cpu: 100m - memory: 256M + memory: 250M nvidia.com/gpu: "8" requests: cpu: 100m - memory: 256M + memory: 250M nvidia.com/gpu: "8" restartPolicy: Never diff --git a/resources/templates/kueue/cluster-queue.yml b/resources/templates/kueue/cluster-queue.yml index 23840d4..058d5c8 100644 --- a/resources/templates/kueue/cluster-queue.yml +++ b/resources/templates/kueue/cluster-queue.yml @@ -22,7 +22,7 @@ spec: cohort: {{.cohort}} {{- end }} resourceGroups: - - coveredResources: ["cpu", "memory", "nvidia.com/gpu"] + - coveredResources: ["cpu", "memory", "pods", "nvidia.com/gpu"] flavors: - name: {{.flavor}} resources: @@ -30,6 +30,8 @@ spec: nominalQuota: 
{{.cpu}} - name: "memory" nominalQuota: {{.memory}} + - name: "pods" + nominalQuota: {{.pods}} - name: "nvidia.com/gpu" nominalQuota: {{.gpu}} preemption: diff --git a/resources/workflows/kueue/test-job.yml b/resources/workflows/kueue/test-job.yml index 15f2fd8..aaac626 100644 --- a/resources/workflows/kueue/test-job.yml +++ b/resources/workflows/kueue/test-job.yml @@ -62,6 +62,7 @@ tasks: flavor: gpu-node cpu: 8 memory: 36Gi + pods: 4 gpu: 32 - id: create-local-queue type: SubmitObj diff --git a/resources/workflows/kueue/test-preemption.yml b/resources/workflows/kueue/test-preemption.yml index 91eeec6..28af7f9 100644 --- a/resources/workflows/kueue/test-preemption.yml +++ b/resources/workflows/kueue/test-preemption.yml @@ -75,6 +75,7 @@ tasks: flavor: gpu-node cpu: 8 memory: 36Gi + pods: 4 gpu: 16 - id: create-cluster-queue-b type: SubmitObj @@ -87,6 +88,7 @@ tasks: flavor: gpu-node cpu: 8 memory: 36Gi + pods: 4 gpu: 16 - id: create-local-queue-a type: SubmitObj
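
Note: with the new `pods` quota threaded through the workflows above, the ClusterQueue template renders roughly as follows for the scaling benchmark's parameters (name `team`, flavor `gpu-node`, cpu 50, memory 360Gi, pods 500, gpu 4000). This is an illustrative sketch; fields not visible in this diff, such as the `preemption` block and any cohort or namespace selector, are omitted.

```yaml
# Approximate render of resources/templates/kueue/cluster-queue.yml for the
# scaling benchmark; illustrative only, fields outside this diff are omitted.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: team
spec:
  resourceGroups:
  - coveredResources: ["cpu", "memory", "pods", "nvidia.com/gpu"]
    flavors:
    - name: gpu-node
      resources:
      - name: "cpu"
        nominalQuota: 50
      - name: "memory"
        nominalQuota: 360Gi
      - name: "pods"
        nominalQuota: 500
      - name: "nvidia.com/gpu"
        nominalQuota: 4000
```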
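The `Configure` tasks that create `kueue-manager-config` end with a `deploymentRestarts` step so the controller reloads the new ConfigMap. Outside knavigator, that step corresponds roughly to the following standard kubectl commands (namespace, deployment name, and timeout taken from the workflows above):

```bash
# Roughly equivalent to the deploymentRestarts step: bounce the Kueue
# controller and wait for the rollout, up to the workflow's 1m timeout.
kubectl -n kueue-system rollout restart deployment/kueue-controller-manager
kubectl -n kueue-system rollout status deployment/kueue-controller-manager --timeout=1m
```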
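Neither workflow computes the average GPU occupancy that the README proposes as the comparison metric. Below is a minimal sketch of sampling instantaneous occupancy, assuming `kubectl` and `jq` are available and that the virtual nodes report `nvidia.com/gpu` in `status.allocatable`; this script is not part of the diff.

```bash
# Sketch: instantaneous GPU occupancy = requested GPUs / allocatable GPUs.
# An average over a run would sample this in a loop (or use scheduler metrics).
total=$(kubectl get nodes -l nvidia.com/gpu.count=8 -o json \
  | jq '[.items[].status.allocatable["nvidia.com/gpu"]] | map(. // "0" | tonumber) | add // 0')
used=$(kubectl get pods -A --field-selector=status.phase=Running -o json \
  | jq '[.items[].spec.containers[].resources.requests["nvidia.com/gpu"]] | map(. // "0" | tonumber) | add // 0')
echo "GPU occupancy: ${used} / ${total}"
```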