diff --git a/docs/examples/runai/runai.md b/docs/examples/runai/runai.md index 83b01d4..0982697 100644 --- a/docs/examples/runai/runai.md +++ b/docs/examples/runai/runai.md @@ -23,18 +23,12 @@ Navigate to the Run:ai portal and create a new project. Upon creating the projec This script will deploy a `kind` cluster if necessary, followed by deploying `KWOK` and `Prometheus`. It will then prompt you to select a workload manager. Choose the `run:ai` option. -4. **Update KWOK stage**: -Update the pod-complete stage by running the following command -```bash -kubectl apply -f ./charts/overrides/kwok/pod-complete.yaml -``` - -5. **Replace cluster UID and project name in the sample workflow files**: +4. **Replace cluster UID and project name in the sample workflow files**: Update the sample workflow files [test-trainingworkload.yml](../../../resources/workflows/runai/test-trainingworkload.yml#L40-L41) and [test-distributedworkload.yml](../../../resources/workflows/runai/test-distributedworkload.yml#L40-L41) by replacing `<CLUSTER_UID>` with the cluster UID and `<PROJECT_NAME>` with the project name. -6. **Run the workflows** +5. 
**Run the workflows** Run a Run:ai training workload: ```bash diff --git a/resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml b/resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml new file mode 100644 index 0000000..1a489e9 --- /dev/null +++ b/resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml @@ -0,0 +1,137 @@ +name: test-gang-scheduling +tasks: +- id: register-trainingworkload + type: RegisterObj + params: + template: "resources/benchmarks/templates/runai/trainingworkload.yml" + nameFormat: "twl{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-0-0" + podCount: 1 +- id: register-distributedworkload + type: RegisterObj + params: + template: "resources/benchmarks/templates/runai/distributedworkload.yml" + nameFormat: "dwl{{._ENUM_}}" + podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)" + podCount: "{{.workers}} + 1" +# +### Benchmark test +# +- id: job1 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 1 + params: + workers: 31 + ttl: 30s +- id: job2 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 2 + params: + workers: 15 + ttl: 30s +- id: job3 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 3 + params: + workers: 9 + ttl: 30s +- id: job3.1 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 1 + params: + workers: 1 + ttl: 30s +- id: job4 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 4 + params: + workers: 7 + ttl: 30s +- id: job5 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 5 + params: + workers: 5 + ttl: 30s +- id: job5.1 + type: SubmitObj + params: + refTaskId: register-trainingworkload + count: 2 + params: + ttl: 30s +- id: job6 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 6 + params: + workers: 4 + ttl: 30s +- id: job6.1 + type: SubmitObj + params: + refTaskId: register-distributedworkload + 
count: 1 + params: + workers: 1 + ttl: 30s +- id: job7 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 7 + params: + workers: 3 + ttl: 30s +- id: job7.1 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 1 + params: + workers: 1 + ttl: 30s +- id: job7.2 + type: SubmitObj + params: + refTaskId: register-trainingworkload + count: 2 + params: + ttl: 30s +- id: job8 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 8 + params: + workers: 3 + ttl: 30s +- id: job9 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 9 + params: + workers: 2 + ttl: 30s +- id: job9.1 + type: SubmitObj + params: + refTaskId: register-distributedworkload + count: 1 + params: + workers: 4 + ttl: 30s diff --git a/resources/benchmarks/gang-scheduling/workflows/run-test.yml b/resources/benchmarks/gang-scheduling/workflows/run-test.yml index 1af25e2..042dcd4 100644 --- a/resources/benchmarks/gang-scheduling/workflows/run-test.yml +++ b/resources/benchmarks/gang-scheduling/workflows/run-test.yml @@ -4,7 +4,7 @@ tasks: #- id: register # type: RegisterObj # params: -# template: "resources/benchmarks/gang-scheduling/templates/k8s/job.yml" +# template: "resources/benchmarks/templates/k8s/job.yml" # nameFormat: "job{{._ENUM_}}" # podNameFormat: "{{._NAME_}}-[0-9]-.*" # podCount: "{{.replicas}}" @@ -13,7 +13,7 @@ tasks: #- id: register # type: RegisterObj # params: -# template: "resources/benchmarks/gang-scheduling/templates/k8s/jobset.yml" +# template: "resources/benchmarks/templates/k8s/jobset.yml" # nameFormat: "jobset{{._ENUM_}}" # podNameFormat: "{{._NAME_}}-workers-[0-9]+-[0-9]+-.+" # podCount: "{{.replicas}}" @@ -34,7 +34,7 @@ tasks: #- id: register # type: RegisterObj # params: -# template: "resources/benchmarks/gang-scheduling/templates/kueue/job.yml" +# template: "resources/benchmarks/templates/kueue/job.yml" # nameFormat: "job{{._ENUM_}}" # podNameFormat: "{{._NAME_}}-[0-9]-.*" # 
podCount: "{{.replicas}}" @@ -69,39 +69,61 @@ tasks: #- id: register # type: RegisterObj # params: -# template: "resources/benchmarks/gang-scheduling/templates/volcano/job.yml" +# template: "resources/benchmarks/templates/volcano/job.yml" # nameFormat: "j{{._ENUM_}}" # podNameFormat: "{{._NAME_}}-test-[0-9]+" # podCount: "{{.replicas}}" +#- id: configure +# type: Configure +# params: +# configmaps: +# - name: volcano-scheduler-configmap +# namespace: volcano-system +# op: create +# data: +# volcano-scheduler.conf: | +# actions: "enqueue, allocate, backfill" +# tiers: +# - plugins: +# - name: priority +# - name: gang +# - name: conformance +# - plugins: +# - name: drf +# - name: predicates +# - name: proportion +# - name: nodeorder +# - name: binpack +# timeout: 1m ### Yunikorn -- id: register - type: RegisterObj - params: - template: "resources/benchmarks/gang-scheduling/templates/yunikorn/job.yml" - nameFormat: "job{{._ENUM_}}" - podNameFormat: "{{._NAME_}}-.*" - podCount: "{{.replicas}}" -- id: configure - type: Configure - params: - configmaps: - - name: yunikorn-configs - namespace: yunikorn - op: create - data: - queues.yaml: | - partitions: - - name: default - queues: - - name: root - queues: - - name: sandbox - submitacl: '*' - resources: - max: - {memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256} - timeout: 1m +#- id: register +# type: RegisterObj +# params: +# template: "resources/benchmarks/templates/yunikorn/job.yml" +# nameFormat: "job{{._ENUM_}}" +# podNameFormat: "{{._NAME_}}-.*" +# podCount: "{{.replicas}}" +#- id: configure +# type: Configure +# params: +# configmaps: +# - name: yunikorn-configs +# namespace: yunikorn +# op: create +# data: +# queues.yaml: | +# partitions: +# - name: default +# queues: +# - name: root +# queues: +# - name: sandbox +# submitacl: '*' +# resources: +# max: +# {memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256} +# timeout: 1m # ### Benchmark test # @@ -112,6 +134,7 @@ tasks: count: 1 params: replicas: 32 + ttl: 30s - id: 
job2 type: SubmitObj params: @@ -119,6 +142,7 @@ tasks: count: 2 params: replicas: 16 + ttl: 30s - id: job3 type: SubmitObj params: @@ -126,20 +150,23 @@ tasks: count: 3 params: replicas: 10 + ttl: 30s - id: job3.1 type: SubmitObj params: refTaskId: register count: 1 params: - replicas: 2 + replicas: 2 + ttl: 30s - id: job4 type: SubmitObj params: refTaskId: register count: 4 params: - replicas: 8 + replicas: 8 + ttl: 30s - id: job5 type: SubmitObj params: @@ -147,6 +174,7 @@ tasks: count: 5 params: replicas: 6 + ttl: 30s - id: job5.1 type: SubmitObj params: @@ -154,6 +182,7 @@ tasks: count: 2 params: replicas: 1 + ttl: 30s - id: job6 type: SubmitObj params: @@ -161,6 +190,7 @@ tasks: count: 6 params: replicas: 5 + ttl: 30s - id: job6.1 type: SubmitObj params: @@ -168,6 +198,7 @@ tasks: count: 1 params: replicas: 2 + ttl: 30s - id: job7 type: SubmitObj params: @@ -175,6 +206,7 @@ tasks: count: 7 params: replicas: 4 + ttl: 30s - id: job7.1 type: SubmitObj params: @@ -182,6 +214,7 @@ tasks: count: 1 params: replicas: 2 + ttl: 30s - id: job7.2 type: SubmitObj params: @@ -189,6 +222,7 @@ tasks: count: 2 params: replicas: 1 + ttl: 30s - id: job8 type: SubmitObj params: @@ -196,6 +230,7 @@ tasks: count: 8 params: replicas: 4 + ttl: 30s - id: job9 type: SubmitObj params: @@ -203,6 +238,7 @@ tasks: count: 9 params: replicas: 3 + ttl: 30s - id: job9.1 type: SubmitObj params: @@ -210,3 +246,4 @@ tasks: count: 1 params: replicas: 5 + ttl: 30s diff --git a/resources/benchmarks/gang-scheduling/templates/k8s/job.yml b/resources/benchmarks/templates/k8s/job.yml similarity index 91% rename from resources/benchmarks/gang-scheduling/templates/k8s/job.yml rename to resources/benchmarks/templates/k8s/job.yml index a205780..210207a 100644 --- a/resources/benchmarks/gang-scheduling/templates/k8s/job.yml +++ b/resources/benchmarks/templates/k8s/job.yml @@ -25,7 +25,8 @@ spec: template: metadata: annotations: - pod-complete.stage.kwok.x-k8s.io/delay: "30s" + 
pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} spec: schedulerName: default-scheduler containers: diff --git a/resources/benchmarks/gang-scheduling/templates/k8s/jobset.yml b/resources/benchmarks/templates/k8s/jobset.yml similarity index 92% rename from resources/benchmarks/gang-scheduling/templates/k8s/jobset.yml rename to resources/benchmarks/templates/k8s/jobset.yml index 4b912b9..14e9144 100644 --- a/resources/benchmarks/gang-scheduling/templates/k8s/jobset.yml +++ b/resources/benchmarks/templates/k8s/jobset.yml @@ -36,7 +36,8 @@ spec: template: metadata: annotations: - pod-complete.stage.kwok.x-k8s.io/delay: "30s" + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} spec: schedulerName: default-scheduler containers: diff --git a/resources/benchmarks/gang-scheduling/templates/kueue/job.yml b/resources/benchmarks/templates/kueue/job.yml similarity index 91% rename from resources/benchmarks/gang-scheduling/templates/kueue/job.yml rename to resources/benchmarks/templates/kueue/job.yml index f4f2f5e..ec6cb6a 100644 --- a/resources/benchmarks/gang-scheduling/templates/kueue/job.yml +++ b/resources/benchmarks/templates/kueue/job.yml @@ -26,7 +26,8 @@ spec: template: metadata: annotations: - pod-complete.stage.kwok.x-k8s.io/delay: "30s" + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} spec: containers: - name: test diff --git a/resources/benchmarks/templates/runai/distributedworkload.yml b/resources/benchmarks/templates/runai/distributedworkload.yml new file mode 100644 index 0000000..c02b1b4 --- /dev/null +++ b/resources/benchmarks/templates/runai/distributedworkload.yml @@ -0,0 +1,64 @@ +apiVersion: run.ai/v2alpha1 +kind: DistributedWorkload +metadata: + name: "{{._NAME_}}" + namespace: runai- + annotations: + clusterId: + labels: + project: +masterSpec: + name: + value: "{{._NAME_}}" + image: 
+ value: ubuntu + imagePullPolicy: + value: Always + cpu: + value: 100m + memory: + value: 256M + gpuDevices: + value: 8 + largeShm: + value: false + nodePools: + value: default + runAsUser: + value: true + usage: Submit + autoDeletionTimeAfterCompletionSeconds: + value: 2592000 +spec: + annotations: + items: + clusterId: + value: + pod-complete.stage.kwok.x-k8s.io/delay: + value: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: + value: {{.ttl}} + name: + value: "{{._NAME_}}" + jobType: MPIJob + image: + value: ubuntu + imagePullPolicy: + value: Always + cpu: + value: 100m + memory: + value: 256M + gpuDevices: + value: 8 + workers: + value: {{.workers}} + largeShm: + value: false + nodePools: + value: default + runAsUser: + value: true + usage: Submit + autoDeletionTimeAfterCompletionSeconds: + value: 2592000 diff --git a/resources/benchmarks/templates/runai/trainingworkload.yml b/resources/benchmarks/templates/runai/trainingworkload.yml new file mode 100644 index 0000000..17bf859 --- /dev/null +++ b/resources/benchmarks/templates/runai/trainingworkload.yml @@ -0,0 +1,35 @@ +apiVersion: run.ai/v2alpha1 +kind: TrainingWorkload +metadata: + name: "{{._NAME_}}" + namespace: runai- + annotations: + clusterId: + labels: + project: +spec: + name: + value: "{{._NAME_}}" + image: + value: ubuntu + imagePullPolicy: + value: IfNotPresent + active: + value: true + annotations: + items: + clusterId: + value: + pod-complete.stage.kwok.x-k8s.io/delay: + value: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: + value: {{.ttl}} + cpu: + value: 100m + memory: + value: 256M + gpuDevices: + value: 8 + nodePools: + value: default + usage: Submit diff --git a/resources/benchmarks/gang-scheduling/templates/volcano/job.yml b/resources/benchmarks/templates/volcano/job.yml similarity index 91% rename from resources/benchmarks/gang-scheduling/templates/volcano/job.yml rename to resources/benchmarks/templates/volcano/job.yml index 67c2455..b8c56cc 100644 --- 
a/resources/benchmarks/gang-scheduling/templates/volcano/job.yml +++ b/resources/benchmarks/templates/volcano/job.yml @@ -36,7 +36,8 @@ spec: metadata: name: test annotations: - pod-complete.stage.kwok.x-k8s.io/delay: "30s" + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} spec: containers: - name: job @@ -51,4 +52,3 @@ spec: cpu: 100m memory: 256M nvidia.com/gpu: "8" - restartPolicy: OnFailure diff --git a/resources/benchmarks/gang-scheduling/templates/yunikorn/job.yml b/resources/benchmarks/templates/yunikorn/job.yml similarity index 66% rename from resources/benchmarks/gang-scheduling/templates/yunikorn/job.yml rename to resources/benchmarks/templates/yunikorn/job.yml index b2c3ee3..ee96e08 100644 --- a/resources/benchmarks/gang-scheduling/templates/yunikorn/job.yml +++ b/resources/benchmarks/templates/yunikorn/job.yml @@ -1,4 +1,3 @@ - # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,7 +26,23 @@ spec: applicationId: "test-{{._NAME_}}" queue: root.sandbox annotations: - pod-complete.stage.kwok.x-k8s.io/delay: "30s" + yunikorn.apache.org/task-group-name: group-{{._NAME_}} + yunikorn.apache.org/schedulingPolicyParameters: "gangSchedulingStyle=Hard" + yunikorn.apache.org/task-groups: |- + [{ + "name": "group-{{._NAME_}}", + "minMember": {{.replicas}}, + "minResource": { + "cpu": "1", + "memory": "500M", + "nvidia.com/gpu": "8" + }, + "nodeSelector": {}, + "tolerations": [], + "affinity": {} + }] + pod-complete.stage.kwok.x-k8s.io/delay: {{.ttl}} + pod-complete.stage.kwok.x-k8s.io/jitter-delay: {{.ttl}} spec: schedulerName: yunikorn containers: diff --git a/scripts/create-test-cluster.sh b/scripts/create-test-cluster.sh index c017096..c5471e3 100755 --- a/scripts/create-test-cluster.sh +++ b/scripts/create-test-cluster.sh @@ -42,6 +42,7 @@ fi deploy_prometheus deploy_kwok +kubectl apply -f 
$REPO_HOME/charts/overrides/kwok/pod-complete.yaml echo "" printYellow "Select workload manager or leave it blank to skip:" diff --git a/scripts/env.sh b/scripts/env.sh index 6056452..c6d828b 100644 --- a/scripts/env.sh +++ b/scripts/env.sh @@ -60,7 +60,7 @@ function deploy_kwok() { kubectl apply -f https://github.com/${KWOK_REPO}/releases/download/${KWOK_RELEASE}/stage-fast.yaml kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/chaos/pod-init-container-running-failed.yaml kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/chaos/pod-container-running-failed.yaml - kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/general/pod-complete.yaml + #kubectl apply -f https://github.com/${KWOK_REPO}/raw/main/kustomize/stage/pod/general/pod-complete.yaml } # Prometheus @@ -85,7 +85,7 @@ function deploy_prometheus() { --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false \ --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false - kubectl -n monitoring wait --for=condition=ready pod -l app.kubernetes.io/instance=kube-prometheus-stack --timeout=3m + kubectl -n monitoring wait --for=condition=ready pod -l app.kubernetes.io/instance=kube-prometheus-stack --timeout=300s } # Tested workload managers @@ -98,7 +98,8 @@ function deploy_jobset() { printGreen Deploying jobset kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml - kubectl -n jobset-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=60s + + kubectl -n jobset-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=300s } # https://github.com/kubernetes-sigs/kueue @@ -108,7 +109,8 @@ function deploy_kueue() { printGreen Deploying kueue kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml - kubectl -n 
kueue-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=60s + + kubectl -n kueue-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=300s } # https://github.com/volcano-sh/volcano @@ -123,7 +125,7 @@ function deploy_volcano() { --version=$VOLCANO_VERSION --wait for app in volcano-admission volcano-controller volcano-scheduler; do - kubectl -n volcano-system wait --for=condition=ready pod -l app=$app --timeout=60s + kubectl -n volcano-system wait --for=condition=ready pod -l app=$app --timeout=300s done # Wait until volcano webhook is ready @@ -142,7 +144,7 @@ function deploy_yunikorn() { helm upgrade --install yunikorn yunikorn/yunikorn -n yunikorn --create-namespace \ --version=$YUNIKORN_VERSION --wait - kubectl -n yunikorn wait --for=condition=ready pod -l app=yunikorn --timeout=60s + kubectl -n yunikorn wait --for=condition=ready pod -l app=yunikorn --timeout=300s } # https://www.run.ai/