Skip to content

Commit

Permalink
update benchmarks
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Shmulevich <[email protected]>
  • Loading branch information
dmitsh committed Aug 1, 2024
1 parent 52c3059 commit 3bb8591
Show file tree
Hide file tree
Showing 12 changed files with 342 additions and 54 deletions.
10 changes: 2 additions & 8 deletions docs/examples/runai/runai.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,12 @@ Navigate to the Run:ai portal and create a new project. Upon creating the projec

This script will deploy a `kind` cluster if necessary, followed by deploying `KWOK` and `Prometheus`. It will then prompt you to select a workload manager. Choose the `run:ai` option.

4. **Update KWOK stage**:

Update the pod-complete stage by running the following command:
```bash
kubectl apply -f ./charts/overrides/kwok/pod-complete.yaml
```

5. **Replace cluster UID and project name in the sample workflow files**:
4. **Replace cluster UID and project name in the sample workflow files**:

Update the sample workflow files [test-trainingworkload.yml](../../../resources/workflows/runai/test-trainingworkload.yml#L40-L41) and [test-distributedworkload.yml](../../../resources/workflows/runai/test-distributedworkload.yml#L40-L41) by replacing `<RUNAI_CLUSTER_ID>` with the cluster UID and `<RUNAI_PROJECT>` with the project name.

6. **Run the workflows**
5. **Run the workflows**

Run a Run:ai training workload:
```bash
Expand Down
137 changes: 137 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Gang-scheduling benchmark workflow for Run:ai.
#
# Two object templates are registered first; the SubmitObj tasks that follow
# each submit `count` objects built from the template named by `refTaskId`.
# Per the podCount expressions below, a training workload accounts for 1 pod
# and a distributed workload for `workers` + 1 pods (its pod-name pattern
# matches launcher and worker pods). The pod total of every task is noted next
# to it; tasks that share a numeric prefix (e.g. job3 and job3.1) add up to
# 32 pods.
# NOTE(review): `ttl` is handed to the template as a plain parameter; it is
# presumably the simulated pod run time -- confirm against the templates.
name: test-gang-scheduling
tasks:
# Single-pod training workload template.
- id: register-trainingworkload
  type: RegisterObj
  params:
    template: "resources/benchmarks/templates/runai/trainingworkload.yml"
    nameFormat: "twl{{._ENUM_}}"
    podNameFormat: "{{._NAME_}}-0-0"
    podCount: 1
# Distributed workload template: `workers` pods plus one more.
- id: register-distributedworkload
  type: RegisterObj
  params:
    template: "resources/benchmarks/templates/runai/distributedworkload.yml"
    nameFormat: "dwl{{._ENUM_}}"
    podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
    podCount: "{{.workers}} + 1"
#
### Benchmark test
#
# 1 x (31 + 1) = 32 pods
- id: job1
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 1
    params:
      workers: 31
      ttl: 30s
# 2 x (15 + 1) = 32 pods
- id: job2
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 2
    params:
      workers: 15
      ttl: 30s
# 3 x (9 + 1) = 30 pods
- id: job3
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 3
    params:
      workers: 9
      ttl: 30s
# 1 x (1 + 1) = 2 pods
- id: job3.1
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 1
    params:
      workers: 1
      ttl: 30s
# 4 x (7 + 1) = 32 pods
- id: job4
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 4
    params:
      workers: 7
      ttl: 30s
# 5 x (5 + 1) = 30 pods
- id: job5
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 5
    params:
      workers: 5
      ttl: 30s
# 2 x 1 = 2 pods (training workloads)
- id: job5.1
  type: SubmitObj
  params:
    refTaskId: register-trainingworkload
    count: 2
    params:
      ttl: 30s
# 6 x (4 + 1) = 30 pods
- id: job6
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 6
    params:
      workers: 4
      ttl: 30s
# 1 x (1 + 1) = 2 pods
- id: job6.1
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 1
    params:
      workers: 1
      ttl: 30s
# 7 x (3 + 1) = 28 pods
- id: job7
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 7
    params:
      workers: 3
      ttl: 30s
# 1 x (1 + 1) = 2 pods
- id: job7.1
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 1
    params:
      workers: 1
      ttl: 30s
# 2 x 1 = 2 pods (training workloads)
- id: job7.2
  type: SubmitObj
  params:
    refTaskId: register-trainingworkload
    count: 2
    params:
      ttl: 30s
# 8 x (3 + 1) = 32 pods
- id: job8
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 8
    params:
      workers: 3
      ttl: 30s
# 9 x (2 + 1) = 27 pods
- id: job9
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 9
    params:
      workers: 2
      ttl: 30s
# 1 x (4 + 1) = 5 pods
- id: job9.1
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 1
    params:
      workers: 4
      ttl: 30s
103 changes: 70 additions & 33 deletions resources/benchmarks/gang-scheduling/workflows/run-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/k8s/job.yml"
# template: "resources/benchmarks/templates/k8s/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-[0-9]-.*"
# podCount: "{{.replicas}}"
Expand All @@ -13,7 +13,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/k8s/jobset.yml"
# template: "resources/benchmarks/templates/k8s/jobset.yml"
# nameFormat: "jobset{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-workers-[0-9]+-[0-9]+-.+"
# podCount: "{{.replicas}}"
Expand All @@ -34,7 +34,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/kueue/job.yml"
# template: "resources/benchmarks/templates/kueue/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-[0-9]-.*"
# podCount: "{{.replicas}}"
Expand Down Expand Up @@ -69,39 +69,61 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/volcano/job.yml"
# template: "resources/benchmarks/templates/volcano/job.yml"
# nameFormat: "j{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-test-[0-9]+"
# podCount: "{{.replicas}}"
#- id: configure
# type: Configure
# params:
# configmaps:
# - name: volcano-scheduler-configmap
# namespace: volcano-system
# op: create
# data:
# volcano-scheduler.conf: |
# actions: "enqueue, allocate, backfill"
# tiers:
# - plugins:
# - name: priority
# - name: gang
# - name: conformance
# - plugins:
# - name: drf
# - name: predicates
# - name: proportion
# - name: nodeorder
# - name: binpack
# timeout: 1m

### Yunikorn
- id: register
type: RegisterObj
params:
template: "resources/benchmarks/gang-scheduling/templates/yunikorn/job.yml"
nameFormat: "job{{._ENUM_}}"
podNameFormat: "{{._NAME_}}-.*"
podCount: "{{.replicas}}"
- id: configure
type: Configure
params:
configmaps:
- name: yunikorn-configs
namespace: yunikorn
op: create
data:
queues.yaml: |
partitions:
- name: default
queues:
- name: root
queues:
- name: sandbox
submitacl: '*'
resources:
max:
{memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256}
timeout: 1m
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/templates/yunikorn/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-.*"
# podCount: "{{.replicas}}"
#- id: configure
# type: Configure
# params:
# configmaps:
# - name: yunikorn-configs
# namespace: yunikorn
# op: create
# data:
# queues.yaml: |
# partitions:
# - name: default
# queues:
# - name: root
# queues:
# - name: sandbox
# submitacl: '*'
# resources:
# max:
# {memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256}
# timeout: 1m
#
### Benchmark test
#
Expand All @@ -112,101 +134,116 @@ tasks:
count: 1
params:
replicas: 32
ttl: 30s
- id: job2
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 16
ttl: 30s
- id: job3
type: SubmitObj
params:
refTaskId: register
count: 3
params:
replicas: 10
ttl: 30s
- id: job3.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
replicas: 2
ttl: 30s
- id: job4
type: SubmitObj
params:
refTaskId: register
count: 4
params:
replicas: 8
replicas: 8
ttl: 30s
- id: job5
type: SubmitObj
params:
refTaskId: register
count: 5
params:
replicas: 6
ttl: 30s
- id: job5.1
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 1
ttl: 30s
- id: job6
type: SubmitObj
params:
refTaskId: register
count: 6
params:
replicas: 5
ttl: 30s
- id: job6.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
ttl: 30s
- id: job7
type: SubmitObj
params:
refTaskId: register
count: 7
params:
replicas: 4
ttl: 30s
- id: job7.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
ttl: 30s
- id: job7.2
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 1
ttl: 30s
- id: job8
type: SubmitObj
params:
refTaskId: register
count: 8
params:
replicas: 4
ttl: 30s
- id: job9
type: SubmitObj
params:
refTaskId: register
count: 9
params:
replicas: 3
ttl: 30s
- id: job9.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 5
ttl: 30s
Loading

0 comments on commit 3bb8591

Please sign in to comment.