Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update benchmarks #89

Merged
merged 1 commit into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions docs/examples/runai/runai.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,18 +23,12 @@ Navigate to the Run:ai portal and create a new project. Upon creating the projec

This script will deploy a `kind` cluster if necessary, followed by deploying `KWOK` and `Prometheus`. It will then prompt you to select a workload manager. Choose the `run:ai` option.

4. **Update KWOK stage**:

Update the pod-complete stage by running the following command
```bash
kubectl apply -f ./charts/overrides/kwok/pod-complete.yaml
```

5. **Replace cluster UID and project name in the sample workflow files**:
4. **Replace cluster UID and project name in the sample workflow files**:

Update the sample workflow files [test-trainingworkload.yml](../../../resources/workflows/runai/test-trainingworkload.yml#L40-L41) and [test-distributedworkload.yml](../../../resources/workflows/runai/test-distributedworkload.yml#L40-L41) by replacing `<RUNAI_CLUSTER_ID>` with the cluster UID and `<RUNAI_PROJECT>` with the project name.

6. **Run the workflows**
5. **Run the workflows**:

Run a Run:ai training workload:
```bash
Expand Down
137 changes: 137 additions & 0 deletions resources/benchmarks/gang-scheduling/workflows/run-test-runai.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Gang-scheduling benchmark workflow for Run:ai.
#
# Two object templates are registered first (a single-pod training workload
# and a launcher + N workers distributed workload); the remaining tasks submit
# batches of those objects by referencing the registration task via refTaskId.
#
# Pod accounting, derived from the podCount expressions below
# (distributed = workers + 1, training = 1): every jobN group, together with
# its jobN.x companions, adds up to 32 pods, e.g.
#   job7: 7 * (3 + 1) + job7.1: 1 * (1 + 1) + job7.2: 2 * 1 = 32
#
# NOTE(review): nesting restored from key order (outer task `params`, inner
# per-object `params`); `ttl` is presumably consumed by the object template -
# confirm against the templates referenced below.
name: test-gang-scheduling
tasks:
# Single-pod training workload; pod name is "<name>-0-0".
- id: register-trainingworkload
  type: RegisterObj
  params:
    template: "resources/benchmarks/templates/runai/trainingworkload.yml"
    nameFormat: "twl{{._ENUM_}}"
    podNameFormat: "{{._NAME_}}-0-0"
    podCount: 1
# Distributed workload; podNameFormat matches one launcher pod plus the
# worker pods, hence podCount = workers + 1.
- id: register-distributedworkload
  type: RegisterObj
  params:
    template: "resources/benchmarks/templates/runai/distributedworkload.yml"
    nameFormat: "dwl{{._ENUM_}}"
    podNameFormat: "{{._NAME_}}-(launcher-[a-z0-9]+|worker-[0-9]+)"
    podCount: "{{.workers}} + 1"
#
### Benchmark test
#
# 1 x (31 + 1) = 32 pods
- id: job1
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 1
    params:
      workers: 31
      ttl: 30s
# 2 x (15 + 1) = 32 pods
- id: job2
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 2
    params:
      workers: 15
      ttl: 30s
# 3 x (9 + 1) = 30 pods, topped up by job3.1
- id: job3
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 3
    params:
      workers: 9
      ttl: 30s
# 1 x (1 + 1) = 2 pods
- id: job3.1
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 1
    params:
      workers: 1
      ttl: 30s
# 4 x (7 + 1) = 32 pods
- id: job4
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 4
    params:
      workers: 7
      ttl: 30s
# 5 x (5 + 1) = 30 pods, topped up by job5.1
- id: job5
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 5
    params:
      workers: 5
      ttl: 30s
# 2 single-pod training workloads = 2 pods
- id: job5.1
  type: SubmitObj
  params:
    refTaskId: register-trainingworkload
    count: 2
    params:
      ttl: 30s
# 6 x (4 + 1) = 30 pods, topped up by job6.1
- id: job6
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 6
    params:
      workers: 4
      ttl: 30s
# 1 x (1 + 1) = 2 pods
- id: job6.1
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 1
    params:
      workers: 1
      ttl: 30s
# 7 x (3 + 1) = 28 pods, topped up by job7.1 and job7.2
- id: job7
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 7
    params:
      workers: 3
      ttl: 30s
# 1 x (1 + 1) = 2 pods
- id: job7.1
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 1
    params:
      workers: 1
      ttl: 30s
# 2 single-pod training workloads = 2 pods
- id: job7.2
  type: SubmitObj
  params:
    refTaskId: register-trainingworkload
    count: 2
    params:
      ttl: 30s
# 8 x (3 + 1) = 32 pods
- id: job8
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 8
    params:
      workers: 3
      ttl: 30s
# 9 x (2 + 1) = 27 pods, topped up by job9.1
- id: job9
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 9
    params:
      workers: 2
      ttl: 30s
# 1 x (4 + 1) = 5 pods
- id: job9.1
  type: SubmitObj
  params:
    refTaskId: register-distributedworkload
    count: 1
    params:
      workers: 4
      ttl: 30s
103 changes: 70 additions & 33 deletions resources/benchmarks/gang-scheduling/workflows/run-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/k8s/job.yml"
# template: "resources/benchmarks/templates/k8s/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-[0-9]-.*"
# podCount: "{{.replicas}}"
Expand All @@ -13,7 +13,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/k8s/jobset.yml"
# template: "resources/benchmarks/templates/k8s/jobset.yml"
# nameFormat: "jobset{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-workers-[0-9]+-[0-9]+-.+"
# podCount: "{{.replicas}}"
Expand All @@ -34,7 +34,7 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/kueue/job.yml"
# template: "resources/benchmarks/templates/kueue/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-[0-9]-.*"
# podCount: "{{.replicas}}"
Expand Down Expand Up @@ -69,39 +69,61 @@ tasks:
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/gang-scheduling/templates/volcano/job.yml"
# template: "resources/benchmarks/templates/volcano/job.yml"
# nameFormat: "j{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-test-[0-9]+"
# podCount: "{{.replicas}}"
#- id: configure
# type: Configure
# params:
# configmaps:
# - name: volcano-scheduler-configmap
# namespace: volcano-system
# op: create
# data:
# volcano-scheduler.conf: |
# actions: "enqueue, allocate, backfill"
# tiers:
# - plugins:
# - name: priority
# - name: gang
# - name: conformance
# - plugins:
# - name: drf
# - name: predicates
# - name: proportion
# - name: nodeorder
# - name: binpack
# timeout: 1m

### Yunikorn
- id: register
type: RegisterObj
params:
template: "resources/benchmarks/gang-scheduling/templates/yunikorn/job.yml"
nameFormat: "job{{._ENUM_}}"
podNameFormat: "{{._NAME_}}-.*"
podCount: "{{.replicas}}"
- id: configure
type: Configure
params:
configmaps:
- name: yunikorn-configs
namespace: yunikorn
op: create
data:
queues.yaml: |
partitions:
- name: default
queues:
- name: root
queues:
- name: sandbox
submitacl: '*'
resources:
max:
{memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256}
timeout: 1m
#- id: register
# type: RegisterObj
# params:
# template: "resources/benchmarks/templates/yunikorn/job.yml"
# nameFormat: "job{{._ENUM_}}"
# podNameFormat: "{{._NAME_}}-.*"
# podCount: "{{.replicas}}"
#- id: configure
# type: Configure
# params:
# configmaps:
# - name: yunikorn-configs
# namespace: yunikorn
# op: create
# data:
# queues.yaml: |
# partitions:
# - name: default
# queues:
# - name: root
# queues:
# - name: sandbox
# submitacl: '*'
# resources:
# max:
# {memory: 36Gi, vcore: 8000m, nvidia.com/gpu: 256}
# timeout: 1m
#
### Benchmark test
#
Expand All @@ -112,101 +134,116 @@ tasks:
count: 1
params:
replicas: 32
ttl: 30s
- id: job2
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 16
ttl: 30s
- id: job3
type: SubmitObj
params:
refTaskId: register
count: 3
params:
replicas: 10
ttl: 30s
- id: job3.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
replicas: 2
ttl: 30s
- id: job4
type: SubmitObj
params:
refTaskId: register
count: 4
params:
replicas: 8
replicas: 8
ttl: 30s
- id: job5
type: SubmitObj
params:
refTaskId: register
count: 5
params:
replicas: 6
ttl: 30s
- id: job5.1
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 1
ttl: 30s
- id: job6
type: SubmitObj
params:
refTaskId: register
count: 6
params:
replicas: 5
ttl: 30s
- id: job6.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
ttl: 30s
- id: job7
type: SubmitObj
params:
refTaskId: register
count: 7
params:
replicas: 4
ttl: 30s
- id: job7.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 2
ttl: 30s
- id: job7.2
type: SubmitObj
params:
refTaskId: register
count: 2
params:
replicas: 1
ttl: 30s
- id: job8
type: SubmitObj
params:
refTaskId: register
count: 8
params:
replicas: 4
ttl: 30s
- id: job9
type: SubmitObj
params:
refTaskId: register
count: 9
params:
replicas: 3
ttl: 30s
- id: job9.1
type: SubmitObj
params:
refTaskId: register
count: 1
params:
replicas: 5
ttl: 30s
Loading
Loading