### Wait until a namespace holds exactly the expected number of pods.
# Polls `kubectl get pods` every 5 seconds until the pod count matches.
# Arguments:
#   $1 - namespace to inspect
#   $2 - expected pod count (non-negative integer)
#   $3 - optional timeout in seconds (default: 60)
# Outputs:
#   progress lines on stdout; usage/timeout errors on stderr
# Returns:
#   0 once the count matches; exits the shell with status 1 on invalid
#   arguments or when the timeout elapses.
function wait_for_pods() {
  local namespace=${1:-}
  local pods=${2:-}
  local wait_time=${3:-60}
  local sleep_interval=5
  local elapsed_time=0
  local count

  # Fail fast on bad input instead of looping until the timeout expires.
  if [[ -z "$namespace" || ! "$pods" =~ ^[0-9]+$ || ! "$wait_time" =~ ^[0-9]+$ ]]; then
    echo "usage: wait_for_pods <namespace> <pod-count> [timeout-seconds]" >&2
    exit 1
  fi

  while true; do
    # kubectl diagnostics are silenced on purpose: an empty listing simply
    # counts as zero pods and the loop retries.
    count=$(kubectl get pods -n "$namespace" --no-headers 2>/dev/null | wc -l)
    # Some wc implementations pad the number with blanks; strip them.
    count=${count//[[:space:]]/}
    if [ "$count" -eq "$pods" ]; then
      break
    fi
    echo "current pods $count, expecting $pods"

    # Check the deadline before sleeping so no time is wasted on a final
    # sleep that is never followed by another poll.
    if [ "$elapsed_time" -ge "$wait_time" ]; then
      echo "timed out after ${elapsed_time}s waiting for $pods pod(s) in namespace $namespace" >&2
      exit 1
    fi
    sleep "$sleep_interval"
    elapsed_time=$((elapsed_time + sleep_interval))
  done
}
https://github.com/kubernetes-sigs/jobset/releases/download/${JOBSET_VERSION}/manifests.yaml + kubectl -n jobset-system patch deploy jobset-controller-manager \ + --patch-file=$REPO_HOME/charts/overrides/kwok-affinity-deployment-patch.yaml + + wait_for_pods "jobset-system" 1 + kubectl -n jobset-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=300s } @@ -110,6 +138,11 @@ function deploy_kueue() { kubectl apply --server-side -f https://github.com/kubernetes-sigs/kueue/releases/download/${KUEUE_VERSION}/manifests.yaml + kubectl -n kueue-system patch deployment kueue-controller-manager \ + --patch-file=$REPO_HOME/charts/overrides/kwok-affinity-deployment-patch.yaml + + wait_for_pods "kueue-system" 1 + kubectl -n kueue-system wait --for=condition=ready pod -l control-plane=controller-manager --timeout=300s } @@ -122,7 +155,8 @@ function deploy_volcano() { helm repo add --force-update volcano-sh https://volcano-sh.github.io/helm-charts helm upgrade --install volcano volcano-sh/volcano -n volcano-system --create-namespace \ - --version=$VOLCANO_VERSION --wait + --version=$VOLCANO_VERSION --wait \ + --set-json 'affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}' for app in volcano-admission volcano-controller volcano-scheduler; do kubectl -n volcano-system wait --for=condition=ready pod -l app=$app --timeout=300s @@ -142,7 +176,8 @@ function deploy_yunikorn() { helm repo add --force-update yunikorn https://apache.github.io/yunikorn-release helm upgrade --install yunikorn yunikorn/yunikorn -n yunikorn --create-namespace \ - --version=$YUNIKORN_VERSION --wait + --version=$YUNIKORN_VERSION --wait \ + --set-json 'affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}' kubectl -n yunikorn wait 
--for=condition=ready pod -l app=yunikorn --timeout=300s } @@ -186,5 +221,6 @@ Run:ai deployment requires environment variables: --set controlPlane.url=$RUNAI_CONTROL_PLANE_URL \ --set controlPlane.clientSecret=$RUNAI_CLIENT_SECRET \ --set cluster.uid=$RUNAI_CLUSTER_ID \ - --set cluster.url=https://example.com + --set cluster.url=https://example.com \ + --set-json 'affinity={"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"type","operator":"NotIn","values":["kwok"]}]}]}}}' }