diff --git a/.github/resources/minio_remote_config_cell.json b/.github/resources/minio_remote_config_cell.json new file mode 100644 index 000000000..e36c4b188 --- /dev/null +++ b/.github/resources/minio_remote_config_cell.json @@ -0,0 +1,20 @@ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@ray.remote\n", + "def get_minio_run_config():\n", + " import s3fs\n", + " import pyarrow.fs\n", + " s3_fs = s3fs.S3FileSystem(\n", + " key = \"minio\",\n", + " secret = \"minio123\",\n", + " endpoint_url = \"http://minio-service.default.svc.cluster.local:9000\"\n", + " )\n", + " custom_fs = pyarrow.fs.PyFileSystem(pyarrow.fs.FSSpecHandler(s3_fs))\n", + " run_config = ray.train.RunConfig(storage_path='training', storage_filesystem=custom_fs)\n", + " return run_config" + ] + } diff --git a/.github/resources/wait_for_job_cell.json b/.github/resources/wait_for_job_cell.json new file mode 100644 index 000000000..eb8805bd4 --- /dev/null +++ b/.github/resources/wait_for_job_cell.json @@ -0,0 +1,20 @@ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from time import sleep\n", + "\n", + "finished = False\n", + "while not finished:\n", + " sleep(5)\n", + " status = client.get_job_status(submission_id)\n", + " finished = (status == \"SUCCEEDED\" or status == \"FAILED\" or status == \"STOPPED\")\n", + " print(status)\n", + "print(\"Job status \" + status)\n", + "print(\"Logs: \")\n", + "print(client.get_job_logs(submission_id))\n", + "assert status == \"SUCCEEDED\", \"Job failed or was stopped!\"" + ] + } diff --git a/.github/workflows/e2e_tests.yaml b/.github/workflows/e2e_tests.yaml deleted file mode 100644 index d216df9d7..000000000 --- a/.github/workflows/e2e_tests.yaml +++ /dev/null @@ -1,164 +0,0 @@ -name: e2e - -on: - pull_request: - branches: - - main - - 'release-*' - paths-ignore: - - 'docs/**' - - '**.adoc' - - '**.md' - - 'LICENSE' - push: - branches: - - main - -
'release-*' - paths-ignore: - - 'docs/**' - - '**.adoc' - - '**.md' - - 'LICENSE' - -concurrency: - group: ${{ github.head_ref }}-${{ github.workflow }} - cancel-in-progress: true - -env: - CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" - -jobs: - kubernetes: - - runs-on: ubuntu-20.04-4core-gpu - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - submodules: recursive - - - name: Checkout common repo code - uses: actions/checkout@v4 - with: - repository: 'project-codeflare/codeflare-common' - ref: 'main' - path: 'common' - - - name: Checkout CodeFlare operator repository - uses: actions/checkout@v4 - with: - repository: project-codeflare/codeflare-operator - path: codeflare-operator - - - name: Set Go - uses: actions/setup-go@v5 - with: - go-version-file: './codeflare-operator/go.mod' - cache-dependency-path: "./codeflare-operator/go.sum" - - - name: Set up gotestfmt - uses: gotesttools/gotestfmt-action@v2 - with: - token: ${{ secrets.GITHUB_TOKEN }} - - - name: Set up specific Python version - uses: actions/setup-python@v5 - with: - python-version: '3.9' - cache: 'pip' # caching pip dependencies - - - name: Setup NVidia GPU environment for KinD - uses: ./common/github-actions/nvidia-gpu-setup - - - name: Setup and start KinD cluster - uses: ./common/github-actions/kind - - - name: Install NVidia GPU operator for KinD - uses: ./common/github-actions/nvidia-gpu-operator - - - name: Deploy CodeFlare stack - id: deploy - run: | - cd codeflare-operator - echo Setting up CodeFlare stack - make setup-e2e - echo Deploying CodeFlare operator - make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" - kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager - cd .. 
- - - name: Add user to KinD - uses: ./common/github-actions/kind-add-user - with: - user-name: sdk-user - - - name: Configure RBAC for sdk user with limited permissions - run: | - kubectl create clusterrole list-ingresses --verb=get,list --resource=ingresses - kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user - kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces - kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user - kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters - kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user - kubectl create clusterrole appwrapper-creator --verb=get,list,create,delete,patch --resource=appwrappers - kubectl create clusterrolebinding sdk-user-appwrapper-creator --clusterrole=appwrapper-creator --user=sdk-user - kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors - kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user - kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues - kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user - kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues - kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user - kubectl create clusterrole list-secrets --verb=get,list --resource=secrets - kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user - kubectl config use-context sdk-user - - - name: Run e2e tests - run: | - export CODEFLARE_TEST_OUTPUT_DIR=${{ 
env.TEMP_DIR }} - echo "CODEFLARE_TEST_OUTPUT_DIR=${CODEFLARE_TEST_OUTPUT_DIR}" >> $GITHUB_ENV - - set -euo pipefail - pip install poetry - poetry install --with test,docs - echo "Running e2e tests..." - poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1 - env: - GRPC_DNS_RESOLVER: "native" - - - name: Switch to kind-cluster context to print logs - if: always() && steps.deploy.outcome == 'success' - run: kubectl config use-context kind-cluster - - - name: Print Pytest output log - if: always() && steps.deploy.outcome == 'success' - run: | - echo "Printing Pytest output logs" - cat ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log - - - name: Print CodeFlare operator logs - if: always() && steps.deploy.outcome == 'success' - run: | - echo "Printing CodeFlare operator logs" - kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${CODEFLARE_TEST_OUTPUT_DIR}/codeflare-operator.log - - - name: Print KubeRay operator logs - if: always() && steps.deploy.outcome == 'success' - run: | - echo "Printing KubeRay operator logs" - kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${CODEFLARE_TEST_OUTPUT_DIR}/kuberay.log - - - name: Export all KinD pod logs - uses: ./common/github-actions/kind-export-logs - if: always() && steps.deploy.outcome == 'success' - with: - output-directory: ${CODEFLARE_TEST_OUTPUT_DIR} - - - name: Upload logs - uses: actions/upload-artifact@v4 - if: always() && steps.deploy.outcome == 'success' - with: - name: logs - retention-days: 10 - path: | - ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log diff --git a/.github/workflows/guided_notebook_tests.yaml b/.github/workflows/guided_notebook_tests.yaml new file mode 100644 index 000000000..757f680cb --- /dev/null +++ b/.github/workflows/guided_notebook_tests.yaml @@ -0,0 +1,394 @@ +name: Guided notebooks tests + +on: + pull_request: + branches: + - main + - 'release-*' + 
paths-ignore: + - 'docs/**' + - '**.adoc' + - '**.md' + - 'LICENSE' + push: + branches: + - main + - 'release-*' + paths-ignore: + - 'docs/**' + - '**.adoc' + - '**.md' + - 'LICENSE' + +concurrency: + group: ${{ github.head_ref || github.ref }}-${{ github.workflow }} + cancel-in-progress: true + +env: + CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev" + +jobs: + verify-0_basic_ray: + runs-on: ubuntu-20.04-4core + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Checkout common repo code + uses: actions/checkout@v4 + with: + repository: 'project-codeflare/codeflare-common' + ref: 'main' + path: 'common' + + - name: Checkout CodeFlare operator repository + uses: actions/checkout@v4 + with: + repository: project-codeflare/codeflare-operator + path: codeflare-operator + + - name: Set Go + uses: actions/setup-go@v5 + with: + go-version-file: './codeflare-operator/go.mod' + cache-dependency-path: "./codeflare-operator/go.sum" + + - name: Set up gotestfmt + uses: gotesttools/gotestfmt-action@v2 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up specific Python version + uses: actions/setup-python@v5 + with: + python-version: '3.9' + cache: 'pip' # caching pip dependencies + + - name: Setup and start KinD cluster + uses: ./common/github-actions/kind + + - name: Deploy CodeFlare stack + id: deploy + run: | + cd codeflare-operator + echo Setting up CodeFlare stack + make setup-e2e + echo Deploying CodeFlare operator + make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" + kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager + cd .. + + - name: Setup Guided notebooks execution + run: | + echo "Installing papermill and dependencies..."
+ pip install poetry papermill ipython ipykernel + # Disable virtualenv due to problems using packaged in virtualenv in papermill + poetry config virtualenvs.create false + + echo "Installing SDK..." + poetry install --with test,docs + + - name: Run 0_basic_ray.ipynb + run: | + set -euo pipefail + + # Remove login/logout cells, as KinD doesn't support authentication using token + jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb + jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 0_basic_ray.ipynb > 0_basic_ray.ipynb.tmp && mv 0_basic_ray.ipynb.tmp 0_basic_ray.ipynb + # Set explicit namespace as SDK need it (currently) to resolve local queues + sed -i "s/head_memory=2,/head_memory=2, namespace='default',/" 0_basic_ray.ipynb + # Run notebook + poetry run papermill 0_basic_ray.ipynb 0_basic_ray_out.ipynb --log-output --execution-timeout 600 + working-directory: demo-notebooks/guided-demos + + - name: Print CodeFlare operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing CodeFlare operator logs" + kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log + + - name: Print Kueue operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing Kueue operator logs" + KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}') + kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log + + - name: Print KubeRay operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing KubeRay operator logs" + kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log + + - name: Export all KinD pod logs + uses: 
./common/github-actions/kind-export-logs + if: always() && steps.deploy.outcome == 'success' + with: + output-directory: ${TEMP_DIR} + + - name: Upload logs + uses: actions/upload-artifact@v4 + if: always() && steps.deploy.outcome == 'success' + with: + name: logs-0_basic_ray + retention-days: 10 + path: | + ${{ env.TEMP_DIR }}/**/*.log + + verify-1_cluster_job_client: + runs-on: ubuntu-20.04-4core-gpu + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Checkout common repo code + uses: actions/checkout@v4 + with: + repository: 'project-codeflare/codeflare-common' + ref: 'main' + path: 'common' + + - name: Checkout CodeFlare operator repository + uses: actions/checkout@v4 + with: + repository: project-codeflare/codeflare-operator + path: codeflare-operator + + - name: Set Go + uses: actions/setup-go@v5 + with: + go-version-file: './codeflare-operator/go.mod' + cache-dependency-path: "./codeflare-operator/go.sum" + + - name: Set up gotestfmt + uses: gotesttools/gotestfmt-action@v2 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up specific Python version + uses: actions/setup-python@v5 + with: + python-version: '3.9' + cache: 'pip' # caching pip dependencies + + - name: Setup NVidia GPU environment for KinD + uses: ./common/github-actions/nvidia-gpu-setup + + - name: Setup and start KinD cluster + uses: ./common/github-actions/kind + + - name: Install NVidia GPU operator for KinD + uses: ./common/github-actions/nvidia-gpu-operator + with: + enable-time-slicing: 'true' + + - name: Deploy CodeFlare stack + id: deploy + run: | + cd codeflare-operator + echo Setting up CodeFlare stack + make setup-e2e + echo Deploying CodeFlare operator + make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" + kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager + cd .. 
+ + - name: Setup Guided notebooks execution + run: | + echo "Installing papermill and dependencies..." + pip install poetry papermill ipython ipykernel + # Disable virtualenv due to problems using packaged in virtualenv in papermill + poetry config virtualenvs.create false + + echo "Installing SDK..." + poetry install --with test,docs + + - name: Run 1_cluster_job_client.ipynb + run: | + set -euo pipefail + + # Remove login/logout cells, as KinD doesn't support authentication using token + jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb + jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb + # Replace async logs with waiting for job to finish, async logs don't work properly in papermill + JOB_WAIT=$(jq -r '.' 
${GITHUB_WORKSPACE}/.github/resources/wait_for_job_cell.json) + jq --argjson job_wait "$JOB_WAIT" -r '(.cells[] | select(.source[] | contains("async for lines in client.tail_job_logs"))) |= $job_wait' 1_cluster_job_client.ipynb > 1_cluster_job_client.ipynb.tmp && mv 1_cluster_job_client.ipynb.tmp 1_cluster_job_client.ipynb + # Set explicit namespace as SDK need it (currently) to resolve local queues + sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 1_cluster_job_client.ipynb + # Run notebook + poetry run papermill 1_cluster_job_client.ipynb 1_cluster_job_client_out.ipynb --log-output --execution-timeout 1200 + working-directory: demo-notebooks/guided-demos + + - name: Print CodeFlare operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing CodeFlare operator logs" + kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log + + - name: Print Kueue operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing Kueue operator logs" + KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}') + kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log + + - name: Print KubeRay operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing KubeRay operator logs" + kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log + + - name: Export all KinD pod logs + uses: ./common/github-actions/kind-export-logs + if: always() && steps.deploy.outcome == 'success' + with: + output-directory: ${TEMP_DIR} + + - name: Upload logs + uses: actions/upload-artifact@v4 + if: always() && steps.deploy.outcome == 'success' + with: + name: logs-1_cluster_job_client + retention-days: 10 + path: | + ${{ env.TEMP_DIR }}/**/*.log + + verify-2_basic_interactive: + runs-on: ubuntu-20.04-4core-gpu + + 
steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + submodules: recursive + + - name: Checkout common repo code + uses: actions/checkout@v4 + with: + repository: 'project-codeflare/codeflare-common' + ref: 'main' + path: 'common' + + - name: Checkout CodeFlare operator repository + uses: actions/checkout@v4 + with: + repository: project-codeflare/codeflare-operator + path: codeflare-operator + + - name: Set Go + uses: actions/setup-go@v5 + with: + go-version-file: './codeflare-operator/go.mod' + cache-dependency-path: "./codeflare-operator/go.sum" + + - name: Set up gotestfmt + uses: gotesttools/gotestfmt-action@v2 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Set up specific Python version + uses: actions/setup-python@v5 + with: + python-version: '3.9' + cache: 'pip' # caching pip dependencies + + - name: Setup NVidia GPU environment for KinD + uses: ./common/github-actions/nvidia-gpu-setup + + - name: Setup and start KinD cluster + uses: ./common/github-actions/kind + + - name: Install NVidia GPU operator for KinD + uses: ./common/github-actions/nvidia-gpu-operator + with: + enable-time-slicing: 'true' + + - name: Deploy CodeFlare stack + id: deploy + run: | + cd codeflare-operator + echo Setting up CodeFlare stack + make setup-e2e + echo Deploying CodeFlare operator + make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e" + kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager + cd .. + + - name: Install MINIO + run: | + kubectl apply -f ./tests/e2e/minio_deployment.yaml + kubectl wait --timeout=120s --for=condition=Available=true deployment -n default minio + + - name: Setup Guided notebooks execution + run: | + echo "Installing papermill and dependencies..." + pip install poetry papermill ipython ipykernel + # Disable virtualenv due to problems using packaged in virtualenv in papermill + poetry config virtualenvs.create false + + echo "Installing SDK..." 
+ poetry install --with test,docs + + - name: Run 2_basic_interactive.ipynb + run: | + set -euo pipefail + + # Remove login/logout cells, as KinD doesn't support authentication using token + jq -r 'del(.cells[] | select(.source[] | contains("Create authentication object for user permissions")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb + jq -r 'del(.cells[] | select(.source[] | contains("auth.logout()")))' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb + # Rewrite cluster_uri() to local_client_url() to retrieve client URL available out of cluster, as the test is executed outside of cluster + sed -i "s/cluster_uri()/local_client_url()/" 2_basic_interactive.ipynb + # Set explicit namespace as SDK need it (currently) to resolve local queues + sed -i "s/head_cpus=1,/head_cpus=1, namespace='default',/" 2_basic_interactive.ipynb + # Add MINIO related modules to runtime environment + sed -i "s/\\\\\"transformers/\\\\\"s3fs\\\\\", \\\\\"pyarrow\\\\\", \\\\\"transformers/" 2_basic_interactive.ipynb + # Replace markdown cell with remote configuration for MINIO + MINIO_CONFIG=$(jq -r '.' 
${GITHUB_WORKSPACE}/.github/resources/minio_remote_config_cell.json) + jq --argjson minio_config "$MINIO_CONFIG" -r '(.cells[] | select(.source[] | contains("Now that we are connected"))) |= $minio_config' 2_basic_interactive.ipynb > 2_basic_interactive.ipynb.tmp && mv 2_basic_interactive.ipynb.tmp 2_basic_interactive.ipynb + # Configure persistent storage for Ray trainer + sed -i -E "s/# run_config.*\)/, run_config=ray.get(get_minio_run_config.remote())/" 2_basic_interactive.ipynb + # Run notebook + poetry run papermill 2_basic_interactive.ipynb 2_basic_interactive_out.ipynb --log-output --execution-timeout 1200 + env: + GRPC_DNS_RESOLVER: "native" + working-directory: demo-notebooks/guided-demos + + - name: Print CodeFlare operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing CodeFlare operator logs" + kubectl logs -n openshift-operators --tail -1 -l app.kubernetes.io/name=codeflare-operator | tee ${TEMP_DIR}/codeflare-operator.log + + - name: Print Kueue operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing Kueue operator logs" + KUEUE_CONTROLLER_POD=$(kubectl get pods -n kueue-system | grep kueue-controller | awk '{print $1}') + kubectl logs -n kueue-system --tail -1 ${KUEUE_CONTROLLER_POD} | tee ${TEMP_DIR}/kueue.log + + - name: Print KubeRay operator logs + if: always() && steps.deploy.outcome == 'success' + run: | + echo "Printing KubeRay operator logs" + kubectl logs -n ray-system --tail -1 -l app.kubernetes.io/name=kuberay | tee ${TEMP_DIR}/kuberay.log + + - name: Export all KinD pod logs + uses: ./common/github-actions/kind-export-logs + if: always() && steps.deploy.outcome == 'success' + with: + output-directory: ${TEMP_DIR} + + - name: Upload logs + uses: actions/upload-artifact@v4 + if: always() && steps.deploy.outcome == 'success' + with: + name: logs-2_basic_interactive + retention-days: 10 + path: | + ${{ env.TEMP_DIR }}/**/*.log diff --git 
a/demo-notebooks/guided-demos/0_basic_ray.ipynb b/demo-notebooks/guided-demos/0_basic_ray.ipynb index 3f0f62e47..11f3a3b2e 100644 --- a/demo-notebooks/guided-demos/0_basic_ray.ipynb +++ b/demo-notebooks/guided-demos/0_basic_ray.ipynb @@ -62,10 +62,12 @@ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='raytest', \n", + " head_cpus='500m',\n", + " head_memory=2,\n", " head_gpus=0, # For GPU enabled workloads set the head_gpus and num_gpus\n", " num_gpus=0,\n", " num_workers=2,\n", - " min_cpus=1,\n", + " min_cpus='250m',\n", " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", diff --git a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb index 00576024a..bd5d69657 100644 --- a/demo-notebooks/guided-demos/1_cluster_job_client.ipynb +++ b/demo-notebooks/guided-demos/1_cluster_job_client.ipynb @@ -44,10 +44,12 @@ "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n", "cluster = Cluster(ClusterConfiguration(\n", " name='jobtest',\n", + " head_cpus=1,\n", + " head_memory=4,\n", " head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n", " num_gpus=1,\n", " num_workers=2,\n", - " min_cpus=1,\n", + " min_cpus='250m',\n", " max_cpus=1,\n", " min_memory=4,\n", " max_memory=4,\n", diff --git a/demo-notebooks/guided-demos/2_basic_interactive.ipynb b/demo-notebooks/guided-demos/2_basic_interactive.ipynb index 0692caa4c..f95edc9d8 100644 --- a/demo-notebooks/guided-demos/2_basic_interactive.ipynb +++ b/demo-notebooks/guided-demos/2_basic_interactive.ipynb @@ -60,13 +60,15 @@ "cluster_name = \"interactivetest\"\n", "cluster = Cluster(ClusterConfiguration(\n", " 
name=cluster_name,\n", + " head_cpus=1,\n", + " head_memory=6,\n", " head_gpus=1, # For GPU enabled workloads set the head_gpus and num_gpus\n", " num_gpus=1,\n", " num_workers=2,\n", - " min_cpus=2,\n", - " max_cpus=2,\n", - " min_memory=8,\n", - " max_memory=8,\n", + " min_cpus='250m',\n", + " max_cpus=1,\n", + " min_memory=4,\n", + " max_memory=6,\n", " image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n", " write_to_file=False, # When enabled Ray Cluster yaml files are written to /HOME/.codeflare/resources \n", " # local_queue=\"local-queue-name\" # Specify the local queue manually\n", @@ -251,7 +253,17 @@ "\n", " ray_trainer = TorchTrainer(\n", " train_func,\n", - " scaling_config=ScalingConfig(num_workers=3, use_gpu=True),\n", + " scaling_config=ScalingConfig(\n", + " # num_workers = number of worker nodes with the ray head node included\n", + " num_workers=3,\n", + " use_gpu=True,\n", + " resources_per_worker={\n", + " \"CPU\": 1,\n", + " },\n", + " trainer_resources={\n", + " \"CPU\": 0,\n", + " }\n", + " )\n", " # Configure persistent storage that is accessible across \n", " # all worker nodes.\n", " # Uncomment and update the RunConfig below to include your storage details.\n", diff --git a/demo-notebooks/guided-demos/mnist_fashion.py b/demo-notebooks/guided-demos/mnist_fashion.py index 85cd6e64c..ba5b2636c 100644 --- a/demo-notebooks/guided-demos/mnist_fashion.py +++ b/demo-notebooks/guided-demos/mnist_fashion.py @@ -78,8 +78,16 @@ def train_func_distributed(): trainer = TorchTrainer( train_func_distributed, scaling_config=ScalingConfig( - num_workers=3, use_gpu=use_gpu - ), # num_workers = number of worker nodes with the ray head node included + # num_workers = number of worker nodes with the ray head node included + num_workers=3, + use_gpu=use_gpu, + resources_per_worker={ + "CPU": 1, + }, + trainer_resources={ + "CPU": 0, + }, + ), ) results = trainer.fit() diff --git a/tests/e2e/minio_deployment.yaml b/tests/e2e/minio_deployment.yaml index 
86d4ef01f..b2cdc54a9 100644 --- a/tests/e2e/minio_deployment.yaml +++ b/tests/e2e/minio_deployment.yaml @@ -88,10 +88,7 @@ spec: mountPath: /data subPath: minio terminationMessagePolicy: File - image: >- - quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z - # In case of disconnected environment, use image digest instead of tag - # For example : /minio/minio@sha256:6b3abf2f59286b985bfde2b23e37230b466081eda5dccbf971524d54c8e406b5 + image: quay.io/minio/minio:RELEASE.2024-06-22T05-26-45Z args: - server - /data @@ -129,35 +126,3 @@ spec: sessionAffinity: None selector: app: minio ---- -kind: Route -apiVersion: route.openshift.io/v1 -metadata: - name: minio-api -spec: - to: - kind: Service - name: minio-service - weight: 100 - port: - targetPort: api - wildcardPolicy: None - tls: - termination: edge - insecureEdgeTerminationPolicy: Redirect ---- -kind: Route -apiVersion: route.openshift.io/v1 -metadata: - name: minio-ui -spec: - to: - kind: Service - name: minio-service - weight: 100 - port: - targetPort: ui - wildcardPolicy: None - tls: - termination: edge - insecureEdgeTerminationPolicy: Redirect