Added test for create-pytorchjob.ipynb python notebook #2274

Open · wants to merge 18 commits into base: master
70 changes: 70 additions & 0 deletions .github/workflows/template-notebook-test/action.yaml
@@ -0,0 +1,70 @@
name: Notebook test template
description: A composite action to set up and run example notebooks using Papermill

inputs:
  kubernetes-version:
    required: true
    description: Kubernetes version
  python-version:
    required: false
    description: Python version
    # Latest supported version
    default: "3.10"
  papermill-args-yaml:
    description: Additional arguments to pass to Papermill in YAML format
    required: false
    default: ""
  notebook-input:
    description: Path to the input notebook
    required: true
  notebook-output:
    description: Path to save the output notebook
    required: true

runs:
  using: composite
  steps:
    - name: Free-Up Disk Space
      uses: ./.github/workflows/free-up-disk-space

    - name: Setup Python
      uses: actions/setup-python@v5
      with:
        python-version: ${{ inputs.python-version }}

    - name: Install Python dependencies
      shell: bash
      run: |
        pip install jupyter ipykernel papermill==2.2.0

    - name: Create k8s Kind Cluster
      uses: helm/[email protected]
      with:
        node_image: kindest/node:${{ inputs.kubernetes-version }}
        cluster_name: training-operator-cluster
        kubectl_version: ${{ inputs.kubernetes-version }}

    - name: Build training-operator
      shell: bash
      run: |
        ./scripts/gha/build-image.sh
      env:
        TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

    - name: Deploy training operator
      shell: bash
      run: |
        ./scripts/gha/setup-training-operator.sh
        docker system prune -a -f
        docker system df
        df -h
      env:
        KIND_CLUSTER: training-operator-cluster
        TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
        GANG_SCHEDULER_NAME: "none"
        KUBERNETES_VERSION: ${{ inputs.kubernetes-version }}

    - name: Run Jupyter Notebook with Papermill
      shell: bash
      run: |
        papermill ${{ inputs.notebook-input }} ${{ inputs.notebook-output }} -p kubeflow_python_sdk "./sdk/python" --parameters_yaml "${{ inputs.papermill-args-yaml }}"
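The final step passes one parameter with `-p` and the rest via `--parameters_yaml`. What Papermill does with these values is inject a new code cell immediately after the cell tagged `parameters`, overriding its defaults. A minimal pure-Python sketch of that injection (illustrative only, not Papermill's actual implementation; the notebook structure is simplified):

```python
# Simulate Papermill's parameter injection: the "parameters"-tagged cell
# keeps its defaults, and an "injected-parameters" cell is inserted right
# after it with the overriding values.
import json

notebook = {
    "cells": [
        {
            "cell_type": "code",
            "metadata": {"tags": ["parameters"]},
            "source": 'kubeflow_python_sdk="git+https://github.com/kubeflow/'
                      'training-operator.git#subdirectory=sdk/python"\n'
                      'namespace="kubeflow-user-example-com"',
        },
    ]
}

def inject_parameters(nb, params):
    """Insert an 'injected-parameters' cell after the parameters cell."""
    cells = nb["cells"]
    idx = next(i for i, c in enumerate(cells)
               if "parameters" in c.get("metadata", {}).get("tags", []))
    source = "\n".join(f"{k} = {json.dumps(v)}" for k, v in params.items())
    cells.insert(idx + 1, {
        "cell_type": "code",
        "metadata": {"tags": ["injected-parameters"]},
        "source": source,
    })
    return nb

nb = inject_parameters(notebook, {"kubeflow_python_sdk": "./sdk/python",
                                  "namespace": "default"})
print(nb["cells"][1]["source"])
```

This is why the CI run can point the notebook at the local `./sdk/python` checkout while interactive users still get the defaults from the tagged cell.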
31 changes: 31 additions & 0 deletions .github/workflows/test-example-notebooks.yaml
@@ -0,0 +1,31 @@
name: Test example notebooks

on:
  - pull_request

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  create-pytorchjob-notebook-test:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        kubernetes-version: ["v1.28.7"]
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Test Notebook
        uses: ./.github/workflows/template-notebook-test
        with:
          kubernetes-version: ${{ matrix.kubernetes-version }}
          python-version: ${{ matrix.python-version }}
          notebook-input: ./examples/pytorch/image-classification/create-pytorchjob.ipynb
          notebook-output: ./examples/pytorch/image-classification/create-pytorchjob-output.ipynb
          papermill-args-yaml: |
            namespace: default
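The `strategy.matrix` above fans out into one CI job per combination of the listed values. A small sketch of that expansion (plain Python, for illustration):

```python
# Expand the workflow's matrix into the set of jobs GitHub Actions runs.
from itertools import product

kubernetes_versions = ["v1.28.7"]
python_versions = ["3.9", "3.10", "3.11"]

jobs = [
    {"kubernetes-version": k8s, "python-version": py}
    for k8s, py in product(kubernetes_versions, python_versions)
]
print(len(jobs))  # one job per (k8s, python) pair
```

With one Kubernetes version and three Python versions, each pull request runs the notebook three times, and `fail-fast: false` lets the remaining combinations finish even if one fails.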
58 changes: 48 additions & 10 deletions examples/pytorch/image-classification/create-pytorchjob.ipynb
@@ -2,6 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"id": "3f9c3a89",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -13,6 +14,7 @@
},
{
"cell_type": "markdown",
"id": "1e9500f5",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -24,8 +26,24 @@
"The notebook shows how to use Kubeflow Training SDK to create, get, wait, check and delete PyTorchJob."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5e683444",
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"kubeflow_python_sdk=\"git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python\"\n",
[Review comment by saileshd1402 (Contributor, Author), Sep 30, 2024:
For papermill, all the parameters should be in one cell, but this seems to affect readability for the user. How should we improve this?
cc: @andreyvelich @tenzen-y @Electronic-Waste

Reply from a member:
@akshaychitneni @shravan-achar Any thoughts on papermill usage? Can we pass parameters using parameters_yaml to only a single notebook cell?]
"namespace=\"kubeflow-user-example-com\""
]
},
{
"cell_type": "markdown",
"id": "eb6e3a78",
"metadata": {
"tags": []
},
@@ -38,16 +56,19 @@
{
"cell_type": "code",
"execution_count": null,
"id": "03ec7538",
"metadata": {},
"outputs": [],
"source": [
"# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
"!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
"# Install Kubeflow Python SDK\n",
"!pip install {kubeflow_python_sdk}"
]
},
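The updated install cell relies on IPython's brace expansion: in a `!` shell line, `{kubeflow_python_sdk}` is filled in from the notebook's Python namespace before the command reaches the shell. Roughly equivalent to this plain-Python stand-in (the variable value is the one the CI action injects; interactive users get the Git URL default):

```python
# Hypothetical stand-in for IPython's {var} expansion in "!" shell lines:
# braces are resolved against the notebook's Python variables, then the
# resulting string is handed to the shell.
import shlex

kubeflow_python_sdk = "./sdk/python"  # value injected by Papermill in CI
command = "pip install {kubeflow_python_sdk}".format(
    kubeflow_python_sdk=kubeflow_python_sdk)
print(shlex.split(command))
```

This keeps the notebook runnable both locally (installing the SDK from GitHub) and in CI (installing from the checked-out `./sdk/python` directory).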
{
"cell_type": "code",
"execution_count": 2,
"id": "028bdb24",
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -71,6 +92,7 @@
},
{
"cell_type": "markdown",
"id": "ec708746",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -82,6 +104,7 @@
},
{
"cell_type": "markdown",
"id": "fb9ea8a8",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -94,6 +117,7 @@
{
"cell_type": "code",
"execution_count": 3,
"id": "a41509b6",
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -102,12 +126,11 @@
"outputs": [],
"source": [
"name = \"pytorch-dist-mnist-gloo\"\n",
"namespace = \"kubeflow-user-example-com\"\n",
"container_name = \"pytorch\"\n",
"\n",
"container = V1Container(\n",
" name=container_name,\n",
" image=\"gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0\",\n",
" image=\"kubeflow/pytorch-dist-mnist:latest\",\n",
" args=[\"--backend\", \"gloo\"],\n",
")\n",
"\n",
@@ -144,6 +167,7 @@
},
{
"cell_type": "markdown",
"id": "15f87dc3",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -158,6 +182,7 @@
{
"cell_type": "code",
"execution_count": 4,
"id": "bb649aa6",
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -176,12 +201,13 @@
"# Namespace will be reused in every APIs.\n",
"training_client = TrainingClient(namespace=namespace)\n",
"\n",
"# If `job_kind` is not set in `TrainingClient`, we need to set it for each API.\n",
"training_client.create_job(pytorchjob, job_kind=constants.PYTORCHJOB_KIND)"
"# `job_kind` is set in `TrainingClient`\n",
"training_client.create_job(pytorchjob)"
]
},
{
"cell_type": "markdown",
"id": "2356f98c",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -196,6 +222,7 @@
{
"cell_type": "code",
"execution_count": 5,
"id": "9a38d06a",
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -214,11 +241,12 @@
}
],
"source": [
"training_client.get_job(name, job_kind=constants.PYTORCHJOB_KIND).metadata.name"
"training_client.get_job(name).metadata.name"
]
},
{
"cell_type": "markdown",
"id": "79325494",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -231,6 +259,7 @@
{
"cell_type": "code",
"execution_count": 7,
"id": "6eab24c6",
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -260,11 +289,12 @@
}
],
"source": [
"training_client.get_job_conditions(name=name, job_kind=constants.PYTORCHJOB_KIND)"
"training_client.get_job_conditions(name=name)"
]
},
{
"cell_type": "markdown",
"id": "3c4ad567",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -277,6 +307,7 @@
{
"cell_type": "code",
"execution_count": 8,
"id": "36a43f58",
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -302,13 +333,14 @@
}
],
"source": [
"pytorchjob = training_client.wait_for_job_conditions(name=name, job_kind=constants.PYTORCHJOB_KIND)\n",
"pytorchjob = training_client.wait_for_job_conditions(name=name)\n",
"\n",
"print(f\"Succeeded number of replicas: {pytorchjob.status.replica_statuses['Master'].succeeded}\")"
]
},
{
"cell_type": "markdown",
"id": "7c51cd8c",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -321,6 +353,7 @@
{
"cell_type": "code",
"execution_count": 9,
"id": "3429ac34",
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -339,11 +372,12 @@
}
],
"source": [
"training_client.is_job_succeeded(name=name, job_kind=constants.PYTORCHJOB_KIND)"
"training_client.is_job_succeeded(name=name)"
]
},
{
"cell_type": "markdown",
"id": "d40c8408",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -356,6 +390,7 @@
{
"cell_type": "code",
"execution_count": 10,
"id": "93ea4500",
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -476,11 +511,12 @@
}
],
"source": [
"training_client.get_job_logs(name=name, job_kind=constants.PYTORCHJOB_KIND)"
"training_client.get_job_logs(name=name)"
]
},
{
"cell_type": "markdown",
"id": "405ffaf0",
"metadata": {
"pycharm": {
"name": "#%% md\n"
@@ -493,6 +529,7 @@
{
"cell_type": "code",
"execution_count": 11,
"id": "f79247a1",
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -514,6 +551,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "6f35c6fc",
"metadata": {},
"outputs": [],
"source": []