From a0def9f3ece37d0a78ff0043f3b151df6f745e9d Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Thu, 8 Aug 2024 13:26:59 +0200 Subject: [PATCH 01/26] Introduce ManagedBy field to RunPolicy that is used by each Kubeflow Job Spec Signed-off-by: Michal Szadkowski --- pkg/apis/kubeflow.org/v1/common_types.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pkg/apis/kubeflow.org/v1/common_types.go b/pkg/apis/kubeflow.org/v1/common_types.go index 42fd1d0dad..3d2f70fb60 100644 --- a/pkg/apis/kubeflow.org/v1/common_types.go +++ b/pkg/apis/kubeflow.org/v1/common_types.go @@ -221,6 +221,21 @@ type RunPolicy struct { // +kubebuilder:default:=false // +optional Suspend *bool `json:"suspend,omitempty"` + + // ManagedBy is used to indicate the controller or entity that manages a job. + // The value must be either an empty, 'kubeflow.org/training-operator' or + // 'kueue.x-k8s.io/multikueue'. + // The built-in job controller reconciles a job which don't have this + // field at all or the field value is the reserved string + // 'kubeflow.org/training-operator', but delegates reconciling the job + // with a 'kueue.x-k8s.io/multikueue' to the Kueue. + // + // The value must be a valid domain-prefixed path (e.g. acme.io/foo) - + // all characters before the first "/" must be a valid subdomain as defined + // by RFC 1123. All characters trailing the first "/" must be valid HTTP Path + // characters as defined by RFC 3986. The value cannot exceed 63 characters. + // The field is immutable. + ManagedBy *string `json:"managedBy,omitempty"` } // SchedulingPolicy encapsulates various scheduling policies of the distributed training From c11c3516a3dde279a9105129f7bace13f7d21f5c Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Thu, 8 Aug 2024 13:27:59 +0200 Subject: [PATCH 02/26] Update Kubeflow JOb manifests Signed-off-by: Michal Szadkowski --- manifests/base/crds/kubeflow.org_jaxjobs.yaml | 10 ++++++++++ manifests/base/crds/kubeflow.org_mpijobs.yaml | 10 ++++++++++ manifests/base/crds/kubeflow.org_paddlejobs.yaml | 10 ++++++++++ manifests/base/crds/kubeflow.org_pytorchjobs.yaml | 10 ++++++++++ manifests/base/crds/kubeflow.org_tfjobs.yaml | 10 ++++++++++ manifests/base/crds/kubeflow.org_xgboostjobs.yaml | 10 ++++++++++ 6 files changed, 60 insertions(+) diff --git a/manifests/base/crds/kubeflow.org_jaxjobs.yaml b/manifests/base/crds/kubeflow.org_jaxjobs.yaml index 79cf2c6128..fd0a33b998 100644 --- a/manifests/base/crds/kubeflow.org_jaxjobs.yaml +++ b/manifests/base/crds/kubeflow.org_jaxjobs.yaml @@ -7306,6 +7306,16 @@ spec: CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. type: string + managedBy: + description: |- + ManagedBy is used to indicate the controller or entity that manages a job. + The value must be either an empty, 'kubeflow.org/training-operator' or + 'kueue.x-k8s.io/multikueue'. + The built-in job controller reconciles a job which don't have this + field at all or the field value is the reserved string + 'kubeflow.org/training-operator', but delegates reconciling the job + with a 'kueue.x-k8s. + type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling diff --git a/manifests/base/crds/kubeflow.org_mpijobs.yaml b/manifests/base/crds/kubeflow.org_mpijobs.yaml index 4139d49fef..0384e06d32 100644 --- a/manifests/base/crds/kubeflow.org_mpijobs.yaml +++ b/manifests/base/crds/kubeflow.org_mpijobs.yaml @@ -7311,6 +7311,16 @@ spec: CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. type: string + managedBy: + description: |- + ManagedBy is used to indicate the controller or entity that manages a job. + The value must be either an empty, 'kubeflow.org/training-operator' or + 'kueue.x-k8s.io/multikueue'. + The built-in job controller reconciles a job which don't have this + field at all or the field value is the reserved string + 'kubeflow.org/training-operator', but delegates reconciling the job + with a 'kueue.x-k8s. + type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling diff --git a/manifests/base/crds/kubeflow.org_paddlejobs.yaml b/manifests/base/crds/kubeflow.org_paddlejobs.yaml index c8a5acb8c3..27511571fd 100644 --- a/manifests/base/crds/kubeflow.org_paddlejobs.yaml +++ b/manifests/base/crds/kubeflow.org_paddlejobs.yaml @@ -7793,6 +7793,16 @@ spec: CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. type: string + managedBy: + description: |- + ManagedBy is used to indicate the controller or entity that manages a job. + The value must be either an empty, 'kubeflow.org/training-operator' or + 'kueue.x-k8s.io/multikueue'. + The built-in job controller reconciles a job which don't have this + field at all or the field value is the reserved string + 'kubeflow.org/training-operator', but delegates reconciling the job + with a 'kueue.x-k8s. + type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling diff --git a/manifests/base/crds/kubeflow.org_pytorchjobs.yaml b/manifests/base/crds/kubeflow.org_pytorchjobs.yaml index a5c261e124..e28c5be35e 100644 --- a/manifests/base/crds/kubeflow.org_pytorchjobs.yaml +++ b/manifests/base/crds/kubeflow.org_pytorchjobs.yaml @@ -7830,6 +7830,16 @@ spec: CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. type: string + managedBy: + description: |- + ManagedBy is used to indicate the controller or entity that manages a job. + The value must be either an empty, 'kubeflow.org/training-operator' or + 'kueue.x-k8s.io/multikueue'. + The built-in job controller reconciles a job which don't have this + field at all or the field value is the reserved string + 'kubeflow.org/training-operator', but delegates reconciling the job + with a 'kueue.x-k8s. + type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling diff --git a/manifests/base/crds/kubeflow.org_tfjobs.yaml b/manifests/base/crds/kubeflow.org_tfjobs.yaml index 153243aae6..4fd7f4a88c 100644 --- a/manifests/base/crds/kubeflow.org_tfjobs.yaml +++ b/manifests/base/crds/kubeflow.org_tfjobs.yaml @@ -71,6 +71,16 @@ spec: CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. type: string + managedBy: + description: |- + ManagedBy is used to indicate the controller or entity that manages a job. + The value must be either an empty, 'kubeflow.org/training-operator' or + 'kueue.x-k8s.io/multikueue'. + The built-in job controller reconciles a job which don't have this + field at all or the field value is the reserved string + 'kubeflow.org/training-operator', but delegates reconciling the job + with a 'kueue.x-k8s. + type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling diff --git a/manifests/base/crds/kubeflow.org_xgboostjobs.yaml b/manifests/base/crds/kubeflow.org_xgboostjobs.yaml index 34237c6aba..c023598f88 100644 --- a/manifests/base/crds/kubeflow.org_xgboostjobs.yaml +++ b/manifests/base/crds/kubeflow.org_xgboostjobs.yaml @@ -67,6 +67,16 @@ spec: CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. type: string + managedBy: + description: |- + ManagedBy is used to indicate the controller or entity that manages a job. + The value must be either an empty, 'kubeflow.org/training-operator' or + 'kueue.x-k8s.io/multikueue'. + The built-in job controller reconciles a job which don't have this + field at all or the field value is the reserved string + 'kubeflow.org/training-operator', but delegates reconciling the job + with a 'kueue.x-k8s. + type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling From 38432509dee370ef853f7b1064457822aac77886 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Thu, 8 Aug 2024 13:29:43 +0200 Subject: [PATCH 03/26] Update Kubeflow Jobs Reconcile to use ManagedBy field to decide if skip the process Signed-off-by: Michal Szadkowski --- pkg/controller.v1/common/job.go | 7 +++++++ pkg/controller.v1/mpi/mpijob_controller.go | 5 +++++ pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go | 5 +++++ pkg/controller.v1/pytorch/pytorchjob_controller.go | 5 +++++ pkg/controller.v1/tensorflow/tfjob_controller.go | 5 +++++ pkg/controller.v1/xgboost/xgboostjob_controller.go | 5 +++++ 6 files changed, 32 insertions(+) diff --git a/pkg/controller.v1/common/job.go b/pkg/controller.v1/common/job.go index 825afdf8b1..ad663d4503 100644 --- a/pkg/controller.v1/common/job.go +++ b/pkg/controller.v1/common/job.go @@ -455,3 +455,10 @@ func (jc *JobController) CleanupJob(runPolicy *apiv1.RunPolicy, jobStatus apiv1. func (jc *JobController) calcPGMinResources(minMember int32, replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec) *corev1.ResourceList { return CalcPGMinResources(minMember, replicas, jc.PriorityClassLister.Get) } + +func (jc *JobController) ManagedByExternalController(rp apiv1.RunPolicy) *string { + if controllerName := rp.ManagedBy; controllerName != nil && *controllerName != jc.Controller.ControllerName() { + return controllerName + } + return nil +} diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index 0067692a66..a3c1859108 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -135,6 +135,11 @@ func (jc *MPIJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, client.IgnoreNotFound(err) } + if manager := jc.ManagedByExternalController(mpijob.Spec.RunPolicy); manager != nil { + logger.Info("Skipping MPIJob managed by a different controller", "managed-by", manager) + return ctrl.Result{}, nil + } + if err = kubeflowv1.ValidateV1MpiJobSpec(&mpijob.Spec); err != nil { logger.Error(err, "MPIJob failed validation") jc.Recorder.Eventf(mpijob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedValidationReason), diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go index 7cae58e9c5..a9884d4442 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go @@ -131,6 +131,11 @@ func (r *PaddleJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, client.IgnoreNotFound(err) } + if manager := r.ManagedByExternalController(paddlejob.Spec.RunPolicy); manager != nil { + logger.Info("Skipping PaddleJob managed by a different controller", "managed-by", manager) + return ctrl.Result{}, nil + } + // Check if reconciliation is needed jobKey, err := common.KeyFunc(paddlejob) if err != nil { diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go index 7025c24396..f8f2f4d0b5 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go @@ -132,6 +132,11 @@ func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, client.IgnoreNotFound(err) } + if manager := r.ManagedByExternalController(pytorchjob.Spec.RunPolicy); manager != nil { + logger.Info("Skipping PyTorchJob managed by a different controller", "managed-by", manager) + return ctrl.Result{}, nil + } + // Check if reconciliation is needed jobKey, err := common.KeyFunc(pytorchjob) if err != nil { diff --git a/pkg/controller.v1/tensorflow/tfjob_controller.go b/pkg/controller.v1/tensorflow/tfjob_controller.go index a60d65affe..3095d7a57a 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller.go @@ -127,6 +127,11 @@ func (r *TFJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl return ctrl.Result{}, client.IgnoreNotFound(err) } + if manager := r.ManagedByExternalController(tfjob.Spec.RunPolicy); manager != nil { + logger.Info("Skipping TFJob managed by a different controller", "managed-by", manager) + return ctrl.Result{}, nil + } + // Check if reconciliation is needed jobKey, err := common.KeyFunc(tfjob) if err != nil { diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller.go b/pkg/controller.v1/xgboost/xgboostjob_controller.go index fb860e462f..3b051ddc7d 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller.go @@ -130,6 +130,11 @@ func (r *XGBoostJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, client.IgnoreNotFound(err) } + if manager := r.ManagedByExternalController(xgboostjob.Spec.RunPolicy); manager != nil { + logger.Info("Skipping XGBoostJob managed by a different controller", "managed-by", manager) + return ctrl.Result{}, nil + } + // Check reconcile is required. jobKey, err := common.KeyFunc(xgboostjob) if err != nil { From d8cc545a774dd456e99917d537b2cf4cb580f9d2 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 9 Aug 2024 11:46:00 +0200 Subject: [PATCH 04/26] job controller test Signed-off-by: Michal Szadkowski --- pkg/controller.v1/common/job.go | 2 +- pkg/controller.v1/common/job_controller.go | 2 + pkg/controller.v1/common/job_test.go | 34 ++++++++++ .../mpi/mpijob_controller_test.go | 64 ++++++++++++++++++ .../paddlepaddle_controller_test.go | 64 ++++++++++++++++++ .../pytorch/pytorchjob_controller_test.go | 66 +++++++++++++++++++ .../tensorflow/tfjob_controller_test.go | 64 ++++++++++++++++++ .../xgboost/xgboostjob_controller_test.go | 64 ++++++++++++++++++ 8 files changed, 359 insertions(+), 1 deletion(-) diff --git a/pkg/controller.v1/common/job.go b/pkg/controller.v1/common/job.go index ad663d4503..d9b5715e47 100644 --- a/pkg/controller.v1/common/job.go +++ b/pkg/controller.v1/common/job.go @@ -457,7 +457,7 @@ func (jc *JobController) calcPGMinResources(minMember int32, replicas map[apiv1. } func (jc *JobController) ManagedByExternalController(rp apiv1.RunPolicy) *string { - if controllerName := rp.ManagedBy; controllerName != nil && *controllerName != jc.Controller.ControllerName() { + if controllerName := rp.ManagedBy; controllerName != nil && *controllerName != KubeflowJobsController { return controllerName } return nil diff --git a/pkg/controller.v1/common/job_controller.go b/pkg/controller.v1/common/job_controller.go index b141c22c55..23fc02ed40 100644 --- a/pkg/controller.v1/common/job_controller.go +++ b/pkg/controller.v1/common/job_controller.go @@ -77,6 +77,8 @@ func (c *JobControllerConfiguration) EnableGangScheduling() bool { return c.GangScheduling != "" && c.GangScheduling != GangSchedulerNone } +const KubeflowJobsController = "kubeflow.org/training-operator" + // JobController abstracts other operators to manage the lifecycle of Jobs. // User need to first implement the ControllerInterface(objectA) and then initialize a JobController(objectB) struct with objectA // as the parameter. diff --git a/pkg/controller.v1/common/job_test.go b/pkg/controller.v1/common/job_test.go index b204cc2e89..5f33b24d86 100644 --- a/pkg/controller.v1/common/job_test.go +++ b/pkg/controller.v1/common/job_test.go @@ -219,6 +219,40 @@ func TestPastActiveDeadline(T *testing.T) { } } +func TestManagedBy(T *testing.T) { + cases := map[string]struct { + managedBy string + }{ + "managedBy is empty": { + managedBy: "", + }, + "managedBy is training-operator controller": { + managedBy: KubeflowJobsController, + }, + "managedBy is not the training-operator controller": { + managedBy: "kueue.x-k8s.io/multikueue", + }, + "managedBy is other value": { + managedBy: "other-job-controller", + }, + } + for name, tc := range cases { + T.Run(name, func(t *testing.T) { + jobController := JobController{} + runPolicy := &apiv1.RunPolicy{ + ManagedBy: &tc.managedBy, + } + if tc.managedBy == "" { + runPolicy.ManagedBy = nil + } + + if got := jobController.ManagedByExternalController(*runPolicy); got != nil && tc.managedBy != *got { + t.Errorf("Unexpected manager controller: \nwant: %v\ngot: %v\n", tc.managedBy, *got) + } + }) + } +} + func newPod(name string, phase corev1.PodPhase) *corev1.Pod { pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ diff --git a/pkg/controller.v1/mpi/mpijob_controller_test.go b/pkg/controller.v1/mpi/mpijob_controller_test.go index 2c63e484f2..7e653f62f2 100644 --- a/pkg/controller.v1/mpi/mpijob_controller_test.go +++ b/pkg/controller.v1/mpi/mpijob_controller_test.go @@ -20,6 +20,7 @@ import ( "strings" common "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -1066,6 +1067,69 @@ var _ = Describe("MPIJob controller", func() { By("Checking if the startTime is updated") Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended)) }) + + It("Should not reconcile a job while managed by external controller", func() { + By("Creating a MPIJob managed by external controller") + otherController := "kueue.x-k8s.io/multikueue" + job.Spec.RunPolicy = v1.RunPolicy{ + ManagedBy: &otherController, + } + job.Spec.RunPolicy.Suspend = ptr.To(true) + Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) + + created := &kubeflowv1.MPIJob{} + By("Checking created MPIJob") + Eventually(func() bool { + err := testK8sClient.Get(ctx, jobKey, created) + return err == nil + }, testutil.Timeout, testutil.Interval).Should(BeTrue()) + + By("Checking created MPIJob has a nil startTime") + Consistently(func() *metav1.Time { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.StartTime + }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) + + launcherPod := &corev1.Pod{} + workerPod := &corev1.Pod{} + launcherSvc := &corev1.Service{} + workerSvc := &corev1.Service{} + + By("Checking if the pods and services aren't created") + Consistently(func() bool { + errMasterPod := testK8sClient.Get(ctx, launcherKey, launcherPod) + errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) + errMasterSvc := testK8sClient.Get(ctx, launcherKey, launcherSvc) + errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) + return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && + errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) + + By("Checking if the MPIJob status was not updated") + Eventually(func() []kubeflowv1.JobCondition { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.Conditions + }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + + By("Unsuspending the MPIJob") + Eventually(func() error { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + created.Spec.RunPolicy.Suspend = ptr.To(false) + return testK8sClient.Update(ctx, created) + }, testutil.Timeout, testutil.Interval).Should(Succeed()) + + By("Checking created MPIJob still has a nil startTime") + Eventually(func() *metav1.Time { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.StartTime + }, testutil.Timeout, testutil.Interval).Should(BeNil()) + + By("Checking if the MPIJob status was not updated, even after unsuspending") + Eventually(func() []kubeflowv1.JobCondition { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.Conditions + }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }) }) }) diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go index 493acf1ad5..94c243cde0 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go @@ -28,6 +28,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -430,6 +431,69 @@ var _ = Describe("PaddleJob controller", func() { By("Checking if the startTime is updated") Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended)) }) + + It("Should not reconcile a job while managed by external controller", func() { + By("Creating a PaddleJob managed by external controller") + otherController := "kueue.x-k8s.io/multikueue" + job.Spec.RunPolicy = v1.RunPolicy{ + ManagedBy: &otherController, + } + job.Spec.RunPolicy.Suspend = ptr.To(true) + Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) + + created := &kubeflowv1.PaddleJob{} + By("Checking created PaddleJob") + Eventually(func() bool { + err := testK8sClient.Get(ctx, jobKey, created) + return err == nil + }, testutil.Timeout, testutil.Interval).Should(BeTrue()) + + By("Checking created PaddleJob has a nil startTime") + Consistently(func() *metav1.Time { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.StartTime + }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) + + masterPod := &corev1.Pod{} + workerPod := &corev1.Pod{} + masterSvc := &corev1.Service{} + workerSvc := &corev1.Service{} + + By("Checking if the pods and services aren't created") + Consistently(func() bool { + errMasterPod := testK8sClient.Get(ctx, masterKey, masterPod) + errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) + errMasterSvc := testK8sClient.Get(ctx, masterKey, masterSvc) + errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) + return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && + errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) + + By("Checking if the PaddleJob status was not updated") + Eventually(func() []kubeflowv1.JobCondition { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.Conditions + }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + + By("Unsuspending the PaddleJob") + Eventually(func() error { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + created.Spec.RunPolicy.Suspend = ptr.To(false) + return testK8sClient.Update(ctx, created) + }, testutil.Timeout, testutil.Interval).Should(Succeed()) + + By("Checking created PaddleJob still has a nil startTime") + Eventually(func() *metav1.Time { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.StartTime + }, testutil.Timeout, testutil.Interval).Should(BeNil()) + + By("Checking if the PaddleJob status was not updated, even after unsuspending") + Eventually(func() []kubeflowv1.JobCondition { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.Conditions + }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }) }) }) diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller_test.go b/pkg/controller.v1/pytorch/pytorchjob_controller_test.go index 0e97661f6b..3fffa7d1cf 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller_test.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller_test.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -466,6 +467,71 @@ var _ = Describe("PyTorchJob controller", func() { By("Checking if the startTime is updated") Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended)) }) + + It("Should not reconcile a job while managed by external controller", func() { + By("Creating a PyTorchJob managed by external controller") + otherController := "kueue.x-k8s.io/multikueue" + job.Spec.RunPolicy = v1.RunPolicy{ + ManagedBy: &otherController, + } + job.Spec.RunPolicy.Suspend = ptr.To(true) + job.Spec.PyTorchReplicaSpecs[kubeflowv1.PyTorchJobReplicaTypeWorker].Replicas = ptr.To[int32](1) + Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) + + created := &kubeflowv1.PyTorchJob{} + By("Checking created PyTorchJob") + Eventually(func() bool { + err := testK8sClient.Get(ctx, jobKey, created) + return err == nil + }, testutil.Timeout, testutil.Interval).Should(BeTrue()) + + By("Checking created PyTorchJob has a nil startTime") + Consistently(func() *metav1.Time { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.StartTime + }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) + + masterPod := &corev1.Pod{} + workerPod := &corev1.Pod{} + masterSvc := &corev1.Service{} + workerSvc := &corev1.Service{} + + By("Checking if the pods and services aren't created") + Consistently(func() bool { + errMasterPod := testK8sClient.Get(ctx, masterKey, masterPod) + errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) + errMasterSvc := testK8sClient.Get(ctx, masterKey, masterSvc) + errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) + return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && + errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) + + By("Checking if the PyTorchJob status was not updated") + Eventually(func() []kubeflowv1.JobCondition { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.Conditions + }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + + By("Unsuspending the PyTorchJob") + Eventually(func() error { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + created.Spec.RunPolicy.Suspend = ptr.To(false) + return testK8sClient.Update(ctx, created) + }, testutil.Timeout, testutil.Interval).Should(Succeed()) + + By("Checking created PyTorchJob still has a nil startTime") + Eventually(func() *metav1.Time { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.StartTime + }, testutil.Timeout, testutil.Interval).Should(BeNil()) + + By("Checking if the PyTorchJob status was not updated, even after unsuspending") + Eventually(func() []kubeflowv1.JobCondition { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.Conditions + }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + + }) }) Context("When creating the elastic PyTorchJob", func() { diff --git a/pkg/controller.v1/tensorflow/tfjob_controller_test.go b/pkg/controller.v1/tensorflow/tfjob_controller_test.go index 1d08aba0c9..86f2f8e4b6 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller_test.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller_test.go @@ -29,6 +29,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" tftestutil "github.com/kubeflow/training-operator/pkg/controller.v1/tensorflow/testutil" commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/testutil" @@ -606,5 +607,68 @@ var _ = Describe("TFJob controller", func() { By("Checking if the startTime is updated") Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended)) }) + + It("Should not reconcile a job while managed by external controller", func() { + By("Creating a TFJob managed by external controller") + otherController := "kueue.x-k8s.io/multikueue" + job.Spec.RunPolicy = v1.RunPolicy{ + ManagedBy: &otherController, + } + job.Spec.RunPolicy.Suspend = ptr.To(true) + Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) + + created := &kubeflowv1.TFJob{} + By("Checking created TFJob") + Eventually(func() bool { + err := testK8sClient.Get(ctx, jobKey, created) + return err == nil + }, testutil.Timeout, testutil.Interval).Should(BeTrue()) + + By("Checking created TFJob has a nil startTime") + Consistently(func() *metav1.Time { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.StartTime + }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) + + chiefPod := &corev1.Pod{} + workerPod := &corev1.Pod{} + chiefSvc := &corev1.Service{} + workerSvc := &corev1.Service{} + + By("Checking if the pods and services aren't created") + Consistently(func() bool { + errMasterPod := testK8sClient.Get(ctx, chiefKey, chiefPod) + errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) + errMasterSvc := testK8sClient.Get(ctx, chiefKey, chiefSvc) + errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) + return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && + errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) + + By("Checking if the TFJob status was not updated") + Eventually(func() []kubeflowv1.JobCondition { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.Conditions + }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + + By("Unsuspending the TFJob") + Eventually(func() error { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + created.Spec.RunPolicy.Suspend = ptr.To(false) + return testK8sClient.Update(ctx, created) + }, testutil.Timeout, testutil.Interval).Should(Succeed()) + + By("Checking created TFJob still has a nil startTime") + Eventually(func() *metav1.Time { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.StartTime + }, testutil.Timeout, testutil.Interval).Should(BeNil()) + + By("Checking if the TFJob status was not updated, even after unsuspending") + Eventually(func() []kubeflowv1.JobCondition { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.Conditions + }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }) }) }) diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller_test.go b/pkg/controller.v1/xgboost/xgboostjob_controller_test.go index 7bf1ef8c84..64cce5689a 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller_test.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller_test.go @@ -28,6 +28,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -359,6 +360,69 @@ var _ = Describe("XGBoost controller", func() { By("Checking if the startTime is updated") Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended)) }) + + It("Should not reconcile a job while managed by external controller", func() { + By("Creating a XGBoostJob managed by external controller") + otherController := "kueue.x-k8s.io/multikueue" + job.Spec.RunPolicy = v1.RunPolicy{ + ManagedBy: &otherController, + } + job.Spec.RunPolicy.Suspend = ptr.To(true) + Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) + + created := &kubeflowv1.XGBoostJob{} + By("Checking created XGBoostJob") + Eventually(func() bool { + err := testK8sClient.Get(ctx, jobKey, created) + return err == nil + }, testutil.Timeout, testutil.Interval).Should(BeTrue()) + + By("Checking created XGBoostJob has a nil startTime") + Consistently(func() *metav1.Time { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.StartTime + }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) + + masterPod := &corev1.Pod{} + workerPod := &corev1.Pod{} + masterSvc := &corev1.Service{} + workerSvc := &corev1.Service{} + + By("Checking if the pods and services aren't created") + Consistently(func() bool { + errMasterPod := testK8sClient.Get(ctx, masterKey, masterPod) + errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) + errMasterSvc := testK8sClient.Get(ctx, masterKey, masterSvc) + errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) + return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && + errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) + + By("Checking if the XGBoostJob status was not updated") + Eventually(func() []kubeflowv1.JobCondition { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.Conditions + }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + + By("Unsuspending the XGBoostJob") + Eventually(func() error { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + created.Spec.RunPolicy.Suspend = ptr.To(false) + return testK8sClient.Update(ctx, created) + }, testutil.Timeout, testutil.Interval).Should(Succeed()) + + By("Checking created XGBoostJob still has a nil startTime") + Eventually(func() *metav1.Time { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.StartTime + }, testutil.Timeout, testutil.Interval).Should(BeNil()) + + By("Checking if the XGBoostJob status was not updated, even after unsuspending") + Eventually(func() []kubeflowv1.JobCondition { + Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) + return created.Status.Conditions + }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }) }) }) From 24402530670af4c49417a0a4321f564fc74f92fc Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 9 Aug 2024 11:57:06 +0200 Subject: [PATCH 05/26] spec validation webhook Signed-off-by: Michal Szadkowski --- pkg/common/util/webhooks.go | 20 ++++++++++ pkg/util/testutil/constants.go | 2 + .../paddlepaddle/paddlepaddle_webhook.go | 2 + .../paddlepaddle/paddlepaddle_webhook_test.go | 37 +++++++++++++++++++ pkg/webhooks/pytorch/pytorchjob_webhook.go | 3 +- .../pytorch/pytorchjob_webhook_test.go | 37 +++++++++++++++++++ pkg/webhooks/tensorflow/tfjob_webhook.go | 2 + pkg/webhooks/tensorflow/tfjob_webhook_test.go | 37 +++++++++++++++++++ pkg/webhooks/xgboost/xgboostjob_webhook.go | 3 +- .../xgboost/xgboostjob_webhook_test.go | 37 +++++++++++++++++++ 10 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 pkg/common/util/webhooks.go diff --git a/pkg/common/util/webhooks.go b/pkg/common/util/webhooks.go new file mode 100644 index 0000000000..e342201ff8 --- /dev/null +++ b/pkg/common/util/webhooks.go @@ -0,0 +1,20 @@ +package util + +import ( + v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + + "k8s.io/apimachinery/pkg/util/validation" + "k8s.io/apimachinery/pkg/util/validation/field" +) + +func ValidateManagedBy(runPolicy *v1.RunPolicy, allErrs field.ErrorList) field.ErrorList { + if runPolicy.ManagedBy != nil { + manager := *runPolicy.ManagedBy + fieldPath := field.NewPath("spec", "managedBy") + allErrs = append(allErrs, validation.IsDomainPrefixedPath(fieldPath, manager)...) + if len(manager) > v1.MaxManagedByLength { + allErrs = append(allErrs, field.TooLongMaxLength(fieldPath, manager, v1.MaxManagedByLength)) + } + } + return allErrs +} diff --git a/pkg/util/testutil/constants.go b/pkg/util/testutil/constants.go index 74e0a11796..26d58ed837 100644 --- a/pkg/util/testutil/constants.go +++ b/pkg/util/testutil/constants.go @@ -16,4 +16,6 @@ const ( var ( IgnoreJobConditionsTimes = cmpopts.IgnoreFields(kubeflowv1.JobCondition{}, "LastUpdateTime", "LastTransitionTime") + MalformedManagedBy = "other-job-controller" + TooLongManagedBy = "kubeflow.org/very-very-very-very-very-very-very-very-long-job-controller" ) diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go index dab25d419a..264abe6675 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go @@ -31,6 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/common/util" ) var ( @@ -74,6 +75,7 @@ func validatePaddleJob(job *trainingoperator.PaddleJob) field.ErrorList { if errors := apimachineryvalidation.NameIsDNS1035Label(job.Name, false); len(errors) != 0 { allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } + allErrs = util.ValidateManagedBy(&job.Spec.RunPolicy, allErrs) allErrs = append(allErrs, validateSpec(job.Spec.PaddleReplicaSpecs)...) return allErrs } diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go index 087aa5d87e..dcc17aa6cb 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go @@ -27,6 +27,8 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/controller.v1/common" + "github.com/kubeflow/training-operator/pkg/util/testutil" ) func TestValidateV1PaddleJob(t *testing.T) { @@ -65,6 +67,9 @@ func TestValidateV1PaddleJob(t *testing.T) { Name: "test", }, Spec: trainingoperator.PaddleJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(common.KubeflowJobsController), + }, PaddleReplicaSpecs: validPaddleReplicaSpecs, }, }, @@ -169,6 +174,38 @@ func TestValidateV1PaddleJob(t *testing.T) { field.Required(paddleReplicaSpecPath, ""), }, }, + "managedBy controller name is malformed": { + paddleJob: &trainingoperator.PaddleJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: trainingoperator.PaddleJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(testutil.MalformedManagedBy), + }, + PaddleReplicaSpecs: validPaddleReplicaSpecs, + }, + }, + wantErr: field.ErrorList{ + field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), + }, + }, + "managedBy controller name is too long": { + paddleJob: &trainingoperator.PaddleJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: trainingoperator.PaddleJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(testutil.TooLongManagedBy), + }, + PaddleReplicaSpecs: validPaddleReplicaSpecs, + }, + }, + wantErr: field.ErrorList{ + field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), + }, + }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook.go b/pkg/webhooks/pytorch/pytorchjob_webhook.go index 14d7b5c0eb..fdab1bb50e 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook.go @@ -31,6 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/common/util" ) var ( @@ -74,10 +75,10 @@ func (w *Webhook) ValidateDelete(context.Context, runtime.Object) (admission.War func validatePyTorchJob(job *trainingoperator.PyTorchJob) (admission.Warnings, field.ErrorList) { var allErrs field.ErrorList var warnings admission.Warnings - if errors := apimachineryvalidation.NameIsDNS1035Label(job.ObjectMeta.Name, false); len(errors) != 0 { allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } + allErrs = util.ValidateManagedBy(&job.Spec.RunPolicy, allErrs) ws, err := validateSpec(job.Spec) warnings = append(warnings, ws...) allErrs = append(allErrs, err...) diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go index 362a6d91e9..4c55c02dac 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go @@ -29,6 +29,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/controller.v1/common" + "github.com/kubeflow/training-operator/pkg/util/testutil" ) func TestValidateV1PyTorchJob(t *testing.T) { @@ -82,6 +84,9 @@ func TestValidateV1PyTorchJob(t *testing.T) { Name: "test", }, Spec: trainingoperator.PyTorchJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(common.KubeflowJobsController), + }, PyTorchReplicaSpecs: validPyTorchReplicaSpecs, }, }, @@ -284,6 +289,38 @@ func TestValidateV1PyTorchJob(t *testing.T) { specPath.Child("elasticPolicy").Child("nProcPerNode"), specPath.Child("nprocPerNode")), }, }, + "managedBy controller name is malformed": { + pytorchJob: &trainingoperator.PyTorchJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: trainingoperator.PyTorchJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(testutil.MalformedManagedBy), + }, + PyTorchReplicaSpecs: validPyTorchReplicaSpecs, + }, + }, + wantErr: field.ErrorList{ + field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), + }, + }, + "managedBy controller name is too long": { + pytorchJob: &trainingoperator.PyTorchJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: trainingoperator.PyTorchJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(testutil.TooLongManagedBy), + }, + PyTorchReplicaSpecs: validPyTorchReplicaSpecs, + }, + }, + wantErr: field.ErrorList{ + field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), + }, + }, } for name, tc := range testCases { diff --git a/pkg/webhooks/tensorflow/tfjob_webhook.go b/pkg/webhooks/tensorflow/tfjob_webhook.go index 9e9629a1b8..b11ade8e5c 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/common/util" ) var ( @@ -73,6 +74,7 @@ func validateTFJob(job *trainingoperator.TFJob) field.ErrorList { if errors := apimachineryvalidation.NameIsDNS1035Label(job.Name, false); len(errors) != 0 { allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } + allErrs = util.ValidateManagedBy(&job.Spec.RunPolicy, allErrs) allErrs = append(allErrs, validateSpec(job.Spec)...) return allErrs } diff --git a/pkg/webhooks/tensorflow/tfjob_webhook_test.go b/pkg/webhooks/tensorflow/tfjob_webhook_test.go index 236d613295..caed081fe5 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook_test.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook_test.go @@ -27,6 +27,8 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/controller.v1/common" + "github.com/kubeflow/training-operator/pkg/util/testutil" ) func TestValidateTFJob(t *testing.T) { @@ -59,6 +61,9 @@ func TestValidateTFJob(t *testing.T) { Name: "test", }, Spec: trainingoperator.TFJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(common.KubeflowJobsController), + }, TFReplicaSpecs: validTFReplicaSpecs, }, }, @@ -180,6 +185,38 @@ func TestValidateTFJob(t *testing.T) { field.Forbidden(tfReplicaSpecPath, ""), }, }, + "managedBy controller name is malformed": { + tfJob: &trainingoperator.TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: trainingoperator.TFJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(testutil.MalformedManagedBy), + }, + TFReplicaSpecs: validTFReplicaSpecs, + }, + }, + wantErr: field.ErrorList{ + field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), + }, + }, + "managedBy controller name is too long": { + tfJob: &trainingoperator.TFJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: trainingoperator.TFJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(testutil.TooLongManagedBy), + }, + TFReplicaSpecs: validTFReplicaSpecs, + }, + }, + wantErr: field.ErrorList{ + field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), + }, + }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook.go b/pkg/webhooks/xgboost/xgboostjob_webhook.go index 826cbe2c53..460c129c71 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook.go @@ -31,6 +31,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/common/util" ) var ( @@ -71,10 +72,10 @@ func (w *Webhook) ValidateDelete(context.Context, runtime.Object) (admission.War func validateXGBoostJob(job *trainingoperator.XGBoostJob) field.ErrorList { var allErrs field.ErrorList - if errors := apimachineryvalidation.NameIsDNS1035Label(job.Name, false); len(errors) != 0 { allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } + allErrs = util.ValidateManagedBy(&job.Spec.RunPolicy, allErrs) allErrs = append(allErrs, validateSpec(job.Spec)...) return allErrs } diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go index 9bd95893c6..b0b806aa13 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go @@ -27,6 +27,8 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/controller.v1/common" + "github.com/kubeflow/training-operator/pkg/util/testutil" ) func TestValidateXGBoostJob(t *testing.T) { @@ -91,6 +93,9 @@ func TestValidateXGBoostJob(t *testing.T) { Name: "test", }, Spec: trainingoperator.XGBoostJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(common.KubeflowJobsController), + }, XGBReplicaSpecs: validXGBoostReplicaSpecs, }, }, @@ -231,6 +236,38 @@ func TestValidateXGBoostJob(t *testing.T) { field.Required(xgbReplicaSpecPath.Key(string(trainingoperator.XGBoostJobReplicaTypeMaster)), ""), }, }, + "managedBy controller name is malformed": { + xgboostJob: &trainingoperator.XGBoostJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: trainingoperator.XGBoostJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(testutil.MalformedManagedBy), + }, + XGBReplicaSpecs: validXGBoostReplicaSpecs, + }, + }, + wantErr: field.ErrorList{ + field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), + }, + }, + "managedBy controller name is too long": { + xgboostJob: &trainingoperator.XGBoostJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: trainingoperator.XGBoostJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(testutil.TooLongManagedBy), + }, + XGBReplicaSpecs: validXGBoostReplicaSpecs, + }, + }, + wantErr: field.ErrorList{ + field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), + }, + }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { From cd342b3addbca79886ec8025d92b999c6ca593a3 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 9 Aug 2024 11:57:49 +0200 Subject: [PATCH 06/26] add manageBy maxLenght const Signed-off-by: Michal Szadkowski --- pkg/apis/kubeflow.org/v1/common_types.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/apis/kubeflow.org/v1/common_types.go b/pkg/apis/kubeflow.org/v1/common_types.go index 3d2f70fb60..76ff590021 100644 --- a/pkg/apis/kubeflow.org/v1/common_types.go +++ b/pkg/apis/kubeflow.org/v1/common_types.go @@ -182,6 +182,9 @@ const ( RestartPolicyExitCode RestartPolicy = "ExitCode" ) +// maximum length of the value of the ManagedBy field +const MaxManagedByLength = 63 + // RunPolicy encapsulates various runtime policies of the distributed training // job, for example how to clean up resources and how long the job can stay // active. From f245891fbe49616dbd2a834abb4e371c3f7a010e Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 9 Aug 2024 11:58:22 +0200 Subject: [PATCH 07/26] generate new manifest Signed-off-by: Michal Szadkowski --- docs/api/kubeflow.org_v1_generated.asciidoc | 14 +++++++++ hack/python-sdk/swagger.json | 4 +++ pkg/apis/kubeflow.org/v1/openapi_generated.go | 7 +++++ .../kubeflow.org/v1/zz_generated.deepcopy.go | 5 ++++ sdk/python/docs/KubeflowOrgV1RunPolicy.md | 1 + .../models/kubeflow_org_v1_run_policy.py | 30 ++++++++++++++++++- .../test/test_kubeflow_org_v1_jax_job.py | 1 + .../test/test_kubeflow_org_v1_jax_job_list.py | 2 ++ .../test/test_kubeflow_org_v1_jax_job_spec.py | 2 ++ .../test/test_kubeflow_org_v1_mpi_job.py | 1 + .../test/test_kubeflow_org_v1_mpi_job_list.py | 2 ++ .../test/test_kubeflow_org_v1_mpi_job_spec.py | 1 + .../test/test_kubeflow_org_v1_paddle_job.py | 1 + .../test_kubeflow_org_v1_paddle_job_list.py | 2 ++ .../test_kubeflow_org_v1_paddle_job_spec.py | 2 ++ .../test/test_kubeflow_org_v1_py_torch_job.py | 1 + .../test_kubeflow_org_v1_py_torch_job_list.py | 2 ++ .../test_kubeflow_org_v1_py_torch_job_spec.py | 2 ++ .../test/test_kubeflow_org_v1_run_policy.py | 1 + .../test/test_kubeflow_org_v1_tf_job.py | 1 + .../test/test_kubeflow_org_v1_tf_job_list.py | 2 ++ .../test/test_kubeflow_org_v1_tf_job_spec.py | 2 ++ .../test/test_kubeflow_org_v1_xg_boost_job.py | 1 + .../test_kubeflow_org_v1_xg_boost_job_list.py | 2 ++ .../test_kubeflow_org_v1_xg_boost_job_spec.py | 2 ++ 25 files changed, 90 insertions(+), 1 deletion(-) diff --git a/docs/api/kubeflow.org_v1_generated.asciidoc b/docs/api/kubeflow.org_v1_generated.asciidoc index 3648d52e4d..6b3599a4f2 100644 --- a/docs/api/kubeflow.org_v1_generated.asciidoc +++ b/docs/api/kubeflow.org_v1_generated.asciidoc @@ -677,6 +677,20 @@ Suspending a Job will reset the StartTime field of the Job. Defaults to false. +| *`managedBy`* __string__ | ManagedBy is used to indicate the controller or entity that manages a job. +The value must be either an empty, 'kubeflow.org/training-operator' or +'kueue.x-k8s.io/multikueue'. +The built-in job controller reconciles a job which don't have this +field at all or the field value is the reserved string +'kubeflow.org/training-operator', but delegates reconciling the job +with a 'kueue.x-k8s.io/multikueue' to the Kueue. + + +The value must be a valid domain-prefixed path (e.g. acme.io/foo) - +all characters before the first "/" must be a valid subdomain as defined +by RFC 1123. All characters trailing the first "/" must be valid HTTP Path +characters as defined by RFC 3986. The value cannot exceed 63 characters. +The field is immutable. |=== diff --git a/hack/python-sdk/swagger.json b/hack/python-sdk/swagger.json index a158dd8748..0f9f92856a 100644 --- a/hack/python-sdk/swagger.json +++ b/hack/python-sdk/swagger.json @@ -575,6 +575,10 @@ "description": "CleanPodPolicy defines the policy to kill pods after the job completes. Default to None.", "type": "string" }, + "managedBy": { + "description": "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue.\n\nThe value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable.", + "type": "string" + }, "schedulingPolicy": { "description": "SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling", "$ref": "#/definitions/kubeflow.org.v1.SchedulingPolicy" diff --git a/pkg/apis/kubeflow.org/v1/openapi_generated.go b/pkg/apis/kubeflow.org/v1/openapi_generated.go index d6f32b1142..8fe8710b20 100644 --- a/pkg/apis/kubeflow.org/v1/openapi_generated.go +++ b/pkg/apis/kubeflow.org/v1/openapi_generated.go @@ -1063,6 +1063,13 @@ func schema_pkg_apis_kubefloworg_v1_RunPolicy(ref common.ReferenceCallback) comm Format: "", }, }, + "managedBy": { + SchemaProps: spec.SchemaProps{ + Description: "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue.\n\nThe value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable.", + Type: []string{"string"}, + Format: "", + }, + }, }, }, }, diff --git a/pkg/apis/kubeflow.org/v1/zz_generated.deepcopy.go b/pkg/apis/kubeflow.org/v1/zz_generated.deepcopy.go index 79ca131cd5..52ce7b75e6 100644 --- a/pkg/apis/kubeflow.org/v1/zz_generated.deepcopy.go +++ b/pkg/apis/kubeflow.org/v1/zz_generated.deepcopy.go @@ -680,6 +680,11 @@ func (in *RunPolicy) DeepCopyInto(out *RunPolicy) { *out = new(bool) **out = **in } + if in.ManagedBy != nil { + in, out := &in.ManagedBy, &out.ManagedBy + *out = new(string) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RunPolicy. diff --git a/sdk/python/docs/KubeflowOrgV1RunPolicy.md b/sdk/python/docs/KubeflowOrgV1RunPolicy.md index ce41fa09f6..bcbadfaa10 100644 --- a/sdk/python/docs/KubeflowOrgV1RunPolicy.md +++ b/sdk/python/docs/KubeflowOrgV1RunPolicy.md @@ -7,6 +7,7 @@ Name | Type | Description | Notes **active_deadline_seconds** | **int** | Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer. | [optional] **backoff_limit** | **int** | Optional number of retries before marking this job failed. | [optional] **clean_pod_policy** | **str** | CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. | [optional] +**managed_by** | **str** | ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable. | [optional] **scheduling_policy** | [**KubeflowOrgV1SchedulingPolicy**](KubeflowOrgV1SchedulingPolicy.md) | | [optional] **suspend** | **bool** | suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods and PodGroups associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job. Defaults to false. | [optional] **ttl_seconds_after_finished** | **int** | TTLSecondsAfterFinished is the TTL to clean up jobs. It may take extra ReconcilePeriod seconds for the cleanup, since reconcile gets called periodically. Default to infinite. | [optional] diff --git a/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py b/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py index c5fd48bc9a..7d30f3e41c 100644 --- a/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py +++ b/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py @@ -36,6 +36,7 @@ class KubeflowOrgV1RunPolicy(object): 'active_deadline_seconds': 'int', 'backoff_limit': 'int', 'clean_pod_policy': 'str', + 'managed_by': 'str', 'scheduling_policy': 'KubeflowOrgV1SchedulingPolicy', 'suspend': 'bool', 'ttl_seconds_after_finished': 'int' @@ -45,12 +46,13 @@ class KubeflowOrgV1RunPolicy(object): 'active_deadline_seconds': 'activeDeadlineSeconds', 'backoff_limit': 'backoffLimit', 'clean_pod_policy': 'cleanPodPolicy', + 'managed_by': 'managedBy', 'scheduling_policy': 'schedulingPolicy', 'suspend': 'suspend', 'ttl_seconds_after_finished': 'ttlSecondsAfterFinished' } - def __init__(self, active_deadline_seconds=None, backoff_limit=None, clean_pod_policy=None, scheduling_policy=None, suspend=None, ttl_seconds_after_finished=None, local_vars_configuration=None): # noqa: E501 + def __init__(self, active_deadline_seconds=None, backoff_limit=None, clean_pod_policy=None, managed_by=None, scheduling_policy=None, suspend=None, ttl_seconds_after_finished=None, local_vars_configuration=None): # noqa: E501 """KubeflowOrgV1RunPolicy - a model defined in OpenAPI""" # noqa: E501 if local_vars_configuration is None: local_vars_configuration = Configuration() @@ -59,6 +61,7 @@ def __init__(self, active_deadline_seconds=None, backoff_limit=None, clean_pod_p self._active_deadline_seconds = None self._backoff_limit = None self._clean_pod_policy = None + self._managed_by = None self._scheduling_policy = None self._suspend = None self._ttl_seconds_after_finished = None @@ -70,6 +73,8 @@ def __init__(self, active_deadline_seconds=None, backoff_limit=None, clean_pod_p self.backoff_limit = backoff_limit if clean_pod_policy is not None: self.clean_pod_policy = clean_pod_policy + if managed_by is not None: + self.managed_by = managed_by if scheduling_policy is not None: self.scheduling_policy = scheduling_policy if suspend is not None: @@ -146,6 +151,29 @@ def clean_pod_policy(self, clean_pod_policy): self._clean_pod_policy = clean_pod_policy + @property + def managed_by(self): + """Gets the managed_by of this KubeflowOrgV1RunPolicy. # noqa: E501 + + ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable. # noqa: E501 + + :return: The managed_by of this KubeflowOrgV1RunPolicy. # noqa: E501 + :rtype: str + """ + return self._managed_by + + @managed_by.setter + def managed_by(self, managed_by): + """Sets the managed_by of this KubeflowOrgV1RunPolicy. + + ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable. # noqa: E501 + + :param managed_by: The managed_by of this KubeflowOrgV1RunPolicy. # noqa: E501 + :type: str + """ + + self._managed_by = managed_by + @property def scheduling_policy(self): """Gets the scheduling_policy of this KubeflowOrgV1RunPolicy. # noqa: E501 diff --git a/sdk/python/test/test_kubeflow_org_v1_jax_job.py b/sdk/python/test/test_kubeflow_org_v1_jax_job.py index 05c9e07d4c..f252ecd54d 100644 --- a/sdk/python/test/test_kubeflow_org_v1_jax_job.py +++ b/sdk/python/test/test_kubeflow_org_v1_jax_job.py @@ -50,6 +50,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_jax_job_list.py b/sdk/python/test/test_kubeflow_org_v1_jax_job_list.py index 4f92324581..cfdf269e43 100644 --- a/sdk/python/test/test_kubeflow_org_v1_jax_job_list.py +++ b/sdk/python/test/test_kubeflow_org_v1_jax_job_list.py @@ -53,6 +53,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -106,6 +107,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_jax_job_spec.py b/sdk/python/test/test_kubeflow_org_v1_jax_job_spec.py index 7e023b261b..02da4bb580 100644 --- a/sdk/python/test/test_kubeflow_org_v1_jax_job_spec.py +++ b/sdk/python/test/test_kubeflow_org_v1_jax_job_spec.py @@ -46,6 +46,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -69,6 +70,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_mpi_job.py b/sdk/python/test/test_kubeflow_org_v1_mpi_job.py index 1f8f164cf9..2f2eccdf8e 100644 --- a/sdk/python/test/test_kubeflow_org_v1_mpi_job.py +++ b/sdk/python/test/test_kubeflow_org_v1_mpi_job.py @@ -52,6 +52,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_mpi_job_list.py b/sdk/python/test/test_kubeflow_org_v1_mpi_job_list.py index 4406dc67e1..ee3248e160 100644 --- a/sdk/python/test/test_kubeflow_org_v1_mpi_job_list.py +++ b/sdk/python/test/test_kubeflow_org_v1_mpi_job_list.py @@ -55,6 +55,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -111,6 +112,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_mpi_job_spec.py b/sdk/python/test/test_kubeflow_org_v1_mpi_job_spec.py index c042104169..4142c43a0c 100644 --- a/sdk/python/test/test_kubeflow_org_v1_mpi_job_spec.py +++ b/sdk/python/test/test_kubeflow_org_v1_mpi_job_spec.py @@ -48,6 +48,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_paddle_job.py b/sdk/python/test/test_kubeflow_org_v1_paddle_job.py index 17759aa30a..e0e213f6cb 100644 --- a/sdk/python/test/test_kubeflow_org_v1_paddle_job.py +++ b/sdk/python/test/test_kubeflow_org_v1_paddle_job.py @@ -57,6 +57,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_paddle_job_list.py b/sdk/python/test/test_kubeflow_org_v1_paddle_job_list.py index e82d533a1f..d11b9484fa 100644 --- a/sdk/python/test/test_kubeflow_org_v1_paddle_job_list.py +++ b/sdk/python/test/test_kubeflow_org_v1_paddle_job_list.py @@ -60,6 +60,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -120,6 +121,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_paddle_job_spec.py b/sdk/python/test/test_kubeflow_org_v1_paddle_job_spec.py index 71029311b3..4b7e759a42 100644 --- a/sdk/python/test/test_kubeflow_org_v1_paddle_job_spec.py +++ b/sdk/python/test/test_kubeflow_org_v1_paddle_job_spec.py @@ -53,6 +53,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -76,6 +77,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_py_torch_job.py b/sdk/python/test/test_kubeflow_org_v1_py_torch_job.py index 4a85f5ed46..716555cf75 100644 --- a/sdk/python/test/test_kubeflow_org_v1_py_torch_job.py +++ b/sdk/python/test/test_kubeflow_org_v1_py_torch_job.py @@ -69,6 +69,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_py_torch_job_list.py b/sdk/python/test/test_kubeflow_org_v1_py_torch_job_list.py index 2cbafc65de..be2e21f2c4 100644 --- a/sdk/python/test/test_kubeflow_org_v1_py_torch_job_list.py +++ b/sdk/python/test/test_kubeflow_org_v1_py_torch_job_list.py @@ -72,6 +72,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -144,6 +145,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_py_torch_job_spec.py b/sdk/python/test/test_kubeflow_org_v1_py_torch_job_spec.py index f19f3f2fd1..eb73d0c63c 100644 --- a/sdk/python/test/test_kubeflow_org_v1_py_torch_job_spec.py +++ b/sdk/python/test/test_kubeflow_org_v1_py_torch_job_spec.py @@ -65,6 +65,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -88,6 +89,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_run_policy.py b/sdk/python/test/test_kubeflow_org_v1_run_policy.py index 525be1d785..a279e0c722 100644 --- a/sdk/python/test/test_kubeflow_org_v1_run_policy.py +++ b/sdk/python/test/test_kubeflow_org_v1_run_policy.py @@ -39,6 +39,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_tf_job.py b/sdk/python/test/test_kubeflow_org_v1_tf_job.py index fa1cf4a6bc..8d8e89c94d 100644 --- a/sdk/python/test/test_kubeflow_org_v1_tf_job.py +++ b/sdk/python/test/test_kubeflow_org_v1_tf_job.py @@ -45,6 +45,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_tf_job_list.py b/sdk/python/test/test_kubeflow_org_v1_tf_job_list.py index 6f7c46d212..5dbb8fd138 100644 --- a/sdk/python/test/test_kubeflow_org_v1_tf_job_list.py +++ b/sdk/python/test/test_kubeflow_org_v1_tf_job_list.py @@ -48,6 +48,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -103,6 +104,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_tf_job_spec.py b/sdk/python/test/test_kubeflow_org_v1_tf_job_spec.py index 892fb43578..9448020c3e 100644 --- a/sdk/python/test/test_kubeflow_org_v1_tf_job_spec.py +++ b/sdk/python/test/test_kubeflow_org_v1_tf_job_spec.py @@ -41,6 +41,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -65,6 +66,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_xg_boost_job.py b/sdk/python/test/test_kubeflow_org_v1_xg_boost_job.py index 88e6de1793..324bad50d7 100644 --- a/sdk/python/test/test_kubeflow_org_v1_xg_boost_job.py +++ b/sdk/python/test/test_kubeflow_org_v1_xg_boost_job.py @@ -44,6 +44,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_xg_boost_job_list.py b/sdk/python/test/test_kubeflow_org_v1_xg_boost_job_list.py index 5cdfdd931d..de4dd000f2 100644 --- a/sdk/python/test/test_kubeflow_org_v1_xg_boost_job_list.py +++ b/sdk/python/test/test_kubeflow_org_v1_xg_boost_job_list.py @@ -47,6 +47,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -100,6 +101,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { diff --git a/sdk/python/test/test_kubeflow_org_v1_xg_boost_job_spec.py b/sdk/python/test/test_kubeflow_org_v1_xg_boost_job_spec.py index 222f29f84a..c7206b01f1 100644 --- a/sdk/python/test/test_kubeflow_org_v1_xg_boost_job_spec.py +++ b/sdk/python/test/test_kubeflow_org_v1_xg_boost_job_spec.py @@ -40,6 +40,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { @@ -63,6 +64,7 @@ def make_instance(self, include_optional): active_deadline_seconds = 56, backoff_limit = 56, clean_pod_policy = '0', + managed_by = '0', scheduling_policy = kubeflow_org_v1_scheduling_policy.KubeflowOrgV1SchedulingPolicy( min_available = 56, min_resources = { From 4f7694ac5000dfc252576ab06f225dae57e31346 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Tue, 13 Aug 2024 11:40:08 +0200 Subject: [PATCH 08/26] revert webhook formatting Signed-off-by: Michal Szadkowski --- pkg/webhooks/pytorch/pytorchjob_webhook.go | 1 + pkg/webhooks/xgboost/xgboostjob_webhook.go | 1 + 2 files changed, 2 insertions(+) diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook.go b/pkg/webhooks/pytorch/pytorchjob_webhook.go index fdab1bb50e..c5793a6003 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook.go @@ -75,6 +75,7 @@ func (w *Webhook) ValidateDelete(context.Context, runtime.Object) (admission.War func validatePyTorchJob(job *trainingoperator.PyTorchJob) (admission.Warnings, field.ErrorList) { var allErrs field.ErrorList var warnings admission.Warnings + if errors := apimachineryvalidation.NameIsDNS1035Label(job.ObjectMeta.Name, false); len(errors) != 0 { allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook.go b/pkg/webhooks/xgboost/xgboostjob_webhook.go index 460c129c71..a02d98dd6f 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook.go @@ -72,6 +72,7 @@ func (w *Webhook) ValidateDelete(context.Context, runtime.Object) (admission.War func validateXGBoostJob(job *trainingoperator.XGBoostJob) field.ErrorList { var allErrs field.ErrorList + if errors := apimachineryvalidation.NameIsDNS1035Label(job.Name, false); len(errors) != 0 { allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } From cc28b5e3bb819f2960dcbb4d37701ec3d5273ad4 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Tue, 13 Aug 2024 12:42:43 +0200 Subject: [PATCH 09/26] Move allowed controllers constants in one place Signed-off-by: Michal Szadkowski --- pkg/apis/kubeflow.org/v1/common_types.go | 6 ++++++ pkg/controller.v1/common/job_controller.go | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pkg/apis/kubeflow.org/v1/common_types.go b/pkg/apis/kubeflow.org/v1/common_types.go index 76ff590021..f71e0cc8e6 100644 --- a/pkg/apis/kubeflow.org/v1/common_types.go +++ b/pkg/apis/kubeflow.org/v1/common_types.go @@ -35,6 +35,12 @@ const ( // JobRoleLabel represents the label key for the job role, e.g. master. JobRoleLabel = "training.kubeflow.org/job-role" + + // KubeflowJobsController represents the value of the default jobs controller + KubeflowJobsController = "kubeflow.org/training-operator" + + // MultiKueueController represents the value of the only allowed external controller except for KubeflowJobsController + MultiKueueController = "kueue.x-k8s.io/multikueue" ) // JobStatus represents the current observed state of the training Job. diff --git a/pkg/controller.v1/common/job_controller.go b/pkg/controller.v1/common/job_controller.go index 23fc02ed40..b141c22c55 100644 --- a/pkg/controller.v1/common/job_controller.go +++ b/pkg/controller.v1/common/job_controller.go @@ -77,8 +77,6 @@ func (c *JobControllerConfiguration) EnableGangScheduling() bool { return c.GangScheduling != "" && c.GangScheduling != GangSchedulerNone } -const KubeflowJobsController = "kubeflow.org/training-operator" - // JobController abstracts other operators to manage the lifecycle of Jobs. // User need to first implement the ControllerInterface(objectA) and then initialize a JobController(objectB) struct with objectA // as the parameter. From d94a23c7c42e6f6635a8c7392f56a1b7a6b56c1b Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Tue, 13 Aug 2024 12:44:23 +0200 Subject: [PATCH 10/26] Make validatation for allowed managedBy values Signed-off-by: Michal Szadkowski --- pkg/common/util/webhooks.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/common/util/webhooks.go b/pkg/common/util/webhooks.go index e342201ff8..10d2f14dae 100644 --- a/pkg/common/util/webhooks.go +++ b/pkg/common/util/webhooks.go @@ -15,6 +15,9 @@ func ValidateManagedBy(runPolicy *v1.RunPolicy, allErrs field.ErrorList) field.E if len(manager) > v1.MaxManagedByLength { allErrs = append(allErrs, field.TooLongMaxLength(fieldPath, manager, v1.MaxManagedByLength)) } + if manager != v1.MultiKueueController || manager != v1.KubeflowJobsController { + allErrs = append(allErrs, field.NotSupported(fieldPath, manager, []string{v1.MultiKueueController, v1.KubeflowJobsController})) + } } return allErrs } From 6f471d6d252c52149a3daa6042e983aa297ea03c Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Tue, 13 Aug 2024 12:46:20 +0200 Subject: [PATCH 11/26] Update after controllers constants move Signed-off-by: Michal Szadkowski --- pkg/controller.v1/common/job.go | 2 +- pkg/controller.v1/common/job_test.go | 2 +- pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go | 3 +-- pkg/webhooks/pytorch/pytorchjob_webhook_test.go | 3 +-- pkg/webhooks/tensorflow/tfjob_webhook_test.go | 3 +-- pkg/webhooks/xgboost/xgboostjob_webhook_test.go | 3 +-- 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/pkg/controller.v1/common/job.go b/pkg/controller.v1/common/job.go index d9b5715e47..da67e8a6cb 100644 --- a/pkg/controller.v1/common/job.go +++ b/pkg/controller.v1/common/job.go @@ -457,7 +457,7 @@ func (jc *JobController) calcPGMinResources(minMember int32, replicas map[apiv1. } func (jc *JobController) ManagedByExternalController(rp apiv1.RunPolicy) *string { - if controllerName := rp.ManagedBy; controllerName != nil && *controllerName != KubeflowJobsController { + if controllerName := rp.ManagedBy; controllerName != nil && *controllerName != apiv1.KubeflowJobsController { return controllerName } return nil diff --git a/pkg/controller.v1/common/job_test.go b/pkg/controller.v1/common/job_test.go index 5f33b24d86..f76d9be7f1 100644 --- a/pkg/controller.v1/common/job_test.go +++ b/pkg/controller.v1/common/job_test.go @@ -227,7 +227,7 @@ func TestManagedBy(T *testing.T) { managedBy: "", }, "managedBy is training-operator controller": { - managedBy: KubeflowJobsController, + managedBy: apiv1.KubeflowJobsController, }, "managedBy is not the training-operator controller": { managedBy: "kueue.x-k8s.io/multikueue", diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go index dcc17aa6cb..ac42549185 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go @@ -27,7 +27,6 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/controller.v1/common" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -68,7 +67,7 @@ func TestValidateV1PaddleJob(t *testing.T) { }, Spec: trainingoperator.PaddleJobSpec{ RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(common.KubeflowJobsController), + ManagedBy: ptr.To(trainingoperator.KubeflowJobsController), }, PaddleReplicaSpecs: validPaddleReplicaSpecs, }, diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go index 4c55c02dac..a28b5d953c 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go @@ -29,7 +29,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/controller.v1/common" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -85,7 +84,7 @@ func TestValidateV1PyTorchJob(t *testing.T) { }, Spec: trainingoperator.PyTorchJobSpec{ RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(common.KubeflowJobsController), + ManagedBy: ptr.To(trainingoperator.KubeflowJobsController), }, PyTorchReplicaSpecs: validPyTorchReplicaSpecs, }, diff --git a/pkg/webhooks/tensorflow/tfjob_webhook_test.go b/pkg/webhooks/tensorflow/tfjob_webhook_test.go index caed081fe5..a294ea382d 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook_test.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook_test.go @@ -27,7 +27,6 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/controller.v1/common" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -62,7 +61,7 @@ func TestValidateTFJob(t *testing.T) { }, Spec: trainingoperator.TFJobSpec{ RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(common.KubeflowJobsController), + ManagedBy: ptr.To(trainingoperator.KubeflowJobsController), }, TFReplicaSpecs: validTFReplicaSpecs, }, diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go index b0b806aa13..d4b63440b0 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go @@ -27,7 +27,6 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/controller.v1/common" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -94,7 +93,7 @@ func TestValidateXGBoostJob(t *testing.T) { }, Spec: trainingoperator.XGBoostJobSpec{ RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(common.KubeflowJobsController), + ManagedBy: ptr.To(trainingoperator.KubeflowJobsController), }, XGBReplicaSpecs: validXGBoostReplicaSpecs, }, From 4c5565d56c5a86a3bb076e2b7087d8298dca0e44 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Tue, 13 Aug 2024 13:13:49 +0200 Subject: [PATCH 12/26] Update jobs controller tests Signed-off-by: Michal Szadkowski --- .../mpi/mpijob_controller_test.go | 22 +++++++++---------- .../paddlepaddle_controller_test.go | 22 +++++++++---------- .../pytorch/pytorchjob_controller_test.go | 22 +++++++++---------- .../tensorflow/tfjob_controller_test.go | 22 +++++++++---------- .../xgboost/xgboostjob_controller_test.go | 22 +++++++++---------- 5 files changed, 50 insertions(+), 60 deletions(-) diff --git a/pkg/controller.v1/mpi/mpijob_controller_test.go b/pkg/controller.v1/mpi/mpijob_controller_test.go index 7e653f62f2..5c3dd4cd36 100644 --- a/pkg/controller.v1/mpi/mpijob_controller_test.go +++ b/pkg/controller.v1/mpi/mpijob_controller_test.go @@ -20,7 +20,6 @@ import ( "strings" common "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -1071,7 +1070,7 @@ var _ = Describe("MPIJob controller", func() { It("Should not reconcile a job while managed by external controller", func() { By("Creating a MPIJob managed by external controller") otherController := "kueue.x-k8s.io/multikueue" - job.Spec.RunPolicy = v1.RunPolicy{ + job.Spec.RunPolicy = kubeflowv1.RunPolicy{ ManagedBy: &otherController, } job.Spec.RunPolicy.Suspend = ptr.To(true) @@ -1090,26 +1089,25 @@ var _ = Describe("MPIJob controller", func() { return created.Status.StartTime }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) - launcherPod := &corev1.Pod{} - workerPod := &corev1.Pod{} - launcherSvc := &corev1.Service{} - workerSvc := &corev1.Service{} - By("Checking if the pods and services aren't created") Consistently(func() bool { + launcherPod := &corev1.Pod{} + workerPod := &corev1.Pod{} + launcherSvc := &corev1.Service{} + workerSvc := &corev1.Service{} errMasterPod := testK8sClient.Get(ctx, launcherKey, launcherPod) errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) errMasterSvc := testK8sClient.Get(ctx, launcherKey, launcherSvc) errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) - }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue(), "pods and services should be created by external controller (here not existent)") By("Checking if the MPIJob status was not updated") Eventually(func() []kubeflowv1.JobCondition { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.Conditions - }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition(nil))) By("Unsuspending the MPIJob") Eventually(func() error { @@ -1119,16 +1117,16 @@ var _ = Describe("MPIJob controller", func() { }, testutil.Timeout, testutil.Interval).Should(Succeed()) By("Checking created MPIJob still has a nil startTime") - Eventually(func() *metav1.Time { + Consistently(func() *metav1.Time { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.StartTime - }, testutil.Timeout, testutil.Interval).Should(BeNil()) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) By("Checking if the MPIJob status was not updated, even after unsuspending") Eventually(func() []kubeflowv1.JobCondition { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.Conditions - }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition(nil))) }) }) }) diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go index 94c243cde0..423eb032d8 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go @@ -28,7 +28,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -435,7 +434,7 @@ var _ = Describe("PaddleJob controller", func() { It("Should not reconcile a job while managed by external controller", func() { By("Creating a PaddleJob managed by external controller") otherController := "kueue.x-k8s.io/multikueue" - job.Spec.RunPolicy = v1.RunPolicy{ + job.Spec.RunPolicy = kubeflowv1.RunPolicy{ ManagedBy: &otherController, } job.Spec.RunPolicy.Suspend = ptr.To(true) @@ -454,26 +453,25 @@ var _ = Describe("PaddleJob controller", func() { return created.Status.StartTime }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) - masterPod := &corev1.Pod{} - workerPod := &corev1.Pod{} - masterSvc := &corev1.Service{} - workerSvc := &corev1.Service{} - By("Checking if the pods and services aren't created") Consistently(func() bool { + masterPod := &corev1.Pod{} + workerPod := &corev1.Pod{} + masterSvc := &corev1.Service{} + workerSvc := &corev1.Service{} errMasterPod := testK8sClient.Get(ctx, masterKey, masterPod) errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) errMasterSvc := testK8sClient.Get(ctx, masterKey, masterSvc) errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) - }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue(), "pods and services should be created by external controller (here not existent)") By("Checking if the PaddleJob status was not updated") Eventually(func() []kubeflowv1.JobCondition { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.Conditions - }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition(nil))) By("Unsuspending the PaddleJob") Eventually(func() error { @@ -483,16 +481,16 @@ var _ = Describe("PaddleJob controller", func() { }, testutil.Timeout, testutil.Interval).Should(Succeed()) By("Checking created PaddleJob still has a nil startTime") - Eventually(func() *metav1.Time { + Consistently(func() *metav1.Time { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.StartTime - }, testutil.Timeout, testutil.Interval).Should(BeNil()) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) By("Checking if the PaddleJob status was not updated, even after unsuspending") Eventually(func() []kubeflowv1.JobCondition { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.Conditions - }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition(nil))) }) }) }) diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller_test.go b/pkg/controller.v1/pytorch/pytorchjob_controller_test.go index 3fffa7d1cf..6cee154591 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller_test.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller_test.go @@ -30,7 +30,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -471,7 +470,7 @@ var _ = Describe("PyTorchJob controller", func() { It("Should not reconcile a job while managed by external controller", func() { By("Creating a PyTorchJob managed by external controller") otherController := "kueue.x-k8s.io/multikueue" - job.Spec.RunPolicy = v1.RunPolicy{ + job.Spec.RunPolicy = kubeflowv1.RunPolicy{ ManagedBy: &otherController, } job.Spec.RunPolicy.Suspend = ptr.To(true) @@ -491,26 +490,25 @@ var _ = Describe("PyTorchJob controller", func() { return created.Status.StartTime }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) - masterPod := &corev1.Pod{} - workerPod := &corev1.Pod{} - masterSvc := &corev1.Service{} - workerSvc := &corev1.Service{} - By("Checking if the pods and services aren't created") Consistently(func() bool { + masterPod := &corev1.Pod{} + workerPod := &corev1.Pod{} + masterSvc := &corev1.Service{} + workerSvc := &corev1.Service{} errMasterPod := testK8sClient.Get(ctx, masterKey, masterPod) errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) errMasterSvc := testK8sClient.Get(ctx, masterKey, masterSvc) errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) - }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue(), "pods and services should be created by external controller (here not existent)") By("Checking if the PyTorchJob status was not updated") Eventually(func() []kubeflowv1.JobCondition { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.Conditions - }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition(nil))) By("Unsuspending the PyTorchJob") Eventually(func() error { @@ -520,16 +518,16 @@ var _ = Describe("PyTorchJob controller", func() { }, testutil.Timeout, testutil.Interval).Should(Succeed()) By("Checking created PyTorchJob still has a nil startTime") - Eventually(func() *metav1.Time { + Consistently(func() *metav1.Time { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.StartTime - }, testutil.Timeout, testutil.Interval).Should(BeNil()) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) By("Checking if the PyTorchJob status was not updated, even after unsuspending") Eventually(func() []kubeflowv1.JobCondition { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.Conditions - }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition(nil))) }) }) diff --git a/pkg/controller.v1/tensorflow/tfjob_controller_test.go b/pkg/controller.v1/tensorflow/tfjob_controller_test.go index 86f2f8e4b6..5a9b6b5f60 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller_test.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller_test.go @@ -29,7 +29,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" tftestutil "github.com/kubeflow/training-operator/pkg/controller.v1/tensorflow/testutil" commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/testutil" @@ -611,7 +610,7 @@ var _ = Describe("TFJob controller", func() { It("Should not reconcile a job while managed by external controller", func() { By("Creating a TFJob managed by external controller") otherController := "kueue.x-k8s.io/multikueue" - job.Spec.RunPolicy = v1.RunPolicy{ + job.Spec.RunPolicy = kubeflowv1.RunPolicy{ ManagedBy: &otherController, } job.Spec.RunPolicy.Suspend = ptr.To(true) @@ -630,26 +629,25 @@ var _ = Describe("TFJob controller", func() { return created.Status.StartTime }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) - chiefPod := &corev1.Pod{} - workerPod := &corev1.Pod{} - chiefSvc := &corev1.Service{} - workerSvc := &corev1.Service{} - By("Checking if the pods and services aren't created") Consistently(func() bool { + chiefPod := &corev1.Pod{} + workerPod := &corev1.Pod{} + chiefSvc := &corev1.Service{} + workerSvc := &corev1.Service{} errMasterPod := testK8sClient.Get(ctx, chiefKey, chiefPod) errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) errMasterSvc := testK8sClient.Get(ctx, chiefKey, chiefSvc) errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) - }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue(), "pods and services should be created by external controller (here not existent)") By("Checking if the TFJob status was not updated") Eventually(func() []kubeflowv1.JobCondition { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.Conditions - }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition(nil))) By("Unsuspending the TFJob") Eventually(func() error { @@ -659,16 +657,16 @@ var _ = Describe("TFJob controller", func() { }, testutil.Timeout, testutil.Interval).Should(Succeed()) By("Checking created TFJob still has a nil startTime") - Eventually(func() *metav1.Time { + Consistently(func() *metav1.Time { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.StartTime - }, testutil.Timeout, testutil.Interval).Should(BeNil()) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) By("Checking if the TFJob status was not updated, even after unsuspending") Eventually(func() []kubeflowv1.JobCondition { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.Conditions - }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition(nil))) }) }) }) diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller_test.go b/pkg/controller.v1/xgboost/xgboostjob_controller_test.go index 64cce5689a..ea175568c2 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller_test.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller_test.go @@ -28,7 +28,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" commonutil "github.com/kubeflow/training-operator/pkg/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -364,7 +363,7 @@ var _ = Describe("XGBoost controller", func() { It("Should not reconcile a job while managed by external controller", func() { By("Creating a XGBoostJob managed by external controller") otherController := "kueue.x-k8s.io/multikueue" - job.Spec.RunPolicy = v1.RunPolicy{ + job.Spec.RunPolicy = kubeflowv1.RunPolicy{ ManagedBy: &otherController, } job.Spec.RunPolicy.Suspend = ptr.To(true) @@ -383,26 +382,25 @@ var _ = Describe("XGBoost controller", func() { return created.Status.StartTime }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) - masterPod := &corev1.Pod{} - workerPod := &corev1.Pod{} - masterSvc := &corev1.Service{} - workerSvc := &corev1.Service{} - By("Checking if the pods and services aren't created") Consistently(func() bool { + masterPod := &corev1.Pod{} + workerPod := &corev1.Pod{} + masterSvc := &corev1.Service{} + workerSvc := &corev1.Service{} errMasterPod := testK8sClient.Get(ctx, masterKey, masterPod) errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) errMasterSvc := testK8sClient.Get(ctx, masterKey, masterSvc) errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) return errors.IsNotFound(errMasterPod) && errors.IsNotFound(errWorkerPod) && errors.IsNotFound(errMasterSvc) && errors.IsNotFound(errWorkerSvc) - }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue(), "pods and services should be created by external controller (here not existent)") By("Checking if the XGBoostJob status was not updated") Eventually(func() []kubeflowv1.JobCondition { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.Conditions - }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition(nil))) By("Unsuspending the XGBoostJob") Eventually(func() error { @@ -412,16 +410,16 @@ var _ = Describe("XGBoost controller", func() { }, testutil.Timeout, testutil.Interval).Should(Succeed()) By("Checking created XGBoostJob still has a nil startTime") - Eventually(func() *metav1.Time { + Consistently(func() *metav1.Time { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.StartTime - }, testutil.Timeout, testutil.Interval).Should(BeNil()) + }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) By("Checking if the XGBoostJob status was not updated, even after unsuspending") Eventually(func() []kubeflowv1.JobCondition { Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) return created.Status.Conditions - }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]v1.JobCondition(nil))) + }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition(nil))) }) }) }) From d17a9b91b46150d44c4e615b640df36e5a3d0024 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Wed, 14 Aug 2024 15:29:09 +0200 Subject: [PATCH 13/26] Update validateManagedBy webhook Signed-off-by: Michal Szadkowski --- pkg/common/util/webhooks.go | 9 +++++++-- pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go | 4 ++++ pkg/webhooks/pytorch/pytorchjob_webhook_test.go | 4 ++++ pkg/webhooks/tensorflow/tfjob_webhook_test.go | 4 ++++ pkg/webhooks/xgboost/xgboostjob_webhook_test.go | 4 ++++ 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/pkg/common/util/webhooks.go b/pkg/common/util/webhooks.go index 10d2f14dae..ecb7fc5e91 100644 --- a/pkg/common/util/webhooks.go +++ b/pkg/common/util/webhooks.go @@ -3,10 +3,15 @@ package util import ( v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation" "k8s.io/apimachinery/pkg/util/validation/field" ) +var SupportedJobControllers = sets.New( + v1.MultiKueueController, + v1.KubeflowJobsController) + func ValidateManagedBy(runPolicy *v1.RunPolicy, allErrs field.ErrorList) field.ErrorList { if runPolicy.ManagedBy != nil { manager := *runPolicy.ManagedBy @@ -15,8 +20,8 @@ func ValidateManagedBy(runPolicy *v1.RunPolicy, allErrs field.ErrorList) field.E if len(manager) > v1.MaxManagedByLength { allErrs = append(allErrs, field.TooLongMaxLength(fieldPath, manager, v1.MaxManagedByLength)) } - if manager != v1.MultiKueueController || manager != v1.KubeflowJobsController { - allErrs = append(allErrs, field.NotSupported(fieldPath, manager, []string{v1.MultiKueueController, v1.KubeflowJobsController})) + if !SupportedJobControllers.Has(manager) { + allErrs = append(allErrs, field.NotSupported(fieldPath, manager, sets.List(SupportedJobControllers))) } } return allErrs diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go index ac42549185..65b52a1045 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go @@ -23,10 +23,12 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation/field" "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/common/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -187,6 +189,7 @@ func TestValidateV1PaddleJob(t *testing.T) { }, wantErr: field.ErrorList{ field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, "managedBy controller name is too long": { @@ -203,6 +206,7 @@ func TestValidateV1PaddleJob(t *testing.T) { }, wantErr: field.ErrorList{ field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, } diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go index a28b5d953c..6f0d4e9077 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go @@ -24,11 +24,13 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation/field" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/common/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -302,6 +304,7 @@ func TestValidateV1PyTorchJob(t *testing.T) { }, wantErr: field.ErrorList{ field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, "managedBy controller name is too long": { @@ -318,6 +321,7 @@ func TestValidateV1PyTorchJob(t *testing.T) { }, wantErr: field.ErrorList{ field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, } diff --git a/pkg/webhooks/tensorflow/tfjob_webhook_test.go b/pkg/webhooks/tensorflow/tfjob_webhook_test.go index a294ea382d..c75797b877 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook_test.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook_test.go @@ -23,10 +23,12 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation/field" "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/common/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -198,6 +200,7 @@ func TestValidateTFJob(t *testing.T) { }, wantErr: field.ErrorList{ field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, "managedBy controller name is too long": { @@ -214,6 +217,7 @@ func TestValidateTFJob(t *testing.T) { }, wantErr: field.ErrorList{ field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, } diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go index d4b63440b0..8ee50f591a 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go @@ -23,10 +23,12 @@ import ( "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation/field" "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + "github.com/kubeflow/training-operator/pkg/common/util" "github.com/kubeflow/training-operator/pkg/util/testutil" ) @@ -249,6 +251,7 @@ func TestValidateXGBoostJob(t *testing.T) { }, wantErr: field.ErrorList{ field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, "managedBy controller name is too long": { @@ -265,6 +268,7 @@ func TestValidateXGBoostJob(t *testing.T) { }, wantErr: field.ErrorList{ field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, } From 5544a5ba8c7d8c5280d248cd889f5abe4865624e Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Wed, 28 Aug 2024 17:04:54 +0200 Subject: [PATCH 14/26] Remove validation for the length of ManagedBy field Signed-off-by: Michal Szadkowski --- hack/python-sdk/swagger.json | 2 +- pkg/apis/kubeflow.org/v1/common_types.go | 5 +---- pkg/apis/kubeflow.org/v1/openapi_generated.go | 2 +- pkg/common/util/webhooks.go | 3 --- pkg/util/testutil/constants.go | 1 - .../paddlepaddle/paddlepaddle_webhook_test.go | 17 ----------------- pkg/webhooks/pytorch/pytorchjob_webhook_test.go | 17 ----------------- pkg/webhooks/tensorflow/tfjob_webhook_test.go | 17 ----------------- pkg/webhooks/xgboost/xgboostjob_webhook_test.go | 17 ----------------- 9 files changed, 3 insertions(+), 78 deletions(-) diff --git a/hack/python-sdk/swagger.json b/hack/python-sdk/swagger.json index 0f9f92856a..7ddba636a2 100644 --- a/hack/python-sdk/swagger.json +++ b/hack/python-sdk/swagger.json @@ -576,7 +576,7 @@ "type": "string" }, "managedBy": { - "description": "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue.\n\nThe value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable.", + "description": "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue.\n\nThe value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The field is immutable.", "type": "string" }, "schedulingPolicy": { diff --git a/pkg/apis/kubeflow.org/v1/common_types.go b/pkg/apis/kubeflow.org/v1/common_types.go index f71e0cc8e6..47f9ab9f31 100644 --- a/pkg/apis/kubeflow.org/v1/common_types.go +++ b/pkg/apis/kubeflow.org/v1/common_types.go @@ -188,9 +188,6 @@ const ( RestartPolicyExitCode RestartPolicy = "ExitCode" ) -// maximum length of the value of the ManagedBy field -const MaxManagedByLength = 63 - // RunPolicy encapsulates various runtime policies of the distributed training // job, for example how to clean up resources and how long the job can stay // active. @@ -242,7 +239,7 @@ type RunPolicy struct { // The value must be a valid domain-prefixed path (e.g. acme.io/foo) - // all characters before the first "/" must be a valid subdomain as defined // by RFC 1123. All characters trailing the first "/" must be valid HTTP Path - // characters as defined by RFC 3986. The value cannot exceed 63 characters. + // characters as defined by RFC 3986. // The field is immutable. ManagedBy *string `json:"managedBy,omitempty"` } diff --git a/pkg/apis/kubeflow.org/v1/openapi_generated.go b/pkg/apis/kubeflow.org/v1/openapi_generated.go index 8fe8710b20..f19e0ee51f 100644 --- a/pkg/apis/kubeflow.org/v1/openapi_generated.go +++ b/pkg/apis/kubeflow.org/v1/openapi_generated.go @@ -1065,7 +1065,7 @@ func schema_pkg_apis_kubefloworg_v1_RunPolicy(ref common.ReferenceCallback) comm }, "managedBy": { SchemaProps: spec.SchemaProps{ - Description: "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue.\n\nThe value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable.", + Description: "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue.\n\nThe value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The field is immutable.", Type: []string{"string"}, Format: "", }, diff --git a/pkg/common/util/webhooks.go b/pkg/common/util/webhooks.go index ecb7fc5e91..b1dc51f068 100644 --- a/pkg/common/util/webhooks.go +++ b/pkg/common/util/webhooks.go @@ -17,9 +17,6 @@ func ValidateManagedBy(runPolicy *v1.RunPolicy, allErrs field.ErrorList) field.E manager := *runPolicy.ManagedBy fieldPath := field.NewPath("spec", "managedBy") allErrs = append(allErrs, validation.IsDomainPrefixedPath(fieldPath, manager)...) - if len(manager) > v1.MaxManagedByLength { - allErrs = append(allErrs, field.TooLongMaxLength(fieldPath, manager, v1.MaxManagedByLength)) - } if !SupportedJobControllers.Has(manager) { allErrs = append(allErrs, field.NotSupported(fieldPath, manager, sets.List(SupportedJobControllers))) } diff --git a/pkg/util/testutil/constants.go b/pkg/util/testutil/constants.go index 26d58ed837..d79c075bf3 100644 --- a/pkg/util/testutil/constants.go +++ b/pkg/util/testutil/constants.go @@ -17,5 +17,4 @@ const ( var ( IgnoreJobConditionsTimes = cmpopts.IgnoreFields(kubeflowv1.JobCondition{}, "LastUpdateTime", "LastTransitionTime") MalformedManagedBy = "other-job-controller" - TooLongManagedBy = "kubeflow.org/very-very-very-very-very-very-very-very-long-job-controller" ) diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go index 65b52a1045..c2ee3def19 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go @@ -192,23 +192,6 @@ func TestValidateV1PaddleJob(t *testing.T) { field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, - "managedBy controller name is too long": { - paddleJob: &trainingoperator.PaddleJob{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test", - }, - Spec: trainingoperator.PaddleJobSpec{ - RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(testutil.TooLongManagedBy), - }, - PaddleReplicaSpecs: validPaddleReplicaSpecs, - }, - }, - wantErr: field.ErrorList{ - field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), - }, - }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go index 6f0d4e9077..24eb102125 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go @@ -307,23 +307,6 @@ func TestValidateV1PyTorchJob(t *testing.T) { field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, - "managedBy controller name is too long": { - pytorchJob: &trainingoperator.PyTorchJob{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test", - }, - Spec: trainingoperator.PyTorchJobSpec{ - RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(testutil.TooLongManagedBy), - }, - PyTorchReplicaSpecs: validPyTorchReplicaSpecs, - }, - }, - wantErr: field.ErrorList{ - field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), - }, - }, } for name, tc := range testCases { diff --git a/pkg/webhooks/tensorflow/tfjob_webhook_test.go b/pkg/webhooks/tensorflow/tfjob_webhook_test.go index c75797b877..a315f85db3 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook_test.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook_test.go @@ -203,23 +203,6 @@ func TestValidateTFJob(t *testing.T) { field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, - "managedBy controller name is too long": { - tfJob: &trainingoperator.TFJob{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test", - }, - Spec: trainingoperator.TFJobSpec{ - RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(testutil.TooLongManagedBy), - }, - TFReplicaSpecs: validTFReplicaSpecs, - }, - }, - wantErr: field.ErrorList{ - field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), - }, - }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go index 8ee50f591a..930b4a4f4e 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go @@ -254,23 +254,6 @@ func TestValidateXGBoostJob(t *testing.T) { field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, - "managedBy controller name is too long": { - xgboostJob: &trainingoperator.XGBoostJob{ - ObjectMeta: metav1.ObjectMeta{ - Name: "test", - }, - Spec: trainingoperator.XGBoostJobSpec{ - RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(testutil.TooLongManagedBy), - }, - XGBReplicaSpecs: validXGBoostReplicaSpecs, - }, - }, - wantErr: field.ErrorList{ - field.TooLongMaxLength(field.NewPath("spec").Child("managedBy"), "", trainingoperator.MaxManagedByLength), - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), - }, - }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { From e086a9eec796463704242ec186613a22a60c03f5 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Wed, 4 Sep 2024 12:23:41 +0200 Subject: [PATCH 15/26] Update after code review Signed-off-by: Michal Szadkowski --- docs/api/kubeflow.org_v1_generated.asciidoc | 8 +------- hack/python-sdk/swagger.json | 2 +- manifests/base/crds/kubeflow.org_jaxjobs.yaml | 2 +- manifests/base/crds/kubeflow.org_mpijobs.yaml | 2 +- manifests/base/crds/kubeflow.org_paddlejobs.yaml | 2 +- manifests/base/crds/kubeflow.org_pytorchjobs.yaml | 2 +- manifests/base/crds/kubeflow.org_tfjobs.yaml | 2 +- manifests/base/crds/kubeflow.org_xgboostjobs.yaml | 2 +- pkg/apis/kubeflow.org/v1/common_types.go | 9 ++------- pkg/apis/kubeflow.org/v1/openapi_generated.go | 2 +- pkg/common/util/webhooks.go | 4 +--- pkg/controller.v1/mpi/mpijob_controller_test.go | 3 +-- .../paddlepaddle/paddlepaddle_controller_test.go | 3 +-- pkg/controller.v1/pytorch/pytorchjob_controller_test.go | 3 +-- pkg/controller.v1/tensorflow/tfjob_controller_test.go | 3 +-- pkg/controller.v1/xgboost/xgboostjob_controller_test.go | 3 +-- pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go | 3 +-- pkg/webhooks/pytorch/pytorchjob_webhook_test.go | 3 +-- pkg/webhooks/tensorflow/tfjob_webhook_test.go | 3 +-- pkg/webhooks/xgboost/xgboostjob_webhook_test.go | 3 +-- sdk/python/docs/KubeflowOrgV1RunPolicy.md | 2 +- .../training/models/kubeflow_org_v1_run_policy.py | 4 ++-- 22 files changed, 24 insertions(+), 46 deletions(-) diff --git a/docs/api/kubeflow.org_v1_generated.asciidoc b/docs/api/kubeflow.org_v1_generated.asciidoc index 6b3599a4f2..55e4a7dd23 100644 --- a/docs/api/kubeflow.org_v1_generated.asciidoc +++ b/docs/api/kubeflow.org_v1_generated.asciidoc @@ -680,16 +680,10 @@ Defaults to false. | *`managedBy`* __string__ | ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. -The built-in job controller reconciles a job which don't have this +The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. - - -The value must be a valid domain-prefixed path (e.g. acme.io/foo) - -all characters before the first "/" must be a valid subdomain as defined -by RFC 1123. All characters trailing the first "/" must be valid HTTP Path -characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable. |=== diff --git a/hack/python-sdk/swagger.json b/hack/python-sdk/swagger.json index 7ddba636a2..7f1530378b 100644 --- a/hack/python-sdk/swagger.json +++ b/hack/python-sdk/swagger.json @@ -576,7 +576,7 @@ "type": "string" }, "managedBy": { - "description": "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue.\n\nThe value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The field is immutable.", + "description": "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable.", "type": "string" }, "schedulingPolicy": { diff --git a/manifests/base/crds/kubeflow.org_jaxjobs.yaml b/manifests/base/crds/kubeflow.org_jaxjobs.yaml index fd0a33b998..32962b6161 100644 --- a/manifests/base/crds/kubeflow.org_jaxjobs.yaml +++ b/manifests/base/crds/kubeflow.org_jaxjobs.yaml @@ -7311,7 +7311,7 @@ spec: ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. - The built-in job controller reconciles a job which don't have this + The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s. diff --git a/manifests/base/crds/kubeflow.org_mpijobs.yaml b/manifests/base/crds/kubeflow.org_mpijobs.yaml index 0384e06d32..5a42e3288f 100644 --- a/manifests/base/crds/kubeflow.org_mpijobs.yaml +++ b/manifests/base/crds/kubeflow.org_mpijobs.yaml @@ -7316,7 +7316,7 @@ spec: ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. - The built-in job controller reconciles a job which don't have this + The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s. diff --git a/manifests/base/crds/kubeflow.org_paddlejobs.yaml b/manifests/base/crds/kubeflow.org_paddlejobs.yaml index 27511571fd..282d6c3ce5 100644 --- a/manifests/base/crds/kubeflow.org_paddlejobs.yaml +++ b/manifests/base/crds/kubeflow.org_paddlejobs.yaml @@ -7798,7 +7798,7 @@ spec: ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. - The built-in job controller reconciles a job which don't have this + The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s. diff --git a/manifests/base/crds/kubeflow.org_pytorchjobs.yaml b/manifests/base/crds/kubeflow.org_pytorchjobs.yaml index e28c5be35e..1d23531fba 100644 --- a/manifests/base/crds/kubeflow.org_pytorchjobs.yaml +++ b/manifests/base/crds/kubeflow.org_pytorchjobs.yaml @@ -7835,7 +7835,7 @@ spec: ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. - The built-in job controller reconciles a job which don't have this + The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s. diff --git a/manifests/base/crds/kubeflow.org_tfjobs.yaml b/manifests/base/crds/kubeflow.org_tfjobs.yaml index 4fd7f4a88c..d045477431 100644 --- a/manifests/base/crds/kubeflow.org_tfjobs.yaml +++ b/manifests/base/crds/kubeflow.org_tfjobs.yaml @@ -76,7 +76,7 @@ spec: ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. - The built-in job controller reconciles a job which don't have this + The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s. diff --git a/manifests/base/crds/kubeflow.org_xgboostjobs.yaml b/manifests/base/crds/kubeflow.org_xgboostjobs.yaml index c023598f88..af76c28ec8 100644 --- a/manifests/base/crds/kubeflow.org_xgboostjobs.yaml +++ b/manifests/base/crds/kubeflow.org_xgboostjobs.yaml @@ -72,7 +72,7 @@ spec: ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. - The built-in job controller reconciles a job which don't have this + The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s. diff --git a/pkg/apis/kubeflow.org/v1/common_types.go b/pkg/apis/kubeflow.org/v1/common_types.go index 47f9ab9f31..1eac2fe3bf 100644 --- a/pkg/apis/kubeflow.org/v1/common_types.go +++ b/pkg/apis/kubeflow.org/v1/common_types.go @@ -39,7 +39,7 @@ const ( // KubeflowJobsController represents the value of the default jobs controller KubeflowJobsController = "kubeflow.org/training-operator" - // MultiKueueController represents the value of the only allowed external controller except for KubeflowJobsController + // MultiKueueController represents the MultiKueue controller MultiKueueController = "kueue.x-k8s.io/multikueue" ) @@ -231,15 +231,10 @@ type RunPolicy struct { // ManagedBy is used to indicate the controller or entity that manages a job. // The value must be either an empty, 'kubeflow.org/training-operator' or // 'kueue.x-k8s.io/multikueue'. - // The built-in job controller reconciles a job which don't have this + // The training-operator reconciles a job which doesn't have this // field at all or the field value is the reserved string // 'kubeflow.org/training-operator', but delegates reconciling the job // with a 'kueue.x-k8s.io/multikueue' to the Kueue. - // - // The value must be a valid domain-prefixed path (e.g. acme.io/foo) - - // all characters before the first "/" must be a valid subdomain as defined - // by RFC 1123. All characters trailing the first "/" must be valid HTTP Path - // characters as defined by RFC 3986. // The field is immutable. ManagedBy *string `json:"managedBy,omitempty"` } diff --git a/pkg/apis/kubeflow.org/v1/openapi_generated.go b/pkg/apis/kubeflow.org/v1/openapi_generated.go index f19e0ee51f..2c62883152 100644 --- a/pkg/apis/kubeflow.org/v1/openapi_generated.go +++ b/pkg/apis/kubeflow.org/v1/openapi_generated.go @@ -1065,7 +1065,7 @@ func schema_pkg_apis_kubefloworg_v1_RunPolicy(ref common.ReferenceCallback) comm }, "managedBy": { SchemaProps: spec.SchemaProps{ - Description: "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue.\n\nThe value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The field is immutable.", + Description: "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable.", Type: []string{"string"}, Format: "", }, diff --git a/pkg/common/util/webhooks.go b/pkg/common/util/webhooks.go index b1dc51f068..326a3fcabb 100644 --- a/pkg/common/util/webhooks.go +++ b/pkg/common/util/webhooks.go @@ -4,7 +4,6 @@ import ( v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "k8s.io/apimachinery/pkg/util/sets" - "k8s.io/apimachinery/pkg/util/validation" "k8s.io/apimachinery/pkg/util/validation/field" ) @@ -16,9 +15,8 @@ func ValidateManagedBy(runPolicy *v1.RunPolicy, allErrs field.ErrorList) field.E if runPolicy.ManagedBy != nil { manager := *runPolicy.ManagedBy fieldPath := field.NewPath("spec", "managedBy") - allErrs = append(allErrs, validation.IsDomainPrefixedPath(fieldPath, manager)...) if !SupportedJobControllers.Has(manager) { - allErrs = append(allErrs, field.NotSupported(fieldPath, manager, sets.List(SupportedJobControllers))) + allErrs = append(allErrs, field.NotSupported(fieldPath, manager, SupportedJobControllers.UnsortedList())) } } return allErrs diff --git a/pkg/controller.v1/mpi/mpijob_controller_test.go b/pkg/controller.v1/mpi/mpijob_controller_test.go index 5c3dd4cd36..024cbaaec1 100644 --- a/pkg/controller.v1/mpi/mpijob_controller_test.go +++ b/pkg/controller.v1/mpi/mpijob_controller_test.go @@ -1069,9 +1069,8 @@ var _ = Describe("MPIJob controller", func() { It("Should not reconcile a job while managed by external controller", func() { By("Creating a MPIJob managed by external controller") - otherController := "kueue.x-k8s.io/multikueue" job.Spec.RunPolicy = kubeflowv1.RunPolicy{ - ManagedBy: &otherController, + ManagedBy: ptr.To(kubeflowv1.MultiKueueController), } job.Spec.RunPolicy.Suspend = ptr.To(true) Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go index 423eb032d8..72a851642a 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller_test.go @@ -433,9 +433,8 @@ var _ = Describe("PaddleJob controller", func() { It("Should not reconcile a job while managed by external controller", func() { By("Creating a PaddleJob managed by external controller") - otherController := "kueue.x-k8s.io/multikueue" job.Spec.RunPolicy = kubeflowv1.RunPolicy{ - ManagedBy: &otherController, + ManagedBy: ptr.To(kubeflowv1.MultiKueueController), } job.Spec.RunPolicy.Suspend = ptr.To(true) Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller_test.go b/pkg/controller.v1/pytorch/pytorchjob_controller_test.go index 6cee154591..37b4c9218c 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller_test.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller_test.go @@ -469,9 +469,8 @@ var _ = Describe("PyTorchJob controller", func() { It("Should not reconcile a job while managed by external controller", func() { By("Creating a PyTorchJob managed by external controller") - otherController := "kueue.x-k8s.io/multikueue" job.Spec.RunPolicy = kubeflowv1.RunPolicy{ - ManagedBy: &otherController, + ManagedBy: ptr.To(kubeflowv1.MultiKueueController), } job.Spec.RunPolicy.Suspend = ptr.To(true) job.Spec.PyTorchReplicaSpecs[kubeflowv1.PyTorchJobReplicaTypeWorker].Replicas = ptr.To[int32](1) diff --git a/pkg/controller.v1/tensorflow/tfjob_controller_test.go b/pkg/controller.v1/tensorflow/tfjob_controller_test.go index 5a9b6b5f60..53265e358c 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller_test.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller_test.go @@ -609,9 +609,8 @@ var _ = Describe("TFJob controller", func() { It("Should not reconcile a job while managed by external controller", func() { By("Creating a TFJob managed by external controller") - otherController := "kueue.x-k8s.io/multikueue" job.Spec.RunPolicy = kubeflowv1.RunPolicy{ - ManagedBy: &otherController, + ManagedBy: ptr.To(kubeflowv1.MultiKueueController), } job.Spec.RunPolicy.Suspend = ptr.To(true) Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller_test.go b/pkg/controller.v1/xgboost/xgboostjob_controller_test.go index ea175568c2..b1a1fce9c6 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller_test.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller_test.go @@ -362,9 +362,8 @@ var _ = Describe("XGBoost controller", func() { It("Should not reconcile a job while managed by external controller", func() { By("Creating a XGBoostJob managed by external controller") - otherController := "kueue.x-k8s.io/multikueue" job.Spec.RunPolicy = kubeflowv1.RunPolicy{ - ManagedBy: &otherController, + ManagedBy: ptr.To(kubeflowv1.MultiKueueController), } job.Spec.RunPolicy.Suspend = ptr.To(true) Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go index c2ee3def19..3fbf13fcc0 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go @@ -175,7 +175,7 @@ func TestValidateV1PaddleJob(t *testing.T) { field.Required(paddleReplicaSpecPath, ""), }, }, - "managedBy controller name is malformed": { + "unsupported managedBy controller name": { paddleJob: &trainingoperator.PaddleJob{ ObjectMeta: metav1.ObjectMeta{ Name: "test", @@ -188,7 +188,6 @@ func TestValidateV1PaddleJob(t *testing.T) { }, }, wantErr: field.ErrorList{ - field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go index 24eb102125..609780fe7a 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go @@ -290,7 +290,7 @@ func TestValidateV1PyTorchJob(t *testing.T) { specPath.Child("elasticPolicy").Child("nProcPerNode"), specPath.Child("nprocPerNode")), }, }, - "managedBy controller name is malformed": { + "unsupported managedBy controller name": { pytorchJob: &trainingoperator.PyTorchJob{ ObjectMeta: metav1.ObjectMeta{ Name: "test", @@ -303,7 +303,6 @@ func TestValidateV1PyTorchJob(t *testing.T) { }, }, wantErr: field.ErrorList{ - field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, diff --git a/pkg/webhooks/tensorflow/tfjob_webhook_test.go b/pkg/webhooks/tensorflow/tfjob_webhook_test.go index a315f85db3..3197b5e674 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook_test.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook_test.go @@ -186,7 +186,7 @@ func TestValidateTFJob(t *testing.T) { field.Forbidden(tfReplicaSpecPath, ""), }, }, - "managedBy controller name is malformed": { + "unsupported managedBy controller name": { tfJob: &trainingoperator.TFJob{ ObjectMeta: metav1.ObjectMeta{ Name: "test", @@ -199,7 +199,6 @@ func TestValidateTFJob(t *testing.T) { }, }, wantErr: field.ErrorList{ - field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go index 930b4a4f4e..b9c84c91ec 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go @@ -237,7 +237,7 @@ func TestValidateXGBoostJob(t *testing.T) { field.Required(xgbReplicaSpecPath.Key(string(trainingoperator.XGBoostJobReplicaTypeMaster)), ""), }, }, - "managedBy controller name is malformed": { + "unsupported managedBy controller name": { xgboostJob: &trainingoperator.XGBoostJob{ ObjectMeta: metav1.ObjectMeta{ Name: "test", @@ -250,7 +250,6 @@ func TestValidateXGBoostJob(t *testing.T) { }, }, wantErr: field.ErrorList{ - field.Invalid(field.NewPath("spec").Child("managedBy"), "", ""), field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), }, }, diff --git a/sdk/python/docs/KubeflowOrgV1RunPolicy.md b/sdk/python/docs/KubeflowOrgV1RunPolicy.md index bcbadfaa10..83a72953ea 100644 --- a/sdk/python/docs/KubeflowOrgV1RunPolicy.md +++ b/sdk/python/docs/KubeflowOrgV1RunPolicy.md @@ -7,7 +7,7 @@ Name | Type | Description | Notes **active_deadline_seconds** | **int** | Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer. | [optional] **backoff_limit** | **int** | Optional number of retries before marking this job failed. | [optional] **clean_pod_policy** | **str** | CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. | [optional] -**managed_by** | **str** | ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable. | [optional] +**managed_by** | **str** | ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. | [optional] **scheduling_policy** | [**KubeflowOrgV1SchedulingPolicy**](KubeflowOrgV1SchedulingPolicy.md) | | [optional] **suspend** | **bool** | suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods and PodGroups associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job. Defaults to false. | [optional] **ttl_seconds_after_finished** | **int** | TTLSecondsAfterFinished is the TTL to clean up jobs. It may take extra ReconcilePeriod seconds for the cleanup, since reconcile gets called periodically. Default to infinite. | [optional] diff --git a/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py b/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py index 7d30f3e41c..80d9dcb83d 100644 --- a/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py +++ b/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py @@ -155,7 +155,7 @@ def clean_pod_policy(self, clean_pod_policy): def managed_by(self): """Gets the managed_by of this KubeflowOrgV1RunPolicy. # noqa: E501 - ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable. # noqa: E501 + ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. # noqa: E501 :return: The managed_by of this KubeflowOrgV1RunPolicy. # noqa: E501 :rtype: str @@ -166,7 +166,7 @@ def managed_by(self): def managed_by(self, managed_by): """Sets the managed_by of this KubeflowOrgV1RunPolicy. - ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The built-in job controller reconciles a job which don't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The value must be a valid domain-prefixed path (e.g. acme.io/foo) - all characters before the first \"/\" must be a valid subdomain as defined by RFC 1123. All characters trailing the first \"/\" must be valid HTTP Path characters as defined by RFC 3986. The value cannot exceed 63 characters. The field is immutable. # noqa: E501 + ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. # noqa: E501 :param managed_by: The managed_by of this KubeflowOrgV1RunPolicy. # noqa: E501 :type: str From 52d4e465e3f27085eab2204e6527ee8f867f24a1 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Tue, 10 Sep 2024 11:44:58 +0200 Subject: [PATCH 16/26] Update ManagedBy comment Signed-off-by: Michal Szadkowski --- manifests/base/crds/kubeflow.org_jaxjobs.yaml | 2 +- manifests/base/crds/kubeflow.org_mpijobs.yaml | 2 +- manifests/base/crds/kubeflow.org_paddlejobs.yaml | 2 +- manifests/base/crds/kubeflow.org_pytorchjobs.yaml | 2 +- manifests/base/crds/kubeflow.org_tfjobs.yaml | 2 +- manifests/base/crds/kubeflow.org_xgboostjobs.yaml | 2 +- manifests/overlays/standalone/kustomization.yaml | 15 ++++++++------- pkg/apis/kubeflow.org/v1/common_types.go | 2 +- 8 files changed, 15 insertions(+), 14 deletions(-) diff --git a/manifests/base/crds/kubeflow.org_jaxjobs.yaml b/manifests/base/crds/kubeflow.org_jaxjobs.yaml index 32962b6161..0131e051e5 100644 --- a/manifests/base/crds/kubeflow.org_jaxjobs.yaml +++ b/manifests/base/crds/kubeflow.org_jaxjobs.yaml @@ -7314,7 +7314,7 @@ spec: The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job - with a 'kueue.x-k8s. + with 'kueue.x-k8s. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/base/crds/kubeflow.org_mpijobs.yaml b/manifests/base/crds/kubeflow.org_mpijobs.yaml index 5a42e3288f..6bbdf671ad 100644 --- a/manifests/base/crds/kubeflow.org_mpijobs.yaml +++ b/manifests/base/crds/kubeflow.org_mpijobs.yaml @@ -7319,7 +7319,7 @@ spec: The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job - with a 'kueue.x-k8s. + with 'kueue.x-k8s. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/base/crds/kubeflow.org_paddlejobs.yaml b/manifests/base/crds/kubeflow.org_paddlejobs.yaml index 282d6c3ce5..6c69ba882b 100644 --- a/manifests/base/crds/kubeflow.org_paddlejobs.yaml +++ b/manifests/base/crds/kubeflow.org_paddlejobs.yaml @@ -7801,7 +7801,7 @@ spec: The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job - with a 'kueue.x-k8s. + with 'kueue.x-k8s. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/base/crds/kubeflow.org_pytorchjobs.yaml b/manifests/base/crds/kubeflow.org_pytorchjobs.yaml index 1d23531fba..b4eabcaceb 100644 --- a/manifests/base/crds/kubeflow.org_pytorchjobs.yaml +++ b/manifests/base/crds/kubeflow.org_pytorchjobs.yaml @@ -7838,7 +7838,7 @@ spec: The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job - with a 'kueue.x-k8s. + with 'kueue.x-k8s. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/base/crds/kubeflow.org_tfjobs.yaml b/manifests/base/crds/kubeflow.org_tfjobs.yaml index d045477431..5b1bd84fe0 100644 --- a/manifests/base/crds/kubeflow.org_tfjobs.yaml +++ b/manifests/base/crds/kubeflow.org_tfjobs.yaml @@ -79,7 +79,7 @@ spec: The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job - with a 'kueue.x-k8s. + with 'kueue.x-k8s. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/base/crds/kubeflow.org_xgboostjobs.yaml b/manifests/base/crds/kubeflow.org_xgboostjobs.yaml index af76c28ec8..4cdceca6ed 100644 --- a/manifests/base/crds/kubeflow.org_xgboostjobs.yaml +++ b/manifests/base/crds/kubeflow.org_xgboostjobs.yaml @@ -75,7 +75,7 @@ spec: The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job - with a 'kueue.x-k8s. + with 'kueue.x-k8s. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, diff --git a/manifests/overlays/standalone/kustomization.yaml b/manifests/overlays/standalone/kustomization.yaml index df72e1dc03..f055da5dc8 100644 --- a/manifests/overlays/standalone/kustomization.yaml +++ b/manifests/overlays/standalone/kustomization.yaml @@ -2,12 +2,13 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: kubeflow resources: - - ../../base - - namespace.yaml +- ../../base +- namespace.yaml images: - - name: kubeflow/training-operator - newTag: "latest" +- name: kubeflow/training-operator + newName: kubeflowtraining/training-operator + newTag: test secretGenerator: - - name: training-operator-webhook-cert - options: - disableNameSuffixHash: true +- name: training-operator-webhook-cert + options: + disableNameSuffixHash: true diff --git a/pkg/apis/kubeflow.org/v1/common_types.go b/pkg/apis/kubeflow.org/v1/common_types.go index 1eac2fe3bf..59923b4da7 100644 --- a/pkg/apis/kubeflow.org/v1/common_types.go +++ b/pkg/apis/kubeflow.org/v1/common_types.go @@ -234,7 +234,7 @@ type RunPolicy struct { // The training-operator reconciles a job which doesn't have this // field at all or the field value is the reserved string // 'kubeflow.org/training-operator', but delegates reconciling the job - // with a 'kueue.x-k8s.io/multikueue' to the Kueue. + // with 'kueue.x-k8s.io/multikueue' to the Kueue. // The field is immutable. ManagedBy *string `json:"managedBy,omitempty"` } From 3870734182d584a766abb491def176705048435b Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Wed, 11 Sep 2024 15:33:05 +0200 Subject: [PATCH 17/26] E2E tests for managedBy Signed-off-by: Michal Szadkowski --- sdk/python/test/e2e/test_e2e_pytorchjob.py | 75 ++++++++++++++++++++++ sdk/python/test/e2e/utils.py | 8 +++ 2 files changed, 83 insertions(+) diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index c5b28faaf8..efe9087609 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -167,6 +167,79 @@ def test_sdk_e2e(job_namespace): TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) +@pytest.mark.skipif( + GANG_SCHEDULER_NAME in GANG_SCHEDULERS, + reason="For plain scheduling", +) +def test_sdk_e2e_managed_by(job_namespace): + JOB_NAME = "pytorchjob-e2e" + container = generate_container() + + master = KubeflowOrgV1ReplicaSpec( + replicas=1, + restart_policy="OnFailure", + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[container]), + ), + ) + + worker = KubeflowOrgV1ReplicaSpec( + replicas=1, + restart_policy="OnFailure", + template=V1PodTemplateSpec( + metadata=V1ObjectMeta( + annotations={constants.ISTIO_SIDECAR_INJECTION: "false"} + ), + spec=V1PodSpec(containers=[container]), + ), + ) + + #1. Job created with default value: 'kubeflow.org/training-operator' - job created and status updated + #2. Job created with kueue value: 'kueue.x-k8s.io/multikueue' - job created but status not updated + #3. Job created with invalid value (not acceptable by the webhook) - job not created + controllers = { + JOB_NAME+"-default-controller": 'kubeflow.org/training-operator', + JOB_NAME+"-multikueue-controller": 'kueue.x-k8s.io/multikueue', + JOB_NAME+"-invalid-controller": 'kueue.x-k8s.io/other-controller', + } + for job_name, managed_by in controllers.items(): + pytorchjob = generate_pytorchjob(job_namespace, job_name, master, worker, managed_by=managed_by) + try: + TRAINING_CLIENT.create_job(job=pytorchjob, namespace=job_namespace) + except Exception as e: + if "invalid" in str(job_name): + error_message = f"Failed to create PyTorchJob: {job_namespace}/{job_name}" + assert error_message in str(e), f"Unexpected error: {e}" + else: + raise Exception(f"PyTorchJob E2E fails. Exception: {e}") + + logging.info(f"List of created {TRAINING_CLIENT.job_kind}s") + jobs = TRAINING_CLIENT.list_jobs(job_namespace) + logging.info(jobs) + + try: + #Only jobs with valid controllers should be created, 2 out of 3 satisfy this condition: 'kubeflow.org/training-operator' and 'kueue.x-k8s.io/multikueue' + if len(jobs) != 2: + raise Exception(f"Too many PyTorchJobs created {jobs}") + + for job in jobs: + if job._metadata.name == 'kubeflow.org/training-operator': + utils.verify_job_e2e(TRAINING_CLIENT, job._metadata.name, job_namespace, wait_timeout=900) + if job._metadata.name == 'kueue.x-k8s.io/multikueue': + utils.verify_externally_managed_job_e2e(TRAINING_CLIENT, job._metadata.name, job_namespace) + + except Exception as e: + utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) + TRAINING_CLIENT.delete_job(JOB_NAME, job_namespace) + raise Exception(f"PyTorchJob E2E fails. Exception: {e}") + + for job in jobs: + utils.print_job_results(TRAINING_CLIENT, job._metadata.name, job_namespace) + TRAINING_CLIENT.delete_job(job._metadata.name, job_namespace) + @pytest.mark.skipif( GANG_SCHEDULER_NAME in GANG_SCHEDULERS, reason="For plain scheduling", @@ -246,6 +319,7 @@ def generate_pytorchjob( master: KubeflowOrgV1ReplicaSpec, worker: KubeflowOrgV1ReplicaSpec, scheduling_policy: Optional[KubeflowOrgV1SchedulingPolicy] = None, + managed_by: Optional[str] = None, ) -> KubeflowOrgV1PyTorchJob: return KubeflowOrgV1PyTorchJob( api_version=constants.API_VERSION, @@ -255,6 +329,7 @@ def generate_pytorchjob( run_policy=KubeflowOrgV1RunPolicy( clean_pod_policy="None", scheduling_policy=scheduling_policy, + managed_by=managed_by, ), pytorch_replica_specs={"Master": master, "Worker": worker}, ), diff --git a/sdk/python/test/e2e/utils.py b/sdk/python/test/e2e/utils.py index 7a6f81f922..ee09b097de 100644 --- a/sdk/python/test/e2e/utils.py +++ b/sdk/python/test/e2e/utils.py @@ -11,6 +11,14 @@ logging.getLogger().setLevel(logging.INFO) +def verify_externally_managed_job_e2e(client: TrainingClient, name: str, namespace: str): + """Verify externally managed Training Job e2e test.""" + job = client.get_job(name, namespace) + conditions = client.get_job_conditions(name, namespace, client.job_kind, job) + if len(conditions) != 0: + raise Exception(f"{client.job_kind} conditions {conditions} should not be updated, externally managed by {managed_by}") + + def verify_unschedulable_job_e2e(client: TrainingClient, name: str, namespace: str): """Verify unschedulable Training Job e2e test.""" logging.info(f"\n\n\n{client.job_kind} is creating") From 9b35574dcd2daf935cabbf1d2262e95cb1667219 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Wed, 11 Sep 2024 19:37:45 +0200 Subject: [PATCH 18/26] Update generated files and manifests Signed-off-by: Michal Szadkowski --- docs/api/kubeflow.org_v1_generated.asciidoc | 2 +- hack/python-sdk/swagger.json | 2 +- pkg/apis/kubeflow.org/v1/openapi_generated.go | 2 +- sdk/python/docs/KubeflowOrgV1RunPolicy.md | 2 +- .../kubeflow/training/models/kubeflow_org_v1_run_policy.py | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/api/kubeflow.org_v1_generated.asciidoc b/docs/api/kubeflow.org_v1_generated.asciidoc index 55e4a7dd23..a145d9f738 100644 --- a/docs/api/kubeflow.org_v1_generated.asciidoc +++ b/docs/api/kubeflow.org_v1_generated.asciidoc @@ -683,7 +683,7 @@ The value must be either an empty, 'kubeflow.org/training-operator' or The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job -with a 'kueue.x-k8s.io/multikueue' to the Kueue. +with 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. |=== diff --git a/hack/python-sdk/swagger.json b/hack/python-sdk/swagger.json index 7f1530378b..bb5632167a 100644 --- a/hack/python-sdk/swagger.json +++ b/hack/python-sdk/swagger.json @@ -576,7 +576,7 @@ "type": "string" }, "managedBy": { - "description": "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable.", + "description": "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable.", "type": "string" }, "schedulingPolicy": { diff --git a/pkg/apis/kubeflow.org/v1/openapi_generated.go b/pkg/apis/kubeflow.org/v1/openapi_generated.go index 2c62883152..560f2b278a 100644 --- a/pkg/apis/kubeflow.org/v1/openapi_generated.go +++ b/pkg/apis/kubeflow.org/v1/openapi_generated.go @@ -1065,7 +1065,7 @@ func schema_pkg_apis_kubefloworg_v1_RunPolicy(ref common.ReferenceCallback) comm }, "managedBy": { SchemaProps: spec.SchemaProps{ - Description: "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable.", + Description: "ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable.", Type: []string{"string"}, Format: "", }, diff --git a/sdk/python/docs/KubeflowOrgV1RunPolicy.md b/sdk/python/docs/KubeflowOrgV1RunPolicy.md index 83a72953ea..d3b32c70c7 100644 --- a/sdk/python/docs/KubeflowOrgV1RunPolicy.md +++ b/sdk/python/docs/KubeflowOrgV1RunPolicy.md @@ -7,7 +7,7 @@ Name | Type | Description | Notes **active_deadline_seconds** | **int** | Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer. | [optional] **backoff_limit** | **int** | Optional number of retries before marking this job failed. | [optional] **clean_pod_policy** | **str** | CleanPodPolicy defines the policy to kill pods after the job completes. Default to None. | [optional] -**managed_by** | **str** | ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. | [optional] +**managed_by** | **str** | ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. | [optional] **scheduling_policy** | [**KubeflowOrgV1SchedulingPolicy**](KubeflowOrgV1SchedulingPolicy.md) | | [optional] **suspend** | **bool** | suspend specifies whether the Job controller should create Pods or not. If a Job is created with suspend set to true, no Pods are created by the Job controller. If a Job is suspended after creation (i.e. the flag goes from false to true), the Job controller will delete all active Pods and PodGroups associated with this Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the Job. Defaults to false. | [optional] **ttl_seconds_after_finished** | **int** | TTLSecondsAfterFinished is the TTL to clean up jobs. It may take extra ReconcilePeriod seconds for the cleanup, since reconcile gets called periodically. Default to infinite. | [optional] diff --git a/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py b/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py index 80d9dcb83d..7782720075 100644 --- a/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py +++ b/sdk/python/kubeflow/training/models/kubeflow_org_v1_run_policy.py @@ -155,7 +155,7 @@ def clean_pod_policy(self, clean_pod_policy): def managed_by(self): """Gets the managed_by of this KubeflowOrgV1RunPolicy. # noqa: E501 - ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. # noqa: E501 + ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. # noqa: E501 :return: The managed_by of this KubeflowOrgV1RunPolicy. # noqa: E501 :rtype: str @@ -166,7 +166,7 @@ def managed_by(self): def managed_by(self, managed_by): """Sets the managed_by of this KubeflowOrgV1RunPolicy. - ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with a 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. # noqa: E501 + ManagedBy is used to indicate the controller or entity that manages a job. The value must be either an empty, 'kubeflow.org/training-operator' or 'kueue.x-k8s.io/multikueue'. The training-operator reconciles a job which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/training-operator', but delegates reconciling the job with 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. # noqa: E501 :param managed_by: The managed_by of this KubeflowOrgV1RunPolicy. # noqa: E501 :type: str From ada29bcd07cc8bc1edeac2be62c577c9e9528e39 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Wed, 11 Sep 2024 21:05:05 +0200 Subject: [PATCH 19/26] Rework after code review Signed-off-by: Michal Szadkowski --- pkg/common/util/webhooks.go | 6 ++--- pkg/controller.v1/common/job_test.go | 22 +++++++++---------- pkg/util/testutil/constants.go | 1 - .../paddlepaddle/paddlepaddle_webhook_test.go | 9 ++++---- .../pytorch/pytorchjob_webhook_test.go | 9 ++++---- pkg/webhooks/tensorflow/tfjob_webhook_test.go | 9 ++++---- .../xgboost/xgboostjob_webhook_test.go | 9 ++++---- sdk/python/test/e2e/test_e2e_pytorchjob.py | 5 ++++- sdk/python/test/e2e/utils.py | 8 ------- 9 files changed, 38 insertions(+), 40 deletions(-) diff --git a/pkg/common/util/webhooks.go b/pkg/common/util/webhooks.go index 326a3fcabb..65511531ba 100644 --- a/pkg/common/util/webhooks.go +++ b/pkg/common/util/webhooks.go @@ -7,7 +7,7 @@ import ( "k8s.io/apimachinery/pkg/util/validation/field" ) -var SupportedJobControllers = sets.New( +var supportedJobControllers = sets.New( v1.MultiKueueController, v1.KubeflowJobsController) @@ -15,8 +15,8 @@ func ValidateManagedBy(runPolicy *v1.RunPolicy, allErrs field.ErrorList) field.E if runPolicy.ManagedBy != nil { manager := *runPolicy.ManagedBy fieldPath := field.NewPath("spec", "managedBy") - if !SupportedJobControllers.Has(manager) { - allErrs = append(allErrs, field.NotSupported(fieldPath, manager, SupportedJobControllers.UnsortedList())) + if !supportedJobControllers.Has(manager) { + allErrs = append(allErrs, field.NotSupported(fieldPath, manager, supportedJobControllers.UnsortedList())) } } return allErrs diff --git a/pkg/controller.v1/common/job_test.go b/pkg/controller.v1/common/job_test.go index f76d9be7f1..e604c715f0 100644 --- a/pkg/controller.v1/common/job_test.go +++ b/pkg/controller.v1/common/job_test.go @@ -30,6 +30,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes/fake" "k8s.io/client-go/tools/record" + "k8s.io/utils/ptr" ) func TestDeletePodsAndServices(T *testing.T) { @@ -221,33 +222,32 @@ func TestPastActiveDeadline(T *testing.T) { func TestManagedBy(T *testing.T) { cases := map[string]struct { - managedBy string + managedBy *string }{ "managedBy is empty": { - managedBy: "", + managedBy: ptr.To[string](""), }, "managedBy is training-operator controller": { - managedBy: apiv1.KubeflowJobsController, + managedBy: ptr.To[string](apiv1.KubeflowJobsController), }, "managedBy is not the training-operator controller": { - managedBy: "kueue.x-k8s.io/multikueue", + managedBy: ptr.To[string]("kueue.x-k8s.io/multikueue"), }, "managedBy is other value": { - managedBy: "other-job-controller", + managedBy: ptr.To[string]("other-job-controller"), }, } for name, tc := range cases { T.Run(name, func(t *testing.T) { jobController := JobController{} runPolicy := &apiv1.RunPolicy{ - ManagedBy: &tc.managedBy, - } - if tc.managedBy == "" { - runPolicy.ManagedBy = nil + ManagedBy: tc.managedBy, } - if got := jobController.ManagedByExternalController(*runPolicy); got != nil && tc.managedBy != *got { - t.Errorf("Unexpected manager controller: \nwant: %v\ngot: %v\n", tc.managedBy, *got) + if got := jobController.ManagedByExternalController(*runPolicy); got != nil { + if diff := cmp.Diff(tc.managedBy, got); diff != "" { + t.Errorf("Unexpected manager controller (-want +got):\n%s", diff) + } } }) } diff --git a/pkg/util/testutil/constants.go b/pkg/util/testutil/constants.go index d79c075bf3..74e0a11796 100644 --- a/pkg/util/testutil/constants.go +++ b/pkg/util/testutil/constants.go @@ -16,5 +16,4 @@ const ( var ( IgnoreJobConditionsTimes = cmpopts.IgnoreFields(kubeflowv1.JobCondition{}, "LastUpdateTime", "LastTransitionTime") - MalformedManagedBy = "other-job-controller" ) diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go index 3fbf13fcc0..2992cd87e9 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go @@ -28,8 +28,7 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/common/util" - "github.com/kubeflow/training-operator/pkg/util/testutil" + v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) func TestValidateV1PaddleJob(t *testing.T) { @@ -182,13 +181,15 @@ func TestValidateV1PaddleJob(t *testing.T) { }, Spec: trainingoperator.PaddleJobSpec{ RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(testutil.MalformedManagedBy), + ManagedBy: ptr.To("other-job-controller"), }, PaddleReplicaSpecs: validPaddleReplicaSpecs, }, }, wantErr: field.ErrorList{ - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(sets.New( + v1.MultiKueueController, + v1.KubeflowJobsController))), }, }, } diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go index 609780fe7a..f69a65a412 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go @@ -30,8 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/common/util" - "github.com/kubeflow/training-operator/pkg/util/testutil" + v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) func TestValidateV1PyTorchJob(t *testing.T) { @@ -297,13 +296,15 @@ func TestValidateV1PyTorchJob(t *testing.T) { }, Spec: trainingoperator.PyTorchJobSpec{ RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(testutil.MalformedManagedBy), + ManagedBy: ptr.To("other-job-controller"), }, PyTorchReplicaSpecs: validPyTorchReplicaSpecs, }, }, wantErr: field.ErrorList{ - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(sets.New( + v1.MultiKueueController, + v1.KubeflowJobsController))), }, }, } diff --git a/pkg/webhooks/tensorflow/tfjob_webhook_test.go b/pkg/webhooks/tensorflow/tfjob_webhook_test.go index 3197b5e674..a9e9a899c4 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook_test.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook_test.go @@ -28,8 +28,7 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/common/util" - "github.com/kubeflow/training-operator/pkg/util/testutil" + v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) func TestValidateTFJob(t *testing.T) { @@ -193,13 +192,15 @@ func TestValidateTFJob(t *testing.T) { }, Spec: trainingoperator.TFJobSpec{ RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(testutil.MalformedManagedBy), + ManagedBy: ptr.To("other-job-controller"), }, TFReplicaSpecs: validTFReplicaSpecs, }, }, wantErr: field.ErrorList{ - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(sets.New( + v1.MultiKueueController, + v1.KubeflowJobsController))), }, }, } diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go index b9c84c91ec..4187529546 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go @@ -28,8 +28,7 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - "github.com/kubeflow/training-operator/pkg/common/util" - "github.com/kubeflow/training-operator/pkg/util/testutil" + v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) func TestValidateXGBoostJob(t *testing.T) { @@ -244,13 +243,15 @@ func TestValidateXGBoostJob(t *testing.T) { }, Spec: trainingoperator.XGBoostJobSpec{ RunPolicy: trainingoperator.RunPolicy{ - ManagedBy: ptr.To(testutil.MalformedManagedBy), + ManagedBy: ptr.To("other-job-controller"), }, XGBReplicaSpecs: validXGBoostReplicaSpecs, }, }, wantErr: field.ErrorList{ - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(util.SupportedJobControllers)), + field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(sets.New( + v1.MultiKueueController, + v1.KubeflowJobsController))), }, }, } diff --git a/sdk/python/test/e2e/test_e2e_pytorchjob.py b/sdk/python/test/e2e/test_e2e_pytorchjob.py index efe9087609..5800c0f76f 100644 --- a/sdk/python/test/e2e/test_e2e_pytorchjob.py +++ b/sdk/python/test/e2e/test_e2e_pytorchjob.py @@ -229,7 +229,10 @@ def test_sdk_e2e_managed_by(job_namespace): if job._metadata.name == 'kubeflow.org/training-operator': utils.verify_job_e2e(TRAINING_CLIENT, job._metadata.name, job_namespace, wait_timeout=900) if job._metadata.name == 'kueue.x-k8s.io/multikueue': - utils.verify_externally_managed_job_e2e(TRAINING_CLIENT, job._metadata.name, job_namespace) + conditions = TRAINING_CLIENT.get_job_conditions(job._metadata.name, job_namespace, TRAINING_CLIENT.job_kind, job) + if len(conditions) != 0: + raise Exception(f"{TRAINING_CLIENT.job_kind} conditions {conditions} should not be updated, externally managed by {managed_by}") + except Exception as e: utils.print_job_results(TRAINING_CLIENT, JOB_NAME, job_namespace) diff --git a/sdk/python/test/e2e/utils.py b/sdk/python/test/e2e/utils.py index ee09b097de..7a6f81f922 100644 --- a/sdk/python/test/e2e/utils.py +++ b/sdk/python/test/e2e/utils.py @@ -11,14 +11,6 @@ logging.getLogger().setLevel(logging.INFO) -def verify_externally_managed_job_e2e(client: TrainingClient, name: str, namespace: str): - """Verify externally managed Training Job e2e test.""" - job = client.get_job(name, namespace) - conditions = client.get_job_conditions(name, namespace, client.job_kind, job) - if len(conditions) != 0: - raise Exception(f"{client.job_kind} conditions {conditions} should not be updated, externally managed by {managed_by}") - - def verify_unschedulable_job_e2e(client: TrainingClient, name: str, namespace: str): """Verify unschedulable Training Job e2e test.""" logging.info(f"\n\n\n{client.job_kind} is creating") From de5d02b5ed406d31858fe215069ca5d6eccc86a3 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Thu, 12 Sep 2024 09:41:55 +0200 Subject: [PATCH 20/26] Revert kustomization change Signed-off-by: Michal Szadkowski --- manifests/overlays/standalone/kustomization.yaml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/manifests/overlays/standalone/kustomization.yaml b/manifests/overlays/standalone/kustomization.yaml index f055da5dc8..df72e1dc03 100644 --- a/manifests/overlays/standalone/kustomization.yaml +++ b/manifests/overlays/standalone/kustomization.yaml @@ -2,13 +2,12 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization namespace: kubeflow resources: -- ../../base -- namespace.yaml + - ../../base + - namespace.yaml images: -- name: kubeflow/training-operator - newName: kubeflowtraining/training-operator - newTag: test + - name: kubeflow/training-operator + newTag: "latest" secretGenerator: -- name: training-operator-webhook-cert - options: - disableNameSuffixHash: true + - name: training-operator-webhook-cert + options: + disableNameSuffixHash: true From 7d3bb891541ad553dbd7f66be17f2e9434445e39 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Thu, 12 Sep 2024 11:31:46 +0200 Subject: [PATCH 21/26] Update job_test and logging Signed-off-by: Michal Szadkowski --- pkg/common/util/webhooks.go | 2 +- pkg/controller.v1/common/job_test.go | 24 ++++++++++++++----- pkg/controller.v1/mpi/mpijob_controller.go | 2 +- .../paddlepaddle/paddlepaddle_controller.go | 2 +- .../pytorch/pytorchjob_controller.go | 2 +- .../tensorflow/tfjob_controller.go | 2 +- .../xgboost/xgboostjob_controller.go | 2 +- 7 files changed, 24 insertions(+), 12 deletions(-) diff --git a/pkg/common/util/webhooks.go b/pkg/common/util/webhooks.go index 65511531ba..ea3529d009 100644 --- a/pkg/common/util/webhooks.go +++ b/pkg/common/util/webhooks.go @@ -14,8 +14,8 @@ var supportedJobControllers = sets.New( func ValidateManagedBy(runPolicy *v1.RunPolicy, allErrs field.ErrorList) field.ErrorList { if runPolicy.ManagedBy != nil { manager := *runPolicy.ManagedBy - fieldPath := field.NewPath("spec", "managedBy") if !supportedJobControllers.Has(manager) { + fieldPath := field.NewPath("spec", "managedBy") allErrs = append(allErrs, field.NotSupported(fieldPath, manager, supportedJobControllers.UnsortedList())) } } diff --git a/pkg/controller.v1/common/job_test.go b/pkg/controller.v1/common/job_test.go index e604c715f0..cdd34fdb19 100644 --- a/pkg/controller.v1/common/job_test.go +++ b/pkg/controller.v1/common/job_test.go @@ -220,21 +220,30 @@ func TestPastActiveDeadline(T *testing.T) { } } -func TestManagedBy(T *testing.T) { +func TestManagedByExternalController(T *testing.T) { cases := map[string]struct { - managedBy *string + managedBy *string + wantResult bool }{ + "managedBy is nil": { + managedBy: nil, + wantResult: false, + }, "managedBy is empty": { - managedBy: ptr.To[string](""), + managedBy: ptr.To[string](""), + wantResult: true, }, "managedBy is training-operator controller": { - managedBy: ptr.To[string](apiv1.KubeflowJobsController), + managedBy: ptr.To[string](apiv1.KubeflowJobsController), + wantResult: false, }, "managedBy is not the training-operator controller": { - managedBy: ptr.To[string]("kueue.x-k8s.io/multikueue"), + managedBy: ptr.To[string]("kueue.x-k8s.io/multikueue"), + wantResult: true, }, "managedBy is other value": { - managedBy: ptr.To[string]("other-job-controller"), + managedBy: ptr.To[string]("other-job-controller"), + wantResult: true, }, } for name, tc := range cases { @@ -245,6 +254,9 @@ func TestManagedBy(T *testing.T) { } if got := jobController.ManagedByExternalController(*runPolicy); got != nil { + if !tc.wantResult { + t.Errorf("Unwanted manager controller: %s\n", *got) + } if diff := cmp.Diff(tc.managedBy, got); diff != "" { t.Errorf("Unexpected manager controller (-want +got):\n%s", diff) } diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index a3c1859108..10682d0981 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -136,7 +136,7 @@ func (jc *MPIJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct } if manager := jc.ManagedByExternalController(mpijob.Spec.RunPolicy); manager != nil { - logger.Info("Skipping MPIJob managed by a different controller", "managed-by", manager) + logger.Info("Skipping MPIJob managed by a custom controller", "managed-by", manager) return ctrl.Result{}, nil } diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go index a9884d4442..1f3cf2230f 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go @@ -132,7 +132,7 @@ func (r *PaddleJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( } if manager := r.ManagedByExternalController(paddlejob.Spec.RunPolicy); manager != nil { - logger.Info("Skipping PaddleJob managed by a different controller", "managed-by", manager) + logger.Info("Skipping PaddleJob managed by a custom controller", "managed-by", manager) return ctrl.Result{}, nil } diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go index f8f2f4d0b5..9af3e4527d 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go @@ -133,7 +133,7 @@ func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) } if manager := r.ManagedByExternalController(pytorchjob.Spec.RunPolicy); manager != nil { - logger.Info("Skipping PyTorchJob managed by a different controller", "managed-by", manager) + logger.Info("Skipping PyTorchJob managed by a custom controller", "managed-by", manager) return ctrl.Result{}, nil } diff --git a/pkg/controller.v1/tensorflow/tfjob_controller.go b/pkg/controller.v1/tensorflow/tfjob_controller.go index 3095d7a57a..b14c33f36d 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller.go @@ -128,7 +128,7 @@ func (r *TFJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl } if manager := r.ManagedByExternalController(tfjob.Spec.RunPolicy); manager != nil { - logger.Info("Skipping TFJob managed by a different controller", "managed-by", manager) + logger.Info("Skipping TFJob managed by a custom controller", "managed-by", manager) return ctrl.Result{}, nil } diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller.go b/pkg/controller.v1/xgboost/xgboostjob_controller.go index 3b051ddc7d..da3889ce02 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller.go @@ -131,7 +131,7 @@ func (r *XGBoostJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) } if manager := r.ManagedByExternalController(xgboostjob.Spec.RunPolicy); manager != nil { - logger.Info("Skipping XGBoostJob managed by a different controller", "managed-by", manager) + logger.Info("Skipping XGBoostJob managed by a custom controller", "managed-by", manager) return ctrl.Result{}, nil } From 353d5056786230dfad7080092fcbb32491a7a79f Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Thu, 12 Sep 2024 14:24:17 +0200 Subject: [PATCH 22/26] Provide immutability check for ManagedBy Signed-off-by: Michal Szadkowski --- pkg/common/util/webhooks.go | 22 +++++++--- .../paddlepaddle/paddlepaddle_webhook.go | 31 ++++++++----- .../paddlepaddle/paddlepaddle_webhook_test.go | 9 ++-- pkg/webhooks/pytorch/pytorchjob_webhook.go | 30 ++++++++----- .../pytorch/pytorchjob_webhook_test.go | 44 +++++++++++++++---- pkg/webhooks/tensorflow/tfjob_webhook.go | 30 ++++++++----- pkg/webhooks/tensorflow/tfjob_webhook_test.go | 9 ++-- pkg/webhooks/xgboost/xgboostjob_webhook.go | 31 ++++++++----- .../xgboost/xgboostjob_webhook_test.go | 9 ++-- 9 files changed, 145 insertions(+), 70 deletions(-) diff --git a/pkg/common/util/webhooks.go b/pkg/common/util/webhooks.go index ea3529d009..a99f08eb95 100644 --- a/pkg/common/util/webhooks.go +++ b/pkg/common/util/webhooks.go @@ -3,6 +3,7 @@ package util import ( v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" + apivalidation "k8s.io/apimachinery/pkg/api/validation" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation/field" ) @@ -11,13 +12,22 @@ var supportedJobControllers = sets.New( v1.MultiKueueController, v1.KubeflowJobsController) -func ValidateManagedBy(runPolicy *v1.RunPolicy, allErrs field.ErrorList) field.ErrorList { - if runPolicy.ManagedBy != nil { - manager := *runPolicy.ManagedBy +func ValidateManagedBy(oldRunPolicy *v1.RunPolicy, newRunPolicy *v1.RunPolicy) field.ErrorList { + errs := field.ErrorList{} + // Validate immutability + if oldRunPolicy != nil && newRunPolicy != nil { + oldManager := oldRunPolicy.ManagedBy + newManager := newRunPolicy.ManagedBy + fieldPath := field.NewPath("spec", "runPolicy", "managedBy") + errs = apivalidation.ValidateImmutableField(newManager, oldManager, fieldPath) + } + // Validate the value + if newRunPolicy != nil && newRunPolicy.ManagedBy != nil { + manager := *newRunPolicy.ManagedBy if !supportedJobControllers.Has(manager) { - fieldPath := field.NewPath("spec", "managedBy") - allErrs = append(allErrs, field.NotSupported(fieldPath, manager, supportedJobControllers.UnsortedList())) + fieldPath := field.NewPath("spec", "runPolicy", "managedBy") + errs = append(errs, field.NotSupported(fieldPath, manager, supportedJobControllers.UnsortedList())) } } - return allErrs + return errs } diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go index 264abe6675..6817cd6da2 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go @@ -56,30 +56,41 @@ func (w Webhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admiss job := obj.(*trainingoperator.PaddleJob) log := ctrl.LoggerFrom(ctx).WithName("paddlejob-webhook") log.V(5).Info("Validating create", "paddleJob", klog.KObj(job)) - return nil, validatePaddleJob(job).ToAggregate() + return nil, validatePaddleJob(nil, job).ToAggregate() } -func (w Webhook) ValidateUpdate(ctx context.Context, _, newObj runtime.Object) (admission.Warnings, error) { - job := newObj.(*trainingoperator.PaddleJob) +func (w Webhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { + oldJob := oldObj.(*trainingoperator.PaddleJob) + newJob := newObj.(*trainingoperator.PaddleJob) log := ctrl.LoggerFrom(ctx).WithName("paddlejob-webhook") - log.V(5).Info("Validating update", "paddleJob", klog.KObj(job)) - return nil, validatePaddleJob(job).ToAggregate() + log.V(5).Info("Validating update", "paddleJob", klog.KObj(newJob)) + return nil, validatePaddleJob(oldJob, newJob).ToAggregate() } func (w Webhook) ValidateDelete(context.Context, runtime.Object) (admission.Warnings, error) { return nil, nil } -func validatePaddleJob(job *trainingoperator.PaddleJob) field.ErrorList { +func validatePaddleJob(oldJob, newJob *trainingoperator.PaddleJob) field.ErrorList { var allErrs field.ErrorList - if errors := apimachineryvalidation.NameIsDNS1035Label(job.Name, false); len(errors) != 0 { - allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) + if errors := apimachineryvalidation.NameIsDNS1035Label(newJob.Name, false); len(errors) != 0 { + allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), newJob.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } - allErrs = util.ValidateManagedBy(&job.Spec.RunPolicy, allErrs) - allErrs = append(allErrs, validateSpec(job.Spec.PaddleReplicaSpecs)...) + + allErrs = append(allErrs, validateRunPolicy(oldJob, newJob)...) + allErrs = append(allErrs, validateSpec(newJob.Spec.PaddleReplicaSpecs)...) return allErrs } +func validateRunPolicy(oldJob, newJob *trainingoperator.PaddleJob) field.ErrorList { + var oldRunPolicy, newRunPolicy *trainingoperator.RunPolicy = nil, &newJob.Spec.RunPolicy + if oldJob != nil { + oldRunPolicy = &oldJob.Spec.RunPolicy + } + + return util.ValidateManagedBy(oldRunPolicy, newRunPolicy) +} + func validateSpec(rSpecs map[trainingoperator.ReplicaType]*trainingoperator.ReplicaSpec) field.ErrorList { var allErrs field.ErrorList diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go index 2992cd87e9..a59ed7d32c 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go @@ -28,7 +28,6 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) func TestValidateV1PaddleJob(t *testing.T) { @@ -187,15 +186,15 @@ func TestValidateV1PaddleJob(t *testing.T) { }, }, wantErr: field.ErrorList{ - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(sets.New( - v1.MultiKueueController, - v1.KubeflowJobsController))), + field.NotSupported(field.NewPath("spec", "runPolicy", "managedBy"), "", sets.List(sets.New( + trainingoperator.MultiKueueController, + trainingoperator.KubeflowJobsController))), }, }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { - got := validatePaddleJob(tc.paddleJob) + got := validatePaddleJob(nil, tc.paddleJob) if diff := cmp.Diff(tc.wantErr, got, cmpopts.IgnoreFields(field.Error{}, "Detail", "BadValue")); len(diff) != 0 { t.Errorf("Unexpected error (-want,+got):\n%s", diff) } diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook.go b/pkg/webhooks/pytorch/pytorchjob_webhook.go index c5793a6003..362df4a4ac 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook.go @@ -56,15 +56,16 @@ func (w *Webhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admis job := obj.(*trainingoperator.PyTorchJob) log := ctrl.LoggerFrom(ctx).WithName("pytorchjob-webhook") log.V(5).Info("Validating create", "pytorchJob", klog.KObj(job)) - warnings, errs := validatePyTorchJob(job) + warnings, errs := validatePyTorchJob(nil, job) return warnings, errs.ToAggregate() } -func (w *Webhook) ValidateUpdate(ctx context.Context, _ runtime.Object, newObj runtime.Object) (admission.Warnings, error) { - job := newObj.(*trainingoperator.PyTorchJob) +func (w *Webhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { + oldJob := newObj.(*trainingoperator.PyTorchJob) + newJob := newObj.(*trainingoperator.PyTorchJob) log := ctrl.LoggerFrom(ctx).WithName("pytorchjob-webhook") - log.V(5).Info("Validating update", "pytorchJob", klog.KObj(job)) - warnings, errs := validatePyTorchJob(job) + log.V(5).Info("Validating update", "pytorchJob", klog.KObj(newJob)) + warnings, errs := validatePyTorchJob(oldJob, newJob) return warnings, errs.ToAggregate() } @@ -72,20 +73,29 @@ func (w *Webhook) ValidateDelete(context.Context, runtime.Object) (admission.War return nil, nil } -func validatePyTorchJob(job *trainingoperator.PyTorchJob) (admission.Warnings, field.ErrorList) { +func validatePyTorchJob(oldJob, newJob *trainingoperator.PyTorchJob) (admission.Warnings, field.ErrorList) { var allErrs field.ErrorList var warnings admission.Warnings - if errors := apimachineryvalidation.NameIsDNS1035Label(job.ObjectMeta.Name, false); len(errors) != 0 { - allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) + if errors := apimachineryvalidation.NameIsDNS1035Label(newJob.ObjectMeta.Name, false); len(errors) != 0 { + allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), newJob.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } - allErrs = util.ValidateManagedBy(&job.Spec.RunPolicy, allErrs) - ws, err := validateSpec(job.Spec) + allErrs = append(allErrs, validateRunPolicy(oldJob, newJob)...) + ws, err := validateSpec(newJob.Spec) warnings = append(warnings, ws...) allErrs = append(allErrs, err...) return warnings, allErrs } +func validateRunPolicy(oldJob, newJob *trainingoperator.PyTorchJob) field.ErrorList { + var oldRunPolicy, newRunPolicy *trainingoperator.RunPolicy = nil, &newJob.Spec.RunPolicy + if oldJob != nil { + oldRunPolicy = &oldJob.Spec.RunPolicy + } + + return util.ValidateManagedBy(oldRunPolicy, newRunPolicy) +} + func validateSpec(spec trainingoperator.PyTorchJobSpec) (admission.Warnings, field.ErrorList) { var allErrs field.ErrorList var warnings admission.Warnings diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go index f69a65a412..64ebc03251 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go @@ -23,6 +23,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" corev1 "k8s.io/api/core/v1" + apivalidation "k8s.io/apimachinery/pkg/api/validation" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation/field" @@ -30,7 +31,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/webhook/admission" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) func TestValidateV1PyTorchJob(t *testing.T) { @@ -74,9 +74,10 @@ func TestValidateV1PyTorchJob(t *testing.T) { } testCases := map[string]struct { - pytorchJob *trainingoperator.PyTorchJob - wantErr field.ErrorList - wantWarnings admission.Warnings + pytorchJob *trainingoperator.PyTorchJob + oldPytorchJob *trainingoperator.PyTorchJob + wantErr field.ErrorList + wantWarnings admission.Warnings }{ "valid PyTorchJob": { pytorchJob: &trainingoperator.PyTorchJob{ @@ -302,16 +303,43 @@ func TestValidateV1PyTorchJob(t *testing.T) { }, }, wantErr: field.ErrorList{ - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(sets.New( - v1.MultiKueueController, - v1.KubeflowJobsController))), + field.NotSupported(field.NewPath("spec", "runPolicy", "managedBy"), "", sets.List(sets.New( + trainingoperator.MultiKueueController, + trainingoperator.KubeflowJobsController))), + }, + }, + "managedBy field becomes mutable": { + oldPytorchJob: &trainingoperator.PyTorchJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: trainingoperator.PyTorchJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(trainingoperator.KubeflowJobsController), + }, + PyTorchReplicaSpecs: validPyTorchReplicaSpecs, + }, + }, + pytorchJob: &trainingoperator.PyTorchJob{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test", + }, + Spec: trainingoperator.PyTorchJobSpec{ + RunPolicy: trainingoperator.RunPolicy{ + ManagedBy: ptr.To(trainingoperator.MultiKueueController), + }, + PyTorchReplicaSpecs: validPyTorchReplicaSpecs, + }, + }, + wantErr: field.ErrorList{ + field.Invalid(field.NewPath("spec", "runPolicy", "managedBy"), trainingoperator.MultiKueueController, apivalidation.FieldImmutableErrorMsg), }, }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { - gotWarnings, gotError := validatePyTorchJob(tc.pytorchJob) + gotWarnings, gotError := validatePyTorchJob(tc.oldPytorchJob, tc.pytorchJob) if diff := cmp.Diff(tc.wantWarnings, gotWarnings, cmpopts.SortSlices(func(a, b string) bool { return a < b })); len(diff) != 0 { t.Errorf("Unexpected warnings (-want,+got):\n%s", diff) } diff --git a/pkg/webhooks/tensorflow/tfjob_webhook.go b/pkg/webhooks/tensorflow/tfjob_webhook.go index b11ade8e5c..d666e8f98e 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook.go @@ -55,30 +55,40 @@ func (w *Webhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admis job := obj.(*trainingoperator.TFJob) log := ctrl.LoggerFrom(ctx).WithName("tfjob-webhook") log.V(5).Info("Validating create", "TFJob", klog.KObj(job)) - return nil, validateTFJob(job).ToAggregate() + return nil, validateTFJob(nil, job).ToAggregate() } -func (w *Webhook) ValidateUpdate(ctx context.Context, _, newObj runtime.Object) (admission.Warnings, error) { - job := newObj.(*trainingoperator.TFJob) +func (w *Webhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { + oldJob := oldObj.(*trainingoperator.TFJob) + newJob := newObj.(*trainingoperator.TFJob) log := ctrl.LoggerFrom(ctx).WithName("tfjob-webhook") - log.V(5).Info("Validating update", "NewTFJob", klog.KObj(job)) - return nil, validateTFJob(job).ToAggregate() + log.V(5).Info("Validating update", "NewTFJob", klog.KObj(newJob)) + return nil, validateTFJob(oldJob, newJob).ToAggregate() } func (w *Webhook) ValidateDelete(context.Context, runtime.Object) (admission.Warnings, error) { return nil, nil } -func validateTFJob(job *trainingoperator.TFJob) field.ErrorList { +func validateTFJob(oldJob, newJob *trainingoperator.TFJob) field.ErrorList { var allErrs field.ErrorList - if errors := apimachineryvalidation.NameIsDNS1035Label(job.Name, false); len(errors) != 0 { - allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) + if errors := apimachineryvalidation.NameIsDNS1035Label(newJob.Name, false); len(errors) != 0 { + allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), newJob.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } - allErrs = util.ValidateManagedBy(&job.Spec.RunPolicy, allErrs) - allErrs = append(allErrs, validateSpec(job.Spec)...) + allErrs = append(allErrs, validateRunPolicy(oldJob, newJob)...) + allErrs = append(allErrs, validateSpec(newJob.Spec)...) return allErrs } +func validateRunPolicy(oldJob, newJob *trainingoperator.TFJob) field.ErrorList { + var oldRunPolicy, newRunPolicy *trainingoperator.RunPolicy = nil, &newJob.Spec.RunPolicy + if oldJob != nil { + oldRunPolicy = &oldJob.Spec.RunPolicy + } + + return util.ValidateManagedBy(oldRunPolicy, newRunPolicy) +} + func validateSpec(spec trainingoperator.TFJobSpec) field.ErrorList { return validateTFReplicaSpecs(spec.TFReplicaSpecs) } diff --git a/pkg/webhooks/tensorflow/tfjob_webhook_test.go b/pkg/webhooks/tensorflow/tfjob_webhook_test.go index a9e9a899c4..a666ddf53f 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook_test.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook_test.go @@ -28,7 +28,6 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) func TestValidateTFJob(t *testing.T) { @@ -198,15 +197,15 @@ func TestValidateTFJob(t *testing.T) { }, }, wantErr: field.ErrorList{ - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(sets.New( - v1.MultiKueueController, - v1.KubeflowJobsController))), + field.NotSupported(field.NewPath("spec", "runPolicy", "managedBy"), "", sets.List(sets.New( + trainingoperator.MultiKueueController, + trainingoperator.KubeflowJobsController))), }, }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { - got := validateTFJob(tc.tfJob) + got := validateTFJob(nil, tc.tfJob) if diff := cmp.Diff(tc.wantErr, got, cmpopts.IgnoreFields(field.Error{}, "Detail", "BadValue")); len(diff) != 0 { t.Errorf("Unexpected error (-want,+got):\n%s", diff) } diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook.go b/pkg/webhooks/xgboost/xgboostjob_webhook.go index a02d98dd6f..eb95949149 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook.go @@ -56,31 +56,40 @@ func (w *Webhook) ValidateCreate(ctx context.Context, obj runtime.Object) (admis job := obj.(*trainingoperator.XGBoostJob) log := ctrl.LoggerFrom(ctx).WithName("xgboostjob-webhook") log.V(5).Info("Validating create", "xgboostJob", klog.KObj(job)) - return nil, validateXGBoostJob(job).ToAggregate() + return nil, validateXGBoostJob(nil, job).ToAggregate() } -func (w *Webhook) ValidateUpdate(ctx context.Context, _, newObj runtime.Object) (admission.Warnings, error) { - job := newObj.(*trainingoperator.XGBoostJob) +func (w *Webhook) ValidateUpdate(ctx context.Context, oldObj, newObj runtime.Object) (admission.Warnings, error) { + oldJob := oldObj.(*trainingoperator.XGBoostJob) + newJob := newObj.(*trainingoperator.XGBoostJob) log := ctrl.LoggerFrom(ctx).WithName("xgboostjob-webhook") - log.V(5).Info("Validating create", "xgboostJob", klog.KObj(job)) - return nil, validateXGBoostJob(job).ToAggregate() + log.V(5).Info("Validating create", "xgboostJob", klog.KObj(newJob)) + return nil, validateXGBoostJob(oldJob, newJob).ToAggregate() } func (w *Webhook) ValidateDelete(context.Context, runtime.Object) (admission.Warnings, error) { return nil, nil } -func validateXGBoostJob(job *trainingoperator.XGBoostJob) field.ErrorList { +func validateXGBoostJob(oldJob, newJob *trainingoperator.XGBoostJob) field.ErrorList { var allErrs field.ErrorList - - if errors := apimachineryvalidation.NameIsDNS1035Label(job.Name, false); len(errors) != 0 { - allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), job.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) + if errors := apimachineryvalidation.NameIsDNS1035Label(newJob.Name, false); len(errors) != 0 { + allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), newJob.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } - allErrs = util.ValidateManagedBy(&job.Spec.RunPolicy, allErrs) - allErrs = append(allErrs, validateSpec(job.Spec)...) + allErrs = append(allErrs, validateRunPolicy(oldJob, newJob)...) + allErrs = append(allErrs, validateSpec(newJob.Spec)...) return allErrs } +func validateRunPolicy(oldJob, newJob *trainingoperator.XGBoostJob) field.ErrorList { + var oldRunPolicy, newRunPolicy *trainingoperator.RunPolicy = nil, &newJob.Spec.RunPolicy + if oldJob != nil { + oldRunPolicy = &oldJob.Spec.RunPolicy + } + + return util.ValidateManagedBy(oldRunPolicy, newRunPolicy) +} + func validateSpec(spec trainingoperator.XGBoostJobSpec) field.ErrorList { return validateXGBReplicaSpecs(spec.XGBReplicaSpecs) } diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go index 4187529546..c8b7ffd811 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go @@ -28,7 +28,6 @@ import ( "k8s.io/utils/ptr" trainingoperator "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" - v1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" ) func TestValidateXGBoostJob(t *testing.T) { @@ -249,15 +248,15 @@ func TestValidateXGBoostJob(t *testing.T) { }, }, wantErr: field.ErrorList{ - field.NotSupported(field.NewPath("spec").Child("managedBy"), "", sets.List(sets.New( - v1.MultiKueueController, - v1.KubeflowJobsController))), + field.NotSupported(field.NewPath("spec", "runPolicy", "managedBy"), "", sets.List(sets.New( + trainingoperator.MultiKueueController, + trainingoperator.KubeflowJobsController))), }, }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { - got := validateXGBoostJob(tc.xgboostJob) + got := validateXGBoostJob(nil, tc.xgboostJob) if diff := cmp.Diff(tc.wantErr, got, cmpopts.IgnoreFields(field.Error{}, "Detail", "BadValue")); len(diff) != 0 { t.Errorf("Unexpected errors (-want,+got):\n%s", diff) } From 80576f64c74f7ead1901bd98272e34ec17f49bc0 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Thu, 12 Sep 2024 15:39:00 +0200 Subject: [PATCH 23/26] Avoid making copy of runPolicy Signed-off-by: Michal Szadkowski --- pkg/controller.v1/common/job.go | 4 ++-- pkg/controller.v1/common/job_test.go | 2 +- pkg/controller.v1/mpi/mpijob_controller.go | 2 +- pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go | 2 +- pkg/controller.v1/pytorch/pytorchjob_controller.go | 2 +- pkg/controller.v1/tensorflow/tfjob_controller.go | 2 +- pkg/controller.v1/xgboost/xgboostjob_controller.go | 2 +- pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go | 2 +- pkg/webhooks/pytorch/pytorchjob_webhook_test.go | 4 ++-- pkg/webhooks/tensorflow/tfjob_webhook_test.go | 2 +- pkg/webhooks/xgboost/xgboostjob_webhook_test.go | 2 +- 11 files changed, 13 insertions(+), 13 deletions(-) diff --git a/pkg/controller.v1/common/job.go b/pkg/controller.v1/common/job.go index da67e8a6cb..ae53b4a9b7 100644 --- a/pkg/controller.v1/common/job.go +++ b/pkg/controller.v1/common/job.go @@ -456,8 +456,8 @@ func (jc *JobController) calcPGMinResources(minMember int32, replicas map[apiv1. return CalcPGMinResources(minMember, replicas, jc.PriorityClassLister.Get) } -func (jc *JobController) ManagedByExternalController(rp apiv1.RunPolicy) *string { - if controllerName := rp.ManagedBy; controllerName != nil && *controllerName != apiv1.KubeflowJobsController { +func (jc *JobController) ManagedByExternalController(controllerName *string) *string { + if controllerName != nil && *controllerName != apiv1.KubeflowJobsController { return controllerName } return nil diff --git a/pkg/controller.v1/common/job_test.go b/pkg/controller.v1/common/job_test.go index cdd34fdb19..712ff6028c 100644 --- a/pkg/controller.v1/common/job_test.go +++ b/pkg/controller.v1/common/job_test.go @@ -253,7 +253,7 @@ func TestManagedByExternalController(T *testing.T) { ManagedBy: tc.managedBy, } - if got := jobController.ManagedByExternalController(*runPolicy); got != nil { + if got := jobController.ManagedByExternalController(runPolicy.ManagedBy); got != nil { if !tc.wantResult { t.Errorf("Unwanted manager controller: %s\n", *got) } diff --git a/pkg/controller.v1/mpi/mpijob_controller.go b/pkg/controller.v1/mpi/mpijob_controller.go index 10682d0981..e59fd15d66 100644 --- a/pkg/controller.v1/mpi/mpijob_controller.go +++ b/pkg/controller.v1/mpi/mpijob_controller.go @@ -135,7 +135,7 @@ func (jc *MPIJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, client.IgnoreNotFound(err) } - if manager := jc.ManagedByExternalController(mpijob.Spec.RunPolicy); manager != nil { + if manager := jc.ManagedByExternalController(mpijob.Spec.RunPolicy.ManagedBy); manager != nil { logger.Info("Skipping MPIJob managed by a custom controller", "managed-by", manager) return ctrl.Result{}, nil } diff --git a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go index 1f3cf2230f..b453f0eb8e 100644 --- a/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go +++ b/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go @@ -131,7 +131,7 @@ func (r *PaddleJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, client.IgnoreNotFound(err) } - if manager := r.ManagedByExternalController(paddlejob.Spec.RunPolicy); manager != nil { + if manager := r.ManagedByExternalController(paddlejob.Spec.RunPolicy.ManagedBy); manager != nil { logger.Info("Skipping PaddleJob managed by a custom controller", "managed-by", manager) return ctrl.Result{}, nil } diff --git a/pkg/controller.v1/pytorch/pytorchjob_controller.go b/pkg/controller.v1/pytorch/pytorchjob_controller.go index 9af3e4527d..a5effc5cc5 100644 --- a/pkg/controller.v1/pytorch/pytorchjob_controller.go +++ b/pkg/controller.v1/pytorch/pytorchjob_controller.go @@ -132,7 +132,7 @@ func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, client.IgnoreNotFound(err) } - if manager := r.ManagedByExternalController(pytorchjob.Spec.RunPolicy); manager != nil { + if manager := r.ManagedByExternalController(pytorchjob.Spec.RunPolicy.ManagedBy); manager != nil { logger.Info("Skipping PyTorchJob managed by a custom controller", "managed-by", manager) return ctrl.Result{}, nil } diff --git a/pkg/controller.v1/tensorflow/tfjob_controller.go b/pkg/controller.v1/tensorflow/tfjob_controller.go index b14c33f36d..8447d02f55 100644 --- a/pkg/controller.v1/tensorflow/tfjob_controller.go +++ b/pkg/controller.v1/tensorflow/tfjob_controller.go @@ -127,7 +127,7 @@ func (r *TFJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl return ctrl.Result{}, client.IgnoreNotFound(err) } - if manager := r.ManagedByExternalController(tfjob.Spec.RunPolicy); manager != nil { + if manager := r.ManagedByExternalController(tfjob.Spec.RunPolicy.ManagedBy); manager != nil { logger.Info("Skipping TFJob managed by a custom controller", "managed-by", manager) return ctrl.Result{}, nil } diff --git a/pkg/controller.v1/xgboost/xgboostjob_controller.go b/pkg/controller.v1/xgboost/xgboostjob_controller.go index da3889ce02..4733515bbe 100644 --- a/pkg/controller.v1/xgboost/xgboostjob_controller.go +++ b/pkg/controller.v1/xgboost/xgboostjob_controller.go @@ -130,7 +130,7 @@ func (r *XGBoostJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, client.IgnoreNotFound(err) } - if manager := r.ManagedByExternalController(xgboostjob.Spec.RunPolicy); manager != nil { + if manager := r.ManagedByExternalController(xgboostjob.Spec.RunPolicy.ManagedBy); manager != nil { logger.Info("Skipping XGBoostJob managed by a custom controller", "managed-by", manager) return ctrl.Result{}, nil } diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go index a59ed7d32c..10cc0ea053 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook_test.go @@ -173,7 +173,7 @@ func TestValidateV1PaddleJob(t *testing.T) { field.Required(paddleReplicaSpecPath, ""), }, }, - "unsupported managedBy controller name": { + "attempt to set unsupported managedBy controller name gets rejected": { paddleJob: &trainingoperator.PaddleJob{ ObjectMeta: metav1.ObjectMeta{ Name: "test", diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go index 64ebc03251..7757e36b3e 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook_test.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook_test.go @@ -290,7 +290,7 @@ func TestValidateV1PyTorchJob(t *testing.T) { specPath.Child("elasticPolicy").Child("nProcPerNode"), specPath.Child("nprocPerNode")), }, }, - "unsupported managedBy controller name": { + "attempt to set unsupported managedBy controller name gets rejected": { pytorchJob: &trainingoperator.PyTorchJob{ ObjectMeta: metav1.ObjectMeta{ Name: "test", @@ -308,7 +308,7 @@ func TestValidateV1PyTorchJob(t *testing.T) { trainingoperator.KubeflowJobsController))), }, }, - "managedBy field becomes mutable": { + "attempt to update the managedBy field gets rejected": { oldPytorchJob: &trainingoperator.PyTorchJob{ ObjectMeta: metav1.ObjectMeta{ Name: "test", diff --git a/pkg/webhooks/tensorflow/tfjob_webhook_test.go b/pkg/webhooks/tensorflow/tfjob_webhook_test.go index a666ddf53f..759cc1b58b 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook_test.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook_test.go @@ -184,7 +184,7 @@ func TestValidateTFJob(t *testing.T) { field.Forbidden(tfReplicaSpecPath, ""), }, }, - "unsupported managedBy controller name": { + "attempt to set unsupported managedBy controller name gets rejected": { tfJob: &trainingoperator.TFJob{ ObjectMeta: metav1.ObjectMeta{ Name: "test", diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go index c8b7ffd811..3c1d410598 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook_test.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook_test.go @@ -235,7 +235,7 @@ func TestValidateXGBoostJob(t *testing.T) { field.Required(xgbReplicaSpecPath.Key(string(trainingoperator.XGBoostJobReplicaTypeMaster)), ""), }, }, - "unsupported managedBy controller name": { + "attempt to set unsupported managedBy controller name gets rejected": { xgboostJob: &trainingoperator.XGBoostJob{ ObjectMeta: metav1.ObjectMeta{ Name: "test", From 18a531333ac6f5d182190bb6ae60342169678aba Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 13 Sep 2024 12:50:48 +0200 Subject: [PATCH 24/26] Split RunPolicy validators to Update and Create Signed-off-by: Michal Szadkowski --- pkg/common/util/webhooks.go | 21 +++++++++---------- .../paddlepaddle/paddlepaddle_webhook.go | 15 +++++-------- pkg/webhooks/pytorch/pytorchjob_webhook.go | 15 +++++-------- pkg/webhooks/tensorflow/tfjob_webhook.go | 15 +++++-------- pkg/webhooks/xgboost/xgboostjob_webhook.go | 15 +++++-------- 5 files changed, 30 insertions(+), 51 deletions(-) diff --git a/pkg/common/util/webhooks.go b/pkg/common/util/webhooks.go index a99f08eb95..46693239bb 100644 --- a/pkg/common/util/webhooks.go +++ b/pkg/common/util/webhooks.go @@ -12,18 +12,10 @@ var supportedJobControllers = sets.New( v1.MultiKueueController, v1.KubeflowJobsController) -func ValidateManagedBy(oldRunPolicy *v1.RunPolicy, newRunPolicy *v1.RunPolicy) field.ErrorList { +func ValidateRunPolicyCreate(runPolicy *v1.RunPolicy) field.ErrorList { errs := field.ErrorList{} - // Validate immutability - if oldRunPolicy != nil && newRunPolicy != nil { - oldManager := oldRunPolicy.ManagedBy - newManager := newRunPolicy.ManagedBy - fieldPath := field.NewPath("spec", "runPolicy", "managedBy") - errs = apivalidation.ValidateImmutableField(newManager, oldManager, fieldPath) - } - // Validate the value - if newRunPolicy != nil && newRunPolicy.ManagedBy != nil { - manager := *newRunPolicy.ManagedBy + if runPolicy.ManagedBy != nil { + manager := *runPolicy.ManagedBy if !supportedJobControllers.Has(manager) { fieldPath := field.NewPath("spec", "runPolicy", "managedBy") errs = append(errs, field.NotSupported(fieldPath, manager, supportedJobControllers.UnsortedList())) @@ -31,3 +23,10 @@ func ValidateManagedBy(oldRunPolicy *v1.RunPolicy, newRunPolicy *v1.RunPolicy) f } return errs } + +func ValidateRunPolicyUpdate(oldRunPolicy, newRunPolicy *v1.RunPolicy) field.ErrorList { + oldManager := oldRunPolicy.ManagedBy + newManager := newRunPolicy.ManagedBy + fieldPath := field.NewPath("spec", "runPolicy", "managedBy") + return apivalidation.ValidateImmutableField(newManager, oldManager, fieldPath) +} diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go index 6817cd6da2..8f584003e9 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go @@ -77,18 +77,13 @@ func validatePaddleJob(oldJob, newJob *trainingoperator.PaddleJob) field.ErrorLi allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), newJob.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } - allErrs = append(allErrs, validateRunPolicy(oldJob, newJob)...) - allErrs = append(allErrs, validateSpec(newJob.Spec.PaddleReplicaSpecs)...) - return allErrs -} - -func validateRunPolicy(oldJob, newJob *trainingoperator.PaddleJob) field.ErrorList { - var oldRunPolicy, newRunPolicy *trainingoperator.RunPolicy = nil, &newJob.Spec.RunPolicy if oldJob != nil { - oldRunPolicy = &oldJob.Spec.RunPolicy + allErrs = append(allErrs, util.ValidateRunPolicyUpdate(&oldJob.Spec.RunPolicy, &newJob.Spec.RunPolicy)...) + } else { + allErrs = append(allErrs, util.ValidateRunPolicyCreate(&newJob.Spec.RunPolicy)...) } - - return util.ValidateManagedBy(oldRunPolicy, newRunPolicy) + allErrs = append(allErrs, validateSpec(newJob.Spec.PaddleReplicaSpecs)...) + return allErrs } func validateSpec(rSpecs map[trainingoperator.ReplicaType]*trainingoperator.ReplicaSpec) field.ErrorList { diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook.go b/pkg/webhooks/pytorch/pytorchjob_webhook.go index 362df4a4ac..e8511ff18b 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook.go @@ -80,22 +80,17 @@ func validatePyTorchJob(oldJob, newJob *trainingoperator.PyTorchJob) (admission. if errors := apimachineryvalidation.NameIsDNS1035Label(newJob.ObjectMeta.Name, false); len(errors) != 0 { allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), newJob.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } - allErrs = append(allErrs, validateRunPolicy(oldJob, newJob)...) + if oldJob != nil { + allErrs = append(allErrs, util.ValidateRunPolicyUpdate(&oldJob.Spec.RunPolicy, &newJob.Spec.RunPolicy)...) + } else { + allErrs = append(allErrs, util.ValidateRunPolicyCreate(&newJob.Spec.RunPolicy)...) + } ws, err := validateSpec(newJob.Spec) warnings = append(warnings, ws...) allErrs = append(allErrs, err...) return warnings, allErrs } -func validateRunPolicy(oldJob, newJob *trainingoperator.PyTorchJob) field.ErrorList { - var oldRunPolicy, newRunPolicy *trainingoperator.RunPolicy = nil, &newJob.Spec.RunPolicy - if oldJob != nil { - oldRunPolicy = &oldJob.Spec.RunPolicy - } - - return util.ValidateManagedBy(oldRunPolicy, newRunPolicy) -} - func validateSpec(spec trainingoperator.PyTorchJobSpec) (admission.Warnings, field.ErrorList) { var allErrs field.ErrorList var warnings admission.Warnings diff --git a/pkg/webhooks/tensorflow/tfjob_webhook.go b/pkg/webhooks/tensorflow/tfjob_webhook.go index d666e8f98e..499ff986cd 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook.go @@ -75,18 +75,13 @@ func validateTFJob(oldJob, newJob *trainingoperator.TFJob) field.ErrorList { if errors := apimachineryvalidation.NameIsDNS1035Label(newJob.Name, false); len(errors) != 0 { allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), newJob.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } - allErrs = append(allErrs, validateRunPolicy(oldJob, newJob)...) - allErrs = append(allErrs, validateSpec(newJob.Spec)...) - return allErrs -} - -func validateRunPolicy(oldJob, newJob *trainingoperator.TFJob) field.ErrorList { - var oldRunPolicy, newRunPolicy *trainingoperator.RunPolicy = nil, &newJob.Spec.RunPolicy if oldJob != nil { - oldRunPolicy = &oldJob.Spec.RunPolicy + allErrs = append(allErrs, util.ValidateRunPolicyUpdate(&oldJob.Spec.RunPolicy, &newJob.Spec.RunPolicy)...) + } else { + allErrs = append(allErrs, util.ValidateRunPolicyCreate(&newJob.Spec.RunPolicy)...) } - - return util.ValidateManagedBy(oldRunPolicy, newRunPolicy) + allErrs = append(allErrs, validateSpec(newJob.Spec)...) + return allErrs } func validateSpec(spec trainingoperator.TFJobSpec) field.ErrorList { diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook.go b/pkg/webhooks/xgboost/xgboostjob_webhook.go index eb95949149..a411a4c87c 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook.go @@ -76,18 +76,13 @@ func validateXGBoostJob(oldJob, newJob *trainingoperator.XGBoostJob) field.Error if errors := apimachineryvalidation.NameIsDNS1035Label(newJob.Name, false); len(errors) != 0 { allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), newJob.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } - allErrs = append(allErrs, validateRunPolicy(oldJob, newJob)...) - allErrs = append(allErrs, validateSpec(newJob.Spec)...) - return allErrs -} - -func validateRunPolicy(oldJob, newJob *trainingoperator.XGBoostJob) field.ErrorList { - var oldRunPolicy, newRunPolicy *trainingoperator.RunPolicy = nil, &newJob.Spec.RunPolicy if oldJob != nil { - oldRunPolicy = &oldJob.Spec.RunPolicy + allErrs = append(allErrs, util.ValidateRunPolicyUpdate(&oldJob.Spec.RunPolicy, &newJob.Spec.RunPolicy)...) + } else { + allErrs = append(allErrs, util.ValidateRunPolicyCreate(&newJob.Spec.RunPolicy)...) } - - return util.ValidateManagedBy(oldRunPolicy, newRunPolicy) + allErrs = append(allErrs, validateSpec(newJob.Spec)...) + return allErrs } func validateSpec(spec trainingoperator.XGBoostJobSpec) field.ErrorList { From 2f920a8eb56d427177d1c1128982acc6b739dab4 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 13 Sep 2024 13:14:30 +0200 Subject: [PATCH 25/26] Fix the naming and call validate always Signed-off-by: Michal Szadkowski --- pkg/common/util/webhooks.go | 2 +- pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go | 4 +--- pkg/webhooks/pytorch/pytorchjob_webhook.go | 3 +-- pkg/webhooks/tensorflow/tfjob_webhook.go | 3 +-- pkg/webhooks/xgboost/xgboostjob_webhook.go | 3 +-- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/pkg/common/util/webhooks.go b/pkg/common/util/webhooks.go index 46693239bb..aa4031cffe 100644 --- a/pkg/common/util/webhooks.go +++ b/pkg/common/util/webhooks.go @@ -12,7 +12,7 @@ var supportedJobControllers = sets.New( v1.MultiKueueController, v1.KubeflowJobsController) -func ValidateRunPolicyCreate(runPolicy *v1.RunPolicy) field.ErrorList { +func ValidateRunPolicy(runPolicy *v1.RunPolicy) field.ErrorList { errs := field.ErrorList{} if runPolicy.ManagedBy != nil { manager := *runPolicy.ManagedBy diff --git a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go index 8f584003e9..fedc95b5f7 100644 --- a/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go +++ b/pkg/webhooks/paddlepaddle/paddlepaddle_webhook.go @@ -76,12 +76,10 @@ func validatePaddleJob(oldJob, newJob *trainingoperator.PaddleJob) field.ErrorLi if errors := apimachineryvalidation.NameIsDNS1035Label(newJob.Name, false); len(errors) != 0 { allErrs = append(allErrs, field.Invalid(field.NewPath("metadata").Child("name"), newJob.Name, fmt.Sprintf("should match: %v", strings.Join(errors, ",")))) } - if oldJob != nil { allErrs = append(allErrs, util.ValidateRunPolicyUpdate(&oldJob.Spec.RunPolicy, &newJob.Spec.RunPolicy)...) - } else { - allErrs = append(allErrs, util.ValidateRunPolicyCreate(&newJob.Spec.RunPolicy)...) } + allErrs = append(allErrs, util.ValidateRunPolicy(&newJob.Spec.RunPolicy)...) allErrs = append(allErrs, validateSpec(newJob.Spec.PaddleReplicaSpecs)...) return allErrs } diff --git a/pkg/webhooks/pytorch/pytorchjob_webhook.go b/pkg/webhooks/pytorch/pytorchjob_webhook.go index e8511ff18b..2459815935 100644 --- a/pkg/webhooks/pytorch/pytorchjob_webhook.go +++ b/pkg/webhooks/pytorch/pytorchjob_webhook.go @@ -82,9 +82,8 @@ func validatePyTorchJob(oldJob, newJob *trainingoperator.PyTorchJob) (admission. } if oldJob != nil { allErrs = append(allErrs, util.ValidateRunPolicyUpdate(&oldJob.Spec.RunPolicy, &newJob.Spec.RunPolicy)...) - } else { - allErrs = append(allErrs, util.ValidateRunPolicyCreate(&newJob.Spec.RunPolicy)...) } + allErrs = append(allErrs, util.ValidateRunPolicy(&newJob.Spec.RunPolicy)...) ws, err := validateSpec(newJob.Spec) warnings = append(warnings, ws...) allErrs = append(allErrs, err...) diff --git a/pkg/webhooks/tensorflow/tfjob_webhook.go b/pkg/webhooks/tensorflow/tfjob_webhook.go index 499ff986cd..95f187f44f 100644 --- a/pkg/webhooks/tensorflow/tfjob_webhook.go +++ b/pkg/webhooks/tensorflow/tfjob_webhook.go @@ -77,9 +77,8 @@ func validateTFJob(oldJob, newJob *trainingoperator.TFJob) field.ErrorList { } if oldJob != nil { allErrs = append(allErrs, util.ValidateRunPolicyUpdate(&oldJob.Spec.RunPolicy, &newJob.Spec.RunPolicy)...) - } else { - allErrs = append(allErrs, util.ValidateRunPolicyCreate(&newJob.Spec.RunPolicy)...) } + allErrs = append(allErrs, util.ValidateRunPolicy(&newJob.Spec.RunPolicy)...) allErrs = append(allErrs, validateSpec(newJob.Spec)...) return allErrs } diff --git a/pkg/webhooks/xgboost/xgboostjob_webhook.go b/pkg/webhooks/xgboost/xgboostjob_webhook.go index a411a4c87c..5372317487 100644 --- a/pkg/webhooks/xgboost/xgboostjob_webhook.go +++ b/pkg/webhooks/xgboost/xgboostjob_webhook.go @@ -78,9 +78,8 @@ func validateXGBoostJob(oldJob, newJob *trainingoperator.XGBoostJob) field.Error } if oldJob != nil { allErrs = append(allErrs, util.ValidateRunPolicyUpdate(&oldJob.Spec.RunPolicy, &newJob.Spec.RunPolicy)...) - } else { - allErrs = append(allErrs, util.ValidateRunPolicyCreate(&newJob.Spec.RunPolicy)...) } + allErrs = append(allErrs, util.ValidateRunPolicy(&newJob.Spec.RunPolicy)...) allErrs = append(allErrs, validateSpec(newJob.Spec)...) return allErrs } From 8e8a6379d996ebfee1553329f1c8b3fb48758c13 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 13 Sep 2024 14:39:56 +0200 Subject: [PATCH 26/26] Update tests Signed-off-by: Michal Szadkowski --- pkg/controller.v1/common/job_test.go | 34 ++++++++++++---------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/pkg/controller.v1/common/job_test.go b/pkg/controller.v1/common/job_test.go index 712ff6028c..ca948b788b 100644 --- a/pkg/controller.v1/common/job_test.go +++ b/pkg/controller.v1/common/job_test.go @@ -222,28 +222,28 @@ func TestPastActiveDeadline(T *testing.T) { func TestManagedByExternalController(T *testing.T) { cases := map[string]struct { - managedBy *string - wantResult bool + managedBy *string + wantControllerName *string }{ "managedBy is nil": { - managedBy: nil, - wantResult: false, + managedBy: nil, + wantControllerName: nil, }, "managedBy is empty": { - managedBy: ptr.To[string](""), - wantResult: true, + managedBy: ptr.To[string](""), + wantControllerName: ptr.To[string](""), }, "managedBy is training-operator controller": { - managedBy: ptr.To[string](apiv1.KubeflowJobsController), - wantResult: false, + managedBy: ptr.To[string](apiv1.KubeflowJobsController), + wantControllerName: nil, }, "managedBy is not the training-operator controller": { - managedBy: ptr.To[string]("kueue.x-k8s.io/multikueue"), - wantResult: true, + managedBy: ptr.To[string]("kueue.x-k8s.io/multikueue"), + wantControllerName: ptr.To[string]("kueue.x-k8s.io/multikueue"), }, "managedBy is other value": { - managedBy: ptr.To[string]("other-job-controller"), - wantResult: true, + managedBy: ptr.To[string]("other-job-controller"), + wantControllerName: ptr.To[string]("other-job-controller"), }, } for name, tc := range cases { @@ -253,13 +253,9 @@ func TestManagedByExternalController(T *testing.T) { ManagedBy: tc.managedBy, } - if got := jobController.ManagedByExternalController(runPolicy.ManagedBy); got != nil { - if !tc.wantResult { - t.Errorf("Unwanted manager controller: %s\n", *got) - } - if diff := cmp.Diff(tc.managedBy, got); diff != "" { - t.Errorf("Unexpected manager controller (-want +got):\n%s", diff) - } + gotControllerName := jobController.ManagedByExternalController(runPolicy.ManagedBy) + if diff := cmp.Diff(tc.wantControllerName, gotControllerName); diff != "" { + t.Errorf("Unexpected manager controller (-want +got):\n%s", diff) } }) }