From 300df75af7f0c890b138c3cbe4cd5c7c29c396c8 Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Thu, 8 Aug 2024 04:37:06 +0900 Subject: [PATCH] Update the managedBy specifications Signed-off-by: Yuki Iwai --- .../2170-kubeflow-training-v2/README.md | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/docs/proposals/2170-kubeflow-training-v2/README.md b/docs/proposals/2170-kubeflow-training-v2/README.md index eb980bfe5f..5b36ad8e37 100644 --- a/docs/proposals/2170-kubeflow-training-v2/README.md +++ b/docs/proposals/2170-kubeflow-training-v2/README.md @@ -301,6 +301,18 @@ type TrainJobSpec struct { Suspend *bool `json:"suspend,omitempty"` // ManagedBy is used to indicate the controller or entity that manages a TrainJob. + // The value must be either empty, 'training-operator.kubeflow.org/trainjob-controller' or + // 'kueue.x-k8s.io/multikueue'. + // The built-in TrainJob controller reconciles TrainJobs which don't have this + // field at all or the field value is the reserved string + // 'training-operator.kubeflow.org/trainjob-controller', but delegates reconciling TrainJobs + // with 'kueue.x-k8s.io/multikueue' to Kueue. + // + // The value must be a valid domain-prefixed path (e.g. acme.io/foo) - + // all characters before the first "/" must be a valid subdomain as defined + // by RFC 1123. All characters trailing the first "/" must be valid HTTP Path + // characters as defined by RFC 3986. The value cannot exceed 63 characters. + // The field is immutable. ManagedBy *string `json:"managedBy,omitempty"` } @@ -1591,3 +1603,17 @@ framework that users want to run on Kubernetes. Since frameworks share common functionality for distributed training (data parallelizm or model parallelizm). For some specific use-cases like MPI or Elastic PyTorch, we will leverage `MLSpec` parameter. 
+ +### Allow users to specify arbitrary values in the managedBy field + +We could allow users to specify arbitrary values instead of restricting the `.spec.managedBy` field in the TrainJob +to empty, 'training-operator.kubeflow.org/trainjob-controller' or 'kueue.x-k8s.io/multikueue'. + +However, arbitrary values would allow users to specify an external or in-house customized training-operator, which means that +the TrainJobs could be reconciled by controllers without any specification compliance. + +Specifically, an arbitrary training-operator could introduce bugs in the status transitions. +So, we do not support arbitrary values until we find reasonable use cases in which external controllers +need to reconcile the TrainJob. + +Note that we should implement validation of the status transitions once we support arbitrary values in the `managedBy` field.