From 38bc763465dc8867853a2b74ce14194e92ecdbe4 Mon Sep 17 00:00:00 2001 From: shalousun <836575280@qq.com> Date: Sat, 17 Aug 2024 19:38:13 +0800 Subject: [PATCH] feat: Support debugging webhooks locally --- cmd/training-operator.v1/main.go | 79 +++++++++++-------- docs/development/developer_guide.md | 36 +++++++++ manifests/overlays/local/kustomization.yaml | 20 +++++ manifests/overlays/local/namespace.yaml | 4 + .../local/validating_webhook_patch.yaml | 13 +++ 5 files changed, 121 insertions(+), 31 deletions(-) create mode 100644 manifests/overlays/local/kustomization.yaml create mode 100644 manifests/overlays/local/namespace.yaml create mode 100644 manifests/overlays/local/validating_webhook_patch.yaml diff --git a/cmd/training-operator.v1/main.go b/cmd/training-operator.v1/main.go index bec5cd6b55..f7d3f05a5c 100644 --- a/cmd/training-operator.v1/main.go +++ b/cmd/training-operator.v1/main.go @@ -79,6 +79,7 @@ func main() { var webhookServerPort int var webhookServiceName string var webhookSecretName string + var disableWebhook bool flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") @@ -110,6 +111,7 @@ func main() { flag.IntVar(&webhookServerPort, "webhook-server-port", 9443, "Endpoint port for the webhook server.") flag.StringVar(&webhookServiceName, "webhook-service-name", "training-operator", "Name of the Service used as part of the DNSName") flag.StringVar(&webhookSecretName, "webhook-secret-name", "training-operator-webhook-cert", "Name of the Secret to store CA and server certs") + flag.BoolVar(&disableWebhook, "disable-webhook", false, "Disable the webhook server for local debugging.") opts := zap.Options{ Development: true, @@ -129,14 +131,19 @@ func main() { } } + var webhookServer webhook.Server + if !disableWebhook { + webhookServer = webhook.NewServer(webhook.Options{ + Port: webhookServerPort, + }) + } + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: scheme, Metrics: metricsserver.Options{ BindAddress: metricsAddr, }, - WebhookServer: webhook.NewServer(webhook.Options{ - Port: webhookServerPort, - }), + WebhookServer: webhookServer, HealthProbeBindAddress: probeAddr, LeaderElection: enableLeaderElection, LeaderElectionID: leaderElectionID, @@ -147,20 +154,24 @@ func main() { os.Exit(1) } + // Setup webhook server based on the disableWebhook flag + certsReady := make(chan struct{}) defer close(certsReady) certGenerationConfig := cert.Config{ WebhookSecretName: webhookSecretName, WebhookServiceName: webhookServiceName, } - if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil { - setupLog.Error(err, "Unable to set up cert rotation") - os.Exit(1) + if !disableWebhook { + if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil { + setupLog.Error(err, "Unable to set up cert rotation") + os.Exit(1) + } } - setupProbeEndpoints(mgr, certsReady) + setupProbeEndpoints(mgr, certsReady, disableWebhook) // Set up controllers using goroutines to start the manager quickly. - go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady) + go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady, disableWebhook) //+kubebuilder:scaffold:builder @@ -171,10 +182,12 @@ func main() { } } -func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}) { - setupLog.Info("Waiting for certificate generation to complete") - <-certsReady - setupLog.Info("Certs ready") +func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}, disableWebhook bool) { + if !disableWebhook { + setupLog.Info("Waiting for certificate generation to complete") + <-certsReady + setupLog.Info("Certs ready") + } setupLog.Info("registering controllers...") // Prepare GangSchedulingSetupFunc @@ -207,19 +220,21 @@ func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchem setupLog.Error(errors.New(errMsg), "unable to create controller", "scheme", s) os.Exit(1) } - setupWebhookFunc, supportedWebhook := webhooks.SupportedSchemeWebhook[s] - if !supportedWebhook { - setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s) - os.Exit(1) - } - if err := setupWebhookFunc(mgr); err != nil { - setupLog.Error(errors.New(errMsg), "unable to start webhook server", "scheme", s) - os.Exit(1) + if !disableWebhook { + setupWebhookFunc, supportedWebhook := webhooks.SupportedSchemeWebhook[s] + if !supportedWebhook { + setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s) + os.Exit(1) + } + if err := setupWebhookFunc(mgr); err != nil { + setupLog.Error(errors.New(errMsg), "unable to start webhook server", "scheme", s) + os.Exit(1) + } } } } -func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) { +func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}, disableWebhook bool) { defer setupLog.Info("Probe endpoints are configured on healthz and readyz") if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { @@ -230,20 +245,22 @@ func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) { // Wait for the webhook server to be listening before advertising the // training-operator replica as ready. This allows users to wait with sending the first // requests, requiring webhooks, until the training-operator deployment is available, so - // that the early requests are not rejected during the traininig-operator's startup. + // that the early requests are not rejected during the training-operator's startup. // We wrap the call to GetWebhookServer in a closure to delay calling // the function, otherwise a not fully-initialized webhook server (without // ready certs) fails the start of the manager. - if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error { - select { - case <-certsReady: - return mgr.GetWebhookServer().StartedChecker()(req) - default: - return errors.New("certificates are not ready") + if !disableWebhook { + if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error { + select { + case <-certsReady: + return mgr.GetWebhookServer().StartedChecker()(req) + default: + return errors.New("certificates are not ready") + } + }); err != nil { + setupLog.Error(err, "unable to set up ready check") + os.Exit(1) } - }); err != nil { - setupLog.Error(err, "unable to set up ready check") - os.Exit(1) } } diff --git a/docs/development/developer_guide.md b/docs/development/developer_guide.md index 03312643a9..3ec129eab2 100644 --- a/docs/development/developer_guide.md +++ b/docs/development/developer_guide.md @@ -106,6 +106,39 @@ Defaulted container "pytorch" out of: pytorch, init-pytorch (init) 2024-04-19T19:00:57Z INFO Train Epoch: 1 [10240/60000 (17%)] loss=1.1650 ``` +## Running the Operator and Debugging Locally + +If you need to develop, test, and debug the Operator on your local machine (such as a Mac) and want to disable webhook validation, you can follow the steps below. + +### Configure KUBECONFIG and KUBEFLOW_NAMESPACE + +We can configure the Operator to use the configuration available in your kubeconfig to communicate with a K8s cluster. Set your environment: + +```sh +export KUBECONFIG=$(echo ~/.kube/config) +``` + +### Create the CRDS +After the cluster is up, the CRDS should be created on the cluster. +The CRDS created using `/manifests/overlays/local` will ignore the webhook validation. + +```bash +kubectl apply -k ./manifests/overlays/local +``` + +### Run Operator + +Now we are ready to run the Operator locally and disable webhook validation, + +```sh +go run ./cmd/training-operator.v1/main.go -disable-webhook=true +``` +- `-disable-webhook=true`: This flag disables webhook validation. + + +This way, the Operator will run locally and will not perform any webhook-related operations, making it easier for you to debug. + + ## Testing changes locally Now that you confirmed you can spin up an operator locally, you can try to test your local changes to the operator. @@ -143,6 +176,9 @@ You should be able to see a pod for your training operator running in your names ``` kubectl logs -n kubeflow -l training.kubeflow.org/job-name=pytorch-simple ``` + + + ## Go version On ubuntu the default go package appears to be gccgo-go which has problems see [issue](https://github.com/golang/go/issues/15429) golang-go package is also really old so install from golang tarballs instead. diff --git a/manifests/overlays/local/kustomization.yaml b/manifests/overlays/local/kustomization.yaml new file mode 100644 index 0000000000..c30314c570 --- /dev/null +++ b/manifests/overlays/local/kustomization.yaml @@ -0,0 +1,20 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: kubeflow +resources: +- ../../base/ +- namespace.yaml +images: +- name: kubeflow/training-operator + newName: kubeflow/training-operator + newTag: latest +secretGenerator: +- name: training-operator-webhook-cert + options: + disableNameSuffixHash: true +patches: +- path: validating_webhook_patch.yaml + target: + group: admissionregistration.k8s.io + kind: ValidatingWebhookConfiguration + version: v1 diff --git a/manifests/overlays/local/namespace.yaml b/manifests/overlays/local/namespace.yaml new file mode 100644 index 0000000000..7a940e4673 --- /dev/null +++ b/manifests/overlays/local/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: kubeflow diff --git a/manifests/overlays/local/validating_webhook_patch.yaml b/manifests/overlays/local/validating_webhook_patch.yaml new file mode 100644 index 0000000000..47dc30c091 --- /dev/null +++ b/manifests/overlays/local/validating_webhook_patch.yaml @@ -0,0 +1,13 @@ +--- +- op: replace + path: /webhooks/0/failurePolicy + value: Ignore +- op: replace + path: /webhooks/1/failurePolicy + value: Ignore +- op: replace + path: /webhooks/2/failurePolicy + value: Ignore +- op: replace + path: /webhooks/3/failurePolicy + value: Ignore \ No newline at end of file