Skip to content

Commit

Permalink
feat: Support debugging webhooks locally
Browse files Browse the repository at this point in the history
  • Loading branch information
shalousun committed Aug 31, 2024
1 parent 3f9b0a4 commit 38bc763
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 31 deletions.
79 changes: 48 additions & 31 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ func main() {
var webhookServerPort int
var webhookServiceName string
var webhookSecretName string
var disableWebhook bool

flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
Expand Down Expand Up @@ -110,6 +111,7 @@ func main() {
flag.IntVar(&webhookServerPort, "webhook-server-port", 9443, "Endpoint port for the webhook server.")
flag.StringVar(&webhookServiceName, "webhook-service-name", "training-operator", "Name of the Service used as part of the DNSName")
flag.StringVar(&webhookSecretName, "webhook-secret-name", "training-operator-webhook-cert", "Name of the Secret to store CA and server certs")
flag.BoolVar(&disableWebhook, "disable-webhook", false, "Disable the webhook server for local debugging.")

opts := zap.Options{
Development: true,
Expand All @@ -129,14 +131,19 @@ func main() {
}
}

var webhookServer webhook.Server
if !disableWebhook {
webhookServer = webhook.NewServer(webhook.Options{
Port: webhookServerPort,
})
}

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{
BindAddress: metricsAddr,
},
WebhookServer: webhook.NewServer(webhook.Options{
Port: webhookServerPort,
}),
WebhookServer: webhookServer,
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: leaderElectionID,
Expand All @@ -147,20 +154,24 @@ func main() {
os.Exit(1)
}

// Setup webhook server based on the disableWebhook flag

certsReady := make(chan struct{})
defer close(certsReady)
certGenerationConfig := cert.Config{
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
}
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
os.Exit(1)
if !disableWebhook {
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
os.Exit(1)
}
}

setupProbeEndpoints(mgr, certsReady)
setupProbeEndpoints(mgr, certsReady, disableWebhook)
// Set up controllers using goroutines to start the manager quickly.
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady)
go setupControllers(mgr, enabledSchemes, gangSchedulerName, controllerThreads, certsReady, disableWebhook)

//+kubebuilder:scaffold:builder

Expand All @@ -171,10 +182,12 @@ func main() {
}
}

func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}) {
setupLog.Info("Waiting for certificate generation to complete")
<-certsReady
setupLog.Info("Certs ready")
func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchemes, gangSchedulerName string, controllerThreads int, certsReady <-chan struct{}, disableWebhook bool) {
if !disableWebhook {
setupLog.Info("Waiting for certificate generation to complete")
<-certsReady
setupLog.Info("Certs ready")
}

setupLog.Info("registering controllers...")
// Prepare GangSchedulingSetupFunc
Expand Down Expand Up @@ -207,19 +220,21 @@ func setupControllers(mgr ctrl.Manager, enabledSchemes controllerv1.EnabledSchem
setupLog.Error(errors.New(errMsg), "unable to create controller", "scheme", s)
os.Exit(1)
}
setupWebhookFunc, supportedWebhook := webhooks.SupportedSchemeWebhook[s]
if !supportedWebhook {
setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
os.Exit(1)
}
if err := setupWebhookFunc(mgr); err != nil {
setupLog.Error(errors.New(errMsg), "unable to start webhook server", "scheme", s)
os.Exit(1)
if !disableWebhook {
setupWebhookFunc, supportedWebhook := webhooks.SupportedSchemeWebhook[s]
if !supportedWebhook {
setupLog.Error(errors.New(errMsg), "scheme is not supported", "scheme", s)
os.Exit(1)
}
if err := setupWebhookFunc(mgr); err != nil {
setupLog.Error(errors.New(errMsg), "unable to start webhook server", "scheme", s)
os.Exit(1)
}
}
}
}

func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) {
func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}, disableWebhook bool) {
defer setupLog.Info("Probe endpoints are configured on healthz and readyz")

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
Expand All @@ -230,20 +245,22 @@ func setupProbeEndpoints(mgr ctrl.Manager, certsReady <-chan struct{}) {
// Wait for the webhook server to be listening before advertising the
// training-operator replica as ready. This allows users to wait with sending the first
// requests, requiring webhooks, until the training-operator deployment is available, so
// that the early requests are not rejected during the traininig-operator's startup.
// that the early requests are not rejected during the training-operator's startup.
// We wrap the call to GetWebhookServer in a closure to delay calling
// the function, otherwise a not fully-initialized webhook server (without
// ready certs) fails the start of the manager.
if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error {
select {
case <-certsReady:
return mgr.GetWebhookServer().StartedChecker()(req)
default:
return errors.New("certificates are not ready")
if !disableWebhook {
if err := mgr.AddReadyzCheck("readyz", func(req *http.Request) error {
select {
case <-certsReady:
return mgr.GetWebhookServer().StartedChecker()(req)
default:
return errors.New("certificates are not ready")
}
}); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}
}); err != nil {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}
}

Expand Down
36 changes: 36 additions & 0 deletions docs/development/developer_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,39 @@ Defaulted container "pytorch" out of: pytorch, init-pytorch (init)
2024-04-19T19:00:57Z INFO Train Epoch: 1 [10240/60000 (17%)] loss=1.1650
```

## Running the Operator and Debugging Locally

If you need to develop, test, and debug the Operator on your local machine (such as a Mac) and want to disable webhook validation, you can follow the steps below.

### Configure KUBECONFIG and KUBEFLOW_NAMESPACE

We can configure the Operator to use the configuration available in your kubeconfig to communicate with a K8s cluster. Set your environment:

```sh
export KUBECONFIG=$(echo ~/.kube/config)
```

### Create the CRDS
After the cluster is up, the CRDS should be created on the cluster.
The CRDS created using `/manifests/overlays/local` will ignore the webhook validation.

```bash
kubectl apply -k ./manifests/overlays/local
```

### Run Operator

Now we are ready to run the Operator locally and disable webhook validation,

```sh
go run ./cmd/training-operator.v1/main.go -disable-webhook=true
```
- `-disable-webhook=true`: This flag disables webhook validation.


This way, the Operator will run locally and will not perform any webhook-related operations, making it easier for you to debug.


## Testing changes locally

Now that you confirmed you can spin up an operator locally, you can try to test your local changes to the operator.
Expand Down Expand Up @@ -143,6 +176,9 @@ You should be able to see a pod for your training operator running in your names
```
kubectl logs -n kubeflow -l training.kubeflow.org/job-name=pytorch-simple
```



## Go version

On ubuntu the default go package appears to be gccgo-go which has problems see [issue](https://github.com/golang/go/issues/15429) golang-go package is also really old so install from golang tarballs instead.
Expand Down
20 changes: 20 additions & 0 deletions manifests/overlays/local/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: kubeflow
resources:
- ../../base/
- namespace.yaml
images:
- name: kubeflow/training-operator
newName: kubeflow/training-operator
newTag: latest
secretGenerator:
- name: training-operator-webhook-cert
options:
disableNameSuffixHash: true
patches:
- path: validating_webhook_patch.yaml
target:
group: admissionregistration.k8s.io
kind: ValidatingWebhookConfiguration
version: v1
4 changes: 4 additions & 0 deletions manifests/overlays/local/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: kubeflow
13 changes: 13 additions & 0 deletions manifests/overlays/local/validating_webhook_patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
- op: replace
path: /webhooks/0/failurePolicy
value: Ignore
- op: replace
path: /webhooks/1/failurePolicy
value: Ignore
- op: replace
path: /webhooks/2/failurePolicy
value: Ignore
- op: replace
path: /webhooks/3/failurePolicy
value: Ignore

0 comments on commit 38bc763

Please sign in to comment.