sigs.k8s.io/cluster-api@v1.7.1/docs/book/src/tasks/experimental-features/runtime-sdk/implement-extensions.md (about) 1 # Implementing Runtime Extensions 2 3 <aside class="note warning"> 4 5 <h1>Caution</h1> 6 7 Please note that Runtime SDK is an advanced feature. If implemented incorrectly, a failing Runtime Extension can severely impact the Cluster API runtime. 8 9 </aside> 10 11 ## Introduction 12 13 As a developer building systems on top of Cluster API, if you want to hook into the Cluster’s lifecycle via 14 a Runtime Hook, you have to implement a Runtime Extension handling requests according to the 15 OpenAPI specification for the Runtime Hook you are interested in. 16 17 Runtime Extensions by design are very powerful and flexible; however, given that with great power comes 18 great responsibility, a few key considerations should always be kept in mind (more details in the following sections): 19 20 - Runtime Extensions are components that should be designed, written and deployed with great caution given that they 21 can affect the proper functioning of the Cluster API runtime. 22 - Cluster administrators should carefully vet any Runtime Extension registration, thus preventing malicious components 23 from being added to the system. 24 25 Please note that following similar practices is already commonly accepted in the Kubernetes ecosystem for 26 Kubernetes API server admission webhooks. Runtime Extensions share the same foundation and most of the same 27 considerations/concerns apply. 28 29 ## Implementation 30 31 As mentioned above, as a developer building systems on top of Cluster API, if you want to hook into the Cluster’s 32 lifecycle via a Runtime Extension, you have to implement an HTTPS server handling a discovery request and a set 33 of additional requests according to the OpenAPI specification for the Runtime Hook you are interested in. 
34 35 The following shows a minimal example of a Runtime Extension server implementation: 36 37 ```go 38 package main 39 40 import ( 41 "context" 42 "flag" 43 "net/http" 44 "os" 45 46 "github.com/spf13/pflag" 47 cliflag "k8s.io/component-base/cli/flag" 48 "k8s.io/component-base/logs" 49 logsv1 "k8s.io/component-base/logs/api/v1" 50 "k8s.io/klog/v2" 51 ctrl "sigs.k8s.io/controller-runtime" 52 53 runtimecatalog "sigs.k8s.io/cluster-api/exp/runtime/catalog" 54 runtimehooksv1 "sigs.k8s.io/cluster-api/exp/runtime/hooks/api/v1alpha1" 55 "sigs.k8s.io/cluster-api/exp/runtime/server" 56 ) 57 58 var ( 59 // catalog contains all information about RuntimeHooks. 60 catalog = runtimecatalog.New() 61 62 // Flags. 63 profilerAddress string 64 webhookPort int 65 webhookCertDir string 66 logOptions = logs.NewOptions() 67 ) 68 69 func init() { 70 // Adds to the catalog all the RuntimeHooks defined in cluster API. 71 _ = runtimehooksv1.AddToCatalog(catalog) 72 } 73 74 // InitFlags initializes the flags. 75 func InitFlags(fs *pflag.FlagSet) { 76 // Initialize logs flags using Kubernetes component-base machinery. 77 logsv1.AddFlags(logOptions, fs) 78 79 // Add test-extension specific flags 80 fs.StringVar(&profilerAddress, "profiler-address", "", 81 "Bind address to expose the pprof profiler (e.g. localhost:6060)") 82 83 fs.IntVar(&webhookPort, "webhook-port", 9443, 84 "Webhook Server port") 85 86 fs.StringVar(&webhookCertDir, "webhook-cert-dir", "/tmp/k8s-webhook-server/serving-certs/", 87 "Webhook cert dir, only used when webhook-port is specified.") 88 } 89 90 func main() { 91 // Creates a logger to be used during the main func. 92 setupLog := ctrl.Log.WithName("setup") 93 94 // Initialize and parse command line flags. 95 InitFlags(pflag.CommandLine) 96 pflag.CommandLine.SetNormalizeFunc(cliflag.WordSepNormalizeFunc) 97 pflag.CommandLine.AddGoFlagSet(flag.CommandLine) 98 // Set log level 2 as default. 
99 if err := pflag.CommandLine.Set("v", "2"); err != nil { 100 setupLog.Error(err, "failed to set default log level") 101 os.Exit(1) 102 } 103 pflag.Parse() 104 105 // Validates logs flags using Kubernetes component-base machinery and applies them 106 if err := logsv1.ValidateAndApply(logOptions, nil); err != nil { 107 setupLog.Error(err, "unable to start extension") 108 os.Exit(1) 109 } 110 111 // Add the klog logger in the context. 112 ctrl.SetLogger(klog.Background()) 113 114 // Initialize the golang profiler server, if required. 115 if profilerAddress != "" { 116 klog.Infof("Profiler listening for requests at %s", profilerAddress) 117 go func() { 118 klog.Info(http.ListenAndServe(profilerAddress, nil)) 119 }() 120 } 121 122 // Create a http server for serving runtime extensions 123 webhookServer, err := server.New(server.Options{ 124 Catalog: catalog, 125 Port: webhookPort, 126 CertDir: webhookCertDir, 127 }) 128 if err != nil { 129 setupLog.Error(err, "error creating webhook server") 130 os.Exit(1) 131 } 132 133 // Register extension handlers. 134 if err := webhookServer.AddExtensionHandler(server.ExtensionHandler{ 135 Hook: runtimehooksv1.BeforeClusterCreate, 136 Name: "before-cluster-create", 137 HandlerFunc: DoBeforeClusterCreate, 138 }); err != nil { 139 setupLog.Error(err, "error adding handler") 140 os.Exit(1) 141 } 142 if err := webhookServer.AddExtensionHandler(server.ExtensionHandler{ 143 Hook: runtimehooksv1.BeforeClusterUpgrade, 144 Name: "before-cluster-upgrade", 145 HandlerFunc: DoBeforeClusterUpgrade, 146 }); err != nil { 147 setupLog.Error(err, "error adding handler") 148 os.Exit(1) 149 } 150 151 // Setup a context listening for SIGINT. 152 ctx := ctrl.SetupSignalHandler() 153 154 // Start the https server. 
155 setupLog.Info("Starting Runtime Extension server") 156 if err := webhookServer.Start(ctx); err != nil { 157 setupLog.Error(err, "error running webhook server") 158 os.Exit(1) 159 } 160 } 161 162 func DoBeforeClusterCreate(ctx context.Context, request *runtimehooksv1.BeforeClusterCreateRequest, response *runtimehooksv1.BeforeClusterCreateResponse) { 163 log := ctrl.LoggerFrom(ctx) 164 log.Info("BeforeClusterCreate is called") 165 // Your implementation 166 } 167 168 func DoBeforeClusterUpgrade(ctx context.Context, request *runtimehooksv1.BeforeClusterUpgradeRequest, response *runtimehooksv1.BeforeClusterUpgradeResponse) { 169 log := ctrl.LoggerFrom(ctx) 170 log.Info("BeforeClusterUpgrade is called") 171 // Your implementation 172 } 173 ``` 174 175 For a full example see our [test extension](https://github.com/kubernetes-sigs/cluster-api/tree/main/test/extension). 176 177 Please note that a Runtime Extension server can serve multiple Runtime Hooks (in the example above 178 `BeforeClusterCreate` and `BeforeClusterUpgrade`) at the same time. Each of them are handled at a different path, like the 179 Kubernetes API server does for different API resources. The exact format of those paths is handled by the server 180 automatically in accordance to the OpenAPI specification of the Runtime Hooks. 181 182 There is an additional `Discovery` endpoint which is automatically served by the `Server`. The `Discovery` endpoint 183 returns a list of extension handlers to inform Cluster API which Runtime Hooks are implemented by this 184 Runtime Extension server. 185 186 Please note that Cluster API is only able to enforce the correct request and response types as defined by a Runtime Hook version. 
187 Developers are fully responsible for all other elements of the design of a Runtime Extension implementation, including: 188 189 - To choose which programming language to use; please note that Golang is the language of choice, and we are not planning 190 to test or provide tooling and libraries for other languages. Nevertheless, given that we rely on OpenAPI and plain 191 HTTPS calls, other languages should just work but support will be provided on a best-effort basis. 192 - To choose if a dedicated or a shared HTTPS Server is used for the Runtime Extension (it can be e.g. also used to serve a 193 metrics endpoint). 194 195 When using Golang the Runtime Extension developer can benefit from the following packages (provided by the 196 `sigs.k8s.io/cluster-api` module) as shown in the example above: 197 198 - `exp/runtime/hooks/api/v1alpha1` contains the Runtime Hook Golang API types, which are also used to generate the 199 OpenAPI specification. 200 - `exp/runtime/catalog` provides the `Catalog` object to register Runtime Hook definitions. The `Catalog` is then 201 used by the `server` package to handle requests. `Catalog` is similar to the `runtime.Scheme` of the 202 `k8s.io/apimachinery/pkg/runtime` package, but it is designed to store Runtime Hook registrations. 203 - `exp/runtime/server` provides a `Server` object which makes it easy to implement a Runtime Extension server. 204 The `Server` will automatically handle tasks like Marshalling/Unmarshalling requests and responses. A Runtime 205 Extension developer only has to implement a strongly typed function that contains the actual logic. 206 207 ## Guidelines 208 209 While writing a Runtime Extension the following important guidelines must be considered: 210 211 ### Timeouts 212 213 Runtime Extension processing adds to reconcile durations of Cluster API controllers. Runtime Extensions should respond to requests 214 as quickly as possible, typically in milliseconds. 
Runtime Extension developers can decide how long the Cluster API Runtime 215 should wait for a Runtime Extension to respond before treating the call as a failure (max is 30s) by returning the timeout 216 during discovery. Of course a Runtime Extension can trigger long-running tasks in the background, but they shouldn't block 217 synchronously. 218 219 ### Availability 220 221 Runtime Extension failure could result in errors in handling the workload clusters' lifecycle, and so the implementation 222 should be robust, have proper error handling, avoid panics, etc. Failure policies can be set up to mitigate the 223 negative impact of a Runtime Extension on the Cluster API Runtime, but this option can’t be used in all cases 224 (see [Error Management](#error-management)). 225 226 ### Blocking Hooks 227 228 A Runtime Hook can be defined as "blocking" - e.g. the `BeforeClusterUpgrade` hook allows a Runtime Extension 229 to prevent the upgrade from starting. A Runtime Extension registered for the `BeforeClusterUpgrade` hook 230 can block by returning a non-zero `retryAfterSeconds` value. The following considerations apply: 231 232 - The system might decide to retry the same Runtime Extension even before the `retryAfterSeconds` period expires, 233 e.g. due to other changes in the Cluster, so `retryAfterSeconds` should be considered as an approximate maximum 234 time before the next reconcile. 235 - If there is more than one Runtime Extension registered for the same Runtime Hook and more than one returns 236 `retryAfterSeconds`, the shortest non-zero value will be used. 237 - If there is more than one Runtime Extension registered for the same Runtime Hook and at least one returns 238 `retryAfterSeconds`, all Runtime Extensions will be called again. 239 240 A detailed description of what "blocking" means for each specific Runtime Hook is documented case by case 241 in the hook-specific implementation documentation (e.g. 
[Implementing Lifecycle Hook Runtime Extensions](./implement-lifecycle-hooks.md#Definitions)). 242 243 ### Side Effects 244 245 It is recommended that Runtime Extensions should avoid side effects if possible, which means they should operate 246 only on the content of the request sent to them, and not make out-of-band changes. If side effects are required, 247 rules defined in the following sections apply. 248 249 ### Idempotence 250 251 An idempotent Runtime Extension is able to succeed even in case it has already been completed before (the Runtime 252 Extension checks current state and changes it only if necessary). This is necessary because a Runtime Extension 253 may be called many times after it already succeeded because other Runtime Extensions for the same hook may not 254 succeed in the same reconcile. 255 256 A practical example that explains why idempotence is relevant is the fact that extensions could be called more 257 than once for the same lifecycle transition, e.g. 258 259 - Two Runtime Extensions are registered for the `BeforeClusterUpgrade` hook. 260 - Before a Cluster upgrade is started both extensions are called, but one of them temporarily blocks the operation 261 by asking to retry after 30 seconds. 262 - After 30 seconds the system retries the lifecycle transition, and both extensions are called again to re-evaluate 263 if it is now possible to proceed with the Cluster upgrade. 264 265 ### Avoid dependencies 266 267 Each Runtime Extension should accomplish its task without depending on other Runtime Extensions. Introducing 268 dependencies across Runtime Extensions makes the system fragile, and it is probably a consequence of poor 269 "Separation of Concerns" between extensions. 270 271 ### Deterministic result 272 273 A deterministic Runtime Extension is implemented in such a way that given the same input it will always return 274 the same output. 275 276 Some Runtime Hooks, e.g. 
external patches, might explicitly request corresponding Runtime Extensions 277 to support this property. But we encourage developers to follow this pattern more generally given that it fits 278 well with practices like unit testing and generally makes the entire system more predictable and easier to troubleshoot. 279 280 ### Error messages 281 282 Runtime Extension authors should be aware that error messages are surfaced as conditions in Kubernetes resources 283 and recorded in Cluster API controller's logs. As a consequence: 284 285 - Error messages must not contain any sensitive information. 286 - Error messages must be deterministic, and must avoid including timestamps or values changing at every call. 287 - Error messages must not contain external errors when it's not clear if those errors are deterministic (e.g. errors returned from cloud APIs). 288 289 <aside class="note warning"> 290 291 <h1>Caution</h1> 292 293 If an error message is not deterministic and it changes at every call even if the problem is the same, it could 294 lead to Kubernetes resource conditions continuously changing, and this generates a denial-of-service attack on 295 controllers processing those resources that might impact system stability. 296 297 </aside> 298 299 ### ExtensionConfig 300 301 To register your runtime extension apply the ExtensionConfig resource in the management cluster, including your CA 302 certs, ClusterIP service associated with the app and namespace, and the target namespace for the given extension. Once 303 created, the extension will detect the associated service and discover the associated Hooks. For clarification, you can 304 check the status of the ExtensionConfig. 
Below is an example of `ExtensionConfig` - 305 306 ```yaml 307 apiVersion: runtime.cluster.x-k8s.io/v1alpha1 308 kind: ExtensionConfig 309 metadata: 310 annotations: 311 runtime.cluster.x-k8s.io/inject-ca-from-secret: default/test-runtime-sdk-svc-cert 312 name: test-runtime-sdk-extensionconfig 313 spec: 314 clientConfig: 315 service: 316 name: test-runtime-sdk-svc 317 namespace: default # Note: this assumes the test extension get deployed in the default namespace 318 port: 443 319 namespaceSelector: 320 matchExpressions: 321 - key: kubernetes.io/metadata.name 322 operator: In 323 values: 324 - default # Note: this assumes the test extension is used by Cluster in the default namespace only 325 ``` 326 327 ### Settings 328 329 Settings can be added to the ExtensionConfig object in the form of a map with string keys and values. These settings are 330 sent with each request to hooks registered by that ExtensionConfig. Extension developers can implement behavior in their 331 extensions to alter behavior based on these settings. Settings should be well documented by extension developers so that 332 ClusterClass authors can understand usage and expected behaviour. 333 334 Settings can be provided for individual external patches by providing them in the ClusterClass `.spec.patches[*].external.settings`. 335 This can be used to overwrite settings at the ExtensionConfig level for that patch. 336 337 ### Error management 338 339 In case a Runtime Extension returns an error, the error will be handled according to the corresponding failure policy 340 defined in the response of the Discovery call. 341 342 If the failure policy is `Ignore` the error is going to be recorded in the controller's logs, but the processing 343 will continue. 
However, we recognize that this failure policy cannot be used in most of the use cases because Runtime 344 Extension implementers want to ensure that the task implemented by an extension is completed before continuing with 345 the cluster's lifecycle. 346 347 If instead the failure policy is `Fail` the system will retry the operation until it passes. The following general 348 considerations apply: 349 350 - It is the responsibility of Cluster API components to surface Runtime Extension errors using conditions. 351 - Operations will be retried with an exponential backoff or whenever the state of a Cluster changes (we are going to rely 352 on controller runtime exponential backoff/watches). 353 - If there is more than one Runtime Extension registered for the same Runtime Hook and at least one of them fails, 354 all the registered Runtime Extensions will be retried. See [Idempotence](#idempotence). 355 356 Additional considerations about errors that apply only to a specific Runtime Hook will be documented in the hook-specific 357 implementation documentation. 358 359 ## Tips & tricks 360 361 After you have implemented and deployed a Runtime Extension you can manually test it by sending HTTP requests. 
362 This can be for example done via kubectl: 363 364 Via `kubectl create --raw`: 365 366 ```bash 367 # Send a Discovery Request to the webhook-service in namespace default with protocol https on port 443: 368 kubectl create --raw '/api/v1/namespaces/default/services/https:webhook-service:443/proxy/hooks.runtime.cluster.x-k8s.io/v1alpha1/discovery' \ 369 -f <(echo '{"apiVersion":"hooks.runtime.cluster.x-k8s.io/v1alpha1","kind":"DiscoveryRequest"}') | jq 370 ``` 371 372 Via `kubectl proxy` and `curl`: 373 374 ```bash 375 # Open a proxy with kubectl and then use curl to send the request 376 ## First terminal: 377 kubectl proxy 378 ## Second terminal: 379 curl -X 'POST' 'http://127.0.0.1:8001/api/v1/namespaces/default/services/https:webhook-service:443/proxy/hooks.runtime.cluster.x-k8s.io/v1alpha1/discovery' \ 380 -d '{"apiVersion":"hooks.runtime.cluster.x-k8s.io/v1alpha1","kind":"DiscoveryRequest"}' | jq 381 ``` 382 383 For more details about the API of the Runtime Extensions please see <button onclick="openSwaggerUI()">Swagger UI</button>. 384 For more details on proxy support please see [Proxies in Kubernetes](https://kubernetes.io/docs/concepts/cluster-administration/proxies/). 385 386 <script> 387 // openSwaggerUI calculates the absolute URL of the RuntimeSDK YAML file and opens Swagger UI. 388 function openSwaggerUI() { 389 var schemaURL = new URL("runtime-sdk-openapi.yaml", document.baseURI).href 390 window.open("https://editor.swagger.io/?url=" + schemaURL) 391 } 392 </script>