github.com/openshift/installer@v1.4.17/pkg/clusterapi/system.go

package clusterapi

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net/url"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"text/template"
	"time"

	"github.com/sirupsen/logrus"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/envtest"

	"github.com/openshift/installer/cmd/openshift-install/command"
	"github.com/openshift/installer/data"
	"github.com/openshift/installer/pkg/asset/cluster/metadata"
	azic "github.com/openshift/installer/pkg/asset/installconfig/azure"
	gcpic "github.com/openshift/installer/pkg/asset/installconfig/gcp"
	powervsic "github.com/openshift/installer/pkg/asset/installconfig/powervs"
	"github.com/openshift/installer/pkg/clusterapi/internal/process"
	"github.com/openshift/installer/pkg/clusterapi/internal/process/addr"
	"github.com/openshift/installer/pkg/types/aws"
	"github.com/openshift/installer/pkg/types/azure"
	"github.com/openshift/installer/pkg/types/gcp"
	"github.com/openshift/installer/pkg/types/ibmcloud"
	"github.com/openshift/installer/pkg/types/nutanix"
	"github.com/openshift/installer/pkg/types/openstack"
	"github.com/openshift/installer/pkg/types/powervs"
	"github.com/openshift/installer/pkg/types/vsphere"
)

var (
	sys = &system{}
)

// SystemState is the state of the cluster-api system.
type SystemState string

const (
	// SystemStateRunning indicates the system is running.
	SystemStateRunning SystemState = "running"
	// SystemStateStopped indicates the system is stopped.
	SystemStateStopped SystemState = "stopped"

	// ArtifactsDir is the directory where output (manifests, kubeconfig, etc.)
	// related to CAPI-based installs is stored.
	ArtifactsDir = ".clusterapi_output"
)

// Interface is the interface for the cluster-api system.
type Interface interface {
	Run(ctx context.Context) error
	State() SystemState
	Client() client.Client
	Teardown()
	CleanEtcd()
}

// System returns the cluster-api system.
func System() Interface {
	return sys
}

// system creates a local capi control plane
// to use as a management cluster.
type system struct {
	sync.Mutex

	client client.Client

	componentDir string
	lcp          *localControlPlane

	wg           sync.WaitGroup
	teardownOnce sync.Once
	cancel       context.CancelFunc

	logWriter *io.PipeWriter
}

// Run launches the cluster-api system.
func (c *system) Run(ctx context.Context) error {
	c.Lock()
	defer c.Unlock()

	// Set up the context with a cancel function.
	ctx, cancel := context.WithCancel(ctx)
	c.cancel = cancel

	// Create the local control plane.
	lcp := &localControlPlane{}
	if err := lcp.Run(ctx); err != nil {
		return fmt.Errorf("failed to run local control plane: %w", err)
	}
	c.lcp = lcp
	c.client = c.lcp.Client

	// Create a temporary directory to unpack the cluster-api assets
	// and use it as the working directory for the envtest environment.
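	// By convention (see the core controller definition and
	// getInfrastructureController below), the unpacked directory ends up
	// containing:
	//
	//	<componentDir>/core-components.yaml                        (CAPI core)
	//	<componentDir>/<provider>-infrastructure-components.yaml   (one per provider)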
	componentDir, err := os.MkdirTemp("", "openshift-cluster-api-system-components")
	if err != nil {
		return fmt.Errorf("failed to create temporary folder for cluster api components: %w", err)
	}
	if err := data.Unpack(componentDir, "/cluster-api"); err != nil {
		return fmt.Errorf("failed to unpack cluster api components: %w", err)
	}
	c.componentDir = componentDir

	// Create the controllers; we always need to run the cluster-api core controller.
	controllers := []*controller{
		{
			Name:       "Cluster API",
			Path:       fmt.Sprintf("%s/cluster-api", c.lcp.BinDir),
			Components: []string{c.componentDir + "/core-components.yaml"},
			Args: []string{
				"-v=2",
				"--diagnostics-address=0",
				"--health-addr={{suggestHealthHostPort}}",
				"--webhook-port={{.WebhookPort}}",
				"--webhook-cert-dir={{.WebhookCertDir}}",
			},
		},
	}

	metadata, err := metadata.Load(command.RootOpts.Dir)
	if err != nil {
		return fmt.Errorf("failed to load metadata: %w", err)
	}

	platform := metadata.Platform()
	if platform == "" {
		return fmt.Errorf("no platform configured in metadata")
	}

	// Create the infrastructure controllers.
	// Only add the controllers for the platform we are deploying to.
	switch platform {
	case aws.Name:
		controller := c.getInfrastructureController(
			&AWS,
			[]string{
				"-v=4",
				"--diagnostics-address=0",
				"--health-addr={{suggestHealthHostPort}}",
				"--webhook-port={{.WebhookPort}}",
				"--webhook-cert-dir={{.WebhookCertDir}}",
				"--feature-gates=BootstrapFormatIgnition=true,ExternalResourceGC=true,TagUnmanagedNetworkResources=false,EKS=false",
			},
			map[string]string{},
		)
		if cfg := metadata.AWS; cfg != nil && len(cfg.ServiceEndpoints) > 0 {
			endpoints := make([]string, 0, len(cfg.ServiceEndpoints))
			// CAPA expects name=url pairs of service endpoints.
			for _, endpoint := range cfg.ServiceEndpoints {
				endpoints = append(endpoints, fmt.Sprintf("%s=%s", endpoint.Name, endpoint.URL))
			}
			controller.Args = append(controller.Args, fmt.Sprintf("--service-endpoints=%s:%s", cfg.Region, strings.Join(endpoints, ",")))
		}
		controllers = append(controllers, controller)
	case azure.Name:
		cloudName := metadata.Azure.CloudName
		if cloudName == "" {
			cloudName = azure.PublicCloud
		}
		session, err := azic.GetSession(cloudName, metadata.Azure.ARMEndpoint)
		if err != nil {
			return fmt.Errorf("unable to retrieve azure session: %w", err)
		}

		controllers = append(controllers,
			c.getInfrastructureController(
				&Azure,
				[]string{
					"-v=2",
					"--health-addr={{suggestHealthHostPort}}",
					"--webhook-port={{.WebhookPort}}",
					"--webhook-cert-dir={{.WebhookCertDir}}",
					"--feature-gates=MachinePool=false",
				},
				map[string]string{},
			),
			c.getInfrastructureController(
				&AzureASO,
				[]string{
					"-v=0",
					"-metrics-addr=0",
					"-health-addr={{suggestHealthHostPort}}",
					"-webhook-port={{.WebhookPort}}",
					"-webhook-cert-dir={{.WebhookCertDir}}",
					"-crd-pattern=",
					"-crd-management=none",
				}, map[string]string{
					"POD_NAMESPACE":                     "capz-system",
					"AZURE_CLIENT_ID":                   session.Credentials.ClientID,
					"AZURE_CLIENT_SECRET":               session.Credentials.ClientSecret,
					"AZURE_CLIENT_CERTIFICATE":          session.Credentials.ClientCertificatePath,
					"AZURE_CLIENT_CERTIFICATE_PASSWORD": session.Credentials.ClientCertificatePassword,
					"AZURE_TENANT_ID":                   session.Credentials.TenantID,
"AZURE_SUBSCRIPTION_ID": session.Credentials.SubscriptionID, 206 }, 207 ), 208 ) 209 case gcp.Name: 210 session, err := gcpic.GetSession(context.Background()) 211 if err != nil { 212 return fmt.Errorf("failed to create gcp session: %w", err) 213 } 214 215 //nolint:gosec // CAPG only expects a single credentials environment variable 216 gAppCredEnvVar := "GOOGLE_APPLICATION_CREDENTIALS" 217 capgEnvVars := map[string]string{ 218 gAppCredEnvVar: session.Path, 219 } 220 221 if v, ok := capgEnvVars[gAppCredEnvVar]; ok { 222 logrus.Infof("setting %q to %s for capg infrastructure controller", gAppCredEnvVar, v) 223 } 224 225 controllers = append(controllers, 226 c.getInfrastructureController( 227 &GCP, 228 []string{ 229 "-v=2", 230 "--diagnostics-address=0", 231 "--health-addr={{suggestHealthHostPort}}", 232 "--webhook-port={{.WebhookPort}}", 233 "--webhook-cert-dir={{.WebhookCertDir}}", 234 }, 235 capgEnvVars, 236 ), 237 ) 238 case ibmcloud.Name: 239 // TODO 240 case nutanix.Name: 241 controllers = append(controllers, 242 c.getInfrastructureController( 243 &Nutanix, 244 []string{ 245 "-metrics-bind-address=0", 246 "-health-probe-bind-address={{suggestHealthHostPort}}", 247 "-leader-elect=false", 248 }, 249 map[string]string{}, 250 ), 251 ) 252 case openstack.Name: 253 controllers = append(controllers, 254 c.getInfrastructureController( 255 &OpenStack, 256 []string{ 257 "-v=2", 258 "--diagnostics-address=0", 259 "--health-addr={{suggestHealthHostPort}}", 260 "--webhook-port={{.WebhookPort}}", 261 "--webhook-cert-dir={{.WebhookCertDir}}", 262 }, 263 map[string]string{ 264 "EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION": "true", 265 }, 266 ), 267 ) 268 case vsphere.Name: 269 controllers = append(controllers, 270 c.getInfrastructureController( 271 &VSphere, 272 []string{ 273 "-v=2", 274 "--diagnostics-address=0", 275 "--health-addr={{suggestHealthHostPort}}", 276 "--webhook-port={{.WebhookPort}}", 277 "--webhook-cert-dir={{.WebhookCertDir}}", 278 "--leader-elect=false", 279 }, 280 map[string]string{ 281 "EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION": "true", 282 "EXP_CLUSTER_RESOURCE_SET": "true", 283 }, 284 ), 285 ) 286 case powervs.Name: 287 // We need to prompt for missing variables because NewPISession requires them! 
		bxClient, err := powervsic.NewBxClient(true)
		if err != nil {
			return fmt.Errorf("failed to create a BxClient in Run: %w", err)
		}
		APIKey := bxClient.GetBxClientAPIKey()

		controller := c.getInfrastructureController(
			&IBMCloud,
			[]string{
				"--provider-id-fmt=v2",
				"--v=5",
				"--health-addr={{suggestHealthHostPort}}",
				"--webhook-port={{.WebhookPort}}",
				"--webhook-cert-dir={{.WebhookCertDir}}",
			},
			map[string]string{
				"IBMCLOUD_AUTH_TYPE": "iam",
				"IBMCLOUD_APIKEY":    APIKey,
				"IBMCLOUD_AUTH_URL":  "https://iam.cloud.ibm.com",
				"LOGLEVEL":           "5",
			},
		)
		if cfg := metadata.PowerVS; cfg != nil && len(cfg.ServiceEndpoints) > 0 {
			overrides := bxClient.FilterServiceEndpoints(cfg)
			if len(overrides) > 0 {
				controller.Args = append(controller.Args, fmt.Sprintf("--service-endpoint=%s:%s", cfg.Region, strings.Join(overrides, ",")))
			}
		}
		controllers = append(controllers, controller)
	default:
		return fmt.Errorf("unsupported platform %q", platform)
	}

	// We only show controller logs if the log level is DEBUG or above.
	c.logWriter = logrus.StandardLogger().WriterLevel(logrus.DebugLevel)

	// We create a wait group to wait for the controllers to stop.
	// This wait group is global and is used by the Teardown function,
	// which is expected to be called when the program exits.
	c.wg.Add(1)
	go func() {
		defer c.wg.Done()
		// Stop the controllers when the context is cancelled.
		<-ctx.Done()
		logrus.Info("Shutting down local Cluster API controllers...")
		for _, ct := range controllers {
			if ct.state != nil {
				if err := ct.state.Stop(); err != nil {
					logrus.Warnf("Failed to stop controller: %s: %v", ct.Name, err)
					continue
				}
				logrus.Infof("Stopped controller: %s", ct.Name)
			}
		}
	}()

	// Run the controllers.
	for _, ct := range controllers {
		if err := c.runController(ctx, ct); err != nil {
			return fmt.Errorf("failed to run controller %q: %w", ct.Name, err)
		}
	}

	return nil
}

// Client returns the client for the local control plane.
func (c *system) Client() client.Client {
	c.Lock()
	defer c.Unlock()

	return c.client
}

// Teardown shuts down the local capi control plane and all its controllers.
func (c *system) Teardown() {
	c.Lock()
	defer c.Unlock()

	if c.lcp == nil {
		return
	}

	// Clean up the binary directory.
	defer os.RemoveAll(c.lcp.BinDir)

	// Clean up log file handles.
	defer c.lcp.EtcdLog.Close()
	defer c.lcp.APIServerLog.Close()

	// Proceed to shutdown.
	c.teardownOnce.Do(func() {
		c.cancel()
		ch := make(chan struct{})
		go func() {
			c.wg.Wait()
			logrus.Info("Shutting down local Cluster API control plane...")
			if err := c.lcp.Stop(); err != nil {
				logrus.Warnf("Failed to stop local Cluster API control plane: %v", err)
			}
			close(ch)
		}()
		select {
		case <-ch:
			logrus.Info("Local Cluster API system has completed operations")
		case <-time.After(60 * time.Second):
			logrus.Warn("Timed out waiting for local Cluster API system to shut down")
		}

		c.logWriter.Close()
	})
}

// CleanEtcd removes the etcd database from the host.
func (c *system) CleanEtcd() {
	c.Lock()
	defer c.Unlock()

	if c.lcp == nil {
		return
	}

	// Clean up the etcd directory.
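	// Note that Teardown does not remove this directory: the etcd data
	// outlives the stopped control plane until CleanEtcd is called.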
	if err := os.RemoveAll(c.lcp.EtcdDataDir); err != nil {
		logrus.Warnf("Unable to delete local etcd data directory %s. It is safe to remove the directory manually", c.lcp.EtcdDataDir)
	}
}

// State returns the state of the cluster-api system.
func (c *system) State() SystemState {
	c.Lock()
	defer c.Unlock()

	if c.lcp == nil {
		return SystemStateStopped
	}
	return SystemStateRunning
}

// getInfrastructureController returns a controller for the given provider;
// most of the configuration is by convention.
//
// The provider is expected to be compiled as part of the release process,
// packaged in the binaries directory, and named `cluster-api-provider-<name>`.
//
// While the manifests are optional, we expect them to be in the manifests
// directory and named `<name>-infrastructure-components.yaml`.
func (c *system) getInfrastructureController(provider *Provider, args []string, env map[string]string) *controller {
	manifests := []string{}
	defaultManifestPath := filepath.Join(c.componentDir, fmt.Sprintf("/%s-infrastructure-components.yaml", provider.Name))
	if _, err := os.Stat(defaultManifestPath); err == nil {
		manifests = append(manifests, defaultManifestPath)
	} else {
		logrus.Infof("Failed to find manifests for provider %s at %s", provider.Name, defaultManifestPath)
	}
	return &controller{
		Provider:   provider,
		Name:       fmt.Sprintf("%s infrastructure provider", provider.Name),
		Path:       fmt.Sprintf("%s/cluster-api-provider-%s", c.lcp.BinDir, provider.Name),
		Components: manifests,
		Args:       args,
		Env:        env,
	}
}

// controller encapsulates the state of a controller, its process state, and its configuration.
type controller struct {
	Provider *Provider
	state    *process.State

	Name       string
	Dir        string
	Path       string
	Components []string
	Args       []string
	Env        map[string]string
}

// runController configures the controller and waits for it to be ready.
func (c *system) runController(ctx context.Context, ct *controller) error {
	// If the provider is not empty, we extract it to the binaries directory.
	if ct.Provider != nil {
		if err := ct.Provider.Extract(c.lcp.BinDir); err != nil {
			return fmt.Errorf("failed to extract provider %q: %w", ct.Name, err)
		}
	}

	// Create the WebhookInstallOptions from envtest, passing the manifests we've been given as input.
	// Once built, we install them in the local control plane using the available rest.Config.
	// Envtest takes care of a few things needed to run webhooks locally:
	// - Creates a self-signed certificate for the webhook server.
	// - Tries to allocate a host:port for the webhook server to listen on.
	// - Modifies the webhook manifests to point to the local webhook server through a URL and a CABundle.
	wh := envtest.WebhookInstallOptions{
		Paths:                   ct.Components,
		IgnoreSchemeConvertible: true,
	}
	if err := wh.Install(c.lcp.Cfg); err != nil {
		return fmt.Errorf("failed to prepare controller %q webhook options: %w", ct.Name, err)
	}

	// Most providers allocate a host:port configuration for the health check,
	// which responds to a simple http request on /healthz and /readyz.
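	// For example, an argument declared above as
	// "--health-addr={{suggestHealthHostPort}}" renders to something like
	// "--health-addr=127.0.0.1:41223" (illustrative values; the host and port
	// come from addr.Suggest below and vary per run).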
	// When an argument is configured to use the suggestHealthHostPort function,
	// we record the value so we can pass it to the process health check below.
	var healthCheckHostPort string

	// Build the arguments, using go templating to render the values.
	{
		funcs := template.FuncMap{
			"suggestHealthHostPort": func() (string, error) {
				healthPort, healthHost, err := addr.Suggest("")
				if err != nil {
					return "", fmt.Errorf("unable to grab random port: %w", err)
				}
				healthCheckHostPort = fmt.Sprintf("%s:%d", healthHost, healthPort)
				return healthCheckHostPort, nil
			},
		}

		templateData := map[string]string{
			"WebhookPort":    fmt.Sprintf("%d", wh.LocalServingPort),
			"WebhookCertDir": wh.LocalServingCertDir,
			"KubeconfigPath": c.lcp.KubeconfigPath,
		}

		// We cannot override KUBECONFIG, e.g., in case the user supplies a callback that needs to access the cluster,
		// such as via credential_process in the AWS config file. The kubeconfig path is set on the controller instead.
		if ct.Provider == nil || ct.Provider.Name != "azureaso" {
			ct.Args = append(ct.Args, "--kubeconfig={{.KubeconfigPath}}")
		}

		args := make([]string, 0, len(ct.Args))
		for _, arg := range ct.Args {
			final := new(bytes.Buffer)
			tmpl := template.Must(template.New("arg").Funcs(funcs).Parse(arg))
			if err := tmpl.Execute(final, templateData); err != nil {
				return fmt.Errorf("failed to render controller %q arg %q: %w", ct.Name, arg, err)
			}
			args = append(args, strings.TrimSpace(final.String()))
		}
		ct.Args = args
	}

	// Build the environment variables.
	env := []string{}
	{
		if ct.Env == nil {
			ct.Env = map[string]string{}
		}
		// Override KUBECONFIG to point to the local control plane.
		// azureaso doesn't support the --kubeconfig parameter.
		if ct.Provider != nil && ct.Provider.Name == "azureaso" {
			ct.Env["KUBECONFIG"] = c.lcp.KubeconfigPath
		}
		for key, value := range ct.Env {
			env = append(env, fmt.Sprintf("%s=%s", key, value))
		}
	}

	// Install the manifests for the controller, if any.
	if len(ct.Components) > 0 {
		opts := envtest.CRDInstallOptions{
			Scheme:         c.lcp.Env.Scheme,
			Paths:          ct.Components,
			WebhookOptions: wh,
		}
		if _, err := envtest.InstallCRDs(c.lcp.Cfg, opts); err != nil {
			return fmt.Errorf("failed to install controller %q manifests in local control plane: %w", ct.Name, err)
		}
	}

	// Create the process state.
	pr := &process.State{
		Path:         ct.Path,
		Args:         ct.Args,
		Dir:          ct.Dir,
		Env:          env,
		StartTimeout: 60 * time.Second,
		StopTimeout:  10 * time.Second,
	}

	// If the controller has a health check, we configure it and wait for it to be ready.
	if healthCheckHostPort != "" {
		pr.HealthCheck = &process.HealthCheck{
			URL: url.URL{
				Scheme: "http",
				Host:   healthCheckHostPort,
				Path:   "/healthz",
			},
		}
	}

	// Initialize the process state.
	if err := pr.Init(ct.Name); err != nil {
		return fmt.Errorf("failed to initialize process state for controller %q: %w", ct.Name, err)
	}

	// Run the controller and store its state.
	logrus.Infof("Running process: %s with args %v", ct.Name, ct.Args)
	if err := pr.Start(ctx, c.logWriter, c.logWriter); err != nil {
		return fmt.Errorf("failed to start controller %q: %w", ct.Name, err)
	}
	ct.state = pr
	return nil
}
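// Example usage (a minimal sketch, not code from the installer itself; the
// variable names are illustrative): callers are expected to drive the system
// through the Interface declared above, roughly:
//
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//
//	capiSystem := clusterapi.System()
//	if err := capiSystem.Run(ctx); err != nil {
//		logrus.Fatalf("failed to run local Cluster API system: %v", err)
//	}
//	defer capiSystem.Teardown()
//
//	// Apply manifests against the local control plane.
//	cl := capiSystem.Client()
//	_ = cl
//
// Teardown is safe to defer from multiple paths because it is guarded by
// teardownOnce; CleanEtcd is called separately once the etcd data is no
// longer needed.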