github.com/openshift/installer@v1.4.17/pkg/clusterapi/system.go (about)

     1  package clusterapi
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"io"
     8  	"net/url"
     9  	"os"
    10  	"path/filepath"
    11  	"strings"
    12  	"sync"
    13  	"text/template"
    14  	"time"
    15  
    16  	"github.com/sirupsen/logrus"
    17  	"sigs.k8s.io/controller-runtime/pkg/client"
    18  	"sigs.k8s.io/controller-runtime/pkg/envtest"
    19  
    20  	"github.com/openshift/installer/cmd/openshift-install/command"
    21  	"github.com/openshift/installer/data"
    22  	"github.com/openshift/installer/pkg/asset/cluster/metadata"
    23  	azic "github.com/openshift/installer/pkg/asset/installconfig/azure"
    24  	gcpic "github.com/openshift/installer/pkg/asset/installconfig/gcp"
    25  	powervsic "github.com/openshift/installer/pkg/asset/installconfig/powervs"
    26  	"github.com/openshift/installer/pkg/clusterapi/internal/process"
    27  	"github.com/openshift/installer/pkg/clusterapi/internal/process/addr"
    28  	"github.com/openshift/installer/pkg/types/aws"
    29  	"github.com/openshift/installer/pkg/types/azure"
    30  	"github.com/openshift/installer/pkg/types/gcp"
    31  	"github.com/openshift/installer/pkg/types/ibmcloud"
    32  	"github.com/openshift/installer/pkg/types/nutanix"
    33  	"github.com/openshift/installer/pkg/types/openstack"
    34  	"github.com/openshift/installer/pkg/types/powervs"
    35  	"github.com/openshift/installer/pkg/types/vsphere"
    36  )
    37  
var (
	// sys is the package-level system instance handed out by System.
	sys = &system{}
)
    41  
    42  // SystemState is the state of the cluster-api system.
    43  type SystemState string
    44  
    45  const (
    46  	// SystemStateRunning indicates the system is running.
    47  	SystemStateRunning SystemState = "running"
    48  	// SystemStateStopped indicates the system is stopped.
    49  	SystemStateStopped SystemState = "stopped"
    50  
    51  	// ArtifactsDir is the directory where output (manifests, kubeconfig, etc.)
    52  	// related to CAPI-based installs are stored.
    53  	ArtifactsDir = ".clusterapi_output"
    54  )
    55  
// Interface is the interface for the cluster-api system.
type Interface interface {
	// Run launches the cluster-api system.
	Run(ctx context.Context) error
	// State reports whether the system is running or stopped.
	State() SystemState
	// Client returns the client for the local control plane.
	Client() client.Client
	// Teardown shuts down the local control plane and all its controllers.
	Teardown()
	// CleanEtcd removes the etcd database from the host.
	CleanEtcd()
}
    64  
// System returns the cluster-api system.
//
// Every call returns the same package-level instance.
func System() Interface {
	return sys
}
    69  
// system creates a local capi control plane
// to use as a management cluster.
type system struct {
	// Mutex is locked by Run, Client, Teardown, CleanEtcd and State for the
	// duration of each call.
	sync.Mutex

	// client is copied from the local control plane once Run has started it.
	client client.Client

	// componentDir is the temporary directory the cluster-api components are
	// unpacked into by Run.
	componentDir string
	// lcp is the local control plane started by Run; nil until then.
	lcp *localControlPlane

	// wg tracks the goroutine that stops the controllers on cancellation.
	wg sync.WaitGroup
	// teardownOnce makes the shutdown sequence in Teardown run at most once.
	teardownOnce sync.Once
	// cancel cancels the context Run derives for the control plane and
	// controllers.
	cancel context.CancelFunc

	// logWriter receives controller stdout/stderr and forwards it to logrus at
	// debug level.
	logWriter *io.PipeWriter
}
    86  
    87  // Run launches the cluster-api system.
    88  func (c *system) Run(ctx context.Context) error {
    89  	c.Lock()
    90  	defer c.Unlock()
    91  
    92  	// Setup the context with a cancel function.
    93  	ctx, cancel := context.WithCancel(ctx)
    94  	c.cancel = cancel
    95  
    96  	// Create the local control plane.
    97  	lcp := &localControlPlane{}
    98  	if err := lcp.Run(ctx); err != nil {
    99  		return fmt.Errorf("failed to run local control plane: %w", err)
   100  	}
   101  	c.lcp = lcp
   102  	c.client = c.lcp.Client
   103  
   104  	// Create a temporary directory to unpack the cluster-api assets
   105  	// and use it as the working directory for the envtest environment.
   106  	componentDir, err := os.MkdirTemp("", "openshift-cluster-api-system-components")
   107  	if err != nil {
   108  		return fmt.Errorf("failed to create temporary folder for cluster api components: %w", err)
   109  	}
   110  	if err := data.Unpack(componentDir, "/cluster-api"); err != nil {
   111  		return fmt.Errorf("failed to unpack cluster api components: %w", err)
   112  	}
   113  	c.componentDir = componentDir
   114  
   115  	// Create the controllers, we always need to run the cluster-api core controller.
   116  	controllers := []*controller{
   117  		{
   118  			Name:       "Cluster API",
   119  			Path:       fmt.Sprintf("%s/cluster-api", c.lcp.BinDir),
   120  			Components: []string{c.componentDir + "/core-components.yaml"},
   121  			Args: []string{
   122  				"-v=2",
   123  				"--diagnostics-address=0",
   124  				"--health-addr={{suggestHealthHostPort}}",
   125  				"--webhook-port={{.WebhookPort}}",
   126  				"--webhook-cert-dir={{.WebhookCertDir}}",
   127  			},
   128  		},
   129  	}
   130  
   131  	metadata, err := metadata.Load(command.RootOpts.Dir)
   132  	if err != nil {
   133  		return fmt.Errorf("failed to load metadata: %w", err)
   134  	}
   135  
   136  	platform := metadata.Platform()
   137  	if platform == "" {
   138  		return fmt.Errorf("no platform configured in metadata")
   139  	}
   140  
   141  	// Create the infrastructure controllers.
   142  	// Only add the controllers for the platform we are deploying to.
   143  	switch platform {
   144  	case aws.Name:
   145  		controller := c.getInfrastructureController(
   146  			&AWS,
   147  			[]string{
   148  				"-v=4",
   149  				"--diagnostics-address=0",
   150  				"--health-addr={{suggestHealthHostPort}}",
   151  				"--webhook-port={{.WebhookPort}}",
   152  				"--webhook-cert-dir={{.WebhookCertDir}}",
   153  				"--feature-gates=BootstrapFormatIgnition=true,ExternalResourceGC=true,TagUnmanagedNetworkResources=false,EKS=false",
   154  			},
   155  			map[string]string{},
   156  		)
   157  		if cfg := metadata.AWS; cfg != nil && len(cfg.ServiceEndpoints) > 0 {
   158  			endpoints := make([]string, 0, len(cfg.ServiceEndpoints))
   159  			// CAPA expects name=url pairs of service endpoints
   160  			for _, endpoint := range cfg.ServiceEndpoints {
   161  				endpoints = append(endpoints, fmt.Sprintf("%s=%s", endpoint.Name, endpoint.URL))
   162  			}
   163  			controller.Args = append(controller.Args, fmt.Sprintf("--service-endpoints=%s:%s", cfg.Region, strings.Join(endpoints, ",")))
   164  		}
   165  		controllers = append(controllers, controller)
   166  	case azure.Name:
   167  		cloudName := metadata.Azure.CloudName
   168  		if cloudName == "" {
   169  			cloudName = azure.PublicCloud
   170  		}
   171  		session, err := azic.GetSession(cloudName, metadata.Azure.ARMEndpoint)
   172  		if err != nil {
   173  			return fmt.Errorf("unable to retrieve azure session: %w", err)
   174  		}
   175  
   176  		controllers = append(controllers,
   177  			c.getInfrastructureController(
   178  				&Azure,
   179  				[]string{
   180  					"-v=2",
   181  					"--health-addr={{suggestHealthHostPort}}",
   182  					"--webhook-port={{.WebhookPort}}",
   183  					"--webhook-cert-dir={{.WebhookCertDir}}",
   184  					"--feature-gates=MachinePool=false",
   185  				},
   186  				map[string]string{},
   187  			),
   188  			c.getInfrastructureController(
   189  				&AzureASO,
   190  				[]string{
   191  					"-v=0",
   192  					"-metrics-addr=0",
   193  					"-health-addr={{suggestHealthHostPort}}",
   194  					"-webhook-port={{.WebhookPort}}",
   195  					"-webhook-cert-dir={{.WebhookCertDir}}",
   196  					"-crd-pattern=",
   197  					"-crd-management=none",
   198  				}, map[string]string{
   199  					"POD_NAMESPACE":                     "capz-system",
   200  					"AZURE_CLIENT_ID":                   session.Credentials.ClientID,
   201  					"AZURE_CLIENT_SECRET":               session.Credentials.ClientSecret,
   202  					"AZURE_CLIENT_CERTIFICATE":          session.Credentials.ClientCertificatePath,
   203  					"AZURE_CLIENT_CERTIFICATE_PASSWORD": session.Credentials.ClientCertificatePassword,
   204  					"AZURE_TENANT_ID":                   session.Credentials.TenantID,
   205  					"AZURE_SUBSCRIPTION_ID":             session.Credentials.SubscriptionID,
   206  				},
   207  			),
   208  		)
   209  	case gcp.Name:
   210  		session, err := gcpic.GetSession(context.Background())
   211  		if err != nil {
   212  			return fmt.Errorf("failed to create gcp session: %w", err)
   213  		}
   214  
   215  		//nolint:gosec // CAPG only expects a single credentials environment variable
   216  		gAppCredEnvVar := "GOOGLE_APPLICATION_CREDENTIALS"
   217  		capgEnvVars := map[string]string{
   218  			gAppCredEnvVar: session.Path,
   219  		}
   220  
   221  		if v, ok := capgEnvVars[gAppCredEnvVar]; ok {
   222  			logrus.Infof("setting %q to %s for capg infrastructure controller", gAppCredEnvVar, v)
   223  		}
   224  
   225  		controllers = append(controllers,
   226  			c.getInfrastructureController(
   227  				&GCP,
   228  				[]string{
   229  					"-v=2",
   230  					"--diagnostics-address=0",
   231  					"--health-addr={{suggestHealthHostPort}}",
   232  					"--webhook-port={{.WebhookPort}}",
   233  					"--webhook-cert-dir={{.WebhookCertDir}}",
   234  				},
   235  				capgEnvVars,
   236  			),
   237  		)
   238  	case ibmcloud.Name:
   239  		// TODO
   240  	case nutanix.Name:
   241  		controllers = append(controllers,
   242  			c.getInfrastructureController(
   243  				&Nutanix,
   244  				[]string{
   245  					"-metrics-bind-address=0",
   246  					"-health-probe-bind-address={{suggestHealthHostPort}}",
   247  					"-leader-elect=false",
   248  				},
   249  				map[string]string{},
   250  			),
   251  		)
   252  	case openstack.Name:
   253  		controllers = append(controllers,
   254  			c.getInfrastructureController(
   255  				&OpenStack,
   256  				[]string{
   257  					"-v=2",
   258  					"--diagnostics-address=0",
   259  					"--health-addr={{suggestHealthHostPort}}",
   260  					"--webhook-port={{.WebhookPort}}",
   261  					"--webhook-cert-dir={{.WebhookCertDir}}",
   262  				},
   263  				map[string]string{
   264  					"EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION": "true",
   265  				},
   266  			),
   267  		)
   268  	case vsphere.Name:
   269  		controllers = append(controllers,
   270  			c.getInfrastructureController(
   271  				&VSphere,
   272  				[]string{
   273  					"-v=2",
   274  					"--diagnostics-address=0",
   275  					"--health-addr={{suggestHealthHostPort}}",
   276  					"--webhook-port={{.WebhookPort}}",
   277  					"--webhook-cert-dir={{.WebhookCertDir}}",
   278  					"--leader-elect=false",
   279  				},
   280  				map[string]string{
   281  					"EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION": "true",
   282  					"EXP_CLUSTER_RESOURCE_SET":              "true",
   283  				},
   284  			),
   285  		)
   286  	case powervs.Name:
   287  		// We need to prompt for missing variables because NewPISession requires them!
   288  		bxClient, err := powervsic.NewBxClient(true)
   289  		if err != nil {
   290  			return fmt.Errorf("failed to create a BxClient in Run: %w", err)
   291  		}
   292  		APIKey := bxClient.GetBxClientAPIKey()
   293  
   294  		controller := c.getInfrastructureController(
   295  			&IBMCloud,
   296  			[]string{
   297  				"--provider-id-fmt=v2",
   298  				"--v=5",
   299  				"--health-addr={{suggestHealthHostPort}}",
   300  				"--webhook-port={{.WebhookPort}}",
   301  				"--webhook-cert-dir={{.WebhookCertDir}}",
   302  			},
   303  			map[string]string{
   304  				"IBMCLOUD_AUTH_TYPE": "iam",
   305  				"IBMCLOUD_APIKEY":    APIKey,
   306  				"IBMCLOUD_AUTH_URL":  "https://iam.cloud.ibm.com",
   307  				"LOGLEVEL":           "5",
   308  			},
   309  		)
   310  		if cfg := metadata.PowerVS; cfg != nil && len(cfg.ServiceEndpoints) > 0 {
   311  			overrides := bxClient.FilterServiceEndpoints(cfg)
   312  			if len(overrides) > 0 {
   313  				controller.Args = append(controller.Args, fmt.Sprintf("--service-endpoint=%s:%s", cfg.Region, strings.Join(overrides, ",")))
   314  			}
   315  		}
   316  		controllers = append(controllers, controller)
   317  	default:
   318  		return fmt.Errorf("unsupported platform %q", platform)
   319  	}
   320  
   321  	// We only show controller logs if the log level is DEBUG or above
   322  	c.logWriter = logrus.StandardLogger().WriterLevel(logrus.DebugLevel)
   323  
   324  	// We create a wait group to wait for the controllers to stop,
   325  	// this waitgroup is a global, and is used by the Teardown function
   326  	// which is expected to be called when the program exits.
   327  	c.wg.Add(1)
   328  	go func() {
   329  		defer c.wg.Done()
   330  		// Stop the controllers when the context is cancelled.
   331  		<-ctx.Done()
   332  		logrus.Info("Shutting down local Cluster API controllers...")
   333  		for _, ct := range controllers {
   334  			if ct.state != nil {
   335  				if err := ct.state.Stop(); err != nil {
   336  					logrus.Warnf("Failed to stop controller: %s: %v", ct.Name, err)
   337  					continue
   338  				}
   339  				logrus.Infof("Stopped controller: %s", ct.Name)
   340  			}
   341  		}
   342  	}()
   343  
   344  	// Run the controllers.
   345  	for _, ct := range controllers {
   346  		if err := c.runController(ctx, ct); err != nil {
   347  			return fmt.Errorf("failed to run controller %q: %w", ct.Name, err)
   348  		}
   349  	}
   350  
   351  	return nil
   352  }
   353  
// Client returns the client for the local control plane.
//
// The value is copied from the local control plane by Run, so it is nil until
// Run has started the control plane. The call takes the system lock and
// therefore blocks while another method holding the lock is in progress.
func (c *system) Client() client.Client {
	c.Lock()
	defer c.Unlock()

	return c.client
}
   361  
   362  // Teardown shuts down the local capi control plane and all its controllers.
   363  func (c *system) Teardown() {
   364  	c.Lock()
   365  	defer c.Unlock()
   366  
   367  	if c.lcp == nil {
   368  		return
   369  	}
   370  
   371  	// Clean up the binary directory.
   372  	defer os.RemoveAll(c.lcp.BinDir)
   373  
   374  	// Clean up log file handles.
   375  	defer c.lcp.EtcdLog.Close()
   376  	defer c.lcp.APIServerLog.Close()
   377  
   378  	// Proceed to shutdown.
   379  	c.teardownOnce.Do(func() {
   380  		c.cancel()
   381  		ch := make(chan struct{})
   382  		go func() {
   383  			c.wg.Wait()
   384  			logrus.Info("Shutting down local Cluster API control plane...")
   385  			if err := c.lcp.Stop(); err != nil {
   386  				logrus.Warnf("Failed to stop local Cluster API control plane: %v", err)
   387  			}
   388  			close(ch)
   389  		}()
   390  		select {
   391  		case <-ch:
   392  			logrus.Info("Local Cluster API system has completed operations")
   393  		case <-time.After(60 * time.Second):
   394  			logrus.Warn("Timed out waiting for local Cluster API system to shut down")
   395  		}
   396  
   397  		c.logWriter.Close()
   398  	})
   399  }
   400  
   401  // CleanEtcd removes the etcd database from the host.
   402  func (c *system) CleanEtcd() {
   403  	c.Lock()
   404  	defer c.Unlock()
   405  
   406  	if c.lcp == nil {
   407  		return
   408  	}
   409  
   410  	// Clean up the etcd directory.
   411  	if err := os.RemoveAll(c.lcp.EtcdDataDir); err != nil {
   412  		logrus.Warnf("Unable to delete local etcd data directory %s. It is safe to remove the directory manually", c.lcp.EtcdDataDir)
   413  	}
   414  }
   415  
   416  // State returns the state of the cluster-api system.
   417  func (c *system) State() SystemState {
   418  	c.Lock()
   419  	defer c.Unlock()
   420  
   421  	if c.lcp == nil {
   422  		return SystemStateStopped
   423  	}
   424  	return SystemStateRunning
   425  }
   426  
   427  // getInfrastructureController returns a controller for the given provider,
   428  // most of the configuration is by convention.
   429  //
   430  // The provider is expected to be compiled as part of the release process, and packaged in the binaries directory
   431  // and have the name `cluster-api-provider-<name>`.
   432  //
   433  // While the manifests can be optional, we expect them to be in the manifests directory and named `<name>-infrastructure-components.yaml`.
   434  func (c *system) getInfrastructureController(provider *Provider, args []string, env map[string]string) *controller {
   435  	manifests := []string{}
   436  	defaultManifestPath := filepath.Join(c.componentDir, fmt.Sprintf("/%s-infrastructure-components.yaml", provider.Name))
   437  	if _, err := os.Stat(defaultManifestPath); err == nil {
   438  		manifests = append(manifests, defaultManifestPath)
   439  	} else {
   440  		logrus.Infof("Failed to find manifests for provider %s at %s", provider.Name, defaultManifestPath)
   441  	}
   442  	return &controller{
   443  		Provider:   provider,
   444  		Name:       fmt.Sprintf("%s infrastructure provider", provider.Name),
   445  		Path:       fmt.Sprintf("%s/cluster-api-provider-%s", c.lcp.BinDir, provider.Name),
   446  		Components: manifests,
   447  		Args:       args,
   448  		Env:        env,
   449  	}
   450  }
   451  
// controller encapsulates the state of a controller, its process state, and its configuration.
type controller struct {
	// Provider, when non-nil, is extracted into the binaries directory before
	// the controller is started.
	Provider *Provider
	// state is the running process; it is set once the process has started.
	state *process.State

	// Name is the human-readable name used in log and error messages.
	Name string
	// Dir is passed to the process as its Dir.
	Dir string
	// Path is the path to the controller binary.
	Path string
	// Components lists the manifest files installed for this controller.
	Components []string
	// Args are the command-line arguments; each one is rendered as a Go
	// template before the process is started.
	Args []string
	// Env holds additional KEY=VALUE environment entries for the process.
	Env map[string]string
}
   464  
   465  // runController configures the controller, and waits for it to be ready.
   466  func (c *system) runController(ctx context.Context, ct *controller) error {
   467  	// If the provider is not empty, we extract it to the binaries directory.
   468  	if ct.Provider != nil {
   469  		if err := ct.Provider.Extract(c.lcp.BinDir); err != nil {
   470  			return fmt.Errorf("failed to extract provider %q: %w", ct.Name, err)
   471  		}
   472  	}
   473  
   474  	// Create the WebhookInstallOptions from envtest, and pass the manifests we've been given as input.
   475  	// Once built, we install them in the local control plane using the rest.Config available.
   476  	// Envtest takes care of a few things needed to run webhooks locally:
   477  	// - Creates a self-signed certificate for the webhook server.
   478  	// - Tries to allocate a host:port for the webhook server to listen on.
   479  	// - Modifies the webhook manifests to point to the local webhook server through a URL and a CABundle.
   480  	wh := envtest.WebhookInstallOptions{
   481  		Paths:                   ct.Components,
   482  		IgnoreSchemeConvertible: true,
   483  	}
   484  	if err := wh.Install(c.lcp.Cfg); err != nil {
   485  		return fmt.Errorf("failed to prepare controller %q webhook options: %w", ct.Name, err)
   486  	}
   487  
   488  	// Most providers allocate a host:port configuration for the health check,
   489  	// which responds to a simple http request on /healthz and /readyz.
   490  	// When an argument is configured to use the suggestHealthHostPort function,
   491  	// we record the value, so we can pass it to
   492  	var healthCheckHostPort string
   493  
   494  	// Build the arguments, using go templating to render the values.
   495  	{
   496  		funcs := template.FuncMap{
   497  			"suggestHealthHostPort": func() (string, error) {
   498  				healthPort, healthHost, err := addr.Suggest("")
   499  				if err != nil {
   500  					return "", fmt.Errorf("unable to grab random port: %w", err)
   501  				}
   502  				healthCheckHostPort = fmt.Sprintf("%s:%d", healthHost, healthPort)
   503  				return healthCheckHostPort, nil
   504  			},
   505  		}
   506  
   507  		templateData := map[string]string{
   508  			"WebhookPort":    fmt.Sprintf("%d", wh.LocalServingPort),
   509  			"WebhookCertDir": wh.LocalServingCertDir,
   510  			"KubeconfigPath": c.lcp.KubeconfigPath,
   511  		}
   512  
   513  		// We cannot override KUBECONFIG, e.g., in case the user supplies a callback that needs to access the cluster,
   514  		// such as via credential_process in the AWS config file. The kubeconfig path is set in the controller instead.
   515  		if ct.Provider == nil || ct.Provider.Name != "azureaso" {
   516  			ct.Args = append(ct.Args, "--kubeconfig={{.KubeconfigPath}}")
   517  		}
   518  
   519  		args := make([]string, 0, len(ct.Args))
   520  		for _, arg := range ct.Args {
   521  			final := new(bytes.Buffer)
   522  			tmpl := template.Must(template.New("arg").Funcs(funcs).Parse(arg))
   523  			if err := tmpl.Execute(final, templateData); err != nil {
   524  				return fmt.Errorf("failed to render controller %q arg %q: %w", ct.Name, arg, err)
   525  			}
   526  			args = append(args, strings.TrimSpace(final.String()))
   527  		}
   528  		ct.Args = args
   529  	}
   530  
   531  	// Build the environment variables.
   532  	env := []string{}
   533  	{
   534  		if ct.Env == nil {
   535  			ct.Env = map[string]string{}
   536  		}
   537  		// Override KUBECONFIG to point to the local control plane.
   538  		// azureaso doesn't support the --kubeconfig parameter.
   539  		if ct.Provider != nil && ct.Provider.Name == "azureaso" {
   540  			ct.Env["KUBECONFIG"] = c.lcp.KubeconfigPath
   541  		}
   542  		for key, value := range ct.Env {
   543  			env = append(env, fmt.Sprintf("%s=%s", key, value))
   544  		}
   545  	}
   546  
   547  	// Install the manifests for the controller, if any.
   548  	if len(ct.Components) > 0 {
   549  		opts := envtest.CRDInstallOptions{
   550  			Scheme:         c.lcp.Env.Scheme,
   551  			Paths:          ct.Components,
   552  			WebhookOptions: wh,
   553  		}
   554  		if _, err := envtest.InstallCRDs(c.lcp.Cfg, opts); err != nil {
   555  			return fmt.Errorf("failed to install controller %q manifests in local control plane: %w", ct.Name, err)
   556  		}
   557  	}
   558  
   559  	// Create the process state.
   560  	pr := &process.State{
   561  		Path:         ct.Path,
   562  		Args:         ct.Args,
   563  		Dir:          ct.Dir,
   564  		Env:          env,
   565  		StartTimeout: 60 * time.Second,
   566  		StopTimeout:  10 * time.Second,
   567  	}
   568  
   569  	// If the controller has a health check, we configure it, and wait for it to be ready.
   570  	if healthCheckHostPort != "" {
   571  		pr.HealthCheck = &process.HealthCheck{
   572  			URL: url.URL{
   573  				Scheme: "http",
   574  				Host:   healthCheckHostPort,
   575  				Path:   "/healthz",
   576  			},
   577  		}
   578  	}
   579  
   580  	// Initialize the process state.
   581  	if err := pr.Init(ct.Name); err != nil {
   582  		return fmt.Errorf("failed to initialize process state for controller %q: %w", ct.Name, err)
   583  	}
   584  
   585  	// Run the controller and store its state.
   586  	logrus.Infof("Running process: %s with args %v", ct.Name, ct.Args)
   587  	if err := pr.Start(ctx, c.logWriter, c.logWriter); err != nil {
   588  		return fmt.Errorf("failed to start controller %q: %w", ct.Name, err)
   589  	}
   590  	ct.state = pr
   591  	return nil
   592  }