github.com/openshift/installer@v1.4.17/cmd/openshift-install/create.go

package main

import (
	"context"
	"crypto/x509"
	"fmt"
	"net"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"github.com/spf13/cobra"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/clientcmd"
	clientwatch "k8s.io/client-go/tools/watch"

	configv1 "github.com/openshift/api/config/v1"
	operatorv1 "github.com/openshift/api/operator/v1"
	configclient "github.com/openshift/client-go/config/clientset/versioned"
	configinformers "github.com/openshift/client-go/config/informers/externalversions"
	configlisters "github.com/openshift/client-go/config/listers/config/v1"
	routeclient "github.com/openshift/client-go/route/clientset/versioned"
	"github.com/openshift/installer/cmd/openshift-install/command"
	"github.com/openshift/installer/pkg/asset"
	"github.com/openshift/installer/pkg/asset/agent/agentconfig"
	"github.com/openshift/installer/pkg/asset/cluster"
	"github.com/openshift/installer/pkg/asset/installconfig"
	"github.com/openshift/installer/pkg/asset/kubeconfig"
	"github.com/openshift/installer/pkg/asset/lbconfig"
	"github.com/openshift/installer/pkg/asset/logging"
	assetstore "github.com/openshift/installer/pkg/asset/store"
	targetassets "github.com/openshift/installer/pkg/asset/targets"
	destroybootstrap "github.com/openshift/installer/pkg/destroy/bootstrap"
	"github.com/openshift/installer/pkg/gather/service"
	timer "github.com/openshift/installer/pkg/metrics/timer"
	"github.com/openshift/installer/pkg/types/baremetal"
	"github.com/openshift/installer/pkg/types/gcp"
	"github.com/openshift/installer/pkg/types/vsphere"
	baremetalutils "github.com/openshift/installer/pkg/utils/baremetal"
	cov1helpers "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers"
	"github.com/openshift/library-go/pkg/route/routeapihelpers"
)

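// target couples a create subcommand with the human-readable name used in
// log output and the list of writable assets the subcommand generates.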
type target struct {
	name    string
	command *cobra.Command
	assets  []asset.WritableAsset
}

const (
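	// Exit codes for specific failure modes; iota+3 starts the numbering
	// at 3, leaving the lower exit codes for generic failures.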
	exitCodeInstallConfigError = iota + 3
	exitCodeInfrastructureFailed
	exitCodeBootstrapFailed
	exitCodeInstallFailed
	exitCodeOperatorStabilityFailed
	exitCodeInterrupt

	// coStabilityThreshold is how long a cluster operator must have Progressing=False
	// in order to be considered stable. Measured in seconds.
	coStabilityThreshold float64 = 30
)

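// skipPasswordPrintFlag is set by the --skip-password-print flag on
// "create cluster"; when true, the generated kubeadmin password is not printed.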
var skipPasswordPrintFlag bool

// Each target is a separate variable to preserve the order when creating
// subcommands while still allowing other functions to access each target
// directly.
var (
	installConfigTarget = target{
		name: "Install Config",
		command: &cobra.Command{
			Use:   "install-config",
			Short: "Generates the Install Config asset",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.InstallConfig,
	}

	manifestsTarget = target{
		name: "Manifests",
		command: &cobra.Command{
			Use:   "manifests",
			Short: "Generates the Kubernetes manifests",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.Manifests,
	}

	ignitionConfigsTarget = target{
		name: "Ignition Configs",
		command: &cobra.Command{
			Use:   "ignition-configs",
			Short: "Generates the Ignition Config asset",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.IgnitionConfigs,
	}

	singleNodeIgnitionConfigTarget = target{
		name: "Single Node Ignition Config",
		command: &cobra.Command{
			Use:   "single-node-ignition-config",
			Short: "Generates the bootstrap-in-place-for-live-iso Ignition Config asset",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.SingleNodeIgnitionConfig,
	}

	clusterTarget = target{
		name: "Cluster",
		command: &cobra.Command{
			Use:   "cluster",
			Short: "Create an OpenShift cluster",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
			PostRun: func(cmd *cobra.Command, _ []string) {
				// Get the context that was set in newCreateCmd.
				ctx := cmd.Context()

				exitCode, err := clusterCreatePostRun(ctx)
				if err != nil {
					logrus.Fatal(err)
				}
				if exitCode != 0 {
					logrus.Exit(exitCode)
				}
			},
		},
		assets: targetassets.Cluster,
	}

	targets = []target{installConfigTarget, manifestsTarget, ignitionConfigsTarget, clusterTarget, singleNodeIgnitionConfigTarget}
)

// clusterCreatePostRun is the main entrypoint for the cluster create command.
// It was moved out of the clusterTarget.command.PostRun function so that cleanup
// operations can always run in a defer statement, given that the function has
// multiple exit points, such as logrus.Fatal or logrus.Exit.
//
// Currently this function returns an exit code and an error. We should refactor
// it to return only an error, which can be wrapped if we want a custom exit code.
func clusterCreatePostRun(ctx context.Context) (int, error) {
	cleanup := command.SetupFileHook(command.RootOpts.Dir)
	defer cleanup()

	// FIXME: pulling the kubeconfig and metadata out of the root
	// directory is a bit kludgy when we already have them in memory.
	config, err := clientcmd.BuildConfigFromFlags("", filepath.Join(command.RootOpts.Dir, "auth", "kubeconfig"))
	if err != nil {
		return 0, errors.Wrap(err, "loading kubeconfig")
	}

	// Handle the case when the API server is not reachable.
	if err := handleUnreachableAPIServer(ctx, config); err != nil {
		logrus.Fatal(fmt.Errorf("unable to handle api server override: %w", err))
	}

	//
	// Wait for the bootstrap to complete.
	//
	timer.StartTimer("Bootstrap Complete")
	if err := waitForBootstrapComplete(ctx, config); err != nil {
		bundlePath, gatherErr := runGatherBootstrapCmd(ctx, command.RootOpts.Dir)
		if gatherErr != nil {
			logrus.Error("Attempted to gather debug logs after installation failure: ", gatherErr)
		}
		if err := logClusterOperatorConditions(ctx, config); err != nil {
			logrus.Error("Attempted to gather ClusterOperator status after installation failure: ", err)
		}
		logrus.Error("Bootstrap failed to complete: ", err.Unwrap())
		logrus.Error(err.Error())
		if gatherErr == nil {
			if err := service.AnalyzeGatherBundle(bundlePath); err != nil {
				logrus.Error("Attempted to analyze the debug logs after installation failure: ", err)
			}
			logrus.Infof("Bootstrap gather logs captured here %q", bundlePath)
		}
		return exitCodeBootstrapFailed, nil
	}
	timer.StopTimer("Bootstrap Complete")

	//
	// Wait for the bootstrap to be destroyed.
	//
	timer.StartTimer("Bootstrap Destroy")
	if oi, ok := os.LookupEnv("OPENSHIFT_INSTALL_PRESERVE_BOOTSTRAP"); ok && oi != "" {
		logrus.Warn("OPENSHIFT_INSTALL_PRESERVE_BOOTSTRAP is set, not destroying bootstrap resources. " +
			"Warning: this should only be used for debugging purposes, and poses a risk to cluster stability.")
	} else {
		logrus.Info("Destroying the bootstrap resources...")
		err = destroybootstrap.Destroy(ctx, command.RootOpts.Dir)
		if err != nil {
			return 0, err
		}
	}
	timer.StopTimer("Bootstrap Destroy")

	//
	// Wait for the cluster to initialize.
	//
	err = waitForInstallComplete(ctx, config, command.RootOpts.Dir)
	if err != nil {
		if err2 := logClusterOperatorConditions(ctx, config); err2 != nil {
			logrus.Error("Attempted to gather ClusterOperator status after installation failure: ", err2)
		}
		logTroubleshootingLink()
		logrus.Error(err)
		return exitCodeInstallFailed, nil
	}
	timer.StopTimer(timer.TotalTimeElapsed)
	timer.LogSummary()
	return 0, nil
}

// clusterCreateError defines a custom error type that helps identify where an
// error occurs during the bootstrap phase of the installation process: whether
// it comes from a Kubernetes API failure, a bootstrap failure, or a general
// kubernetes client creation error. In the event of any error, this type
// packages the underlying error and a custom log message that is neatly
// presented to the user before the process terminates.
type clusterCreateError struct {
	wrappedError error
	logMessage   string
}

// Unwrap provides the actual stored error that occurred during installation.
func (ce *clusterCreateError) Unwrap() error {
	return ce.wrappedError
}

// Error provides the custom log message that describes the stored error.
func (ce *clusterCreateError) Error() string {
	return ce.logMessage
}

// newAPIError creates a clusterCreateError object with a default error message specific to the API failure.
func newAPIError(errorInfo error) *clusterCreateError {
	return &clusterCreateError{
		wrappedError: errorInfo,
		logMessage: "Failed waiting for Kubernetes API. This error usually happens when there " +
			"is a problem on the bootstrap host that prevents creating a temporary control plane.",
	}
}

// newBootstrapError creates a clusterCreateError object with a default error message specific to the
// bootstrap failure.
func newBootstrapError(errorInfo error) *clusterCreateError {
	return &clusterCreateError{
		wrappedError: errorInfo,
		logMessage: "Failed to wait for bootstrapping to complete. This error usually " +
			"happens when there is a problem with control plane hosts that prevents " +
			"the control plane operators from creating the control plane.",
	}
}

// newClientError creates a clusterCreateError object with a default error message specific to the
// kubernetes client creation failure.
func newClientError(errorInfo error) *clusterCreateError {
	return &clusterCreateError{
		wrappedError: errorInfo,
		logMessage:   "Failed to create a kubernetes client.",
	}
}

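// newCreateCmd builds the "create" command and registers each target as a
// subcommand.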
func newCreateCmd(ctx context.Context) *cobra.Command {
	cmd := &cobra.Command{
		Use:   "create",
		Short: "Create part of an OpenShift cluster",
		RunE: func(cmd *cobra.Command, args []string) error {
			return cmd.Help()
		},
	}

	for _, t := range targets {
		t.command.Args = cobra.ExactArgs(0)
		t.command.Run = runTargetCmd(ctx, t.assets...)
		if t.name == "Cluster" {
			t.command.PersistentFlags().BoolVar(&skipPasswordPrintFlag, "skip-password-print", false, "Do not print the generated user password.")
		}
		cmd.AddCommand(t.command)
	}

	return cmd
}

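// runTargetCmd returns the cobra Run function for a target: it fetches and
// persists the target's assets, mapping well-known asset errors to dedicated
// exit codes.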
func runTargetCmd(ctx context.Context, targets ...asset.WritableAsset) func(cmd *cobra.Command, args []string) {
	runner := func(directory string) error {
		fetcher := assetstore.NewAssetsFetcher(directory)
		return fetcher.FetchAndPersist(ctx, targets)
	}

	return func(cmd *cobra.Command, args []string) {
		timer.StartTimer(timer.TotalTimeElapsed)

		// Set the context to be used in the PostRun function.
		cmd.SetContext(ctx)

		cleanup := command.SetupFileHook(command.RootOpts.Dir)
		defer cleanup()

		cluster.InstallDir = command.RootOpts.Dir

		err := runner(command.RootOpts.Dir)
		if err != nil {
			if strings.Contains(err.Error(), asset.InstallConfigError) {
				logrus.Error(err)
				logrus.Exit(exitCodeInstallConfigError)
			}
			if strings.Contains(err.Error(), asset.ClusterCreationError) {
				logrus.Error(err)
				logrus.Exit(exitCodeInfrastructureFailed)
			}
			logrus.Fatal(err)
		}
		switch cmd.Name() {
		case "cluster", "image", "pxe-files":
		default:
			logrus.Infof(logging.LogCreatedFiles(cmd.Name(), command.RootOpts.Dir, targets))
		}
	}
}

// addRouterCAToClusterCA appends the router CA to the cluster CA in the kubeconfig.
func addRouterCAToClusterCA(ctx context.Context, config *rest.Config, directory string) (err error) {
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		return errors.Wrap(err, "creating a Kubernetes client")
	}

	// Fetch the CA bundle for the default ingress controller. The configmap may
	// not exist yet; any error, including not-found, is returned to the caller.
	caConfigMap, err := client.CoreV1().ConfigMaps("openshift-config-managed").Get(ctx, "default-ingress-cert", metav1.GetOptions{})
	if err != nil {
		return errors.Wrap(err, "fetching default-ingress-cert configmap from openshift-config-managed namespace")
	}

	routerCrtBytes := []byte(caConfigMap.Data["ca-bundle.crt"])
	kubeconfig := filepath.Join(directory, "auth", "kubeconfig")
	kconfig, err := clientcmd.LoadFromFile(kubeconfig)
	if err != nil {
		return errors.Wrap(err, "loading kubeconfig")
	}

	if kconfig == nil || len(kconfig.Clusters) == 0 {
		return errors.New("kubeconfig is missing expected data")
	}

	for _, c := range kconfig.Clusters {
		clusterCABytes := c.CertificateAuthorityData
		if len(clusterCABytes) == 0 {
			return errors.New("kubeconfig CertificateAuthorityData not found")
		}
		certPool := x509.NewCertPool()
		if !certPool.AppendCertsFromPEM(clusterCABytes) {
			return errors.New("cluster CA found in kubeconfig not valid PEM format")
		}
		if !certPool.AppendCertsFromPEM(routerCrtBytes) {
			return errors.New("ca-bundle.crt from default-ingress-cert configmap not valid PEM format")
		}

		newCA := append(routerCrtBytes, clusterCABytes...)
		c.CertificateAuthorityData = newCA
	}
	if err := clientcmd.WriteToFile(*kconfig, kubeconfig); err != nil {
		return errors.Wrap(err, "writing kubeconfig")
	}
	return nil
}

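// waitForBootstrapComplete waits for the Kubernetes API to become available
// and then for bootstrapping to finish, wrapping any failure in a
// clusterCreateError that identifies the phase that failed.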
func waitForBootstrapComplete(ctx context.Context, config *rest.Config) *clusterCreateError {
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		return newClientError(errors.Wrap(err, "creating a Kubernetes client"))
	}

	discovery := client.Discovery()

	apiTimeout := 20 * time.Minute

	untilTime := time.Now().Add(apiTimeout)
	timezone, _ := untilTime.Zone()
	logrus.Infof("Waiting up to %v (until %v %s) for the Kubernetes API at %s...",
		apiTimeout, untilTime.Format(time.Kitchen), timezone, config.Host)

	apiContext, cancel := context.WithTimeout(ctx, apiTimeout)
	defer cancel()
	// Poll quickly so we notice changes, but only log when the response
	// changes (because that's interesting) or when we've seen 15 of the
	// same errors in a row (to show we're still alive).
	logDownsample := 15
	silenceRemaining := logDownsample
	previousErrorSuffix := ""
	timer.StartTimer("API")

	if assetStore, err := assetstore.NewStore(command.RootOpts.Dir); err == nil {
		checkIfAgentCommand(assetStore)
	}

	var lastErr error
	err = wait.PollUntilContextCancel(apiContext, 2*time.Second, true, func(_ context.Context) (done bool, err error) {
		version, err := discovery.ServerVersion()
		if err == nil {
			logrus.Infof("API %s up", version)
			timer.StopTimer("API")
			return true, nil
		}

		lastErr = err
		silenceRemaining--
		chunks := strings.Split(err.Error(), ":")
		errorSuffix := chunks[len(chunks)-1]
		if previousErrorSuffix != errorSuffix {
			logrus.Debugf("Still waiting for the Kubernetes API: %v", err)
			previousErrorSuffix = errorSuffix
			silenceRemaining = logDownsample
		} else if silenceRemaining == 0 {
			logrus.Debugf("Still waiting for the Kubernetes API: %v", err)
			silenceRemaining = logDownsample
		}

		return false, nil
	})
	if err != nil {
		if lastErr != nil {
			return newAPIError(lastErr)
		}
		return newAPIError(err)
	}

	var platformName string

	if assetStore, err := assetstore.NewStore(command.RootOpts.Dir); err == nil {
		if installConfig, err := assetStore.Load(&installconfig.InstallConfig{}); err == nil && installConfig != nil {
			platformName = installConfig.(*installconfig.InstallConfig).Config.Platform.Name()
		}
	}

	timeout := 45 * time.Minute

	// Wait longer for baremetal and vSphere due to the length of time it takes to boot.
	if platformName == baremetal.Name || platformName == vsphere.Name {
		timeout = 60 * time.Minute
	}

	untilTime = time.Now().Add(timeout)
	timezone, _ = untilTime.Zone()
	logrus.Infof("Waiting up to %v (until %v %s) for bootstrapping to complete...",
		timeout, untilTime.Format(time.Kitchen), timezone)

	waitCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	if platformName == baremetal.Name {
		if err := baremetalutils.WaitForBaremetalBootstrapControlPlane(waitCtx, config, command.RootOpts.Dir); err != nil {
			return newBootstrapError(err)
		}
		logrus.Infof("  Baremetal control plane finished provisioning.")
	}

	if err := waitForBootstrapConfigMap(waitCtx, client); err != nil {
		return err
	}

	if err := waitForStableSNOBootstrap(ctx, config); err != nil {
		return newBootstrapError(err)
	}

	return nil
}

// waitForBootstrapConfigMap watches the configmaps in the kube-system namespace
// and waits for the bootstrap configmap to report that bootstrapping has
// completed.
func waitForBootstrapConfigMap(ctx context.Context, client *kubernetes.Clientset) *clusterCreateError {
	_, err := clientwatch.UntilWithSync(
		ctx,
		cache.NewListWatchFromClient(client.CoreV1().RESTClient(), "configmaps", "kube-system", fields.OneTermEqualSelector("metadata.name", "bootstrap")),
		&corev1.ConfigMap{},
		nil,
		func(event watch.Event) (bool, error) {
			switch event.Type {
			case watch.Added, watch.Modified:
			default:
				return false, nil
			}
			cm, ok := event.Object.(*corev1.ConfigMap)
			if !ok {
				logrus.Warnf("Expected a core/v1.ConfigMap object but got a %q object instead", event.Object.GetObjectKind().GroupVersionKind())
				return false, nil
			}
			status, ok := cm.Data["status"]
			if !ok {
				logrus.Debugf("No status found in bootstrap configmap")
				return false, nil
			}
			logrus.Debugf("Bootstrap status: %v", status)
			return status == "complete", nil
		},
	)
	if err != nil {
		return newBootstrapError(err)
	}
	return nil
}

// waitForStableSNOBootstrap makes sure that, when bootstrapping single-node
// (SNO) deployments, the bootstrap node is not removed prematurely. Given the
// nature of single node, we just need to make sure components such as etcd
// are in the proper state before continuing.
func waitForStableSNOBootstrap(ctx context.Context, config *rest.Config) error {
	timeout := 5 * time.Minute

	// If we're not in a single node deployment, bail early
	if isSNO, err := IsSingleNode(); err != nil {
		logrus.Warningf("Cannot determine if installing a single node cluster, continuing as a normal install: %v", err)
		return nil
	} else if !isSNO {
		return nil
	}

	snoBootstrapContext, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	untilTime := time.Now().Add(timeout)
	timezone, _ := untilTime.Zone()
	logrus.Info("Detected Single Node deployment")
	logrus.Infof("Waiting up to %v (until %v %s) for the bootstrap etcd member to be removed...",
		timeout, untilTime.Format(time.Kitchen), timezone)

	client, err := dynamic.NewForConfig(config)
	if err != nil {
		return fmt.Errorf("error creating dynamic client: %w", err)
	}
	gvr := schema.GroupVersionResource{
		Group:    operatorv1.SchemeGroupVersion.Group,
		Version:  operatorv1.SchemeGroupVersion.Version,
		Resource: "etcds",
	}
	resourceClient := client.Resource(gvr)
	// Validate that the etcd operator has removed the bootstrap etcd member
	return wait.PollUntilContextCancel(snoBootstrapContext, 1*time.Second, true, func(ctx context.Context) (done bool, err error) {
		etcdOperator := &operatorv1.Etcd{}
		etcdUnstructured, err := resourceClient.Get(ctx, "cluster", metav1.GetOptions{})
		if err != nil {
			// There might be service disruptions in SNO; we log those here but keep trying within the time limit
			logrus.Debugf("Error getting etcd cluster resource, retrying: %v", err)
			return false, nil
		}
		err = runtime.DefaultUnstructuredConverter.FromUnstructured(etcdUnstructured.Object, etcdOperator)
		if err != nil {
			// This error should not happen; if it does, we log the error and keep retrying until we hit the limit
			logrus.Debugf("Error parsing etcds resource, retrying: %v", err)
			return false, nil
		}
		for _, condition := range etcdOperator.Status.Conditions {
			if condition.Type == "EtcdBootstrapMemberRemoved" {
				return configv1.ConditionStatus(condition.Status) == configv1.ConditionTrue, nil
			}
		}
		return false, nil
	})
}

// waitForInitializedCluster watches the ClusterVersion waiting for confirmation
// that the cluster has been initialized.
func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
	// TODO revert this value back to 30 minutes.  It's currently at the end of 4.6 and we're trying to see if the
	timeout := 40 * time.Minute

	// Wait longer for baremetal, due to the length of time it takes to boot
	if assetStore, err := assetstore.NewStore(command.RootOpts.Dir); err == nil {
		if installConfig, err := assetStore.Load(&installconfig.InstallConfig{}); err == nil && installConfig != nil {
			if installConfig.(*installconfig.InstallConfig).Config.Platform.Name() == baremetal.Name {
				timeout = 60 * time.Minute
			}
		}

		checkIfAgentCommand(assetStore)
	}

	untilTime := time.Now().Add(timeout)
	timezone, _ := untilTime.Zone()
	logrus.Infof("Waiting up to %v (until %v %s) for the cluster at %s to initialize...",
		timeout, untilTime.Format(time.Kitchen), timezone, config.Host)
	cc, err := configclient.NewForConfig(config)
	if err != nil {
		return errors.Wrap(err, "failed to create a config client")
	}
	clusterVersionContext, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	failing := configv1.ClusterStatusConditionType("Failing")
	timer.StartTimer("Cluster Operators Available")
	var lastError string
	_, err = clientwatch.UntilWithSync(
		clusterVersionContext,
		cache.NewListWatchFromClient(cc.ConfigV1().RESTClient(), "clusterversions", "", fields.OneTermEqualSelector("metadata.name", "version")),
		&configv1.ClusterVersion{},
		nil,
		func(event watch.Event) (bool, error) {
			switch event.Type {
			case watch.Added, watch.Modified:
				cv, ok := event.Object.(*configv1.ClusterVersion)
				if !ok {
					logrus.Warnf("Expected a ClusterVersion object but got a %q object instead", event.Object.GetObjectKind().GroupVersionKind())
					return false, nil
				}
				if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, configv1.OperatorAvailable) &&
					cov1helpers.IsStatusConditionFalse(cv.Status.Conditions, failing) &&
					cov1helpers.IsStatusConditionFalse(cv.Status.Conditions, configv1.OperatorProgressing) {
					timer.StopTimer("Cluster Operators Available")
					return true, nil
				}
				if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, failing) {
					lastError = cov1helpers.FindStatusCondition(cv.Status.Conditions, failing).Message
				} else if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, configv1.OperatorProgressing) {
					lastError = cov1helpers.FindStatusCondition(cv.Status.Conditions, configv1.OperatorProgressing).Message
				}
				logrus.Debugf("Still waiting for the cluster to initialize: %s", lastError)
				return false, nil
			}
			logrus.Debug("Still waiting for the cluster to initialize...")
			return false, nil
		},
	)

	if err == nil {
		logrus.Debug("Cluster is initialized")
		return nil
	}

	if lastError != "" {
		if err == wait.ErrWaitTimeout {
			return errors.Errorf("failed to initialize the cluster: %s", lastError)
		}

		return errors.Wrapf(err, "failed to initialize the cluster: %s", lastError)
	}

	return errors.Wrap(err, "failed to initialize the cluster")
}

// waitForStableOperators ensures that each cluster operator is "stable", i.e. the
// operator has not been in a progressing state for at least a certain duration,
// 30 seconds by default. Exits with an error if any operator does not meet this
// threshold after a deadline, 30 minutes by default.
func waitForStableOperators(ctx context.Context, config *rest.Config) error {
	timer.StartTimer("Cluster Operators Stable")

	stabilityCheckDuration := 30 * time.Minute
	stabilityContext, cancel := context.WithTimeout(ctx, stabilityCheckDuration)
	defer cancel()

	untilTime := time.Now().Add(stabilityCheckDuration)
	timezone, _ := untilTime.Zone()
	logrus.Infof("Waiting up to %v (until %v %s) to ensure each cluster operator has finished progressing...",
		stabilityCheckDuration, untilTime.Format(time.Kitchen), timezone)

	cc, err := configclient.NewForConfig(config)
	if err != nil {
		return errors.Wrap(err, "failed to create a config client")
	}
	configInformers := configinformers.NewSharedInformerFactory(cc, 0)
	clusterOperatorInformer := configInformers.Config().V1().ClusterOperators().Informer()
	clusterOperatorLister := configInformers.Config().V1().ClusterOperators().Lister()
	configInformers.Start(ctx.Done())
	if !cache.WaitForCacheSync(ctx.Done(), clusterOperatorInformer.HasSynced) {
		return fmt.Errorf("informers never started")
	}

	waitErr := wait.PollUntilContextCancel(stabilityContext, 1*time.Second, true, waitForAllClusterOperators(clusterOperatorLister))
	if waitErr != nil {
		logrus.Errorf("Error checking cluster operator Progressing status: %q", waitErr)
		stableOperators, unstableOperators, err := currentOperatorStability(clusterOperatorLister)
		if err != nil {
			logrus.Errorf("Error checking final cluster operator Progressing status: %q", err)
		}
		logrus.Debugf("These cluster operators were stable: [%s]", strings.Join(sets.List(stableOperators), ", "))
		logrus.Errorf("These cluster operators were not stable: [%s]", strings.Join(sets.List(unstableOperators), ", "))

		logrus.Exit(exitCodeOperatorStabilityFailed)
	}

	timer.StopTimer("Cluster Operators Stable")

	logrus.Info("All cluster operators have completed progressing")

	return nil
}

// getConsole returns the console URL from the route 'console' in namespace openshift-console
func getConsole(ctx context.Context, config *rest.Config) (string, error) {
	url := ""
	// Need to keep these updated if they change
	consoleNamespace := "openshift-console"
	consoleRouteName := "console"
	rc, err := routeclient.NewForConfig(config)
	if err != nil {
		return "", errors.Wrap(err, "creating a route client")
	}

	consoleRouteTimeout := 2 * time.Minute
	logrus.Infof("Checking to see if there is a route at %s/%s...", consoleNamespace, consoleRouteName)
	consoleRouteContext, cancel := context.WithTimeout(ctx, consoleRouteTimeout)
	defer cancel()
	// Poll quickly, but only log when we've seen 15 of the same errors or
	// "no route" responses in a row (to show we're still alive).
	logDownsample := 15
	silenceRemaining := logDownsample
	timer.StartTimer("Console")
	wait.Until(func() {
		route, err := rc.RouteV1().Routes(consoleNamespace).Get(ctx, consoleRouteName, metav1.GetOptions{})
		if err == nil {
			logrus.Debugf("Route found in openshift-console namespace: %s", consoleRouteName)
			if uri, _, err2 := routeapihelpers.IngressURI(route, ""); err2 == nil {
				url = uri.String()
				logrus.Debug("OpenShift console route is admitted")
				cancel()
			} else {
				err = err2
			}
		} else if apierrors.IsNotFound(err) {
			logrus.Debug("OpenShift console route does not exist")
			cancel()
		}

		if err != nil {
			silenceRemaining--
			if silenceRemaining == 0 {
				logrus.Debugf("Still waiting for the console route: %v", err)
				silenceRemaining = logDownsample
			}
		}
	}, 2*time.Second, consoleRouteContext.Done())
	err = consoleRouteContext.Err()
	if err != nil && err != context.Canceled {
		return url, errors.Wrap(err, "waiting for openshift-console URL")
	}
	if url == "" {
		return url, errors.New("could not get openshift-console URL")
	}
	timer.StopTimer("Console")
	return url, nil
}

// logComplete prints info upon completion
func logComplete(directory, consoleURL string) error {
	absDir, err := filepath.Abs(directory)
	if err != nil {
		return err
	}
	kubeconfig := filepath.Join(absDir, "auth", "kubeconfig")
	pwFile := filepath.Join(absDir, "auth", "kubeadmin-password")
	pw, err := os.ReadFile(pwFile)
	if err != nil {
		return err
	}
	logrus.Info("Install complete!")
	logrus.Infof("To access the cluster as the system:admin user when using 'oc', run 'export KUBECONFIG=%s'", kubeconfig)
	if consoleURL != "" {
		logrus.Infof("Access the OpenShift web-console here: %s", consoleURL)
		if skipPasswordPrintFlag {
			logrus.Infof("Credentials omitted; if necessary, verify the %s file", pwFile)
		} else {
			logrus.Infof("Login to the console with user: %q, and password: %q", "kubeadmin", pw)
		}
	}
	return nil
}

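// waitForInstallComplete waits for the cluster to initialize, adds the router
// CA to the kubeconfig, waits for the cluster operators to stabilize, and then
// logs the final install summary.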
func waitForInstallComplete(ctx context.Context, config *rest.Config, directory string) error {
	if err := waitForInitializedCluster(ctx, config); err != nil {
		return err
	}

	if err := addRouterCAToClusterCA(ctx, config, command.RootOpts.Dir); err != nil {
		return err
	}

	if err := waitForStableOperators(ctx, config); err != nil {
		return err
	}

	consoleURL, err := getConsole(ctx, config)
	if err != nil {
		logrus.Warnf("Cluster does not have a console available: %v", err)
	}

	return logComplete(command.RootOpts.Dir, consoleURL)
}

func logTroubleshootingLink() {
	logrus.Error(`Cluster initialization failed because one or more operators are not functioning properly.
The cluster should be accessible for troubleshooting as detailed in the documentation linked below,
https://docs.openshift.com/container-platform/latest/support/troubleshooting/troubleshooting-installations.html
The 'wait-for install-complete' subcommand can then be used to continue the installation`)
}

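// checkIfAgentCommand warns when the asset store contains an agent
// configuration, since agent-based installs should use the dedicated agent
// wait-for commands instead of this one.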
func checkIfAgentCommand(assetStore asset.Store) {
	if agentConfig, err := assetStore.Load(&agentconfig.AgentConfig{}); err == nil && agentConfig != nil {
		logrus.Warning("An agent configuration was detected but this command is not the agent wait-for command")
	}
}

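// waitForAllClusterOperators returns a poll condition that reports true once
// every cluster operator is stable, logging each operator as it transitions
// between stable and unstable.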
func waitForAllClusterOperators(clusterOperatorLister configlisters.ClusterOperatorLister) func(ctx context.Context) (bool, error) {
	previouslyStableOperators := sets.Set[string]{}

	return func(ctx context.Context) (bool, error) {
		stableOperators, unstableOperators, err := currentOperatorStability(clusterOperatorLister)
		if err != nil {
			return false, err
		}
		if newlyStableOperators := stableOperators.Difference(previouslyStableOperators); len(newlyStableOperators) > 0 {
			for _, name := range sets.List(newlyStableOperators) {
				logrus.Debugf("Cluster Operator %s is stable", name)
			}
		}
		if newlyUnstableOperators := previouslyStableOperators.Difference(stableOperators); len(newlyUnstableOperators) > 0 {
			for _, name := range sets.List(newlyUnstableOperators) {
				logrus.Debugf("Cluster Operator %s became unstable", name)
			}
		}
		previouslyStableOperators = stableOperators

		if len(unstableOperators) == 0 {
			return true, nil
		}

		return false, nil
	}
}

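// currentOperatorStability partitions the cluster operators into stable and
// unstable sets based on each operator's Progressing condition.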
func currentOperatorStability(clusterOperatorLister configlisters.ClusterOperatorLister) (sets.Set[string], sets.Set[string], error) {
	clusterOperators, err := clusterOperatorLister.List(labels.Everything())
	if err != nil {
		return nil, nil, err // lister should never fail
	}

	stableOperators := sets.Set[string]{}
	unstableOperators := sets.Set[string]{}
	for _, clusterOperator := range clusterOperators {
		name := clusterOperator.Name
		progressing := cov1helpers.FindStatusCondition(clusterOperator.Status.Conditions, configv1.OperatorProgressing)
		if progressing == nil {
			logrus.Debugf("Cluster Operator %s progressing == nil", name)
			unstableOperators.Insert(name)
			continue
		}
		if meetsStabilityThreshold(progressing) {
			stableOperators.Insert(name)
		} else {
			logrus.Debugf("Cluster Operator %s is Progressing=%s LastTransitionTime=%v DurationSinceTransition=%.fs Reason=%s Message=%s", name, progressing.Status, progressing.LastTransitionTime.Time, time.Since(progressing.LastTransitionTime.Time).Seconds(), progressing.Reason, progressing.Message)
			unstableOperators.Insert(name)
		}
	}

	return stableOperators, unstableOperators, nil
}

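// meetsStabilityThreshold reports whether the operator has had
// Progressing=False for longer than coStabilityThreshold.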
func meetsStabilityThreshold(progressing *configv1.ClusterOperatorStatusCondition) bool {
	return progressing.Status == configv1.ConditionFalse && time.Since(progressing.LastTransitionTime.Time).Seconds() > coStabilityThreshold
}

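// handleUnreachableAPIServer handles the case where the API server is not
// reachable through DNS (currently only GCP with user-provisioned DNS) by
// dialing the API server through the load balancer IP address recorded in
// the lbconfig asset.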
func handleUnreachableAPIServer(ctx context.Context, config *rest.Config) error {
	assetStore, err := assetstore.NewStore(command.RootOpts.Dir)
	if err != nil {
		return fmt.Errorf("failed to create asset store: %w", err)
	}

	// Ensure that the install is expecting the user to provision their own DNS solution.
	installConfig := &installconfig.InstallConfig{}
	if err := assetStore.Fetch(ctx, installConfig); err != nil {
		return fmt.Errorf("failed to fetch %s: %w", installConfig.Name(), err)
	}
	switch installConfig.Config.Platform.Name() { //nolint:gocritic
	case gcp.Name:
		if installConfig.Config.GCP.UserProvisionedDNS != gcp.UserProvisionedDNSEnabled {
			return nil
		}
	default:
		return nil
	}

	lbConfig := &lbconfig.Config{}
	if err := assetStore.Fetch(ctx, lbConfig); err != nil {
		return fmt.Errorf("failed to fetch %s: %w", lbConfig.Name(), err)
	}

	_, ipAddrs, err := lbConfig.ParseDNSDataFromConfig(lbconfig.PublicLoadBalancer)
	if err != nil {
		return fmt.Errorf("failed to parse lbconfig: %w", err)
	}

	// The kubeconfig handles a single IP address.
	ipAddr := ""
	if len(ipAddrs) > 0 {
		ipAddr = ipAddrs[0].String()
	}
	if ipAddr == "" {
		return fmt.Errorf("no ip address found in lbconfig")
	}

	dialer := &net.Dialer{
		Timeout:   1 * time.Minute,
		KeepAlive: 1 * time.Minute,
	}
	config.Dial = kubeconfig.CreateDialContext(dialer, ipAddr)

	// The asset is currently saved in <install-dir>/openshift. That directory
	// was consumed during the install, but this file is generated after that
	// action, so the artifact will hang around unless it is purged here.
	if err := asset.DeleteAssetFromDisk(lbConfig, command.RootOpts.Dir); err != nil {
		return fmt.Errorf("failed to delete %s from disk", lbConfig.Name())
	}

	return nil
}

// IsSingleNode determines if we are in a single node configuration based on
// the install config loaded from the asset store.
func IsSingleNode() (bool, error) {
	assetStore, err := assetstore.NewStore(command.RootOpts.Dir)
	if err != nil {
		return false, fmt.Errorf("error loading asset store: %w", err)
	}
	installConfig, err := assetStore.Load(&installconfig.InstallConfig{})
	if err != nil {
		return false, fmt.Errorf("error loading installConfig: %w", err)
	}
	if installConfig == nil {
		return false, fmt.Errorf("installConfig loaded from asset store was nil")
	}

	config := installConfig.(*installconfig.InstallConfig).Config
	// Guard against a nil Replicas pointer before dereferencing it.
	if machinePool := config.ControlPlane; machinePool != nil && machinePool.Replicas != nil {
		return *machinePool.Replicas == int64(1), nil
	}
	return false, nil
}