
     1  package main
     3  import (
     4  	"context"
     5  	"crypto/x509"
     6  	"fmt"
     7  	"net"
     8  	"os"
     9  	"path/filepath"
    10  	"strings"
    11  	"time"
    13  	""
    14  	""
    15  	""
    16  	corev1 ""
    17  	apierrors ""
    18  	metav1 ""
    19  	""
    20  	""
    21  	""
    22  	""
    23  	""
    24  	""
    25  	""
    26  	""
    27  	""
    28  	""
    29  	""
    30  	""
    31  	clientwatch ""
    33  	configv1 ""
    34  	operatorv1 ""
    35  	configclient ""
    36  	configinformers ""
    37  	configlisters ""
    38  	routeclient ""
    39  	""
    40  	""
    41  	""
    42  	""
    43  	""
    44  	""
    45  	""
    46  	""
    47  	assetstore ""
    48  	targetassets ""
    49  	destroybootstrap ""
    50  	""
    51  	timer ""
    52  	""
    53  	""
    54  	""
    55  	baremetalutils ""
    56  	cov1helpers ""
    57  	""
    58  )
// target describes one "create" subcommand: its display name, the cobra
// command that exposes it, and the assets passed to runTargetCmd for it.
type target struct {
	// name is the human-readable name of the target (for example "Install Config").
	name    string
	// command is the cobra command registered under "create" by newCreateCmd.
	command *cobra.Command
	// assets is the list of writable assets handed to runTargetCmd for this target.
	assets  []asset.WritableAsset
}
// Exit codes used by the create commands. The first value is 3 (iota + 3) and
// each following exit code increases by one in declaration order.
const (
	// exitCodeInstallConfigError is used by runTargetCmd when the error text
	// contains asset.InstallConfigError.
	exitCodeInstallConfigError = iota + 3
	// exitCodeInfrastructureFailed is used by runTargetCmd when the error text
	// contains asset.ClusterCreationError.
	exitCodeInfrastructureFailed
	// exitCodeBootstrapFailed is returned by clusterCreatePostRun when waiting
	// for bootstrap completion fails.
	exitCodeBootstrapFailed
	// exitCodeInstallFailed is returned by clusterCreatePostRun when waiting
	// for install completion fails.
	exitCodeInstallFailed
	// exitCodeOperatorStabilityFailed is used by waitForStableOperators when
	// the operator stability poll fails.
	exitCodeOperatorStabilityFailed
	// exitCodeInterrupt is not referenced in this part of the file; presumably
	// it is used when the process is interrupted — confirm against its users.
	exitCodeInterrupt

	// coStabilityThreshold is how long a cluster operator must have Progressing=False
	// in order to be considered stable. Measured in seconds.
	coStabilityThreshold float64 = 30
)
// skipPasswordPrintFlag backs the --skip-password-print flag registered in
// newCreateCmd. When true, logComplete logs the path of the password file
// instead of the password itself.
var skipPasswordPrintFlag bool
// each target is a variable to preserve the order when creating subcommands and still
// allow other functions to directly access each target individually.
var (
	// installConfigTarget is the "create install-config" subcommand.
	installConfigTarget = target{
		name: "Install Config",
		command: &cobra.Command{
			Use:   "install-config",
			Short: "Generates the Install Config asset",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.InstallConfig,
	}

	// manifestsTarget is the "create manifests" subcommand.
	manifestsTarget = target{
		name: "Manifests",
		command: &cobra.Command{
			Use:   "manifests",
			Short: "Generates the Kubernetes manifests",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.Manifests,
	}

	// ignitionConfigsTarget is the "create ignition-configs" subcommand.
	ignitionConfigsTarget = target{
		name: "Ignition Configs",
		command: &cobra.Command{
			Use:   "ignition-configs",
			Short: "Generates the Ignition Config asset",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.IgnitionConfigs,
	}

	// singleNodeIgnitionConfigTarget is the "create single-node-ignition-config" subcommand.
	singleNodeIgnitionConfigTarget = target{
		name: "Single Node Ignition Config",
		command: &cobra.Command{
			Use:   "single-node-ignition-config",
			Short: "Generates the bootstrap-in-place-for-live-iso Ignition Config asset",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.SingleNodeIgnitionConfig,
	}

	// clusterTarget is the "create cluster" subcommand. Unlike the other
	// targets it defines a PostRun hook, which delegates to clusterCreatePostRun.
	clusterTarget = target{
		name: "Cluster",
		command: &cobra.Command{
			Use:   "cluster",
			Short: "Create an OpenShift cluster",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
			PostRun: func(cmd *cobra.Command, _ []string) {
				// Get the context that was set in newCreateCmd.
				ctx := cmd.Context()

				// clusterCreatePostRun reports failures either as an error
				// (logged fatally here) or as a non-zero exit code.
				exitCode, err := clusterCreatePostRun(ctx)
				if err != nil {
					logrus.Fatal(err)
				}
				if exitCode != 0 {
					logrus.Exit(exitCode)
				}
			},
		},
		assets: targetassets.Cluster,
	}

	// targets lists every create target in the order newCreateCmd registers them.
	targets = []target{installConfigTarget, manifestsTarget, ignitionConfigsTarget, clusterTarget, singleNodeIgnitionConfigTarget}
)
   154  // clusterCreatePostRun is the main entrypoint for the cluster create command
   155  // it was moved out of the clusterTarget.command.PostRun function to allow cleanup operations to always
   156  // run in a defer statement, given that we had multiple exit points in the function, like logrus.Fatal or logrus.Exit.
   157  //
   158  // Currently this function returns an exit code and an error, we should refactor this to only return an error,
   159  // that can be wrapped if we want a custom exit code.
   160  func clusterCreatePostRun(ctx context.Context) (int, error) {
   161  	cleanup := command.SetupFileHook(command.RootOpts.Dir)
   162  	defer cleanup()
   164  	// FIXME: pulling the kubeconfig and metadata out of the root
   165  	// directory is a bit cludgy when we already have them in memory.
   166  	config, err := clientcmd.BuildConfigFromFlags("", filepath.Join(command.RootOpts.Dir, "auth", "kubeconfig"))
   167  	if err != nil {
   168  		return 0, errors.Wrap(err, "loading kubeconfig")
   169  	}
   171  	// Handle the case when the API server is not reachable.
   172  	if err := handleUnreachableAPIServer(ctx, config); err != nil {
   173  		logrus.Fatal(fmt.Errorf("unable to handle api server override: %w", err))
   174  	}
   176  	//
   177  	// Wait for the bootstrap to complete.
   178  	//
   179  	timer.StartTimer("Bootstrap Complete")
   180  	if err := waitForBootstrapComplete(ctx, config); err != nil {
   181  		bundlePath, gatherErr := runGatherBootstrapCmd(ctx, command.RootOpts.Dir)
   182  		if gatherErr != nil {
   183  			logrus.Error("Attempted to gather debug logs after installation failure: ", gatherErr)
   184  		}
   185  		if err := logClusterOperatorConditions(ctx, config); err != nil {
   186  			logrus.Error("Attempted to gather ClusterOperator status after installation failure: ", err)
   187  		}
   188  		logrus.Error("Bootstrap failed to complete: ", err.Unwrap())
   189  		logrus.Error(err.Error())
   190  		if gatherErr == nil {
   191  			if err := service.AnalyzeGatherBundle(bundlePath); err != nil {
   192  				logrus.Error("Attempted to analyze the debug logs after installation failure: ", err)
   193  			}
   194  			logrus.Infof("Bootstrap gather logs captured here %q", bundlePath)
   195  		}
   196  		return exitCodeBootstrapFailed, nil
   197  	}
   198  	timer.StopTimer("Bootstrap Complete")
   200  	//
   201  	// Wait for the bootstrap to be destroyed.
   202  	//
   203  	timer.StartTimer("Bootstrap Destroy")
   204  	if oi, ok := os.LookupEnv("OPENSHIFT_INSTALL_PRESERVE_BOOTSTRAP"); ok && oi != "" {
   205  		logrus.Warn("OPENSHIFT_INSTALL_PRESERVE_BOOTSTRAP is set, not destroying bootstrap resources. " +
   206  			"Warning: this should only be used for debugging purposes, and poses a risk to cluster stability.")
   207  	} else {
   208  		logrus.Info("Destroying the bootstrap resources...")
   209  		err = destroybootstrap.Destroy(ctx, command.RootOpts.Dir)
   210  		if err != nil {
   211  			return 0, err
   212  		}
   213  	}
   214  	timer.StopTimer("Bootstrap Destroy")
   216  	//
   217  	// Wait for the cluster to initialize.
   218  	//
   219  	err = waitForInstallComplete(ctx, config, command.RootOpts.Dir)
   220  	if err != nil {
   221  		if err2 := logClusterOperatorConditions(ctx, config); err2 != nil {
   222  			logrus.Error("Attempted to gather ClusterOperator status after installation failure: ", err2)
   223  		}
   224  		logTroubleshootingLink()
   225  		logrus.Error(err)
   226  		return exitCodeInstallFailed, nil
   227  	}
   228  	timer.StopTimer(timer.TotalTimeElapsed)
   229  	timer.LogSummary()
   230  	return 0, nil
   231  }
   233  // clusterCreateError defines a custom error type that would help identify where the error occurs
   234  // during the bootstrap phase of the installation process. This would help identify whether the error
   235  // comes either from the Kubernetes API failure, the bootstrap failure or a general kubernetes client
   236  // creation error. In the event of any error, this interface packages the error message and a custom
   237  // log message that must be neatly presented to the user before termination of the project.
   238  type clusterCreateError struct {
   239  	wrappedError error
   240  	logMessage   string
   241  }
   243  // Unwrap provides the actual stored error that occured during installation.
   244  func (ce *clusterCreateError) Unwrap() error {
   245  	return ce.wrappedError
   246  }
   248  // Error provides the actual stored error that occured during installation.
   249  func (ce *clusterCreateError) Error() string {
   250  	return ce.logMessage
   251  }
   253  // newAPIError creates a clusterCreateError object with a default error message specific to the API failure.
   254  func newAPIError(errorInfo error) *clusterCreateError {
   255  	return &clusterCreateError{
   256  		wrappedError: errorInfo,
   257  		logMessage: "Failed waiting for Kubernetes API. This error usually happens when there " +
   258  			"is a problem on the bootstrap host that prevents creating a temporary control plane.",
   259  	}
   260  }
   262  // newBootstrapError creates a clusterCreateError object with a default error message specific to the
   263  // bootstrap failure.
   264  func newBootstrapError(errorInfo error) *clusterCreateError {
   265  	return &clusterCreateError{
   266  		wrappedError: errorInfo,
   267  		logMessage: "Failed to wait for bootstrapping to complete. This error usually " +
   268  			"happens when there is a problem with control plane hosts that prevents " +
   269  			"the control plane operators from creating the control plane.",
   270  	}
   271  }
   273  // newClientError creates a clusterCreateError object with a default error message specific to the
   274  // kubernetes client creation failure.
   275  func newClientError(errorInfo error) *clusterCreateError {
   276  	return &clusterCreateError{
   277  		wrappedError: errorInfo,
   278  		logMessage:   "Failed to create a kubernetes client.",
   279  	}
   280  }
   282  func newCreateCmd(ctx context.Context) *cobra.Command {
   283  	cmd := &cobra.Command{
   284  		Use:   "create",
   285  		Short: "Create part of an OpenShift cluster",
   286  		RunE: func(cmd *cobra.Command, args []string) error {
   287  			return cmd.Help()
   288  		},
   289  	}
   291  	for _, t := range targets {
   292  		t.command.Args = cobra.ExactArgs(0)
   293  		t.command.Run = runTargetCmd(ctx, t.assets...)
   294  		if == "Cluster" {
   295  			t.command.PersistentFlags().BoolVar(&skipPasswordPrintFlag, "skip-password-print", false, "Do not print the generated user password.")
   296  		}
   297  		cmd.AddCommand(t.command)
   298  	}
   300  	return cmd
   301  }
   303  func runTargetCmd(ctx context.Context, targets ...asset.WritableAsset) func(cmd *cobra.Command, args []string) {
   304  	runner := func(directory string) error {
   305  		fetcher := assetstore.NewAssetsFetcher(directory)
   306  		return fetcher.FetchAndPersist(ctx, targets)
   307  	}
   309  	return func(cmd *cobra.Command, args []string) {
   310  		timer.StartTimer(timer.TotalTimeElapsed)
   312  		// Set the context to be used in the PostRun function.
   313  		cmd.SetContext(ctx)
   315  		cleanup := command.SetupFileHook(command.RootOpts.Dir)
   316  		defer cleanup()
   318  		cluster.InstallDir = command.RootOpts.Dir
   320  		err := runner(command.RootOpts.Dir)
   321  		if err != nil {
   322  			if strings.Contains(err.Error(), asset.InstallConfigError) {
   323  				logrus.Error(err)
   324  				logrus.Exit(exitCodeInstallConfigError)
   325  			}
   326  			if strings.Contains(err.Error(), asset.ClusterCreationError) {
   327  				logrus.Error(err)
   328  				logrus.Exit(exitCodeInfrastructureFailed)
   329  			}
   330  			logrus.Fatal(err)
   331  		}
   332  		switch cmd.Name() {
   333  		case "cluster", "image", "pxe-files":
   334  		default:
   335  			logrus.Infof(logging.LogCreatedFiles(cmd.Name(), command.RootOpts.Dir, targets))
   336  		}
   337  	}
   338  }
   340  // addRouterCAToClusterCA adds router CA to cluster CA in kubeconfig
   341  func addRouterCAToClusterCA(ctx context.Context, config *rest.Config, directory string) (err error) {
   342  	client, err := kubernetes.NewForConfig(config)
   343  	if err != nil {
   344  		return errors.Wrap(err, "creating a Kubernetes client")
   345  	}
   347  	// Configmap may not exist. log and accept not-found errors with configmap.
   348  	caConfigMap, err := client.CoreV1().ConfigMaps("openshift-config-managed").Get(ctx, "default-ingress-cert", metav1.GetOptions{})
   349  	if err != nil {
   350  		return errors.Wrap(err, "fetching default-ingress-cert configmap from openshift-config-managed namespace")
   351  	}
   353  	routerCrtBytes := []byte(caConfigMap.Data["ca-bundle.crt"])
   354  	kubeconfig := filepath.Join(directory, "auth", "kubeconfig")
   355  	kconfig, err := clientcmd.LoadFromFile(kubeconfig)
   356  	if err != nil {
   357  		return errors.Wrap(err, "loading kubeconfig")
   358  	}
   360  	if kconfig == nil || len(kconfig.Clusters) == 0 {
   361  		return errors.New("kubeconfig is missing expected data")
   362  	}
   364  	for _, c := range kconfig.Clusters {
   365  		clusterCABytes := c.CertificateAuthorityData
   366  		if len(clusterCABytes) == 0 {
   367  			return errors.New("kubeconfig CertificateAuthorityData not found")
   368  		}
   369  		certPool := x509.NewCertPool()
   370  		if !certPool.AppendCertsFromPEM(clusterCABytes) {
   371  			return errors.New("cluster CA found in kubeconfig not valid PEM format")
   372  		}
   373  		if !certPool.AppendCertsFromPEM(routerCrtBytes) {
   374  			return errors.New("ca-bundle.crt from default-ingress-cert configmap not valid PEM format")
   375  		}
   377  		newCA := append(routerCrtBytes, clusterCABytes...)
   378  		c.CertificateAuthorityData = newCA
   379  	}
   380  	if err := clientcmd.WriteToFile(*kconfig, kubeconfig); err != nil {
   381  		return errors.Wrap(err, "writing kubeconfig")
   382  	}
   383  	return nil
   384  }
// waitForBootstrapComplete blocks until bootstrapping has finished, in three
// stages:
//
//  1. Poll the API server's version endpoint every 2 seconds for up to 20
//     minutes until it responds.
//  2. Wait up to 45 minutes (60 minutes on baremetal and vSphere) for the
//     bootstrap configmap to report completion; on baremetal the control
//     plane provisioning is awaited first, within the same deadline.
//  3. Run waitForStableSNOBootstrap, which applies its own timeout and
//     returns immediately unless a single-node install is detected.
//
// It returns nil on success. Failures are returned as *clusterCreateError:
// a client error when the Kubernetes client cannot be created, an API error
// for stage 1, and a bootstrap error for stages 2 and 3.
func waitForBootstrapComplete(ctx context.Context, config *rest.Config) *clusterCreateError {
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		return newClientError(errors.Wrap(err, "creating a Kubernetes client"))
	}

	discovery := client.Discovery()

	apiTimeout := 20 * time.Minute

	untilTime := time.Now().Add(apiTimeout)
	timezone, _ := untilTime.Zone()
	logrus.Infof("Waiting up to %v (until %v %s) for the Kubernetes API at %s...",
		apiTimeout, untilTime.Format(time.Kitchen), timezone, config.Host)

	apiContext, cancel := context.WithTimeout(ctx, apiTimeout)
	defer cancel()
	// Poll quickly so we notice changes, but only log when the response
	// changes (because that's interesting) or when we've seen 15 of the
	// same errors in a row (to show we're still alive).
	logDownsample := 15
	silenceRemaining := logDownsample
	previousErrorSuffix := ""
	timer.StartTimer("API")

	// Best effort: if the asset store cannot be opened the agent check is skipped.
	if assetStore, err := assetstore.NewStore(command.RootOpts.Dir); err == nil {
		checkIfAgentCommand(assetStore)
	}

	// lastErr remembers the most recent API error so that a timeout can be
	// reported with the actual failure instead of the generic context error.
	var lastErr error
	err = wait.PollUntilContextCancel(apiContext, 2*time.Second, true, func(_ context.Context) (done bool, err error) {
		version, err := discovery.ServerVersion()
		if err == nil {
			logrus.Infof("API %s up", version)
			timer.StopTimer("API")
			return true, nil
		}

		lastErr = err
		silenceRemaining--
		// Compare only the text after the last ':' so that errors differing
		// only in their prefix count as "the same" for log downsampling.
		chunks := strings.Split(err.Error(), ":")
		errorSuffix := chunks[len(chunks)-1]
		if previousErrorSuffix != errorSuffix {
			logrus.Debugf("Still waiting for the Kubernetes API: %v", err)
			previousErrorSuffix = errorSuffix
			silenceRemaining = logDownsample
		} else if silenceRemaining == 0 {
			logrus.Debugf("Still waiting for the Kubernetes API: %v", err)
			silenceRemaining = logDownsample
		}

		// Errors are never returned from the condition, so polling continues
		// until the API responds or apiContext is done.
		return false, nil
	})
	if err != nil {
		if lastErr != nil {
			return newAPIError(lastErr)
		}
		return newAPIError(err)
	}

	// platformName stays empty when the asset store or install config cannot
	// be loaded, in which case the default timeout below applies.
	var platformName string

	if assetStore, err := assetstore.NewStore(command.RootOpts.Dir); err == nil {
		if installConfig, err := assetStore.Load(&installconfig.InstallConfig{}); err == nil && installConfig != nil {
			platformName = installConfig.(*installconfig.InstallConfig).Config.Platform.Name()
		}
	}

	timeout := 45 * time.Minute

	// Wait longer for baremetal, VSphere due to length of time it takes to boot
	if platformName == baremetal.Name || platformName == vsphere.Name {
		timeout = 60 * time.Minute
	}

	untilTime = time.Now().Add(timeout)
	timezone, _ = untilTime.Zone()
	logrus.Infof("Waiting up to %v (until %v %s) for bootstrapping to complete...",
		timeout, untilTime.Format(time.Kitchen), timezone)

	// cancel is reassigned here; the earlier defer already captured the first
	// cancel function, so both are called on return.
	waitCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	if platformName == baremetal.Name {
		if err := baremetalutils.WaitForBaremetalBootstrapControlPlane(waitCtx, config, command.RootOpts.Dir); err != nil {
			return newBootstrapError(err)
		}
		logrus.Infof("  Baremetal control plane finished provisioning.")
	}

	if err := waitForBootstrapConfigMap(waitCtx, client); err != nil {
		return err
	}

	// The parent ctx (not waitCtx) is passed: waitForStableSNOBootstrap sets
	// its own timeout.
	if err := waitForStableSNOBootstrap(ctx, config); err != nil {
		return newBootstrapError(err)
	}

	return nil
}
   487  // waitForBootstrapConfigMap watches the configmaps in the kube-system namespace
   488  // and waits for the bootstrap configmap to report that bootstrapping has
   489  // completed.
   490  func waitForBootstrapConfigMap(ctx context.Context, client *kubernetes.Clientset) *clusterCreateError {
   491  	_, err := clientwatch.UntilWithSync(
   492  		ctx,
   493  		cache.NewListWatchFromClient(client.CoreV1().RESTClient(), "configmaps", "kube-system", fields.OneTermEqualSelector("", "bootstrap")),
   494  		&corev1.ConfigMap{},
   495  		nil,
   496  		func(event watch.Event) (bool, error) {
   497  			switch event.Type {
   498  			case watch.Added, watch.Modified:
   499  			default:
   500  				return false, nil
   501  			}
   502  			cm, ok := event.Object.(*corev1.ConfigMap)
   503  			if !ok {
   504  				logrus.Warnf("Expected a core/v1.ConfigMap object but got a %q object instead", event.Object.GetObjectKind().GroupVersionKind())
   505  				return false, nil
   506  			}
   507  			status, ok := cm.Data["status"]
   508  			if !ok {
   509  				logrus.Debugf("No status found in bootstrap configmap")
   510  				return false, nil
   511  			}
   512  			logrus.Debugf("Bootstrap status: %v", status)
   513  			return status == "complete", nil
   514  		},
   515  	)
   516  	if err != nil {
   517  		return newBootstrapError(err)
   518  	}
   519  	return nil
   520  }
   522  // When bootstrap on SNO deployments, we should not remove the bootstrap node prematurely,
   523  // here we make sure that the deployment is stable.
   524  // Given the nature of single node we just need to make sure things such as etcd are in the proper state
   525  // before continuing.
   526  func waitForStableSNOBootstrap(ctx context.Context, config *rest.Config) error {
   527  	timeout := 5 * time.Minute
   529  	// If we're not in a single node deployment, bail early
   530  	if isSNO, err := IsSingleNode(); err != nil {
   531  		logrus.Warningf("Can not determine if installing a Single Node cluster, continuing as normal install: %v", err)
   532  		return nil
   533  	} else if !isSNO {
   534  		return nil
   535  	}
   537  	snoBootstrapContext, cancel := context.WithTimeout(ctx, timeout)
   538  	defer cancel()
   540  	untilTime := time.Now().Add(timeout)
   541  	timezone, _ := untilTime.Zone()
   542  	logrus.Info("Detected Single Node deployment")
   543  	logrus.Infof("Waiting up to %v (until %v %s) for the bootstrap etcd member to be removed...",
   544  		timeout, untilTime.Format(time.Kitchen), timezone)
   546  	client, err := dynamic.NewForConfig(config)
   547  	if err != nil {
   548  		return fmt.Errorf("error creating dynamic client: %w", err)
   549  	}
   550  	gvr := schema.GroupVersionResource{
   551  		Group:    operatorv1.SchemeGroupVersion.Group,
   552  		Version:  operatorv1.SchemeGroupVersion.Version,
   553  		Resource: "etcds",
   554  	}
   555  	resourceClient := client.Resource(gvr)
   556  	// Validate the etcd operator has removed the bootstrap etcd member
   557  	return wait.PollUntilContextCancel(snoBootstrapContext, 1*time.Second, true, func(ctx context.Context) (done bool, err error) {
   558  		etcdOperator := &operatorv1.Etcd{}
   559  		etcdUnstructured, err := resourceClient.Get(ctx, "cluster", metav1.GetOptions{})
   560  		if err != nil {
   561  			// There might be service disruptions in SNO, we log those here but keep trying with in the time limit
   562  			logrus.Debugf("Error getting ETCD Cluster resource, retrying: %v", err)
   563  			return false, nil
   564  		}
   565  		err = runtime.DefaultUnstructuredConverter.FromUnstructured(etcdUnstructured.Object, etcdOperator)
   566  		if err != nil {
   567  			// This error should not happen, if we do, we log the error and keep retrying until we hit the limit
   568  			logrus.Debugf("Error parsing etcds resource, retrying: %v", err)
   569  			return false, nil
   570  		}
   571  		for _, condition := range etcdOperator.Status.Conditions {
   572  			if condition.Type == "EtcdBootstrapMemberRemoved" {
   573  				return configv1.ConditionStatus(condition.Status) == configv1.ConditionTrue, nil
   574  			}
   575  		}
   576  		return false, nil
   577  	})
   578  }
   580  // waitForInitializedCluster watches the ClusterVersion waiting for confirmation
   581  // that the cluster has been initialized.
   582  func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
   583  	// TODO revert this value back to 30 minutes.  It's currently at the end of 4.6 and we're trying to see if the
   584  	timeout := 40 * time.Minute
   586  	// Wait longer for baremetal, due to length of time it takes to boot
   587  	if assetStore, err := assetstore.NewStore(command.RootOpts.Dir); err == nil {
   588  		if installConfig, err := assetStore.Load(&installconfig.InstallConfig{}); err == nil && installConfig != nil {
   589  			if installConfig.(*installconfig.InstallConfig).Config.Platform.Name() == baremetal.Name {
   590  				timeout = 60 * time.Minute
   591  			}
   592  		}
   594  		checkIfAgentCommand(assetStore)
   595  	}
   597  	untilTime := time.Now().Add(timeout)
   598  	timezone, _ := untilTime.Zone()
   599  	logrus.Infof("Waiting up to %v (until %v %s) for the cluster at %s to initialize...",
   600  		timeout, untilTime.Format(time.Kitchen), timezone, config.Host)
   601  	cc, err := configclient.NewForConfig(config)
   602  	if err != nil {
   603  		return errors.Wrap(err, "failed to create a config client")
   604  	}
   605  	clusterVersionContext, cancel := context.WithTimeout(ctx, timeout)
   606  	defer cancel()
   608  	failing := configv1.ClusterStatusConditionType("Failing")
   609  	timer.StartTimer("Cluster Operators Available")
   610  	var lastError string
   611  	_, err = clientwatch.UntilWithSync(
   612  		clusterVersionContext,
   613  		cache.NewListWatchFromClient(cc.ConfigV1().RESTClient(), "clusterversions", "", fields.OneTermEqualSelector("", "version")),
   614  		&configv1.ClusterVersion{},
   615  		nil,
   616  		func(event watch.Event) (bool, error) {
   617  			switch event.Type {
   618  			case watch.Added, watch.Modified:
   619  				cv, ok := event.Object.(*configv1.ClusterVersion)
   620  				if !ok {
   621  					logrus.Warnf("Expected a ClusterVersion object but got a %q object instead", event.Object.GetObjectKind().GroupVersionKind())
   622  					return false, nil
   623  				}
   624  				if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, configv1.OperatorAvailable) &&
   625  					cov1helpers.IsStatusConditionFalse(cv.Status.Conditions, failing) &&
   626  					cov1helpers.IsStatusConditionFalse(cv.Status.Conditions, configv1.OperatorProgressing) {
   627  					timer.StopTimer("Cluster Operators Available")
   628  					return true, nil
   629  				}
   630  				if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, failing) {
   631  					lastError = cov1helpers.FindStatusCondition(cv.Status.Conditions, failing).Message
   632  				} else if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, configv1.OperatorProgressing) {
   633  					lastError = cov1helpers.FindStatusCondition(cv.Status.Conditions, configv1.OperatorProgressing).Message
   634  				}
   635  				logrus.Debugf("Still waiting for the cluster to initialize: %s", lastError)
   636  				return false, nil
   637  			}
   638  			logrus.Debug("Still waiting for the cluster to initialize...")
   639  			return false, nil
   640  		},
   641  	)
   643  	if err == nil {
   644  		logrus.Debug("Cluster is initialized")
   645  		return nil
   646  	}
   648  	if lastError != "" {
   649  		if err == wait.ErrWaitTimeout {
   650  			return errors.Errorf("failed to initialize the cluster: %s", lastError)
   651  		}
   653  		return errors.Wrapf(err, "failed to initialize the cluster: %s", lastError)
   654  	}
   656  	return errors.Wrap(err, "failed to initialize the cluster")
   657  }
   659  // waitForStableOperators ensures that each cluster operator is "stable", i.e. the
   660  // operator has not been in a progressing state for at least a certain duration,
   661  // 30 seconds by default. Returns an error if any operator does meet this threshold
   662  // after a deadline, 30 minutes by default.
   663  func waitForStableOperators(ctx context.Context, config *rest.Config) error {
   664  	timer.StartTimer("Cluster Operators Stable")
   666  	stabilityCheckDuration := 30 * time.Minute
   667  	stabilityContext, cancel := context.WithTimeout(ctx, stabilityCheckDuration)
   668  	defer cancel()
   670  	untilTime := time.Now().Add(stabilityCheckDuration)
   671  	timezone, _ := untilTime.Zone()
   672  	logrus.Infof("Waiting up to %v (until %v %s) to ensure each cluster operator has finished progressing...",
   673  		stabilityCheckDuration, untilTime.Format(time.Kitchen), timezone)
   675  	cc, err := configclient.NewForConfig(config)
   676  	if err != nil {
   677  		return errors.Wrap(err, "failed to create a config client")
   678  	}
   679  	configInformers := configinformers.NewSharedInformerFactory(cc, 0)
   680  	clusterOperatorInformer := configInformers.Config().V1().ClusterOperators().Informer()
   681  	clusterOperatorLister := configInformers.Config().V1().ClusterOperators().Lister()
   682  	configInformers.Start(ctx.Done())
   683  	if !cache.WaitForCacheSync(ctx.Done(), clusterOperatorInformer.HasSynced) {
   684  		return fmt.Errorf("informers never started")
   685  	}
   687  	waitErr := wait.PollUntilContextCancel(stabilityContext, 1*time.Second, true, waitForAllClusterOperators(clusterOperatorLister))
   688  	if waitErr != nil {
   689  		logrus.Errorf("Error checking cluster operator Progressing status: %q", waitErr)
   690  		stableOperators, unstableOperators, err := currentOperatorStability(clusterOperatorLister)
   691  		if err != nil {
   692  			logrus.Errorf("Error checking final cluster operator Progressing status: %q", err)
   693  		}
   694  		logrus.Debugf("These cluster operators were stable: [%s]", strings.Join(sets.List(stableOperators), ", "))
   695  		logrus.Errorf("These cluster operators were not stable: [%s]", strings.Join(sets.List(unstableOperators), ", "))
   697  		logrus.Exit(exitCodeOperatorStabilityFailed)
   698  	}
   700  	timer.StopTimer("Cluster Operators Stable")
   702  	logrus.Info("All cluster operators have completed progressing")
   704  	return nil
   705  }
   707  // getConsole returns the console URL from the route 'console' in namespace openshift-console
   708  func getConsole(ctx context.Context, config *rest.Config) (string, error) {
   709  	url := ""
   710  	// Need to keep these updated if they change
   711  	consoleNamespace := "openshift-console"
   712  	consoleRouteName := "console"
   713  	rc, err := routeclient.NewForConfig(config)
   714  	if err != nil {
   715  		return "", errors.Wrap(err, "creating a route client")
   716  	}
   718  	consoleRouteTimeout := 2 * time.Minute
   719  	logrus.Infof("Checking to see if there is a route at %s/%s...", consoleNamespace, consoleRouteName)
   720  	consoleRouteContext, cancel := context.WithTimeout(ctx, consoleRouteTimeout)
   721  	defer cancel()
   722  	// Poll quickly but only log when the response
   723  	// when we've seen 15 of the same errors or output of
   724  	// no route in a row (to show we're still alive).
   725  	logDownsample := 15
   726  	silenceRemaining := logDownsample
   727  	timer.StartTimer("Console")
   728  	wait.Until(func() {
   729  		route, err := rc.RouteV1().Routes(consoleNamespace).Get(ctx, consoleRouteName, metav1.GetOptions{})
   730  		if err == nil {
   731  			logrus.Debugf("Route found in openshift-console namespace: %s", consoleRouteName)
   732  			if uri, _, err2 := routeapihelpers.IngressURI(route, ""); err2 == nil {
   733  				url = uri.String()
   734  				logrus.Debug("OpenShift console route is admitted")
   735  				cancel()
   736  			} else {
   737  				err = err2
   738  			}
   739  		} else if apierrors.IsNotFound(err) {
   740  			logrus.Debug("OpenShift console route does not exist")
   741  			cancel()
   742  		}
   744  		if err != nil {
   745  			silenceRemaining--
   746  			if silenceRemaining == 0 {
   747  				logrus.Debugf("Still waiting for the console route: %v", err)
   748  				silenceRemaining = logDownsample
   749  			}
   750  		}
   751  	}, 2*time.Second, consoleRouteContext.Done())
   752  	err = consoleRouteContext.Err()
   753  	if err != nil && err != context.Canceled {
   754  		return url, errors.Wrap(err, "waiting for openshift-console URL")
   755  	}
   756  	if url == "" {
   757  		return url, errors.New("could not get openshift-console URL")
   758  	}
   759  	timer.StopTimer("Console")
   760  	return url, nil
   761  }
   763  // logComplete prints info upon completion
   764  func logComplete(directory, consoleURL string) error {
   765  	absDir, err := filepath.Abs(directory)
   766  	if err != nil {
   767  		return err
   768  	}
   769  	kubeconfig := filepath.Join(absDir, "auth", "kubeconfig")
   770  	pwFile := filepath.Join(absDir, "auth", "kubeadmin-password")
   771  	pw, err := os.ReadFile(pwFile)
   772  	if err != nil {
   773  		return err
   774  	}
   775  	logrus.Info("Install complete!")
   776  	logrus.Infof("To access the cluster as the system:admin user when using 'oc', run 'export KUBECONFIG=%s'", kubeconfig)
   777  	if consoleURL != "" {
   778  		logrus.Infof("Access the OpenShift web-console here: %s", consoleURL)
   779  		if skipPasswordPrintFlag {
   780  			logrus.Infof("Credentials omitted, if necessary verify the %s file", pwFile)
   781  		} else {
   782  			logrus.Infof("Login to the console with user: %q, and password: %q", "kubeadmin", pw)
   783  		}
   784  	}
   785  	return nil
   786  }
   788  func waitForInstallComplete(ctx context.Context, config *rest.Config, directory string) error {
   789  	if err := waitForInitializedCluster(ctx, config); err != nil {
   790  		return err
   791  	}
   793  	if err := addRouterCAToClusterCA(ctx, config, command.RootOpts.Dir); err != nil {
   794  		return err
   795  	}
   797  	if err := waitForStableOperators(ctx, config); err != nil {
   798  		return err
   799  	}
   801  	consoleURL, err := getConsole(ctx, config)
   802  	if err != nil {
   803  		logrus.Warnf("Cluster does not have a console available: %v", err)
   804  	}
   806  	return logComplete(command.RootOpts.Dir, consoleURL)
   807  }
   809  func logTroubleshootingLink() {
   810  	logrus.Error(`Cluster initialization failed because one or more operators are not functioning properly.
   811  The cluster should be accessible for troubleshooting as detailed in the documentation linked below,
   813  The 'wait-for install-complete' subcommand can then be used to continue the installation`)
   814  }
   816  func checkIfAgentCommand(assetStore asset.Store) {
   817  	if agentConfig, err := assetStore.Load(&agentconfig.AgentConfig{}); err == nil && agentConfig != nil {
   818  		logrus.Warning("An agent configuration was detected but this command is not the agent wait-for command")
   819  	}
   820  }
   822  func waitForAllClusterOperators(clusterOperatorLister configlisters.ClusterOperatorLister) func(ctx context.Context) (bool, error) {
   823  	previouslyStableOperators := sets.Set[string]{}
   825  	return func(ctx context.Context) (bool, error) {
   826  		stableOperators, unstableOperators, err := currentOperatorStability(clusterOperatorLister)
   827  		if err != nil {
   828  			return false, err
   829  		}
   830  		if newlyStableOperators := stableOperators.Difference(previouslyStableOperators); len(newlyStableOperators) > 0 {
   831  			for _, name := range sets.List(newlyStableOperators) {
   832  				logrus.Debugf("Cluster Operator %s is stable", name)
   833  			}
   834  		}
   835  		if newlyUnstableOperators := previouslyStableOperators.Difference(stableOperators); len(newlyUnstableOperators) > 0 {
   836  			for _, name := range sets.List(newlyUnstableOperators) {
   837  				logrus.Debugf("Cluster Operator %s became unstable", name)
   838  			}
   839  		}
   840  		previouslyStableOperators = stableOperators
   842  		if len(unstableOperators) == 0 {
   843  			return true, nil
   844  		}
   846  		return false, nil
   847  	}
   848  }
   850  func currentOperatorStability(clusterOperatorLister configlisters.ClusterOperatorLister) (sets.Set[string], sets.Set[string], error) {
   851  	clusterOperators, err := clusterOperatorLister.List(labels.Everything())
   852  	if err != nil {
   853  		return nil, nil, err // lister should never fail
   854  	}
   856  	stableOperators := sets.Set[string]{}
   857  	unstableOperators := sets.Set[string]{}
   858  	for _, clusterOperator := range clusterOperators {
   859  		name := clusterOperator.Name
   860  		progressing := cov1helpers.FindStatusCondition(clusterOperator.Status.Conditions, configv1.OperatorProgressing)
   861  		if progressing == nil {
   862  			logrus.Debugf("Cluster Operator %s progressing == nil", name)
   863  			unstableOperators.Insert(name)
   864  			continue
   865  		}
   866  		if meetsStabilityThreshold(progressing) {
   867  			stableOperators.Insert(name)
   868  		} else {
   869  			logrus.Debugf("Cluster Operator %s is Progressing=%s LastTransitionTime=%v DurationSinceTransition=%.fs Reason=%s Message=%s", name, progressing.Status, progressing.LastTransitionTime.Time, time.Since(progressing.LastTransitionTime.Time).Seconds(), progressing.Reason, progressing.Message)
   870  			unstableOperators.Insert(name)
   871  		}
   872  	}
   874  	return stableOperators, unstableOperators, nil
   875  }
   877  func meetsStabilityThreshold(progressing *configv1.ClusterOperatorStatusCondition) bool {
   878  	return progressing.Status == configv1.ConditionFalse && time.Since(progressing.LastTransitionTime.Time).Seconds() > coStabilityThreshold
   879  }
   881  func handleUnreachableAPIServer(ctx context.Context, config *rest.Config) error {
   882  	assetStore, err := assetstore.NewStore(command.RootOpts.Dir)
   883  	if err != nil {
   884  		return fmt.Errorf("failed to create asset store: %w", err)
   885  	}
   887  	// Ensure that the install is expecting the user to provision their own DNS solution.
   888  	installConfig := &installconfig.InstallConfig{}
   889  	if err := assetStore.Fetch(ctx, installConfig); err != nil {
   890  		return fmt.Errorf("failed to fetch %s: %w", installConfig.Name(), err)
   891  	}
   892  	switch installConfig.Config.Platform.Name() { //nolint:gocritic
   893  	case gcp.Name:
   894  		if installConfig.Config.GCP.UserProvisionedDNS != gcp.UserProvisionedDNSEnabled {
   895  			return nil
   896  		}
   897  	default:
   898  		return nil
   899  	}
   901  	lbConfig := &lbconfig.Config{}
   902  	if err := assetStore.Fetch(ctx, lbConfig); err != nil {
   903  		return fmt.Errorf("failed to fetch %s: %w", lbConfig.Name(), err)
   904  	}
   906  	_, ipAddrs, err := lbConfig.ParseDNSDataFromConfig(lbconfig.PublicLoadBalancer)
   907  	if err != nil {
   908  		return fmt.Errorf("failed to parse lbconfig: %w", err)
   909  	}
   911  	// The kubeconfig handles one ip address
   912  	ipAddr := ""
   913  	if len(ipAddrs) > 0 {
   914  		ipAddr = ipAddrs[0].String()
   915  	}
   916  	if ipAddr == "" {
   917  		return fmt.Errorf("no ip address found in lbconfig")
   918  	}
   920  	dialer := &net.Dialer{
   921  		Timeout:   1 * time.Minute,
   922  		KeepAlive: 1 * time.Minute,
   923  	}
   924  	config.Dial = kubeconfig.CreateDialContext(dialer, ipAddr)
   926  	// The asset is currently saved in <install-dir>/openshift. This directory
   927  	// was consumed during install but this file is generated after that action. This
   928  	// artifact will hang around unless it is purged here.
   929  	if err := asset.DeleteAssetFromDisk(lbConfig, command.RootOpts.Dir); err != nil {
   930  		return fmt.Errorf("failed to delete %s from disk", lbConfig.Name())
   931  	}
   933  	return nil
   934  }
   936  // IsSingleNode determines if we are in a single node configuration based off of the install config
   937  // loaded from the asset store.
   938  func IsSingleNode() (bool, error) {
   939  	assetStore, err := assetstore.NewStore(command.RootOpts.Dir)
   940  	if err != nil {
   941  		return false, fmt.Errorf("error loading asset store: %w", err)
   942  	}
   943  	installConfig, err := assetStore.Load(&installconfig.InstallConfig{})
   944  	if err != nil {
   945  		return false, fmt.Errorf("error loading installConfig: %w", err)
   946  	}
   947  	if installConfig == nil {
   948  		return false, fmt.Errorf("installConfig loaded from asset store was nil")
   949  	}
   951  	config := installConfig.(*installconfig.InstallConfig).Config
   952  	if machinePool := config.ControlPlane; machinePool != nil {
   953  		return *machinePool.Replicas == int64(1), nil
   954  	}
   955  	return false, nil
   956  }