github.com/openshift/installer@v1.4.17/cmd/openshift-install/create.go

package main

import (
	"context"
	"crypto/x509"
	"fmt"
	"net"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"github.com/spf13/cobra"
	corev1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/clientcmd"
	clientwatch "k8s.io/client-go/tools/watch"

	configv1 "github.com/openshift/api/config/v1"
	operatorv1 "github.com/openshift/api/operator/v1"
	configclient "github.com/openshift/client-go/config/clientset/versioned"
	configinformers "github.com/openshift/client-go/config/informers/externalversions"
	configlisters "github.com/openshift/client-go/config/listers/config/v1"
	routeclient "github.com/openshift/client-go/route/clientset/versioned"
	"github.com/openshift/installer/cmd/openshift-install/command"
	"github.com/openshift/installer/pkg/asset"
	"github.com/openshift/installer/pkg/asset/agent/agentconfig"
	"github.com/openshift/installer/pkg/asset/cluster"
	"github.com/openshift/installer/pkg/asset/installconfig"
	"github.com/openshift/installer/pkg/asset/kubeconfig"
	"github.com/openshift/installer/pkg/asset/lbconfig"
	"github.com/openshift/installer/pkg/asset/logging"
	assetstore "github.com/openshift/installer/pkg/asset/store"
	targetassets "github.com/openshift/installer/pkg/asset/targets"
	destroybootstrap "github.com/openshift/installer/pkg/destroy/bootstrap"
	"github.com/openshift/installer/pkg/gather/service"
	timer "github.com/openshift/installer/pkg/metrics/timer"
	"github.com/openshift/installer/pkg/types/baremetal"
	"github.com/openshift/installer/pkg/types/gcp"
	"github.com/openshift/installer/pkg/types/vsphere"
	baremetalutils "github.com/openshift/installer/pkg/utils/baremetal"
	cov1helpers "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers"
	"github.com/openshift/library-go/pkg/route/routeapihelpers"
)

type target struct {
	name    string
	command *cobra.Command
	assets  []asset.WritableAsset
}
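
// Target-specific exit codes start at 3, leaving the lower codes for generic
// failures (logrus.Fatal, for example, exits with status 1).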
const (
	exitCodeInstallConfigError = iota + 3
	exitCodeInfrastructureFailed
	exitCodeBootstrapFailed
	exitCodeInstallFailed
	exitCodeOperatorStabilityFailed
	exitCodeInterrupt

	// coStabilityThreshold is how long a cluster operator must have Progressing=False
	// in order to be considered stable. Measured in seconds.
	coStabilityThreshold float64 = 30
)

var skipPasswordPrintFlag bool

// Each target is a variable to preserve the order when creating subcommands and still
// allow other functions to directly access each target individually.
var (
	installConfigTarget = target{
		name: "Install Config",
		command: &cobra.Command{
			Use:   "install-config",
			Short: "Generates the Install Config asset",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.InstallConfig,
	}

	manifestsTarget = target{
		name: "Manifests",
		command: &cobra.Command{
			Use:   "manifests",
			Short: "Generates the Kubernetes manifests",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.Manifests,
	}

	ignitionConfigsTarget = target{
		name: "Ignition Configs",
		command: &cobra.Command{
			Use:   "ignition-configs",
			Short: "Generates the Ignition Config asset",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.IgnitionConfigs,
	}

	singleNodeIgnitionConfigTarget = target{
		name: "Single Node Ignition Config",
		command: &cobra.Command{
			Use:   "single-node-ignition-config",
			Short: "Generates the bootstrap-in-place-for-live-iso Ignition Config asset",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
		},
		assets: targetassets.SingleNodeIgnitionConfig,
	}

	clusterTarget = target{
		name: "Cluster",
		command: &cobra.Command{
			Use:   "cluster",
			Short: "Create an OpenShift cluster",
			// FIXME: add longer descriptions for our commands with examples for better UX.
			// Long:  "",
			PostRun: func(cmd *cobra.Command, _ []string) {
				// Get the context that was set in newCreateCmd.
				ctx := cmd.Context()

				exitCode, err := clusterCreatePostRun(ctx)
				if err != nil {
					logrus.Fatal(err)
				}
				if exitCode != 0 {
					logrus.Exit(exitCode)
				}
			},
		},
		assets: targetassets.Cluster,
	}

	targets = []target{installConfigTarget, manifestsTarget, ignitionConfigsTarget, clusterTarget, singleNodeIgnitionConfigTarget}
)

// clusterCreatePostRun is the main entrypoint for the cluster create command.
// It was moved out of the clusterTarget.command.PostRun function so that cleanup operations
// always run in a defer statement, given that the function has multiple exit points,
// such as logrus.Fatal or logrus.Exit.
//
// Currently this function returns an exit code and an error; we should refactor it to return
// only an error, which can be wrapped if we want a custom exit code.
func clusterCreatePostRun(ctx context.Context) (int, error) {
	cleanup := command.SetupFileHook(command.RootOpts.Dir)
	defer cleanup()

	// FIXME: pulling the kubeconfig and metadata out of the root
	// directory is a bit kludgy when we already have them in memory.
	config, err := clientcmd.BuildConfigFromFlags("", filepath.Join(command.RootOpts.Dir, "auth", "kubeconfig"))
	if err != nil {
		return 0, errors.Wrap(err, "loading kubeconfig")
	}

	// Handle the case when the API server is not reachable.
	if err := handleUnreachableAPIServer(ctx, config); err != nil {
		logrus.Fatal(fmt.Errorf("unable to handle api server override: %w", err))
	}

	//
	// Wait for the bootstrap to complete.
	//
	timer.StartTimer("Bootstrap Complete")
	if err := waitForBootstrapComplete(ctx, config); err != nil {
		bundlePath, gatherErr := runGatherBootstrapCmd(ctx, command.RootOpts.Dir)
		if gatherErr != nil {
			logrus.Error("Attempted to gather debug logs after installation failure: ", gatherErr)
		}
		if err := logClusterOperatorConditions(ctx, config); err != nil {
			logrus.Error("Attempted to gather ClusterOperator status after installation failure: ", err)
		}
		logrus.Error("Bootstrap failed to complete: ", err.Unwrap())
		logrus.Error(err.Error())
		if gatherErr == nil {
			if err := service.AnalyzeGatherBundle(bundlePath); err != nil {
				logrus.Error("Attempted to analyze the debug logs after installation failure: ", err)
			}
			logrus.Infof("Bootstrap gather logs captured here %q", bundlePath)
		}
		return exitCodeBootstrapFailed, nil
	}
	timer.StopTimer("Bootstrap Complete")

	//
	// Wait for the bootstrap to be destroyed.
	//
	timer.StartTimer("Bootstrap Destroy")
	if oi, ok := os.LookupEnv("OPENSHIFT_INSTALL_PRESERVE_BOOTSTRAP"); ok && oi != "" {
		logrus.Warn("OPENSHIFT_INSTALL_PRESERVE_BOOTSTRAP is set, not destroying bootstrap resources. " +
			"Warning: this should only be used for debugging purposes, and poses a risk to cluster stability.")
	} else {
		logrus.Info("Destroying the bootstrap resources...")
		err = destroybootstrap.Destroy(ctx, command.RootOpts.Dir)
		if err != nil {
			return 0, err
		}
	}
	timer.StopTimer("Bootstrap Destroy")

	//
	// Wait for the cluster to initialize.
	//
	err = waitForInstallComplete(ctx, config, command.RootOpts.Dir)
	if err != nil {
		if err2 := logClusterOperatorConditions(ctx, config); err2 != nil {
			logrus.Error("Attempted to gather ClusterOperator status after installation failure: ", err2)
		}
		logTroubleshootingLink()
		logrus.Error(err)
		return exitCodeInstallFailed, nil
	}
	timer.StopTimer(timer.TotalTimeElapsed)
	timer.LogSummary()
	return 0, nil
}

// clusterCreateError defines a custom error type that helps identify where an error occurs
// during the bootstrap phase of the installation process. It distinguishes whether the error
// comes from a Kubernetes API failure, a bootstrap failure, or a general Kubernetes client
// creation error. In the event of any error, this type packages the wrapped error and a custom
// log message that is presented to the user before the process terminates.
type clusterCreateError struct {
	wrappedError error
	logMessage   string
}

// Unwrap provides the actual stored error that occurred during installation.
func (ce *clusterCreateError) Unwrap() error {
	return ce.wrappedError
}

// Error returns the custom log message describing the installation failure.
func (ce *clusterCreateError) Error() string {
	return ce.logMessage
}

// newAPIError creates a clusterCreateError object with a default error message specific to the API failure.
func newAPIError(errorInfo error) *clusterCreateError {
	return &clusterCreateError{
		wrappedError: errorInfo,
		logMessage: "Failed waiting for Kubernetes API. This error usually happens when there " +
			"is a problem on the bootstrap host that prevents creating a temporary control plane.",
	}
}

// newBootstrapError creates a clusterCreateError object with a default error message specific to the
// bootstrap failure.
func newBootstrapError(errorInfo error) *clusterCreateError {
	return &clusterCreateError{
		wrappedError: errorInfo,
		logMessage: "Failed to wait for bootstrapping to complete. This error usually " +
			"happens when there is a problem with control plane hosts that prevents " +
			"the control plane operators from creating the control plane.",
	}
}

// newClientError creates a clusterCreateError object with a default error message specific to the
// kubernetes client creation failure.
func newClientError(errorInfo error) *clusterCreateError {
	return &clusterCreateError{
		wrappedError: errorInfo,
		logMessage:   "Failed to create a kubernetes client.",
	}
}
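
// newCreateCmd returns the parent "create" command with every target registered as a
// subcommand. Each subcommand runs the shared asset-fetching logic from runTargetCmd; the
// cluster subcommand additionally exposes the --skip-password-print flag.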
func newCreateCmd(ctx context.Context) *cobra.Command {
	cmd := &cobra.Command{
		Use:   "create",
		Short: "Create part of an OpenShift cluster",
		RunE: func(cmd *cobra.Command, args []string) error {
			return cmd.Help()
		},
	}

	for _, t := range targets {
		t.command.Args = cobra.ExactArgs(0)
		t.command.Run = runTargetCmd(ctx, t.assets...)
		if t.name == "Cluster" {
			t.command.PersistentFlags().BoolVar(&skipPasswordPrintFlag, "skip-password-print", false, "Do not print the generated user password.")
		}
		cmd.AddCommand(t.command)
	}

	return cmd
}
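
// runTargetCmd returns the cobra Run function shared by the create subcommands. It fetches
// and persists the requested assets, maps install-config and cluster-creation errors to their
// dedicated exit codes, and logs the generated files for file-emitting targets.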
func runTargetCmd(ctx context.Context, targets ...asset.WritableAsset) func(cmd *cobra.Command, args []string) {
	runner := func(directory string) error {
		fetcher := assetstore.NewAssetsFetcher(directory)
		return fetcher.FetchAndPersist(ctx, targets)
	}

	return func(cmd *cobra.Command, args []string) {
		timer.StartTimer(timer.TotalTimeElapsed)

		// Set the context to be used in the PostRun function.
		cmd.SetContext(ctx)

		cleanup := command.SetupFileHook(command.RootOpts.Dir)
		defer cleanup()

		cluster.InstallDir = command.RootOpts.Dir

		err := runner(command.RootOpts.Dir)
		if err != nil {
			if strings.Contains(err.Error(), asset.InstallConfigError) {
				logrus.Error(err)
				logrus.Exit(exitCodeInstallConfigError)
			}
			if strings.Contains(err.Error(), asset.ClusterCreationError) {
				logrus.Error(err)
				logrus.Exit(exitCodeInfrastructureFailed)
			}
			logrus.Fatal(err)
		}
		switch cmd.Name() {
		case "cluster", "image", "pxe-files":
		default:
			logrus.Infof(logging.LogCreatedFiles(cmd.Name(), command.RootOpts.Dir, targets))
		}
	}
}

// addRouterCAToClusterCA adds router CA to cluster CA in kubeconfig.
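// The CA bundle from the default-ingress-cert configmap in openshift-config-managed is
// appended to the certificate-authority-data of every cluster entry in the generated
// kubeconfig, so clients using this kubeconfig also trust certificates signed by the
// cluster's ingress CA.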
func addRouterCAToClusterCA(ctx context.Context, config *rest.Config, directory string) (err error) {
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		return errors.Wrap(err, "creating a Kubernetes client")
	}

	// Fetch the CA bundle published for the default ingress controller.
	caConfigMap, err := client.CoreV1().ConfigMaps("openshift-config-managed").Get(ctx, "default-ingress-cert", metav1.GetOptions{})
	if err != nil {
		return errors.Wrap(err, "fetching default-ingress-cert configmap from openshift-config-managed namespace")
	}

	routerCrtBytes := []byte(caConfigMap.Data["ca-bundle.crt"])
	kubeconfig := filepath.Join(directory, "auth", "kubeconfig")
	kconfig, err := clientcmd.LoadFromFile(kubeconfig)
	if err != nil {
		return errors.Wrap(err, "loading kubeconfig")
	}

	if kconfig == nil || len(kconfig.Clusters) == 0 {
		return errors.New("kubeconfig is missing expected data")
	}

	for _, c := range kconfig.Clusters {
		clusterCABytes := c.CertificateAuthorityData
		if len(clusterCABytes) == 0 {
			return errors.New("kubeconfig CertificateAuthorityData not found")
		}
		certPool := x509.NewCertPool()
		if !certPool.AppendCertsFromPEM(clusterCABytes) {
			return errors.New("cluster CA found in kubeconfig not valid PEM format")
		}
		if !certPool.AppendCertsFromPEM(routerCrtBytes) {
			return errors.New("ca-bundle.crt from default-ingress-cert configmap not valid PEM format")
		}

		newCA := append(routerCrtBytes, clusterCABytes...)
		c.CertificateAuthorityData = newCA
	}
	if err := clientcmd.WriteToFile(*kconfig, kubeconfig); err != nil {
		return errors.Wrap(err, "writing kubeconfig")
	}
	return nil
}
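
// waitForBootstrapComplete waits up to 20 minutes for the Kubernetes API to become reachable,
// then waits up to 45 minutes (60 minutes on baremetal and vSphere) for the bootstrap
// configmap in kube-system to report completion. On baremetal it additionally waits for the
// bootstrap control plane to finish provisioning, and on single-node deployments it waits for
// the bootstrap etcd member to be removed.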
func waitForBootstrapComplete(ctx context.Context, config *rest.Config) *clusterCreateError {
	client, err := kubernetes.NewForConfig(config)
	if err != nil {
		return newClientError(errors.Wrap(err, "creating a Kubernetes client"))
	}

	discovery := client.Discovery()

	apiTimeout := 20 * time.Minute

	untilTime := time.Now().Add(apiTimeout)
	timezone, _ := untilTime.Zone()
	logrus.Infof("Waiting up to %v (until %v %s) for the Kubernetes API at %s...",
		apiTimeout, untilTime.Format(time.Kitchen), timezone, config.Host)

	apiContext, cancel := context.WithTimeout(ctx, apiTimeout)
	defer cancel()
	// Poll quickly so we notice changes, but only log when the response
	// changes (because that's interesting) or when we've seen 15 of the
	// same errors in a row (to show we're still alive).
	logDownsample := 15
	silenceRemaining := logDownsample
	previousErrorSuffix := ""
	timer.StartTimer("API")

	if assetStore, err := assetstore.NewStore(command.RootOpts.Dir); err == nil {
		checkIfAgentCommand(assetStore)
	}

	var lastErr error
	err = wait.PollUntilContextCancel(apiContext, 2*time.Second, true, func(_ context.Context) (done bool, err error) {
		version, err := discovery.ServerVersion()
		if err == nil {
			logrus.Infof("API %s up", version)
			timer.StopTimer("API")
			return true, nil
		}

		lastErr = err
		silenceRemaining--
		chunks := strings.Split(err.Error(), ":")
		errorSuffix := chunks[len(chunks)-1]
		if previousErrorSuffix != errorSuffix {
			logrus.Debugf("Still waiting for the Kubernetes API: %v", err)
			previousErrorSuffix = errorSuffix
			silenceRemaining = logDownsample
		} else if silenceRemaining == 0 {
			logrus.Debugf("Still waiting for the Kubernetes API: %v", err)
			silenceRemaining = logDownsample
		}

		return false, nil
	})
	if err != nil {
		if lastErr != nil {
			return newAPIError(lastErr)
		}
		return newAPIError(err)
	}

	var platformName string

	if assetStore, err := assetstore.NewStore(command.RootOpts.Dir); err == nil {
		if installConfig, err := assetStore.Load(&installconfig.InstallConfig{}); err == nil && installConfig != nil {
			platformName = installConfig.(*installconfig.InstallConfig).Config.Platform.Name()
		}
	}

	timeout := 45 * time.Minute

	// Wait longer for baremetal and vSphere, due to the length of time it takes to boot.
	if platformName == baremetal.Name || platformName == vsphere.Name {
		timeout = 60 * time.Minute
	}

	untilTime = time.Now().Add(timeout)
	timezone, _ = untilTime.Zone()
	logrus.Infof("Waiting up to %v (until %v %s) for bootstrapping to complete...",
		timeout, untilTime.Format(time.Kitchen), timezone)

	waitCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	if platformName == baremetal.Name {
		if err := baremetalutils.WaitForBaremetalBootstrapControlPlane(waitCtx, config, command.RootOpts.Dir); err != nil {
			return newBootstrapError(err)
		}
		logrus.Infof(" Baremetal control plane finished provisioning.")
	}

	if err := waitForBootstrapConfigMap(waitCtx, client); err != nil {
		return err
	}

	if err := waitForStableSNOBootstrap(ctx, config); err != nil {
		return newBootstrapError(err)
	}

	return nil
}

// waitForBootstrapConfigMap watches the configmaps in the kube-system namespace
// and waits for the bootstrap configmap to report that bootstrapping has
// completed.
func waitForBootstrapConfigMap(ctx context.Context, client *kubernetes.Clientset) *clusterCreateError {
	_, err := clientwatch.UntilWithSync(
		ctx,
		cache.NewListWatchFromClient(client.CoreV1().RESTClient(), "configmaps", "kube-system", fields.OneTermEqualSelector("metadata.name", "bootstrap")),
		&corev1.ConfigMap{},
		nil,
		func(event watch.Event) (bool, error) {
			switch event.Type {
			case watch.Added, watch.Modified:
			default:
				return false, nil
			}
			cm, ok := event.Object.(*corev1.ConfigMap)
			if !ok {
				logrus.Warnf("Expected a core/v1.ConfigMap object but got a %q object instead", event.Object.GetObjectKind().GroupVersionKind())
				return false, nil
			}
			status, ok := cm.Data["status"]
			if !ok {
				logrus.Debugf("No status found in bootstrap configmap")
				return false, nil
			}
			logrus.Debugf("Bootstrap status: %v", status)
			return status == "complete", nil
		},
	)
	if err != nil {
		return newBootstrapError(err)
	}
	return nil
}

// waitForStableSNOBootstrap makes sure a single-node (SNO) deployment is stable before the
// bootstrap node is removed, so that it is not torn down prematurely. Given the nature of
// single node, we just need to make sure components such as etcd are in the proper state
// before continuing.
func waitForStableSNOBootstrap(ctx context.Context, config *rest.Config) error {
	timeout := 5 * time.Minute

	// If we're not in a single node deployment, bail early.
	if isSNO, err := IsSingleNode(); err != nil {
		logrus.Warningf("Cannot determine if installing a single-node cluster, continuing as a normal install: %v", err)
		return nil
	} else if !isSNO {
		return nil
	}

	snoBootstrapContext, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	untilTime := time.Now().Add(timeout)
	timezone, _ := untilTime.Zone()
	logrus.Info("Detected Single Node deployment")
	logrus.Infof("Waiting up to %v (until %v %s) for the bootstrap etcd member to be removed...",
		timeout, untilTime.Format(time.Kitchen), timezone)

	client, err := dynamic.NewForConfig(config)
	if err != nil {
		return fmt.Errorf("error creating dynamic client: %w", err)
	}
	gvr := schema.GroupVersionResource{
		Group:    operatorv1.SchemeGroupVersion.Group,
		Version:  operatorv1.SchemeGroupVersion.Version,
		Resource: "etcds",
	}
	resourceClient := client.Resource(gvr)
	// Validate that the etcd operator has removed the bootstrap etcd member.
	return wait.PollUntilContextCancel(snoBootstrapContext, 1*time.Second, true, func(ctx context.Context) (done bool, err error) {
		etcdOperator := &operatorv1.Etcd{}
		etcdUnstructured, err := resourceClient.Get(ctx, "cluster", metav1.GetOptions{})
		if err != nil {
			// There might be service disruptions in SNO; we log them here but keep trying within the time limit.
			logrus.Debugf("Error getting ETCD Cluster resource, retrying: %v", err)
			return false, nil
		}
		err = runtime.DefaultUnstructuredConverter.FromUnstructured(etcdUnstructured.Object, etcdOperator)
		if err != nil {
			// This error should not happen; if it does, we log the error and keep retrying until we hit the limit.
			logrus.Debugf("Error parsing etcds resource, retrying: %v", err)
			return false, nil
		}
		for _, condition := range etcdOperator.Status.Conditions {
			if condition.Type == "EtcdBootstrapMemberRemoved" {
				return configv1.ConditionStatus(condition.Status) == configv1.ConditionTrue, nil
			}
		}
		return false, nil
	})
}

// waitForInitializedCluster watches the ClusterVersion waiting for confirmation
// that the cluster has been initialized.
func waitForInitializedCluster(ctx context.Context, config *rest.Config) error {
	// TODO revert this value back to 30 minutes. It's currently at the end of 4.6 and we're trying to see if the
	timeout := 40 * time.Minute

	// Wait longer for baremetal, due to the length of time it takes to boot.
	if assetStore, err := assetstore.NewStore(command.RootOpts.Dir); err == nil {
		if installConfig, err := assetStore.Load(&installconfig.InstallConfig{}); err == nil && installConfig != nil {
			if installConfig.(*installconfig.InstallConfig).Config.Platform.Name() == baremetal.Name {
				timeout = 60 * time.Minute
			}
		}

		checkIfAgentCommand(assetStore)
	}

	untilTime := time.Now().Add(timeout)
	timezone, _ := untilTime.Zone()
	logrus.Infof("Waiting up to %v (until %v %s) for the cluster at %s to initialize...",
		timeout, untilTime.Format(time.Kitchen), timezone, config.Host)
	cc, err := configclient.NewForConfig(config)
	if err != nil {
		return errors.Wrap(err, "failed to create a config client")
	}
	clusterVersionContext, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	failing := configv1.ClusterStatusConditionType("Failing")
	timer.StartTimer("Cluster Operators Available")
	var lastError string
	_, err = clientwatch.UntilWithSync(
		clusterVersionContext,
		cache.NewListWatchFromClient(cc.ConfigV1().RESTClient(), "clusterversions", "", fields.OneTermEqualSelector("metadata.name", "version")),
		&configv1.ClusterVersion{},
		nil,
		func(event watch.Event) (bool, error) {
			switch event.Type {
			case watch.Added, watch.Modified:
				cv, ok := event.Object.(*configv1.ClusterVersion)
				if !ok {
					logrus.Warnf("Expected a ClusterVersion object but got a %q object instead", event.Object.GetObjectKind().GroupVersionKind())
					return false, nil
				}
				if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, configv1.OperatorAvailable) &&
					cov1helpers.IsStatusConditionFalse(cv.Status.Conditions, failing) &&
					cov1helpers.IsStatusConditionFalse(cv.Status.Conditions, configv1.OperatorProgressing) {
					timer.StopTimer("Cluster Operators Available")
					return true, nil
				}
				if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, failing) {
					lastError = cov1helpers.FindStatusCondition(cv.Status.Conditions, failing).Message
				} else if cov1helpers.IsStatusConditionTrue(cv.Status.Conditions, configv1.OperatorProgressing) {
					lastError = cov1helpers.FindStatusCondition(cv.Status.Conditions, configv1.OperatorProgressing).Message
				}
				logrus.Debugf("Still waiting for the cluster to initialize: %s", lastError)
				return false, nil
			}
			logrus.Debug("Still waiting for the cluster to initialize...")
			return false, nil
		},
	)

	if err == nil {
		logrus.Debug("Cluster is initialized")
		return nil
	}

	if lastError != "" {
		if err == wait.ErrWaitTimeout {
			return errors.Errorf("failed to initialize the cluster: %s", lastError)
		}

		return errors.Wrapf(err, "failed to initialize the cluster: %s", lastError)
	}

	return errors.Wrap(err, "failed to initialize the cluster")
}

// waitForStableOperators ensures that each cluster operator is "stable", i.e. the
// operator has not been in a progressing state for at least a certain duration,
// 30 seconds by default. If any operator has not met this threshold after a deadline,
// 30 minutes by default, the installer exits with exitCodeOperatorStabilityFailed.
func waitForStableOperators(ctx context.Context, config *rest.Config) error {
	timer.StartTimer("Cluster Operators Stable")

	stabilityCheckDuration := 30 * time.Minute
	stabilityContext, cancel := context.WithTimeout(ctx, stabilityCheckDuration)
	defer cancel()

	untilTime := time.Now().Add(stabilityCheckDuration)
	timezone, _ := untilTime.Zone()
	logrus.Infof("Waiting up to %v (until %v %s) to ensure each cluster operator has finished progressing...",
		stabilityCheckDuration, untilTime.Format(time.Kitchen), timezone)

	cc, err := configclient.NewForConfig(config)
	if err != nil {
		return errors.Wrap(err, "failed to create a config client")
	}
	configInformers := configinformers.NewSharedInformerFactory(cc, 0)
	clusterOperatorInformer := configInformers.Config().V1().ClusterOperators().Informer()
	clusterOperatorLister := configInformers.Config().V1().ClusterOperators().Lister()
	configInformers.Start(ctx.Done())
	if !cache.WaitForCacheSync(ctx.Done(), clusterOperatorInformer.HasSynced) {
		return fmt.Errorf("informers never started")
	}

	waitErr := wait.PollUntilContextCancel(stabilityContext, 1*time.Second, true, waitForAllClusterOperators(clusterOperatorLister))
	if waitErr != nil {
		logrus.Errorf("Error checking cluster operator Progressing status: %q", waitErr)
		stableOperators, unstableOperators, err := currentOperatorStability(clusterOperatorLister)
		if err != nil {
			logrus.Errorf("Error checking final cluster operator Progressing status: %q", err)
		}
		logrus.Debugf("These cluster operators were stable: [%s]", strings.Join(sets.List(stableOperators), ", "))
		logrus.Errorf("These cluster operators were not stable: [%s]", strings.Join(sets.List(unstableOperators), ", "))

		logrus.Exit(exitCodeOperatorStabilityFailed)
	}

	timer.StopTimer("Cluster Operators Stable")

	logrus.Info("All cluster operators have completed progressing")

	return nil
}

// getConsole returns the console URL from the route 'console' in namespace openshift-console.
func getConsole(ctx context.Context, config *rest.Config) (string, error) {
	url := ""
	// Need to keep these updated if they change.
	consoleNamespace := "openshift-console"
	consoleRouteName := "console"
	rc, err := routeclient.NewForConfig(config)
	if err != nil {
		return "", errors.Wrap(err, "creating a route client")
	}

	consoleRouteTimeout := 2 * time.Minute
	logrus.Infof("Checking to see if there is a route at %s/%s...", consoleNamespace, consoleRouteName)
	consoleRouteContext, cancel := context.WithTimeout(ctx, consoleRouteTimeout)
	defer cancel()
	// Poll quickly, but only log when we've seen 15 of the same errors or
	// "no route" responses in a row (to show we're still alive).
	logDownsample := 15
	silenceRemaining := logDownsample
	timer.StartTimer("Console")
	wait.Until(func() {
		route, err := rc.RouteV1().Routes(consoleNamespace).Get(ctx, consoleRouteName, metav1.GetOptions{})
		if err == nil {
			logrus.Debugf("Route found in openshift-console namespace: %s", consoleRouteName)
			if uri, _, err2 := routeapihelpers.IngressURI(route, ""); err2 == nil {
				url = uri.String()
				logrus.Debug("OpenShift console route is admitted")
				cancel()
			} else {
				err = err2
			}
		} else if apierrors.IsNotFound(err) {
			logrus.Debug("OpenShift console route does not exist")
			cancel()
		}

		if err != nil {
			silenceRemaining--
			if silenceRemaining == 0 {
				logrus.Debugf("Still waiting for the console route: %v", err)
				silenceRemaining = logDownsample
			}
		}
	}, 2*time.Second, consoleRouteContext.Done())
	err = consoleRouteContext.Err()
	if err != nil && err != context.Canceled {
		return url, errors.Wrap(err, "waiting for openshift-console URL")
	}
	if url == "" {
		return url, errors.New("could not get openshift-console URL")
	}
	timer.StopTimer("Console")
	return url, nil
}

// logComplete prints info upon completion.
func logComplete(directory, consoleURL string) error {
	absDir, err := filepath.Abs(directory)
	if err != nil {
		return err
	}
	kubeconfig := filepath.Join(absDir, "auth", "kubeconfig")
	pwFile := filepath.Join(absDir, "auth", "kubeadmin-password")
	pw, err := os.ReadFile(pwFile)
	if err != nil {
		return err
	}
	logrus.Info("Install complete!")
	logrus.Infof("To access the cluster as the system:admin user when using 'oc', run 'export KUBECONFIG=%s'", kubeconfig)
	if consoleURL != "" {
		logrus.Infof("Access the OpenShift web-console here: %s", consoleURL)
		if skipPasswordPrintFlag {
			logrus.Infof("Credentials omitted, if necessary verify the %s file", pwFile)
		} else {
			logrus.Infof("Login to the console with user: %q, and password: %q", "kubeadmin", pw)
		}
	}
	return nil
}
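
// waitForInstallComplete waits for the ClusterVersion to report that the cluster has
// initialized, merges the router CA into the kubeconfig, waits for every cluster operator to
// stop progressing, then looks up the console route and prints the completion summary.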
func waitForInstallComplete(ctx context.Context, config *rest.Config, directory string) error {
	if err := waitForInitializedCluster(ctx, config); err != nil {
		return err
	}

	if err := addRouterCAToClusterCA(ctx, config, command.RootOpts.Dir); err != nil {
		return err
	}

	if err := waitForStableOperators(ctx, config); err != nil {
		return err
	}

	consoleURL, err := getConsole(ctx, config)
	if err != nil {
		logrus.Warnf("Cluster does not have a console available: %v", err)
	}

	return logComplete(command.RootOpts.Dir, consoleURL)
}

func logTroubleshootingLink() {
	logrus.Error(`Cluster initialization failed because one or more operators are not functioning properly.
The cluster should be accessible for troubleshooting as detailed in the documentation linked below,
https://docs.openshift.com/container-platform/latest/support/troubleshooting/troubleshooting-installations.html
The 'wait-for install-complete' subcommand can then be used to continue the installation`)
}

func checkIfAgentCommand(assetStore asset.Store) {
	if agentConfig, err := assetStore.Load(&agentconfig.AgentConfig{}); err == nil && agentConfig != nil {
		logrus.Warning("An agent configuration was detected but this command is not the agent wait-for command")
	}
}
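
// waitForAllClusterOperators returns a poll condition that reports true once every
// ClusterOperator has met the stability threshold. It remembers which operators were stable on
// the previous poll so that transitions to and from stability are logged only when they happen.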
func waitForAllClusterOperators(clusterOperatorLister configlisters.ClusterOperatorLister) func(ctx context.Context) (bool, error) {
	previouslyStableOperators := sets.Set[string]{}

	return func(ctx context.Context) (bool, error) {
		stableOperators, unstableOperators, err := currentOperatorStability(clusterOperatorLister)
		if err != nil {
			return false, err
		}
		if newlyStableOperators := stableOperators.Difference(previouslyStableOperators); len(newlyStableOperators) > 0 {
			for _, name := range sets.List(newlyStableOperators) {
				logrus.Debugf("Cluster Operator %s is stable", name)
			}
		}
		if newlyUnstableOperators := previouslyStableOperators.Difference(stableOperators); len(newlyUnstableOperators) > 0 {
			for _, name := range sets.List(newlyUnstableOperators) {
				logrus.Debugf("Cluster Operator %s became unstable", name)
			}
		}
		previouslyStableOperators = stableOperators

		if len(unstableOperators) == 0 {
			return true, nil
		}

		return false, nil
	}
}
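
// currentOperatorStability partitions all ClusterOperators into stable and unstable sets. An
// operator is stable when its Progressing condition has been False for longer than
// coStabilityThreshold; operators without a Progressing condition are treated as unstable.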
func currentOperatorStability(clusterOperatorLister configlisters.ClusterOperatorLister) (sets.Set[string], sets.Set[string], error) {
	clusterOperators, err := clusterOperatorLister.List(labels.Everything())
	if err != nil {
		return nil, nil, err // lister should never fail
	}

	stableOperators := sets.Set[string]{}
	unstableOperators := sets.Set[string]{}
	for _, clusterOperator := range clusterOperators {
		name := clusterOperator.Name
		progressing := cov1helpers.FindStatusCondition(clusterOperator.Status.Conditions, configv1.OperatorProgressing)
		if progressing == nil {
			logrus.Debugf("Cluster Operator %s progressing == nil", name)
			unstableOperators.Insert(name)
			continue
		}
		if meetsStabilityThreshold(progressing) {
			stableOperators.Insert(name)
		} else {
			logrus.Debugf("Cluster Operator %s is Progressing=%s LastTransitionTime=%v DurationSinceTransition=%.fs Reason=%s Message=%s", name, progressing.Status, progressing.LastTransitionTime.Time, time.Since(progressing.LastTransitionTime.Time).Seconds(), progressing.Reason, progressing.Message)
			unstableOperators.Insert(name)
		}
	}

	return stableOperators, unstableOperators, nil
}

func meetsStabilityThreshold(progressing *configv1.ClusterOperatorStatusCondition) bool {
	return progressing.Status == configv1.ConditionFalse && time.Since(progressing.LastTransitionTime.Time).Seconds() > coStabilityThreshold
}
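
// handleUnreachableAPIServer covers installations where the user provisions their own DNS
// (currently GCP with UserProvisionedDNS enabled): the API server hostname may not yet be
// resolvable from the installer host, so the client config's dialer is overridden to connect
// to the public load-balancer IP address recorded in the lbconfig asset. The lbconfig asset is
// then removed from disk so it does not linger in the install directory.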
func handleUnreachableAPIServer(ctx context.Context, config *rest.Config) error {
	assetStore, err := assetstore.NewStore(command.RootOpts.Dir)
	if err != nil {
		return fmt.Errorf("failed to create asset store: %w", err)
	}

	// Ensure that the install is expecting the user to provision their own DNS solution.
	installConfig := &installconfig.InstallConfig{}
	if err := assetStore.Fetch(ctx, installConfig); err != nil {
		return fmt.Errorf("failed to fetch %s: %w", installConfig.Name(), err)
	}
	switch installConfig.Config.Platform.Name() { //nolint:gocritic
	case gcp.Name:
		if installConfig.Config.GCP.UserProvisionedDNS != gcp.UserProvisionedDNSEnabled {
			return nil
		}
	default:
		return nil
	}

	lbConfig := &lbconfig.Config{}
	if err := assetStore.Fetch(ctx, lbConfig); err != nil {
		return fmt.Errorf("failed to fetch %s: %w", lbConfig.Name(), err)
	}

	_, ipAddrs, err := lbConfig.ParseDNSDataFromConfig(lbconfig.PublicLoadBalancer)
	if err != nil {
		return fmt.Errorf("failed to parse lbconfig: %w", err)
	}

	// The kubeconfig handles one IP address.
	ipAddr := ""
	if len(ipAddrs) > 0 {
		ipAddr = ipAddrs[0].String()
	}
	if ipAddr == "" {
		return fmt.Errorf("no ip address found in lbconfig")
	}

	dialer := &net.Dialer{
		Timeout:   1 * time.Minute,
		KeepAlive: 1 * time.Minute,
	}
	config.Dial = kubeconfig.CreateDialContext(dialer, ipAddr)

	// The asset is currently saved in <install-dir>/openshift. This directory
	// was consumed during install, but this file is generated after that action. This
	// artifact will hang around unless it is purged here.
	if err := asset.DeleteAssetFromDisk(lbConfig, command.RootOpts.Dir); err != nil {
		return fmt.Errorf("failed to delete %s from disk", lbConfig.Name())
	}

	return nil
}

// IsSingleNode determines if we are in a single-node configuration based on the install config
// loaded from the asset store.
func IsSingleNode() (bool, error) {
	assetStore, err := assetstore.NewStore(command.RootOpts.Dir)
	if err != nil {
		return false, fmt.Errorf("error loading asset store: %w", err)
	}
	installConfig, err := assetStore.Load(&installconfig.InstallConfig{})
	if err != nil {
		return false, fmt.Errorf("error loading installConfig: %w", err)
	}
	if installConfig == nil {
		return false, fmt.Errorf("installConfig loaded from asset store was nil")
	}

	config := installConfig.(*installconfig.InstallConfig).Config
	if machinePool := config.ControlPlane; machinePool != nil {
		return *machinePool.Replicas == int64(1), nil
	}
	return false, nil
}