github.com/openshift/installer@v1.4.17/pkg/agent/cluster.go (about) 1 package agent 2 3 import ( 4 "context" 5 "fmt" 6 "net" 7 "os" 8 "path/filepath" 9 "strconv" 10 "time" 11 12 "github.com/go-openapi/strfmt" 13 "github.com/pkg/errors" 14 "github.com/sirupsen/logrus" 15 16 "github.com/openshift/assisted-service/client/installer" 17 "github.com/openshift/assisted-service/models" 18 "github.com/openshift/installer/pkg/asset/agent/gencrypto" 19 "github.com/openshift/installer/pkg/asset/agent/workflow" 20 "github.com/openshift/installer/pkg/gather/ssh" 21 ) 22 23 // Cluster is a struct designed to help interact with the cluster that is 24 // currently being installed by agent installer. 25 type Cluster struct { 26 Ctx context.Context 27 API *clientSet 28 assetDir string 29 clusterConsoleRouteURL string 30 clusterID *strfmt.UUID 31 clusterInfraEnvID *strfmt.UUID 32 installHistory *clusterInstallStatusHistory 33 workflow workflow.AgentWorkflowType 34 } 35 36 type clientSet struct { 37 Kube *ClusterKubeAPIClient 38 OpenShift *ClusterOpenShiftAPIClient 39 Rest *NodeZeroRestClient 40 } 41 42 type clusterInstallStatusHistory struct { 43 RestAPISeen bool 44 RestAPIClusterStatusAddingHostsSeen bool 45 RestAPIClusterStatusCancelledSeen bool 46 RestAPIClusterStatusInstallingSeen bool 47 RestAPIClusterStatusInstallingPendingUserActionSeen bool 48 RestAPIClusterStatusInsufficientSeen bool 49 RestAPIClusterStatusFinalizingSeen bool 50 RestAPIClusterStatusErrorSeen bool 51 RestAPIClusterStatusPendingForInputSeen bool 52 RestAPIClusterStatusPreparingForInstallationSeen bool 53 RestAPIClusterStatusReadySeen bool 54 RestAPIInfraEnvEventList models.EventList 55 RestAPIPreviousClusterStatus string 56 RestAPIPreviousEventMessage string 57 RestAPIHostValidationsPassed bool 58 ClusterKubeAPISeen bool 59 ClusterBootstrapComplete bool 60 ClusterOperatorsInitialized bool 61 ClusterConsoleRouteCreated bool 62 ClusterConsoleRouteURLCreated bool 63 ClusterInstallComplete bool 64 NotReadyTime time.Time 65 ValidationResults *validationResults 66 ClusterInitTime time.Time 67 } 68 69 // NewCluster initializes a Cluster object 70 func NewCluster(ctx context.Context, assetDir, rendezvousIP, kubeconfigPath, sshKey string, workflowType workflow.AgentWorkflowType) (*Cluster, error) { 71 czero := &Cluster{} 72 capi := &clientSet{} 73 74 var authToken string 75 var err error 76 77 switch workflowType { 78 case workflow.AgentWorkflowTypeInstall: 79 authToken, err = FindAuthTokenFromAssetStore(assetDir) 80 if err != nil { 81 return nil, err 82 } 83 case workflow.AgentWorkflowTypeAddNodes: 84 authToken, err = gencrypto.GetAuthTokenFromCluster(ctx, kubeconfigPath) 85 if err != nil { 86 return nil, err 87 } 88 default: 89 return nil, fmt.Errorf("AgentWorkflowType value not supported: %s", workflowType) 90 } 91 92 restclient := NewNodeZeroRestClient(ctx, rendezvousIP, sshKey, authToken) 93 94 kubeclient, err := NewClusterKubeAPIClient(ctx, kubeconfigPath) 95 if err != nil { 96 logrus.Fatal(err) 97 } 98 99 ocpclient, err := NewClusterOpenShiftAPIClient(ctx, kubeconfigPath) 100 if err != nil { 101 logrus.Fatal(err) 102 } 103 104 capi.Rest = restclient 105 capi.Kube = kubeclient 106 capi.OpenShift = ocpclient 107 108 cinstallstatushistory := &clusterInstallStatusHistory{ 109 RestAPISeen: false, 110 RestAPIInfraEnvEventList: nil, 111 RestAPIPreviousClusterStatus: "", 112 RestAPIPreviousEventMessage: "", 113 RestAPIHostValidationsPassed: false, 114 ClusterKubeAPISeen: false, 115 ClusterBootstrapComplete: false, 116 ClusterOperatorsInitialized: false, 117 ClusterConsoleRouteCreated: false, 118 ClusterConsoleRouteURLCreated: false, 119 ClusterInstallComplete: false, 120 ClusterInitTime: time.Now(), 121 } 122 123 cvalidationresults := &validationResults{ 124 ClusterValidationHistory: make(map[string]*validationResultHistory), 125 HostValidationHistory: make(map[string]map[string]*validationResultHistory), 126 } 127 128 czero.Ctx = ctx 129 czero.API = capi 130 czero.workflow = workflowType 131 czero.clusterID = nil 132 czero.clusterInfraEnvID = nil 133 czero.assetDir = assetDir 134 czero.clusterConsoleRouteURL = "" 135 czero.installHistory = cinstallstatushistory 136 czero.installHistory.ValidationResults = cvalidationresults 137 return czero, nil 138 } 139 140 // IsBootstrapComplete (is-bootstrap-complete, exit-on-error, returned-error) 141 // IsBootstrapComplete Determine if the cluster has completed the bootstrap process. 142 func (czero *Cluster) IsBootstrapComplete() (bool, bool, error) { 143 144 if czero.installHistory.ClusterBootstrapComplete { 145 logrus.Info("Bootstrap is complete") 146 return true, false, nil 147 } 148 149 clusterKubeAPILive := czero.API.Kube.IsKubeAPILive() 150 151 agentRestAPILive := czero.API.Rest.IsRestAPILive() 152 153 // Both API's are not available 154 if !agentRestAPILive && !clusterKubeAPILive { 155 // Current API Status: Agent Rest API: down, Bootstrap Kube API: down 156 if !czero.installHistory.RestAPISeen && !czero.installHistory.ClusterKubeAPISeen { 157 logrus.Debug("Agent Rest API never initialized. Bootstrap Kube API never initialized") 158 elapsedSinceInit := time.Since(czero.installHistory.ClusterInitTime) 159 // After allowing time for the interface to come up, check if Node0 can be accessed via ssh 160 if elapsedSinceInit > 2*time.Minute && !czero.CanSSHToNodeZero() { 161 logrus.Info("Cannot access Rendezvous Host. There may be a network configuration problem, check console for additional info") 162 } else { 163 logrus.Info("Waiting for cluster install to initialize. Sleeping for 30 seconds") 164 } 165 166 time.Sleep(30 * time.Second) 167 return false, false, nil 168 } 169 170 if czero.installHistory.RestAPISeen && !czero.installHistory.ClusterKubeAPISeen { 171 logrus.Debug("Bootstrap Kube API never initialized") 172 logrus.Debugf("Cluster install status from Agent Rest API last seen was: %s", czero.installHistory.RestAPIPreviousClusterStatus) 173 return false, false, errors.New("cluster bootstrap did not complete") 174 } 175 } 176 177 // Kube API is available 178 if clusterKubeAPILive { 179 180 // First time we see the cluster Kube API 181 if !czero.installHistory.ClusterKubeAPISeen { 182 logrus.Info("Bootstrap Kube API Initialized") 183 czero.installHistory.ClusterKubeAPISeen = true 184 } 185 186 configmap, err := czero.API.Kube.IsBootstrapConfigMapComplete() 187 if configmap { 188 logrus.Info("Bootstrap configMap status is complete") 189 czero.installHistory.ClusterBootstrapComplete = true 190 } 191 if err != nil { 192 logrus.Debug(err) 193 } 194 } 195 196 // Agent Rest API is available 197 if agentRestAPILive { 198 exitOnErr, err := czero.MonitorStatusFromAssistedService() 199 if err != nil { 200 return false, exitOnErr, err 201 } 202 } 203 204 // cluster bootstrap is not complete 205 return false, false, nil 206 } 207 208 // MonitorStatusFromAssistedService (exit-on-error, returned-error) 209 // checks if the Assisted Service API is up, and both cluster and 210 // infraenv have been registered. 211 // 212 // After those preconditions are met, 213 // it then reports on the host validation status and overall cluster 214 // status and updates the cluster's install history. 215 // 216 // After cluster or host installation has started, new events from 217 // the Assisted Service API are also logged and updated to the cluster's 218 // install history. 219 func (czero *Cluster) MonitorStatusFromAssistedService() (bool, error) { 220 resource := "cluster" 221 logPrefix := "" 222 if czero.workflow == workflow.AgentWorkflowTypeAddNodes { 223 resource = "host" 224 logPrefix = fmt.Sprintf("Node %s: ", czero.API.Rest.NodeZeroIP) 225 } 226 227 // First time we see the agent Rest API 228 if !czero.installHistory.RestAPISeen { 229 logrus.Debugf("%sAgent Rest API Initialized", logPrefix) 230 czero.installHistory.RestAPISeen = true 231 czero.installHistory.NotReadyTime = time.Now() 232 } 233 234 // Lazy loading of the clusterID and clusterInfraEnvID 235 if czero.clusterID == nil { 236 clusterID, err := czero.API.Rest.getClusterID() 237 if err != nil { 238 return false, errors.Wrap(err, "Unable to retrieve clusterID from Agent Rest API") 239 } 240 czero.clusterID = clusterID 241 } 242 243 if czero.clusterInfraEnvID == nil { 244 clusterInfraEnvID, err := czero.API.Rest.getClusterInfraEnvID() 245 if err != nil { 246 return false, errors.Wrap(err, "Unable to retrieve clusterInfraEnvID from Agent Rest API") 247 } 248 czero.clusterInfraEnvID = clusterInfraEnvID 249 } 250 251 // Getting cluster metadata from Agent Rest API 252 clusterMetadata, err := czero.GetClusterRestAPIMetadata() 253 if err != nil { 254 return false, errors.Wrap(err, "Unable to retrieve cluster metadata from Agent Rest API") 255 } 256 257 if clusterMetadata == nil { 258 return false, errors.New("cluster metadata returned nil from Agent Rest API") 259 } 260 261 czero.PrintInstallStatus(clusterMetadata) 262 263 // If status indicates pending action, log host info to help pinpoint what is missing 264 if (*clusterMetadata.Status != czero.installHistory.RestAPIPreviousClusterStatus) && 265 (*clusterMetadata.Status == models.ClusterStatusInstallingPendingUserAction) { 266 for _, host := range clusterMetadata.Hosts { 267 if *host.Status == models.ClusterStatusInstallingPendingUserAction { 268 if logPrefix != "" { 269 logrus.Warningf("%s%s %s", logPrefix, host.RequestedHostname, *host.StatusInfo) 270 } else { 271 logrus.Warningf("Host %s %s", host.RequestedHostname, *host.StatusInfo) 272 } 273 } 274 } 275 } 276 277 if *clusterMetadata.Status == models.ClusterStatusReady { 278 stuck, err := czero.IsClusterStuckInReady() 279 if err != nil { 280 return stuck, err 281 } 282 } else { 283 czero.installHistory.NotReadyTime = time.Now() 284 } 285 286 czero.installHistory.RestAPIPreviousClusterStatus = *clusterMetadata.Status 287 288 installing, _ := czero.IsInstalling(*clusterMetadata.Status) 289 if !installing { 290 errored, _ := czero.HasErrored(*clusterMetadata.Status) 291 if errored { 292 return false, fmt.Errorf("%s has stopped installing... working to recover installation", resource) 293 } else if *clusterMetadata.Status == models.ClusterStatusCancelled { 294 return true, fmt.Errorf("%s installation was cancelled", resource) 295 } 296 } 297 298 validationsErr := checkValidations(clusterMetadata, czero.installHistory.ValidationResults, logrus.StandardLogger(), logPrefix) 299 if validationsErr != nil { 300 return false, errors.Wrap(validationsErr, "host validations failed") 301 } 302 303 // Print most recent event associated with the clusterInfraEnvID 304 eventList, err := czero.API.Rest.GetInfraEnvEvents(czero.clusterInfraEnvID) 305 if err != nil { 306 return false, errors.Wrap(err, fmt.Sprintf("Unable to retrieve events about the %s from the Agent Rest API", resource)) 307 } 308 if len(eventList) == 0 { 309 // No cluster events detected from the Agent Rest API 310 } else { 311 mostRecentEvent := eventList[len(eventList)-1] 312 // Don't print the same status message back to back 313 if *mostRecentEvent.Message != czero.installHistory.RestAPIPreviousEventMessage { 314 if *mostRecentEvent.Severity == models.EventSeverityInfo { 315 logrus.Infof("%s%s", logPrefix, *mostRecentEvent.Message) 316 } else { 317 logrus.Warnf("%s%s", logPrefix, *mostRecentEvent.Message) 318 } 319 } 320 czero.installHistory.RestAPIPreviousEventMessage = *mostRecentEvent.Message 321 czero.installHistory.RestAPIInfraEnvEventList = eventList 322 } 323 return false, nil 324 } 325 326 // IsInstallComplete Determine if the cluster has completed installation. 327 func (czero *Cluster) IsInstallComplete() (bool, error) { 328 329 if czero.installHistory.ClusterInstallComplete { 330 logrus.Info("Cluster installation is complete") 331 return true, nil 332 } 333 334 if !czero.installHistory.ClusterOperatorsInitialized { 335 initialized, err := czero.API.OpenShift.AreClusterOperatorsInitialized() 336 if initialized && err == nil { 337 czero.installHistory.ClusterOperatorsInitialized = true 338 } 339 if err != nil { 340 return false, errors.Wrap(err, "Error while initializing cluster operators") 341 } 342 343 } 344 345 if !czero.installHistory.ClusterConsoleRouteCreated { 346 route, err := czero.API.OpenShift.IsConsoleRouteAvailable() 347 if route && err == nil { 348 czero.installHistory.ClusterConsoleRouteCreated = true 349 } 350 if err != nil { 351 return false, errors.Wrap(err, "Error while waiting for console route") 352 } 353 354 } 355 356 if !czero.installHistory.ClusterConsoleRouteURLCreated { 357 available, url, err := czero.API.OpenShift.IsConsoleRouteURLAvailable() 358 if available && url != "" && err == nil { 359 czero.clusterConsoleRouteURL = url 360 czero.installHistory.ClusterConsoleRouteURLCreated = true 361 } 362 if err != nil { 363 return false, errors.Wrap(err, "Error while waiting for console route URL") 364 } 365 } 366 367 if czero.installHistory.ClusterOperatorsInitialized && 368 czero.installHistory.ClusterConsoleRouteCreated && 369 czero.installHistory.ClusterConsoleRouteURLCreated { 370 czero.installHistory.ClusterInstallComplete = true 371 return true, nil 372 } 373 374 return false, nil 375 } 376 377 // IsClusterStuckInReady Determine if the cluster has stopped transitioning out of the Ready state 378 func (czero *Cluster) IsClusterStuckInReady() (bool, error) { 379 380 // If the status changes back to Ready from Installing it indicates an error. This condition 381 // will be retried 382 if czero.installHistory.RestAPIPreviousClusterStatus == models.ClusterStatusPreparingForInstallation { 383 return false, errors.New("failed to prepare cluster installation, retrying") 384 } 385 386 // Check if stuck in Ready state 387 if czero.installHistory.RestAPIPreviousClusterStatus == models.ClusterStatusReady { 388 current := time.Now() 389 elapsed := current.Sub(czero.installHistory.NotReadyTime) 390 if elapsed > 1*time.Minute { 391 return true, errors.New("failed to progress after all hosts available") 392 } 393 } 394 395 return false, nil 396 } 397 398 // GetClusterRestAPIMetadata Retrieve the current cluster metadata from the Agent Rest API 399 func (czero *Cluster) GetClusterRestAPIMetadata() (*models.Cluster, error) { 400 // GET /v2/clusters/{cluster_zero_id} 401 if czero.clusterID != nil { 402 getClusterParams := &installer.V2GetClusterParams{ClusterID: *czero.clusterID} 403 result, err := czero.API.Rest.Client.Installer.V2GetCluster(czero.Ctx, getClusterParams) 404 if err != nil { 405 return nil, err 406 } 407 return result.Payload, nil 408 } 409 return nil, errors.New("no clusterID known for the cluster") 410 } 411 412 // HasErrored Determine if the cluster installation has errored using the models from the Agent Rest API. 413 func (czero *Cluster) HasErrored(status string) (bool, string) { 414 clusterErrorStates := map[string]bool{ 415 models.ClusterStatusAddingHosts: false, 416 models.ClusterStatusCancelled: false, 417 models.ClusterStatusInstalling: false, 418 models.ClusterStatusInstallingPendingUserAction: true, 419 models.ClusterStatusInsufficient: false, 420 models.ClusterStatusError: true, 421 models.ClusterStatusFinalizing: false, 422 models.ClusterStatusPendingForInput: false, 423 models.ClusterStatusPreparingForInstallation: false, 424 models.ClusterStatusReady: false, 425 } 426 return clusterErrorStates[status], status 427 } 428 429 // IsInstalling Determine if the cluster is still installing using the models from the Agent Rest API. 430 func (czero *Cluster) IsInstalling(status string) (bool, string) { 431 clusterInstallingStates := map[string]bool{ 432 models.ClusterStatusAddingHosts: true, 433 models.ClusterStatusCancelled: false, 434 models.ClusterStatusInstalling: true, 435 models.ClusterStatusInstallingPendingUserAction: false, 436 models.ClusterStatusInsufficient: false, 437 models.ClusterStatusError: false, 438 models.ClusterStatusFinalizing: true, 439 models.ClusterStatusPendingForInput: true, 440 models.ClusterStatusPreparingForInstallation: true, 441 models.ClusterStatusReady: true, 442 } 443 return clusterInstallingStates[status], status 444 } 445 446 // PrintInfraEnvRestAPIEventList Prints the whole event list for debugging 447 func (czero *Cluster) PrintInfraEnvRestAPIEventList() { 448 if czero.installHistory.RestAPIInfraEnvEventList != nil { 449 for i := 0; i < len(czero.installHistory.RestAPIInfraEnvEventList); i++ { 450 logrus.Debug(*czero.installHistory.RestAPIInfraEnvEventList[i].Message) 451 } 452 } else { 453 logrus.Debug("No events logged from the Agent Rest API") 454 } 455 } 456 457 // PrintInstallationComplete Prints the installation complete information 458 func (czero *Cluster) PrintInstallationComplete() error { 459 absDir, err := filepath.Abs(czero.assetDir) 460 if err != nil { 461 return err 462 } 463 kubeconfig := filepath.Join(absDir, "auth", "kubeconfig") 464 pwFile := filepath.Join(absDir, "auth", "kubeadmin-password") 465 pw, err := os.ReadFile(pwFile) 466 if err != nil { 467 return err 468 } 469 logrus.Info("Install complete!") 470 logrus.Infof("To access the cluster as the system:admin user when using 'oc', run\n export KUBECONFIG=%s", kubeconfig) 471 logrus.Infof("Access the OpenShift web-console here: %s", czero.clusterConsoleRouteURL) 472 logrus.Infof("Login to the console with user: %q, and password: %q", "kubeadmin", pw) 473 return nil 474 475 } 476 477 // PrintInstallStatus Print a human friendly message using the models from the Agent Rest API. 478 func (czero *Cluster) PrintInstallStatus(cluster *models.Cluster) { 479 friendlyStatus := czero.humanFriendlyClusterInstallStatus(*cluster.Status) 480 // Don't print the same status message back to back 481 if *cluster.Status != czero.installHistory.RestAPIPreviousClusterStatus { 482 logrus.Info(friendlyStatus) 483 } 484 } 485 486 // CanSSHToNodeZero Checks if ssh to NodeZero succeeds. 487 func (czero *Cluster) CanSSHToNodeZero() bool { 488 ip := czero.API.Rest.NodeZeroIP 489 port := 22 490 491 _, err := ssh.NewClient("core", net.JoinHostPort(ip, strconv.Itoa(port)), czero.API.Rest.NodeSSHKey) 492 if err != nil { 493 logrus.Debugf("Failed to connect to the Rendezvous Host: %s", err) 494 } 495 return err == nil 496 } 497 498 // Human friendly install status strings mapped to the Agent Rest API cluster statuses 499 func (czero *Cluster) humanFriendlyClusterInstallStatus(status string) string { 500 clusterStoppedInstallingStates := map[string]string{ 501 models.ClusterStatusAddingHosts: "Cluster is adding hosts", 502 models.ClusterStatusCancelled: "Cluster installation cancelled", 503 models.ClusterStatusError: "Cluster has hosts in error", 504 models.ClusterStatusFinalizing: "Finalizing cluster installation", 505 models.ClusterStatusInstalling: "Cluster installation in progress", 506 models.ClusterStatusInstallingPendingUserAction: "Cluster has hosts requiring user input", 507 models.ClusterStatusInsufficient: "Cluster is not ready for install. Check validations", 508 models.ClusterStatusPendingForInput: "User input is required to continue cluster installation", 509 models.ClusterStatusPreparingForInstallation: "Preparing cluster for installation", 510 models.ClusterStatusReady: "Cluster is ready for install", 511 } 512 switch czero.workflow { 513 case workflow.AgentWorkflowTypeAddNodes: 514 return fmt.Sprintf("Node %s: %s", czero.API.Rest.NodeZeroIP, clusterStoppedInstallingStates[status]) 515 default: 516 return clusterStoppedInstallingStates[status] 517 } 518 }