github.com/openshift/installer@v1.4.17/pkg/agent/cluster.go (about)

     1  package agent
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"net"
     7  	"os"
     8  	"path/filepath"
     9  	"strconv"
    10  	"time"
    11  
    12  	"github.com/go-openapi/strfmt"
    13  	"github.com/pkg/errors"
    14  	"github.com/sirupsen/logrus"
    15  
    16  	"github.com/openshift/assisted-service/client/installer"
    17  	"github.com/openshift/assisted-service/models"
    18  	"github.com/openshift/installer/pkg/asset/agent/gencrypto"
    19  	"github.com/openshift/installer/pkg/asset/agent/workflow"
    20  	"github.com/openshift/installer/pkg/gather/ssh"
    21  )
    22  
// Cluster is a struct designed to help interact with the cluster that is
// currently being installed by agent installer.
//
// It bundles the API clients with the state gathered while polling:
//   - Ctx is the context supplied to NewCluster; it is reused when querying
//     the Agent Rest API.
//   - API holds the Kube, OpenShift and Agent Rest API clients.
//   - assetDir is the asset directory; PrintInstallationComplete resolves the
//     auth files relative to it.
//   - clusterConsoleRouteURL is filled in by IsInstallComplete once the
//     console route URL is available.
//   - clusterID and clusterInfraEnvID start as nil and are loaded lazily by
//     MonitorStatusFromAssistedService.
//   - installHistory records what has been observed so far, so repeated polls
//     can skip finished checks and avoid repeating log messages.
//   - workflow distinguishes the install flow from the add-nodes flow.
type Cluster struct {
	Ctx                    context.Context
	API                    *clientSet
	assetDir               string
	clusterConsoleRouteURL string
	clusterID              *strfmt.UUID
	clusterInfraEnvID      *strfmt.UUID
	installHistory         *clusterInstallStatusHistory
	workflow               workflow.AgentWorkflowType
}
    35  
// clientSet groups the API clients a Cluster talks to:
//   - Kube is used to check whether the cluster Kube API is live and whether
//     the bootstrap configMap reports completion.
//   - OpenShift is used to check cluster operators and the console route.
//   - Rest is the client for the Agent Rest API on the rendezvous host
//     (node zero).
type clientSet struct {
	Kube      *ClusterKubeAPIClient
	OpenShift *ClusterOpenShiftAPIClient
	Rest      *NodeZeroRestClient
}
    41  
// clusterInstallStatusHistory records what has been observed while monitoring
// an installation, so that state survives between polling calls.
//
// Field groups:
//   - RestAPISeen and ClusterKubeAPISeen are latched the first time the Agent
//     Rest API and the cluster Kube API are seen live.
//   - The RestAPIClusterStatus*Seen flags are named after Agent Rest API
//     cluster statuses; nothing in the code visible here reads or writes them.
//   - RestAPIPreviousClusterStatus and RestAPIPreviousEventMessage hold the
//     last status and event message handled, and are used to avoid logging
//     the same message twice in a row.
//   - RestAPIInfraEnvEventList is the most recent non-empty event list
//     fetched for the infra-env.
//   - RestAPIHostValidationsPassed is initialised to false by NewCluster and
//     is not otherwise used in the code visible here.
//   - ClusterBootstrapComplete, ClusterOperatorsInitialized,
//     ClusterConsoleRouteCreated, ClusterConsoleRouteURLCreated and
//     ClusterInstallComplete latch completed milestones so they are not
//     checked again.
//   - NotReadyTime is refreshed when the Agent Rest API is first seen and
//     whenever the cluster status is not ready; IsClusterStuckInReady measures
//     elapsed time against it.
//   - ValidationResults holds the cluster and host validation history handed
//     to checkValidations.
//   - ClusterInitTime is the time at which NewCluster created this history.
type clusterInstallStatusHistory struct {
	RestAPISeen                                         bool
	RestAPIClusterStatusAddingHostsSeen                 bool
	RestAPIClusterStatusCancelledSeen                   bool
	RestAPIClusterStatusInstallingSeen                  bool
	RestAPIClusterStatusInstallingPendingUserActionSeen bool
	RestAPIClusterStatusInsufficientSeen                bool
	RestAPIClusterStatusFinalizingSeen                  bool
	RestAPIClusterStatusErrorSeen                       bool
	RestAPIClusterStatusPendingForInputSeen             bool
	RestAPIClusterStatusPreparingForInstallationSeen    bool
	RestAPIClusterStatusReadySeen                       bool
	RestAPIInfraEnvEventList                            models.EventList
	RestAPIPreviousClusterStatus                        string
	RestAPIPreviousEventMessage                         string
	RestAPIHostValidationsPassed                        bool
	ClusterKubeAPISeen                                  bool
	ClusterBootstrapComplete                            bool
	ClusterOperatorsInitialized                         bool
	ClusterConsoleRouteCreated                          bool
	ClusterConsoleRouteURLCreated                       bool
	ClusterInstallComplete                              bool
	NotReadyTime                                        time.Time
	ValidationResults                                   *validationResults
	ClusterInitTime                                     time.Time
}
    68  
    69  // NewCluster initializes a Cluster object
    70  func NewCluster(ctx context.Context, assetDir, rendezvousIP, kubeconfigPath, sshKey string, workflowType workflow.AgentWorkflowType) (*Cluster, error) {
    71  	czero := &Cluster{}
    72  	capi := &clientSet{}
    73  
    74  	var authToken string
    75  	var err error
    76  
    77  	switch workflowType {
    78  	case workflow.AgentWorkflowTypeInstall:
    79  		authToken, err = FindAuthTokenFromAssetStore(assetDir)
    80  		if err != nil {
    81  			return nil, err
    82  		}
    83  	case workflow.AgentWorkflowTypeAddNodes:
    84  		authToken, err = gencrypto.GetAuthTokenFromCluster(ctx, kubeconfigPath)
    85  		if err != nil {
    86  			return nil, err
    87  		}
    88  	default:
    89  		return nil, fmt.Errorf("AgentWorkflowType value not supported: %s", workflowType)
    90  	}
    91  
    92  	restclient := NewNodeZeroRestClient(ctx, rendezvousIP, sshKey, authToken)
    93  
    94  	kubeclient, err := NewClusterKubeAPIClient(ctx, kubeconfigPath)
    95  	if err != nil {
    96  		logrus.Fatal(err)
    97  	}
    98  
    99  	ocpclient, err := NewClusterOpenShiftAPIClient(ctx, kubeconfigPath)
   100  	if err != nil {
   101  		logrus.Fatal(err)
   102  	}
   103  
   104  	capi.Rest = restclient
   105  	capi.Kube = kubeclient
   106  	capi.OpenShift = ocpclient
   107  
   108  	cinstallstatushistory := &clusterInstallStatusHistory{
   109  		RestAPISeen:                   false,
   110  		RestAPIInfraEnvEventList:      nil,
   111  		RestAPIPreviousClusterStatus:  "",
   112  		RestAPIPreviousEventMessage:   "",
   113  		RestAPIHostValidationsPassed:  false,
   114  		ClusterKubeAPISeen:            false,
   115  		ClusterBootstrapComplete:      false,
   116  		ClusterOperatorsInitialized:   false,
   117  		ClusterConsoleRouteCreated:    false,
   118  		ClusterConsoleRouteURLCreated: false,
   119  		ClusterInstallComplete:        false,
   120  		ClusterInitTime:               time.Now(),
   121  	}
   122  
   123  	cvalidationresults := &validationResults{
   124  		ClusterValidationHistory: make(map[string]*validationResultHistory),
   125  		HostValidationHistory:    make(map[string]map[string]*validationResultHistory),
   126  	}
   127  
   128  	czero.Ctx = ctx
   129  	czero.API = capi
   130  	czero.workflow = workflowType
   131  	czero.clusterID = nil
   132  	czero.clusterInfraEnvID = nil
   133  	czero.assetDir = assetDir
   134  	czero.clusterConsoleRouteURL = ""
   135  	czero.installHistory = cinstallstatushistory
   136  	czero.installHistory.ValidationResults = cvalidationresults
   137  	return czero, nil
   138  }
   139  
   140  // IsBootstrapComplete (is-bootstrap-complete, exit-on-error, returned-error)
   141  // IsBootstrapComplete Determine if the cluster has completed the bootstrap process.
   142  func (czero *Cluster) IsBootstrapComplete() (bool, bool, error) {
   143  
   144  	if czero.installHistory.ClusterBootstrapComplete {
   145  		logrus.Info("Bootstrap is complete")
   146  		return true, false, nil
   147  	}
   148  
   149  	clusterKubeAPILive := czero.API.Kube.IsKubeAPILive()
   150  
   151  	agentRestAPILive := czero.API.Rest.IsRestAPILive()
   152  
   153  	// Both API's are not available
   154  	if !agentRestAPILive && !clusterKubeAPILive {
   155  		// Current API Status: Agent Rest API: down, Bootstrap Kube API: down
   156  		if !czero.installHistory.RestAPISeen && !czero.installHistory.ClusterKubeAPISeen {
   157  			logrus.Debug("Agent Rest API never initialized. Bootstrap Kube API never initialized")
   158  			elapsedSinceInit := time.Since(czero.installHistory.ClusterInitTime)
   159  			// After allowing time for the interface to come up, check if Node0 can be accessed via ssh
   160  			if elapsedSinceInit > 2*time.Minute && !czero.CanSSHToNodeZero() {
   161  				logrus.Info("Cannot access Rendezvous Host. There may be a network configuration problem, check console for additional info")
   162  			} else {
   163  				logrus.Info("Waiting for cluster install to initialize. Sleeping for 30 seconds")
   164  			}
   165  
   166  			time.Sleep(30 * time.Second)
   167  			return false, false, nil
   168  		}
   169  
   170  		if czero.installHistory.RestAPISeen && !czero.installHistory.ClusterKubeAPISeen {
   171  			logrus.Debug("Bootstrap Kube API never initialized")
   172  			logrus.Debugf("Cluster install status from Agent Rest API last seen was: %s", czero.installHistory.RestAPIPreviousClusterStatus)
   173  			return false, false, errors.New("cluster bootstrap did not complete")
   174  		}
   175  	}
   176  
   177  	// Kube API is available
   178  	if clusterKubeAPILive {
   179  
   180  		// First time we see the cluster Kube API
   181  		if !czero.installHistory.ClusterKubeAPISeen {
   182  			logrus.Info("Bootstrap Kube API Initialized")
   183  			czero.installHistory.ClusterKubeAPISeen = true
   184  		}
   185  
   186  		configmap, err := czero.API.Kube.IsBootstrapConfigMapComplete()
   187  		if configmap {
   188  			logrus.Info("Bootstrap configMap status is complete")
   189  			czero.installHistory.ClusterBootstrapComplete = true
   190  		}
   191  		if err != nil {
   192  			logrus.Debug(err)
   193  		}
   194  	}
   195  
   196  	// Agent Rest API is available
   197  	if agentRestAPILive {
   198  		exitOnErr, err := czero.MonitorStatusFromAssistedService()
   199  		if err != nil {
   200  			return false, exitOnErr, err
   201  		}
   202  	}
   203  
   204  	// cluster bootstrap is not complete
   205  	return false, false, nil
   206  }
   207  
   208  // MonitorStatusFromAssistedService (exit-on-error, returned-error)
   209  // checks if the Assisted Service API is up, and both cluster and
   210  // infraenv have been registered.
   211  //
   212  // After those preconditions are met,
   213  // it then reports on the host validation status and overall cluster
   214  // status and updates the cluster's install history.
   215  //
   216  // After cluster or host installation has started, new events from
   217  // the Assisted Service API are also logged and updated to the cluster's
   218  // install history.
   219  func (czero *Cluster) MonitorStatusFromAssistedService() (bool, error) {
   220  	resource := "cluster"
   221  	logPrefix := ""
   222  	if czero.workflow == workflow.AgentWorkflowTypeAddNodes {
   223  		resource = "host"
   224  		logPrefix = fmt.Sprintf("Node %s: ", czero.API.Rest.NodeZeroIP)
   225  	}
   226  
   227  	// First time we see the agent Rest API
   228  	if !czero.installHistory.RestAPISeen {
   229  		logrus.Debugf("%sAgent Rest API Initialized", logPrefix)
   230  		czero.installHistory.RestAPISeen = true
   231  		czero.installHistory.NotReadyTime = time.Now()
   232  	}
   233  
   234  	// Lazy loading of the clusterID and clusterInfraEnvID
   235  	if czero.clusterID == nil {
   236  		clusterID, err := czero.API.Rest.getClusterID()
   237  		if err != nil {
   238  			return false, errors.Wrap(err, "Unable to retrieve clusterID from Agent Rest API")
   239  		}
   240  		czero.clusterID = clusterID
   241  	}
   242  
   243  	if czero.clusterInfraEnvID == nil {
   244  		clusterInfraEnvID, err := czero.API.Rest.getClusterInfraEnvID()
   245  		if err != nil {
   246  			return false, errors.Wrap(err, "Unable to retrieve clusterInfraEnvID from Agent Rest API")
   247  		}
   248  		czero.clusterInfraEnvID = clusterInfraEnvID
   249  	}
   250  
   251  	// Getting cluster metadata from Agent Rest API
   252  	clusterMetadata, err := czero.GetClusterRestAPIMetadata()
   253  	if err != nil {
   254  		return false, errors.Wrap(err, "Unable to retrieve cluster metadata from Agent Rest API")
   255  	}
   256  
   257  	if clusterMetadata == nil {
   258  		return false, errors.New("cluster metadata returned nil from Agent Rest API")
   259  	}
   260  
   261  	czero.PrintInstallStatus(clusterMetadata)
   262  
   263  	// If status indicates pending action, log host info to help pinpoint what is missing
   264  	if (*clusterMetadata.Status != czero.installHistory.RestAPIPreviousClusterStatus) &&
   265  		(*clusterMetadata.Status == models.ClusterStatusInstallingPendingUserAction) {
   266  		for _, host := range clusterMetadata.Hosts {
   267  			if *host.Status == models.ClusterStatusInstallingPendingUserAction {
   268  				if logPrefix != "" {
   269  					logrus.Warningf("%s%s %s", logPrefix, host.RequestedHostname, *host.StatusInfo)
   270  				} else {
   271  					logrus.Warningf("Host %s %s", host.RequestedHostname, *host.StatusInfo)
   272  				}
   273  			}
   274  		}
   275  	}
   276  
   277  	if *clusterMetadata.Status == models.ClusterStatusReady {
   278  		stuck, err := czero.IsClusterStuckInReady()
   279  		if err != nil {
   280  			return stuck, err
   281  		}
   282  	} else {
   283  		czero.installHistory.NotReadyTime = time.Now()
   284  	}
   285  
   286  	czero.installHistory.RestAPIPreviousClusterStatus = *clusterMetadata.Status
   287  
   288  	installing, _ := czero.IsInstalling(*clusterMetadata.Status)
   289  	if !installing {
   290  		errored, _ := czero.HasErrored(*clusterMetadata.Status)
   291  		if errored {
   292  			return false, fmt.Errorf("%s has stopped installing... working to recover installation", resource)
   293  		} else if *clusterMetadata.Status == models.ClusterStatusCancelled {
   294  			return true, fmt.Errorf("%s installation was cancelled", resource)
   295  		}
   296  	}
   297  
   298  	validationsErr := checkValidations(clusterMetadata, czero.installHistory.ValidationResults, logrus.StandardLogger(), logPrefix)
   299  	if validationsErr != nil {
   300  		return false, errors.Wrap(validationsErr, "host validations failed")
   301  	}
   302  
   303  	// Print most recent event associated with the clusterInfraEnvID
   304  	eventList, err := czero.API.Rest.GetInfraEnvEvents(czero.clusterInfraEnvID)
   305  	if err != nil {
   306  		return false, errors.Wrap(err, fmt.Sprintf("Unable to retrieve events about the %s from the Agent Rest API", resource))
   307  	}
   308  	if len(eventList) == 0 {
   309  		// No cluster events detected from the Agent Rest API
   310  	} else {
   311  		mostRecentEvent := eventList[len(eventList)-1]
   312  		// Don't print the same status message back to back
   313  		if *mostRecentEvent.Message != czero.installHistory.RestAPIPreviousEventMessage {
   314  			if *mostRecentEvent.Severity == models.EventSeverityInfo {
   315  				logrus.Infof("%s%s", logPrefix, *mostRecentEvent.Message)
   316  			} else {
   317  				logrus.Warnf("%s%s", logPrefix, *mostRecentEvent.Message)
   318  			}
   319  		}
   320  		czero.installHistory.RestAPIPreviousEventMessage = *mostRecentEvent.Message
   321  		czero.installHistory.RestAPIInfraEnvEventList = eventList
   322  	}
   323  	return false, nil
   324  }
   325  
   326  // IsInstallComplete Determine if the cluster has completed installation.
   327  func (czero *Cluster) IsInstallComplete() (bool, error) {
   328  
   329  	if czero.installHistory.ClusterInstallComplete {
   330  		logrus.Info("Cluster installation is complete")
   331  		return true, nil
   332  	}
   333  
   334  	if !czero.installHistory.ClusterOperatorsInitialized {
   335  		initialized, err := czero.API.OpenShift.AreClusterOperatorsInitialized()
   336  		if initialized && err == nil {
   337  			czero.installHistory.ClusterOperatorsInitialized = true
   338  		}
   339  		if err != nil {
   340  			return false, errors.Wrap(err, "Error while initializing cluster operators")
   341  		}
   342  
   343  	}
   344  
   345  	if !czero.installHistory.ClusterConsoleRouteCreated {
   346  		route, err := czero.API.OpenShift.IsConsoleRouteAvailable()
   347  		if route && err == nil {
   348  			czero.installHistory.ClusterConsoleRouteCreated = true
   349  		}
   350  		if err != nil {
   351  			return false, errors.Wrap(err, "Error while waiting for console route")
   352  		}
   353  
   354  	}
   355  
   356  	if !czero.installHistory.ClusterConsoleRouteURLCreated {
   357  		available, url, err := czero.API.OpenShift.IsConsoleRouteURLAvailable()
   358  		if available && url != "" && err == nil {
   359  			czero.clusterConsoleRouteURL = url
   360  			czero.installHistory.ClusterConsoleRouteURLCreated = true
   361  		}
   362  		if err != nil {
   363  			return false, errors.Wrap(err, "Error while waiting for console route URL")
   364  		}
   365  	}
   366  
   367  	if czero.installHistory.ClusterOperatorsInitialized &&
   368  		czero.installHistory.ClusterConsoleRouteCreated &&
   369  		czero.installHistory.ClusterConsoleRouteURLCreated {
   370  		czero.installHistory.ClusterInstallComplete = true
   371  		return true, nil
   372  	}
   373  
   374  	return false, nil
   375  }
   376  
   377  // IsClusterStuckInReady Determine if the cluster has stopped transitioning out of the Ready state
   378  func (czero *Cluster) IsClusterStuckInReady() (bool, error) {
   379  
   380  	// If the status changes back to Ready from Installing it indicates an error. This condition
   381  	// will be retried
   382  	if czero.installHistory.RestAPIPreviousClusterStatus == models.ClusterStatusPreparingForInstallation {
   383  		return false, errors.New("failed to prepare cluster installation, retrying")
   384  	}
   385  
   386  	// Check if stuck in Ready state
   387  	if czero.installHistory.RestAPIPreviousClusterStatus == models.ClusterStatusReady {
   388  		current := time.Now()
   389  		elapsed := current.Sub(czero.installHistory.NotReadyTime)
   390  		if elapsed > 1*time.Minute {
   391  			return true, errors.New("failed to progress after all hosts available")
   392  		}
   393  	}
   394  
   395  	return false, nil
   396  }
   397  
   398  // GetClusterRestAPIMetadata Retrieve the current cluster metadata from the Agent Rest API
   399  func (czero *Cluster) GetClusterRestAPIMetadata() (*models.Cluster, error) {
   400  	// GET /v2/clusters/{cluster_zero_id}
   401  	if czero.clusterID != nil {
   402  		getClusterParams := &installer.V2GetClusterParams{ClusterID: *czero.clusterID}
   403  		result, err := czero.API.Rest.Client.Installer.V2GetCluster(czero.Ctx, getClusterParams)
   404  		if err != nil {
   405  			return nil, err
   406  		}
   407  		return result.Payload, nil
   408  	}
   409  	return nil, errors.New("no clusterID known for the cluster")
   410  }
   411  
   412  // HasErrored Determine if the cluster installation has errored using the models from the Agent Rest API.
   413  func (czero *Cluster) HasErrored(status string) (bool, string) {
   414  	clusterErrorStates := map[string]bool{
   415  		models.ClusterStatusAddingHosts:                 false,
   416  		models.ClusterStatusCancelled:                   false,
   417  		models.ClusterStatusInstalling:                  false,
   418  		models.ClusterStatusInstallingPendingUserAction: true,
   419  		models.ClusterStatusInsufficient:                false,
   420  		models.ClusterStatusError:                       true,
   421  		models.ClusterStatusFinalizing:                  false,
   422  		models.ClusterStatusPendingForInput:             false,
   423  		models.ClusterStatusPreparingForInstallation:    false,
   424  		models.ClusterStatusReady:                       false,
   425  	}
   426  	return clusterErrorStates[status], status
   427  }
   428  
   429  // IsInstalling Determine if the cluster is still installing using the models from the Agent Rest API.
   430  func (czero *Cluster) IsInstalling(status string) (bool, string) {
   431  	clusterInstallingStates := map[string]bool{
   432  		models.ClusterStatusAddingHosts:                 true,
   433  		models.ClusterStatusCancelled:                   false,
   434  		models.ClusterStatusInstalling:                  true,
   435  		models.ClusterStatusInstallingPendingUserAction: false,
   436  		models.ClusterStatusInsufficient:                false,
   437  		models.ClusterStatusError:                       false,
   438  		models.ClusterStatusFinalizing:                  true,
   439  		models.ClusterStatusPendingForInput:             true,
   440  		models.ClusterStatusPreparingForInstallation:    true,
   441  		models.ClusterStatusReady:                       true,
   442  	}
   443  	return clusterInstallingStates[status], status
   444  }
   445  
   446  // PrintInfraEnvRestAPIEventList Prints the whole event list for debugging
   447  func (czero *Cluster) PrintInfraEnvRestAPIEventList() {
   448  	if czero.installHistory.RestAPIInfraEnvEventList != nil {
   449  		for i := 0; i < len(czero.installHistory.RestAPIInfraEnvEventList); i++ {
   450  			logrus.Debug(*czero.installHistory.RestAPIInfraEnvEventList[i].Message)
   451  		}
   452  	} else {
   453  		logrus.Debug("No events logged from the Agent Rest API")
   454  	}
   455  }
   456  
   457  // PrintInstallationComplete Prints the installation complete information
   458  func (czero *Cluster) PrintInstallationComplete() error {
   459  	absDir, err := filepath.Abs(czero.assetDir)
   460  	if err != nil {
   461  		return err
   462  	}
   463  	kubeconfig := filepath.Join(absDir, "auth", "kubeconfig")
   464  	pwFile := filepath.Join(absDir, "auth", "kubeadmin-password")
   465  	pw, err := os.ReadFile(pwFile)
   466  	if err != nil {
   467  		return err
   468  	}
   469  	logrus.Info("Install complete!")
   470  	logrus.Infof("To access the cluster as the system:admin user when using 'oc', run\n    export KUBECONFIG=%s", kubeconfig)
   471  	logrus.Infof("Access the OpenShift web-console here: %s", czero.clusterConsoleRouteURL)
   472  	logrus.Infof("Login to the console with user: %q, and password: %q", "kubeadmin", pw)
   473  	return nil
   474  
   475  }
   476  
   477  // PrintInstallStatus Print a human friendly message using the models from the Agent Rest API.
   478  func (czero *Cluster) PrintInstallStatus(cluster *models.Cluster) {
   479  	friendlyStatus := czero.humanFriendlyClusterInstallStatus(*cluster.Status)
   480  	// Don't print the same status message back to back
   481  	if *cluster.Status != czero.installHistory.RestAPIPreviousClusterStatus {
   482  		logrus.Info(friendlyStatus)
   483  	}
   484  }
   485  
   486  // CanSSHToNodeZero Checks if ssh to NodeZero succeeds.
   487  func (czero *Cluster) CanSSHToNodeZero() bool {
   488  	ip := czero.API.Rest.NodeZeroIP
   489  	port := 22
   490  
   491  	_, err := ssh.NewClient("core", net.JoinHostPort(ip, strconv.Itoa(port)), czero.API.Rest.NodeSSHKey)
   492  	if err != nil {
   493  		logrus.Debugf("Failed to connect to the Rendezvous Host: %s", err)
   494  	}
   495  	return err == nil
   496  }
   497  
   498  // Human friendly install status strings mapped to the Agent Rest API cluster statuses
   499  func (czero *Cluster) humanFriendlyClusterInstallStatus(status string) string {
   500  	clusterStoppedInstallingStates := map[string]string{
   501  		models.ClusterStatusAddingHosts:                 "Cluster is adding hosts",
   502  		models.ClusterStatusCancelled:                   "Cluster installation cancelled",
   503  		models.ClusterStatusError:                       "Cluster has hosts in error",
   504  		models.ClusterStatusFinalizing:                  "Finalizing cluster installation",
   505  		models.ClusterStatusInstalling:                  "Cluster installation in progress",
   506  		models.ClusterStatusInstallingPendingUserAction: "Cluster has hosts requiring user input",
   507  		models.ClusterStatusInsufficient:                "Cluster is not ready for install. Check validations",
   508  		models.ClusterStatusPendingForInput:             "User input is required to continue cluster installation",
   509  		models.ClusterStatusPreparingForInstallation:    "Preparing cluster for installation",
   510  		models.ClusterStatusReady:                       "Cluster is ready for install",
   511  	}
   512  	switch czero.workflow {
   513  	case workflow.AgentWorkflowTypeAddNodes:
   514  		return fmt.Sprintf("Node %s: %s", czero.API.Rest.NodeZeroIP, clusterStoppedInstallingStates[status])
   515  	default:
   516  		return clusterStoppedInstallingStates[status]
   517  	}
   518  }