github.com/openshift/installer@v1.4.17/pkg/agent/monitoraddnodes.go (about)

     1  package agent
     2  
     3  import (
     4  	"context"
     5  	"crypto/x509"
     6  	"encoding/pem"
     7  	"fmt"
     8  	"net"
     9  	"net/http"
    10  	"strings"
    11  	"sync"
    12  	"time"
    13  
    14  	"github.com/pkg/errors"
    15  	"github.com/sirupsen/logrus"
    16  	certificatesv1 "k8s.io/api/certificates/v1"
    17  	corev1 "k8s.io/api/core/v1"
    18  )
    19  
var (
	// timeout is the overall time budget for monitoring. MonitorAddNodes
	// passes it to context.WithTimeout, so it bounds every node monitor
	// started by a single call.
	timeout = 90 * time.Minute
)
    23  
const (
	// firstCSRSignerName selects the "first" CSR of a node. CSRs with this
	// signer are matched to the node through the subject decoded from the
	// PEM request (see filterCSRsMatchingHostname).
	firstCSRSignerName  = "kubernetes.io/kube-apiserver-client-kubelet"
	// secondCSRSignerName selects the "second" CSR of a node. CSRs with this
	// signer are matched to the node through Spec.Username
	// (see filterCSRsMatchingHostname).
	secondCSRSignerName = "kubernetes.io/kubelet-serving"
)
    28  
// addNodeStatusHistory records which milestones have already been observed
// for a node. MonitorSingleNode flips each flag to true the first time the
// corresponding condition is seen, so each milestone is logged only once.
type addNodeStatusHistory struct {
	// RestAPISeen is set once the assisted-service REST API reported live.
	RestAPISeen            bool
	// KubeletIsRunningOnNode is set once the kubelet probe succeeded.
	KubeletIsRunningOnNode bool
	// FirstCSRSeen is set once a pending CSR for firstCSRSignerName was found.
	FirstCSRSeen           bool
	// SecondCSRSeen is set once a pending CSR for secondCSRSignerName was found.
	SecondCSRSeen          bool
	// NodeJoinedCluster is set once the node appeared in the cluster node list.
	NodeJoinedCluster      bool
	// NodeIsReady is set once the node reported the Ready condition as true.
	NodeIsReady            bool
}
    37  
// addNodeMonitor holds the state needed to follow a single node while it is
// being added to a cluster.
type addNodeMonitor struct {
	// nodeIPAddress is the node's IP in the canonical form produced by
	// net.ParseIP(...).String().
	nodeIPAddress string
	// hostnames are the names returned by the reverse lookup of the node IP.
	// It stays nil when the lookup fails, in which case pending-CSR checks
	// are skipped.
	hostnames     []string
	// cluster gives access to the assisted-service and Kubernetes APIs.
	cluster       *Cluster
	// status remembers which milestones were already reported.
	status        addNodeStatusHistory
}
    44  
    45  func newAddNodeMonitor(nodeIP string, cluster *Cluster) (*addNodeMonitor, error) {
    46  	parsedIPAddress := net.ParseIP(nodeIP)
    47  	if parsedIPAddress == nil {
    48  		return nil, fmt.Errorf("%s is not valid IP Address", nodeIP)
    49  	}
    50  	mon := addNodeMonitor{
    51  		nodeIPAddress: parsedIPAddress.String(),
    52  		cluster:       cluster,
    53  		status: addNodeStatusHistory{
    54  			RestAPISeen:            false,
    55  			KubeletIsRunningOnNode: false,
    56  			FirstCSRSeen:           false,
    57  			SecondCSRSeen:          false,
    58  			NodeJoinedCluster:      false,
    59  			NodeIsReady:            false,
    60  		},
    61  	}
    62  	hostnames, err := net.LookupAddr(nodeIP)
    63  	if err != nil {
    64  		logrus.Infof("Cannot resolve IP address %v to a hostname. Skipping checks for pending CSRs.", nodeIP)
    65  	} else {
    66  		mon.hostnames = hostnames
    67  	}
    68  	return &mon, nil
    69  }
    70  
    71  func (mon *addNodeMonitor) logStatus(status string) {
    72  	logrus.Infof("Node %s: %s", mon.nodeIPAddress, status)
    73  }
    74  
    75  // MonitorAddNodes display the progress of one or more nodes being
    76  // added to a cluster. ipAddresses is an array of IP addresses to be
    77  // monitored. clusters is an array of their corresponding initialized Cluster
    78  // struct used to interact with the assisted-service and k8s APIs.
    79  func MonitorAddNodes(ctx context.Context, clusters []*Cluster, ipAddresses []string) {
    80  	waitContext, cancel := context.WithTimeout(ctx, timeout)
    81  	defer cancel()
    82  	var wg sync.WaitGroup
    83  
    84  	for i, ipAddress := range ipAddresses {
    85  		wg.Add(1)
    86  		go MonitorSingleNode(waitContext, clusters[i], ipAddress, &wg)
    87  	}
    88  
    89  	wg.Wait()
    90  }
    91  
    92  // MonitorSingleNode waits for the a node to be added to the cluster
    93  // and reports its status until it becomes Ready.
    94  func MonitorSingleNode(waitContext context.Context, cluster *Cluster, nodeIPAddress string, wg *sync.WaitGroup) {
    95  	defer wg.Done()
    96  
    97  	mon, err := newAddNodeMonitor(nodeIPAddress, cluster)
    98  	if err != nil {
    99  		logrus.Errorf("could not initialize node monitor for node %v: %v", nodeIPAddress, err)
   100  		return
   101  	}
   102  
   103  	for {
   104  		if !mon.status.RestAPISeen &&
   105  			mon.cluster.API.Rest.IsRestAPILive() {
   106  			mon.status.RestAPISeen = true
   107  			mon.logStatus("Assisted Service API is available")
   108  		}
   109  
   110  		if !mon.status.KubeletIsRunningOnNode &&
   111  			mon.isKubeletRunningOnNode() {
   112  			mon.status.KubeletIsRunningOnNode = true
   113  			mon.logStatus("Kubelet is running")
   114  		}
   115  
   116  		if mon.status.KubeletIsRunningOnNode &&
   117  			!mon.status.FirstCSRSeen &&
   118  			mon.clusterHasFirstCSRPending() {
   119  			mon.status.FirstCSRSeen = true
   120  			mon.logStatus("First CSR Pending approval")
   121  			mon.logCSRsPendingApproval(firstCSRSignerName)
   122  		}
   123  
   124  		if mon.status.KubeletIsRunningOnNode &&
   125  			!mon.status.SecondCSRSeen &&
   126  			mon.clusterHasSecondCSRPending() {
   127  			mon.status.SecondCSRSeen = true
   128  			mon.logStatus("Second CSR Pending approval")
   129  			mon.logCSRsPendingApproval(secondCSRSignerName)
   130  		}
   131  
   132  		hasJoined, isReady, err := mon.nodeHasJoinedClusterAndIsReady()
   133  		if err != nil {
   134  			logrus.Debugf("Node %v joined cluster and is ready check returned err: %v", nodeIPAddress, err)
   135  		}
   136  
   137  		if !mon.status.NodeJoinedCluster && hasJoined {
   138  			mon.status.NodeJoinedCluster = true
   139  			mon.logStatus("Node joined cluster")
   140  		}
   141  
   142  		if !mon.status.NodeIsReady && isReady {
   143  			mon.status.NodeIsReady = true
   144  			if !mon.clusterHasSecondCSRPending() {
   145  				mon.logStatus("Node is Ready")
   146  			} else {
   147  				// The node becomes Ready before second CSR is approved. Log Pending CSRs
   148  				// so users are aware there are still some waiting their approval even
   149  				// though the node status is Ready.
   150  				mon.logStatus("Node is Ready but has CSRs pending approval. Until all CSRs are approved, the node may not be fully functional.")
   151  				mon.logCSRsPendingApproval(secondCSRSignerName)
   152  			}
   153  			return
   154  		}
   155  
   156  		if mon.cluster.API.Rest.IsRestAPILive() {
   157  			_, err = cluster.MonitorStatusFromAssistedService()
   158  			if err != nil {
   159  				logrus.Warnf("error fetching status from assisted-service for node %s: %s", nodeIPAddress, err)
   160  			}
   161  		}
   162  
   163  		waitErr := waitContext.Err()
   164  		if waitErr != nil {
   165  			if errors.Is(waitErr, context.Canceled) {
   166  				mon.logStatus(fmt.Sprintf("Node monitoring cancelled: %v", waitErr))
   167  				return
   168  			}
   169  			if errors.Is(waitErr, context.DeadlineExceeded) {
   170  				mon.logStatus(fmt.Sprintf("Node monitoring timed out after %v minutes", timeout))
   171  				return
   172  			}
   173  		}
   174  
   175  		time.Sleep(5 * time.Second)
   176  	}
   177  }
   178  
   179  func (mon *addNodeMonitor) nodeHasJoinedClusterAndIsReady() (bool, bool, error) {
   180  	nodes, err := mon.cluster.API.Kube.ListNodes()
   181  	if err != nil {
   182  		logrus.Debugf("error getting node list %v", err)
   183  		return false, false, nil
   184  	}
   185  
   186  	var joinedNode corev1.Node
   187  	hasJoined := false
   188  	for _, node := range nodes.Items {
   189  		for _, address := range node.Status.Addresses {
   190  			if address.Type == corev1.NodeInternalIP {
   191  				if address.Address == mon.nodeIPAddress {
   192  					joinedNode = node
   193  					hasJoined = true
   194  				}
   195  			}
   196  		}
   197  	}
   198  
   199  	isReady := false
   200  	if hasJoined {
   201  		logrus.Debugf("Node %v (%s) has joined cluster", mon.nodeIPAddress, joinedNode.Name)
   202  		for _, cond := range joinedNode.Status.Conditions {
   203  			if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue {
   204  				isReady = true
   205  			}
   206  		}
   207  		if isReady {
   208  			logrus.Debugf("Node %s (%s) is Ready", mon.nodeIPAddress, joinedNode.Name)
   209  		} else {
   210  			logrus.Debugf("Node %s (%s) is not Ready", mon.nodeIPAddress, joinedNode.Name)
   211  		}
   212  	} else {
   213  		logrus.Debugf("Node %s has not joined cluster", mon.nodeIPAddress)
   214  	}
   215  
   216  	return hasJoined, isReady, nil
   217  }
   218  
   219  func (mon *addNodeMonitor) logCSRsPendingApproval(signerName string) {
   220  	csrsPendingApproval := mon.getCSRsPendingApproval(signerName)
   221  
   222  	for _, csr := range csrsPendingApproval {
   223  		mon.logStatus(fmt.Sprintf("CSR %s with signerName %s and username %s is Pending and awaiting approval",
   224  			csr.Name, csr.Spec.SignerName, csr.Spec.Username))
   225  	}
   226  }
   227  
   228  func (mon *addNodeMonitor) clusterHasFirstCSRPending() bool {
   229  	return len(mon.getCSRsPendingApproval(firstCSRSignerName)) > 0
   230  }
   231  
   232  func (mon *addNodeMonitor) clusterHasSecondCSRPending() bool {
   233  	return len(mon.getCSRsPendingApproval(secondCSRSignerName)) > 0
   234  }
   235  
   236  func (mon *addNodeMonitor) getCSRsPendingApproval(signerName string) []certificatesv1.CertificateSigningRequest {
   237  	if mon.hostnames == nil {
   238  		return []certificatesv1.CertificateSigningRequest{}
   239  	}
   240  
   241  	csrs, err := mon.cluster.API.Kube.ListCSRs()
   242  	if err != nil {
   243  		logrus.Debugf("error calling listCSRs(): %v", err)
   244  		logrus.Infof("Cannot retrieve CSRs from Kube API. Skipping checks for pending CSRs")
   245  		return []certificatesv1.CertificateSigningRequest{}
   246  	}
   247  
   248  	return filterCSRsMatchingHostname(signerName, csrs, mon.hostnames)
   249  }
   250  
   251  func filterCSRsMatchingHostname(signerName string, csrs *certificatesv1.CertificateSigningRequestList, hostnames []string) []certificatesv1.CertificateSigningRequest {
   252  	matchedCSRs := []certificatesv1.CertificateSigningRequest{}
   253  	for _, csr := range csrs.Items {
   254  		if len(csr.Status.Conditions) > 0 {
   255  			// CSR is not Pending and not awaiting approval
   256  			continue
   257  		}
   258  		if signerName == firstCSRSignerName && csr.Spec.SignerName == firstCSRSignerName &&
   259  			containsHostname(decodedFirstCSRSubject(csr.Spec.Request), hostnames) {
   260  			matchedCSRs = append(matchedCSRs, csr)
   261  		}
   262  		if signerName == secondCSRSignerName && csr.Spec.SignerName == secondCSRSignerName &&
   263  			containsHostname(csr.Spec.Username, hostnames) {
   264  			matchedCSRs = append(matchedCSRs, csr)
   265  		}
   266  	}
   267  	return matchedCSRs
   268  }
   269  
   270  // containsHostname checks if the searchString contains one of the node's
   271  // hostnames. Only the first element of the hostname is checked.
   272  // For example if the hostname is "extraworker-0.ostest.test.metalkube.org",
   273  // "extraworker-0" is used to check if it exists in the searchString.
   274  func containsHostname(searchString string, hostnames []string) bool {
   275  	for _, hostname := range hostnames {
   276  		parts := strings.Split(hostname, ".")
   277  		if strings.Contains(searchString, parts[0]) {
   278  			return true
   279  		}
   280  	}
   281  	return false
   282  }
   283  
   284  // isKubeletRunningOnNode checks if kubelet responds
   285  // to http. Even if kubelet responds with error like
   286  // TLS errors, kubelet is considered running.
   287  func (mon *addNodeMonitor) isKubeletRunningOnNode() bool {
   288  	url := fmt.Sprintf("https://%s:10250/metrics", mon.nodeIPAddress)
   289  	// http get without authentication
   290  	resp, err := http.Get(url) //nolint mon.nodeIPAddress is prevalidated to be IP address
   291  	if err != nil {
   292  		logrus.Debugf("kubelet http err: %v", err)
   293  		if strings.Contains(err.Error(), "remote error: tls: internal error") {
   294  			// nodes being added will return this error
   295  			return true
   296  		}
   297  		if strings.Contains(err.Error(), "tls: failed to verify certificate: x509: certificate signed by unknown authority") {
   298  			// existing control plane nodes returns this error
   299  			return true
   300  		}
   301  		if strings.Contains(err.Error(), "connect: no route to host") {
   302  			return false
   303  		}
   304  	} else {
   305  		logrus.Debugf("kubelet http status code: %v", resp.StatusCode)
   306  	}
   307  	return false
   308  }
   309  
   310  // decodedFirstCSRSubject decodes the CSR.Spec.Request PEM block
   311  // into readable output and returns the subject as string.
   312  //
   313  // Example of decoded request:
   314  // Certificate Request:
   315  // Data:
   316  // Version: 1 (0x0)
   317  // Subject: O = system:nodes, CN = system:node:extraworker-1
   318  // Subject Public Key Info:
   319  //
   320  //	Public Key Algorithm: id-ecPublicKey
   321  //		Public-Key: (256 bit)
   322  //		pub:
   323  //			*snip*
   324  //		ASN1 OID: prime256v1
   325  //		NIST CURVE: P-256
   326  //
   327  // Attributes:
   328  //
   329  //	a0:00
   330  //
   331  // Signature Algorithm: ecdsa-with-SHA256
   332  //
   333  //	*snip*
   334  func decodedFirstCSRSubject(request []byte) string {
   335  	block, _ := pem.Decode(request)
   336  	if block == nil {
   337  		return ""
   338  	}
   339  	csrDER := block.Bytes
   340  	decodedRequest, err := x509.ParseCertificateRequest(csrDER)
   341  	if err != nil {
   342  		logrus.Warn("error in x509.ParseCertificateRequest(csrDER)")
   343  		return ""
   344  	}
   345  	return decodedRequest.Subject.String()
   346  }