github.com/openshift/installer@v1.4.17/pkg/agent/monitoraddnodes.go (about) 1 package agent 2 3 import ( 4 "context" 5 "crypto/x509" 6 "encoding/pem" 7 "fmt" 8 "net" 9 "net/http" 10 "strings" 11 "sync" 12 "time" 13 14 "github.com/pkg/errors" 15 "github.com/sirupsen/logrus" 16 certificatesv1 "k8s.io/api/certificates/v1" 17 corev1 "k8s.io/api/core/v1" 18 ) 19 20 var ( 21 timeout = 90 * time.Minute 22 ) 23 24 const ( 25 firstCSRSignerName = "kubernetes.io/kube-apiserver-client-kubelet" 26 secondCSRSignerName = "kubernetes.io/kubelet-serving" 27 ) 28 29 type addNodeStatusHistory struct { 30 RestAPISeen bool 31 KubeletIsRunningOnNode bool 32 FirstCSRSeen bool 33 SecondCSRSeen bool 34 NodeJoinedCluster bool 35 NodeIsReady bool 36 } 37 38 type addNodeMonitor struct { 39 nodeIPAddress string 40 hostnames []string 41 cluster *Cluster 42 status addNodeStatusHistory 43 } 44 45 func newAddNodeMonitor(nodeIP string, cluster *Cluster) (*addNodeMonitor, error) { 46 parsedIPAddress := net.ParseIP(nodeIP) 47 if parsedIPAddress == nil { 48 return nil, fmt.Errorf("%s is not valid IP Address", nodeIP) 49 } 50 mon := addNodeMonitor{ 51 nodeIPAddress: parsedIPAddress.String(), 52 cluster: cluster, 53 status: addNodeStatusHistory{ 54 RestAPISeen: false, 55 KubeletIsRunningOnNode: false, 56 FirstCSRSeen: false, 57 SecondCSRSeen: false, 58 NodeJoinedCluster: false, 59 NodeIsReady: false, 60 }, 61 } 62 hostnames, err := net.LookupAddr(nodeIP) 63 if err != nil { 64 logrus.Infof("Cannot resolve IP address %v to a hostname. Skipping checks for pending CSRs.", nodeIP) 65 } else { 66 mon.hostnames = hostnames 67 } 68 return &mon, nil 69 } 70 71 func (mon *addNodeMonitor) logStatus(status string) { 72 logrus.Infof("Node %s: %s", mon.nodeIPAddress, status) 73 } 74 75 // MonitorAddNodes display the progress of one or more nodes being 76 // added to a cluster. ipAddresses is an array of IP addresses to be 77 // monitored. clusters is an array of their corresponding initialized Cluster 78 // struct used to interact with the assisted-service and k8s APIs. 79 func MonitorAddNodes(ctx context.Context, clusters []*Cluster, ipAddresses []string) { 80 waitContext, cancel := context.WithTimeout(ctx, timeout) 81 defer cancel() 82 var wg sync.WaitGroup 83 84 for i, ipAddress := range ipAddresses { 85 wg.Add(1) 86 go MonitorSingleNode(waitContext, clusters[i], ipAddress, &wg) 87 } 88 89 wg.Wait() 90 } 91 92 // MonitorSingleNode waits for the a node to be added to the cluster 93 // and reports its status until it becomes Ready. 94 func MonitorSingleNode(waitContext context.Context, cluster *Cluster, nodeIPAddress string, wg *sync.WaitGroup) { 95 defer wg.Done() 96 97 mon, err := newAddNodeMonitor(nodeIPAddress, cluster) 98 if err != nil { 99 logrus.Errorf("could not initialize node monitor for node %v: %v", nodeIPAddress, err) 100 return 101 } 102 103 for { 104 if !mon.status.RestAPISeen && 105 mon.cluster.API.Rest.IsRestAPILive() { 106 mon.status.RestAPISeen = true 107 mon.logStatus("Assisted Service API is available") 108 } 109 110 if !mon.status.KubeletIsRunningOnNode && 111 mon.isKubeletRunningOnNode() { 112 mon.status.KubeletIsRunningOnNode = true 113 mon.logStatus("Kubelet is running") 114 } 115 116 if mon.status.KubeletIsRunningOnNode && 117 !mon.status.FirstCSRSeen && 118 mon.clusterHasFirstCSRPending() { 119 mon.status.FirstCSRSeen = true 120 mon.logStatus("First CSR Pending approval") 121 mon.logCSRsPendingApproval(firstCSRSignerName) 122 } 123 124 if mon.status.KubeletIsRunningOnNode && 125 !mon.status.SecondCSRSeen && 126 mon.clusterHasSecondCSRPending() { 127 mon.status.SecondCSRSeen = true 128 mon.logStatus("Second CSR Pending approval") 129 mon.logCSRsPendingApproval(secondCSRSignerName) 130 } 131 132 hasJoined, isReady, err := mon.nodeHasJoinedClusterAndIsReady() 133 if err != nil { 134 logrus.Debugf("Node %v joined cluster and is ready check returned err: %v", nodeIPAddress, err) 135 } 136 137 if !mon.status.NodeJoinedCluster && hasJoined { 138 mon.status.NodeJoinedCluster = true 139 mon.logStatus("Node joined cluster") 140 } 141 142 if !mon.status.NodeIsReady && isReady { 143 mon.status.NodeIsReady = true 144 if !mon.clusterHasSecondCSRPending() { 145 mon.logStatus("Node is Ready") 146 } else { 147 // The node becomes Ready before second CSR is approved. Log Pending CSRs 148 // so users are aware there are still some waiting their approval even 149 // though the node status is Ready. 150 mon.logStatus("Node is Ready but has CSRs pending approval. Until all CSRs are approved, the node may not be fully functional.") 151 mon.logCSRsPendingApproval(secondCSRSignerName) 152 } 153 return 154 } 155 156 if mon.cluster.API.Rest.IsRestAPILive() { 157 _, err = cluster.MonitorStatusFromAssistedService() 158 if err != nil { 159 logrus.Warnf("error fetching status from assisted-service for node %s: %s", nodeIPAddress, err) 160 } 161 } 162 163 waitErr := waitContext.Err() 164 if waitErr != nil { 165 if errors.Is(waitErr, context.Canceled) { 166 mon.logStatus(fmt.Sprintf("Node monitoring cancelled: %v", waitErr)) 167 return 168 } 169 if errors.Is(waitErr, context.DeadlineExceeded) { 170 mon.logStatus(fmt.Sprintf("Node monitoring timed out after %v minutes", timeout)) 171 return 172 } 173 } 174 175 time.Sleep(5 * time.Second) 176 } 177 } 178 179 func (mon *addNodeMonitor) nodeHasJoinedClusterAndIsReady() (bool, bool, error) { 180 nodes, err := mon.cluster.API.Kube.ListNodes() 181 if err != nil { 182 logrus.Debugf("error getting node list %v", err) 183 return false, false, nil 184 } 185 186 var joinedNode corev1.Node 187 hasJoined := false 188 for _, node := range nodes.Items { 189 for _, address := range node.Status.Addresses { 190 if address.Type == corev1.NodeInternalIP { 191 if address.Address == mon.nodeIPAddress { 192 joinedNode = node 193 hasJoined = true 194 } 195 } 196 } 197 } 198 199 isReady := false 200 if hasJoined { 201 logrus.Debugf("Node %v (%s) has joined cluster", mon.nodeIPAddress, joinedNode.Name) 202 for _, cond := range joinedNode.Status.Conditions { 203 if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { 204 isReady = true 205 } 206 } 207 if isReady { 208 logrus.Debugf("Node %s (%s) is Ready", mon.nodeIPAddress, joinedNode.Name) 209 } else { 210 logrus.Debugf("Node %s (%s) is not Ready", mon.nodeIPAddress, joinedNode.Name) 211 } 212 } else { 213 logrus.Debugf("Node %s has not joined cluster", mon.nodeIPAddress) 214 } 215 216 return hasJoined, isReady, nil 217 } 218 219 func (mon *addNodeMonitor) logCSRsPendingApproval(signerName string) { 220 csrsPendingApproval := mon.getCSRsPendingApproval(signerName) 221 222 for _, csr := range csrsPendingApproval { 223 mon.logStatus(fmt.Sprintf("CSR %s with signerName %s and username %s is Pending and awaiting approval", 224 csr.Name, csr.Spec.SignerName, csr.Spec.Username)) 225 } 226 } 227 228 func (mon *addNodeMonitor) clusterHasFirstCSRPending() bool { 229 return len(mon.getCSRsPendingApproval(firstCSRSignerName)) > 0 230 } 231 232 func (mon *addNodeMonitor) clusterHasSecondCSRPending() bool { 233 return len(mon.getCSRsPendingApproval(secondCSRSignerName)) > 0 234 } 235 236 func (mon *addNodeMonitor) getCSRsPendingApproval(signerName string) []certificatesv1.CertificateSigningRequest { 237 if mon.hostnames == nil { 238 return []certificatesv1.CertificateSigningRequest{} 239 } 240 241 csrs, err := mon.cluster.API.Kube.ListCSRs() 242 if err != nil { 243 logrus.Debugf("error calling listCSRs(): %v", err) 244 logrus.Infof("Cannot retrieve CSRs from Kube API. Skipping checks for pending CSRs") 245 return []certificatesv1.CertificateSigningRequest{} 246 } 247 248 return filterCSRsMatchingHostname(signerName, csrs, mon.hostnames) 249 } 250 251 func filterCSRsMatchingHostname(signerName string, csrs *certificatesv1.CertificateSigningRequestList, hostnames []string) []certificatesv1.CertificateSigningRequest { 252 matchedCSRs := []certificatesv1.CertificateSigningRequest{} 253 for _, csr := range csrs.Items { 254 if len(csr.Status.Conditions) > 0 { 255 // CSR is not Pending and not awaiting approval 256 continue 257 } 258 if signerName == firstCSRSignerName && csr.Spec.SignerName == firstCSRSignerName && 259 containsHostname(decodedFirstCSRSubject(csr.Spec.Request), hostnames) { 260 matchedCSRs = append(matchedCSRs, csr) 261 } 262 if signerName == secondCSRSignerName && csr.Spec.SignerName == secondCSRSignerName && 263 containsHostname(csr.Spec.Username, hostnames) { 264 matchedCSRs = append(matchedCSRs, csr) 265 } 266 } 267 return matchedCSRs 268 } 269 270 // containsHostname checks if the searchString contains one of the node's 271 // hostnames. Only the first element of the hostname is checked. 272 // For example if the hostname is "extraworker-0.ostest.test.metalkube.org", 273 // "extraworker-0" is used to check if it exists in the searchString. 274 func containsHostname(searchString string, hostnames []string) bool { 275 for _, hostname := range hostnames { 276 parts := strings.Split(hostname, ".") 277 if strings.Contains(searchString, parts[0]) { 278 return true 279 } 280 } 281 return false 282 } 283 284 // isKubeletRunningOnNode checks if kubelet responds 285 // to http. Even if kubelet responds with error like 286 // TLS errors, kubelet is considered running. 287 func (mon *addNodeMonitor) isKubeletRunningOnNode() bool { 288 url := fmt.Sprintf("https://%s:10250/metrics", mon.nodeIPAddress) 289 // http get without authentication 290 resp, err := http.Get(url) //nolint mon.nodeIPAddress is prevalidated to be IP address 291 if err != nil { 292 logrus.Debugf("kubelet http err: %v", err) 293 if strings.Contains(err.Error(), "remote error: tls: internal error") { 294 // nodes being added will return this error 295 return true 296 } 297 if strings.Contains(err.Error(), "tls: failed to verify certificate: x509: certificate signed by unknown authority") { 298 // existing control plane nodes returns this error 299 return true 300 } 301 if strings.Contains(err.Error(), "connect: no route to host") { 302 return false 303 } 304 } else { 305 logrus.Debugf("kubelet http status code: %v", resp.StatusCode) 306 } 307 return false 308 } 309 310 // decodedFirstCSRSubject decodes the CSR.Spec.Request PEM block 311 // into readable output and returns the subject as string. 312 // 313 // Example of decoded request: 314 // Certificate Request: 315 // Data: 316 // Version: 1 (0x0) 317 // Subject: O = system:nodes, CN = system:node:extraworker-1 318 // Subject Public Key Info: 319 // 320 // Public Key Algorithm: id-ecPublicKey 321 // Public-Key: (256 bit) 322 // pub: 323 // *snip* 324 // ASN1 OID: prime256v1 325 // NIST CURVE: P-256 326 // 327 // Attributes: 328 // 329 // a0:00 330 // 331 // Signature Algorithm: ecdsa-with-SHA256 332 // 333 // *snip* 334 func decodedFirstCSRSubject(request []byte) string { 335 block, _ := pem.Decode(request) 336 if block == nil { 337 return "" 338 } 339 csrDER := block.Bytes 340 decodedRequest, err := x509.ParseCertificateRequest(csrDER) 341 if err != nil { 342 logrus.Warn("error in x509.ParseCertificateRequest(csrDER)") 343 return "" 344 } 345 return decodedRequest.Subject.String() 346 }