k8s.io/kubernetes@v1.29.3/test/e2e/framework/node/wait.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package node

import (
    "context"
    "fmt"
    "regexp"
    "time"

    v1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/fields"
    "k8s.io/apimachinery/pkg/util/wait"
    clientset "k8s.io/client-go/kubernetes"
    "k8s.io/kubernetes/test/e2e/framework"
)

const sleepTime = 20 * time.Second

var requiredPerNodePods = []*regexp.Regexp{
    regexp.MustCompile(".*kube-proxy.*"),
    regexp.MustCompile(".*fluentd-elasticsearch.*"),
    regexp.MustCompile(".*node-problem-detector.*"),
}

// WaitForReadyNodes waits up to timeout for the cluster to reach the desired
// size, with no not-ready nodes in it. By cluster size we mean the number of
// schedulable Nodes.
func WaitForReadyNodes(ctx context.Context, c clientset.Interface, size int, timeout time.Duration) error {
    _, err := CheckReady(ctx, c, size, timeout)
    return err
}
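// Usage sketch for WaitForReadyNodes (hypothetical caller, not part of this
// file's API: "c" is a configured clientset, and the size and timeout values
// are illustrative rather than framework defaults):
//
//	if err := WaitForReadyNodes(ctx, c, 3, 10*time.Minute); err != nil {
//		framework.Failf("cluster never reached 3 ready nodes: %v", err)
//	}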
// WaitForTotalHealthy checks whether all registered nodes are ready and all required Pods are running on them.
func WaitForTotalHealthy(ctx context.Context, c clientset.Interface, timeout time.Duration) error {
    framework.Logf("Waiting up to %v for all nodes to be ready", timeout)

    var notReady []v1.Node
    var missingPodsPerNode map[string][]string
    err := wait.PollUntilContextTimeout(ctx, poll, timeout, true, func(ctx context.Context) (bool, error) {
        notReady = nil
        // It should be OK to list unschedulable Nodes here.
        nodes, err := c.CoreV1().Nodes().List(ctx, metav1.ListOptions{ResourceVersion: "0"})
        if err != nil {
            return false, err
        }
        for _, node := range nodes.Items {
            if !IsConditionSetAsExpected(&node, v1.NodeReady, true) {
                notReady = append(notReady, node)
            }
        }
        pods, err := c.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{ResourceVersion: "0"})
        if err != nil {
            return false, err
        }

        systemPodsPerNode := make(map[string][]string)
        for _, pod := range pods.Items {
            if pod.Namespace == metav1.NamespaceSystem && pod.Status.Phase == v1.PodRunning {
                if pod.Spec.NodeName != "" {
                    systemPodsPerNode[pod.Spec.NodeName] = append(systemPodsPerNode[pod.Spec.NodeName], pod.Name)
                }
            }
        }
        missingPodsPerNode = make(map[string][]string)
        for _, node := range nodes.Items {
            if isNodeSchedulableWithoutTaints(&node) {
                for _, requiredPod := range requiredPerNodePods {
                    foundRequired := false
                    for _, presentPod := range systemPodsPerNode[node.Name] {
                        if requiredPod.MatchString(presentPod) {
                            foundRequired = true
                            break
                        }
                    }
                    if !foundRequired {
                        missingPodsPerNode[node.Name] = append(missingPodsPerNode[node.Name], requiredPod.String())
                    }
                }
            }
        }
        return len(notReady) == 0 && len(missingPodsPerNode) == 0, nil
    })

    if err != nil && !wait.Interrupted(err) {
        return err
    }

    if len(notReady) > 0 {
        return fmt.Errorf("not ready nodes: %v", notReady)
    }
    if len(missingPodsPerNode) > 0 {
        return fmt.Errorf("not running system Pods: %v", missingPodsPerNode)
    }
    return nil
}

// WaitConditionToBe returns whether the named node's condition state matches
// wantTrue within timeout. If wantTrue is true, it ensures the node condition
// status is ConditionTrue; if wantTrue is false, it ensures the node condition
// is in any state other than ConditionTrue (e.g. false or unknown).
func WaitConditionToBe(ctx context.Context, c clientset.Interface, name string, conditionType v1.NodeConditionType, wantTrue bool, timeout time.Duration) bool {
    framework.Logf("Waiting up to %v for node %s condition %s to be %t", timeout, name, conditionType, wantTrue)
    for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
        node, err := c.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
        if err != nil {
            framework.Logf("Couldn't get node %s: %v", name, err)
            continue
        }

        if IsConditionSetAsExpected(node, conditionType, wantTrue) {
            return true
        }
    }
    framework.Logf("Node %s didn't reach desired %s condition status (%t) within %v", name, conditionType, wantTrue, timeout)
    return false
}

// WaitForNodeToBeNotReady returns whether node name is not ready (i.e. the
// readiness condition is anything but ready, e.g. false or unknown) within
// timeout.
func WaitForNodeToBeNotReady(ctx context.Context, c clientset.Interface, name string, timeout time.Duration) bool {
    return WaitConditionToBe(ctx, c, name, v1.NodeReady, false, timeout)
}
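// Usage sketch for WaitConditionToBe (hypothetical caller; "nodeName" and the
// timeout are illustrative). Here we wait for the node to stop reporting disk
// pressure, i.e. for the DiskPressure condition to be anything but true:
//
//	if !WaitConditionToBe(ctx, c, nodeName, v1.NodeDiskPressure, false, 2*time.Minute) {
//		framework.Failf("node %s still reports disk pressure", nodeName)
//	}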
// WaitForNodeToBeReady returns whether node name is ready within timeout.
func WaitForNodeToBeReady(ctx context.Context, c clientset.Interface, name string, timeout time.Duration) bool {
    return WaitConditionToBe(ctx, c, name, v1.NodeReady, true, timeout)
}

// WaitForNodeSchedulable returns whether the named node's schedulability
// matches wantSchedulable within timeout.
func WaitForNodeSchedulable(ctx context.Context, c clientset.Interface, name string, timeout time.Duration, wantSchedulable bool) bool {
    framework.Logf("Waiting up to %v for node %s to be schedulable: %t", timeout, name, wantSchedulable)
    for start := time.Now(); time.Since(start) < timeout; time.Sleep(poll) {
        node, err := c.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
        if err != nil {
            framework.Logf("Couldn't get node %s: %v", name, err)
            continue
        }

        if IsNodeSchedulable(node) == wantSchedulable {
            return true
        }
    }
    framework.Logf("Node %s didn't reach desired schedulable status (%t) within %v", name, wantSchedulable, timeout)
    return false
}

// CheckReady waits up to timeout for the cluster to reach the desired size,
// with no not-ready nodes in it, and returns the ready nodes. By cluster size
// we mean the number of schedulable Nodes.
func CheckReady(ctx context.Context, c clientset.Interface, size int, timeout time.Duration) ([]v1.Node, error) {
    for start := time.Now(); time.Since(start) < timeout; time.Sleep(sleepTime) {
        nodes, err := waitListSchedulableNodes(ctx, c)
        if err != nil {
            framework.Logf("Failed to list nodes: %v", err)
            continue
        }
        numNodes := len(nodes.Items)

        // Filter out not-ready nodes.
        Filter(nodes, func(node v1.Node) bool {
            nodeReady := IsConditionSetAsExpected(&node, v1.NodeReady, true)
            networkReady := isConditionUnset(&node, v1.NodeNetworkUnavailable) || IsConditionSetAsExpected(&node, v1.NodeNetworkUnavailable, false)
            return nodeReady && networkReady
        })
        numReady := len(nodes.Items)

        if numNodes == size && numReady == size {
            framework.Logf("Cluster has reached the desired number of ready nodes %d", size)
            return nodes.Items, nil
        }
        framework.Logf("Waiting for ready nodes %d, current ready %d, not ready nodes %d", size, numReady, numNodes-numReady)
    }
    return nil, fmt.Errorf("timeout waiting %v for number of ready nodes to be %d", timeout, size)
}

// waitListSchedulableNodes is a wrapper around listing nodes supporting retries.
func waitListSchedulableNodes(ctx context.Context, c clientset.Interface) (*v1.NodeList, error) {
    var nodes *v1.NodeList
    var err error
    if pollErr := wait.PollUntilContextTimeout(ctx, poll, singleCallTimeout, true, func(ctx context.Context) (bool, error) {
        nodes, err = c.CoreV1().Nodes().List(ctx, metav1.ListOptions{FieldSelector: fields.Set{
            "spec.unschedulable": "false",
        }.AsSelector().String()})
        if err != nil {
            return false, err
        }
        return true, nil
    }); pollErr != nil {
        // Prefer the List error when there is one; fall back to the poll
        // error (e.g. a timeout before any call completed).
        if err == nil {
            err = pollErr
        }
        return nodes, err
    }
    return nodes, nil
}

// checkWaitListSchedulableNodes wraps waitListSchedulableNodes and converts a
// failure into a non-retryable error for callers that must fail fast.
func checkWaitListSchedulableNodes(ctx context.Context, c clientset.Interface) (*v1.NodeList, error) {
    nodes, err := waitListSchedulableNodes(ctx, c)
    if err != nil {
        return nil, fmt.Errorf("non-retryable failure or timed out while listing nodes for e2e cluster: %w", err)
    }
    return nodes, nil
}
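// Usage sketch for CheckReady (hypothetical caller; the size and timeout are
// illustrative). Unlike WaitForReadyNodes, it also returns the ready nodes,
// so callers can inspect them afterwards:
//
//	nodes, err := CheckReady(ctx, c, 3, 10*time.Minute)
//	if err != nil {
//		framework.Failf("cluster not ready: %v", err)
//	}
//	framework.Logf("first ready node: %s", nodes[0].Name)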
// CheckReadyForTests returns a function which polls the cluster and reports
// true once the number of not-ready nodes drops to the allowedNotReadyNodes
// threshold or below (i.e. it is meant to be used as a global gate for
// starting the tests).
func CheckReadyForTests(ctx context.Context, c clientset.Interface, nonblockingTaints string, allowedNotReadyNodes, largeClusterThreshold int) func(ctx context.Context) (bool, error) {
    attempt := 0
    return func(ctx context.Context) (bool, error) {
        if allowedNotReadyNodes == -1 {
            return true, nil
        }
        attempt++
        var nodesNotReadyYet []v1.Node
        opts := metav1.ListOptions{
            ResourceVersion: "0",
            // Filter out cordoned (unschedulable) nodes from our calculation.
            // TODO: refactor if the node v2 API removes that semantic.
            FieldSelector: fields.Set{"spec.unschedulable": "false"}.AsSelector().String(),
        }
        allNodes, err := c.CoreV1().Nodes().List(ctx, opts)
        if err != nil {
            var terminalListNodesErr error
            framework.Logf("Unexpected error listing nodes: %v", err)
            if attempt >= 3 {
                terminalListNodesErr = err
            }
            return false, terminalListNodesErr
        }
        for _, node := range allNodes.Items {
            if !readyForTests(&node, nonblockingTaints) {
                nodesNotReadyYet = append(nodesNotReadyYet, node)
            }
        }
        // The framework allows up to <TestContext.AllowedNotReadyNodes> nodes
        // to be not ready, to tolerate e.g. the incorrect deployment of some
        // small percentage of nodes (which we allow in cluster validation).
        // Some nodes that are not provisioned correctly at startup will never
        // become ready (e.g. when something won't install correctly), so we
        // can't expect them to be ready at any point.
        //
        // We log the *reason* why nodes are not schedulable; specifically,
        // it's usually the network not being available.
        if len(nodesNotReadyYet) > 0 {
            // In large clusters, log them only on every 10th pass.
            if len(nodesNotReadyYet) < largeClusterThreshold || attempt%10 == 0 {
                framework.Logf("Unschedulable nodes= %v, maximum value for starting tests= %v", len(nodesNotReadyYet), allowedNotReadyNodes)
                for _, node := range nodesNotReadyYet {
                    framework.Logf(" -> Node %s [[[ Ready=%t, Network(available)=%t, Taints=%v, NonblockingTaints=%v ]]]",
                        node.Name,
                        IsConditionSetAsExpectedSilent(&node, v1.NodeReady, true),
                        IsConditionSetAsExpectedSilent(&node, v1.NodeNetworkUnavailable, false),
                        node.Spec.Taints,
                        nonblockingTaints,
                    )
                }
                if len(nodesNotReadyYet) > allowedNotReadyNodes {
                    ready := len(allNodes.Items) - len(nodesNotReadyYet)
                    remaining := len(nodesNotReadyYet) - allowedNotReadyNodes
                    framework.Logf("==== node wait: %v out of %v nodes are ready, max notReady allowed %v. Need %v more before starting.", ready, len(allNodes.Items), allowedNotReadyNodes, remaining)
                }
            }
        }
        return len(nodesNotReadyYet) <= allowedNotReadyNodes, nil
    }
}
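// Usage sketch for CheckReadyForTests (hypothetical wiring; the taint key,
// thresholds, and intervals are illustrative): the returned closure is polled
// as a gate before the suite starts, e.g.
//
//	gate := CheckReadyForTests(ctx, c, "example.com/preview", 1, 100)
//	if err := wait.PollUntilContextTimeout(ctx, 30*time.Second, 10*time.Minute, true, gate); err != nil {
//		framework.Failf("cluster never became ready for tests: %v", err)
//	}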
// readyForTests determines whether or not we should continue waiting for the
// nodes to enter a testable state. By default this means a node is
// schedulable, NodeReady, and untainted. Nodes with nonblocking taints are
// permitted to keep those taints and have their node.Spec.Unschedulable field
// ignored for the purposes of this function.
func readyForTests(node *v1.Node, nonblockingTaints string) bool {
    if hasNonblockingTaint(node, nonblockingTaints) {
        // If the node has one of the nonblocking taints, just check that it
        // is ready; don't require node.Spec.Unschedulable to be set either way.
        if !IsNodeReady(node) || !isNodeUntaintedWithNonblocking(node, nonblockingTaints) {
            return false
        }
    } else {
        if !IsNodeSchedulable(node) || !isNodeUntainted(node) {
            return false
        }
    }
    return true
}
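// Illustration of the nonblocking-taint semantics above (hypothetical node
// and taint key; not part of the framework API). Suppose "node" carries a
// single taint example.com/gpu:NoSchedule and is otherwise Ready:
//
//	readyForTests(node, "")                 // false: the taint blocks testability
//	readyForTests(node, "example.com/gpu")  // true: the taint is nonblocking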