k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nodelifecycle

import (
	"context"
	"fmt"
	goruntime "runtime"
	"strings"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	coordv1 "k8s.io/api/coordination/v1"
	v1 "k8s.io/api/core/v1"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/informers"
	appsinformers "k8s.io/client-go/informers/apps/v1"
	coordinformers "k8s.io/client-go/informers/coordination/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/fake"
	testcore "k8s.io/client-go/testing"
	"k8s.io/klog/v2/ktesting"
	kubeletapis "k8s.io/kubelet/pkg/apis"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
	"k8s.io/kubernetes/pkg/controller/testutil"
	controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
	"k8s.io/kubernetes/pkg/util/node"
	taintutils "k8s.io/kubernetes/pkg/util/taints"
	"k8s.io/utils/pointer"
)

const (
	testNodeMonitorGracePeriod = 40 * time.Second
	testNodeStartupGracePeriod = 60 * time.Second
	testNodeMonitorPeriod      = 5 * time.Second
	testRateLimiterQPS         = float32(100000)
	testLargeClusterThreshold  = 20
	testUnhealthyThreshold     = float32(0.55)
)

func alwaysReady() bool { return true }

func fakeGetPodsAssignedToNode(c *fake.Clientset) func(string) ([]*v1.Pod, error) {
	return func(nodeName string) ([]*v1.Pod, error) {
		selector := fields.SelectorFromSet(fields.Set{"spec.nodeName": nodeName})
		pods, err := c.CoreV1().Pods(v1.NamespaceAll).List(context.TODO(), metav1.ListOptions{
			FieldSelector: selector.String(),
			LabelSelector: labels.Everything().String(),
		})
		if err != nil {
			return nil, fmt.Errorf("failed to get Pods assigned to node %v", nodeName)
		}
		rPods := make([]*v1.Pod, len(pods.Items))
		for i := range pods.Items {
			rPods[i] = &pods.Items[i]
		}
		return rPods, nil
	}
}

type nodeLifecycleController struct {
	*Controller
	leaseInformer     coordinformers.LeaseInformer
	nodeInformer      coreinformers.NodeInformer
	daemonSetInformer appsinformers.DaemonSetInformer
}

func createNodeLease(nodeName string, renewTime metav1.MicroTime) *coordv1.Lease {
	return &coordv1.Lease{
		ObjectMeta: metav1.ObjectMeta{
			Name:      nodeName,
			Namespace: v1.NamespaceNodeLease,
		},
		Spec: coordv1.LeaseSpec{
			HolderIdentity: pointer.String(nodeName),
			RenewTime:      &renewTime,
		},
	}
}

func (nc *nodeLifecycleController) syncLeaseStore(lease *coordv1.Lease) error {
	if lease == nil {
		return nil
	}
	newElems := make([]interface{}, 0, 1)
	newElems = append(newElems, lease)
	return nc.leaseInformer.Informer().GetStore().Replace(newElems, "newRV")
}

func (nc *nodeLifecycleController) syncNodeStore(fakeNodeHandler *testutil.FakeNodeHandler) error {
	nodes, err := fakeNodeHandler.List(context.TODO(), metav1.ListOptions{})
	if err != nil {
		return err
	}
	newElems := make([]interface{}, 0, len(nodes.Items))
	for i := range nodes.Items {
		newElems = append(newElems, &nodes.Items[i])
	}
	return nc.nodeInformer.Informer().GetStore().Replace(newElems, "newRV")
}

func newNodeLifecycleControllerFromClient(
	ctx context.Context,
	kubeClient clientset.Interface,
	evictionLimiterQPS float32,
	secondaryEvictionLimiterQPS float32,
	largeClusterThreshold int32,
	unhealthyZoneThreshold float32,
	nodeMonitorGracePeriod time.Duration,
	nodeStartupGracePeriod time.Duration,
	nodeMonitorPeriod time.Duration,
) (*nodeLifecycleController, error) {

	factory := informers.NewSharedInformerFactory(kubeClient, controller.NoResyncPeriodFunc())

	leaseInformer := factory.Coordination().V1().Leases()
	nodeInformer := factory.Core().V1().Nodes()
	daemonSetInformer := factory.Apps().V1().DaemonSets()

	nc, err := NewNodeLifecycleController(
		ctx,
		leaseInformer,
		factory.Core().V1().Pods(),
		nodeInformer,
		daemonSetInformer,
		kubeClient,
		nodeMonitorPeriod,
		nodeStartupGracePeriod,
		nodeMonitorGracePeriod,
		evictionLimiterQPS,
		secondaryEvictionLimiterQPS,
		largeClusterThreshold,
		unhealthyZoneThreshold,
	)
	if err != nil {
		return nil, err
	}

	nc.leaseInformerSynced = alwaysReady
	nc.podInformerSynced = alwaysReady
	nc.nodeInformerSynced = alwaysReady
	nc.daemonSetInformerSynced = alwaysReady

	return &nodeLifecycleController{nc, leaseInformer, nodeInformer, daemonSetInformer}, nil
}

func TestMonitorNodeHealth(t *testing.T) {
	fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
	timeToPass := 60 * time.Minute
	healthyNodeNewStatus := v1.NodeStatus{
		Conditions: []v1.NodeCondition{
			{
				Type:               v1.NodeReady,
				Status:             v1.ConditionTrue,
				LastHeartbeatTime:  metav1.NewTime(fakeNow.Add(timeToPass)),
				LastTransitionTime: fakeNow,
			},
		},
	}
	unhealthyNodeNewStatus := v1.NodeStatus{
		Conditions: []v1.NodeCondition{
			{
				Type:   v1.NodeReady,
				Status: v1.ConditionUnknown,
				// Node status was updated by nodecontroller timeToPass ago
				LastHeartbeatTime:  fakeNow,
				LastTransitionTime: fakeNow,
			},
		},
	}

	tests := map[string]struct {
		nodeList                []*v1.Node
		updatedNodeStatuses     []v1.NodeStatus
		expectedInitialStates   map[string]ZoneState
		expectedFollowingStates map[string]ZoneState
	}{
		"No Disruption: Node created recently without failure domain labels (happens only at cluster startup)": {
			nodeList: []*v1.Node{
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node0",
						CreationTimestamp: fakeNow,
					},
					Status: v1.NodeStatus{
						Conditions: []v1.NodeCondition{
							{
								Type:               v1.NodeReady,
								Status:             v1.ConditionTrue,
								LastHeartbeatTime:  fakeNow,
								LastTransitionTime: fakeNow,
							},
						},
					},
				},
			},
			updatedNodeStatuses: []v1.NodeStatus{
				healthyNodeNewStatus,
			},
			expectedInitialStates: map[string]ZoneState{
				"": stateNormal,
			},
			expectedFollowingStates: map[string]ZoneState{
				"": stateNormal,
			},
		},
		"No Disruption: Initially both zones down, one comes back": {
			nodeList: []*v1.Node{
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node0",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
							v1.LabelTopologyRegion:          "region1",
							v1.LabelTopologyZone:            "zone1",
							v1.LabelFailureDomainBetaRegion: "region1",
							v1.LabelFailureDomainBetaZone:   "zone1",
						},
					},
					Status: v1.NodeStatus{
						Conditions: []v1.NodeCondition{
							{
								Type:               v1.NodeReady,
								Status:             v1.ConditionUnknown,
								LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
								LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
							},
						},
					},
				},
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node1",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
							v1.LabelTopologyRegion:          "region1",
							v1.LabelTopologyZone:            "zone2",
							v1.LabelFailureDomainBetaRegion: "region1",
							v1.LabelFailureDomainBetaZone:   "zone2",
						},
					},
					Status: v1.NodeStatus{
						Conditions: []v1.NodeCondition{
							{
								Type:               v1.NodeReady,
								Status:             v1.ConditionUnknown,
								LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
								LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
							},
						},
					},
				},
			},
			updatedNodeStatuses: []v1.NodeStatus{
				unhealthyNodeNewStatus,
				healthyNodeNewStatus,
			},
			expectedInitialStates: map[string]ZoneState{
				testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
				testutil.CreateZoneID("region1", "zone2"): stateFullDisruption,
			},
			expectedFollowingStates: map[string]ZoneState{
				testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
				testutil.CreateZoneID("region1", "zone2"): stateNormal,
			},
		},
		"Partial Disruption: Nodes created recently without status conditions (happens only at cluster startup)": {
			nodeList: []*v1.Node{
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node0",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
							v1.LabelTopologyRegion:          "region1",
							v1.LabelTopologyZone:            "zone1",
							v1.LabelFailureDomainBetaRegion: "region1",
							v1.LabelFailureDomainBetaZone:   "zone1",
						},
					},
				},
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node1",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
							v1.LabelTopologyRegion:          "region1",
							v1.LabelTopologyZone:            "zone1",
							v1.LabelFailureDomainBetaRegion: "region1",
							v1.LabelFailureDomainBetaZone:   "zone1",
						},
					},
				},
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node2",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
							v1.LabelTopologyRegion:          "region1",
							v1.LabelTopologyZone:            "zone1",
							v1.LabelFailureDomainBetaRegion: "region1",
							v1.LabelFailureDomainBetaZone:   "zone1",
						},
					},
				},
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node3",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
							v1.LabelTopologyRegion:          "region1",
							v1.LabelTopologyZone:            "zone1",
							v1.LabelFailureDomainBetaRegion: "region1",
							v1.LabelFailureDomainBetaZone:   "zone1",
						},
					},
				},
			},
			updatedNodeStatuses: []v1.NodeStatus{
				unhealthyNodeNewStatus,
				unhealthyNodeNewStatus,
				unhealthyNodeNewStatus,
				healthyNodeNewStatus,
			},
			expectedInitialStates: map[string]ZoneState{
				// we've not received any status for the nodes yet
				// so the controller assumes the zone is fully disrupted
349 testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, 350 }, 351 expectedFollowingStates: map[string]ZoneState{ 352 testutil.CreateZoneID("region1", "zone1"): statePartialDisruption, 353 }, 354 }, 355 "Partial Disruption: one Node failed leading to the number of healthy Nodes to exceed the configured threshold": { 356 nodeList: []*v1.Node{ 357 { 358 ObjectMeta: metav1.ObjectMeta{ 359 Name: "node0", 360 CreationTimestamp: fakeNow, 361 Labels: map[string]string{ 362 v1.LabelTopologyRegion: "region1", 363 v1.LabelTopologyZone: "zone1", 364 v1.LabelFailureDomainBetaRegion: "region1", 365 v1.LabelFailureDomainBetaZone: "zone1", 366 }, 367 }, 368 Status: v1.NodeStatus{ 369 Conditions: []v1.NodeCondition{ 370 { 371 Type: v1.NodeReady, 372 Status: v1.ConditionUnknown, 373 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 374 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 375 }, 376 }, 377 }, 378 }, 379 { 380 ObjectMeta: metav1.ObjectMeta{ 381 Name: "node1", 382 CreationTimestamp: fakeNow, 383 Labels: map[string]string{ 384 v1.LabelTopologyRegion: "region1", 385 v1.LabelTopologyZone: "zone1", 386 v1.LabelFailureDomainBetaRegion: "region1", 387 v1.LabelFailureDomainBetaZone: "zone1", 388 }, 389 }, 390 Status: v1.NodeStatus{ 391 Conditions: []v1.NodeCondition{ 392 { 393 Type: v1.NodeReady, 394 Status: v1.ConditionUnknown, 395 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 396 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 397 }, 398 }, 399 }, 400 }, 401 { 402 ObjectMeta: metav1.ObjectMeta{ 403 Name: "node2", 404 CreationTimestamp: fakeNow, 405 Labels: map[string]string{ 406 v1.LabelTopologyRegion: "region1", 407 v1.LabelTopologyZone: "zone1", 408 v1.LabelFailureDomainBetaRegion: "region1", 409 v1.LabelFailureDomainBetaZone: "zone1", 410 }, 411 }, 412 Status: v1.NodeStatus{ 413 Conditions: []v1.NodeCondition{ 414 { 415 Type: v1.NodeReady, 416 Status: v1.ConditionTrue, 417 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 418 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 419 }, 420 }, 421 }, 422 }, 423 { 424 ObjectMeta: metav1.ObjectMeta{ 425 Name: "node3", 426 CreationTimestamp: fakeNow, 427 Labels: map[string]string{ 428 v1.LabelTopologyRegion: "region1", 429 v1.LabelTopologyZone: "zone1", 430 v1.LabelFailureDomainBetaRegion: "region1", 431 v1.LabelFailureDomainBetaZone: "zone1", 432 }, 433 }, 434 Status: v1.NodeStatus{ 435 Conditions: []v1.NodeCondition{ 436 { 437 Type: v1.NodeReady, 438 Status: v1.ConditionTrue, 439 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 440 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 441 }, 442 }, 443 }, 444 }, 445 { 446 ObjectMeta: metav1.ObjectMeta{ 447 Name: "node4", 448 CreationTimestamp: fakeNow, 449 Labels: map[string]string{ 450 v1.LabelTopologyRegion: "region1", 451 v1.LabelTopologyZone: "zone1", 452 v1.LabelFailureDomainBetaRegion: "region1", 453 v1.LabelFailureDomainBetaZone: "zone1", 454 }, 455 }, 456 Status: v1.NodeStatus{ 457 Conditions: []v1.NodeCondition{ 458 { 459 Type: v1.NodeReady, 460 Status: v1.ConditionTrue, 461 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 462 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 463 }, 464 }, 465 }, 466 }, 467 }, 468 updatedNodeStatuses: []v1.NodeStatus{ 469 unhealthyNodeNewStatus, 470 unhealthyNodeNewStatus, 471 unhealthyNodeNewStatus, 472 healthyNodeNewStatus, 473 healthyNodeNewStatus, 474 }, 475 
			expectedInitialStates: map[string]ZoneState{
				testutil.CreateZoneID("region1", "zone1"): stateNormal,
			},
			expectedFollowingStates: map[string]ZoneState{
				testutil.CreateZoneID("region1", "zone1"): statePartialDisruption,
			},
		},
		"Full Disruption: the zone has less than 2 Nodes down, the last healthy Node has failed": {
			nodeList: []*v1.Node{
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node0",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
							v1.LabelTopologyRegion:          "region1",
							v1.LabelTopologyZone:            "zone1",
							v1.LabelFailureDomainBetaRegion: "region1",
							v1.LabelFailureDomainBetaZone:   "zone1",
						},
					},
					Status: v1.NodeStatus{
						Conditions: []v1.NodeCondition{
							{
								Type:               v1.NodeReady,
								Status:             v1.ConditionUnknown,
								LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
								LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
							},
						},
					},
				},
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node1",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
							v1.LabelTopologyRegion:          "region1",
							v1.LabelTopologyZone:            "zone1",
							v1.LabelFailureDomainBetaRegion: "region1",
							v1.LabelFailureDomainBetaZone:   "zone1",
						},
					},
					Status: v1.NodeStatus{
						Conditions: []v1.NodeCondition{
							{
								Type:               v1.NodeReady,
								Status:             v1.ConditionUnknown,
								LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
								LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
							},
						},
					},
				},
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node2",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
							v1.LabelTopologyRegion:          "region1",
							v1.LabelTopologyZone:            "zone1",
							v1.LabelFailureDomainBetaRegion: "region1",
							v1.LabelFailureDomainBetaZone:   "zone1",
						},
					},
					Status: v1.NodeStatus{
						Conditions: []v1.NodeCondition{
							{
								Type:               v1.NodeReady,
								Status:             v1.ConditionTrue,
								LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
								LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
							},
						},
					},
				},
			},
			updatedNodeStatuses: []v1.NodeStatus{
				unhealthyNodeNewStatus,
				unhealthyNodeNewStatus,
				unhealthyNodeNewStatus,
			},
			expectedInitialStates: map[string]ZoneState{
				// if a zone has a number of unhealthy nodes less than or equal to 2
				// the controller will consider it normal regardless of
				// the ratio of healthy vs unhealthy nodes
				testutil.CreateZoneID("region1", "zone1"): stateNormal,
			},
			expectedFollowingStates: map[string]ZoneState{
				testutil.CreateZoneID("region1", "zone1"): stateFullDisruption,
			},
		},
		"Full Disruption: all the Nodes in one zone are down": {
			nodeList: []*v1.Node{
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node0",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
							v1.LabelTopologyRegion:          "region1",
							v1.LabelTopologyZone:            "zone1",
							v1.LabelFailureDomainBetaRegion: "region1",
							v1.LabelFailureDomainBetaZone:   "zone1",
						},
					},
					Status: v1.NodeStatus{
						Conditions: []v1.NodeCondition{
							{
								Type:               v1.NodeReady,
								Status:             v1.ConditionUnknown,
								LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
								LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
							},
						},
					},
				},
				{
					ObjectMeta: metav1.ObjectMeta{
						Name:              "node1",
						CreationTimestamp: fakeNow,
						Labels: map[string]string{
v1.LabelTopologyRegion: "region1", 596 v1.LabelTopologyZone: "zone2", 597 v1.LabelFailureDomainBetaRegion: "region1", 598 v1.LabelFailureDomainBetaZone: "zone2", 599 }, 600 }, 601 Status: v1.NodeStatus{ 602 Conditions: []v1.NodeCondition{ 603 { 604 Type: v1.NodeReady, 605 Status: v1.ConditionTrue, 606 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 607 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 608 }, 609 }, 610 }, 611 }, 612 }, 613 updatedNodeStatuses: []v1.NodeStatus{ 614 unhealthyNodeNewStatus, 615 healthyNodeNewStatus, 616 }, 617 expectedInitialStates: map[string]ZoneState{ 618 testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, 619 testutil.CreateZoneID("region1", "zone2"): stateNormal, 620 }, 621 expectedFollowingStates: map[string]ZoneState{ 622 testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, 623 testutil.CreateZoneID("region1", "zone2"): stateNormal, 624 }, 625 }, 626 "Full Disruption: all the Nodes in both the zones are down": { 627 nodeList: []*v1.Node{ 628 { 629 ObjectMeta: metav1.ObjectMeta{ 630 Name: "node0", 631 CreationTimestamp: fakeNow, 632 Labels: map[string]string{ 633 v1.LabelTopologyRegion: "region1", 634 v1.LabelTopologyZone: "zone1", 635 v1.LabelFailureDomainBetaRegion: "region1", 636 v1.LabelFailureDomainBetaZone: "zone1", 637 }, 638 }, 639 Status: v1.NodeStatus{ 640 Conditions: []v1.NodeCondition{ 641 { 642 Type: v1.NodeReady, 643 Status: v1.ConditionUnknown, 644 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 645 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 646 }, 647 }, 648 }, 649 }, 650 { 651 ObjectMeta: metav1.ObjectMeta{ 652 Name: "node1", 653 CreationTimestamp: fakeNow, 654 Labels: map[string]string{ 655 v1.LabelTopologyRegion: "region2", 656 v1.LabelTopologyZone: "zone2", 657 v1.LabelFailureDomainBetaRegion: "region2", 658 v1.LabelFailureDomainBetaZone: "zone2", 659 }, 660 }, 661 Status: v1.NodeStatus{ 662 Conditions: []v1.NodeCondition{ 663 { 664 Type: v1.NodeReady, 665 Status: v1.ConditionUnknown, 666 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 667 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 668 }, 669 }, 670 }, 671 }, 672 }, 673 674 updatedNodeStatuses: []v1.NodeStatus{ 675 unhealthyNodeNewStatus, 676 unhealthyNodeNewStatus, 677 }, 678 expectedInitialStates: map[string]ZoneState{ 679 testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, 680 testutil.CreateZoneID("region2", "zone2"): stateFullDisruption, 681 }, 682 expectedFollowingStates: map[string]ZoneState{ 683 testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, 684 testutil.CreateZoneID("region2", "zone2"): stateFullDisruption, 685 }, 686 }, 687 "Full Disruption: Ready condition removed from the Node": { 688 nodeList: []*v1.Node{ 689 { 690 ObjectMeta: metav1.ObjectMeta{ 691 Name: "node0", 692 CreationTimestamp: fakeNow, 693 Labels: map[string]string{ 694 v1.LabelTopologyRegion: "region1", 695 v1.LabelTopologyZone: "zone1", 696 v1.LabelFailureDomainBetaRegion: "region1", 697 v1.LabelFailureDomainBetaZone: "zone1", 698 }, 699 }, 700 Status: v1.NodeStatus{ 701 Conditions: []v1.NodeCondition{ 702 { 703 Type: v1.NodeReady, 704 Status: v1.ConditionTrue, 705 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 706 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 707 }, 708 }, 709 }, 710 }, 711 }, 712 713 updatedNodeStatuses: []v1.NodeStatus{ 714 { 715 Conditions: []v1.NodeCondition{}, 716 }, 717 
}, 718 expectedInitialStates: map[string]ZoneState{ 719 testutil.CreateZoneID("region1", "zone1"): stateNormal, 720 }, 721 expectedFollowingStates: map[string]ZoneState{ 722 testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, 723 }, 724 }, 725 "Full Disruption: the only available Node has the node.kubernetes.io/exclude-disruption label": { 726 nodeList: []*v1.Node{ 727 { 728 ObjectMeta: metav1.ObjectMeta{ 729 Name: "node0", 730 CreationTimestamp: fakeNow, 731 Labels: map[string]string{ 732 v1.LabelTopologyRegion: "region1", 733 v1.LabelTopologyZone: "zone1", 734 v1.LabelFailureDomainBetaRegion: "region1", 735 v1.LabelFailureDomainBetaZone: "zone1", 736 }, 737 }, 738 Status: v1.NodeStatus{ 739 Conditions: []v1.NodeCondition{ 740 { 741 Type: v1.NodeReady, 742 Status: v1.ConditionUnknown, 743 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 744 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 745 }, 746 }, 747 }, 748 }, 749 { 750 ObjectMeta: metav1.ObjectMeta{ 751 Name: "node-master", 752 CreationTimestamp: fakeNow, 753 Labels: map[string]string{ 754 v1.LabelTopologyRegion: "region1", 755 v1.LabelTopologyZone: "zone1", 756 v1.LabelFailureDomainBetaRegion: "region1", 757 v1.LabelFailureDomainBetaZone: "zone1", 758 labelNodeDisruptionExclusion: "", 759 }, 760 }, 761 Status: v1.NodeStatus{ 762 Conditions: []v1.NodeCondition{ 763 { 764 Type: v1.NodeReady, 765 Status: v1.ConditionTrue, 766 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 767 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 768 }, 769 }, 770 }, 771 }, 772 }, 773 updatedNodeStatuses: []v1.NodeStatus{ 774 unhealthyNodeNewStatus, 775 healthyNodeNewStatus, 776 }, 777 expectedInitialStates: map[string]ZoneState{ 778 testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, 779 }, 780 expectedFollowingStates: map[string]ZoneState{ 781 testutil.CreateZoneID("region1", "zone1"): stateFullDisruption, 782 }, 783 }, 784 } 785 786 for testName, tt := range tests { 787 t.Run(testName, func(t *testing.T) { 788 _, ctx := ktesting.NewTestContext(t) 789 fakeNodeHandler := &testutil.FakeNodeHandler{ 790 Existing: tt.nodeList, 791 Clientset: fake.NewSimpleClientset(), 792 } 793 nodeController, _ := newNodeLifecycleControllerFromClient( 794 ctx, 795 fakeNodeHandler, 796 testRateLimiterQPS, 797 testRateLimiterQPS, 798 testLargeClusterThreshold, 799 testUnhealthyThreshold, 800 testNodeMonitorGracePeriod, 801 testNodeStartupGracePeriod, 802 testNodeMonitorPeriod) 803 nodeController.recorder = testutil.NewFakeRecorder() 804 nodeController.enterPartialDisruptionFunc = func(nodeNum int) float32 { 805 return testRateLimiterQPS 806 } 807 nodeController.enterFullDisruptionFunc = func(nodeNum int) float32 { 808 return testRateLimiterQPS 809 } 810 811 syncAndDiffZoneState := func(wanted map[string]ZoneState) { 812 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 813 t.Errorf("unexpected error: %v", err) 814 } 815 if err := nodeController.monitorNodeHealth(ctx); err != nil { 816 t.Errorf("unexpected error: %v", err) 817 } 818 if diff := cmp.Diff(wanted, nodeController.zoneStates); diff != "" { 819 t.Errorf("unexpected zone state (-want +got):\n%s", diff) 820 } 821 } 822 823 // initial zone state 824 nodeController.now = func() metav1.Time { return fakeNow } 825 syncAndDiffZoneState(tt.expectedInitialStates) 826 827 // following zone state 828 nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(timeToPass)} } 829 for i := 
range tt.updatedNodeStatuses {
				fakeNodeHandler.Existing[i].Status = tt.updatedNodeStatuses[i]
			}
			syncAndDiffZoneState(tt.expectedFollowingStates)
		})
	}
}

func TestPodStatusChange(t *testing.T) {
	fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)

	// Because of the logic that prevents NC from evicting anything when all Nodes are NotReady
	// we need a second healthy node in the tests. Because of how the tests are written we need
	// to update the status of this Node.
	healthyNodeNewStatus := v1.NodeStatus{
		Conditions: []v1.NodeCondition{
			{
				Type:   v1.NodeReady,
				Status: v1.ConditionTrue,
				// Node status has just been updated, and is NotReady for 10min.
				LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC),
				LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
			},
		},
	}

	// Node created long time ago, node controller posted Unknown for a long period of time.
	table := []struct {
		fakeNodeHandler     *testutil.FakeNodeHandler
		timeToPass          time.Duration
		newNodeStatus       v1.NodeStatus
		secondNodeNewStatus v1.NodeStatus
		expectedPodUpdate   bool
		expectedReason      string
		description         string
	}{
		{
			fakeNodeHandler: &testutil.FakeNodeHandler{
				Existing: []*v1.Node{
					{
						ObjectMeta: metav1.ObjectMeta{
							Name:              "node0",
							CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
							Labels: map[string]string{
								v1.LabelTopologyRegion:          "region1",
								v1.LabelTopologyZone:            "zone1",
								v1.LabelFailureDomainBetaRegion: "region1",
								v1.LabelFailureDomainBetaZone:   "zone1",
							},
						},
						Status: v1.NodeStatus{
							Conditions: []v1.NodeCondition{
								{
									Type:               v1.NodeReady,
									Status:             v1.ConditionUnknown,
									LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
									LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
								},
							},
						},
					},
					{
						ObjectMeta: metav1.ObjectMeta{
							Name:              "node1",
							CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
							Labels: map[string]string{
								v1.LabelFailureDomainBetaRegion: "region1",
								v1.LabelFailureDomainBetaZone:   "zone1",
							},
						},
						Status: v1.NodeStatus{
							Conditions: []v1.NodeCondition{
								{
									Type:               v1.NodeReady,
									Status:             v1.ConditionTrue,
									LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
									LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
								},
							},
						},
					},
				},
				Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}),
			},
			timeToPass: 60 * time.Minute,
			newNodeStatus: v1.NodeStatus{
				Conditions: []v1.NodeCondition{
					{
						Type:   v1.NodeReady,
						Status: v1.ConditionUnknown,
						// Node status was updated by nodecontroller 1hr ago
						LastHeartbeatTime:  metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
						LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
					},
				},
			},
			secondNodeNewStatus: healthyNodeNewStatus,
			expectedPodUpdate:   true,
			expectedReason:      node.NodeUnreachablePodReason,
			description: "Node created long time ago, node controller posted Unknown for a " +
				"long period of time, the pod status must include reason for termination.",
		},
	}

	_, ctx := ktesting.NewTestContext(t)
	for _, item := range table {
		nodeController, _ := newNodeLifecycleControllerFromClient(
			ctx,
			item.fakeNodeHandler,
			testRateLimiterQPS,
testRateLimiterQPS, 940 testLargeClusterThreshold, 941 testUnhealthyThreshold, 942 testNodeMonitorGracePeriod, 943 testNodeStartupGracePeriod, 944 testNodeMonitorPeriod, 945 ) 946 nodeController.now = func() metav1.Time { return fakeNow } 947 nodeController.recorder = testutil.NewFakeRecorder() 948 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) 949 if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { 950 t.Errorf("unexpected error: %v", err) 951 } 952 if err := nodeController.monitorNodeHealth(ctx); err != nil { 953 t.Errorf("unexpected error: %v", err) 954 } 955 if item.timeToPass > 0 { 956 nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } 957 item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus 958 item.fakeNodeHandler.Existing[1].Status = item.secondNodeNewStatus 959 } 960 if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { 961 t.Errorf("unexpected error: %v", err) 962 } 963 if err := nodeController.monitorNodeHealth(ctx); err != nil { 964 t.Errorf("unexpected error: %v", err) 965 } 966 zones := testutil.GetZones(item.fakeNodeHandler) 967 logger, _ := ktesting.NewTestContext(t) 968 for _, zone := range zones { 969 nodeController.zoneNoExecuteTainter[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { 970 nodeUID, _ := value.UID.(string) 971 pods, err := nodeController.getPodsAssignedToNode(value.Value) 972 if err != nil { 973 t.Errorf("unexpected error: %v", err) 974 } 975 controllerutil.DeletePods(ctx, item.fakeNodeHandler, pods, nodeController.recorder, value.Value, nodeUID, nodeController.daemonSetStore) 976 return true, 0 977 }) 978 } 979 980 podReasonUpdate := false 981 for _, action := range item.fakeNodeHandler.Actions() { 982 if action.GetVerb() == "update" && action.GetResource().Resource == "pods" { 983 updateReason := action.(testcore.UpdateActionImpl).GetObject().(*v1.Pod).Status.Reason 984 podReasonUpdate = true 985 if updateReason != item.expectedReason { 986 t.Errorf("expected pod status reason: %+v, got %+v for %+v", item.expectedReason, updateReason, item.description) 987 } 988 } 989 } 990 991 if podReasonUpdate != item.expectedPodUpdate { 992 t.Errorf("expected pod update: %+v, got %+v for %+v", item.expectedPodUpdate, podReasonUpdate, item.description) 993 } 994 } 995 } 996 997 func TestMonitorNodeHealthUpdateStatus(t *testing.T) { 998 fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) 999 table := []struct { 1000 fakeNodeHandler *testutil.FakeNodeHandler 1001 timeToPass time.Duration 1002 newNodeStatus v1.NodeStatus 1003 expectedRequestCount int 1004 expectedNodes []*v1.Node 1005 expectedPodStatusUpdate bool 1006 }{ 1007 // Node created long time ago, without status: 1008 // Expect Unknown status posted from node controller. 
1009 { 1010 fakeNodeHandler: &testutil.FakeNodeHandler{ 1011 Existing: []*v1.Node{ 1012 { 1013 ObjectMeta: metav1.ObjectMeta{ 1014 Name: "node0", 1015 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1016 }, 1017 }, 1018 }, 1019 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1020 }, 1021 expectedRequestCount: 2, // List+Update 1022 expectedNodes: []*v1.Node{ 1023 { 1024 ObjectMeta: metav1.ObjectMeta{ 1025 Name: "node0", 1026 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1027 }, 1028 Status: v1.NodeStatus{ 1029 Conditions: []v1.NodeCondition{ 1030 { 1031 Type: v1.NodeReady, 1032 Status: v1.ConditionUnknown, 1033 Reason: "NodeStatusNeverUpdated", 1034 Message: "Kubelet never posted node status.", 1035 LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1036 LastTransitionTime: fakeNow, 1037 }, 1038 { 1039 Type: v1.NodeMemoryPressure, 1040 Status: v1.ConditionUnknown, 1041 Reason: "NodeStatusNeverUpdated", 1042 Message: "Kubelet never posted node status.", 1043 LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1044 LastTransitionTime: fakeNow, 1045 }, 1046 { 1047 Type: v1.NodeDiskPressure, 1048 Status: v1.ConditionUnknown, 1049 Reason: "NodeStatusNeverUpdated", 1050 Message: "Kubelet never posted node status.", 1051 LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1052 LastTransitionTime: fakeNow, 1053 }, 1054 { 1055 Type: v1.NodePIDPressure, 1056 Status: v1.ConditionUnknown, 1057 Reason: "NodeStatusNeverUpdated", 1058 Message: "Kubelet never posted node status.", 1059 LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1060 LastTransitionTime: fakeNow, 1061 }, 1062 }, 1063 }, 1064 }, 1065 }, 1066 expectedPodStatusUpdate: false, // Pod was never scheduled 1067 }, 1068 // Node created recently, without status. 1069 // Expect no action from node controller (within startup grace period). 1070 { 1071 fakeNodeHandler: &testutil.FakeNodeHandler{ 1072 Existing: []*v1.Node{ 1073 { 1074 ObjectMeta: metav1.ObjectMeta{ 1075 Name: "node0", 1076 CreationTimestamp: fakeNow, 1077 }, 1078 }, 1079 }, 1080 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1081 }, 1082 expectedRequestCount: 1, // List 1083 expectedNodes: nil, 1084 expectedPodStatusUpdate: false, 1085 }, 1086 // Node created long time ago, with status updated by kubelet exceeds grace period. 1087 // Expect Unknown status posted from node controller. 1088 { 1089 fakeNodeHandler: &testutil.FakeNodeHandler{ 1090 Existing: []*v1.Node{ 1091 { 1092 ObjectMeta: metav1.ObjectMeta{ 1093 Name: "node0", 1094 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1095 }, 1096 Status: v1.NodeStatus{ 1097 Conditions: []v1.NodeCondition{ 1098 { 1099 Type: v1.NodeReady, 1100 Status: v1.ConditionTrue, 1101 // Node status hasn't been updated for 1hr. 
1102 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1103 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1104 }, 1105 }, 1106 Capacity: v1.ResourceList{ 1107 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1108 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1109 }, 1110 }, 1111 }, 1112 }, 1113 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1114 }, 1115 expectedRequestCount: 3, // (List+)List+Update 1116 timeToPass: time.Hour, 1117 newNodeStatus: v1.NodeStatus{ 1118 Conditions: []v1.NodeCondition{ 1119 { 1120 Type: v1.NodeReady, 1121 Status: v1.ConditionTrue, 1122 // Node status hasn't been updated for 1hr. 1123 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1124 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1125 }, 1126 }, 1127 Capacity: v1.ResourceList{ 1128 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1129 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1130 }, 1131 }, 1132 expectedNodes: []*v1.Node{ 1133 { 1134 ObjectMeta: metav1.ObjectMeta{ 1135 Name: "node0", 1136 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1137 }, 1138 Status: v1.NodeStatus{ 1139 Conditions: []v1.NodeCondition{ 1140 { 1141 Type: v1.NodeReady, 1142 Status: v1.ConditionUnknown, 1143 Reason: "NodeStatusUnknown", 1144 Message: "Kubelet stopped posting node status.", 1145 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1146 LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, 1147 }, 1148 { 1149 Type: v1.NodeMemoryPressure, 1150 Status: v1.ConditionUnknown, 1151 Reason: "NodeStatusNeverUpdated", 1152 Message: "Kubelet never posted node status.", 1153 LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated 1154 LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, 1155 }, 1156 { 1157 Type: v1.NodeDiskPressure, 1158 Status: v1.ConditionUnknown, 1159 Reason: "NodeStatusNeverUpdated", 1160 Message: "Kubelet never posted node status.", 1161 LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated 1162 LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, 1163 }, 1164 { 1165 Type: v1.NodePIDPressure, 1166 Status: v1.ConditionUnknown, 1167 Reason: "NodeStatusNeverUpdated", 1168 Message: "Kubelet never posted node status.", 1169 LastHeartbeatTime: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), // should default to node creation time if condition was never updated 1170 LastTransitionTime: metav1.Time{Time: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC).Add(time.Hour)}, 1171 }, 1172 }, 1173 Capacity: v1.ResourceList{ 1174 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1175 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1176 }, 1177 }, 1178 }, 1179 }, 1180 expectedPodStatusUpdate: true, 1181 }, 1182 // Node created long time ago, with status updated recently. 1183 // Expect no action from node controller (within monitor grace period). 
1184 { 1185 fakeNodeHandler: &testutil.FakeNodeHandler{ 1186 Existing: []*v1.Node{ 1187 { 1188 ObjectMeta: metav1.ObjectMeta{ 1189 Name: "node0", 1190 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1191 }, 1192 Status: v1.NodeStatus{ 1193 Conditions: []v1.NodeCondition{ 1194 { 1195 Type: v1.NodeReady, 1196 Status: v1.ConditionTrue, 1197 // Node status has just been updated. 1198 LastHeartbeatTime: fakeNow, 1199 LastTransitionTime: fakeNow, 1200 }, 1201 }, 1202 Capacity: v1.ResourceList{ 1203 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1204 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1205 }, 1206 }, 1207 }, 1208 }, 1209 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1210 }, 1211 expectedRequestCount: 1, // List 1212 expectedNodes: nil, 1213 expectedPodStatusUpdate: false, 1214 }, 1215 } 1216 _, ctx := ktesting.NewTestContext(t) 1217 for i, item := range table { 1218 nodeController, _ := newNodeLifecycleControllerFromClient( 1219 ctx, 1220 item.fakeNodeHandler, 1221 testRateLimiterQPS, 1222 testRateLimiterQPS, 1223 testLargeClusterThreshold, 1224 testUnhealthyThreshold, 1225 testNodeMonitorGracePeriod, 1226 testNodeStartupGracePeriod, 1227 testNodeMonitorPeriod, 1228 ) 1229 nodeController.now = func() metav1.Time { return fakeNow } 1230 nodeController.recorder = testutil.NewFakeRecorder() 1231 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) 1232 if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { 1233 t.Errorf("unexpected error: %v", err) 1234 } 1235 if err := nodeController.monitorNodeHealth(ctx); err != nil { 1236 t.Errorf("unexpected error: %v", err) 1237 } 1238 if item.timeToPass > 0 { 1239 nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } 1240 item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus 1241 if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { 1242 t.Errorf("unexpected error: %v", err) 1243 } 1244 if err := nodeController.monitorNodeHealth(ctx); err != nil { 1245 t.Errorf("unexpected error: %v", err) 1246 } 1247 } 1248 if item.expectedRequestCount != item.fakeNodeHandler.RequestCount { 1249 t.Errorf("expected %v call, but got %v.", item.expectedRequestCount, item.fakeNodeHandler.RequestCount) 1250 } 1251 if len(item.fakeNodeHandler.UpdatedNodes) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodes) { 1252 t.Errorf("Case[%d] unexpected nodes: %s", i, cmp.Diff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodes[0])) 1253 } 1254 if len(item.fakeNodeHandler.UpdatedNodeStatuses) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodeStatuses) { 1255 t.Errorf("Case[%d] unexpected nodes: %s", i, cmp.Diff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodeStatuses[0])) 1256 } 1257 1258 podStatusUpdated := false 1259 for _, action := range item.fakeNodeHandler.Actions() { 1260 if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { 1261 podStatusUpdated = true 1262 } 1263 } 1264 if podStatusUpdated != item.expectedPodStatusUpdate { 1265 t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, item.expectedPodStatusUpdate, podStatusUpdated) 1266 } 1267 } 1268 } 1269 1270 func TestMonitorNodeHealthUpdateNodeAndPodStatusWithLease(t *testing.T) { 1271 nodeCreationTime := 
metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC) 1272 fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) 1273 testcases := []struct { 1274 description string 1275 fakeNodeHandler *testutil.FakeNodeHandler 1276 lease *coordv1.Lease 1277 timeToPass time.Duration 1278 newNodeStatus v1.NodeStatus 1279 newLease *coordv1.Lease 1280 expectedRequestCount int 1281 expectedNodes []*v1.Node 1282 expectedPodStatusUpdate bool 1283 }{ 1284 // Node created recently, without status. Node lease is missing. 1285 // Expect no action from node controller (within startup grace period). 1286 { 1287 description: "Node created recently, without status. Node lease is missing.", 1288 fakeNodeHandler: &testutil.FakeNodeHandler{ 1289 Existing: []*v1.Node{ 1290 { 1291 ObjectMeta: metav1.ObjectMeta{ 1292 Name: "node0", 1293 CreationTimestamp: fakeNow, 1294 }, 1295 }, 1296 }, 1297 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1298 }, 1299 expectedRequestCount: 1, // List 1300 expectedNodes: nil, 1301 expectedPodStatusUpdate: false, 1302 }, 1303 // Node created recently, without status. Node lease is renewed recently. 1304 // Expect no action from node controller (within startup grace period). 1305 { 1306 description: "Node created recently, without status. Node lease is renewed recently.", 1307 fakeNodeHandler: &testutil.FakeNodeHandler{ 1308 Existing: []*v1.Node{ 1309 { 1310 ObjectMeta: metav1.ObjectMeta{ 1311 Name: "node0", 1312 CreationTimestamp: fakeNow, 1313 }, 1314 }, 1315 }, 1316 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1317 }, 1318 lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), 1319 expectedRequestCount: 1, // List 1320 expectedNodes: nil, 1321 expectedPodStatusUpdate: false, 1322 }, 1323 // Node created long time ago, without status. Node lease is missing. 1324 // Expect Unknown status posted from node controller. 1325 { 1326 description: "Node created long time ago, without status. 
Node lease is missing.", 1327 fakeNodeHandler: &testutil.FakeNodeHandler{ 1328 Existing: []*v1.Node{ 1329 { 1330 ObjectMeta: metav1.ObjectMeta{ 1331 Name: "node0", 1332 CreationTimestamp: nodeCreationTime, 1333 }, 1334 }, 1335 }, 1336 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1337 }, 1338 expectedRequestCount: 2, // List+Update 1339 expectedNodes: []*v1.Node{ 1340 { 1341 ObjectMeta: metav1.ObjectMeta{ 1342 Name: "node0", 1343 CreationTimestamp: nodeCreationTime, 1344 }, 1345 Status: v1.NodeStatus{ 1346 Conditions: []v1.NodeCondition{ 1347 { 1348 Type: v1.NodeReady, 1349 Status: v1.ConditionUnknown, 1350 Reason: "NodeStatusNeverUpdated", 1351 Message: "Kubelet never posted node status.", 1352 LastHeartbeatTime: nodeCreationTime, 1353 LastTransitionTime: fakeNow, 1354 }, 1355 { 1356 Type: v1.NodeMemoryPressure, 1357 Status: v1.ConditionUnknown, 1358 Reason: "NodeStatusNeverUpdated", 1359 Message: "Kubelet never posted node status.", 1360 LastHeartbeatTime: nodeCreationTime, 1361 LastTransitionTime: fakeNow, 1362 }, 1363 { 1364 Type: v1.NodeDiskPressure, 1365 Status: v1.ConditionUnknown, 1366 Reason: "NodeStatusNeverUpdated", 1367 Message: "Kubelet never posted node status.", 1368 LastHeartbeatTime: nodeCreationTime, 1369 LastTransitionTime: fakeNow, 1370 }, 1371 { 1372 Type: v1.NodePIDPressure, 1373 Status: v1.ConditionUnknown, 1374 Reason: "NodeStatusNeverUpdated", 1375 Message: "Kubelet never posted node status.", 1376 LastHeartbeatTime: nodeCreationTime, 1377 LastTransitionTime: fakeNow, 1378 }, 1379 }, 1380 }, 1381 }, 1382 }, 1383 expectedPodStatusUpdate: false, // Pod was never scheduled because the node was never ready. 1384 }, 1385 // Node created long time ago, without status. Node lease is renewed recently. 1386 // Expect no action from node controller (within monitor grace period). 1387 { 1388 description: "Node created long time ago, without status. Node lease is renewed recently.", 1389 fakeNodeHandler: &testutil.FakeNodeHandler{ 1390 Existing: []*v1.Node{ 1391 { 1392 ObjectMeta: metav1.ObjectMeta{ 1393 Name: "node0", 1394 CreationTimestamp: nodeCreationTime, 1395 }, 1396 }, 1397 }, 1398 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1399 }, 1400 lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), 1401 timeToPass: time.Hour, 1402 newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time.Add(time.Hour))), // Lease is renewed after 1 hour. 1403 expectedRequestCount: 2, // List+List 1404 expectedNodes: []*v1.Node{ 1405 { 1406 ObjectMeta: metav1.ObjectMeta{ 1407 Name: "node0", 1408 CreationTimestamp: nodeCreationTime, 1409 }, 1410 }, 1411 }, 1412 expectedPodStatusUpdate: false, 1413 }, 1414 // Node created long time ago, without status. Node lease is expired. 1415 // Expect Unknown status posted from node controller. 1416 { 1417 description: "Node created long time ago, without status. Node lease is expired.", 1418 fakeNodeHandler: &testutil.FakeNodeHandler{ 1419 Existing: []*v1.Node{ 1420 { 1421 ObjectMeta: metav1.ObjectMeta{ 1422 Name: "node0", 1423 CreationTimestamp: nodeCreationTime, 1424 }, 1425 }, 1426 }, 1427 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1428 }, 1429 lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), 1430 timeToPass: time.Hour, 1431 newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour. 
1432 expectedRequestCount: 3, // List+List+Update 1433 expectedNodes: []*v1.Node{ 1434 { 1435 ObjectMeta: metav1.ObjectMeta{ 1436 Name: "node0", 1437 CreationTimestamp: nodeCreationTime, 1438 }, 1439 Status: v1.NodeStatus{ 1440 Conditions: []v1.NodeCondition{ 1441 { 1442 Type: v1.NodeReady, 1443 Status: v1.ConditionUnknown, 1444 Reason: "NodeStatusNeverUpdated", 1445 Message: "Kubelet never posted node status.", 1446 LastHeartbeatTime: nodeCreationTime, 1447 LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1448 }, 1449 { 1450 Type: v1.NodeMemoryPressure, 1451 Status: v1.ConditionUnknown, 1452 Reason: "NodeStatusNeverUpdated", 1453 Message: "Kubelet never posted node status.", 1454 LastHeartbeatTime: nodeCreationTime, 1455 LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1456 }, 1457 { 1458 Type: v1.NodeDiskPressure, 1459 Status: v1.ConditionUnknown, 1460 Reason: "NodeStatusNeverUpdated", 1461 Message: "Kubelet never posted node status.", 1462 LastHeartbeatTime: nodeCreationTime, 1463 LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1464 }, 1465 { 1466 Type: v1.NodePIDPressure, 1467 Status: v1.ConditionUnknown, 1468 Reason: "NodeStatusNeverUpdated", 1469 Message: "Kubelet never posted node status.", 1470 LastHeartbeatTime: nodeCreationTime, 1471 LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1472 }, 1473 }, 1474 }, 1475 }, 1476 }, 1477 expectedPodStatusUpdate: false, 1478 }, 1479 // Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is renewed. 1480 // Expect no action from node controller (within monitor grace period). 1481 { 1482 description: "Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is renewed.", 1483 fakeNodeHandler: &testutil.FakeNodeHandler{ 1484 Existing: []*v1.Node{ 1485 { 1486 ObjectMeta: metav1.ObjectMeta{ 1487 Name: "node0", 1488 CreationTimestamp: nodeCreationTime, 1489 }, 1490 Status: v1.NodeStatus{ 1491 Conditions: []v1.NodeCondition{ 1492 { 1493 Type: v1.NodeReady, 1494 Status: v1.ConditionTrue, 1495 LastHeartbeatTime: fakeNow, 1496 LastTransitionTime: fakeNow, 1497 }, 1498 { 1499 Type: v1.NodeDiskPressure, 1500 Status: v1.ConditionFalse, 1501 LastHeartbeatTime: fakeNow, 1502 LastTransitionTime: fakeNow, 1503 }, 1504 }, 1505 Capacity: v1.ResourceList{ 1506 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1507 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1508 }, 1509 }, 1510 }, 1511 }, 1512 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1513 }, 1514 lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), 1515 expectedRequestCount: 2, // List+List 1516 timeToPass: time.Hour, 1517 newNodeStatus: v1.NodeStatus{ 1518 // Node status hasn't been updated for 1 hour. 1519 Conditions: []v1.NodeCondition{ 1520 { 1521 Type: v1.NodeReady, 1522 Status: v1.ConditionTrue, 1523 LastHeartbeatTime: fakeNow, 1524 LastTransitionTime: fakeNow, 1525 }, 1526 { 1527 Type: v1.NodeDiskPressure, 1528 Status: v1.ConditionFalse, 1529 LastHeartbeatTime: fakeNow, 1530 LastTransitionTime: fakeNow, 1531 }, 1532 }, 1533 Capacity: v1.ResourceList{ 1534 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1535 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1536 }, 1537 }, 1538 newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time.Add(time.Hour))), // Lease is renewed after 1 hour. 
1539 expectedNodes: []*v1.Node{ 1540 { 1541 ObjectMeta: metav1.ObjectMeta{ 1542 Name: "node0", 1543 CreationTimestamp: nodeCreationTime, 1544 }, 1545 Status: v1.NodeStatus{ 1546 Conditions: []v1.NodeCondition{ 1547 { 1548 Type: v1.NodeReady, 1549 Status: v1.ConditionTrue, 1550 LastHeartbeatTime: fakeNow, 1551 LastTransitionTime: fakeNow, 1552 }, 1553 { 1554 Type: v1.NodeDiskPressure, 1555 Status: v1.ConditionFalse, 1556 LastHeartbeatTime: fakeNow, 1557 LastTransitionTime: fakeNow, 1558 }, 1559 }, 1560 Capacity: v1.ResourceList{ 1561 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1562 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1563 }, 1564 }, 1565 }, 1566 }, 1567 expectedPodStatusUpdate: false, 1568 }, 1569 // Node created long time ago, with status updated by kubelet recently. Node lease is expired. 1570 // Expect no action from node controller (within monitor grace period). 1571 { 1572 description: "Node created long time ago, with status updated by kubelet recently. Node lease is expired.", 1573 fakeNodeHandler: &testutil.FakeNodeHandler{ 1574 Existing: []*v1.Node{ 1575 { 1576 ObjectMeta: metav1.ObjectMeta{ 1577 Name: "node0", 1578 CreationTimestamp: nodeCreationTime, 1579 }, 1580 Status: v1.NodeStatus{ 1581 Conditions: []v1.NodeCondition{ 1582 { 1583 Type: v1.NodeReady, 1584 Status: v1.ConditionTrue, 1585 LastHeartbeatTime: fakeNow, 1586 LastTransitionTime: fakeNow, 1587 }, 1588 { 1589 Type: v1.NodeDiskPressure, 1590 Status: v1.ConditionFalse, 1591 LastHeartbeatTime: fakeNow, 1592 LastTransitionTime: fakeNow, 1593 }, 1594 }, 1595 Capacity: v1.ResourceList{ 1596 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1597 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1598 }, 1599 }, 1600 }, 1601 }, 1602 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1603 }, 1604 lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), 1605 expectedRequestCount: 2, // List+List 1606 timeToPass: time.Hour, 1607 newNodeStatus: v1.NodeStatus{ 1608 // Node status is updated after 1 hour. 1609 Conditions: []v1.NodeCondition{ 1610 { 1611 Type: v1.NodeReady, 1612 Status: v1.ConditionTrue, 1613 LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1614 LastTransitionTime: fakeNow, 1615 }, 1616 { 1617 Type: v1.NodeDiskPressure, 1618 Status: v1.ConditionFalse, 1619 LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1620 LastTransitionTime: fakeNow, 1621 }, 1622 }, 1623 Capacity: v1.ResourceList{ 1624 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1625 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1626 }, 1627 }, 1628 newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour. 
1629 expectedNodes: []*v1.Node{ 1630 { 1631 ObjectMeta: metav1.ObjectMeta{ 1632 Name: "node0", 1633 CreationTimestamp: nodeCreationTime, 1634 }, 1635 Status: v1.NodeStatus{ 1636 Conditions: []v1.NodeCondition{ 1637 { 1638 Type: v1.NodeReady, 1639 Status: v1.ConditionTrue, 1640 LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1641 LastTransitionTime: fakeNow, 1642 }, 1643 { 1644 Type: v1.NodeDiskPressure, 1645 Status: v1.ConditionFalse, 1646 LastHeartbeatTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1647 LastTransitionTime: fakeNow, 1648 }, 1649 }, 1650 Capacity: v1.ResourceList{ 1651 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1652 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1653 }, 1654 }, 1655 }, 1656 }, 1657 expectedPodStatusUpdate: false, 1658 }, 1659 // Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is also expired. 1660 // Expect Unknown status posted from node controller. 1661 { 1662 description: "Node created long time ago, with status updated by kubelet exceeds grace period. Node lease is also expired.", 1663 fakeNodeHandler: &testutil.FakeNodeHandler{ 1664 Existing: []*v1.Node{ 1665 { 1666 ObjectMeta: metav1.ObjectMeta{ 1667 Name: "node0", 1668 CreationTimestamp: nodeCreationTime, 1669 }, 1670 Status: v1.NodeStatus{ 1671 Conditions: []v1.NodeCondition{ 1672 { 1673 Type: v1.NodeReady, 1674 Status: v1.ConditionTrue, 1675 LastHeartbeatTime: fakeNow, 1676 LastTransitionTime: fakeNow, 1677 }, 1678 }, 1679 Capacity: v1.ResourceList{ 1680 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1681 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1682 }, 1683 }, 1684 }, 1685 }, 1686 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1687 }, 1688 lease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), 1689 expectedRequestCount: 3, // List+List+Update 1690 timeToPass: time.Hour, 1691 newNodeStatus: v1.NodeStatus{ 1692 // Node status hasn't been updated for 1 hour. 1693 Conditions: []v1.NodeCondition{ 1694 { 1695 Type: v1.NodeReady, 1696 Status: v1.ConditionTrue, 1697 LastHeartbeatTime: fakeNow, 1698 LastTransitionTime: fakeNow, 1699 }, 1700 }, 1701 Capacity: v1.ResourceList{ 1702 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1703 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1704 }, 1705 }, 1706 newLease: createNodeLease("node0", metav1.NewMicroTime(fakeNow.Time)), // Lease is not renewed after 1 hour. 
1707 expectedNodes: []*v1.Node{ 1708 { 1709 ObjectMeta: metav1.ObjectMeta{ 1710 Name: "node0", 1711 CreationTimestamp: nodeCreationTime, 1712 }, 1713 Status: v1.NodeStatus{ 1714 Conditions: []v1.NodeCondition{ 1715 { 1716 Type: v1.NodeReady, 1717 Status: v1.ConditionUnknown, 1718 Reason: "NodeStatusUnknown", 1719 Message: "Kubelet stopped posting node status.", 1720 LastHeartbeatTime: fakeNow, 1721 LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1722 }, 1723 { 1724 Type: v1.NodeMemoryPressure, 1725 Status: v1.ConditionUnknown, 1726 Reason: "NodeStatusNeverUpdated", 1727 Message: "Kubelet never posted node status.", 1728 LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated 1729 LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1730 }, 1731 { 1732 Type: v1.NodeDiskPressure, 1733 Status: v1.ConditionUnknown, 1734 Reason: "NodeStatusNeverUpdated", 1735 Message: "Kubelet never posted node status.", 1736 LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated 1737 LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1738 }, 1739 { 1740 Type: v1.NodePIDPressure, 1741 Status: v1.ConditionUnknown, 1742 Reason: "NodeStatusNeverUpdated", 1743 Message: "Kubelet never posted node status.", 1744 LastHeartbeatTime: nodeCreationTime, // should default to node creation time if condition was never updated 1745 LastTransitionTime: metav1.Time{Time: fakeNow.Add(time.Hour)}, 1746 }, 1747 }, 1748 Capacity: v1.ResourceList{ 1749 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1750 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1751 }, 1752 }, 1753 }, 1754 }, 1755 expectedPodStatusUpdate: true, 1756 }, 1757 } 1758 1759 for _, item := range testcases { 1760 t.Run(item.description, func(t *testing.T) { 1761 _, ctx := ktesting.NewTestContext(t) 1762 nodeController, _ := newNodeLifecycleControllerFromClient( 1763 ctx, 1764 item.fakeNodeHandler, 1765 testRateLimiterQPS, 1766 testRateLimiterQPS, 1767 testLargeClusterThreshold, 1768 testUnhealthyThreshold, 1769 testNodeMonitorGracePeriod, 1770 testNodeStartupGracePeriod, 1771 testNodeMonitorPeriod, 1772 ) 1773 nodeController.now = func() metav1.Time { return fakeNow } 1774 nodeController.recorder = testutil.NewFakeRecorder() 1775 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) 1776 if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { 1777 t.Fatalf("unexpected error: %v", err) 1778 } 1779 if err := nodeController.syncLeaseStore(item.lease); err != nil { 1780 t.Fatalf("unexpected error: %v", err) 1781 } 1782 if err := nodeController.monitorNodeHealth(ctx); err != nil { 1783 t.Fatalf("unexpected error: %v", err) 1784 } 1785 if item.timeToPass > 0 { 1786 nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } 1787 item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus 1788 if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { 1789 t.Fatalf("unexpected error: %v", err) 1790 } 1791 if err := nodeController.syncLeaseStore(item.newLease); err != nil { 1792 t.Fatalf("unexpected error: %v", err) 1793 } 1794 if err := nodeController.monitorNodeHealth(ctx); err != nil { 1795 t.Fatalf("unexpected error: %v", err) 1796 } 1797 } 1798 if item.expectedRequestCount != item.fakeNodeHandler.RequestCount { 1799 t.Errorf("expected %v call, but got %v.", item.expectedRequestCount, 
item.fakeNodeHandler.RequestCount) 1800 } 1801 if len(item.fakeNodeHandler.UpdatedNodes) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodes) { 1802 t.Errorf("unexpected nodes: %s", cmp.Diff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodes[0])) 1803 } 1804 if len(item.fakeNodeHandler.UpdatedNodeStatuses) > 0 && !apiequality.Semantic.DeepEqual(item.expectedNodes, item.fakeNodeHandler.UpdatedNodeStatuses) { 1805 t.Errorf("unexpected nodes: %s", cmp.Diff(item.expectedNodes[0], item.fakeNodeHandler.UpdatedNodeStatuses[0])) 1806 } 1807 1808 podStatusUpdated := false 1809 for _, action := range item.fakeNodeHandler.Actions() { 1810 if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { 1811 podStatusUpdated = true 1812 } 1813 } 1814 if podStatusUpdated != item.expectedPodStatusUpdate { 1815 t.Errorf("expect pod status updated to be %v, but got %v", item.expectedPodStatusUpdate, podStatusUpdated) 1816 } 1817 }) 1818 } 1819 } 1820 1821 func TestMonitorNodeHealthMarkPodsNotReady(t *testing.T) { 1822 fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) 1823 table := []struct { 1824 fakeNodeHandler *testutil.FakeNodeHandler 1825 timeToPass time.Duration 1826 newNodeStatus v1.NodeStatus 1827 expectedPodStatusUpdate bool 1828 }{ 1829 // Node created recently, without status. 1830 // Expect no action from node controller (within startup grace period). 1831 { 1832 fakeNodeHandler: &testutil.FakeNodeHandler{ 1833 Existing: []*v1.Node{ 1834 { 1835 ObjectMeta: metav1.ObjectMeta{ 1836 Name: "node0", 1837 CreationTimestamp: fakeNow, 1838 }, 1839 }, 1840 }, 1841 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1842 }, 1843 expectedPodStatusUpdate: false, 1844 }, 1845 // Node created long time ago, with status updated recently. 1846 // Expect no action from node controller (within monitor grace period). 1847 { 1848 fakeNodeHandler: &testutil.FakeNodeHandler{ 1849 Existing: []*v1.Node{ 1850 { 1851 ObjectMeta: metav1.ObjectMeta{ 1852 Name: "node0", 1853 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1854 }, 1855 Status: v1.NodeStatus{ 1856 Conditions: []v1.NodeCondition{ 1857 { 1858 Type: v1.NodeReady, 1859 Status: v1.ConditionTrue, 1860 // Node status has just been updated. 1861 LastHeartbeatTime: fakeNow, 1862 LastTransitionTime: fakeNow, 1863 }, 1864 }, 1865 Capacity: v1.ResourceList{ 1866 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1867 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1868 }, 1869 }, 1870 }, 1871 }, 1872 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1873 }, 1874 expectedPodStatusUpdate: false, 1875 }, 1876 // Node created long time ago, with status updated by kubelet exceeds grace period. 1877 // Expect pods status updated and Unknown node status posted from node controller 1878 { 1879 fakeNodeHandler: &testutil.FakeNodeHandler{ 1880 Existing: []*v1.Node{ 1881 { 1882 ObjectMeta: metav1.ObjectMeta{ 1883 Name: "node0", 1884 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1885 }, 1886 Status: v1.NodeStatus{ 1887 Conditions: []v1.NodeCondition{ 1888 { 1889 Type: v1.NodeReady, 1890 Status: v1.ConditionTrue, 1891 // Node status hasn't been updated for 1hr. 
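// Note: although the comment above says 1hr, this case only advances the controller clock by timeToPass (one minute); that is still enough to exceed the 40 second testNodeMonitorGracePeriod because the heartbeat below never moves past fakeNow.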
1892 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1893 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1894 }, 1895 }, 1896 Capacity: v1.ResourceList{ 1897 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1898 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1899 }, 1900 }, 1901 }, 1902 }, 1903 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 1904 }, 1905 timeToPass: 1 * time.Minute, 1906 newNodeStatus: v1.NodeStatus{ 1907 Conditions: []v1.NodeCondition{ 1908 { 1909 Type: v1.NodeReady, 1910 Status: v1.ConditionTrue, 1911 // Node status hasn't been updated for 1hr. 1912 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1913 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1914 }, 1915 }, 1916 Capacity: v1.ResourceList{ 1917 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1918 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1919 }, 1920 }, 1921 expectedPodStatusUpdate: true, 1922 }, 1923 } 1924 1925 _, ctx := ktesting.NewTestContext(t) 1926 for i, item := range table { 1927 nodeController, _ := newNodeLifecycleControllerFromClient( 1928 ctx, 1929 item.fakeNodeHandler, 1930 testRateLimiterQPS, 1931 testRateLimiterQPS, 1932 testLargeClusterThreshold, 1933 testUnhealthyThreshold, 1934 testNodeMonitorGracePeriod, 1935 testNodeStartupGracePeriod, 1936 testNodeMonitorPeriod, 1937 ) 1938 nodeController.now = func() metav1.Time { return fakeNow } 1939 nodeController.recorder = testutil.NewFakeRecorder() 1940 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) 1941 if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { 1942 t.Errorf("unexpected error: %v", err) 1943 } 1944 if err := nodeController.monitorNodeHealth(ctx); err != nil { 1945 t.Errorf("Case[%d] unexpected error: %v", i, err) 1946 } 1947 if item.timeToPass > 0 { 1948 nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(item.timeToPass)} } 1949 item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus 1950 if err := nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { 1951 t.Errorf("unexpected error: %v", err) 1952 } 1953 if err := nodeController.monitorNodeHealth(ctx); err != nil { 1954 t.Errorf("Case[%d] unexpected error: %v", i, err) 1955 } 1956 } 1957 1958 podStatusUpdated := false 1959 for _, action := range item.fakeNodeHandler.Actions() { 1960 if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { 1961 podStatusUpdated = true 1962 } 1963 } 1964 if podStatusUpdated != item.expectedPodStatusUpdate { 1965 t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, item.expectedPodStatusUpdate, podStatusUpdated) 1966 } 1967 } 1968 } 1969 1970 // TestMonitorNodeHealthMarkPodsNotReadyWithWorkerSize tests the happy path of 1971 // TestMonitorNodeHealthMarkPodsNotReady with a large number of nodes/pods and 1972 // varying numbers of workers. 1973 func TestMonitorNodeHealthMarkPodsNotReadyWithWorkerSize(t *testing.T) { 1974 const numNodes = 50 1975 const podsPerNode = 100 1976 makeNodes := func() []*v1.Node { 1977 nodes := make([]*v1.Node, numNodes) 1978 // Node created long time ago, with status updated by kubelet exceeds grace period. 
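// makeNodes below builds numNodes identical nodes whose Ready heartbeat never advances, and makePods spreads podsPerNode pods across them, so every case is expected to end with numNodes*podsPerNode pod status updates regardless of the worker count.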
1979 // Expect pods status updated and Unknown node status posted from node controller 1980 for i := 0; i < numNodes; i++ { 1981 nodes[i] = &v1.Node{ 1982 ObjectMeta: metav1.ObjectMeta{ 1983 Name: fmt.Sprintf("node%d", i), 1984 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 1985 }, 1986 Status: v1.NodeStatus{ 1987 Conditions: []v1.NodeCondition{ 1988 { 1989 Type: v1.NodeReady, 1990 Status: v1.ConditionTrue, 1991 // Node status hasn't been updated for 1hr. 1992 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1993 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 1994 }, 1995 }, 1996 Capacity: v1.ResourceList{ 1997 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 1998 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 1999 }, 2000 }, 2001 } 2002 } 2003 return nodes 2004 } 2005 makePods := func() []v1.Pod { 2006 pods := make([]v1.Pod, numNodes*podsPerNode) 2007 for i := 0; i < numNodes*podsPerNode; i++ { 2008 pods[i] = *testutil.NewPod(fmt.Sprintf("pod%d", i), fmt.Sprintf("node%d", i%numNodes)) 2009 } 2010 return pods 2011 } 2012 2013 table := []struct { 2014 workers int 2015 }{ 2016 {workers: 0}, // will default to scheduler.UpdateWorkerSize 2017 {workers: 1}, 2018 } 2019 2020 _, ctx := ktesting.NewTestContext(t) 2021 for i, item := range table { 2022 fakeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) 2023 2024 fakeNodeHandler := &testutil.FakeNodeHandler{ 2025 Existing: makeNodes(), 2026 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: makePods()}), 2027 } 2028 2029 nodeController, _ := newNodeLifecycleControllerFromClient( 2030 ctx, 2031 fakeNodeHandler, 2032 testRateLimiterQPS, 2033 testRateLimiterQPS, 2034 testLargeClusterThreshold, 2035 testUnhealthyThreshold, 2036 testNodeMonitorGracePeriod, 2037 testNodeStartupGracePeriod, 2038 testNodeMonitorPeriod) 2039 nodeController.now = func() metav1.Time { return fakeNow } 2040 nodeController.recorder = testutil.NewFakeRecorder() 2041 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) 2042 if item.workers != 0 { 2043 nodeController.nodeUpdateWorkerSize = item.workers 2044 } 2045 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 2046 t.Errorf("unexpected error: %v", err) 2047 } 2048 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2049 t.Errorf("Case[%d] unexpected error: %v", i, err) 2050 } 2051 2052 nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Add(1 * time.Minute)} } 2053 for i := range fakeNodeHandler.Existing { 2054 fakeNodeHandler.Existing[i].Status = v1.NodeStatus{ 2055 Conditions: []v1.NodeCondition{ 2056 { 2057 Type: v1.NodeReady, 2058 Status: v1.ConditionTrue, 2059 // Node status hasn't been updated for 1hr. 
2060 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2061 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2062 }, 2063 }, 2064 Capacity: v1.ResourceList{ 2065 v1.ResourceName(v1.ResourceCPU): resource.MustParse("10"), 2066 v1.ResourceName(v1.ResourceMemory): resource.MustParse("10G"), 2067 }, 2068 } 2069 } 2070 2071 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 2072 t.Errorf("unexpected error: %v", err) 2073 } 2074 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2075 t.Errorf("Case[%d] unexpected error: %v", i, err) 2076 } 2077 2078 podStatusUpdates := 0 2079 for _, action := range fakeNodeHandler.Actions() { 2080 if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { 2081 podStatusUpdates++ 2082 } 2083 } 2084 const expectedPodStatusUpdates = numNodes * podsPerNode 2085 if podStatusUpdates != expectedPodStatusUpdates { 2086 t.Errorf("Case[%d] expect pod status updated to be %v, but got %v", i, expectedPodStatusUpdates, podStatusUpdates) 2087 } 2088 } 2089 } 2090 2091 func TestMonitorNodeHealthMarkPodsNotReadyRetry(t *testing.T) { 2092 type nodeIteration struct { 2093 timeToPass time.Duration 2094 newNodes []*v1.Node 2095 } 2096 timeNow := metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC) 2097 timePlusTwoMinutes := metav1.Date(2015, 1, 1, 12, 0, 2, 0, time.UTC) 2098 makeNodes := func(status v1.ConditionStatus, lastHeartbeatTime, lastTransitionTime metav1.Time) []*v1.Node { 2099 return []*v1.Node{ 2100 { 2101 ObjectMeta: metav1.ObjectMeta{ 2102 Name: "node0", 2103 CreationTimestamp: timeNow, 2104 }, 2105 Status: v1.NodeStatus{ 2106 Conditions: []v1.NodeCondition{ 2107 { 2108 Type: v1.NodeReady, 2109 Status: status, 2110 LastHeartbeatTime: lastHeartbeatTime, 2111 LastTransitionTime: lastTransitionTime, 2112 }, 2113 }, 2114 }, 2115 }, 2116 } 2117 } 2118 table := []struct { 2119 desc string 2120 fakeNodeHandler *testutil.FakeNodeHandler 2121 updateReactor func(action testcore.Action) (bool, runtime.Object, error) 2122 fakeGetPodsAssignedToNode func(c *fake.Clientset) func(string) ([]*v1.Pod, error) 2123 nodeIterations []nodeIteration 2124 expectedPodStatusUpdates int 2125 }{ 2126 // Node created long time ago, with status updated by kubelet exceeds grace period. 2127 // First monitorNodeHealth check will update pod status to NotReady. 2128 // Second monitorNodeHealth check will do no updates (no retry). 2129 { 2130 desc: "successful pod status update, no retry required", 2131 fakeNodeHandler: &testutil.FakeNodeHandler{ 2132 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 2133 }, 2134 fakeGetPodsAssignedToNode: fakeGetPodsAssignedToNode, 2135 nodeIterations: []nodeIteration{ 2136 { 2137 timeToPass: 0, 2138 newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), 2139 }, 2140 { 2141 timeToPass: 1 * time.Minute, 2142 newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), 2143 }, 2144 { 2145 timeToPass: 1 * time.Minute, 2146 newNodes: makeNodes(v1.ConditionFalse, timePlusTwoMinutes, timePlusTwoMinutes), 2147 }, 2148 }, 2149 expectedPodStatusUpdates: 1, 2150 }, 2151 // Node created long time ago, with status updated by kubelet exceeds grace period. 2152 // First monitorNodeHealth check will fail to update pod status to NotReady. 2153 // Second monitorNodeHealth check will update pod status to NotReady (retry). 
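// The updateReactor below is prepended to the fake clientset so that the first pods/status update fails with a fake error and every subsequent one succeeds, which is what forces the retry counted in expectedPodStatusUpdates.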
2154 { 2155 desc: "unsuccessful pod status update, retry required", 2156 fakeNodeHandler: &testutil.FakeNodeHandler{ 2157 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 2158 }, 2159 updateReactor: func() func(action testcore.Action) (bool, runtime.Object, error) { 2160 i := 0 2161 return func(action testcore.Action) (bool, runtime.Object, error) { 2162 if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { 2163 i++ 2164 switch i { 2165 case 1: 2166 return true, nil, fmt.Errorf("fake error") 2167 default: 2168 return true, testutil.NewPod("pod0", "node0"), nil 2169 } 2170 } 2171 2172 return true, nil, fmt.Errorf("unsupported action") 2173 } 2174 }(), 2175 fakeGetPodsAssignedToNode: fakeGetPodsAssignedToNode, 2176 nodeIterations: []nodeIteration{ 2177 { 2178 timeToPass: 0, 2179 newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), 2180 }, 2181 { 2182 timeToPass: 1 * time.Minute, 2183 newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), 2184 }, 2185 { 2186 timeToPass: 1 * time.Minute, 2187 newNodes: makeNodes(v1.ConditionFalse, timePlusTwoMinutes, timePlusTwoMinutes), 2188 }, 2189 }, 2190 expectedPodStatusUpdates: 2, // One failed and one retry. 2191 }, 2192 // Node created long time ago, with status updated by kubelet exceeds grace period. 2193 // First monitorNodeHealth check will fail to list pods. 2194 // Second monitorNodeHealth check will update pod status to NotReady (retry). 2195 { 2196 desc: "unsuccessful pod list, retry required", 2197 fakeNodeHandler: &testutil.FakeNodeHandler{ 2198 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 2199 }, 2200 fakeGetPodsAssignedToNode: func(c *fake.Clientset) func(string) ([]*v1.Pod, error) { 2201 i := 0 2202 f := fakeGetPodsAssignedToNode(c) 2203 return func(nodeName string) ([]*v1.Pod, error) { 2204 i++ 2205 if i == 1 { 2206 return nil, fmt.Errorf("fake error") 2207 } 2208 return f(nodeName) 2209 } 2210 }, 2211 nodeIterations: []nodeIteration{ 2212 { 2213 timeToPass: 0, 2214 newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), 2215 }, 2216 { 2217 timeToPass: 1 * time.Minute, 2218 newNodes: makeNodes(v1.ConditionTrue, timeNow, timeNow), 2219 }, 2220 { 2221 timeToPass: 1 * time.Minute, 2222 newNodes: makeNodes(v1.ConditionFalse, timePlusTwoMinutes, timePlusTwoMinutes), 2223 }, 2224 }, 2225 expectedPodStatusUpdates: 1, 2226 }, 2227 } 2228 2229 for _, item := range table { 2230 t.Run(item.desc, func(t *testing.T) { 2231 _, ctx := ktesting.NewTestContext(t) 2232 nodeController, _ := newNodeLifecycleControllerFromClient( 2233 ctx, 2234 item.fakeNodeHandler, 2235 testRateLimiterQPS, 2236 testRateLimiterQPS, 2237 testLargeClusterThreshold, 2238 testUnhealthyThreshold, 2239 testNodeMonitorGracePeriod, 2240 testNodeStartupGracePeriod, 2241 testNodeMonitorPeriod, 2242 ) 2243 if item.updateReactor != nil { 2244 item.fakeNodeHandler.Clientset.PrependReactor("update", "pods", item.updateReactor) 2245 } 2246 nodeController.now = func() metav1.Time { return timeNow } 2247 nodeController.recorder = testutil.NewFakeRecorder() 2248 nodeController.getPodsAssignedToNode = item.fakeGetPodsAssignedToNode(item.fakeNodeHandler.Clientset) 2249 for _, itertion := range item.nodeIterations { 2250 nodeController.now = func() metav1.Time { return metav1.Time{Time: timeNow.Add(itertion.timeToPass)} } 2251 item.fakeNodeHandler.Existing = itertion.newNodes 2252 if err := 
nodeController.syncNodeStore(item.fakeNodeHandler); err != nil { 2253 t.Errorf("unexpected error: %v", err) 2254 } 2255 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2256 t.Errorf("unexpected error: %v", err) 2257 } 2258 } 2259 2260 podStatusUpdates := 0 2261 for _, action := range item.fakeNodeHandler.Actions() { 2262 if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" { 2263 podStatusUpdates++ 2264 } 2265 } 2266 if podStatusUpdates != item.expectedPodStatusUpdates { 2267 t.Errorf("expect pod status updated to happen %d times, but got %d", item.expectedPodStatusUpdates, podStatusUpdates) 2268 } 2269 }) 2270 } 2271 } 2272 2273 // TestApplyNoExecuteTaints, ensures we just have a NoExecute taint applied to node. 2274 // NodeController is just responsible for enqueuing the node to tainting queue from which taint manager picks up 2275 // and evicts the pods on the node. 2276 func TestApplyNoExecuteTaints(t *testing.T) { 2277 // TODO: Remove skip once https://github.com/kubernetes/kubernetes/pull/114607 merges. 2278 if goruntime.GOOS == "windows" { 2279 t.Skip("Skipping test on Windows.") 2280 } 2281 fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) 2282 2283 fakeNodeHandler := &testutil.FakeNodeHandler{ 2284 Existing: []*v1.Node{ 2285 // Unreachable Taint with effect 'NoExecute' should be applied to this node. 2286 { 2287 ObjectMeta: metav1.ObjectMeta{ 2288 Name: "node0", 2289 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2290 Labels: map[string]string{ 2291 v1.LabelTopologyRegion: "region1", 2292 v1.LabelTopologyZone: "zone1", 2293 v1.LabelFailureDomainBetaRegion: "region1", 2294 v1.LabelFailureDomainBetaZone: "zone1", 2295 }, 2296 }, 2297 Status: v1.NodeStatus{ 2298 Conditions: []v1.NodeCondition{ 2299 { 2300 Type: v1.NodeReady, 2301 Status: v1.ConditionUnknown, 2302 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2303 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2304 }, 2305 }, 2306 }, 2307 }, 2308 // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady 2309 // we need second healthy node in tests. 2310 { 2311 ObjectMeta: metav1.ObjectMeta{ 2312 Name: "node1", 2313 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2314 Labels: map[string]string{ 2315 v1.LabelTopologyRegion: "region1", 2316 v1.LabelTopologyZone: "zone1", 2317 v1.LabelFailureDomainBetaRegion: "region1", 2318 v1.LabelFailureDomainBetaZone: "zone1", 2319 }, 2320 }, 2321 Status: v1.NodeStatus{ 2322 Conditions: []v1.NodeCondition{ 2323 { 2324 Type: v1.NodeReady, 2325 Status: v1.ConditionTrue, 2326 LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2327 LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2328 }, 2329 }, 2330 }, 2331 }, 2332 // NotReady Taint with NoExecute effect should be applied to this node. 
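// node2 reports Ready=ConditionFalse with a heartbeat older than the grace period, so doNoExecuteTaintingPass is expected to attach NotReadyTaintTemplate here, in contrast to the UnreachableTaintTemplate expected on node0 above.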
2333 { 2334 ObjectMeta: metav1.ObjectMeta{ 2335 Name: "node2", 2336 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2337 Labels: map[string]string{ 2338 v1.LabelTopologyRegion: "region1", 2339 v1.LabelTopologyZone: "zone1", 2340 v1.LabelFailureDomainBetaRegion: "region1", 2341 v1.LabelFailureDomainBetaZone: "zone1", 2342 }, 2343 }, 2344 Status: v1.NodeStatus{ 2345 Conditions: []v1.NodeCondition{ 2346 { 2347 Type: v1.NodeReady, 2348 Status: v1.ConditionFalse, 2349 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2350 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2351 }, 2352 }, 2353 }, 2354 }, 2355 // NotReady Taint with NoExecute effect should not be applied to a node if the NodeCondition Type NodeReady has been set to nil in the interval between the NodeController enqueuing the node and the taint manager picking up. 2356 { 2357 ObjectMeta: metav1.ObjectMeta{ 2358 Name: "node3", 2359 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2360 Labels: map[string]string{ 2361 v1.LabelTopologyRegion: "region1", 2362 v1.LabelTopologyZone: "zone1", 2363 v1.LabelFailureDomainBetaRegion: "region1", 2364 v1.LabelFailureDomainBetaZone: "zone1", 2365 }, 2366 }, 2367 Status: v1.NodeStatus{ 2368 Conditions: []v1.NodeCondition{ 2369 { 2370 Type: v1.NodeReady, 2371 Status: v1.ConditionTrue, 2372 LastHeartbeatTime: metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC), 2373 LastTransitionTime: metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC), 2374 }, 2375 }, 2376 }, 2377 }, 2378 }, 2379 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 2380 } 2381 healthyNodeNewStatus := v1.NodeStatus{ 2382 Conditions: []v1.NodeCondition{ 2383 { 2384 Type: v1.NodeReady, 2385 Status: v1.ConditionTrue, 2386 LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC), 2387 LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2388 }, 2389 }, 2390 } 2391 unhealthyNodeNewStatus := v1.NodeStatus{ 2392 Conditions: []v1.NodeCondition{ 2393 { 2394 Type: v1.NodeReady, 2395 Status: v1.ConditionFalse, 2396 LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC), 2397 LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2398 }, 2399 }, 2400 } 2401 overrideNodeNewStatusConditions := []v1.NodeCondition{ 2402 { 2403 Type: "MemoryPressure", 2404 Status: v1.ConditionUnknown, 2405 LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC), 2406 LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2407 }, 2408 } 2409 originalTaint := UnreachableTaintTemplate 2410 _, ctx := ktesting.NewTestContext(t) 2411 nodeController, _ := newNodeLifecycleControllerFromClient( 2412 ctx, 2413 fakeNodeHandler, 2414 testRateLimiterQPS, 2415 testRateLimiterQPS, 2416 testLargeClusterThreshold, 2417 testUnhealthyThreshold, 2418 testNodeMonitorGracePeriod, 2419 testNodeStartupGracePeriod, 2420 testNodeMonitorPeriod, 2421 ) 2422 nodeController.now = func() metav1.Time { return fakeNow } 2423 nodeController.recorder = testutil.NewFakeRecorder() 2424 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) 2425 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 2426 t.Errorf("unexpected error: %v", err) 2427 } 2428 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2429 t.Errorf("unexpected error: %v", err) 2430 } 2431 nodeController.doNoExecuteTaintingPass(ctx) 2432 node0, err := fakeNodeHandler.Get(ctx, "node0", 
metav1.GetOptions{}) 2433 if err != nil { 2434 t.Errorf("Can't get current node0...") 2435 return 2436 } 2437 if !taintutils.TaintExists(node0.Spec.Taints, UnreachableTaintTemplate) { 2438 t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints) 2439 } 2440 node2, err := fakeNodeHandler.Get(ctx, "node2", metav1.GetOptions{}) 2441 if err != nil { 2442 t.Errorf("Can't get current node2...") 2443 return 2444 } 2445 if !taintutils.TaintExists(node2.Spec.Taints, NotReadyTaintTemplate) { 2446 t.Errorf("Can't find taint %v in %v", NotReadyTaintTemplate, node2.Spec.Taints) 2447 } 2448 2449 // Make node2 healthy again. 2450 node2.Status = healthyNodeNewStatus 2451 _, err = fakeNodeHandler.UpdateStatus(ctx, node2, metav1.UpdateOptions{}) 2452 if err != nil { 2453 t.Errorf(err.Error()) 2454 return 2455 } 2456 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 2457 t.Errorf("unexpected error: %v", err) 2458 } 2459 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2460 t.Errorf("unexpected error: %v", err) 2461 } 2462 nodeController.doNoExecuteTaintingPass(ctx) 2463 2464 node2, err = fakeNodeHandler.Get(ctx, "node2", metav1.GetOptions{}) 2465 if err != nil { 2466 t.Errorf("Can't get current node2...") 2467 return 2468 } 2469 // We should not see any taint on the node (especially the Not-Ready taint with NoExecute effect). 2470 if taintutils.TaintExists(node2.Spec.Taints, NotReadyTaintTemplate) || len(node2.Spec.Taints) > 0 { 2471 t.Errorf("Found taint %v in %v, which should not be present", NotReadyTaintTemplate, node2.Spec.Taints) 2472 } 2473 2474 node3, err := fakeNodeHandler.Get(ctx, "node3", metav1.GetOptions{}) 2475 if err != nil { 2476 t.Errorf("Can't get current node3...") 2477 return 2478 } 2479 node3.Status = unhealthyNodeNewStatus 2480 _, err = fakeNodeHandler.UpdateStatus(ctx, node3, metav1.UpdateOptions{}) 2481 if err != nil { 2482 t.Errorf(err.Error()) 2483 return 2484 } 2485 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 2486 t.Errorf("unexpected error: %v", err) 2487 } 2488 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2489 t.Errorf("unexpected error: %v", err) 2490 } 2491 // Before the taint manager does its work, the node status has been replaced (for example by a merge-patch replace). 2492 node3.Status.Conditions = overrideNodeNewStatusConditions 2493 _, err = fakeNodeHandler.UpdateStatus(ctx, node3, metav1.UpdateOptions{}) 2494 if err != nil { 2495 t.Errorf(err.Error()) 2496 return 2497 } 2498 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 2499 t.Errorf("unexpected error: %v", err) 2500 } 2501 nodeController.doNoExecuteTaintingPass(ctx) 2502 node3, err = fakeNodeHandler.Get(ctx, "node3", metav1.GetOptions{}) 2503 if err != nil { 2504 t.Errorf("Can't get current node3...") 2505 return 2506 } 2507 // We should not see any taint on the node (especially the Not-Ready taint with NoExecute effect). 2508 if taintutils.TaintExists(node3.Spec.Taints, NotReadyTaintTemplate) || len(node3.Spec.Taints) > 0 { 2509 t.Errorf("Found taint %v in %v, which should not be present", NotReadyTaintTemplate, node3.Spec.Taints) 2510 } 2511 } 2512 2513 // TestApplyNoExecuteTaintsToNodesEnqueueTwice ensures we taint every node with NoExecute even if enqueued twice 2514 func TestApplyNoExecuteTaintsToNodesEnqueueTwice(t *testing.T) { 2515 // TODO: Remove skip once https://github.com/kubernetes/kubernetes/pull/114607 merges.
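// The test below calls monitorNodeHealth twice before the first tainting pass so that nodes are enqueued into the NoExecute tainting queue more than once; it then adds node3, node4 and node5 and verifies that the duplicated queue entries do not block the pass and that node3 and node5 still receive their taints.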
2516 if goruntime.GOOS == "windows" { 2517 t.Skip("Skipping test on Windows.") 2518 } 2519 fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) 2520 2521 fakeNodeHandler := &testutil.FakeNodeHandler{ 2522 Existing: []*v1.Node{ 2523 // Unreachable Taint with effect 'NoExecute' should be applied to this node. 2524 { 2525 ObjectMeta: metav1.ObjectMeta{ 2526 Name: "node0", 2527 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2528 Labels: map[string]string{ 2529 v1.LabelTopologyRegion: "region1", 2530 v1.LabelTopologyZone: "zone1", 2531 v1.LabelFailureDomainBetaRegion: "region1", 2532 v1.LabelFailureDomainBetaZone: "zone1", 2533 }, 2534 }, 2535 Status: v1.NodeStatus{ 2536 Conditions: []v1.NodeCondition{ 2537 { 2538 Type: v1.NodeReady, 2539 Status: v1.ConditionUnknown, 2540 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2541 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2542 }, 2543 }, 2544 }, 2545 }, 2546 // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady 2547 // we need second healthy node in tests. 2548 { 2549 ObjectMeta: metav1.ObjectMeta{ 2550 Name: "node1", 2551 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2552 Labels: map[string]string{ 2553 v1.LabelTopologyRegion: "region1", 2554 v1.LabelTopologyZone: "zone1", 2555 v1.LabelFailureDomainBetaRegion: "region1", 2556 v1.LabelFailureDomainBetaZone: "zone1", 2557 }, 2558 }, 2559 Status: v1.NodeStatus{ 2560 Conditions: []v1.NodeCondition{ 2561 { 2562 Type: v1.NodeReady, 2563 Status: v1.ConditionTrue, 2564 LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2565 LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2566 }, 2567 }, 2568 }, 2569 }, 2570 // NotReady Taint with NoExecute effect should be applied to this node. 
2571 { 2572 ObjectMeta: metav1.ObjectMeta{ 2573 Name: "node2", 2574 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2575 Labels: map[string]string{ 2576 v1.LabelTopologyRegion: "region1", 2577 v1.LabelTopologyZone: "zone1", 2578 v1.LabelFailureDomainBetaRegion: "region1", 2579 v1.LabelFailureDomainBetaZone: "zone1", 2580 }, 2581 }, 2582 Status: v1.NodeStatus{ 2583 Conditions: []v1.NodeCondition{ 2584 { 2585 Type: v1.NodeReady, 2586 Status: v1.ConditionFalse, 2587 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2588 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2589 }, 2590 }, 2591 }, 2592 }, 2593 }, 2594 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 2595 } 2596 healthyNodeNewStatus := v1.NodeStatus{ 2597 Conditions: []v1.NodeCondition{ 2598 { 2599 Type: v1.NodeReady, 2600 Status: v1.ConditionTrue, 2601 LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC), 2602 LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2603 }, 2604 }, 2605 } 2606 _, ctx := ktesting.NewTestContext(t) 2607 nodeController, _ := newNodeLifecycleControllerFromClient( 2608 ctx, 2609 fakeNodeHandler, 2610 testRateLimiterQPS, 2611 testRateLimiterQPS, 2612 testLargeClusterThreshold, 2613 testUnhealthyThreshold, 2614 testNodeMonitorGracePeriod, 2615 testNodeStartupGracePeriod, 2616 testNodeMonitorPeriod, 2617 ) 2618 nodeController.now = func() metav1.Time { return fakeNow } 2619 nodeController.recorder = testutil.NewFakeRecorder() 2620 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) 2621 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 2622 t.Errorf("unexpected error: %v", err) 2623 } 2624 // 1. monitor node health twice, add untainted node once 2625 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2626 t.Errorf("unexpected error: %v", err) 2627 } 2628 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2629 t.Errorf("unexpected error: %v", err) 2630 } 2631 2632 // 2. mark node0 healthy 2633 node0, err := fakeNodeHandler.Get(ctx, "node0", metav1.GetOptions{}) 2634 if err != nil { 2635 t.Errorf("Can't get current node0...") 2636 return 2637 } 2638 node0.Status = healthyNodeNewStatus 2639 _, err = fakeNodeHandler.UpdateStatus(ctx, node0, metav1.UpdateOptions{}) 2640 if err != nil { 2641 t.Errorf(err.Error()) 2642 return 2643 } 2644 2645 // add other notReady nodes 2646 fakeNodeHandler.Existing = append(fakeNodeHandler.Existing, []*v1.Node{ 2647 // Unreachable Taint with effect 'NoExecute' should be applied to this node. 2648 { 2649 ObjectMeta: metav1.ObjectMeta{ 2650 Name: "node3", 2651 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2652 Labels: map[string]string{ 2653 v1.LabelTopologyRegion: "region1", 2654 v1.LabelTopologyZone: "zone1", 2655 v1.LabelFailureDomainBetaRegion: "region1", 2656 v1.LabelFailureDomainBetaZone: "zone1", 2657 }, 2658 }, 2659 Status: v1.NodeStatus{ 2660 Conditions: []v1.NodeCondition{ 2661 { 2662 Type: v1.NodeReady, 2663 Status: v1.ConditionUnknown, 2664 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2665 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2666 }, 2667 }, 2668 }, 2669 }, 2670 // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady 2671 // we need second healthy node in tests. 
2672 { 2673 ObjectMeta: metav1.ObjectMeta{ 2674 Name: "node4", 2675 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2676 Labels: map[string]string{ 2677 v1.LabelTopologyRegion: "region1", 2678 v1.LabelTopologyZone: "zone1", 2679 v1.LabelFailureDomainBetaRegion: "region1", 2680 v1.LabelFailureDomainBetaZone: "zone1", 2681 }, 2682 }, 2683 Status: v1.NodeStatus{ 2684 Conditions: []v1.NodeCondition{ 2685 { 2686 Type: v1.NodeReady, 2687 Status: v1.ConditionTrue, 2688 LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2689 LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2690 }, 2691 }, 2692 }, 2693 }, 2694 // NotReady Taint with NoExecute effect should be applied to this node. 2695 { 2696 ObjectMeta: metav1.ObjectMeta{ 2697 Name: "node5", 2698 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2699 Labels: map[string]string{ 2700 v1.LabelTopologyRegion: "region1", 2701 v1.LabelTopologyZone: "zone1", 2702 v1.LabelFailureDomainBetaRegion: "region1", 2703 v1.LabelFailureDomainBetaZone: "zone1", 2704 }, 2705 }, 2706 Status: v1.NodeStatus{ 2707 Conditions: []v1.NodeCondition{ 2708 { 2709 Type: v1.NodeReady, 2710 Status: v1.ConditionFalse, 2711 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2712 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2713 }, 2714 }, 2715 }, 2716 }, 2717 }...) 2718 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 2719 t.Errorf("unexpected error: %v", err) 2720 } 2721 // 3. start monitor node health again, add untainted node twice, construct UniqueQueue with duplicated node cache 2722 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2723 t.Errorf("unexpected error: %v", err) 2724 } 2725 2726 // 4. do NoExecute taint pass 2727 // when processing with node0, condition.Status is NodeReady, and return true with default case 2728 // then remove the set value and queue value both, the taint job never stuck 2729 nodeController.doNoExecuteTaintingPass(ctx) 2730 2731 // 5. 
get node3 and node5, and check whether they have already got the NoExecute taint 2732 node3, err := fakeNodeHandler.Get(ctx, "node3", metav1.GetOptions{}) 2733 if err != nil { 2734 t.Errorf("Can't get current node3...") 2735 return 2736 } 2737 if !taintutils.TaintExists(node3.Spec.Taints, UnreachableTaintTemplate) || len(node3.Spec.Taints) == 0 { 2738 t.Errorf("Not found taint %v in %v, which should be present in %s", UnreachableTaintTemplate, node3.Spec.Taints, node3.Name) 2739 } 2740 node5, err := fakeNodeHandler.Get(ctx, "node5", metav1.GetOptions{}) 2741 if err != nil { 2742 t.Errorf("Can't get current node5...") 2743 return 2744 } 2745 if !taintutils.TaintExists(node5.Spec.Taints, NotReadyTaintTemplate) || len(node5.Spec.Taints) == 0 { 2746 t.Errorf("Not found taint %v in %v, which should be present in %s", NotReadyTaintTemplate, node5.Spec.Taints, node5.Name) 2747 } 2748 } 2749 2750 func TestSwapUnreachableNotReadyTaints(t *testing.T) { 2751 fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) 2752 2753 fakeNodeHandler := &testutil.FakeNodeHandler{ 2754 Existing: []*v1.Node{ 2755 { 2756 ObjectMeta: metav1.ObjectMeta{ 2757 Name: "node0", 2758 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2759 Labels: map[string]string{ 2760 v1.LabelTopologyRegion: "region1", 2761 v1.LabelTopologyZone: "zone1", 2762 v1.LabelFailureDomainBetaRegion: "region1", 2763 v1.LabelFailureDomainBetaZone: "zone1", 2764 }, 2765 }, 2766 Status: v1.NodeStatus{ 2767 Conditions: []v1.NodeCondition{ 2768 { 2769 Type: v1.NodeReady, 2770 Status: v1.ConditionUnknown, 2771 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2772 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2773 }, 2774 }, 2775 }, 2776 }, 2777 // Because of the logic that prevents NC from evicting anything when all Nodes are NotReady 2778 // we need a second healthy node in tests. Because of how the tests are written we need to update 2779 // the status of this Node. 2780 { 2781 ObjectMeta: metav1.ObjectMeta{ 2782 Name: "node1", 2783 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2784 Labels: map[string]string{ 2785 v1.LabelTopologyRegion: "region1", 2786 v1.LabelTopologyZone: "zone1", 2787 v1.LabelFailureDomainBetaRegion: "region1", 2788 v1.LabelFailureDomainBetaZone: "zone1", 2789 }, 2790 }, 2791 Status: v1.NodeStatus{ 2792 Conditions: []v1.NodeCondition{ 2793 { 2794 Type: v1.NodeReady, 2795 Status: v1.ConditionTrue, 2796 LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2797 LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2798 }, 2799 }, 2800 }, 2801 }, 2802 }, 2803 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 2804 } 2805 newNodeStatus := v1.NodeStatus{ 2806 Conditions: []v1.NodeCondition{ 2807 { 2808 Type: v1.NodeReady, 2809 Status: v1.ConditionFalse, 2810 // Node status has just been updated, and is NotReady for 10min.
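// monitorNodeHealth initially sees node0 as Unknown and the first tainting pass applies UnreachableTaintTemplate (originalTaint); once node0 starts reporting Ready=ConditionFalse via newNodeStatus, the second pass is expected to swap that taint for NotReadyTaintTemplate (updatedTaint).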
2811 LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 9, 0, 0, time.UTC), 2812 LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2813 }, 2814 }, 2815 } 2816 healthyNodeNewStatus := v1.NodeStatus{ 2817 Conditions: []v1.NodeCondition{ 2818 { 2819 Type: v1.NodeReady, 2820 Status: v1.ConditionTrue, 2821 LastHeartbeatTime: metav1.Date(2017, 1, 1, 12, 10, 0, 0, time.UTC), 2822 LastTransitionTime: metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC), 2823 }, 2824 }, 2825 } 2826 originalTaint := UnreachableTaintTemplate 2827 updatedTaint := NotReadyTaintTemplate 2828 2829 _, ctx := ktesting.NewTestContext(t) 2830 nodeController, _ := newNodeLifecycleControllerFromClient( 2831 ctx, 2832 fakeNodeHandler, 2833 testRateLimiterQPS, 2834 testRateLimiterQPS, 2835 testLargeClusterThreshold, 2836 testUnhealthyThreshold, 2837 testNodeMonitorGracePeriod, 2838 testNodeStartupGracePeriod, 2839 testNodeMonitorPeriod, 2840 ) 2841 nodeController.now = func() metav1.Time { return fakeNow } 2842 nodeController.recorder = testutil.NewFakeRecorder() 2843 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) 2844 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 2845 t.Errorf("unexpected error: %v", err) 2846 } 2847 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2848 t.Errorf("unexpected error: %v", err) 2849 } 2850 nodeController.doNoExecuteTaintingPass(ctx) 2851 2852 node0, err := fakeNodeHandler.Get(ctx, "node0", metav1.GetOptions{}) 2853 if err != nil { 2854 t.Errorf("Can't get current node0...") 2855 return 2856 } 2857 node1, err := fakeNodeHandler.Get(ctx, "node1", metav1.GetOptions{}) 2858 if err != nil { 2859 t.Errorf("Can't get current node1...") 2860 return 2861 } 2862 2863 if originalTaint != nil && !taintutils.TaintExists(node0.Spec.Taints, originalTaint) { 2864 t.Errorf("Can't find taint %v in %v", originalTaint, node0.Spec.Taints) 2865 } 2866 2867 nodeController.now = func() metav1.Time { return metav1.Time{Time: fakeNow.Time} } 2868 2869 node0.Status = newNodeStatus 2870 node1.Status = healthyNodeNewStatus 2871 _, err = fakeNodeHandler.UpdateStatus(ctx, node0, metav1.UpdateOptions{}) 2872 if err != nil { 2873 t.Errorf(err.Error()) 2874 return 2875 } 2876 _, err = fakeNodeHandler.UpdateStatus(ctx, node1, metav1.UpdateOptions{}) 2877 if err != nil { 2878 t.Errorf(err.Error()) 2879 return 2880 } 2881 2882 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 2883 t.Errorf("unexpected error: %v", err) 2884 } 2885 if err := nodeController.monitorNodeHealth(ctx); err != nil { 2886 t.Errorf("unexpected error: %v", err) 2887 } 2888 nodeController.doNoExecuteTaintingPass(ctx) 2889 2890 node0, err = fakeNodeHandler.Get(ctx, "node0", metav1.GetOptions{}) 2891 if err != nil { 2892 t.Errorf("Can't get current node0...") 2893 return 2894 } 2895 if updatedTaint != nil { 2896 if !taintutils.TaintExists(node0.Spec.Taints, updatedTaint) { 2897 t.Errorf("Can't find taint %v in %v", updatedTaint, node0.Spec.Taints) 2898 } 2899 } 2900 } 2901 2902 func TestTaintsNodeByCondition(t *testing.T) { 2903 fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) 2904 2905 fakeNodeHandler := &testutil.FakeNodeHandler{ 2906 Existing: []*v1.Node{ 2907 { 2908 ObjectMeta: metav1.ObjectMeta{ 2909 Name: "node0", 2910 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2911 Labels: map[string]string{ 2912 v1.LabelTopologyRegion: "region1", 2913 v1.LabelTopologyZone: "zone1", 2914 v1.LabelFailureDomainBetaRegion: "region1", 2915 
v1.LabelFailureDomainBetaZone: "zone1", 2916 }, 2917 }, 2918 Status: v1.NodeStatus{ 2919 Conditions: []v1.NodeCondition{ 2920 { 2921 Type: v1.NodeReady, 2922 Status: v1.ConditionTrue, 2923 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2924 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2925 }, 2926 }, 2927 }, 2928 }, 2929 }, 2930 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 2931 } 2932 2933 _, ctx := ktesting.NewTestContext(t) 2934 nodeController, _ := newNodeLifecycleControllerFromClient( 2935 ctx, 2936 fakeNodeHandler, 2937 testRateLimiterQPS, 2938 testRateLimiterQPS, 2939 testLargeClusterThreshold, 2940 testUnhealthyThreshold, 2941 testNodeMonitorGracePeriod, 2942 testNodeStartupGracePeriod, 2943 testNodeMonitorPeriod, 2944 ) 2945 nodeController.now = func() metav1.Time { return fakeNow } 2946 nodeController.recorder = testutil.NewFakeRecorder() 2947 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) 2948 2949 networkUnavailableTaint := &v1.Taint{ 2950 Key: v1.TaintNodeNetworkUnavailable, 2951 Effect: v1.TaintEffectNoSchedule, 2952 } 2953 notReadyTaint := &v1.Taint{ 2954 Key: v1.TaintNodeNotReady, 2955 Effect: v1.TaintEffectNoSchedule, 2956 } 2957 unreachableTaint := &v1.Taint{ 2958 Key: v1.TaintNodeUnreachable, 2959 Effect: v1.TaintEffectNoSchedule, 2960 } 2961 2962 tests := []struct { 2963 Name string 2964 Node *v1.Node 2965 ExpectedTaints []*v1.Taint 2966 }{ 2967 { 2968 Name: "NetworkUnavailable is true", 2969 Node: &v1.Node{ 2970 ObjectMeta: metav1.ObjectMeta{ 2971 Name: "node0", 2972 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 2973 Labels: map[string]string{ 2974 v1.LabelTopologyRegion: "region1", 2975 v1.LabelTopologyZone: "zone1", 2976 v1.LabelFailureDomainBetaRegion: "region1", 2977 v1.LabelFailureDomainBetaZone: "zone1", 2978 }, 2979 }, 2980 Status: v1.NodeStatus{ 2981 Conditions: []v1.NodeCondition{ 2982 { 2983 Type: v1.NodeReady, 2984 Status: v1.ConditionTrue, 2985 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2986 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2987 }, 2988 { 2989 Type: v1.NodeNetworkUnavailable, 2990 Status: v1.ConditionTrue, 2991 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2992 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 2993 }, 2994 }, 2995 }, 2996 }, 2997 ExpectedTaints: []*v1.Taint{networkUnavailableTaint}, 2998 }, 2999 { 3000 Name: "NetworkUnavailable is true", 3001 Node: &v1.Node{ 3002 ObjectMeta: metav1.ObjectMeta{ 3003 Name: "node0", 3004 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 3005 Labels: map[string]string{ 3006 v1.LabelTopologyRegion: "region1", 3007 v1.LabelTopologyZone: "zone1", 3008 v1.LabelFailureDomainBetaRegion: "region1", 3009 v1.LabelFailureDomainBetaZone: "zone1", 3010 }, 3011 }, 3012 Status: v1.NodeStatus{ 3013 Conditions: []v1.NodeCondition{ 3014 { 3015 Type: v1.NodeReady, 3016 Status: v1.ConditionTrue, 3017 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 3018 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 3019 }, 3020 { 3021 Type: v1.NodeNetworkUnavailable, 3022 Status: v1.ConditionTrue, 3023 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 3024 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 3025 }, 3026 }, 3027 }, 3028 }, 3029 ExpectedTaints: []*v1.Taint{networkUnavailableTaint}, 3030 
}, 3031 { 3032 Name: "Ready is false", 3033 Node: &v1.Node{ 3034 ObjectMeta: metav1.ObjectMeta{ 3035 Name: "node0", 3036 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 3037 Labels: map[string]string{ 3038 v1.LabelTopologyRegion: "region1", 3039 v1.LabelTopologyZone: "zone1", 3040 v1.LabelFailureDomainBetaRegion: "region1", 3041 v1.LabelFailureDomainBetaZone: "zone1", 3042 }, 3043 }, 3044 Status: v1.NodeStatus{ 3045 Conditions: []v1.NodeCondition{ 3046 { 3047 Type: v1.NodeReady, 3048 Status: v1.ConditionFalse, 3049 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 3050 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 3051 }, 3052 }, 3053 }, 3054 }, 3055 ExpectedTaints: []*v1.Taint{notReadyTaint}, 3056 }, 3057 { 3058 Name: "Ready is unknown", 3059 Node: &v1.Node{ 3060 ObjectMeta: metav1.ObjectMeta{ 3061 Name: "node0", 3062 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 3063 Labels: map[string]string{ 3064 v1.LabelTopologyRegion: "region1", 3065 v1.LabelTopologyZone: "zone1", 3066 v1.LabelFailureDomainBetaRegion: "region1", 3067 v1.LabelFailureDomainBetaZone: "zone1", 3068 }, 3069 }, 3070 Status: v1.NodeStatus{ 3071 Conditions: []v1.NodeCondition{ 3072 { 3073 Type: v1.NodeReady, 3074 Status: v1.ConditionUnknown, 3075 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 3076 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 3077 }, 3078 }, 3079 }, 3080 }, 3081 ExpectedTaints: []*v1.Taint{unreachableTaint}, 3082 }, 3083 } 3084 3085 for _, test := range tests { 3086 fakeNodeHandler.Update(ctx, test.Node, metav1.UpdateOptions{}) 3087 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 3088 t.Errorf("unexpected error: %v", err) 3089 } 3090 nodeController.doNoScheduleTaintingPass(ctx, test.Node.Name) 3091 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 3092 t.Errorf("unexpected error: %v", err) 3093 } 3094 node0, err := nodeController.nodeLister.Get("node0") 3095 if err != nil { 3096 t.Errorf("Can't get current node0...") 3097 return 3098 } 3099 if len(node0.Spec.Taints) != len(test.ExpectedTaints) { 3100 t.Errorf("%s: Unexpected number of taints: expected %d, got %d", 3101 test.Name, len(test.ExpectedTaints), len(node0.Spec.Taints)) 3102 } 3103 for _, taint := range test.ExpectedTaints { 3104 if !taintutils.TaintExists(node0.Spec.Taints, taint) { 3105 t.Errorf("%s: Can't find taint %v in %v", test.Name, taint, node0.Spec.Taints) 3106 } 3107 } 3108 } 3109 } 3110 3111 func TestNodeEventGeneration(t *testing.T) { 3112 fakeNow := metav1.Date(2016, 9, 10, 12, 0, 0, 0, time.UTC) 3113 fakeNodeHandler := &testutil.FakeNodeHandler{ 3114 Existing: []*v1.Node{ 3115 { 3116 ObjectMeta: metav1.ObjectMeta{ 3117 Name: "node0", 3118 UID: "1234567890", 3119 CreationTimestamp: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC), 3120 }, 3121 Status: v1.NodeStatus{ 3122 Conditions: []v1.NodeCondition{ 3123 { 3124 Type: v1.NodeReady, 3125 Status: v1.ConditionUnknown, 3126 LastHeartbeatTime: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC), 3127 LastTransitionTime: metav1.Date(2015, 8, 10, 0, 0, 0, 0, time.UTC), 3128 }, 3129 }, 3130 }, 3131 }, 3132 }, 3133 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 3134 } 3135 3136 _, ctx := ktesting.NewTestContext(t) 3137 nodeController, _ := newNodeLifecycleControllerFromClient( 3138 ctx, 3139 fakeNodeHandler, 3140 testRateLimiterQPS, 3141 testRateLimiterQPS, 3142 testLargeClusterThreshold, 
3143 testUnhealthyThreshold, 3144 testNodeMonitorGracePeriod, 3145 testNodeStartupGracePeriod, 3146 testNodeMonitorPeriod, 3147 ) 3148 nodeController.now = func() metav1.Time { return fakeNow } 3149 fakeRecorder := testutil.NewFakeRecorder() 3150 nodeController.recorder = fakeRecorder 3151 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) 3152 3153 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 3154 t.Errorf("unexpected error: %v", err) 3155 } 3156 if err := nodeController.monitorNodeHealth(ctx); err != nil { 3157 t.Errorf("unexpected error: %v", err) 3158 } 3159 if len(fakeRecorder.Events) != 1 { 3160 t.Fatalf("unexpected events, got %v, expected %v: %+v", len(fakeRecorder.Events), 1, fakeRecorder.Events) 3161 } 3162 if fakeRecorder.Events[0].Reason != "RegisteredNode" { 3163 var reasons []string 3164 for _, event := range fakeRecorder.Events { 3165 reasons = append(reasons, event.Reason) 3166 } 3167 t.Fatalf("unexpected events generation: %v", strings.Join(reasons, ",")) 3168 } 3169 for _, event := range fakeRecorder.Events { 3170 involvedObject := event.InvolvedObject 3171 actualUID := string(involvedObject.UID) 3172 if actualUID != "1234567890" { 3173 t.Fatalf("unexpected event uid: %v", actualUID) 3174 } 3175 } 3176 } 3177 3178 func TestReconcileNodeLabels(t *testing.T) { 3179 fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) 3180 3181 fakeNodeHandler := &testutil.FakeNodeHandler{ 3182 Existing: []*v1.Node{ 3183 { 3184 ObjectMeta: metav1.ObjectMeta{ 3185 Name: "node0", 3186 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 3187 Labels: map[string]string{ 3188 v1.LabelTopologyRegion: "region1", 3189 v1.LabelTopologyZone: "zone1", 3190 v1.LabelFailureDomainBetaRegion: "region1", 3191 v1.LabelFailureDomainBetaZone: "zone1", 3192 }, 3193 }, 3194 Status: v1.NodeStatus{ 3195 Conditions: []v1.NodeCondition{ 3196 { 3197 Type: v1.NodeReady, 3198 Status: v1.ConditionTrue, 3199 LastHeartbeatTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 3200 LastTransitionTime: metav1.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC), 3201 }, 3202 }, 3203 }, 3204 }, 3205 }, 3206 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 3207 } 3208 3209 _, ctx := ktesting.NewTestContext(t) 3210 nodeController, _ := newNodeLifecycleControllerFromClient( 3211 ctx, 3212 fakeNodeHandler, 3213 testRateLimiterQPS, 3214 testRateLimiterQPS, 3215 testLargeClusterThreshold, 3216 testUnhealthyThreshold, 3217 testNodeMonitorGracePeriod, 3218 testNodeStartupGracePeriod, 3219 testNodeMonitorPeriod, 3220 ) 3221 nodeController.now = func() metav1.Time { return fakeNow } 3222 nodeController.recorder = testutil.NewFakeRecorder() 3223 nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) 3224 3225 tests := []struct { 3226 Name string 3227 Node *v1.Node 3228 ExpectedLabels map[string]string 3229 }{ 3230 { 3231 Name: "No-op if node has no labels", 3232 Node: &v1.Node{ 3233 ObjectMeta: metav1.ObjectMeta{ 3234 Name: "node0", 3235 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 3236 }, 3237 }, 3238 ExpectedLabels: nil, 3239 }, 3240 { 3241 Name: "No-op if no target labels present", 3242 Node: &v1.Node{ 3243 ObjectMeta: metav1.ObjectMeta{ 3244 Name: "node0", 3245 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 3246 Labels: map[string]string{ 3247 v1.LabelTopologyRegion: "region1", 3248 }, 3249 }, 3250 }, 3251 ExpectedLabels: 
map[string]string{ 3252 v1.LabelTopologyRegion: "region1", 3253 }, 3254 }, 3255 { 3256 Name: "Create OS/arch beta labels when they don't exist", 3257 Node: &v1.Node{ 3258 ObjectMeta: metav1.ObjectMeta{ 3259 Name: "node0", 3260 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 3261 Labels: map[string]string{ 3262 v1.LabelOSStable: "linux", 3263 v1.LabelArchStable: "amd64", 3264 }, 3265 }, 3266 }, 3267 ExpectedLabels: map[string]string{ 3268 kubeletapis.LabelOS: "linux", 3269 kubeletapis.LabelArch: "amd64", 3270 v1.LabelOSStable: "linux", 3271 v1.LabelArchStable: "amd64", 3272 }, 3273 }, 3274 { 3275 Name: "Reconcile OS/arch beta labels to match stable labels", 3276 Node: &v1.Node{ 3277 ObjectMeta: metav1.ObjectMeta{ 3278 Name: "node0", 3279 CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), 3280 Labels: map[string]string{ 3281 kubeletapis.LabelOS: "windows", 3282 kubeletapis.LabelArch: "arm", 3283 v1.LabelOSStable: "linux", 3284 v1.LabelArchStable: "amd64", 3285 }, 3286 }, 3287 }, 3288 ExpectedLabels: map[string]string{ 3289 kubeletapis.LabelOS: "linux", 3290 kubeletapis.LabelArch: "amd64", 3291 v1.LabelOSStable: "linux", 3292 v1.LabelArchStable: "amd64", 3293 }, 3294 }, 3295 } 3296 3297 for _, test := range tests { 3298 fakeNodeHandler.Update(ctx, test.Node, metav1.UpdateOptions{}) 3299 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 3300 t.Fatalf("unexpected error: %v", err) 3301 } 3302 nodeController.reconcileNodeLabels(ctx, test.Node.Name) 3303 if err := nodeController.syncNodeStore(fakeNodeHandler); err != nil { 3304 t.Fatalf("unexpected error: %v", err) 3305 } 3306 node0, err := nodeController.nodeLister.Get("node0") 3307 if err != nil { 3308 t.Fatalf("Can't get current node0...") 3309 } 3310 if len(node0.Labels) != len(test.ExpectedLabels) { 3311 t.Errorf("%s: Unexpected number of labels: expected %d, got %d", 3312 test.Name, len(test.ExpectedLabels), len(node0.Labels)) 3313 } 3314 for key, expectedValue := range test.ExpectedLabels { 3315 actualValue, ok := node0.Labels[key] 3316 if !ok { 3317 t.Errorf("%s: Can't find label %v in %v", test.Name, key, node0.Labels) 3318 } 3319 if actualValue != expectedValue { 3320 t.Errorf("%s: label %q: expected value %q, got value %q", test.Name, key, expectedValue, actualValue) 3321 } 3322 } 3323 } 3324 } 3325 3326 func TestTryUpdateNodeHealth(t *testing.T) { 3327 fakeNow := metav1.Date(2017, 1, 1, 12, 0, 0, 0, time.UTC) 3328 fakeOld := metav1.Date(2016, 1, 1, 12, 0, 0, 0, time.UTC) 3329 3330 fakeNodeHandler := &testutil.FakeNodeHandler{ 3331 Existing: []*v1.Node{ 3332 { 3333 ObjectMeta: metav1.ObjectMeta{ 3334 Name: "node0", 3335 CreationTimestamp: fakeNow, 3336 }, 3337 Status: v1.NodeStatus{ 3338 Conditions: []v1.NodeCondition{ 3339 { 3340 Type: v1.NodeReady, 3341 Status: v1.ConditionTrue, 3342 LastHeartbeatTime: fakeNow, 3343 LastTransitionTime: fakeNow, 3344 }, 3345 }, 3346 }, 3347 }, 3348 }, 3349 Clientset: fake.NewSimpleClientset(&v1.PodList{Items: []v1.Pod{*testutil.NewPod("pod0", "node0")}}), 3350 } 3351 3352 _, ctx := ktesting.NewTestContext(t) 3353 nodeController, _ := newNodeLifecycleControllerFromClient( 3354 ctx, 3355 fakeNodeHandler, 3356 testRateLimiterQPS, 3357 testRateLimiterQPS, 3358 testLargeClusterThreshold, 3359 testUnhealthyThreshold, 3360 testNodeMonitorGracePeriod, 3361 testNodeStartupGracePeriod, 3362 testNodeMonitorPeriod, 3363 ) 3364 nodeController.now = func() metav1.Time { return fakeNow } 3365 nodeController.recorder = testutil.NewFakeRecorder() 3366
nodeController.getPodsAssignedToNode = fakeGetPodsAssignedToNode(fakeNodeHandler.Clientset) 3367 3368 getStatus := func(cond *v1.NodeCondition) *v1.ConditionStatus { 3369 if cond == nil { 3370 return nil 3371 } 3372 return &cond.Status 3373 } 3374 3375 tests := []struct { 3376 name string 3377 node *v1.Node 3378 }{ 3379 { 3380 name: "Status true", 3381 node: &v1.Node{ 3382 ObjectMeta: metav1.ObjectMeta{ 3383 Name: "node0", 3384 CreationTimestamp: fakeNow, 3385 }, 3386 Status: v1.NodeStatus{ 3387 Conditions: []v1.NodeCondition{ 3388 { 3389 Type: v1.NodeReady, 3390 Status: v1.ConditionTrue, 3391 LastHeartbeatTime: fakeNow, 3392 LastTransitionTime: fakeNow, 3393 }, 3394 }, 3395 }, 3396 }, 3397 }, 3398 { 3399 name: "Status false", 3400 node: &v1.Node{ 3401 ObjectMeta: metav1.ObjectMeta{ 3402 Name: "node0", 3403 CreationTimestamp: fakeNow, 3404 }, 3405 Status: v1.NodeStatus{ 3406 Conditions: []v1.NodeCondition{ 3407 { 3408 Type: v1.NodeReady, 3409 Status: v1.ConditionFalse, 3410 LastHeartbeatTime: fakeNow, 3411 LastTransitionTime: fakeNow, 3412 }, 3413 }, 3414 }, 3415 }, 3416 }, 3417 { 3418 name: "Status unknown", 3419 node: &v1.Node{ 3420 ObjectMeta: metav1.ObjectMeta{ 3421 Name: "node0", 3422 CreationTimestamp: fakeNow, 3423 }, 3424 Status: v1.NodeStatus{ 3425 Conditions: []v1.NodeCondition{ 3426 { 3427 Type: v1.NodeReady, 3428 Status: v1.ConditionUnknown, 3429 LastHeartbeatTime: fakeNow, 3430 LastTransitionTime: fakeNow, 3431 }, 3432 }, 3433 }, 3434 }, 3435 }, 3436 { 3437 name: "Status nil", 3438 node: &v1.Node{ 3439 ObjectMeta: metav1.ObjectMeta{ 3440 Name: "node0", 3441 CreationTimestamp: fakeNow, 3442 }, 3443 Status: v1.NodeStatus{ 3444 Conditions: []v1.NodeCondition{}, 3445 }, 3446 }, 3447 }, 3448 { 3449 name: "Status true - after grace period", 3450 node: &v1.Node{ 3451 ObjectMeta: metav1.ObjectMeta{ 3452 Name: "node0", 3453 CreationTimestamp: fakeOld, 3454 }, 3455 Status: v1.NodeStatus{ 3456 Conditions: []v1.NodeCondition{ 3457 { 3458 Type: v1.NodeReady, 3459 Status: v1.ConditionTrue, 3460 LastHeartbeatTime: fakeOld, 3461 LastTransitionTime: fakeOld, 3462 }, 3463 }, 3464 }, 3465 }, 3466 }, 3467 { 3468 name: "Status false - after grace period", 3469 node: &v1.Node{ 3470 ObjectMeta: metav1.ObjectMeta{ 3471 Name: "node0", 3472 CreationTimestamp: fakeOld, 3473 }, 3474 Status: v1.NodeStatus{ 3475 Conditions: []v1.NodeCondition{ 3476 { 3477 Type: v1.NodeReady, 3478 Status: v1.ConditionFalse, 3479 LastHeartbeatTime: fakeOld, 3480 LastTransitionTime: fakeOld, 3481 }, 3482 }, 3483 }, 3484 }, 3485 }, 3486 { 3487 name: "Status unknown - after grace period", 3488 node: &v1.Node{ 3489 ObjectMeta: metav1.ObjectMeta{ 3490 Name: "node0", 3491 CreationTimestamp: fakeOld, 3492 }, 3493 Status: v1.NodeStatus{ 3494 Conditions: []v1.NodeCondition{ 3495 { 3496 Type: v1.NodeReady, 3497 Status: v1.ConditionUnknown, 3498 LastHeartbeatTime: fakeOld, 3499 LastTransitionTime: fakeOld, 3500 }, 3501 }, 3502 }, 3503 }, 3504 }, 3505 { 3506 name: "Status nil - after grace period", 3507 node: &v1.Node{ 3508 ObjectMeta: metav1.ObjectMeta{ 3509 Name: "node0", 3510 CreationTimestamp: fakeOld, 3511 }, 3512 Status: v1.NodeStatus{ 3513 Conditions: []v1.NodeCondition{}, 3514 }, 3515 }, 3516 }, 3517 } 3518 3519 for _, test := range tests { 3520 t.Run(test.name, func(t *testing.T) { 3521 nodeController.nodeHealthMap.set(test.node.Name, &nodeHealthData{ 3522 status: &test.node.Status, 3523 probeTimestamp: test.node.CreationTimestamp, 3524 readyTransitionTimestamp: test.node.CreationTimestamp, 3525 }) 3526 _, _, 
currentReadyCondition, err := nodeController.tryUpdateNodeHealth(ctx, test.node) 3527 if err != nil { 3528 t.Fatalf("unexpected error: %v", err) 3529 } 3530 _, savedReadyCondition := controllerutil.GetNodeCondition(nodeController.nodeHealthMap.getDeepCopy(test.node.Name).status, v1.NodeReady) 3531 savedStatus := getStatus(savedReadyCondition) 3532 currentStatus := getStatus(currentReadyCondition) 3533 if !apiequality.Semantic.DeepEqual(currentStatus, savedStatus) { 3534 t.Errorf("expected %v, got %v", savedStatus, currentStatus) 3535 } 3536 }) 3537 } 3538 } 3539 3540 func Test_isNodeExcludedFromDisruptionChecks(t *testing.T) { 3541 validNodeStatus := v1.NodeStatus{Conditions: []v1.NodeCondition{{Type: "Test"}}} 3542 tests := []struct { 3543 name string 3544 3545 input *v1.Node 3546 want bool 3547 }{ 3548 {want: false, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{}}}}, 3549 {want: false, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Name: "master-abc"}}}, 3550 {want: true, input: &v1.Node{Status: validNodeStatus, ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{labelNodeDisruptionExclusion: ""}}}}, 3551 } 3552 for _, tt := range tests { 3553 t.Run(tt.name, func(t *testing.T) { 3554 if result := isNodeExcludedFromDisruptionChecks(tt.input); result != tt.want { 3555 t.Errorf("isNodeExcludedFromDisruptionChecks() = %v, want %v", result, tt.want) 3556 } 3557 }) 3558 } 3559 }
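// The verb/resource/subresource filter used when counting pod status writes is repeated
// verbatim in several of the tests above (TestMonitorNodeHealth, TestMonitorNodeHealthMarkPodsNotReady
// and the worker-size and retry variants). The helper below is an illustrative sketch only, not part
// of the original file: countPodStatusUpdates is a hypothetical name, and the sketch assumes the
// []testcore.Action slice returned by FakeNodeHandler.Actions() exactly as those tests already use it.
func countPodStatusUpdates(actions []testcore.Action) int {
	updates := 0
	for _, action := range actions {
		// A pod status write surfaces as an "update" on the "pods" resource with the
		// "status" subresource, which is the same condition the tests above check inline.
		if action.GetVerb() == "update" && action.GetResource().Resource == "pods" && action.GetSubresource() == "status" {
			updates++
		}
	}
	return updates
}

// Usage would mirror the existing loops, e.g.:
//   podStatusUpdates := countPodStatusUpdates(item.fakeNodeHandler.Actions())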