k8s.io/kubernetes@v1.29.3/test/integration/node/lifecycle_test.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package node
    18  
    19  import (
    20  	"fmt"
    21  	"testing"
    22  	"time"
    23  
    24  	v1 "k8s.io/api/core/v1"
    25  	"k8s.io/apimachinery/pkg/api/resource"
    26  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    27  	"k8s.io/apimachinery/pkg/util/wait"
    28  	"k8s.io/apiserver/pkg/admission"
    29  	"k8s.io/apiserver/pkg/util/feature"
    30  	"k8s.io/client-go/informers"
    31  	clientset "k8s.io/client-go/kubernetes"
    32  	restclient "k8s.io/client-go/rest"
    33  	featuregatetesting "k8s.io/component-base/featuregate/testing"
    34  	"k8s.io/klog/v2"
    35  	"k8s.io/kubernetes/cmd/kube-controller-manager/names"
    36  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    37  	"k8s.io/kubernetes/pkg/controller/nodelifecycle"
    38  	"k8s.io/kubernetes/pkg/controller/tainteviction"
    39  	"k8s.io/kubernetes/pkg/features"
    40  	"k8s.io/kubernetes/plugin/pkg/admission/defaulttolerationseconds"
    41  	"k8s.io/kubernetes/plugin/pkg/admission/podtolerationrestriction"
    42  	pluginapi "k8s.io/kubernetes/plugin/pkg/admission/podtolerationrestriction/apis/podtolerationrestriction"
    43  	testutils "k8s.io/kubernetes/test/integration/util"
    44  	imageutils "k8s.io/kubernetes/test/utils/image"
    45  )
    46  
    47  // TestEvictionForNoExecuteTaintAddedByUser tests taint-based eviction for a node tainted NoExecute
    48  func TestEvictionForNoExecuteTaintAddedByUser(t *testing.T) {
    49  	// we need at least 2 nodes to prevent lifecycle manager from entering "fully-disrupted" mode
    50  	nodeCount := 3
    51  	nodeIndex := 1 // the exact node doesn't matter, pick one
    52  
    53  	tests := map[string]struct {
    54  		enablePodDisruptionConditions          bool
    55  		enableSeparateTaintEvictionController  bool
    56  		startStandaloneTaintEvictionController bool
    57  		wantPodEvicted                         bool
    58  	}{
    59  		"Test eviction for NoExecute taint added by user; pod condition added when PodDisruptionConditions enabled; separate taint eviction controller disabled": {
    60  			enablePodDisruptionConditions:          true,
    61  			enableSeparateTaintEvictionController:  false,
    62  			startStandaloneTaintEvictionController: false,
    63  			wantPodEvicted:                         true,
    64  		},
    65  		"Test eviction for NoExecute taint added by user; no pod condition added when PodDisruptionConditions disabled; separate taint eviction controller disabled": {
    66  			enablePodDisruptionConditions:          false,
    67  			enableSeparateTaintEvictionController:  false,
    68  			startStandaloneTaintEvictionController: false,
    69  			wantPodEvicted:                         true,
    70  		},
    71  		"Test eviction for NoExecute taint added by user; separate taint eviction controller enabled but not started": {
    72  			enablePodDisruptionConditions:          false,
    73  			enableSeparateTaintEvictionController:  true,
    74  			startStandaloneTaintEvictionController: false,
    75  			wantPodEvicted:                         false,
    76  		},
    77  		"Test eviction for NoExecute taint added by user; separate taint eviction controller enabled and started": {
    78  			enablePodDisruptionConditions:          false,
    79  			enableSeparateTaintEvictionController:  true,
    80  			startStandaloneTaintEvictionController: true,
    81  			wantPodEvicted:                         true,
    82  		},
    83  	}
    84  
    85  	for name, test := range tests {
    86  		t.Run(name, func(t *testing.T) {
    87  			var nodes []*v1.Node
    88  			for i := 0; i < nodeCount; i++ {
    89  				node := &v1.Node{
    90  					ObjectMeta: metav1.ObjectMeta{
    91  						Name:   fmt.Sprintf("testnode-%d", i),
    92  						Labels: map[string]string{"node.kubernetes.io/exclude-disruption": "true"},
    93  					},
    94  					Spec: v1.NodeSpec{},
    95  					Status: v1.NodeStatus{
    96  						Conditions: []v1.NodeCondition{
    97  							{
    98  								Type:   v1.NodeReady,
    99  								Status: v1.ConditionTrue,
   100  							},
   101  						},
   102  					},
   103  				}
   104  				nodes = append(nodes, node)
   105  			}
   106  			testPod := &v1.Pod{
   107  				ObjectMeta: metav1.ObjectMeta{
   108  					Name: "testpod",
   109  				},
   110  				Spec: v1.PodSpec{
   111  					NodeName: nodes[nodeIndex].Name,
   112  					Containers: []v1.Container{
   113  						{Name: "container", Image: imageutils.GetPauseImageName()},
   114  					},
   115  				},
   116  				Status: v1.PodStatus{
   117  					Phase: v1.PodRunning,
   118  					Conditions: []v1.PodCondition{
   119  						{
   120  							Type:   v1.PodReady,
   121  							Status: v1.ConditionTrue,
   122  						},
   123  					},
   124  				},
   125  			}
   126  
   127  			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodDisruptionConditions, test.enablePodDisruptionConditions)()
   128  			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.SeparateTaintEvictionController, test.enableSeparateTaintEvictionController)()
   129  			testCtx := testutils.InitTestAPIServer(t, "taint-no-execute", nil)
   130  			cs := testCtx.ClientSet
   131  
   132  			// Build clientset and informers for controllers.
   133  			externalClientConfig := restclient.CopyConfig(testCtx.KubeConfig)
   134  			externalClientConfig.QPS = -1
   135  			externalClientset := clientset.NewForConfigOrDie(externalClientConfig)
   136  			externalInformers := informers.NewSharedInformerFactory(externalClientset, time.Second)
   137  
   138  			// Start NodeLifecycleController for taint.
   139  			nc, err := nodelifecycle.NewNodeLifecycleController(
   140  				testCtx.Ctx,
   141  				externalInformers.Coordination().V1().Leases(),
   142  				externalInformers.Core().V1().Pods(),
   143  				externalInformers.Core().V1().Nodes(),
   144  				externalInformers.Apps().V1().DaemonSets(),
   145  				cs,
   146  				1*time.Second,    // Node monitor grace period
   147  				time.Minute,      // Node startup grace period
   148  				time.Millisecond, // Node monitor period
   149  				100,              // Eviction limiter QPS
   150  				100,              // Secondary eviction limiter QPS
   151  				50,               // Large cluster threshold
   152  				0.55,             // Unhealthy zone threshold
   153  			)
   154  			if err != nil {
   155  				t.Fatalf("Failed to create node controller: %v", err)
   156  			}
   157  
   158  			// Waiting for all controllers to sync
   159  			externalInformers.Start(testCtx.Ctx.Done())
   160  			externalInformers.WaitForCacheSync(testCtx.Ctx.Done())
   161  
   162  			// Run all controllers
   163  			go nc.Run(testCtx.Ctx)
   164  
   165  			// Start TaintManager
   166  			if test.startStandaloneTaintEvictionController {
   167  				tm, _ := tainteviction.New(
   168  					testCtx.Ctx,
   169  					testCtx.ClientSet,
   170  					externalInformers.Core().V1().Pods(),
   171  					externalInformers.Core().V1().Nodes(),
   172  					names.TaintEvictionController,
   173  				)
   174  				go tm.Run(testCtx.Ctx)
   175  			}
   176  
   177  			for index := range nodes {
   178  				nodes[index], err = cs.CoreV1().Nodes().Create(testCtx.Ctx, nodes[index], metav1.CreateOptions{})
   179  				if err != nil {
   180  					t.Fatalf("Failed to create node, err: %v", err)
   181  				}
   182  			}
   183  
   184  			testPod, err = cs.CoreV1().Pods(testCtx.NS.Name).Create(testCtx.Ctx, testPod, metav1.CreateOptions{})
   185  			if err != nil {
   186  				t.Fatalf("Test Failed: error: %v, while creating pod", err)
   187  			}
   188  
   189  			if err := testutils.AddTaintToNode(cs, nodes[nodeIndex].Name, v1.Taint{Key: "CustomTaintByUser", Effect: v1.TaintEffectNoExecute}); err != nil {
   190  				t.Errorf("Failed to taint node in test %s <%s>, err: %v", name, nodes[nodeIndex].Name, err)
   191  			}
   192  
   193  			err = wait.PollUntilContextTimeout(testCtx.Ctx, time.Second, time.Second*20, true, testutils.PodIsGettingEvicted(cs, testPod.Namespace, testPod.Name))
   194  			if err != nil && test.wantPodEvicted {
   195  				t.Fatalf("Test Failed: error %v while waiting for pod %q to be evicted", err, klog.KObj(testPod))
   196  			} else if !wait.Interrupted(err) && !test.wantPodEvicted {
   197  				t.Fatalf("Test Failed: unexpected eviction of pod %q", klog.KObj(testPod))
   198  			}
   199  
   200  			testPod, err = cs.CoreV1().Pods(testCtx.NS.Name).Get(testCtx.Ctx, testPod.Name, metav1.GetOptions{})
   201  			if err != nil {
   202  				t.Fatalf("Test Failed: error: %q, while getting updated pod", err)
   203  			}
   204  			_, cond := podutil.GetPodCondition(&testPod.Status, v1.DisruptionTarget)
   205  			if test.enablePodDisruptionConditions && cond == nil {
   206  				t.Errorf("Pod %q does not have the expected condition: %q", klog.KObj(testPod), v1.DisruptionTarget)
   207  			} else if !test.enablePodDisruptionConditions && cond != nil {
   208  				t.Errorf("Pod %q has an unexpected condition: %q", klog.KObj(testPod), v1.DisruptionTarget)
   209  			}
   210  		})
   211  	}
   212  }
   213  
   214  // TestTaintBasedEvictions tests related cases for the TaintBasedEvictions feature
   215  func TestTaintBasedEvictions(t *testing.T) {
   216  	// we need at least 2 nodes to prevent lifecycle manager from entering "fully-disrupted" mode
   217  	nodeCount := 3
   218  	nodeIndex := 1 // the exact node doesn't matter, pick one
   219  	zero := int64(0)
   220  	gracePeriod := int64(1)
   221  	testPod := &v1.Pod{
   222  		ObjectMeta: metav1.ObjectMeta{Name: "testpod1", DeletionGracePeriodSeconds: &zero},
   223  		Spec: v1.PodSpec{
   224  			Containers: []v1.Container{
   225  				{Name: "container", Image: imageutils.GetPauseImageName()},
   226  			},
   227  			Tolerations: []v1.Toleration{
   228  				{
   229  					Key:      v1.TaintNodeNotReady,
   230  					Operator: v1.TolerationOpExists,
   231  					Effect:   v1.TaintEffectNoExecute,
   232  				},
   233  			},
   234  			TerminationGracePeriodSeconds: &gracePeriod,
   235  		},
   236  	}
   237  	tests := []struct {
   238  		name                                  string
   239  		nodeTaints                            []v1.Taint
   240  		nodeConditions                        []v1.NodeCondition
   241  		pod                                   *v1.Pod
   242  		tolerationSeconds                     int64
   243  		expectedWaitForPodCondition           string
   244  		enableSeparateTaintEvictionController bool
   245  	}{
   246  		{
   247  			name:                                  "Taint based evictions for NodeNotReady and 200 tolerationseconds; separate taint eviction controller disabled",
   248  			nodeTaints:                            []v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
   249  			nodeConditions:                        []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
   250  			pod:                                   testPod.DeepCopy(),
   251  			tolerationSeconds:                     200,
   252  			expectedWaitForPodCondition:           "updated with tolerationSeconds of 200",
   253  			enableSeparateTaintEvictionController: false,
   254  		},
   255  		{
   256  			name:                                  "Taint based evictions for NodeNotReady and 200 tolerationseconds; separate taint eviction controller enabled",
   257  			nodeTaints:                            []v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
   258  			nodeConditions:                        []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
   259  			pod:                                   testPod.DeepCopy(),
   260  			tolerationSeconds:                     200,
   261  			expectedWaitForPodCondition:           "updated with tolerationSeconds of 200",
   262  			enableSeparateTaintEvictionController: true,
   263  		},
   264  		{
   265  			name:           "Taint based evictions for NodeNotReady with no pod tolerations; separate taint eviction controller disabled",
   266  			nodeTaints:     []v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
   267  			nodeConditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
   268  			pod: &v1.Pod{
   269  				ObjectMeta: metav1.ObjectMeta{Name: "testpod1"},
   270  				Spec: v1.PodSpec{
   271  					Containers: []v1.Container{
   272  						{Name: "container", Image: imageutils.GetPauseImageName()},
   273  					},
   274  				},
   275  			},
   276  			tolerationSeconds:                     300,
   277  			expectedWaitForPodCondition:           "updated with tolerationSeconds=300",
   278  			enableSeparateTaintEvictionController: false,
   279  		},
   280  		{
   281  			name:           "Taint based evictions for NodeNotReady with no pod tolerations; separate taint eviction controller enabled",
   282  			nodeTaints:     []v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
   283  			nodeConditions: []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
   284  			pod: &v1.Pod{
   285  				ObjectMeta: metav1.ObjectMeta{Name: "testpod1"},
   286  				Spec: v1.PodSpec{
   287  					Containers: []v1.Container{
   288  						{Name: "container", Image: imageutils.GetPauseImageName()},
   289  					},
   290  				},
   291  			},
   292  			tolerationSeconds:                     300,
   293  			expectedWaitForPodCondition:           "updated with tolerationSeconds=300",
   294  			enableSeparateTaintEvictionController: true,
   295  		},
   296  		{
   297  			name:                                  "Taint based evictions for NodeNotReady and 0 tolerationseconds; separate taint eviction controller disabled",
   298  			nodeTaints:                            []v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
   299  			nodeConditions:                        []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
   300  			pod:                                   testPod.DeepCopy(),
   301  			tolerationSeconds:                     0,
   302  			expectedWaitForPodCondition:           "terminating",
   303  			enableSeparateTaintEvictionController: false,
   304  		},
   305  		{
   306  			name:                                  "Taint based evictions for NodeNotReady and 0 tolerationseconds; separate taint eviction controller enabled",
   307  			nodeTaints:                            []v1.Taint{{Key: v1.TaintNodeNotReady, Effect: v1.TaintEffectNoExecute}},
   308  			nodeConditions:                        []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionFalse}},
   309  			pod:                                   testPod.DeepCopy(),
   310  			tolerationSeconds:                     0,
   311  			expectedWaitForPodCondition:           "terminating",
   312  			enableSeparateTaintEvictionController: true,
   313  		},
   314  		{
   315  			name:                                  "Taint based evictions for NodeUnreachable; separate taint eviction controller disabled",
   316  			nodeTaints:                            []v1.Taint{{Key: v1.TaintNodeUnreachable, Effect: v1.TaintEffectNoExecute}},
   317  			nodeConditions:                        []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionUnknown}},
   318  			enableSeparateTaintEvictionController: false,
   319  		},
   320  		{
   321  			name:                                  "Taint based evictions for NodeUnreachable; separate taint eviction controller enabled",
   322  			nodeTaints:                            []v1.Taint{{Key: v1.TaintNodeUnreachable, Effect: v1.TaintEffectNoExecute}},
   323  			nodeConditions:                        []v1.NodeCondition{{Type: v1.NodeReady, Status: v1.ConditionUnknown}},
   324  			enableSeparateTaintEvictionController: true,
   325  		},
   326  	}
   327  
   328  	// Build admission chain handler.
   329  	podTolerations := podtolerationrestriction.NewPodTolerationsPlugin(&pluginapi.Configuration{})
   330  	admission := admission.NewChainHandler(
   331  		podTolerations,
   332  		defaulttolerationseconds.NewDefaultTolerationSeconds(),
   333  	)
   334  	for _, test := range tests {
   335  		t.Run(test.name, func(t *testing.T) {
   336  			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.SeparateTaintEvictionController, test.enableSeparateTaintEvictionController)()
   337  
   338  			testCtx := testutils.InitTestAPIServer(t, "taint-based-evictions", admission)
   339  
   340  			// Build clientset and informers for controllers.
   341  			externalClientConfig := restclient.CopyConfig(testCtx.KubeConfig)
   342  			externalClientConfig.QPS = -1
   343  			externalClientset := clientset.NewForConfigOrDie(externalClientConfig)
   344  			externalInformers := informers.NewSharedInformerFactory(externalClientset, time.Second)
   345  			podTolerations.SetExternalKubeClientSet(externalClientset)
   346  			podTolerations.SetExternalKubeInformerFactory(externalInformers)
   347  
   348  			cs := testCtx.ClientSet
   349  
   350  			// Start NodeLifecycleController for taint.
   351  			nc, err := nodelifecycle.NewNodeLifecycleController(
   352  				testCtx.Ctx,
   353  				externalInformers.Coordination().V1().Leases(),
   354  				externalInformers.Core().V1().Pods(),
   355  				externalInformers.Core().V1().Nodes(),
   356  				externalInformers.Apps().V1().DaemonSets(),
   357  				cs,
   358  				1*time.Second,    // Node monitor grace period
   359  				time.Minute,      // Node startup grace period
   360  				time.Millisecond, // Node monitor period
   361  				100,              // Eviction limiter QPS
   362  				100,              // Secondary eviction limiter QPS
   363  				50,               // Large cluster threshold
   364  				0.55,             // Unhealthy zone threshold
   365  			)
   366  			if err != nil {
   367  				t.Fatalf("Failed to create node controller: %v", err)
   368  			}
   369  
   370  			// Waiting for all controllers to sync
   371  			externalInformers.Start(testCtx.Ctx.Done())
   372  			externalInformers.WaitForCacheSync(testCtx.Ctx.Done())
   373  
   374  			// Run the controller
   375  			go nc.Run(testCtx.Ctx)
   376  
   377  			// Start TaintEvictionController
   378  			if test.enableSeparateTaintEvictionController {
   379  				tm, _ := tainteviction.New(
   380  					testCtx.Ctx,
   381  					testCtx.ClientSet,
   382  					externalInformers.Core().V1().Pods(),
   383  					externalInformers.Core().V1().Nodes(),
   384  					names.TaintEvictionController,
   385  				)
   386  				go tm.Run(testCtx.Ctx)
   387  			}
   388  
   389  			nodeRes := v1.ResourceList{
   390  				v1.ResourceCPU:    resource.MustParse("4000m"),
   391  				v1.ResourceMemory: resource.MustParse("16Gi"),
   392  				v1.ResourcePods:   resource.MustParse("110"),
   393  			}
   394  
   395  			var nodes []*v1.Node
   396  			for i := 0; i < nodeCount; i++ {
   397  				node := &v1.Node{
   398  					ObjectMeta: metav1.ObjectMeta{
   399  						Name: fmt.Sprintf("node-%d", i),
   400  						Labels: map[string]string{
   401  							v1.LabelTopologyRegion:                  "region1",
   402  							v1.LabelTopologyZone:                    "zone1",
   403  							"node.kubernetes.io/exclude-disruption": "true",
   404  						},
   405  					},
   406  					Spec: v1.NodeSpec{},
   407  					Status: v1.NodeStatus{
   408  						Capacity:    nodeRes,
   409  						Allocatable: nodeRes,
   410  					},
   411  				}
   412  				if i == nodeIndex {
   413  					node.Status.Conditions = append(node.Status.Conditions, test.nodeConditions...)
   414  				} else {
   415  					node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
   416  						Type:   v1.NodeReady,
   417  						Status: v1.ConditionTrue,
   418  					})
   419  				}
   420  				nodes = append(nodes, node)
   421  				if _, err := cs.CoreV1().Nodes().Create(testCtx.Ctx, node, metav1.CreateOptions{}); err != nil {
   422  					t.Fatalf("Failed to create node: %q, err: %v", klog.KObj(node), err)
   423  				}
   424  			}
   425  
   426  			if test.pod != nil {
   427  				test.pod.Spec.NodeName = nodes[nodeIndex].Name
   428  				test.pod.Name = "testpod"
   429  				if len(test.pod.Spec.Tolerations) > 0 {
   430  					test.pod.Spec.Tolerations[0].TolerationSeconds = &test.tolerationSeconds
   431  				}
   432  
   433  				test.pod, err = cs.CoreV1().Pods(testCtx.NS.Name).Create(testCtx.Ctx, test.pod, metav1.CreateOptions{})
   434  				if err != nil {
   435  					t.Fatalf("Test Failed: error: %q, while creating pod %q", err, klog.KObj(test.pod))
   436  				}
   437  			}
   438  
   439  			if err := testutils.WaitForNodeTaints(cs, nodes[nodeIndex], test.nodeTaints); err != nil {
   440  				t.Errorf("Failed to taint node %q, err: %v", klog.KObj(nodes[nodeIndex]), err)
   441  			}
   442  
   443  			if test.pod != nil {
   444  				err = wait.PollImmediate(time.Second, time.Second*15, func() (bool, error) {
   445  					pod, err := cs.CoreV1().Pods(test.pod.Namespace).Get(testCtx.Ctx, test.pod.Name, metav1.GetOptions{})
   446  					if err != nil {
   447  						return false, err
   448  					}
   449  					// as node is unreachable, pod0 is expected to be in Terminating status
   450  					// rather than getting deleted
   451  					if test.tolerationSeconds == 0 {
   452  						return pod.DeletionTimestamp != nil, nil
   453  					}
   454  					if seconds, err := testutils.GetTolerationSeconds(pod.Spec.Tolerations); err == nil {
   455  						return seconds == test.tolerationSeconds, nil
   456  					}
   457  					return false, nil
   458  				})
   459  				if err != nil {
   460  					pod, _ := cs.CoreV1().Pods(testCtx.NS.Name).Get(testCtx.Ctx, test.pod.Name, metav1.GetOptions{})
   461  					t.Fatalf("Error: %v, Expected test pod to be %s but it's %v", err, test.expectedWaitForPodCondition, pod)
   462  				}
   463  				testutils.CleanupPods(testCtx.Ctx, cs, t, []*v1.Pod{test.pod})
   464  			}
   465  			testutils.CleanupNodes(cs, t)
   466  		})
   467  	}
   468  }