k8s.io/kubernetes@v1.29.3/test/e2e_node/node_problem_detector_linux.go

//go:build cgo && linux
// +build cgo,linux

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"fmt"
	"os"
	"path"
	"time"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/uuid"
	clientset "k8s.io/client-go/kubernetes"
	coreclientset "k8s.io/client-go/kubernetes/typed/core/v1"
	admissionapi "k8s.io/pod-security-admission/api"

	"k8s.io/kubernetes/pkg/kubelet/util"
	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	"k8s.io/kubernetes/test/e2e/nodefeature"
	testutils "k8s.io/kubernetes/test/utils"
)

var _ = SIGDescribe("NodeProblemDetector", nodefeature.NodeProblemDetector, framework.WithSerial(), func() {
	const (
		pollInterval   = 1 * time.Second
		pollConsistent = 5 * time.Second
		pollTimeout    = 5 * time.Minute
	)
	f := framework.NewDefaultFramework("node-problem-detector")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
	var c clientset.Interface
	var uid string
	var ns, name, configName, eventNamespace string
	var bootTime, nodeTime time.Time
	var image string

	ginkgo.BeforeEach(func() {
		c = f.ClientSet
		ns = f.Namespace.Name
		uid = string(uuid.NewUUID())
		name = "node-problem-detector-" + uid
		configName = "node-problem-detector-config-" + uid
		// Nodes are not namespaced; the event recorder uses the default namespace for node events.
		eventNamespace = metav1.NamespaceDefault
		image = getNodeProblemDetectorImage()
		ginkgo.By(fmt.Sprintf("Using node-problem-detector image: %s", image))
	})

	// Test the system log monitor. We may add other tests if we have more problem daemons in the future.
	ginkgo.Describe("SystemLogMonitor", func() {
		const (
			// Use a test condition to avoid changing a real node condition that is in use.
			// TODO(random-liu): Node conditions can currently be arbitrary strings; consider whether
			// TestCondition is still needed once we switch to a predefined condition list.
			condition = v1.NodeConditionType("TestCondition")

			// File paths used in the test.
			logFile        = "/log/test.log"
			configFile     = "/config/testconfig.json"
			kubeConfigFile = "/config/kubeconfig"
			etcLocaltime   = "/etc/localtime"

			// Volumes used in the test.
			configVolume    = "config"
			logVolume       = "log"
			localtimeVolume = "localtime"

			// Reasons and messages used in the test.
			defaultReason  = "Default"
			defaultMessage = "default message"
			tempReason     = "Temporary"
			tempMessage    = "temporary error"
			permReason1    = "Permanent1"
			permMessage1   = "permanent error 1"
			permReason2    = "Permanent2"
			permMessage2   = "permanent error 2"
		)
		var source, config, hostLogFile string
		var lookback time.Duration
		var eventListOptions metav1.ListOptions

		ginkgo.BeforeEach(func(ctx context.Context) {
			ginkgo.By("Calculate Lookback duration")
			var err error

			nodeTime = time.Now()
			bootTime, err = util.GetBootTime()
			framework.ExpectNoError(err)

			// Set the lookback duration longer than the node uptime.
			// Assume the test won't take more than 1 hour; in practice it usually takes about 90 seconds.
			lookback = nodeTime.Sub(bootTime) + time.Hour

			// Randomize the source name
			source = "kernel-monitor-" + uid
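			// Monitor config for NPD's system-log-monitor filelog plugin: it tails logPath,
			// extracts a timestamp and message from each line using the regexps below, and
			// matches them against the temporary/permanent rules to emit events and set the
			// test condition. The lookback computed above keeps all logs since boot in scope.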
			config = `
			{
				"plugin": "filelog",
				"pluginConfig": {
					"timestamp": "^.{15}",
					"message": "kernel: \\[.*\\] (.*)",
					"timestampFormat": "` + time.Stamp + `"
				},
				"logPath": "` + logFile + `",
				"lookback": "` + lookback.String() + `",
				"bufferSize": 10,
				"source": "` + source + `",
				"conditions": [
				{
					"type": "` + string(condition) + `",
					"reason": "` + defaultReason + `",
					"message": "` + defaultMessage + `"
				}
				],
				"rules": [
				{
					"type": "temporary",
					"reason": "` + tempReason + `",
					"pattern": "` + tempMessage + `"
				},
				{
					"type": "permanent",
					"condition": "` + string(condition) + `",
					"reason": "` + permReason1 + `",
					"pattern": "` + permMessage1 + ".*" + `"
				},
				{
					"type": "permanent",
					"condition": "` + string(condition) + `",
					"reason": "` + permReason2 + `",
					"pattern": "` + permMessage2 + ".*" + `"
				}
				]
			}`

			// This token is known to the apiserver and its group is `system:masters`.
			// See also the function `generateTokenFile` in `test/e2e_node/services/apiserver.go`.
			kubeConfig := fmt.Sprintf(`
apiVersion: v1
kind: Config
users:
- name: node-problem-detector
  user:
    token: %s
clusters:
- cluster:
    server: %s
    insecure-skip-tls-verify: true
  name: local
contexts:
- context:
    cluster: local
    user: node-problem-detector
  name: local-context
current-context: local-context
`, framework.TestContext.BearerToken, framework.TestContext.Host)

			ginkgo.By("Generate event list options")
			selector := fields.Set{
				"involvedObject.kind":      "Node",
				"involvedObject.name":      framework.TestContext.NodeName,
				"involvedObject.namespace": metav1.NamespaceAll,
				"source":                   source,
			}.AsSelector().String()
			eventListOptions = metav1.ListOptions{FieldSelector: selector}

			ginkgo.By("Create config map for the node problem detector")
			_, err = c.CoreV1().ConfigMaps(ns).Create(ctx, &v1.ConfigMap{
				ObjectMeta: metav1.ObjectMeta{Name: configName},
				Data: map[string]string{
					path.Base(configFile):     config,
					path.Base(kubeConfigFile): kubeConfig,
				},
			}, metav1.CreateOptions{})
			framework.ExpectNoError(err)

			ginkgo.By("Create the node problem detector")
			hostPathType := new(v1.HostPathType)
			*hostPathType = v1.HostPathFileOrCreate
			pod := e2epod.NewPodClient(f).CreateSync(ctx, &v1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name: name,
				},
				Spec: v1.PodSpec{
					HostNetwork:        true,
					SecurityContext:    &v1.PodSecurityContext{},
					ServiceAccountName: name,
					Volumes: []v1.Volume{
						{
							Name: configVolume,
							VolumeSource: v1.VolumeSource{
								ConfigMap: &v1.ConfigMapVolumeSource{
									LocalObjectReference: v1.LocalObjectReference{Name: configName},
								},
							},
						},
						{
							Name: logVolume,
							VolumeSource: v1.VolumeSource{
								EmptyDir: &v1.EmptyDirVolumeSource{},
							},
						},
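						// Share the host's /etc/localtime so the container interprets the
						// zone-less time.Stamp timestamps written by injectLog in the same
						// timezone as the node.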
						{
							Name: localtimeVolume,
							VolumeSource: v1.VolumeSource{
								HostPath: &v1.HostPathVolumeSource{
									Path: etcLocaltime,
									Type: hostPathType,
								},
							},
						},
					},
					InitContainers: []v1.Container{
						{
							Name:    "init-log-file",
							Image:   "debian",
							Command: []string{"/bin/sh"},
							Args: []string{
								"-c",
								fmt.Sprintf("touch %s", logFile),
							},
							VolumeMounts: []v1.VolumeMount{
								{
									Name:      logVolume,
									MountPath: path.Dir(logFile),
								},
								{
									Name:      localtimeVolume,
									MountPath: etcLocaltime,
								},
							},
						},
					},
					Containers: []v1.Container{
						{
							Name:    name,
							Image:   image,
							Command: []string{"/node-problem-detector"},
							Args: []string{
								"--logtostderr",
								fmt.Sprintf("--system-log-monitors=%s", configFile),
								// The `ServiceAccount` admission controller is disabled in node e2e tests,
								// so we cannot use inClusterConfig here.
								fmt.Sprintf("--apiserver-override=%s?inClusterConfig=false&auth=%s", framework.TestContext.Host, kubeConfigFile),
							},
							Env: []v1.EnvVar{
								{
									Name: "NODE_NAME",
									ValueFrom: &v1.EnvVarSource{
										FieldRef: &v1.ObjectFieldSelector{
											APIVersion: "v1",
											FieldPath:  "spec.nodeName",
										},
									},
								},
							},
							VolumeMounts: []v1.VolumeMount{
								{
									Name:      logVolume,
									MountPath: path.Dir(logFile),
								},
								{
									Name:      localtimeVolume,
									MountPath: etcLocaltime,
								},
								{
									Name:      configVolume,
									MountPath: path.Dir(configFile),
								},
							},
						},
					},
				},
			})
			// TODO: remove hardcoded kubelet volume directory path
			// framework.TestContext.KubeVolumeDir is currently not populated for node e2e
			hostLogFile = "/var/lib/kubelet/pods/" + string(pod.UID) + "/volumes/kubernetes.io~empty-dir" + logFile
		})

		ginkgo.It("should generate node condition and events for corresponding errors", func(ctx context.Context) {
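			// The cases below run in order and share state: tempEvents and totalEvents are
			// cumulative across the whole table, since events and conditions produced by
			// earlier cases persist for later ones.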
			for _, test := range []struct {
				description      string
				timestamp        time.Time
				message          string
				messageNum       int
				tempEvents       int // Events for temp errors
				totalEvents      int // Events for both temp errors and condition changes
				conditionReason  string
				conditionMessage string
				conditionType    v1.ConditionStatus
			}{
				{
					description:      "should generate default node condition",
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should not generate events for too old log",
					timestamp:        bootTime.Add(-1 * time.Minute),
					message:          tempMessage,
					messageNum:       3,
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should not change node condition for too old log",
					timestamp:        bootTime.Add(-1 * time.Minute),
					message:          permMessage1,
					messageNum:       1,
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should generate event for old log within lookback duration",
					timestamp:        nodeTime,
					message:          tempMessage,
					messageNum:       3,
					tempEvents:       3,
					totalEvents:      3,
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should change node condition for old log within lookback duration",
					timestamp:        nodeTime,
					message:          permMessage1,
					messageNum:       1,
					tempEvents:       3, // event number for temp errors should not change
					totalEvents:      4, // add 1 event for condition change
					conditionReason:  permReason1,
					conditionMessage: permMessage1,
					conditionType:    v1.ConditionTrue,
				},
				{
					description:      "should generate event for new log",
					timestamp:        nodeTime.Add(5 * time.Minute),
					message:          tempMessage,
					messageNum:       3,
					tempEvents:       6, // add 3 events for temp errors
					totalEvents:      7, // add 3 events for temp errors
					conditionReason:  permReason1,
					conditionMessage: permMessage1,
					conditionType:    v1.ConditionTrue,
				},
				{
					description:      "should not update node condition with the same reason",
					timestamp:        nodeTime.Add(5 * time.Minute),
					message:          permMessage1 + "different message",
					messageNum:       1,
					tempEvents:       6, // event number should not change
					totalEvents:      7, // event number should not change
					conditionReason:  permReason1,
					conditionMessage: permMessage1,
					conditionType:    v1.ConditionTrue,
				},
				{
					description:      "should change node condition for new log",
					timestamp:        nodeTime.Add(5 * time.Minute),
					message:          permMessage2,
					messageNum:       1,
					tempEvents:       6, // event number for temp errors should not change
					totalEvents:      8, // add 1 event for condition change
					conditionReason:  permReason2,
					conditionMessage: permMessage2,
					conditionType:    v1.ConditionTrue,
				},
			} {
				ginkgo.By(test.description)
				if test.messageNum > 0 {
					ginkgo.By(fmt.Sprintf("Inject %d logs: %q", test.messageNum, test.message))
					err := injectLog(hostLogFile, test.timestamp, test.message, test.messageNum)
					framework.ExpectNoError(err)
				}

				ginkgo.By(fmt.Sprintf("Wait for %d temp events generated", test.tempEvents))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyEvents(ctx, c.CoreV1().Events(eventNamespace), eventListOptions, test.tempEvents, tempReason, tempMessage)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
				ginkgo.By(fmt.Sprintf("Wait for %d total events generated", test.totalEvents))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyTotalEvents(ctx, c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
				ginkgo.By(fmt.Sprintf("Make sure only %d total events generated", test.totalEvents))
				gomega.Consistently(ctx, func(ctx context.Context) error {
					return verifyTotalEvents(ctx, c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
				}, pollConsistent, pollInterval).Should(gomega.Succeed())

				ginkgo.By(fmt.Sprintf("Make sure node condition %q is set", condition))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyNodeCondition(ctx, c.CoreV1().Nodes(), condition, test.conditionType, test.conditionReason, test.conditionMessage)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
				ginkgo.By(fmt.Sprintf("Make sure node condition %q is stable", condition))
				gomega.Consistently(ctx, func(ctx context.Context) error {
					return verifyNodeCondition(ctx, c.CoreV1().Nodes(), condition, test.conditionType, test.conditionReason, test.conditionMessage)
				}, pollConsistent, pollInterval).Should(gomega.Succeed())
			}
		})

		ginkgo.AfterEach(func(ctx context.Context) {
			if ginkgo.CurrentSpecReport().Failed() && framework.TestContext.DumpLogsOnFailure {
				ginkgo.By("Get node problem detector log")
				log, err := e2epod.GetPodLogs(ctx, c, ns, name, name)
				gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
				framework.Logf("Node Problem Detector logs:\n %s", log)
			}
			ginkgo.By("Delete the node problem detector")
			framework.ExpectNoError(e2epod.NewPodClient(f).Delete(ctx, name, *metav1.NewDeleteOptions(0)))
			ginkgo.By("Wait for the node problem detector to disappear")
			gomega.Expect(e2epod.WaitForPodNotFoundInNamespace(ctx, c, name, ns, pollTimeout)).To(gomega.Succeed())
			ginkgo.By("Delete the config map")
			framework.ExpectNoError(c.CoreV1().ConfigMaps(ns).Delete(ctx, configName, metav1.DeleteOptions{}))
			ginkgo.By("Clean up the events")
			gomega.Expect(c.CoreV1().Events(eventNamespace).DeleteCollection(ctx, *metav1.NewDeleteOptions(0), eventListOptions)).To(gomega.Succeed())
			ginkgo.By("Clean up the node condition")
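			// Best-effort cleanup: a strategic merge patch with the "$patch":"delete"
			// directive removes the test condition from the node's status; the result
			// is not checked.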
			patch := []byte(fmt.Sprintf(`{"status":{"conditions":[{"$patch":"delete","type":"%s"}]}}`, condition))
			c.CoreV1().RESTClient().Patch(types.StrategicMergePatchType).Resource("nodes").Name(framework.TestContext.NodeName).SubResource("status").Body(patch).Do(ctx)
		})
	})
})

// injectLog appends num fake kernel log entries with the given timestamp and message to the specified file.
func injectLog(file string, timestamp time.Time, log string, num int) error {
	f, err := os.OpenFile(file, os.O_RDWR|os.O_APPEND, 0666)
	if err != nil {
		return err
	}
	defer f.Close()
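	// Each line mimics a kernel log entry so that it matches the monitor config above:
	// the first 15 characters (time.Stamp) satisfy the "timestamp" regexp "^.{15}", and
	// the rest matches the "message" regexp `kernel: \[.*\] (.*)`.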
	for i := 0; i < num; i++ {
		_, err := f.WriteString(fmt.Sprintf("%s kernel: [0.000000] %s\n", timestamp.Format(time.Stamp), log))
		if err != nil {
			return err
		}
	}
	return nil
}

// verifyEvents verifies that num events with the given reason and message have been generated.
func verifyEvents(ctx context.Context, e coreclientset.EventInterface, options metav1.ListOptions, num int, reason, message string) error {
	events, err := e.List(ctx, options)
	if err != nil {
		return err
	}
	count := 0
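	// The apiserver deduplicates repeated events into a single Event object with an
	// incremented Count, so sum the counts instead of counting list items.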
	for _, event := range events.Items {
		if event.Reason != reason || event.Message != message {
			continue
		}
		count += int(event.Count)
	}
	if count != num {
		return fmt.Errorf("expected %d events with reason %q and message %q, but %d occurred. Events: %v", num, reason, message, count, events.Items)
	}
	return nil
}

// verifyTotalEvents verifies there are num events in total.
func verifyTotalEvents(ctx context.Context, e coreclientset.EventInterface, options metav1.ListOptions, num int) error {
	events, err := e.List(ctx, options)
	if err != nil {
		return err
	}
	count := 0
	for _, event := range events.Items {
		count += int(event.Count)
	}
	if count != num {
		return fmt.Errorf("expected %d events in total, but counted %d. Events: %v", num, count, events.Items)
	}
	return nil
}

// verifyNodeCondition verifies that the specified node condition exists with the given status, reason, and message.
func verifyNodeCondition(ctx context.Context, n coreclientset.NodeInterface, condition v1.NodeConditionType, status v1.ConditionStatus, reason, message string) error {
	node, err := n.Get(ctx, framework.TestContext.NodeName, metav1.GetOptions{})
	if err != nil {
		return err
	}
	_, c := testutils.GetNodeCondition(&node.Status, condition)
	if c == nil {
		return fmt.Errorf("node condition %q not found", condition)
	}
	if c.Status != status || c.Reason != reason || c.Message != message {
		return fmt.Errorf("unexpected node condition %q: %+v", condition, c)
	}
	return nil
}