//go:build cgo && linux
// +build cgo,linux

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2enode

import (
	"context"
	"fmt"
	"os"
	"path"
	"time"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/uuid"
	clientset "k8s.io/client-go/kubernetes"
	coreclientset "k8s.io/client-go/kubernetes/typed/core/v1"
	admissionapi "k8s.io/pod-security-admission/api"

	"k8s.io/kubernetes/pkg/kubelet/util"
	"k8s.io/kubernetes/test/e2e/framework"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	"k8s.io/kubernetes/test/e2e/nodefeature"
	testutils "k8s.io/kubernetes/test/utils"
)

// This spec runs the node-problem-detector (NPD) as a pod on the test node,
// injects fake kernel log lines into the file NPD watches, and verifies that
// NPD translates them into the expected node events and node conditions.
// It is Serial because it mutates node-level state (conditions, events).
var _ = SIGDescribe("NodeProblemDetector", nodefeature.NodeProblemDetector, framework.WithSerial(), func() {
	const (
		pollInterval   = 1 * time.Second
		pollConsistent = 5 * time.Second
		pollTimeout    = 5 * time.Minute
	)
	f := framework.NewDefaultFramework("node-problem-detector")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
	// Shared across BeforeEach/It/AfterEach closures; reset per spec in BeforeEach.
	var c clientset.Interface
	var uid string
	var ns, name, configName, eventNamespace string
	var bootTime, nodeTime time.Time
	var image string

	ginkgo.BeforeEach(func() {
		c = f.ClientSet
		ns = f.Namespace.Name
		// Randomize per-spec resource names so reruns don't collide.
		uid = string(uuid.NewUUID())
		name = "node-problem-detector-" + uid
		configName = "node-problem-detector-config-" + uid
		// There is no namespace for Node, event recorder will set default namespace for node events.
		eventNamespace = metav1.NamespaceDefault
		image = getNodeProblemDetectorImage()
		ginkgo.By(fmt.Sprintf("Using node-problem-detector image: %s", image))
	})

	// Test system log monitor. We may add other tests if we have more problem daemons in the future.
	ginkgo.Describe("SystemLogMonitor", func() {
		const (
			// Use test condition to avoid changing the real node condition in use.
			// TODO(random-liu): Now node condition could be arbitrary string, consider whether we need to
			// add TestCondition when switching to predefined condition list.
			condition = v1.NodeConditionType("TestCondition")

			// File paths used in the test.
			logFile        = "/log/test.log"
			configFile     = "/config/testconfig.json"
			kubeConfigFile = "/config/kubeconfig"
			etcLocaltime   = "/etc/localtime"

			// Volumes used in the test.
			configVolume    = "config"
			logVolume       = "log"
			localtimeVolume = "localtime"

			// Reasons and messages used in the test.
			defaultReason  = "Default"
			defaultMessage = "default message"
			tempReason     = "Temporary"
			tempMessage    = "temporary error"
			permReason1    = "Permanent1"
			permMessage1   = "permanent error 1"
			permReason2    = "Permanent2"
			permMessage2   = "permanent error 2"
		)
		var source, config, hostLogFile string
		var lookback time.Duration
		var eventListOptions metav1.ListOptions

		ginkgo.BeforeEach(func(ctx context.Context) {
			ginkgo.By("Calculate Lookback duration")
			var err error

			nodeTime = time.Now()
			bootTime, err = util.GetBootTime()
			framework.ExpectNoError(err)

			// Set lookback duration longer than node up time.
			// Assume the test won't take more than 1 hour, in fact it usually only takes 90 seconds.
			lookback = nodeTime.Sub(bootTime) + time.Hour

			// Randomize the source name
			source = "kernel-monitor-" + uid
			// System log monitor config consumed by NPD; the "filelog" plugin tails
			// logFile and matches lines against the temporary/permanent rules below.
			config = `
			{
				"plugin": "filelog",
				"pluginConfig": {
					"timestamp": "^.{15}",
					"message": "kernel: \\[.*\\] (.*)",
					"timestampFormat": "` + time.Stamp + `"
				},
				"logPath": "` + logFile + `",
				"lookback": "` + lookback.String() + `",
				"bufferSize": 10,
				"source": "` + source + `",
				"conditions": [
					{
						"type": "` + string(condition) + `",
						"reason": "` + defaultReason + `",
						"message": "` + defaultMessage + `"
					}
				],
				"rules": [
					{
						"type": "temporary",
						"reason": "` + tempReason + `",
						"pattern": "` + tempMessage + `"
					},
					{
						"type": "permanent",
						"condition": "` + string(condition) + `",
						"reason": "` + permReason1 + `",
						"pattern": "` + permMessage1 + ".*" + `"
					},
					{
						"type": "permanent",
						"condition": "` + string(condition) + `",
						"reason": "` + permReason2 + `",
						"pattern": "` + permMessage2 + ".*" + `"
					}
				]
			}`

			// This token is known to apiserver and its group is `system:masters`.
			// See also the function `generateTokenFile` in `test/e2e_node/services/apiserver.go`.
			kubeConfig := fmt.Sprintf(`
apiVersion: v1
kind: Config
users:
- name: node-problem-detector
  user:
    token: %s
clusters:
- cluster:
    server: %s
    insecure-skip-tls-verify: true
  name: local
contexts:
- context:
    cluster: local
    user: node-problem-detector
  name: local-context
current-context: local-context
`, framework.TestContext.BearerToken, framework.TestContext.Host)

			ginkgo.By("Generate event list options")
			// Select only events emitted by this spec's randomized source against this node.
			selector := fields.Set{
				"involvedObject.kind":      "Node",
				"involvedObject.name":      framework.TestContext.NodeName,
				"involvedObject.namespace": metav1.NamespaceAll,
				"source":                   source,
			}.AsSelector().String()
			eventListOptions = metav1.ListOptions{FieldSelector: selector}

			ginkgo.By("Create config map for the node problem detector")
			_, err = c.CoreV1().ConfigMaps(ns).Create(ctx, &v1.ConfigMap{
				ObjectMeta: metav1.ObjectMeta{Name: configName},
				Data: map[string]string{
					path.Base(configFile):     config,
					path.Base(kubeConfigFile): kubeConfig,
				},
			}, metav1.CreateOptions{})
			framework.ExpectNoError(err)

			ginkgo.By("Create the node problem detector")
			hostPathType := new(v1.HostPathType)
			*hostPathType = v1.HostPathFileOrCreate
			pod := e2epod.NewPodClient(f).CreateSync(ctx, &v1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name: name,
				},
				Spec: v1.PodSpec{
					HostNetwork:        true,
					SecurityContext:    &v1.PodSecurityContext{},
					ServiceAccountName: name,
					Volumes: []v1.Volume{
						{
							Name: configVolume,
							VolumeSource: v1.VolumeSource{
								ConfigMap: &v1.ConfigMapVolumeSource{
									LocalObjectReference: v1.LocalObjectReference{Name: configName},
								},
							},
						},
						{
							Name: logVolume,
							VolumeSource: v1.VolumeSource{
								EmptyDir: &v1.EmptyDirVolumeSource{},
							},
						},
						{
							// Mount the host's /etc/localtime so NPD parses injected
							// timestamps in the same timezone the test writes them in.
							Name: localtimeVolume,
							VolumeSource: v1.VolumeSource{
								HostPath: &v1.HostPathVolumeSource{
									Path: etcLocaltime,
									Type: hostPathType,
								},
							},
						},
					},
					InitContainers: []v1.Container{
						{
							// Pre-create the watched log file so NPD doesn't race its creation.
							Name:    "init-log-file",
							Image:   "debian",
							Command: []string{"/bin/sh"},
							Args: []string{
								"-c",
								fmt.Sprintf("touch %s", logFile),
							},
							VolumeMounts: []v1.VolumeMount{
								{
									Name:      logVolume,
									MountPath: path.Dir(logFile),
								},
								{
									Name:      localtimeVolume,
									MountPath: etcLocaltime,
								},
							},
						},
					},
					Containers: []v1.Container{
						{
							Name:    name,
							Image:   image,
							Command: []string{"/node-problem-detector"},
							Args: []string{
								"--logtostderr",
								fmt.Sprintf("--system-log-monitors=%s", configFile),
								// `ServiceAccount` admission controller is disabled in node e2e tests, so we could not use
								// inClusterConfig here.
								fmt.Sprintf("--apiserver-override=%s?inClusterConfig=false&auth=%s", framework.TestContext.Host, kubeConfigFile),
							},
							Env: []v1.EnvVar{
								{
									Name: "NODE_NAME",
									ValueFrom: &v1.EnvVarSource{
										FieldRef: &v1.ObjectFieldSelector{
											APIVersion: "v1",
											FieldPath:  "spec.nodeName",
										},
									},
								},
							},
							VolumeMounts: []v1.VolumeMount{
								{
									Name:      logVolume,
									MountPath: path.Dir(logFile),
								},
								{
									Name:      localtimeVolume,
									MountPath: etcLocaltime,
								},
								{
									Name:      configVolume,
									MountPath: path.Dir(configFile),
								},
							},
						},
					},
				},
			})
			// TODO: remove hardcoded kubelet volume directory path
			// framework.TestContext.KubeVolumeDir is currently not populated for node e2e
			// The test writes into the emptyDir's backing directory on the host,
			// which is the same file the NPD container sees at logFile.
			hostLogFile = "/var/lib/kubelet/pods/" + string(pod.UID) + "/volumes/kubernetes.io~empty-dir" + logFile
		})

		ginkgo.It("should generate node condition and events for corresponding errors", func(ctx context.Context) {
			// The sub-cases below are cumulative: event counts carry over from one
			// entry to the next, so ordering within this table matters.
			for _, test := range []struct {
				description      string
				timestamp        time.Time
				message          string
				messageNum       int
				tempEvents       int // Events for temp errors
				totalEvents      int // Events for both temp errors and condition changes
				conditionReason  string
				conditionMessage string
				conditionType    v1.ConditionStatus
			}{
				{
					description:      "should generate default node condition",
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should not generate events for too old log",
					timestamp:        bootTime.Add(-1 * time.Minute),
					message:          tempMessage,
					messageNum:       3,
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should not change node condition for too old log",
					timestamp:        bootTime.Add(-1 * time.Minute),
					message:          permMessage1,
					messageNum:       1,
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should generate event for old log within lookback duration",
					timestamp:        nodeTime,
					message:          tempMessage,
					messageNum:       3,
					tempEvents:       3,
					totalEvents:      3,
					conditionReason:  defaultReason,
					conditionMessage: defaultMessage,
					conditionType:    v1.ConditionFalse,
				},
				{
					description:      "should change node condition for old log within lookback duration",
					timestamp:        nodeTime,
					message:          permMessage1,
					messageNum:       1,
					tempEvents:       3, // event number for temp errors should not change
					totalEvents:      4, // add 1 event for condition change
					conditionReason:  permReason1,
					conditionMessage: permMessage1,
					conditionType:    v1.ConditionTrue,
				},
				{
					description:      "should generate event for new log",
					timestamp:        nodeTime.Add(5 * time.Minute),
					message:          tempMessage,
					messageNum:       3,
					tempEvents:       6, // add 3 events for temp errors
					totalEvents:      7, // add 3 events for temp errors
					conditionReason:  permReason1,
					conditionMessage: permMessage1,
					conditionType:    v1.ConditionTrue,
				},
				{
					description:      "should not update node condition with the same reason",
					timestamp:        nodeTime.Add(5 * time.Minute),
					message:          permMessage1 + "different message",
					messageNum:       1,
					tempEvents:       6, // event number should not change
					totalEvents:      7, // event number should not change
					conditionReason:  permReason1,
					conditionMessage: permMessage1,
					conditionType:    v1.ConditionTrue,
				},
				{
					description:      "should change node condition for new log",
					timestamp:        nodeTime.Add(5 * time.Minute),
					message:          permMessage2,
					messageNum:       1,
					tempEvents:       6, // event number for temp errors should not change
					totalEvents:      8, // add 1 event for condition change
					conditionReason:  permReason2,
					conditionMessage: permMessage2,
					conditionType:    v1.ConditionTrue,
				},
			} {
				ginkgo.By(test.description)
				if test.messageNum > 0 {
					ginkgo.By(fmt.Sprintf("Inject %d logs: %q", test.messageNum, test.message))
					err := injectLog(hostLogFile, test.timestamp, test.message, test.messageNum)
					framework.ExpectNoError(err)
				}

				// Eventually asserts the counts are reached; Consistently then asserts
				// no extra events arrive afterwards (no over-counting).
				ginkgo.By(fmt.Sprintf("Wait for %d temp events generated", test.tempEvents))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyEvents(ctx, c.CoreV1().Events(eventNamespace), eventListOptions, test.tempEvents, tempReason, tempMessage)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
				ginkgo.By(fmt.Sprintf("Wait for %d total events generated", test.totalEvents))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyTotalEvents(ctx, c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
				ginkgo.By(fmt.Sprintf("Make sure only %d total events generated", test.totalEvents))
				gomega.Consistently(ctx, func(ctx context.Context) error {
					return verifyTotalEvents(ctx, c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
				}, pollConsistent, pollInterval).Should(gomega.Succeed())

				ginkgo.By(fmt.Sprintf("Make sure node condition %q is set", condition))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyNodeCondition(ctx, c.CoreV1().Nodes(), condition, test.conditionType, test.conditionReason, test.conditionMessage)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
				ginkgo.By(fmt.Sprintf("Make sure node condition %q is stable", condition))
				gomega.Consistently(ctx, func(ctx context.Context) error {
					return verifyNodeCondition(ctx, c.CoreV1().Nodes(), condition, test.conditionType, test.conditionReason, test.conditionMessage)
				}, pollConsistent, pollInterval).Should(gomega.Succeed())
			}
		})

		ginkgo.AfterEach(func(ctx context.Context) {
			if ginkgo.CurrentSpecReport().Failed() && framework.TestContext.DumpLogsOnFailure {
				ginkgo.By("Get node problem detector log")
				log, err := e2epod.GetPodLogs(ctx, c, ns, name, name)
				gomega.Expect(err).ShouldNot(gomega.HaveOccurred())
				framework.Logf("Node Problem Detector logs:\n %s", log)
			}
			ginkgo.By("Delete the node problem detector")
			framework.ExpectNoError(e2epod.NewPodClient(f).Delete(ctx, name, *metav1.NewDeleteOptions(0)))
			ginkgo.By("Wait for the node problem detector to disappear")
			gomega.Expect(e2epod.WaitForPodNotFoundInNamespace(ctx, c, name, ns, pollTimeout)).To(gomega.Succeed())
			ginkgo.By("Delete the config map")
			framework.ExpectNoError(c.CoreV1().ConfigMaps(ns).Delete(ctx, configName, metav1.DeleteOptions{}))
			ginkgo.By("Clean up the events")
			gomega.Expect(c.CoreV1().Events(eventNamespace).DeleteCollection(ctx, *metav1.NewDeleteOptions(0), eventListOptions)).To(gomega.Succeed())
			ginkgo.By("Clean up the node condition")
			// Strategic-merge patch with "$patch":"delete" removes the test condition
			// from node status. The result of Do(ctx) is intentionally not checked:
			// this is best-effort cleanup. NOTE(review): consider logging .Error() — TODO confirm.
			patch := []byte(fmt.Sprintf(`{"status":{"conditions":[{"$patch":"delete","type":"%s"}]}}`, condition))
			c.CoreV1().RESTClient().Patch(types.StrategicMergePatchType).Resource("nodes").Name(framework.TestContext.NodeName).SubResource("status").Body(patch).Do(ctx)
		})
	})
})

// injectLog injects kernel log into specified file.
// It appends num lines formatted like kernel messages ("<stamp> kernel: [0.000000] <log>")
// so the NPD filelog plugin's timestamp/message patterns match them.
func injectLog(file string, timestamp time.Time, log string, num int) error {
	f, err := os.OpenFile(file, os.O_RDWR|os.O_APPEND, 0666)
	if err != nil {
		return err
	}
	defer f.Close()
	for i := 0; i < num; i++ {
		_, err := f.WriteString(fmt.Sprintf("%s kernel: [0.000000] %s\n", timestamp.Format(time.Stamp), log))
		if err != nil {
			return err
		}
	}
	return nil
}

// verifyEvents verifies there are num specific events generated with given reason and message.
// Deduplicated events are accounted for via event.Count, not the number of Event objects.
func verifyEvents(ctx context.Context, e coreclientset.EventInterface, options metav1.ListOptions, num int, reason, message string) error {
	events, err := e.List(ctx, options)
	if err != nil {
		return err
	}
	count := 0
	for _, event := range events.Items {
		if event.Reason != reason || event.Message != message {
			continue
		}
		count += int(event.Count)
	}
	if count != num {
		return fmt.Errorf("expected %d events with reason set to %s and message set to %s\nbut %d actual events occurred. Events : %v", num, reason, message, count, events.Items)
	}
	return nil
}

// verifyTotalEvents verifies there are num events in total.
485 func verifyTotalEvents(ctx context.Context, e coreclientset.EventInterface, options metav1.ListOptions, num int) error { 486 events, err := e.List(ctx, options) 487 if err != nil { 488 return err 489 } 490 count := 0 491 for _, event := range events.Items { 492 count += int(event.Count) 493 } 494 if count != num { 495 return fmt.Errorf("expected total number of events was %d, actual events counted was %d\nEvents : %v", num, count, events.Items) 496 } 497 return nil 498 } 499 500 // verifyNodeCondition verifies specific node condition is generated, if reason and message are empty, they will not be checked 501 func verifyNodeCondition(ctx context.Context, n coreclientset.NodeInterface, condition v1.NodeConditionType, status v1.ConditionStatus, reason, message string) error { 502 node, err := n.Get(ctx, framework.TestContext.NodeName, metav1.GetOptions{}) 503 if err != nil { 504 return err 505 } 506 _, c := testutils.GetNodeCondition(&node.Status, condition) 507 if c == nil { 508 return fmt.Errorf("node condition %q not found", condition) 509 } 510 if c.Status != status || c.Reason != reason || c.Message != message { 511 return fmt.Errorf("unexpected node condition %q: %+v", condition, c) 512 } 513 return nil 514 }