k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/node/node_problem_detector.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package node

import (
	"context"
	"fmt"
	"net"
	"sort"
	"strconv"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/kubernetes/test/e2e/framework"
	e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
	"k8s.io/kubernetes/test/e2e/nodefeature"
	testutils "k8s.io/kubernetes/test/utils"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

// This test checks that node-problem-detector (NPD) runs without error on up
// to 10 nodes in the cluster. NPD's functionality is tested in e2e_node tests.
var _ = SIGDescribe("NodeProblemDetector", nodefeature.NodeProblemDetector, func() {
	const (
		pollInterval      = 1 * time.Second
		pollTimeout       = 1 * time.Minute
		maxNodesToProcess = 10
	)
	f := framework.NewDefaultFramework("node-problem-detector")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.BeforeEach(func(ctx context.Context) {
		e2eskipper.SkipUnlessSSHKeyPresent()
		e2eskipper.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
		e2eskipper.SkipUnlessProviderIs("gce", "gke")
		e2eskipper.SkipUnlessNodeOSDistroIs("gci", "ubuntu")
		e2enode.WaitForTotalHealthy(ctx, f.ClientSet, time.Minute)
	})

	ginkgo.It("should run without error", func(ctx context.Context) {
		ginkgo.By("Getting all nodes and their SSH-able IP addresses")
		readyNodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
		framework.ExpectNoError(err)

		nodes := []v1.Node{}
		hosts := []string{}
		for _, node := range readyNodes.Items {
			host := ""
			for _, addr := range node.Status.Addresses {
				if addr.Type == v1.NodeExternalIP {
					host = net.JoinHostPort(addr.Address, "22")
					break
				}
			}
			// Not every node has to have an external IP address.
			if len(host) > 0 {
				nodes = append(nodes, node)
				hosts = append(hosts, host)
			}
		}

		if len(nodes) == 0 {
			ginkgo.Skip("Skipping test due to lack of ready nodes with public IP")
		}

		if len(nodes) > maxNodesToProcess {
			nodes = nodes[:maxNodesToProcess]
			hosts = hosts[:maxNodesToProcess]
		}

		isStandaloneMode := make(map[string]bool)
		cpuUsageStats := make(map[string][]float64)
		uptimeStats := make(map[string][]float64)
		rssStats := make(map[string][]float64)
		workingSetStats := make(map[string][]float64)

		// Some test suites run for days.
		// This test is not marked as Disruptive or Serial, so we do not want to
		// restart the kubelet during the test to check for KubeletStart event
		// detection. We use a heuristic here to decide whether we need to validate the
		// KubeletStart event, since there is no easy way to check when the test actually started.
		checkForKubeletStart := false

		for _, host := range hosts {
			cpuUsageStats[host] = []float64{}
			uptimeStats[host] = []float64{}
			rssStats[host] = []float64{}
			workingSetStats[host] = []float64{}

			cmd := "systemctl status node-problem-detector.service"
			result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
			isStandaloneMode[host] = (err == nil && result.Code == 0)

			if isStandaloneMode[host] {
				ginkgo.By(fmt.Sprintf("Check node %q has node-problem-detector process", host))
				// Using brackets "[n]" is a trick to prevent the grep command itself from
				// showing up, because the literal text "[n]ode-problem-detector" does not
				// match the regular expression "[n]ode-problem-detector".
				psCmd := "ps aux | grep [n]ode-problem-detector"
				result, err = e2essh.SSH(ctx, psCmd, host, framework.TestContext.Provider)
				framework.ExpectNoError(err)
				gomega.Expect(result.Code).To(gomega.Equal(0))
				gomega.Expect(result.Stdout).To(gomega.ContainSubstring("node-problem-detector"))

				ginkgo.By(fmt.Sprintf("Check node-problem-detector is running fine on node %q", host))
				journalctlCmd := "sudo journalctl -r -u node-problem-detector"
				result, err = e2essh.SSH(ctx, journalctlCmd, host, framework.TestContext.Provider)
				framework.ExpectNoError(err)
				gomega.Expect(result.Code).To(gomega.Equal(0))
				gomega.Expect(result.Stdout).NotTo(gomega.ContainSubstring("node-problem-detector.service: Failed"))

				// We will only check for the KubeletStart event if parsing the date here succeeded.
				ginkgo.By(fmt.Sprintf("Check when node-problem-detector started on node %q", host))
				npdStartTimeCommand := "sudo systemctl show --timestamp=utc node-problem-detector -P ActiveEnterTimestamp"
				result, err = e2essh.SSH(ctx, npdStartTimeCommand, host, framework.TestContext.Provider)
				framework.ExpectNoError(err)
				gomega.Expect(result.Code).To(gomega.Equal(0))

				// The time format matches the systemd 'utc' format:
				// 'Day YYYY-MM-DD HH:MM:SS UTC' (see https://www.freedesktop.org/software/systemd/man/systemd.time.html)
				st, err := time.Parse("Mon 2006-01-02 15:04:05 MST", result.Stdout)
				if err != nil {
					framework.Logf("Failed to parse when NPD started. Got exit code: %v and stdout: %v, error: %v. Will skip check for kubelet start event.", result.Code, result.Stdout, err)
				} else {
					checkForKubeletStart = time.Since(st) < time.Hour
				}

				cpuUsage, uptime := getCPUStat(ctx, f, host)
				cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
				uptimeStats[host] = append(uptimeStats[host], uptime)

			}
			ginkgo.By(fmt.Sprintf("Inject log to trigger DockerHung on node %q", host))
			log := "INFO: task docker:12345 blocked for more than 120 seconds."
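			// The injected line is written to /dev/kmsg so that NPD's kernel
			// monitor (which watches the kernel log) picks it up; the test
			// expects it to surface as a "DockerHung" event and a
			// KernelDeadlock condition, both verified further below.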
			injectLogCmd := "sudo sh -c \"echo 'kernel: " + log + "' >> /dev/kmsg\""
			result, err = e2essh.SSH(ctx, injectLogCmd, host, framework.TestContext.Provider)
			framework.ExpectNoError(err)
			gomega.Expect(result.Code).To(gomega.Equal(0))
		}

		ginkgo.By("Gather node-problem-detector cpu and memory stats")
		numIterations := 60
		for i := 1; i <= numIterations; i++ {
			for j, host := range hosts {
				if isStandaloneMode[host] {
					rss, workingSet := getMemoryStat(ctx, f, host)
					rssStats[host] = append(rssStats[host], rss)
					workingSetStats[host] = append(workingSetStats[host], workingSet)
					if i == numIterations {
						cpuUsage, uptime := getCPUStat(ctx, f, host)
						cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
						uptimeStats[host] = append(uptimeStats[host], uptime)
					}
				} else {
					cpuUsage, rss, workingSet := getNpdPodStat(ctx, f, nodes[j].Name)
					cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
					rssStats[host] = append(rssStats[host], rss)
					workingSetStats[host] = append(workingSetStats[host], workingSet)
				}
			}
			time.Sleep(time.Second)
		}

		cpuStatsMsg := "CPU (core):"
		rssStatsMsg := "RSS (MB):"
		workingSetStatsMsg := "WorkingSet (MB):"
		for i, host := range hosts {
			if isStandaloneMode[host] {
				// When in standalone mode, NPD is running as a systemd service. We
				// calculate its cpu usage from cgroup cpuacct value differences.
				cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0]
				totaltime := uptimeStats[host][1] - uptimeStats[host][0]
				cpuStatsMsg += fmt.Sprintf(" %s[%.3f];", nodes[i].Name, cpuUsage/totaltime)
			} else {
				sort.Float64s(cpuUsageStats[host])
				cpuStatsMsg += fmt.Sprintf(" %s[%.3f|%.3f|%.3f];", nodes[i].Name,
					cpuUsageStats[host][0], cpuUsageStats[host][len(cpuUsageStats[host])/2], cpuUsageStats[host][len(cpuUsageStats[host])-1])
			}

			sort.Float64s(rssStats[host])
			rssStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes[i].Name,
				rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1])

			sort.Float64s(workingSetStats[host])
			workingSetStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes[i].Name,
				workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1])
		}
		framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg)

		ginkgo.By("Check node-problem-detector can post conditions and events to API server")
		for _, node := range nodes {
			ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name))
			gomega.Eventually(ctx, func() error {
				return verifyNodeCondition(ctx, f, "KernelDeadlock", v1.ConditionTrue, "DockerHung", node.Name)
			}, pollTimeout, pollInterval).Should(gomega.Succeed())

			ginkgo.By(fmt.Sprintf("Check node-problem-detector posted DockerHung event on node %q", node.Name))
			eventListOptions := metav1.ListOptions{FieldSelector: fields.Set{"involvedObject.kind": "Node"}.AsSelector().String()}
			gomega.Eventually(ctx, func(ctx context.Context) error {
				return verifyEvents(ctx, f, eventListOptions, 1, "DockerHung", node.Name)
			}, pollTimeout, pollInterval).Should(gomega.Succeed())

			if checkForKubeletStart {
				// Node problem detector reports kubelet start events automatically starting from NPD v0.7.0+.
				// Since the kubelet may be restarted a few times after the node is booted, we just
				// check that the event is detected, but do not check how many times the kubelet was started.
				//
				// Some test suites run for hours, and the KubeletStart event will already have been cleaned up.
				ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KubeletStart event on node %q", node.Name))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyEventExists(ctx, f, eventListOptions, "KubeletStart", node.Name)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
			} else {
				ginkgo.By("KubeletStart event will NOT be checked")
			}
		}

	})
})

// verifyEvents checks that the events with the given reason reported from the
// given node add up to exactly num occurrences.
func verifyEvents(ctx context.Context, f *framework.Framework, options metav1.ListOptions, num int, reason, nodeName string) error {
	events, err := f.ClientSet.CoreV1().Events(metav1.NamespaceDefault).List(ctx, options)
	if err != nil {
		return err
	}
	count := 0
	for _, event := range events.Items {
		if event.Reason != reason || event.Source.Host != nodeName {
			continue
		}
		count += int(event.Count)
	}
	if count != num {
		return fmt.Errorf("expect event number %d, got %d: %v", num, count, events.Items)
	}
	return nil
}

// verifyEventExists checks that at least one event with the given reason was
// reported from the given node.
func verifyEventExists(ctx context.Context, f *framework.Framework, options metav1.ListOptions, reason, nodeName string) error {
	events, err := f.ClientSet.CoreV1().Events(metav1.NamespaceDefault).List(ctx, options)
	if err != nil {
		return err
	}
	for _, event := range events.Items {
		if event.Reason == reason && event.Source.Host == nodeName && event.Count > 0 {
			return nil
		}
	}
	return fmt.Errorf("Event %s does not exist: %v", reason, events.Items)
}

// verifyNodeCondition checks that the named node condition exists with the
// expected status and reason.
func verifyNodeCondition(ctx context.Context, f *framework.Framework, condition v1.NodeConditionType, status v1.ConditionStatus, reason, nodeName string) error {
	node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
	if err != nil {
		return err
	}
	_, c := testutils.GetNodeCondition(&node.Status, condition)
	if c == nil {
		return fmt.Errorf("node condition %q not found", condition)
	}
	if c.Status != status || c.Reason != reason {
		return fmt.Errorf("unexpected node condition %q: %+v", condition, c)
	}
	return nil
}

// getMemoryStat returns the RSS and working-set memory (both in MB) of the
// node-problem-detector systemd service on the given host.
func getMemoryStat(ctx context.Context, f *framework.Framework, host string) (rss, workingSet float64) {
	var memCmd string

	isCgroupV2 := isHostRunningCgroupV2(ctx, f, host)
	if isCgroupV2 {
		memCmd = "cat /sys/fs/cgroup/system.slice/node-problem-detector.service/memory.current && cat /sys/fs/cgroup/system.slice/node-problem-detector.service/memory.stat"
	} else {
		memCmd = "cat /sys/fs/cgroup/memory/system.slice/node-problem-detector.service/memory.usage_in_bytes && cat /sys/fs/cgroup/memory/system.slice/node-problem-detector.service/memory.stat"
	}

	result, err := e2essh.SSH(ctx, memCmd, host, framework.TestContext.Provider)
	framework.ExpectNoError(err)
	gomega.Expect(result.Code).To(gomega.Equal(0))
	lines := strings.Split(result.Stdout, "\n")

	memoryUsage, err := strconv.ParseFloat(lines[0], 64)
	framework.ExpectNoError(err)

	var rssToken, inactiveFileToken string
	if isCgroupV2 {
		// Use anon memory for RSS, as cAdvisor does on cgroup v2;
		// see https://github.com/google/cadvisor/blob/a9858972e75642c2b1914c8d5428e33e6392c08a/container/libcontainer/handler.go#L799
		rssToken = "anon"
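		// The inactive-file counter selected here is subtracted from the raw
		// usage reading below to approximate the working set, matching how
		// cAdvisor computes it.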
		inactiveFileToken = "inactive_file"
	} else {
		rssToken = "total_rss"
		inactiveFileToken = "total_inactive_file"
	}

	var totalInactiveFile float64
	for _, line := range lines[1:] {
		tokens := strings.Split(line, " ")

		if tokens[0] == rssToken {
			rss, err = strconv.ParseFloat(tokens[1], 64)
			framework.ExpectNoError(err)
		}
		if tokens[0] == inactiveFileToken {
			totalInactiveFile, err = strconv.ParseFloat(tokens[1], 64)
			framework.ExpectNoError(err)
		}
	}

	workingSet = memoryUsage
	if workingSet < totalInactiveFile {
		workingSet = 0
	} else {
		workingSet -= totalInactiveFile
	}

	// Convert to MB
	rss = rss / 1024 / 1024
	workingSet = workingSet / 1024 / 1024
	return
}

// getCPUStat returns a cumulative CPU usage reading and the host uptime for the
// given host; the caller derives NPD's CPU usage (in cores) from the difference
// between two samples.
func getCPUStat(ctx context.Context, f *framework.Framework, host string) (usage, uptime float64) {
	var cpuCmd string
	if isHostRunningCgroupV2(ctx, f, host) {
		cpuCmd = " cat /sys/fs/cgroup/cpu.stat | grep 'usage_usec' | sed 's/[^0-9]*//g' && cat /proc/uptime | awk '{print $1}'"
	} else {
		cpuCmd = "cat /sys/fs/cgroup/cpu/system.slice/node-problem-detector.service/cpuacct.usage && cat /proc/uptime | awk '{print $1}'"
	}

	result, err := e2essh.SSH(ctx, cpuCmd, host, framework.TestContext.Provider)
	framework.ExpectNoError(err)
	gomega.Expect(result.Code).To(gomega.Equal(0))
	lines := strings.Split(result.Stdout, "\n")

	usage, err = strconv.ParseFloat(lines[0], 64)
	framework.ExpectNoError(err, "Cannot parse float for usage")
	uptime, err = strconv.ParseFloat(lines[1], 64)
	framework.ExpectNoError(err, "Cannot parse float for uptime")

	// Convert from nanoseconds to seconds
	usage *= 1e-9
	return
}

// isHostRunningCgroupV2 reports whether /sys/fs/cgroup on the given host is
// mounted as cgroup v2 (unified hierarchy).
func isHostRunningCgroupV2(ctx context.Context, f *framework.Framework, host string) bool {
	result, err := e2essh.SSH(ctx, "stat -fc %T /sys/fs/cgroup/", host, framework.TestContext.Provider)
	framework.ExpectNoError(err)
	gomega.Expect(result.Code).To(gomega.Equal(0))

	// 0x63677270 == CGROUP2_SUPER_MAGIC
	// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
	return strings.Contains(result.Stdout, "cgroup2") || strings.Contains(result.Stdout, "0x63677270")
}

// getNpdPodStat returns the CPU usage (in cores), RSS, and working set (both in
// MB) of the node-problem-detector pod on the given node, read from the
// kubelet's stats summary.
func getNpdPodStat(ctx context.Context, f *framework.Framework, nodeName string) (cpuUsage, rss, workingSet float64) {
	summary, err := e2ekubelet.GetStatsSummary(ctx, f.ClientSet, nodeName)
	framework.ExpectNoError(err)

	hasNpdPod := false
	for _, pod := range summary.Pods {
		if !strings.HasPrefix(pod.PodRef.Name, "node-problem-detector") {
			continue
		}
		if pod.CPU != nil && pod.CPU.UsageNanoCores != nil {
			cpuUsage = float64(*pod.CPU.UsageNanoCores) * 1e-9
		}
		if pod.Memory != nil {
			if pod.Memory.RSSBytes != nil {
				rss = float64(*pod.Memory.RSSBytes) / 1024 / 1024
			}
			if pod.Memory.WorkingSetBytes != nil {
				workingSet = float64(*pod.Memory.WorkingSetBytes) / 1024 / 1024
			}
		}
		hasNpdPod = true
		break
	}
	if !hasNpdPod {
		framework.Failf("No node-problem-detector pod is present in %+v", summary.Pods)
	}
	return
}