k8s.io/kubernetes@v1.29.3/test/e2e/node/node_problem_detector.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package node

import (
	"context"
	"fmt"
	"net"
	"sort"
	"strconv"
	"strings"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/kubernetes/test/e2e/framework"
	e2ekubelet "k8s.io/kubernetes/test/e2e/framework/kubelet"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
	e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
	testutils "k8s.io/kubernetes/test/utils"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

// This test checks that node-problem-detector (NPD) runs without error on up
// to 10 nodes in the cluster. NPD's functionality is tested in e2e_node tests.
var _ = SIGDescribe("NodeProblemDetector", func() {
	const (
		pollInterval      = 1 * time.Second
		pollTimeout       = 1 * time.Minute
		maxNodesToProcess = 10
	)
	f := framework.NewDefaultFramework("node-problem-detector")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	ginkgo.BeforeEach(func(ctx context.Context) {
		e2eskipper.SkipUnlessSSHKeyPresent()
		e2eskipper.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
		e2eskipper.SkipUnlessProviderIs("gce", "gke")
		e2eskipper.SkipUnlessNodeOSDistroIs("gci", "ubuntu")
		e2enode.WaitForTotalHealthy(ctx, f.ClientSet, time.Minute)
	})

	ginkgo.It("should run without error", func(ctx context.Context) {
		ginkgo.By("Getting all nodes and their SSH-able IP addresses")
		readyNodes, err := e2enode.GetReadySchedulableNodes(ctx, f.ClientSet)
		framework.ExpectNoError(err)

		nodes := []v1.Node{}
		hosts := []string{}
		for _, node := range readyNodes.Items {
			host := ""
			for _, addr := range node.Status.Addresses {
				if addr.Type == v1.NodeExternalIP {
					host = net.JoinHostPort(addr.Address, "22")
					break
				}
			}
			// Not every node has to have an external IP address.
			if len(host) > 0 {
				nodes = append(nodes, node)
				hosts = append(hosts, host)
			}
		}

		if len(nodes) == 0 {
			ginkgo.Skip("Skipping test due to lack of ready nodes with public IP")
		}

		if len(nodes) > maxNodesToProcess {
			nodes = nodes[:maxNodesToProcess]
			hosts = hosts[:maxNodesToProcess]
		}

		isStandaloneMode := make(map[string]bool)
		cpuUsageStats := make(map[string][]float64)
		uptimeStats := make(map[string][]float64)
		rssStats := make(map[string][]float64)
		workingSetStats := make(map[string][]float64)

		// Some test suites run for days. This test is not marked as Disruptive
		// or Serial, so we do not want to restart the kubelet during the test
		// to check for KubeletStart event detection. We use a heuristic here to
		// decide whether to validate the KubeletStart event, since there is no
		// easy way to check when the test actually started.
		checkForKubeletStart := false

		for _, host := range hosts {
			cpuUsageStats[host] = []float64{}
			uptimeStats[host] = []float64{}
			rssStats[host] = []float64{}
			workingSetStats[host] = []float64{}

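			// NPD can be deployed either as a systemd service ("standalone
			// mode") or as a DaemonSet pod. Probe systemd over SSH to tell
			// which mode this node uses: a zero exit code from systemctl
			// means the node-problem-detector unit exists and is active.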
			cmd := "systemctl status node-problem-detector.service"
			result, err := e2essh.SSH(ctx, cmd, host, framework.TestContext.Provider)
			isStandaloneMode[host] = (err == nil && result.Code == 0)

			if isStandaloneMode[host] {
				ginkgo.By(fmt.Sprintf("Check node %q has node-problem-detector process", host))
				// Using the bracket expression "[n]" is a trick to keep the
				// grep command itself out of the results: the literal text
				// "[n]ode-problem-detector" on grep's own command line does
				// not match the regular expression "[n]ode-problem-detector".
				psCmd := "ps aux | grep [n]ode-problem-detector"
				result, err = e2essh.SSH(ctx, psCmd, host, framework.TestContext.Provider)
				framework.ExpectNoError(err)
				gomega.Expect(result.Code).To(gomega.Equal(0))
				gomega.Expect(result.Stdout).To(gomega.ContainSubstring("node-problem-detector"))

				ginkgo.By(fmt.Sprintf("Check node-problem-detector is running fine on node %q", host))
				journalctlCmd := "sudo journalctl -r -u node-problem-detector"
				result, err = e2essh.SSH(ctx, journalctlCmd, host, framework.TestContext.Provider)
				framework.ExpectNoError(err)
				gomega.Expect(result.Code).To(gomega.Equal(0))
				gomega.Expect(result.Stdout).NotTo(gomega.ContainSubstring("node-problem-detector.service: Failed"))

				// We will only check for the KubeletStart event if parsing of
				// the NPD start time below succeeds.
				ginkgo.By(fmt.Sprintf("Check when node-problem-detector started on node %q", host))
				npdStartTimeCommand := "sudo systemctl show --timestamp=utc node-problem-detector -P ActiveEnterTimestamp"
				result, err = e2essh.SSH(ctx, npdStartTimeCommand, host, framework.TestContext.Provider)
				framework.ExpectNoError(err)
				gomega.Expect(result.Code).To(gomega.Equal(0))

				// The layout matches systemd's 'utc' timestamp style:
				// 'Day YYYY-MM-DD HH:MM:SS UTC'
				// (see https://www.freedesktop.org/software/systemd/man/systemd.time.html).
				st, err := time.Parse("Mon 2006-01-02 15:04:05 MST", strings.TrimSpace(result.Stdout))
				if err != nil {
					framework.Logf("Failed to parse when NPD started. Got exit code: %v and stdout: %v, error: %v. Will skip check for kubelet start event.", result.Code, result.Stdout, err)
				} else {
					checkForKubeletStart = time.Since(st) < time.Hour
				}

				cpuUsage, uptime := getCPUStat(ctx, f, host)
				cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
				uptimeStats[host] = append(uptimeStats[host], uptime)
			}
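
			// NPD's default kernel monitor configuration raises the
			// KernelDeadlock condition with reason DockerHung when it sees a
			// hung-task message like the one below in the kernel log. Writing
			// a "kernel:"-prefixed line to /dev/kmsg simulates that message
			// without actually hanging anything.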
			ginkgo.By(fmt.Sprintf("Inject log to trigger DockerHung on node %q", host))
			log := "INFO: task docker:12345 blocked for more than 120 seconds."
			injectLogCmd := "sudo sh -c \"echo 'kernel: " + log + "' >> /dev/kmsg\""
			result, err = e2essh.SSH(ctx, injectLogCmd, host, framework.TestContext.Provider)
			framework.ExpectNoError(err)
			gomega.Expect(result.Code).To(gomega.Equal(0))
		}

		ginkgo.By("Check node-problem-detector can post conditions and events to API server")
		for _, node := range nodes {
			ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name))
			gomega.Eventually(ctx, func() error {
				return verifyNodeCondition(ctx, f, "KernelDeadlock", v1.ConditionTrue, "DockerHung", node.Name)
			}, pollTimeout, pollInterval).Should(gomega.Succeed())

			ginkgo.By(fmt.Sprintf("Check node-problem-detector posted DockerHung event on node %q", node.Name))
			eventListOptions := metav1.ListOptions{FieldSelector: fields.Set{"involvedObject.kind": "Node"}.AsSelector().String()}
			gomega.Eventually(ctx, func(ctx context.Context) error {
				return verifyEvents(ctx, f, eventListOptions, 1, "DockerHung", node.Name)
			}, pollTimeout, pollInterval).Should(gomega.Succeed())

			if checkForKubeletStart {
				// Node-problem-detector reports kubelet start events
				// automatically starting with NPD v0.7.0. Since the kubelet
				// may be restarted a few times after the node is booted, we
				// just check that the event is detected, not how many times
				// the kubelet was started.
				//
				// Some test suites run for hours, in which case the
				// KubeletStart event will already have been cleaned up.
				ginkgo.By(fmt.Sprintf("Check node-problem-detector posted KubeletStart event on node %q", node.Name))
				gomega.Eventually(ctx, func(ctx context.Context) error {
					return verifyEventExists(ctx, f, eventListOptions, "KubeletStart", node.Name)
				}, pollTimeout, pollInterval).Should(gomega.Succeed())
			} else {
				ginkgo.By("KubeletStart event will NOT be checked")
			}
		}

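		// Sampling strategy: memory (RSS and working set) is polled once per
		// second for numIterations samples. The cgroup CPU counter is
		// cumulative, so in standalone mode only a first and a last sample
		// are needed; average CPU usage is the counter delta divided by the
		// uptime delta.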
		ginkgo.By("Gather node-problem-detector cpu and memory stats")
		numIterations := 60
		for i := 1; i <= numIterations; i++ {
			for j, host := range hosts {
				if isStandaloneMode[host] {
					rss, workingSet := getMemoryStat(ctx, f, host)
					rssStats[host] = append(rssStats[host], rss)
					workingSetStats[host] = append(workingSetStats[host], workingSet)
					if i == numIterations {
						cpuUsage, uptime := getCPUStat(ctx, f, host)
						cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
						uptimeStats[host] = append(uptimeStats[host], uptime)
					}
				} else {
					cpuUsage, rss, workingSet := getNpdPodStat(ctx, f, nodes[j].Name)
					cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
					rssStats[host] = append(rssStats[host], rss)
					workingSetStats[host] = append(workingSetStats[host], workingSet)
				}
			}
			time.Sleep(time.Second)
		}

		cpuStatsMsg := "CPU (core):"
		rssStatsMsg := "RSS (MB):"
		workingSetStatsMsg := "WorkingSet (MB):"
		for i, host := range hosts {
			if isStandaloneMode[host] {
				// In standalone mode, NPD runs as a systemd service. We
				// calculate its average CPU usage from the difference between
				// the two cgroup CPU-accounting samples.
				cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0]
				totaltime := uptimeStats[host][1] - uptimeStats[host][0]
				cpuStatsMsg += fmt.Sprintf(" %s[%.3f];", nodes[i].Name, cpuUsage/totaltime)
			} else {
				// In DaemonSet mode, report min|median|max of the samples.
				sort.Float64s(cpuUsageStats[host])
				cpuStatsMsg += fmt.Sprintf(" %s[%.3f|%.3f|%.3f];", nodes[i].Name,
					cpuUsageStats[host][0], cpuUsageStats[host][len(cpuUsageStats[host])/2], cpuUsageStats[host][len(cpuUsageStats[host])-1])
			}

			sort.Float64s(rssStats[host])
			rssStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes[i].Name,
				rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1])

			sort.Float64s(workingSetStats[host])
			workingSetStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes[i].Name,
				workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1])
		}
		framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg)
	})
})

func verifyEvents(ctx context.Context, f *framework.Framework, options metav1.ListOptions, num int, reason, nodeName string) error {
	events, err := f.ClientSet.CoreV1().Events(metav1.NamespaceDefault).List(ctx, options)
	if err != nil {
		return err
	}
	// Repeated events are deduplicated into a single Event with Count > 1, so
	// sum the Count of every matching event instead of counting list entries.
	count := 0
	for _, event := range events.Items {
		if event.Reason != reason || event.Source.Host != nodeName {
			continue
		}
		count += int(event.Count)
	}
	if count != num {
		return fmt.Errorf("expect event number %d, got %d: %v", num, count, events.Items)
	}
	return nil
}

func verifyEventExists(ctx context.Context, f *framework.Framework, options metav1.ListOptions, reason, nodeName string) error {
	events, err := f.ClientSet.CoreV1().Events(metav1.NamespaceDefault).List(ctx, options)
	if err != nil {
		return err
	}
	for _, event := range events.Items {
		if event.Reason == reason && event.Source.Host == nodeName && event.Count > 0 {
			return nil
		}
	}
	return fmt.Errorf("event %s does not exist: %v", reason, events.Items)
}

func verifyNodeCondition(ctx context.Context, f *framework.Framework, condition v1.NodeConditionType, status v1.ConditionStatus, reason, nodeName string) error {
	node, err := f.ClientSet.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
	if err != nil {
		return err
	}
	_, c := testutils.GetNodeCondition(&node.Status, condition)
	if c == nil {
		return fmt.Errorf("node condition %q not found", condition)
	}
	if c.Status != status || c.Reason != reason {
		return fmt.Errorf("unexpected node condition %q: %+v", condition, c)
	}
	return nil
}

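// getMemoryStat returns the RSS and working set of the node-problem-detector
// systemd service in MB, read from its memory cgroup. Working set is computed
// the way cAdvisor computes it: total usage minus inactive file-backed pages,
// floored at zero.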
func getMemoryStat(ctx context.Context, f *framework.Framework, host string) (rss, workingSet float64) {
	var memCmd string

	isCgroupV2 := isHostRunningCgroupV2(ctx, f, host)
	if isCgroupV2 {
		memCmd = "cat /sys/fs/cgroup/system.slice/node-problem-detector.service/memory.current && cat /sys/fs/cgroup/system.slice/node-problem-detector.service/memory.stat"
	} else {
		memCmd = "cat /sys/fs/cgroup/memory/system.slice/node-problem-detector.service/memory.usage_in_bytes && cat /sys/fs/cgroup/memory/system.slice/node-problem-detector.service/memory.stat"
	}

	result, err := e2essh.SSH(ctx, memCmd, host, framework.TestContext.Provider)
	framework.ExpectNoError(err)
	gomega.Expect(result.Code).To(gomega.Equal(0))
	lines := strings.Split(result.Stdout, "\n")

	memoryUsage, err := strconv.ParseFloat(lines[0], 64)
	framework.ExpectNoError(err)

	var rssToken, inactiveFileToken string
	if isCgroupV2 {
		// Use anonymous memory for RSS, as cAdvisor does on cgroup v2; see
		// https://github.com/google/cadvisor/blob/a9858972e75642c2b1914c8d5428e33e6392c08a/container/libcontainer/handler.go#L799
		rssToken = "anon"
		inactiveFileToken = "inactive_file"
	} else {
		rssToken = "total_rss"
		inactiveFileToken = "total_inactive_file"
	}

	var totalInactiveFile float64
	for _, line := range lines[1:] {
		tokens := strings.Split(line, " ")

		if tokens[0] == rssToken {
			rss, err = strconv.ParseFloat(tokens[1], 64)
			framework.ExpectNoError(err)
		}
		if tokens[0] == inactiveFileToken {
			totalInactiveFile, err = strconv.ParseFloat(tokens[1], 64)
			framework.ExpectNoError(err)
		}
	}

	workingSet = memoryUsage
	if workingSet < totalInactiveFile {
		workingSet = 0
	} else {
		workingSet -= totalInactiveFile
	}

	// Convert to MB.
	rss = rss / 1024 / 1024
	workingSet = workingSet / 1024 / 1024
	return
}

// getCPUStat returns the node-problem-detector service's cumulative CPU usage
// (in seconds) from its cgroup, together with the host uptime (in seconds).
func getCPUStat(ctx context.Context, f *framework.Framework, host string) (usage, uptime float64) {
	var cpuCmd string
	isCgroupV2 := isHostRunningCgroupV2(ctx, f, host)
	if isCgroupV2 {
		cpuCmd = "cat /sys/fs/cgroup/system.slice/node-problem-detector.service/cpu.stat | grep 'usage_usec' | sed 's/[^0-9]*//g' && cat /proc/uptime | awk '{print $1}'"
	} else {
		cpuCmd = "cat /sys/fs/cgroup/cpu/system.slice/node-problem-detector.service/cpuacct.usage && cat /proc/uptime | awk '{print $1}'"
	}

	result, err := e2essh.SSH(ctx, cpuCmd, host, framework.TestContext.Provider)
	framework.ExpectNoError(err)
	gomega.Expect(result.Code).To(gomega.Equal(0))
	lines := strings.Split(result.Stdout, "\n")

	usage, err = strconv.ParseFloat(lines[0], 64)
	framework.ExpectNoError(err, "Cannot parse float for usage")
	uptime, err = strconv.ParseFloat(lines[1], 64)
	framework.ExpectNoError(err, "Cannot parse float for uptime")

	// Convert to seconds: cgroup v2 usage_usec is in microseconds, while
	// cgroup v1 cpuacct.usage is in nanoseconds.
	if isCgroupV2 {
		usage *= 1e-6
	} else {
		usage *= 1e-9
	}
	return
}

func isHostRunningCgroupV2(ctx context.Context, f *framework.Framework, host string) bool {
	result, err := e2essh.SSH(ctx, "stat -fc %T /sys/fs/cgroup/", host, framework.TestContext.Provider)
	framework.ExpectNoError(err)
	gomega.Expect(result.Code).To(gomega.Equal(0))

	// 0x63677270 == CGROUP2_SUPER_MAGIC
	// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
	return strings.Contains(result.Stdout, "cgroup2") || strings.Contains(result.Stdout, "0x63677270")
}

func getNpdPodStat(ctx context.Context, f *framework.Framework, nodeName string) (cpuUsage, rss, workingSet float64) {
	summary, err := e2ekubelet.GetStatsSummary(ctx, f.ClientSet, nodeName)
	framework.ExpectNoError(err)

	hasNpdPod := false
	for _, pod := range summary.Pods {
		if !strings.HasPrefix(pod.PodRef.Name, "node-problem-detector") {
			continue
		}
		// Convert nanocores to cores and bytes to MB.
		cpuUsage = float64(*pod.CPU.UsageNanoCores) * 1e-9
		rss = float64(*pod.Memory.RSSBytes) / 1024 / 1024
		workingSet = float64(*pod.Memory.WorkingSetBytes) / 1024 / 1024
		hasNpdPod = true
		break
	}
	if !hasNpdPod {
		framework.Failf("No node-problem-detector pod is present in %+v", summary.Pods)
	}
	return
}
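
// A sketch of how to focus just this test against an existing GCE/GKE
// cluster; the exact provider flags depend on your environment:
//
//	hack/ginkgo-e2e.sh --ginkgo.focus="NodeProblemDetector"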