github.com/verrazzano/verrazzano@v1.7.0/tools/vz/pkg/analysis/internal/util/cluster/pods.go

// Copyright (c) 2021, 2023, Oracle and/or its affiliates.
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.

// Package cluster handles cluster analysis
package cluster

import (
	"bytes"
	encjson "encoding/json"
	"fmt"
	"github.com/verrazzano/verrazzano/tools/vz/pkg/analysis/internal/util/files"
	"github.com/verrazzano/verrazzano/tools/vz/pkg/analysis/internal/util/report"
	"github.com/verrazzano/verrazzano/tools/vz/pkg/helpers"
	"go.uber.org/zap"
	"io"
	corev1 "k8s.io/api/core/v1"
	"os"
	"regexp"
	"strings"
	"sync"
	"text/template"
)

// podListMap holds podLists which have been read in already
var podListMap = make(map[string]*corev1.PodList)
var podCacheMutex = &sync.Mutex{}

var dockerPullRateLimitRe = regexp.MustCompile(`.*You have reached your pull rate limit.*`)
var dockerNameUnknownRe = regexp.MustCompile(`.*name unknown.*`)
var dockerNotFoundRe = regexp.MustCompile(`.*not found.*`)
var dockerServiceUnavailableRe = regexp.MustCompile(`.*Service Unavailable.*`)

// TODO: "Verrazzano Uninstall Pod Issue": AnalyzeVerrazzanoUninstallIssue,
var podAnalysisFunctions = map[string]func(log *zap.SugaredLogger, directory string, podFile string, pod corev1.Pod, issueReporter *report.IssueReporter) (err error){
	"Pod Container Related Issues":        podContainerIssues,
	"Pod Status Condition Related Issues": podStatusConditionIssues,
}

// AnalyzePodIssues analyzes pod issues. It starts by scanning for problem pod phases
// in the cluster and drills down from there.
func AnalyzePodIssues(log *zap.SugaredLogger, clusterRoot string) (err error) {
	log.Debugf("PodIssues called for %s", clusterRoot)

	// Do a quick scan to find pods.json files which have Pods that are not in a good state
	podFiles, err := findProblematicPodFiles(log, clusterRoot)
	if err != nil {
		return err
	}
	totalFound := 0
	for _, podFile := range podFiles {
		found, err := analyzePods(log, clusterRoot, podFile)
		totalFound += found
		if err != nil {
			log.Errorf("Failed to analyze pods for cluster: %s, pods: %s: %v", clusterRoot, podFile, err)
			return err
		}
	}

	// If no issues were reported but there were problem pods, we need to beef up the detection,
	// so report an issue (high confidence, low impact)
	if totalFound == 0 && len(podFiles) > 0 {
		reportProblemPodsNoIssues(log, clusterRoot, podFiles)
	}

	return nil
}

func analyzePods(log *zap.SugaredLogger, clusterRoot string, podFile string) (reported int, err error) {
	log.Debugf("analyzePods called with %s", podFile)
	podList, err := GetPodList(log, podFile)
	if err != nil {
		log.Debugf("Failed to get the PodList for %s: %v", podFile, err)
		return 0, err
	}
	if podList == nil {
		log.Debugf("No PodList was returned, skipping")
		return 0, nil
	}

	var issueReporter = report.IssueReporter{
		PendingIssues: make(map[string]report.Issue),
	}
	for _, pod := range podList.Items {
		if !IsPodProblematic(pod) {
			continue
		}

		// Call the pod analysis functions
		for functionName, function := range podAnalysisFunctions {
			err := function(log, clusterRoot, podFile, pod, &issueReporter)
			if err != nil {
				// Log the error and continue on
				log.Errorf("Error processing analysis function %s: %v", functionName, err)
			}
		}
	}

	reported = len(issueReporter.PendingIssues)
	issueReporter.Contribute(log, clusterRoot)
	return reported, nil
}

// IsPodReadyOrCompleted returns true only if all of the Pod's containers are either Ready or Completed
// TODO: Extend for transition time correlation (ie: change from bool to struct)
func IsPodReadyOrCompleted(podStatus corev1.PodStatus) bool {
	for _, containerStatus := range podStatus.ContainerStatuses {
		state := containerStatus.State
		if state.Terminated != nil && state.Terminated.Reason != "Completed" {
			return false
		}
		if state.Running != nil && !containerStatus.Ready {
			return false
		}
		if state.Waiting != nil {
			return false
		}
	}
	return true
}

// This is evolving as we add more cases in podContainerIssues.
//
// One thing that switching to this drill-down model makes harder is tracking overarching,
// related issues. An IssueReporter is passed along and consolidates the same KnownIssue
// types to help with similar issues.
//
// Note that the current analysis only uses the IssueReporter, but analysis code is free to use
// the NewKnown* helpers or to form fully custom issues and contribute them directly via the
// report.Contribute* helpers.
func podContainerIssues(log *zap.SugaredLogger, clusterRoot string, podFile string, pod corev1.Pod, issueReporter *report.IssueReporter) (err error) {
	log.Debugf("podContainerIssues analysis called for cluster: %s, ns: %s, pod: %s", clusterRoot, pod.ObjectMeta.Namespace, pod.ObjectMeta.Name)
	podEvents, err := GetEventsRelatedToPod(log, clusterRoot, pod, nil)
	if err != nil {
		log.Debugf("Failed to get events related to ns: %s, pod: %s", pod.ObjectMeta.Namespace, pod.ObjectMeta.Name)
	}
	// TODO: We can get duplicated event drilldown messages if the initContainers and containers are both impacted similarly.
	// Since we contribute to the IssueReporter, de-duplication may be handled under the covers, allowing discrete
	// analysis to be handled in various ways, though the approach here could also be rethought to reduce the need.
	if len(pod.Status.InitContainerStatuses) > 0 {
		for _, initContainerStatus := range pod.Status.InitContainerStatuses {
			if initContainerStatus.State.Waiting != nil {
				if initContainerStatus.State.Waiting.Reason == "ImagePullBackOff" {
					handleImagePullBackOff(log, clusterRoot, podFile, pod, podEvents, initContainerStatus.Image,
						fmt.Sprintf("Namespace %s, Pod %s, InitContainer %s, Message %s",
							pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, initContainerStatus.Name, initContainerStatus.State.Waiting.Message),
						issueReporter)
				}
			}
		}
	}
	if len(pod.Status.ContainerStatuses) > 0 {
		for _, containerStatus := range pod.Status.ContainerStatuses {
			if containerStatus.State.Waiting != nil {
				if containerStatus.State.Waiting.Reason == "ImagePullBackOff" {
					handleImagePullBackOff(log, clusterRoot, podFile, pod, podEvents, containerStatus.Image,
						fmt.Sprintf("Namespace %s, Pod %s, Container %s, Message %s",
							pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, containerStatus.Name, containerStatus.State.Waiting.Message),
						issueReporter)
				}
			}
		}
	}
	return nil
}

func handleImagePullBackOff(log *zap.SugaredLogger, clusterRoot string, podFile string, pod corev1.Pod, podEvents []corev1.Event,
	image string, initialMessage string, issueReporter *report.IssueReporter) {
	messages := make(StringSlice, 1)
	messages[0] = initialMessage
	messages.addMessages(drillIntoEventsForImagePullIssue(log, pod, image, podEvents))

	var files []string
	if helpers.GetIsLiveCluster() {
		files = []string{report.GetRelatedPodMessage(pod.ObjectMeta.Name, pod.ObjectMeta.Namespace)}
	} else {
		files = []string{podFile}
	}

	reported := 0
	for _, message := range messages {
		if dockerPullRateLimitRe.MatchString(message) {
			issueReporter.AddKnownIssueMessagesFiles(
				report.ImagePullRateLimit,
				clusterRoot,
				messages,
				files,
			)
			reported++
		} else if dockerServiceUnavailableRe.MatchString(message) {
			issueReporter.AddKnownIssueMessagesFiles(
				report.ImagePullService,
				clusterRoot,
				messages,
				files,
			)
			reported++
		} else if dockerNameUnknownRe.MatchString(message) || dockerNotFoundRe.MatchString(message) {
			issueReporter.AddKnownIssueMessagesFiles(
				report.ImagePullNotFound,
				clusterRoot,
				messages,
				files,
			)
			reported++
		}
	}

	// If we didn't detect more specific issues here, fall back to the general ImagePullBackOff
	if reported == 0 {
		issueReporter.AddKnownIssueMessagesFiles(
			report.ImagePullBackOff,
			clusterRoot,
			messages,
			files,
		)
	}
}

func registerIssues(messages map[string][]string, files []string, clusterRoot string, issueReporter *report.IssueReporter) {
	if len(messages[report.InsufficientMemory]) > 0 {
		issueReporter.AddKnownIssueMessagesFiles(report.InsufficientMemory, clusterRoot, messages[report.InsufficientMemory], files)
	}
	if len(messages[report.InsufficientCPU]) > 0 {
		issueReporter.AddKnownIssueMessagesFiles(report.InsufficientCPU, clusterRoot, messages[report.InsufficientCPU], files)
	}
}

func podStatusConditionIssues(log *zap.SugaredLogger, clusterRoot string, podFile string, pod corev1.Pod, issueReporter *report.IssueReporter) (err error) {
	log.Debugf("Memory or CPU Issues called for %s", clusterRoot)

	if len(pod.Status.Conditions) > 0 {
		messages := make(map[string][]string)
		for _, condition := range pod.Status.Conditions {
			if strings.Contains(condition.Message, "Insufficient memory") {
				messages[report.InsufficientMemory] = append(messages[report.InsufficientMemory], fmt.Sprintf("Namespace %s, Pod %s, Status %s, Reason %s, Message %s",
					pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, condition.Status, condition.Reason, condition.Message))
			}
			if strings.Contains(condition.Message, "Insufficient cpu") {
				messages[report.InsufficientCPU] = append(messages[report.InsufficientCPU], fmt.Sprintf("Namespace %s, Pod %s, Status %s, Reason %s, Message %s",
					pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, condition.Status, condition.Reason, condition.Message))
			}
		}
		if len(messages) > 0 {
			files := []string{podFile}
			if helpers.GetIsLiveCluster() {
				files = []string{report.GetRelatedPodMessage(pod.ObjectMeta.Name, pod.ObjectMeta.Namespace)}
			}
			registerIssues(messages, files, clusterRoot, issueReporter)
		}
	}
	return nil
}

// StringSlice is a string slice
type StringSlice []string

func (messages *StringSlice) addMessages(newMessages []string, err error) (errOut error) {
	if err != nil {
		errOut = err
		return errOut
	}
	if len(newMessages) > 0 {
		*messages = append(*messages, newMessages...)
	}
	return nil
}

// This is WIP. Initially it reports more specific cause info (auth, timeout, etc...) in the messages, but we may
// want different known issue types rather than reporting causes in messages, since the runbooks to consult may
// differ, so this is evolving...
func drillIntoEventsForImagePullIssue(log *zap.SugaredLogger, pod corev1.Pod, imageName string, eventList []corev1.Event) (messages []string, err error) {
	// This is handed the events associated with a Pod whose containers/initContainers had image pull issues.
	// It looks at what events are found; these may glean more info on the specific cause and help narrow it further.
	for _, event := range eventList {
		// TODO: Discern whether the event is relevant or not. We likely need more info supplied in to correlate
		// whether the event really is related to the issue being drilled into; for now we just dump out what is there.
		// Hoping this can be a more general drilldown rather than something specific to ImagePulls, ie: drill into
		// events related to a Pod/container issue.
		// We may also want a "Reason" concept here: the issue is an image pull, and if we can discern more about why
		// it happened we can return a Reason that the action handling can use to focus on the correct steps, instead
		// of an entirely separate issue type (or just a more specific issue type). This needs thought as we are
		// setting the basic patterns here, and it will affect how we handle runbooks as well.
		log.Debugf("Drilldown event reason: %s, message: %s\n", event.Reason, event.Message)
		if event.Reason == "Failed" && strings.Contains(event.Message, imageName) {
			// TBD: We need a better mechanism at this level than just messages; it could contribute other types of
			// supporting data here as well (related files, etc...)
			messages = append(messages, event.Message)
		}
	}
	return messages, nil
}

// GetPodList returns the PodList read from the given file, caching the result
func GetPodList(log *zap.SugaredLogger, path string) (podList *corev1.PodList, err error) {
	// Check the cache first
	podList = getPodListIfPresent(path)
	if podList != nil {
		log.Debugf("Returning cached podList for %s", path)
		return podList, nil
	}

	// Not found in the cache, get it from the file
	file, err := os.Open(path)
	if err != nil {
		log.Debugf("file %s not found", path)
		return nil, err
	}
	defer file.Close()

	fileBytes, err := io.ReadAll(file)
	if err != nil {
		log.Debugf("Failed reading JSON file %s", path)
		return nil, err
	}
	err = encjson.Unmarshal(fileBytes, &podList)
	if err != nil {
		log.Debugf("Failed to unmarshal podList at %s", path)
		return nil, err
	}
	putPodListIfNotPresent(path, podList)
	return podList, nil
}

// IsPodProblematic returns a boolean indicating whether a pod is deemed problematic or not
func IsPodProblematic(pod corev1.Pod) bool {
	// If it isn't Running or Succeeded it is problematic
	if pod.Status.Phase == corev1.PodRunning ||
		pod.Status.Phase == corev1.PodSucceeded {
		// The Pod indicates it is Running/Succeeded, check if there are containers that are not ready
		return !IsPodReadyOrCompleted(pod.Status)
	}
	return true
}

// IsPodPending returns a boolean indicating whether a pod is pending or not
func IsPodPending(pod corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodPending
}

// findProblematicPodFiles looks at the pods.json files in the cluster and returns the list of files
// that contain pods which are not Running or Succeeded.
func findProblematicPodFiles(log *zap.SugaredLogger, clusterRoot string) (podFiles []string, err error) {
	allPodFiles, err := files.GetMatchingFiles(log, clusterRoot, PodFilesMatchRe)
	if err != nil {
		return podFiles, err
	}

	if len(allPodFiles) == 0 {
		return podFiles, nil
	}
	podFiles = make([]string, 0, len(allPodFiles))
	for _, podFile := range allPodFiles {
		log.Debugf("Looking at pod file for problematic pods: %s", podFile)
		podList, err := GetPodList(log, podFile)
		if err != nil {
			log.Debugf("Failed to get the PodList for %s, skipping: %v", podFile, err)
			continue
		}
		if podList == nil {
			log.Debugf("No PodList was returned, skipping")
			continue
		}

		// If we find any problematic pod we flag the file as having problematic pods and move on to the next file;
		// this is just a quick scan to identify which files to drill into
		for _, pod := range podList.Items {
			if !IsPodProblematic(pod) {
				continue
			}
			log.Debugf("Problematic pods detected: %s", podFile)
			podFiles = append(podFiles, podFile)
			break
		}
	}
	return podFiles, nil
}

func reportProblemPodsNoIssues(log *zap.SugaredLogger, clusterRoot string, podFiles []string) {
	messages := make([]string, 0, len(podFiles))
	matches := make([]files.TextMatch, 0, len(podFiles))
	problematicNotPending := 0
	pendingPodsSeen := 0
	for _, podFile := range podFiles {
		podList, err := GetPodList(log, podFile)
		if err != nil {
			log.Debugf("Failed to get the PodList for %s: %v", podFile, err)
			continue
		}
		if podList == nil {
			log.Debugf("No PodList was returned, skipping")
			continue
		}
		for _, pod := range podList.Items {
			if !IsPodProblematic(pod) {
				continue
			}
			if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodUnknown {
				problematicNotPending++
				if len(pod.Status.Reason) > 0 || len(pod.Status.Message) > 0 {
					messages = append(messages, fmt.Sprintf("Namespace %s, Pod %s, Phase %s, Reason %s, Message %s",
						pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, pod.Status.Phase, pod.Status.Reason, pod.Status.Message))
				}
			} else if pod.Status.Phase == corev1.PodPending {
				pendingPodsSeen++
			}

			for _, condition := range pod.Status.Conditions {
				message, err := podConditionMessage(pod.Name, pod.Namespace, condition)
				if err != nil {
					log.Debugf("Failed to create pod condition message: %v", err)
					continue
				}
				messages = append(messages, message)
			}
			// TODO: Time correlation for search

			fileName := files.FindPodLogFileName(clusterRoot, pod)
			matched, err := files.SearchFile(log, fileName, WideErrorSearchRe, nil)
			if err != nil {
				log.Debugf("Failed to search the logfile %s for the ns/pod %s/%s: %v",
					files.FindPodLogFileName(clusterRoot, pod), pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, err)
			}
			if len(matched) > 0 {
				for _, m := range matched {
					if helpers.GetIsLiveCluster() {
						m.FileName = report.GetRelatedLogFromPodMessage(fileName)
					}
					matches = append(matches, m)
				}
			}
		}
	}
	messages = checkIfHelmPodsAdded(messages)
	supportingData := make([]report.SupportData, 1)
	supportingData[0] = report.SupportData{
		Messages:    messages,
		TextMatches: matches,
	}
	// If all of the problematic pods were only Pending, just report that; otherwise, if some are
	// Failed or Unknown, report them as problematic
	if len(messages) > 0 {
		if pendingPodsSeen > 0 && problematicNotPending == 0 {
			report.ContributeIssue(log, report.NewKnownIssueSupportingData(report.PendingPods, clusterRoot, supportingData))
		} else {
			report.ContributeIssue(log, report.NewKnownIssueSupportingData(report.PodProblemsNotReported, clusterRoot, supportingData))
		}
	}
}

// checkIfHelmPodsAdded assesses whether there is an issue with the helm-operation and rancher pods in the
// cattle-system namespace. If helm-operation pods show problems but no rancher pod problems were reported,
// the helm-operation issues are dropped: their messages are removed from the message slice and the modified
// slice is returned.
func checkIfHelmPodsAdded(messages []string) []string {
	isHelm := false
	isRancher := false
	var indices []int
	for i, data := range messages {
		if strings.Contains(data, "Namespace cattle-system, Pod helm-operation") {
			isHelm = true
			indices = append(indices, i)
		}
		if strings.Contains(data, "Namespace cattle-system, Pod rancher") {
			isRancher = true
		}
	}
	if isHelm && !isRancher {
		for i, index := range indices {
			messages = append(messages[:(index-i)], messages[(index+1-i):]...)
		}
	}
	return messages
}

func getPodListIfPresent(path string) (podList *corev1.PodList) {
	podCacheMutex.Lock()
	podListTest := podListMap[path]
	if podListTest != nil {
		podList = podListTest
	}
	podCacheMutex.Unlock()
	return podList
}

func putPodListIfNotPresent(path string, podList *corev1.PodList) {
	podCacheMutex.Lock()
	podListInMap := podListMap[path]
	if podListInMap == nil {
		podListMap[path] = podList
	}
	podCacheMutex.Unlock()
}

const podConditionMessageTmpl = "Namespace {{.namespace}}, Pod {{.name}}{{- if .type }}, ConditionType {{.type}}{{- end}}{{- if .status}}, Status {{.status}}{{- end}}{{- if .reason}}, Reason {{.reason}}{{- end }}{{- if .message}}, Message {{.message}}{{- end }}"

func podConditionMessage(name, namespace string, condition corev1.PodCondition) (string, error) {
	vals := map[string]interface{}{}
	addKV := func(k, v string) {
		if v != "" {
			vals[k] = v
		}
	}
	addKV("message", condition.Message)
	addKV("reason", condition.Reason)
	addKV("status", string(condition.Status))
	addKV("type", string(condition.Type))
	vals["name"] = name
	vals["namespace"] = namespace
	tmpl, err := template.New("podConditionMessage").
		Parse(podConditionMessageTmpl)
	if err != nil {
		return "", err
	}

	buffer := &bytes.Buffer{}
	if err = tmpl.Execute(buffer, vals); err != nil {
		return "", err
	}
	return buffer.String(), nil
}
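For reference, a minimal in-package test sketch showing how podConditionMessage renders a single condition, which is the same rendering reportProblemPodsNoIssues performs for each problematic pod. The file name (pods_example_test.go), the pod/namespace names, and the condition values are illustrative assumptions, not part of the repository:

// pods_example_test.go (hypothetical, same package so it can reach the unexported helper)
package cluster

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
)

// TestPodConditionMessageSketch renders one pod condition and checks the expected message shape.
func TestPodConditionMessageSketch(t *testing.T) {
	// Example condition values; any real cluster capture would supply these from pods.json
	condition := corev1.PodCondition{
		Type:    corev1.PodReady,
		Status:  corev1.ConditionFalse,
		Reason:  "ContainersNotReady",
		Message: "containers with unready status: [example]",
	}
	msg, err := podConditionMessage("example-pod", "verrazzano-system", condition)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	want := "Namespace verrazzano-system, Pod example-pod, ConditionType Ready, Status False, " +
		"Reason ContainersNotReady, Message containers with unready status: [example]"
	if msg != want {
		t.Errorf("got %q, want %q", msg, want)
	}
}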