github.com/verrazzano/verrazzano@v1.7.0/tools/vz/pkg/analysis/internal/util/cluster/pods.go

// Copyright (c) 2021, 2023, Oracle and/or its affiliates.
// Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.

// Package cluster handles cluster analysis
package cluster

import (
	"bytes"
	encjson "encoding/json"
	"fmt"
	"github.com/verrazzano/verrazzano/tools/vz/pkg/analysis/internal/util/files"
	"github.com/verrazzano/verrazzano/tools/vz/pkg/analysis/internal/util/report"
	"github.com/verrazzano/verrazzano/tools/vz/pkg/helpers"
	"go.uber.org/zap"
	"io"
	corev1 "k8s.io/api/core/v1"
	"os"
	"regexp"
	"strings"
	"sync"
	"text/template"
)

// podListMap holds podLists which have been read in already
var podListMap = make(map[string]*corev1.PodList)
var podCacheMutex = &sync.Mutex{}

var dockerPullRateLimitRe = regexp.MustCompile(`.*You have reached your pull rate limit.*`)
var dockerNameUnknownRe = regexp.MustCompile(`.*name unknown.*`)
var dockerNotFoundRe = regexp.MustCompile(`.*not found.*`)
var dockerServiceUnavailableRe = regexp.MustCompile(`.*Service Unavailable.*`)

// TODO: "Verrazzano Uninstall Pod Issue":    AnalyzeVerrazzanoUninstallIssue,
var podAnalysisFunctions = map[string]func(log *zap.SugaredLogger, directory string, podFile string, pod corev1.Pod, issueReporter *report.IssueReporter) (err error){
	"Pod Container Related Issues":        podContainerIssues,
	"Pod Status Condition Related Issues": podStatusConditionIssues,
}

// AnalyzePodIssues analyzes pod issues. It starts by scanning for problem pod phases
// in the cluster and drills down from there.
func AnalyzePodIssues(log *zap.SugaredLogger, clusterRoot string) (err error) {
	log.Debugf("PodIssues called for %s", clusterRoot)

	// Do a quick scan to find pods.json files which have Pods that are not in a good state
	podFiles, err := findProblematicPodFiles(log, clusterRoot)
	if err != nil {
		return err
	}
	totalFound := 0
	for _, podFile := range podFiles {
		found, err := analyzePods(log, clusterRoot, podFile)
		totalFound += found
		if err != nil {
			log.Errorf("Failed during analyze Pods for cluster: %s, pods: %s: %v", clusterRoot, podFile, err)
			return err
		}
	}

	// If no issues were reported but there were problem pods, the detection needs to be beefed up,
	// so report a general issue (high confidence, low impact)
	if totalFound == 0 && len(podFiles) > 0 {
		reportProblemPodsNoIssues(log, clusterRoot, podFiles)
	}

	return nil
}

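// analyzePods analyzes the pods in a single pods.json file, running each registered pod analysis
// function against the problematic pods found, and returns the number of issues reported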
func analyzePods(log *zap.SugaredLogger, clusterRoot string, podFile string) (reported int, err error) {
	log.Debugf("analyzePods called with %s", podFile)
	podList, err := GetPodList(log, podFile)
	if err != nil {
		log.Debugf("Failed to get the PodList for %s: %v", podFile, err)
		return 0, err
	}
	if podList == nil {
		log.Debugf("No PodList was returned, skipping")
		return 0, nil
	}

	var issueReporter = report.IssueReporter{
		PendingIssues: make(map[string]report.Issue),
	}
	for _, pod := range podList.Items {
		if !IsPodProblematic(pod) {
			continue
		}

		// Call the pod analysis functions
		for functionName, function := range podAnalysisFunctions {
			err := function(log, clusterRoot, podFile, pod, &issueReporter)
			if err != nil {
				// Log the error and continue on
				log.Errorf("Error processing analysis function %s: %v", functionName, err)
			}
		}
	}

	reported = len(issueReporter.PendingIssues)
	issueReporter.Contribute(log, clusterRoot)
	return reported, nil
}

// IsPodReadyOrCompleted returns true only if every container in the Pod is either Ready or has
// Terminated with a "Completed" reason; any Waiting, not-Ready Running, or otherwise Terminated
// container causes it to return false
// TODO: Extend for transition time correlation (ie: change from bool to struct)
func IsPodReadyOrCompleted(podStatus corev1.PodStatus) bool {
	for _, containerStatus := range podStatus.ContainerStatuses {
		state := containerStatus.State
		if state.Terminated != nil && state.Terminated.Reason != "Completed" {
			return false
		}
		if state.Running != nil && !containerStatus.Ready {
			return false
		}
		if state.Waiting != nil {
			return false
		}
	}
	return true
}

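// podContainerIssues checks the containers and init containers of a problematic pod, currently
// drilling down into containers that are in ImagePullBackOff.
//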
// This is evolving as we add more cases in podContainerIssues
//
//	One thing that switching to this drill-down model makes harder to do is track overarching
//	issues that are related. An IssueReporter is being passed along and will consolidate issues
//	of the same KnownIssue type to help with similar issues.
//
//	Note that the current analysis only uses the IssueReporter, but analysis code is free to use
//	the NewKnown* helpers or to form fully custom issues and contribute those directly via the
//	report.Contribute* helpers
func podContainerIssues(log *zap.SugaredLogger, clusterRoot string, podFile string, pod corev1.Pod, issueReporter *report.IssueReporter) (err error) {
	log.Debugf("podContainerIssues analysis called for cluster: %s, ns: %s, pod: %s", clusterRoot, pod.ObjectMeta.Namespace, pod.ObjectMeta.Name)
	podEvents, err := GetEventsRelatedToPod(log, clusterRoot, pod, nil)
	if err != nil {
		log.Debugf("Failed to get events related to ns: %s, pod: %s", pod.ObjectMeta.Namespace, pod.ObjectMeta.Name)
	}
	// TODO: We can get duplicated event drilldown messages if the initContainers and containers are both impacted similarly.
	//       Since we contribute to the IssueReporter, we may be able to handle de-duplication under the covers to allow
	//       discrete analysis to be handled in various ways, though we could also rethink the approach here to reduce the need.
	if len(pod.Status.InitContainerStatuses) > 0 {
		for _, initContainerStatus := range pod.Status.InitContainerStatuses {
			if initContainerStatus.State.Waiting != nil {
				if initContainerStatus.State.Waiting.Reason == "ImagePullBackOff" {
					handleImagePullBackOff(log, clusterRoot, podFile, pod, podEvents, initContainerStatus.Image,
						fmt.Sprintf("Namespace %s, Pod %s, InitContainer %s, Message %s",
							pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, initContainerStatus.Name, initContainerStatus.State.Waiting.Message),
						issueReporter)
				}
			}
		}
	}
	if len(pod.Status.ContainerStatuses) > 0 {
		for _, containerStatus := range pod.Status.ContainerStatuses {
			if containerStatus.State.Waiting != nil {
				if containerStatus.State.Waiting.Reason == "ImagePullBackOff" {
					handleImagePullBackOff(log, clusterRoot, podFile, pod, podEvents, containerStatus.Image,
						fmt.Sprintf("Namespace %s, Pod %s, Container %s, Message %s",
							pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, containerStatus.Name, containerStatus.State.Waiting.Message),
						issueReporter)
				}
			}
		}
	}
	return nil
}

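// handleImagePullBackOff drills into the pod events for an image that is in ImagePullBackOff and
// contributes a more specific known issue (image pull rate limit, service unavailable, or image
// not found) when the event messages allow it, falling back to the general ImagePullBackOff issue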
func handleImagePullBackOff(log *zap.SugaredLogger, clusterRoot string, podFile string, pod corev1.Pod, podEvents []corev1.Event,
	image string, initialMessage string, issueReporter *report.IssueReporter) {
	messages := make(StringSlice, 1)
	messages[0] = initialMessage
	messages.addMessages(drillIntoEventsForImagePullIssue(log, pod, image, podEvents))

	var files []string
	if helpers.GetIsLiveCluster() {
		files = []string{report.GetRelatedPodMessage(pod.ObjectMeta.Name, pod.ObjectMeta.Namespace)}
	} else {
		files = []string{podFile}
	}

	reported := 0
	for _, message := range messages {
		if dockerPullRateLimitRe.MatchString(message) {
			issueReporter.AddKnownIssueMessagesFiles(
				report.ImagePullRateLimit,
				clusterRoot,
				messages,
				files,
			)
			reported++
		} else if dockerServiceUnavailableRe.MatchString(message) {
			issueReporter.AddKnownIssueMessagesFiles(
				report.ImagePullService,
				clusterRoot,
				messages,
				files,
			)
			reported++
		} else if dockerNameUnknownRe.MatchString(message) || dockerNotFoundRe.MatchString(message) {
			issueReporter.AddKnownIssueMessagesFiles(
				report.ImagePullNotFound,
				clusterRoot,
				messages,
				files,
			)
			reported++
		}
	}

	// If we didn't detect more specific issues here, fall back to the general ImagePullBackOff
	if reported == 0 {
		issueReporter.AddKnownIssueMessagesFiles(
			report.ImagePullBackOff,
			clusterRoot,
			messages,
			files,
		)
	}
}

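// registerIssues contributes the InsufficientMemory and InsufficientCPU known issues for any
// matching messages collected from the pod status conditions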
func registerIssues(messages map[string][]string, files []string, clusterRoot string, issueReporter *report.IssueReporter) {
	if len(messages[report.InsufficientMemory]) > 0 {
		issueReporter.AddKnownIssueMessagesFiles(report.InsufficientMemory, clusterRoot, messages[report.InsufficientMemory], files)
	}
	if len(messages[report.InsufficientCPU]) > 0 {
		issueReporter.AddKnownIssueMessagesFiles(report.InsufficientCPU, clusterRoot, messages[report.InsufficientCPU], files)
	}
}

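// podStatusConditionIssues scans the pod status conditions for "Insufficient memory" and
// "Insufficient cpu" messages and reports the corresponding known issues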
func podStatusConditionIssues(log *zap.SugaredLogger, clusterRoot string, podFile string, pod corev1.Pod, issueReporter *report.IssueReporter) (err error) {
	log.Debugf("Memory or CPU Issues called for %s", clusterRoot)

	if len(pod.Status.Conditions) > 0 {
		messages := make(map[string][]string)
		for _, condition := range pod.Status.Conditions {
			if strings.Contains(condition.Message, "Insufficient memory") {
				messages[report.InsufficientMemory] = append(messages[report.InsufficientMemory], fmt.Sprintf("Namespace %s, Pod %s, Status %s, Reason %s, Message %s",
					pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, condition.Status, condition.Reason, condition.Message))
			}
			if strings.Contains(condition.Message, "Insufficient cpu") {
				messages[report.InsufficientCPU] = append(messages[report.InsufficientCPU], fmt.Sprintf("Namespace %s, Pod %s, Status %s, Reason %s, Message %s",
					pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, condition.Status, condition.Reason, condition.Message))
			}
		}
		if len(messages) > 0 {
			files := []string{podFile}
			if helpers.GetIsLiveCluster() {
				files = []string{report.GetRelatedPodMessage(pod.ObjectMeta.Name, pod.ObjectMeta.Namespace)}
			}
			registerIssues(messages, files, clusterRoot, issueReporter)
		}
	}
	return nil
}

// StringSlice is a slice of strings used to accumulate analysis messages
type StringSlice []string

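// addMessages appends newMessages to the slice; if err is non-nil it is returned as-is and no
// messages are appended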
func (messages *StringSlice) addMessages(newMessages []string, err error) (errOut error) {
	if err != nil {
		errOut = err
		return errOut
	}
	if len(newMessages) > 0 {
		*messages = append(*messages, newMessages...)
	}
	return nil
}

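// drillIntoEventsForImagePullIssue scans the events associated with a pod for "Failed" events that
// reference the given image and returns their messages to help narrow down the cause of the pull issue.
//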
// This is WIP. Initially it reports more specific cause info (like auth, timeout, etc...) in the
// messages, but we may want to have different known issue types rather than reporting in messages,
// as the runbooks to look at may be different, so this is evolving...
func drillIntoEventsForImagePullIssue(log *zap.SugaredLogger, pod corev1.Pod, imageName string, eventList []corev1.Event) (messages []string, err error) {
	// This is handed the events that are associated with the Pod whose containers/initContainers had image pull issues.
	// It looks at what events are found; these may glean more info on the specific cause to help narrow it further.
	for _, event := range eventList {
		// TODO: Discern whether the event is relevant or not. We likely will need more info supplied in to correlate
		// whether the event really is related to the issue being drilled into, but we start off just dumping out
		// what is there first. Hoping that this can be a more general drilldown here rather than just specific to ImagePulls,
		// ie: drill into events related to a Pod/container issue.
		// We may want to add in a "Reason" concept here as well. Ie: the issue is Image pull, and we were able to
		// discern more about the reason that happened, so we can return back up a Reason that can be useful in the
		// action handling to focus on the correct steps, instead of having an entirely separate issue type to handle that.
		// (or just have a more specific issue type, but need to think about it as we are setting the basic patterns
		// here in general). Need to think about it though as it will affect how we handle runbooks as well.
		log.Debugf("Drilldown event reason: %s, message: %s\n", event.Reason, event.Message)
		if event.Reason == "Failed" && strings.Contains(event.Message, imageName) {
			// TBD: We need a better mechanism at this level than just the messages. It can add other types of supporting
			// data to contribute as well here (related files, etc...)
			messages = append(messages, event.Message)
		}
	}
	return messages, nil
}

// GetPodList returns the PodList parsed from the JSON file at the given path, caching the result
// so each file is only read and unmarshalled once
func GetPodList(log *zap.SugaredLogger, path string) (podList *corev1.PodList, err error) {
	// Check the cache first
	podList = getPodListIfPresent(path)
	if podList != nil {
		log.Debugf("Returning cached podList for %s", path)
		return podList, nil
	}

	// Not found in the cache, get it from the file
	file, err := os.Open(path)
	if err != nil {
		log.Debugf("file %s not found", path)
		return nil, err
	}
	defer file.Close()

	fileBytes, err := io.ReadAll(file)
	if err != nil {
		log.Debugf("Failed reading Json file %s", path)
		return nil, err
	}
	err = encjson.Unmarshal(fileBytes, &podList)
	if err != nil {
		log.Debugf("Failed to unmarshal podList at %s", path)
		return nil, err
	}
	putPodListIfNotPresent(path, podList)
	return podList, nil
}

// IsPodProblematic returns a boolean indicating whether a pod is deemed problematic or not
func IsPodProblematic(pod corev1.Pod) bool {
	// If it isn't Running or Succeeded it is problematic
	if pod.Status.Phase == corev1.PodRunning ||
		pod.Status.Phase == corev1.PodSucceeded {
		// The Pod indicates it is Running/Succeeded, check if there are containers that are not ready
		return !IsPodReadyOrCompleted(pod.Status)
	}
	return true
}

// IsPodPending returns a boolean indicating whether a pod is pending or not
func IsPodPending(pod corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodPending
}

// findProblematicPodFiles looks at the pods.json files in the cluster and returns the list of files
// that contain pods which are not Running or Succeeded.
func findProblematicPodFiles(log *zap.SugaredLogger, clusterRoot string) (podFiles []string, err error) {
	allPodFiles, err := files.GetMatchingFiles(log, clusterRoot, PodFilesMatchRe)
	if err != nil {
		return podFiles, err
	}

	if len(allPodFiles) == 0 {
		return podFiles, nil
	}
	podFiles = make([]string, 0, len(allPodFiles))
	for _, podFile := range allPodFiles {
		log.Debugf("Looking at pod file for problematic pods: %s", podFile)
		podList, err := GetPodList(log, podFile)
		if err != nil {
			log.Debugf("Failed to get the PodList for %s, skipping: %v", podFile, err)
			continue
		}
		if podList == nil {
			log.Debugf("No PodList was returned, skipping")
			continue
		}

		// If we find any problematic pod we flag the file as having problematic pods and move to the next file;
		// this is just a quick scan to identify which files to drill into
		for _, pod := range podList.Items {
			if !IsPodProblematic(pod) {
				continue
			}
			log.Debugf("Problematic pods detected: %s", podFile)
			podFiles = append(podFiles, podFile)
			break
		}
	}
	return podFiles, nil
}

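// reportProblemPodsNoIssues is called when problematic pods were found but no specific issues were
// reported for them; it gathers pod status, condition, and log information as supporting data and
// contributes a general PendingPods or PodProblemsNotReported issue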
func reportProblemPodsNoIssues(log *zap.SugaredLogger, clusterRoot string, podFiles []string) {
	messages := make([]string, 0, len(podFiles))
	matches := make([]files.TextMatch, 0, len(podFiles))
	problematicNotPending := 0
	pendingPodsSeen := 0
	for _, podFile := range podFiles {
		podList, err := GetPodList(log, podFile)
		if err != nil {
			log.Debugf("Failed to get the PodList for %s: %v", podFile, err)
			continue
		}
		if podList == nil {
			log.Debugf("No PodList was returned, skipping")
			continue
		}
		for _, pod := range podList.Items {
			if !IsPodProblematic(pod) {
				continue
			}
			if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodUnknown {
				problematicNotPending++
				if len(pod.Status.Reason) > 0 || len(pod.Status.Message) > 0 {
					messages = append(messages, fmt.Sprintf("Namespace %s, Pod %s, Phase %s, Reason %s, Message %s",
						pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, pod.Status.Phase, pod.Status.Reason, pod.Status.Message))
				}
			} else if pod.Status.Phase == corev1.PodPending {
				pendingPodsSeen++
			}

			for _, condition := range pod.Status.Conditions {
				message, err := podConditionMessage(pod.Name, pod.Namespace, condition)
				if err != nil {
					log.Debugf("Failed to create pod condition message: %v", err)
					continue
				}
				messages = append(messages, message)
			}
			// TODO: Time correlation for search

			fileName := files.FindPodLogFileName(clusterRoot, pod)
			matched, err := files.SearchFile(log, fileName, WideErrorSearchRe, nil)
			if err != nil {
				log.Debugf("Failed to search the logfile %s for the ns/pod %s/%s: %v",
					fileName, pod.ObjectMeta.Namespace, pod.ObjectMeta.Name, err)
			}
			if len(matched) > 0 {
				for _, m := range matched {
					if helpers.GetIsLiveCluster() {
						m.FileName = report.GetRelatedLogFromPodMessage(fileName)
					}
					matches = append(matches, m)
				}
			}
		}
	}
	messages = checkIfHelmPodsAdded(messages)
	supportingData := make([]report.SupportData, 1)
	supportingData[0] = report.SupportData{
		Messages:    messages,
		TextMatches: matches,
	}
	// If all of the problematic pods were only Pending, report that; otherwise report them as problematic
	// since some are Failed or Unknown
	if len(messages) > 0 {
		if pendingPodsSeen > 0 && problematicNotPending == 0 {
			report.ContributeIssue(log, report.NewKnownIssueSupportingData(report.PendingPods, clusterRoot, supportingData))
		} else {
			report.ContributeIssue(log, report.NewKnownIssueSupportingData(report.PodProblemsNotReported, clusterRoot, supportingData))
		}
	}
}

// checkIfHelmPodsAdded assesses whether there is an issue with the helm and rancher pods in the
// cattle-system namespace. If helm-operation pods have problems but no rancher pod problems were
// seen (rancher is working), the messages related to the helm-operation pods are removed from the
// message slice, and the modified slice is returned.
func checkIfHelmPodsAdded(messages []string) []string {
	isHelm := false
	isRancher := false
	var indices []int
	for i, data := range messages {
		if strings.Contains(data, "Namespace cattle-system, Pod helm-operation") {
			isHelm = true
			indices = append(indices, i)
		}
		if strings.Contains(data, "Namespace cattle-system, Pod rancher") {
			isRancher = true
		}
	}
	if isHelm && !isRancher {
		for i, index := range indices {
			messages = append(messages[:(index-i)], messages[(index+1-i):]...)
		}
	}
	return messages
}
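
// getPodListIfPresent returns the cached PodList for the given path, or nil if it has not been cached yet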
func getPodListIfPresent(path string) (podList *corev1.PodList) {
	podCacheMutex.Lock()
	podListTest := podListMap[path]
	if podListTest != nil {
		podList = podListTest
	}
	podCacheMutex.Unlock()
	return podList
}

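// putPodListIfNotPresent caches the PodList for the given path unless an entry is already present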
func putPodListIfNotPresent(path string, podList *corev1.PodList) {
	podCacheMutex.Lock()
	podListInMap := podListMap[path]
	if podListInMap == nil {
		podListMap[path] = podList
	}
	podCacheMutex.Unlock()
}

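// podConditionMessageTmpl is the template used to render a single pod status condition into a
// human-readable message; empty fields are omitted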
const podConditionMessageTmpl = "Namespace {{.namespace}}, Pod {{.name}}{{- if .type }}, ConditionType {{.type}}{{- end}}{{- if .status}}, Status {{.status}}{{- end}}{{- if .reason}}, Reason {{.reason}}{{- end }}{{- if .message}}, Message {{.message}}{{- end }}"

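// podConditionMessage renders the given pod condition into a message string using
// podConditionMessageTmpl, omitting any fields that are empty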
func podConditionMessage(name, namespace string, condition corev1.PodCondition) (string, error) {
	vals := map[string]interface{}{}
	addKV := func(k, v string) {
		if v != "" {
			vals[k] = v
		}
	}
	addKV("message", condition.Message)
	addKV("reason", condition.Reason)
	addKV("status", string(condition.Status))
	addKV("type", string(condition.Type))
	vals["name"] = name
	vals["namespace"] = namespace
	tmpl, err := template.New("podConditionMessage").
		Parse(podConditionMessageTmpl)
	if err != nil {
		return "", err
	}

	buffer := &bytes.Buffer{}
	if err = tmpl.Execute(buffer, vals); err != nil {
		return "", err
	}
	return buffer.String(), nil
}