github.com/verrazzano/verrazzano@v1.7.0/tools/vz/pkg/analysis/internal/util/report/issue.go (about)

     1  // Copyright (c) 2021, 2023, Oracle and/or its affiliates.
     2  // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.
     3  
     4  // Package report handles reporting
     5  package report
     6  
     7  import (
     8  	"errors"
     9  	"fmt"
    10  	"strings"
    11  
    12  	"github.com/verrazzano/verrazzano/tools/vz/pkg/analysis/internal/util/files"
    13  	"go.uber.org/zap"
    14  )
    15  
    16  // NOTE: This is part of the contract with the analyzers however it is currently an initial stake in the ground and
    17  //		 will be evolving rapidly initially as we add analysis cases
    18  
    19  // An issue describes a specific problem that has been found and includes information such as
    20  //     A Summary of the issue
    21  //     A list of Actions which can be taken
    22  //         - Actions are reported in the order specified in this list (so actions more likely to mitigate an issue
    23  //         should be specified first).
    24  //         - Each action may have Steps to take and/or give a list of runbook Links
    25  //     A list of supporting data (TBD)
    26  //         - Source which helped identify the issue
    27  //         - Indicators that identified the issue (search matches, json elements)
    28  //         - etc...
    29  //     A Confidence level (TBD)
    30  //         This is an indication of how confident the analysis is that the issue is really causing
    31  //         problems. The analysis will attempt to weed out things that are not causing an issue and will
    32  //         not report them if it is certain. However there may be situations where something that is found
    33  //         could be causing problems but it is not certain.
    34  
// JSONPath pairs a JSON file with a path inside that file. It is used in SupportData
// to reference JSON elements that support the identification of an issue.
type JSONPath struct {
	File string // Name of the JSON file
	Path string // JSON path within File
}
    40  
// SupportData is data which helps a user to further identify an issue. Every field is optional.
// TODO: Shake this out more as we add more types, see what we really end up needing here
type SupportData struct {
	Messages     []string          // Optional, messages and/or descriptions of the supporting data
	RelatedFiles []string          // Optional, if present provides a list of related files that support the issue identification
	TextMatches  []files.TextMatch // Optional, if present provides search results that support the issue identification
	JSONPaths    []JSONPath        // Optional, if present provides a list of JSON paths that support the issue identification
}
    48  
// Issue holds the information about an issue, supporting data, and actions.
// Validate enforces the fields marked Required below.
type Issue struct {
	Type          string   // Required, this identifies the type of issue. This is either a Known Issue type, or a custom type name
	Source        string   // Required, this is the source of the analysis. It may be the root of the cluster analyzed (ie: there can be multiple)
	Informational bool     // Defaults to false, set to true if this is not an issue but an informational note (TBD: may separate these)
	Summary       string   // Required, there must be a Summary of the issue included
	Actions       []Action // Optional, if Actions are known these are included. Actions will be reported in the order specified

	SupportingData []SupportData // Optional but highly desirable for issues when possible. Data that helps support issue identification
	Confidence     int           // Required if not informational, 0-10. Validate rejects values outside this range
	Impact         int           // Optional 0-10 (TBD: rough estimate of how broad the impact is, 0 low, 10 high). NOTE(review): nothing in this file sets a -1 "unknown" default; an unset Impact is 0
}
    61  
    62  // Validate validates an issue. A zeroed Issue is not valid, there is some amount of information that must be specified for the Issue to
    63  // be useful. Currently the report will validate that the issues contributed are valid at the point where they are
    64  // being contributed.
    65  func (issue *Issue) Validate(log *zap.SugaredLogger, mapSource string) (err error) {
    66  	if len(issue.Type) == 0 {
    67  		return errors.New("A Type is required for an Issue")
    68  	}
    69  	if len(issue.Source) == 0 {
    70  		return errors.New("A Source is required for an Issue")
    71  	}
    72  	// If there was a map source supplied, this means we are additionally checking that the source key
    73  	// for the map matches the issue source as well (ie: when handed a map/slice of issues and a source
    74  	// key, we check these here). If there is no mapSource supplied it just means the issue Source is used for
    75  	// map insertions.
    76  	if len(mapSource) != 0 && issue.Source != mapSource {
    77  		return fmt.Errorf("The issue source %s doesn't match the map source supplied %s", issue.Source, mapSource)
    78  	}
    79  	if len(issue.Summary) == 0 {
    80  		return errors.New("A Summary is required for an Issue")
    81  	}
    82  	if len(issue.Actions) > 0 {
    83  		for _, action := range issue.Actions {
    84  			err = action.Validate(log)
    85  			if err != nil {
    86  				log.Debugf("Action related to issue %s was invalid", issue.Summary, err)
    87  				return err
    88  			}
    89  		}
    90  	}
    91  	if issue.Confidence < 0 || issue.Confidence > 10 {
    92  		log.Debugf("Confidence %d is out of range, related to issue %s", issue.Confidence, issue.Summary)
    93  		return fmt.Errorf("Confidence %d is out of range, related to issue %s", issue.Confidence, issue.Summary)
    94  	}
    95  	return nil
    96  }
    97  
// Known Issue Types. The value of each constant is the same text as its name. These are the
// keys of the knownIssues template map, and are the issue types accepted by the known issue helpers.
const (
	ImagePullBackOff                                 = "ImagePullBackOff"
	ImagePullRateLimit                               = "ImagePullRateLimit"
	ImagePullNotFound                                = "ImagePullNotFound"
	ImagePullService                                 = "ImagePullService"
	InsufficientMemory                               = "InsufficientMemory"
	InsufficientCPU                                  = "InsufficientCPU"
	IngressInstallFailure                            = "IngressInstallFailure"
	IngressLBLimitExceeded                           = "IngressLBLimitExceeded"
	IngressNoLoadBalancerIP                          = "IngressNoLoadBalancerIP"
	IngressOciIPLimitExceeded                        = "IngressOciIPLimitExceeded"
	InstallFailure                                   = "InstallFailure"
	PendingPods                                      = "PendingPods"
	PodProblemsNotReported                           = "PodProblemsNotReported"
	ComponentsNotReady                               = "ComponentsNotReady"
	IngressNoIPFound                                 = "IngressNoIPFound"
	IstioIngressNoIP                                 = "IstioIngressNoIP"
	IngressShapeInvalid                              = "IngressShapeInvalid"
	IstioIngressPrivateSubnet                        = "IstioIngressPrivateSubnet"
	NginxIngressPrivateSubnet                        = "NginxIngressPrivateSubnet"
	ExternalDNSConfigureIssue                        = "ExternalDNSConfigureIssue"
	KeycloakDataMigrationFailure                     = "KeycloakDataMigrationFailure"
	RancherIssues                                    = "RancherIssues"
	VZClientHangingIssueDueToLongCertificateApproval = "VZClientHangingIssueDueToLongCertificateApproval"
	CertificateExpired                               = "CertificateExpired"
	CertificateExperiencingIssuesInCluster           = "CertificateExperiencingIssuesInCluster"
	ClusterAPIClusterIssues                          = "ClusterAPIClusterIssues"
	CaCrtExpiredInCluster                            = "CaCrtExpiredInCluster"
)
   128  
   129  // NOTE: How we are handling the issues/actions/reporting is still very much evolving here. Currently supplying some
   130  // helpers to reduce boilerplate when creating/reporting issues with common cases.
   131  
// Known Issue Templates. While analyzers are free to roll their own custom Issues, the preference for well-known issues is to capture them
// here so they are more generally available.
//
// Each entry is keyed by a Known Issue Type constant and supplies the Type, Summary, Informational flag, Impact,
// Confidence and, for most entries, the Actions for that issue. Source and SupportingData are not set here; the
// NewKnownIssue* helpers fill them in on a copy of the template.
//
// NOTE(review): the ComponentsNotReady entry uses Type InstallFailure and the InstallFailure action rather than
// its own key; confirm this is intentional. The VZClientHangingIssueDueToLongCertificateApproval Summary starts
// with a space, which looks accidental.
var knownIssues = map[string]Issue{
	ImagePullBackOff:             {Type: ImagePullBackOff, Summary: "Failure(s) pulling images have been detected, however a specific root cause was not identified", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ImagePullBackOff]}},
	ImagePullRateLimit:           {Type: ImagePullRateLimit, Summary: "Failure(s) pulling images have been detected due to an image pull rate limit", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ImagePullRateLimit]}},
	ImagePullNotFound:            {Type: ImagePullNotFound, Summary: "Failure(s) pulling images have been detected due to the image not being found", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ImagePullNotFound]}},
	ImagePullService:             {Type: ImagePullService, Summary: "Failure(s) pulling images have been detected due to the service not being available, the service may be unreachable or may be incorrectly specified", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ImagePullService]}},
	InsufficientMemory:           {Type: InsufficientMemory, Summary: "Failure(s) due to insufficient memory on nodes have been detected", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[InsufficientMemory]}},
	InsufficientCPU:              {Type: InsufficientCPU, Summary: "Failure(s) due to insufficient CPU on nodes have been detected", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[InsufficientCPU]}},
	IngressInstallFailure:        {Type: IngressInstallFailure, Summary: "Verrazzano install failed while installing the NGINX Ingress Controller, however a specific root cause was not identified", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressInstallFailure]}},
	IngressLBLimitExceeded:       {Type: IngressLBLimitExceeded, Summary: "Verrazzano install failed while installing the NGINX Ingress Controller, the root cause appears to be that the load balancer service limit has been reached", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressLBLimitExceeded]}},
	IngressNoLoadBalancerIP:      {Type: IngressNoLoadBalancerIP, Summary: "Verrazzano install failed while installing the NGINX Ingress Controller, the root cause appears to be the LoadBalancer is not there or is unable to set the ingress IP address on the NGINX Ingress service", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressNoLoadBalancerIP]}},
	IngressOciIPLimitExceeded:    {Type: IngressOciIPLimitExceeded, Summary: "Verrazzano install failed while installing the NGINX Ingress Controller, the root cause appears to be an OCI IP non-ephemeral address limit has been reached", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressOciIPLimitExceeded]}},
	InstallFailure:               {Type: InstallFailure, Summary: "Verrazzano install failed, however a specific root cause was not identified", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[InstallFailure]}},
	PendingPods:                  {Type: PendingPods, Summary: "Pods in a Pending state were detected. These may come up normally or there may be specific issues preventing them from coming up", Informational: true, Impact: 0, Confidence: 1, Actions: []Action{KnownActions[PendingPods]}},
	PodProblemsNotReported:       {Type: PodProblemsNotReported, Summary: "Problem pods were detected, however a specific root cause was not identified", Informational: true, Impact: 0, Confidence: 10, Actions: []Action{KnownActions[PodProblemsNotReported]}},
	ComponentsNotReady:           {Type: InstallFailure, Summary: "Verrazzano install failed, one or more components did not reach Ready state", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[InstallFailure]}},
	IngressNoIPFound:             {Type: IngressNoIPFound, Summary: "Verrazzano install failed as no IP found for service ingress-controller-ingress-nginx-controller with type LoadBalancer", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressNoIPFound]}},
	IstioIngressNoIP:             {Type: IstioIngressNoIP, Summary: "Verrazzano install failed as no IP found for service istio-ingressgateway with type LoadBalancer", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IstioIngressNoIP]}},
	IngressShapeInvalid:          {Type: IngressShapeInvalid, Summary: "Verrazzano install failed as the shape provided for NGINX Ingress Controller is invalid", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressShapeInvalid]}},
	IstioIngressPrivateSubnet:    {Type: IstioIngressPrivateSubnet, Summary: "Failed to create LoadBalancer for Istio Ingress Gateway", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IstioIngressPrivateSubnet]}},
	NginxIngressPrivateSubnet:    {Type: NginxIngressPrivateSubnet, Summary: "Failed to create LoadBalancer for Nginx Ingress Controller", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[NginxIngressPrivateSubnet]}},
	ExternalDNSConfigureIssue:    {Type: ExternalDNSConfigureIssue, Summary: "Failed to setup DNS configuration", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ExternalDNSConfigureIssue]}},
	KeycloakDataMigrationFailure: {Type: KeycloakDataMigrationFailure, Summary: "Failure(s) migrating Keycloak data during MySQL upgrade", Informational: true, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[KeycloakDataMigrationFailure]}},
	RancherIssues:                {Type: RancherIssues, Summary: "Rancher resources are not in the expected state", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[RancherIssues]}},
	VZClientHangingIssueDueToLongCertificateApproval: {Type: VZClientHangingIssueDueToLongCertificateApproval, Summary: " Verrazzano Client is hanging due to the long time that it takes to approve and provision certificates", Informational: true, Impact: 10, Confidence: 10},
	CertificateExpired:                     {Type: CertificateExpired, Summary: "A certificate in the cluster is currently expired", Informational: true, Impact: 10, Confidence: 10},
	CertificateExperiencingIssuesInCluster: {Type: CertificateExperiencingIssuesInCluster, Summary: "A certificate in the cluster is experiencing issues, but it is not expired", Informational: true, Impact: 10, Confidence: 10},
	ClusterAPIClusterIssues:                {Type: ClusterAPIClusterIssues, Summary: "Cluster API cluster resources are not in the expected state", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ClusterAPIClusterIssues]}},
	CaCrtExpiredInCluster:                  {Type: CaCrtExpiredInCluster, Summary: "A ca.crt value in the cluster is expired", Informational: true, Impact: 10, Confidence: 10},
}
   163  
   164  // NewKnownIssueSupportingData adds a known issue
   165  func NewKnownIssueSupportingData(issueType string, source string, supportingData []SupportData) (issue Issue) {
   166  	issue = getKnownIssueOrDie(issueType)
   167  	issue.Source = source
   168  	issue.SupportingData = supportingData
   169  	return issue
   170  }
   171  
   172  // NewKnownIssueMessagesFiles adds a known issue
   173  func NewKnownIssueMessagesFiles(issueType string, source string, messages []string, fileNames []string) (issue Issue) {
   174  	issue = getKnownIssueOrDie(issueType)
   175  	issue.Source = source
   176  	issue.SupportingData = make([]SupportData, 1)
   177  	issue.SupportingData[0] = SupportData{
   178  		Messages:     messages,
   179  		RelatedFiles: fileNames,
   180  	}
   181  	return issue
   182  }
   183  
   184  // NewKnownIssueMessagesMatches adds a known issue
   185  func NewKnownIssueMessagesMatches(issueType string, source string, messages []string, matches []files.TextMatch) (issue Issue) {
   186  	issue = getKnownIssueOrDie(issueType)
   187  	issue.Source = source
   188  	issue.SupportingData = make([]SupportData, 1)
   189  	issue.SupportingData[0] = SupportData{
   190  		Messages:    messages,
   191  		TextMatches: matches,
   192  	}
   193  	return issue
   194  }
   195  
// IssueReporter is a helper for consolidating known issues before contributing them to the report.
// An analyzer is free to use the IssueReporter AddKnown* helpers for known issues, however it
// is not required to do so and is free to form fully custom issues and contribute
// those directly using the report.Contribute* helpers. This allows analyzers flexibility, but the goal
// here is that the IssueReporter can evolve to support all of the cases if possible.
type IssueReporter struct {
	// PendingIssues holds the issues added so far which have not been contributed yet, keyed by issue type.
	PendingIssues map[string]Issue
}
   204  
   205  // AddKnownIssueSupportingData adds a known issue
   206  func (issueReporter *IssueReporter) AddKnownIssueSupportingData(issueType string, source string, supportingData []SupportData) {
   207  	confirmKnownIssueOrDie(issueType)
   208  
   209  	// If this is a new issue, get a new one
   210  	if issue, ok := issueReporter.PendingIssues[issueType]; !ok {
   211  		issueReporter.PendingIssues[issueType] = NewKnownIssueSupportingData(issueType, source, supportingData)
   212  	} else {
   213  		issue.SupportingData = append(issue.SupportingData, supportingData...)
   214  		issueReporter.PendingIssues[issueType] = issue
   215  	}
   216  }
   217  
   218  // AddKnownIssueMessagesFiles adds a known issue
   219  func (issueReporter *IssueReporter) AddKnownIssueMessagesFiles(issueType string, source string, messages []string, fileNames []string) {
   220  	confirmKnownIssueOrDie(issueType)
   221  
   222  	// If this is a new issue, get a new one
   223  	if issue, ok := issueReporter.PendingIssues[issueType]; !ok {
   224  		issueReporter.PendingIssues[issueType] = NewKnownIssueMessagesFiles(issueType, source, messages, fileNames)
   225  	} else {
   226  		supportData := SupportData{
   227  			Messages:     messages,
   228  			RelatedFiles: fileNames,
   229  		}
   230  		issue.SupportingData = append(issue.SupportingData, supportData)
   231  		issueReporter.PendingIssues[issueType] = issue
   232  	}
   233  }
   234  
   235  // AddKnownIssueMessagesMatches adds a known issue
   236  func (issueReporter *IssueReporter) AddKnownIssueMessagesMatches(issueType string, source string, messages []string, matches []files.TextMatch) {
   237  	confirmKnownIssueOrDie(issueType)
   238  
   239  	// If this is a new issue, get a new one
   240  	if issue, ok := issueReporter.PendingIssues[issueType]; !ok {
   241  		issueReporter.PendingIssues[issueType] = NewKnownIssueMessagesMatches(issueType, source, messages, matches)
   242  	} else {
   243  		supportData := SupportData{
   244  			Messages:    messages,
   245  			TextMatches: matches,
   246  		}
   247  		issue.SupportingData = append(issue.SupportingData, supportData)
   248  		issueReporter.PendingIssues[issueType] = issue
   249  	}
   250  }
   251  
   252  // DeduplicateSupportingData
   253  func DeduplicateSupportingData(dataIn []SupportData) (dataOut []SupportData) {
   254  	// First deduplicate each individual SupportData element, get a minimal set of file and messages at least in
   255  	// each one.
   256  	dataOut = make([]SupportData, len(dataIn))
   257  	for index, supportData := range dataIn {
   258  		dataOut[index] = deduplicateSupportData(supportData)
   259  	}
   260  	// TODO: Next deduplicate the SupportData entries that match exactly
   261  
   262  	return dataIn
   263  }
   264  
   265  // deduplicateSupportData will deduplicate values within a single SupportData
   266  func deduplicateSupportData(dataIn SupportData) (dataOut SupportData) {
   267  	dataOut.RelatedFiles = deduplicateStringSlice(dataIn.RelatedFiles)
   268  	dataOut.Messages = deduplicateStringSlice(dataIn.Messages)
   269  	// TODO: deduplicate
   270  	dataOut.JSONPaths = dataIn.JSONPaths
   271  	dataOut.TextMatches = dataIn.TextMatches
   272  	return dataOut
   273  }
   274  
   275  func deduplicateStringSlice(sliceIn []string) (sliceOut []string) {
   276  	if len(sliceIn) <= 1 {
   277  		copy(sliceOut, sliceIn)
   278  	} else {
   279  		tempMap := make(map[string]int)
   280  		for _, value := range sliceIn {
   281  			_, ok := tempMap[value]
   282  			if !ok {
   283  				tempMap[value] = 0
   284  			}
   285  		}
   286  		sliceOut = make([]string, len(tempMap))
   287  		index := 0
   288  		for key := range tempMap {
   289  			sliceOut[index] = key
   290  			index++
   291  		}
   292  	}
   293  	return sliceOut
   294  }
   295  
   296  // The helpers that work with known issue types only support working with those types
   297  // If code is supplying an issueType that is not known, that is a coding error and we
   298  // panic so that is clear immediately to the developer.
   299  func getKnownIssueOrDie(issueType string) (issue Issue) {
   300  	issue, ok := knownIssues[issueType]
   301  	if !ok {
   302  		panic("This helper is used with known issue types only")
   303  	}
   304  	return issue
   305  }
   306  
   307  func confirmKnownIssueOrDie(issueType string) {
   308  	_, ok := knownIssues[issueType]
   309  	if !ok {
   310  		panic("This helper is used with known issue types only")
   311  	}
   312  }
   313  
   314  // Contribute will contribute issues which have been added to the issue reporter
   315  func (issueReporter *IssueReporter) Contribute(log *zap.SugaredLogger, source string) {
   316  	if len(issueReporter.PendingIssues) == 0 {
   317  		return
   318  	}
   319  	// Contribute the issues all at once
   320  	ContributeIssuesMap(log, source, issueReporter.PendingIssues)
   321  	issueReporter.PendingIssues = make(map[string]Issue)
   322  }
   323  
   324  // SingleMessage is a helper which is useful when adding a single message to supporting data
   325  func SingleMessage(message string) (messages []string) {
   326  	messages = make([]string, 1)
   327  	messages[0] = message
   328  	return messages
   329  }
   330  
   331  // GetRelatedPodMessage returns the message for an issue in pod, used for setting supporting data
   332  func GetRelatedPodMessage(pod, ns string) string {
   333  	return "Pod \"" + pod + "\" in namespace \"" + ns + "\""
   334  }
   335  
   336  // GetRelatedServiceMessage returns the message for an issue in a service, used for setting supporting data
   337  func GetRelatedServiceMessage(service, ns string) string {
   338  	return "Service \"" + service + "\" in namespace \"" + ns + "\""
   339  }
   340  
   341  // GetRelatedLogFromPodMessage returns the message to indicate the issue in the pod log, in a given namespace
   342  func GetRelatedLogFromPodMessage(podLog string) string {
   343  	splitStr := strings.Split(podLog, "/")
   344  	pod := splitStr[len(splitStr)-2]
   345  	ns := splitStr[len(splitStr)-3]
   346  	return "Log from pod \"" + pod + "\" in namespace \"" + ns + "\""
   347  }
   348  
   349  // GetRelatedEventMessage returns the message for an event, used for setting supporting data
   350  func GetRelatedEventMessage(ns string) string {
   351  	return "Event(s) in namespace \"" + ns + "\""
   352  }
   353  
   354  // GetRelatedVZResourceMessage returns the message for Verrazzano resource, used for setting supporting data
   355  func GetRelatedVZResourceMessage() string {
   356  	return "Verrazzano custom resource"
   357  }