github.com/verrazzano/verrazzano@v1.7.1/tools/vz/pkg/internal/util/report/issue.go (about) 1 // Copyright (c) 2021, 2024, Oracle and/or its affiliates. 2 // Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl. 3 4 // Package report handles reporting 5 package report 6 7 import ( 8 "errors" 9 "fmt" 10 "strings" 11 12 "github.com/verrazzano/verrazzano/tools/vz/pkg/internal/util/files" 13 "go.uber.org/zap" 14 ) 15 16 // NOTE: This is part of the contract with the analyzers however it is currently an initial stake in the ground and 17 // will be evolving rapidly initially as we add analysis cases 18 19 // An issue describes a specific problem that has been found and includes information such as 20 // A Summary of the issue 21 // A list of Actions which can be taken 22 // - Actions are reported in the order specified in this list (so actions more likely to mitigate an issue 23 // should be specified first). 24 // - Each action may have Steps to take and/or give a list of runbook Links 25 // A list of supporting data (TBD) 26 // - Source which helped identify the issue 27 // - Indicators that identified the issue (search matches, json elements) 28 // - etc... 29 // A Confidence level (TBD) 30 // This is an indication of how confident the analysis is that the issue is really causing 31 // problems. The analysis will attempt to weed out things that are not causing an issue and will 32 // not report them if it is certain. However there may be situations where something that is found 33 // could be causing problems but it is not certain. 
34 35 // JSONPath is a JSON path 36 type JSONPath struct { 37 File string // Json filename 38 Path string // Json Path 39 } 40 41 // SupportData is data which helps a user to further identify an issue TODO: Shake this out more as we add more types, see what we really end up needing here 42 type SupportData struct { 43 Messages []string // Optional, Messages and/or descriptions the supporting data 44 RelatedFiles []string // Optional, if present provides a list of related files that support the issue identification 45 TextMatches []files.TextMatch // Optional, if present provides search results that support the issue identification 46 JSONPaths []JSONPath // Optional, if present provides a list of Json paths that support the issue identification 47 } 48 49 // Issue holds the information about an issue, supporting data, and actions 50 type Issue struct { 51 Type string // Required, This identifies the type of issue. This is either a Known Issue type, or a custom type name 52 Source string // Required, This is the source of the analysis, It may be the root of the cluster analyzed (ie: there can be multiple) 53 Informational bool // Defaults to false, if this is not an issue but an Informational note (TBD: may separate these) 54 Summary string // Required, there must be a Summary of the issue included 55 Actions []Action // Optional, if Actions are known these are included. Actions will be reported in the order specified 56 57 SupportingData []SupportData // Optional but highly desirable for issues when possible. Data that helps support issue identification 58 Confidence int // Required if not informational 0-10 () 59 Impact int // Optional 0-10 (TBD: This is a swag at how broad the impact is, 0 low, 10 high, defaults to -1 unknown) 60 } 61 62 // Validate validates an issue. A zeroed Issue is not valid, there is some amount of information that must be specified for the Issue to 63 // be useful. 
Currently the report will validate that the issues contributed are valid at the point where they are 64 // being contributed. 65 func (issue *Issue) Validate(log *zap.SugaredLogger, mapSource string) (err error) { 66 if len(issue.Type) == 0 { 67 return errors.New("A Type is required for an Issue") 68 } 69 if len(issue.Source) == 0 { 70 return errors.New("A Source is required for an Issue") 71 } 72 // If there was a map source supplied, this means we are additionally checking that the source key 73 // for the map matches the issue source as well (ie: when handed a map/slice of issues and a source 74 // key, we check these here). If there is no mapSource supplied it just means the issue Source is used for 75 // map insertions. 76 if len(mapSource) != 0 && issue.Source != mapSource { 77 return fmt.Errorf("The issue source %s doesn't match the map source supplied %s", issue.Source, mapSource) 78 } 79 if len(issue.Summary) == 0 { 80 return errors.New("A Summary is required for an Issue") 81 } 82 if len(issue.Actions) > 0 { 83 for _, action := range issue.Actions { 84 err = action.Validate(log) 85 if err != nil { 86 log.Debugf("Action related to issue %s was invalid", issue.Summary, err) 87 return err 88 } 89 } 90 } 91 if issue.Confidence < 0 || issue.Confidence > 10 { 92 log.Debugf("Confidence %d is out of range, related to issue %s", issue.Confidence, issue.Summary) 93 return fmt.Errorf("Confidence %d is out of range, related to issue %s", issue.Confidence, issue.Summary) 94 } 95 return nil 96 } 97 98 // Known Issue Types. 
const (
	// Image pull failures
	ImagePullBackOff   = "ImagePullBackOff"
	ImagePullRateLimit = "ImagePullRateLimit"
	ImagePullNotFound  = "ImagePullNotFound"
	ImagePullService   = "ImagePullService"

	// Node and storage capacity
	InsufficientMemory        = "InsufficientMemory"
	InsufficientCPU           = "InsufficientCPU"
	BlockStorageLimitExceeded = "BlockStorageLimitExceeded"

	// Ingress, load balancer and DNS
	IngressInstallFailure     = "IngressInstallFailure"
	IngressLBLimitExceeded    = "IngressLBLimitExceeded"
	IngressNoLoadBalancerIP   = "IngressNoLoadBalancerIP"
	IngressOciIPLimitExceeded = "IngressOciIPLimitExceeded"
	IngressNoIPFound          = "IngressNoIPFound"
	IstioIngressNoIP          = "IstioIngressNoIP"
	IngressShapeInvalid       = "IngressShapeInvalid"
	IstioIngressPrivateSubnet = "IstioIngressPrivateSubnet"
	NginxIngressPrivateSubnet = "NginxIngressPrivateSubnet"
	ExternalDNSConfigureIssue = "ExternalDNSConfigureIssue"

	// Install and component state
	InstallFailure               = "InstallFailure"
	ComponentsNotReady           = "ComponentsNotReady"
	ComponentsUnavailable        = "ComponentsUnavailable"
	KeycloakDataMigrationFailure = "KeycloakDataMigrationFailure"
	RancherIssues                = "RancherIssues"
	ClusterAPIClusterIssues      = "ClusterAPIClusterIssues"
	TCPKeepIdleIssues            = "TCPKeepIdleIssues"

	// Pod state
	PendingPods                = "PendingPods"
	PodProblemsNotReported     = "PodProblemsNotReported"
	PodHangingOnDeletion       = "PodHangingOnDeletion"
	PodWaitingOnReadinessGates = "PodWaitingOnReadiness" // NOTE(review): value omits the "Gates" suffix of the identifier; kept as-is

	// Certificates
	VZClientHangingIssueDueToLongCertificateApproval = "VZClientHangingIssueDueToLongCertificateApproval"
	CertificateExpired                               = "CertificateExpired"
	CertificateExperiencingIssuesInCluster           = "CertificateExperiencingIssuesInCluster"
	CaCrtExpiredInCluster                            = "CaCrtExpiredInCluster"

	// Resources stuck terminating
	NamespaceCurrentlyInTerminatingStateForLongDuration             = "NamespaceCurrentlyInTerminatingStateForLongDuration"
	InnoDBClusterResourceCurrentlyInTerminatingStateForLongDuration = "InnoDBClusterResourceCurrentlyInTerminatingStateForLongDuration"
)

// NOTE: How we are handling the
issues/actions/reporting is still very much evolving here. Currently supplying some 137 // helpers to reduce boilerplate when creating/reporting issues with common cases. 138 139 // Known Issue Templates. While analyzers are free to roll their own custom Issues, the preference for well-known issues is to capture them 140 // here so they are more generally available. 141 var knownIssues = map[string]Issue{ 142 ImagePullBackOff: {Type: ImagePullBackOff, Summary: "Failure(s) pulling images have been detected, however a specific root cause was not identified", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ImagePullBackOff]}}, 143 ImagePullRateLimit: {Type: ImagePullRateLimit, Summary: "Failure(s) pulling images have been detected due to an image pull rate limit", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ImagePullRateLimit]}}, 144 ImagePullNotFound: {Type: ImagePullNotFound, Summary: "Failure(s) pulling images have been detected due to the image not being found", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ImagePullNotFound]}}, 145 ImagePullService: {Type: ImagePullService, Summary: "Failure(s) pulling images have been detected due to the service not being available, the service may be unreachable or may be incorrectly specified", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ImagePullService]}}, 146 InsufficientMemory: {Type: InsufficientMemory, Summary: "Failure(s) due to insufficient memory on nodes have been detected", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[InsufficientMemory]}}, 147 InsufficientCPU: {Type: InsufficientCPU, Summary: "Failure(s) due to insufficient CPU on nodes have been detected", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[InsufficientCPU]}}, 148 IngressInstallFailure: {Type: IngressInstallFailure, Summary: "Verrazzano install 
failed while installing the NGINX Ingress Controller, however a specific root cause was not identified", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressInstallFailure]}}, 149 IngressLBLimitExceeded: {Type: IngressLBLimitExceeded, Summary: "Verrazzano install failed while installing the NGINX Ingress Controller, the root cause appears to be that the load balancer service limit has been reached", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressLBLimitExceeded]}}, 150 IngressNoLoadBalancerIP: {Type: IngressNoLoadBalancerIP, Summary: "Verrazzano install failed while installing the NGINX Ingress Controller, the root cause appears to be the LoadBalancer is not there or is unable to set the ingress IP address on the NGINX Ingress service", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressNoLoadBalancerIP]}}, 151 IngressOciIPLimitExceeded: {Type: IngressOciIPLimitExceeded, Summary: "Verrazzano install failed while installing the NGINX Ingress Controller, the root cause appears to be an OCI IP non-ephemeral address limit has been reached", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressOciIPLimitExceeded]}}, 152 InstallFailure: {Type: InstallFailure, Summary: "Verrazzano install failed, however a specific root cause was not identified", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[InstallFailure]}}, 153 PendingPods: {Type: PendingPods, Summary: "Pods in a Pending state were detected. 
These may come up normally or there may be specific issues preventing them from coming up", Informational: true, Impact: 0, Confidence: 1, Actions: []Action{KnownActions[PendingPods]}}, 154 PodProblemsNotReported: {Type: PodProblemsNotReported, Summary: "Problem pods were detected, however a specific root cause was not identified", Informational: true, Impact: 0, Confidence: 10, Actions: []Action{KnownActions[PodProblemsNotReported]}}, 155 ComponentsNotReady: {Type: InstallFailure, Summary: "Verrazzano install failed, one or more components did not reach Ready state", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[InstallFailure]}}, 156 ComponentsUnavailable: {Type: ComponentsUnavailable, Summary: "One or more components reached Ready state, but is unavailable", Informational: false, Impact: 0, Confidence: 10}, 157 IngressNoIPFound: {Type: IngressNoIPFound, Summary: "Verrazzano install failed as no IP found for service ingress-controller-ingress-nginx-controller with type LoadBalancer", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressNoIPFound]}}, 158 IstioIngressNoIP: {Type: IstioIngressNoIP, Summary: "Verrazzano install failed as no IP found for service istio-ingressgateway with type LoadBalancer", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IstioIngressNoIP]}}, 159 IngressShapeInvalid: {Type: IngressShapeInvalid, Summary: "Verrazzano install failed as the shape provided for NGINX Ingress Controller is invalid", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IngressShapeInvalid]}}, 160 IstioIngressPrivateSubnet: {Type: IstioIngressPrivateSubnet, Summary: "Failed to create LoadBalancer for Istio Ingress Gateway", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[IstioIngressPrivateSubnet]}}, 161 NginxIngressPrivateSubnet: {Type: NginxIngressPrivateSubnet, Summary: "Failed to create 
LoadBalancer for Nginx Ingress Controller", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[NginxIngressPrivateSubnet]}}, 162 ExternalDNSConfigureIssue: {Type: ExternalDNSConfigureIssue, Summary: "Failed to setup DNS configuration", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ExternalDNSConfigureIssue]}}, 163 KeycloakDataMigrationFailure: {Type: KeycloakDataMigrationFailure, Summary: "Failure(s) migrating Keycloak data during MySQL upgrade", Informational: true, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[KeycloakDataMigrationFailure]}}, 164 RancherIssues: {Type: RancherIssues, Summary: "Rancher resources are not in the expected state", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[RancherIssues]}}, 165 VZClientHangingIssueDueToLongCertificateApproval: {Type: VZClientHangingIssueDueToLongCertificateApproval, Summary: " Verrazzano Client is hanging due to the long time that it takes to approve and provision certificates", Informational: true, Impact: 10, Confidence: 10}, 166 CertificateExpired: {Type: CertificateExpired, Summary: "A certificate in the cluster is currently expired", Informational: true, Impact: 10, Confidence: 10}, 167 CertificateExperiencingIssuesInCluster: {Type: CertificateExperiencingIssuesInCluster, Summary: "A certificate in the cluster is experiencing issues, but it is not expired", Informational: true, Impact: 10, Confidence: 10}, 168 ClusterAPIClusterIssues: {Type: ClusterAPIClusterIssues, Summary: "Cluster API cluster resources are not in the expected state", Informational: false, Impact: 10, Confidence: 10, Actions: []Action{KnownActions[ClusterAPIClusterIssues]}}, 169 CaCrtExpiredInCluster: {Type: CaCrtExpiredInCluster, Summary: "A ca.crt value in the cluster is expired", Informational: true, Impact: 10, Confidence: 10}, 170 BlockStorageLimitExceeded: {Type: BlockStorageLimitExceeded, Summary: "You have reached your 
service limit in this Availability Domain for volumes. Please try creating the volume in a different Availability Domain or Region, or try using a smaller volume size.", Informational: true, Impact: 10, Confidence: 10}, 171 TCPKeepIdleIssues: {Type: TCPKeepIdleIssues, Summary: "Issues setting the TCP_KEEPIDLE socket option have been detected in the cluster", Informational: true, Impact: 10, Confidence: 10}, 172 NamespaceCurrentlyInTerminatingStateForLongDuration: {Type: NamespaceCurrentlyInTerminatingStateForLongDuration, Summary: "A namespace within the cluster has been in a terminating state for a long duration of time", Informational: true, Impact: 5, Confidence: 10}, 173 InnoDBClusterResourceCurrentlyInTerminatingStateForLongDuration: {Type: InnoDBClusterResourceCurrentlyInTerminatingStateForLongDuration, Summary: "An InnoDBCluster resource within the cluster has been in a terminating state for a long duration of time", Informational: true, Impact: 5, Confidence: 10}, 174 PodHangingOnDeletion: {Type: PodHangingOnDeletion, Summary: "A pod has been stuck terminating for 10 minutes or greater", Informational: true, Impact: 5, Confidence: 10}, 175 PodWaitingOnReadinessGates: {Type: PodWaitingOnReadinessGates, Summary: "A pod in the cluster is waiting on its readiness gates", Informational: true, Impact: 8, Confidence: 10}, 176 } 177 178 // NewKnownIssueSupportingData adds a known issue 179 func NewKnownIssueSupportingData(issueType string, source string, supportingData []SupportData) (issue Issue) { 180 issue = getKnownIssueOrDie(issueType) 181 issue.Source = source 182 issue.SupportingData = supportingData 183 return issue 184 } 185 186 // NewKnownIssueMessagesFiles adds a known issue 187 func NewKnownIssueMessagesFiles(issueType string, source string, messages []string, fileNames []string) (issue Issue) { 188 issue = getKnownIssueOrDie(issueType) 189 issue.Source = source 190 issue.SupportingData = make([]SupportData, 1) 191 issue.SupportingData[0] = SupportData{ 
192 Messages: messages, 193 RelatedFiles: fileNames, 194 } 195 return issue 196 } 197 198 // NewKnownIssueMessagesMatches adds a known issue 199 func NewKnownIssueMessagesMatches(issueType string, source string, messages []string, matches []files.TextMatch) (issue Issue) { 200 issue = getKnownIssueOrDie(issueType) 201 issue.Source = source 202 issue.SupportingData = make([]SupportData, 1) 203 issue.SupportingData[0] = SupportData{ 204 Messages: messages, 205 TextMatches: matches, 206 } 207 return issue 208 } 209 210 // IssueReporter is a helper for consolidating known issues before contributing them to the report 211 // An analyzer may is free to use the IssueReporter NewKnown* helpers for known issues, however they 212 // are not required to do so and are free to form fully custom issues and Contribute 213 // those directly to the report.Contribute* helpers. This allows analyzers flexibility, but the goal 214 // here is that the IssueReporter can evolve to support all of the cases if possible. 215 type IssueReporter struct { 216 PendingIssues map[string]Issue 217 } 218 219 // AddKnownIssueSupportingData adds a known issue 220 func (issueReporter *IssueReporter) AddKnownIssueSupportingData(issueType string, source string, supportingData []SupportData) { 221 confirmKnownIssueOrDie(issueType) 222 223 // If this is a new issue, get a new one 224 if issue, ok := issueReporter.PendingIssues[issueType]; !ok { 225 issueReporter.PendingIssues[issueType] = NewKnownIssueSupportingData(issueType, source, supportingData) 226 } else { 227 issue.SupportingData = append(issue.SupportingData, supportingData...) 
228 issueReporter.PendingIssues[issueType] = issue 229 } 230 } 231 232 // AddKnownIssueMessagesFiles adds a known issue 233 func (issueReporter *IssueReporter) AddKnownIssueMessagesFiles(issueType string, source string, messages []string, fileNames []string) { 234 confirmKnownIssueOrDie(issueType) 235 236 // If this is a new issue, get a new one 237 if issue, ok := issueReporter.PendingIssues[issueType]; !ok { 238 issueReporter.PendingIssues[issueType] = NewKnownIssueMessagesFiles(issueType, source, messages, fileNames) 239 } else { 240 supportData := SupportData{ 241 Messages: messages, 242 RelatedFiles: fileNames, 243 } 244 issue.SupportingData = append(issue.SupportingData, supportData) 245 issueReporter.PendingIssues[issueType] = issue 246 } 247 } 248 249 // AddKnownIssueMessagesMatches adds a known issue 250 func (issueReporter *IssueReporter) AddKnownIssueMessagesMatches(issueType string, source string, messages []string, matches []files.TextMatch) { 251 confirmKnownIssueOrDie(issueType) 252 253 // If this is a new issue, get a new one 254 if issue, ok := issueReporter.PendingIssues[issueType]; !ok { 255 issueReporter.PendingIssues[issueType] = NewKnownIssueMessagesMatches(issueType, source, messages, matches) 256 } else { 257 supportData := SupportData{ 258 Messages: messages, 259 TextMatches: matches, 260 } 261 issue.SupportingData = append(issue.SupportingData, supportData) 262 issueReporter.PendingIssues[issueType] = issue 263 } 264 } 265 266 // DeduplicateSupportingDataList 267 func DeduplicateSupportingDataList(dataIn []SupportData) (dataOut []SupportData) { 268 // First deduplicate each individual SupportData element, get a minimal set of file and messages at least in 269 // each one. 
270 dataOut = make([]SupportData, len(dataIn)) 271 for index, supportData := range dataIn { 272 dataOut[index] = deduplicateSupportData(supportData) 273 } 274 // TODO: Next deduplicate the SupportData entries that match exactly 275 276 return dataIn 277 } 278 279 // deduplicateSupportData will deduplicate values within a single SupportData 280 func deduplicateSupportData(dataIn SupportData) (dataOut SupportData) { 281 dataOut.RelatedFiles = deduplicateStringSlice(dataIn.RelatedFiles) 282 dataOut.Messages = deduplicateStringSlice(dataIn.Messages) 283 // TODO: deduplicate 284 dataOut.JSONPaths = dataIn.JSONPaths 285 dataOut.TextMatches = dataIn.TextMatches 286 return dataOut 287 } 288 289 func deduplicateStringSlice(sliceIn []string) (sliceOut []string) { 290 if len(sliceIn) <= 1 { 291 copy(sliceOut, sliceIn) 292 } else { 293 tempMap := make(map[string]int) 294 for _, value := range sliceIn { 295 _, ok := tempMap[value] 296 if !ok { 297 tempMap[value] = 0 298 } 299 } 300 sliceOut = make([]string, len(tempMap)) 301 index := 0 302 for key := range tempMap { 303 sliceOut[index] = key 304 index++ 305 } 306 } 307 return sliceOut 308 } 309 310 // The helpers that work with known issue types only support working with those types 311 // If code is supplying an issueType that is not known, that is a coding error and we 312 // panic so that is clear immediately to the developer. 
313 func getKnownIssueOrDie(issueType string) (issue Issue) { 314 issue, ok := knownIssues[issueType] 315 if !ok { 316 panic("This helper is used with known issue types only") 317 } 318 return issue 319 } 320 321 func confirmKnownIssueOrDie(issueType string) { 322 _, ok := knownIssues[issueType] 323 if !ok { 324 panic("This helper is used with known issue types only") 325 } 326 } 327 328 // Contribute will contribute issues which have been added to the issue reporter 329 func (issueReporter *IssueReporter) Contribute(log *zap.SugaredLogger, source string) { 330 if len(issueReporter.PendingIssues) == 0 { 331 return 332 } 333 // Contribute the issues all at once 334 ContributeIssuesMap(log, source, issueReporter.PendingIssues) 335 issueReporter.PendingIssues = make(map[string]Issue) 336 } 337 338 // SingleMessage is a helper which is useful when adding a single message to supporting data 339 func SingleMessage(message string) (messages []string) { 340 messages = make([]string, 1) 341 messages[0] = message 342 return messages 343 } 344 345 // GetRelatedPodMessage returns the message for an issue in pod, used for setting supporting data 346 func GetRelatedPodMessage(pod, ns string) string { 347 return "Pod \"" + pod + "\" in namespace \"" + ns + "\"" 348 } 349 350 // GetRelatedServiceMessage returns the message for an issue in a service, used for setting supporting data 351 func GetRelatedServiceMessage(service, ns string) string { 352 return "Service \"" + service + "\" in namespace \"" + ns + "\"" 353 } 354 355 // GetRelatedLogFromPodMessage returns the message to indicate the issue in the pod log, in a given namespace 356 func GetRelatedLogFromPodMessage(podLog string) string { 357 splitStr := strings.Split(podLog, "/") 358 pod := splitStr[len(splitStr)-2] 359 ns := splitStr[len(splitStr)-3] 360 return "Log from pod \"" + pod + "\" in namespace \"" + ns + "\"" 361 } 362 363 // GetRelatedEventMessage returns the message for an event, used for setting supporting data 
func GetRelatedEventMessage(ns string) string {
	// The namespace is wrapped in literal double quotes; no escaping is applied to ns.
	quotedNamespace := `"` + ns + `"`
	return "Event(s) in namespace " + quotedNamespace
}

// GetRelatedVZResourceMessage returns the fixed message that refers to the Verrazzano custom resource,
// used for setting supporting data
func GetRelatedVZResourceMessage() string {
	const vzResourceMessage = "Verrazzano custom resource"
	return vzResourceMessage
}