github.com/kiali/kiali@v1.84.0/graph/telemetry/istio/appender/health.go (about) 1 package appender 2 3 import ( 4 "context" 5 "sync" 6 "time" 7 8 "github.com/kiali/kiali/business" 9 "github.com/kiali/kiali/graph" 10 "github.com/kiali/kiali/models" 11 ) 12 13 const HealthAppenderName = "health" 14 15 // HealthAppender is responsible for adding the information needed to perform client-side health calculations. This 16 // includes both health configuration, and health data, to the graph. TODO: replace this with server-side 17 // health calculation, and report only the health results. 18 // Name: health 19 type HealthAppender struct { 20 Namespaces graph.NamespaceInfoMap 21 QueryTime int64 // unix time in seconds 22 RequestedDuration time.Duration 23 } 24 25 // Name implements Appender 26 func (a HealthAppender) Name() string { 27 return HealthAppenderName 28 } 29 30 // IsFinalizer implements Appender 31 func (a HealthAppender) IsFinalizer() bool { 32 return true 33 } 34 35 // AppendGraph implements Appender 36 func (a HealthAppender) AppendGraph(trafficMap graph.TrafficMap, globalInfo *graph.AppenderGlobalInfo, _ *graph.AppenderNamespaceInfo) { 37 if len(trafficMap) == 0 { 38 return 39 } 40 41 a.attachHealthConfig(trafficMap, globalInfo) 42 a.attachHealth(trafficMap, globalInfo) 43 } 44 45 func addValueToRequests(requests map[string]map[string]float64, protocol, code string, val float64) { 46 if _, ok := requests[protocol]; !ok { 47 requests[protocol] = make(map[string]float64) 48 } 49 if _, ok := requests[protocol][code]; !ok { 50 requests[protocol][code] = 0 51 } 52 requests[protocol][code] += val 53 } 54 55 // addEdgeToHealthData adds the edge's responses to the source and destination nodes' health data. 56 func addEdgeTrafficToNodeHealth(edge *graph.Edge) { 57 source := edge.Source 58 dest := edge.Dest 59 initHealthData(source) 60 initHealthData(dest) 61 62 var ( 63 protocol string 64 responses graph.Responses 65 ok bool 66 ) 67 if protocol, ok = edge.Metadata[graph.ProtocolKey].(string); !ok { 68 return 69 } 70 if responses, ok = edge.Metadata[graph.MetadataKey(protocol+"Responses")].(graph.Responses); !ok { 71 return 72 } 73 74 for code, detail := range responses { 75 for _, val := range detail.Flags { 76 switch source.NodeType { 77 case graph.NodeTypeService: 78 health := source.Metadata[graph.HealthData].(*models.ServiceHealth) 79 addValueToRequests(health.Requests.Outbound, protocol, code, val) 80 source.Metadata[graph.HealthData] = health 81 case graph.NodeTypeWorkload: 82 health := source.Metadata[graph.HealthData].(*models.WorkloadHealth) 83 addValueToRequests(health.Requests.Outbound, protocol, code, val) 84 source.Metadata[graph.HealthData] = health 85 case graph.NodeTypeApp: 86 health := source.Metadata[graph.HealthData].(*models.AppHealth) 87 addValueToRequests(health.Requests.Outbound, protocol, code, val) 88 source.Metadata[graph.HealthData] = health 89 health = source.Metadata[graph.HealthDataApp].(*models.AppHealth) 90 addValueToRequests(health.Requests.Outbound, protocol, code, val) 91 source.Metadata[graph.HealthDataApp] = health 92 } 93 94 switch dest.NodeType { 95 case graph.NodeTypeService: 96 health := dest.Metadata[graph.HealthData].(*models.ServiceHealth) 97 addValueToRequests(health.Requests.Inbound, protocol, code, val) 98 dest.Metadata[graph.HealthData] = health 99 case graph.NodeTypeWorkload: 100 health := dest.Metadata[graph.HealthData].(*models.WorkloadHealth) 101 addValueToRequests(health.Requests.Inbound, protocol, code, val) 102 dest.Metadata[graph.HealthData] = health 103 case graph.NodeTypeApp: 104 health := dest.Metadata[graph.HealthData].(*models.AppHealth) 105 addValueToRequests(health.Requests.Inbound, protocol, code, val) 106 dest.Metadata[graph.HealthData] = health 107 health = dest.Metadata[graph.HealthDataApp].(*models.AppHealth) 108 addValueToRequests(health.Requests.Inbound, protocol, code, val) 109 dest.Metadata[graph.HealthDataApp] = health 110 } 111 } 112 } 113 } 114 115 func initHealthData(node *graph.Node) { 116 if _, ok := node.Metadata[graph.HealthData]; !ok { 117 if node.NodeType == graph.NodeTypeService { 118 m := models.EmptyServiceHealth() 119 node.Metadata[graph.HealthData] = &m 120 } else if node.NodeType == graph.NodeTypeWorkload { 121 m := models.EmptyWorkloadHealth() 122 node.Metadata[graph.HealthData] = m 123 } else if node.NodeType == graph.NodeTypeApp { 124 m := models.EmptyAppHealth() 125 mApp := models.EmptyAppHealth() 126 node.Metadata[graph.HealthData] = &m 127 node.Metadata[graph.HealthDataApp] = &mApp 128 } 129 } 130 } 131 132 func (a *HealthAppender) attachHealthConfig(trafficMap graph.TrafficMap, globalInfo *graph.AppenderGlobalInfo) { 133 for _, n := range trafficMap { 134 // skip health for inaccessible nodes. For now, include health for outsider nodes because edge health 135 // may depend on any health config for those nodes. And, users likely find the health useful. 136 if b, ok := n.Metadata[graph.IsInaccessible]; ok && b.(bool) { 137 continue 138 } 139 140 // for applicable node types, attach any custom health configuration. additionally, 141 switch n.NodeType { 142 case graph.NodeTypeService: 143 if srv, found := getServiceDefinition(n.Cluster, n.Namespace, n.Service, globalInfo); found { 144 n.Metadata[graph.HasHealthConfig] = models.GetHealthAnnotation(srv.HealthAnnotations, models.GetHealthConfigAnnotation()) 145 } 146 case graph.NodeTypeWorkload: 147 if workload, found := getWorkload(n.Cluster, n.Namespace, n.Workload, globalInfo); found { 148 n.Metadata[graph.HasHealthConfig] = models.GetHealthAnnotation(workload.HealthAnnotations, models.GetHealthConfigAnnotation()) 149 } 150 default: 151 continue 152 } 153 } 154 } 155 156 func (a *HealthAppender) attachHealth(trafficMap graph.TrafficMap, globalInfo *graph.AppenderGlobalInfo) { 157 var nodesWithHealth []*graph.Node 158 type healthRequest struct { 159 app bool 160 service bool 161 workload bool 162 cluster string 163 namespace string 164 } 165 166 // Health requests are per namespace meaning if a single node in the namespace 167 // has health info then we send a namespace wide health request to fetch the 168 // health info for the whole namespace. 169 healthReqs := make(map[string]healthRequest) 170 171 // Limit health fetches to only the necessary namespaces for the necessary types 172 for _, n := range trafficMap { 173 // This also gets initialized when summarizing health data from the edges but 174 // not all nodes (idle nodes) have edges so we init the health data here as well. 175 // Frontend expects the health data to not be null and will fail if it is. 176 initHealthData(n) 177 178 // skip health for inaccessible nodes. For now, include health for outsider nodes because edge health 179 // may depend on any health config for those nodes. And, users likely find the health useful. 180 if b, ok := n.Metadata[graph.IsInaccessible]; ok && b.(bool) { 181 continue 182 } 183 184 var req healthRequest 185 var ok bool 186 if req, ok = healthReqs[n.Namespace+n.Cluster]; !ok { 187 req = healthRequest{} 188 } 189 190 switch n.NodeType { 191 case graph.NodeTypeApp: 192 // always get app health for app node (used for app box health) 193 req.app = true 194 195 // for versioned app node, get workload health as well (used for the versioned app node itself) 196 if graph.IsOK(n.Workload) { 197 req.workload = true 198 } 199 case graph.NodeTypeWorkload: 200 req.workload = true 201 case graph.NodeTypeService: 202 req.service = true 203 } 204 205 req.cluster = n.Cluster 206 req.namespace = n.Namespace 207 208 healthReqs[n.Namespace+n.Cluster] = req 209 nodesWithHealth = append(nodesWithHealth, n) 210 } 211 212 bs := globalInfo.Business 213 ctx := globalInfo.Context 214 215 var cancel context.CancelFunc 216 if ctx == nil { 217 ctx = context.Background() 218 } 219 // TODO: Decide if this should be the request duration. If so, 220 // then the user should be informed why the graph request failed 221 // so that they can increase the refresh interval. 222 const maxRequestDuration = time.Minute * 15 223 ctx, cancel = context.WithTimeout(ctx, maxRequestDuration) 224 defer cancel() 225 226 type result struct { 227 namespace string 228 cluster string 229 appNSHealth models.NamespaceAppHealth 230 serviceNSHealth models.NamespaceServiceHealth 231 workloadNSHealth models.NamespaceWorkloadHealth 232 err error 233 } 234 resultsCh := make(chan result) 235 // Fetch all the health data in parallel. The health data will most likely be cached 236 // and no prom queries are performed. 237 go func(ctx context.Context) { 238 wg := &sync.WaitGroup{} 239 for _, req := range healthReqs { 240 if req.app { 241 wg.Add(1) 242 go func(ctx context.Context, namespace, cluster string) { 243 defer wg.Done() 244 h, err := bs.Health.GetNamespaceAppHealth(ctx, business.NamespaceHealthCriteria{Namespace: namespace, Cluster: cluster, IncludeMetrics: false}) 245 resultsCh <- result{appNSHealth: h, namespace: namespace, err: err, cluster: cluster} 246 }(ctx, req.namespace, req.cluster) 247 } 248 249 if req.workload { 250 wg.Add(1) 251 go func(ctx context.Context, namespace, cluster string) { 252 defer wg.Done() 253 h, err := bs.Health.GetNamespaceWorkloadHealth(ctx, business.NamespaceHealthCriteria{Namespace: namespace, Cluster: cluster, IncludeMetrics: false}) 254 resultsCh <- result{workloadNSHealth: h, namespace: namespace, err: err, cluster: cluster} 255 }(ctx, req.namespace, req.cluster) 256 } 257 258 if req.service { 259 wg.Add(1) 260 go func(ctx context.Context, namespace, cluster string) { 261 defer wg.Done() 262 s, err := bs.Health.GetNamespaceServiceHealth(ctx, business.NamespaceHealthCriteria{Namespace: namespace, Cluster: cluster, IncludeMetrics: false}) 263 resultsCh <- result{serviceNSHealth: s, namespace: namespace, err: err, cluster: cluster} 264 }(ctx, req.namespace, req.cluster) 265 } 266 } 267 // Wait for all requests to finish sending before closing the channel. 268 wg.Wait() 269 close(resultsCh) 270 }(ctx) 271 272 // Note: these are key'd off of namespace+name instead of namespace to make lookups unique 273 // and keep the map flatter. 274 appHealth := make(map[string]*models.AppHealth) 275 serviceHealth := make(map[string]*models.ServiceHealth) 276 workloadHealth := make(map[string]*models.WorkloadHealth) 277 var errors []error 278 // This will block until all requests have finished. 279 for result := range resultsCh { 280 if result.err != nil { 281 errors = append(errors, result.err) 282 continue 283 } 284 285 if result.appNSHealth != nil { 286 for name, health := range result.appNSHealth { 287 appHealth[name+result.namespace+result.cluster] = health 288 } 289 } else if result.workloadNSHealth != nil { 290 for name, health := range result.workloadNSHealth { 291 workloadHealth[name+result.namespace+result.cluster] = health 292 } 293 } else if result.serviceNSHealth != nil { 294 for name, health := range result.serviceNSHealth { 295 serviceHealth[name+result.namespace+result.cluster] = health 296 } 297 } 298 } 299 if len(errors) > 0 { 300 // This just panics with the first error. 301 graph.CheckError(errors[0]) 302 } 303 304 for _, e := range trafficMap.Edges() { 305 addEdgeTrafficToNodeHealth(e) 306 } 307 308 for _, n := range nodesWithHealth { 309 switch n.NodeType { 310 case graph.NodeTypeApp: 311 var key graph.MetadataKey 312 if graph.IsOK(n.Workload) { 313 key = graph.HealthDataApp 314 } else { 315 key = graph.HealthData 316 } 317 318 var health *models.AppHealth 319 if h, found := n.Metadata[key]; found { 320 health = h.(*models.AppHealth) 321 } else { 322 health = &models.AppHealth{} 323 } 324 325 if h, found := appHealth[n.App+n.Namespace+n.Cluster]; found { 326 health.WorkloadStatuses = h.WorkloadStatuses 327 health.Requests.HealthAnnotations = h.Requests.HealthAnnotations 328 } 329 n.Metadata[key] = health 330 case graph.NodeTypeService: 331 var health *models.ServiceHealth 332 if h, found := n.Metadata[graph.HealthData]; found { 333 health = h.(*models.ServiceHealth) 334 } else { 335 health = &models.ServiceHealth{} 336 } 337 338 if h, found := serviceHealth[n.Service+n.Namespace+n.Cluster]; found { 339 health.Requests.HealthAnnotations = h.Requests.HealthAnnotations 340 } 341 n.Metadata[graph.HealthData] = health 342 case graph.NodeTypeWorkload: 343 var health *models.WorkloadHealth 344 if h, found := n.Metadata[graph.HealthData]; found { 345 health = h.(*models.WorkloadHealth) 346 } else { 347 health = &models.WorkloadHealth{} 348 } 349 350 if h, found := workloadHealth[n.Workload+n.Namespace+n.Cluster]; found { 351 health.WorkloadStatus = h.WorkloadStatus 352 health.Requests.HealthAnnotations = h.Requests.HealthAnnotations 353 } 354 n.Metadata[graph.HealthData] = health 355 } 356 } 357 }