github.com/kiali/kiali@v1.84.0/graph/telemetry/istio/appender/health.go (about)

     1  package appender
     2  
     3  import (
     4  	"context"
     5  	"sync"
     6  	"time"
     7  
     8  	"github.com/kiali/kiali/business"
     9  	"github.com/kiali/kiali/graph"
    10  	"github.com/kiali/kiali/models"
    11  )
    12  
    13  const HealthAppenderName = "health"
    14  
    15  // HealthAppender is responsible for adding the information needed to perform client-side health calculations. This
    16  // includes both health configuration, and health data, to the graph.  TODO: replace this with server-side
    17  // health calculation, and report only the health results.
    18  // Name: health
    19  type HealthAppender struct {
    20  	Namespaces        graph.NamespaceInfoMap
    21  	QueryTime         int64 // unix time in seconds
    22  	RequestedDuration time.Duration
    23  }
    24  
    25  // Name implements Appender
    26  func (a HealthAppender) Name() string {
    27  	return HealthAppenderName
    28  }
    29  
    30  // IsFinalizer implements Appender
    31  func (a HealthAppender) IsFinalizer() bool {
    32  	return true
    33  }
    34  
    35  // AppendGraph implements Appender
    36  func (a HealthAppender) AppendGraph(trafficMap graph.TrafficMap, globalInfo *graph.AppenderGlobalInfo, _ *graph.AppenderNamespaceInfo) {
    37  	if len(trafficMap) == 0 {
    38  		return
    39  	}
    40  
    41  	a.attachHealthConfig(trafficMap, globalInfo)
    42  	a.attachHealth(trafficMap, globalInfo)
    43  }
    44  
    45  func addValueToRequests(requests map[string]map[string]float64, protocol, code string, val float64) {
    46  	if _, ok := requests[protocol]; !ok {
    47  		requests[protocol] = make(map[string]float64)
    48  	}
    49  	if _, ok := requests[protocol][code]; !ok {
    50  		requests[protocol][code] = 0
    51  	}
    52  	requests[protocol][code] += val
    53  }
    54  
    55  // addEdgeToHealthData adds the edge's responses to the source and destination nodes' health data.
    56  func addEdgeTrafficToNodeHealth(edge *graph.Edge) {
    57  	source := edge.Source
    58  	dest := edge.Dest
    59  	initHealthData(source)
    60  	initHealthData(dest)
    61  
    62  	var (
    63  		protocol  string
    64  		responses graph.Responses
    65  		ok        bool
    66  	)
    67  	if protocol, ok = edge.Metadata[graph.ProtocolKey].(string); !ok {
    68  		return
    69  	}
    70  	if responses, ok = edge.Metadata[graph.MetadataKey(protocol+"Responses")].(graph.Responses); !ok {
    71  		return
    72  	}
    73  
    74  	for code, detail := range responses {
    75  		for _, val := range detail.Flags {
    76  			switch source.NodeType {
    77  			case graph.NodeTypeService:
    78  				health := source.Metadata[graph.HealthData].(*models.ServiceHealth)
    79  				addValueToRequests(health.Requests.Outbound, protocol, code, val)
    80  				source.Metadata[graph.HealthData] = health
    81  			case graph.NodeTypeWorkload:
    82  				health := source.Metadata[graph.HealthData].(*models.WorkloadHealth)
    83  				addValueToRequests(health.Requests.Outbound, protocol, code, val)
    84  				source.Metadata[graph.HealthData] = health
    85  			case graph.NodeTypeApp:
    86  				health := source.Metadata[graph.HealthData].(*models.AppHealth)
    87  				addValueToRequests(health.Requests.Outbound, protocol, code, val)
    88  				source.Metadata[graph.HealthData] = health
    89  				health = source.Metadata[graph.HealthDataApp].(*models.AppHealth)
    90  				addValueToRequests(health.Requests.Outbound, protocol, code, val)
    91  				source.Metadata[graph.HealthDataApp] = health
    92  			}
    93  
    94  			switch dest.NodeType {
    95  			case graph.NodeTypeService:
    96  				health := dest.Metadata[graph.HealthData].(*models.ServiceHealth)
    97  				addValueToRequests(health.Requests.Inbound, protocol, code, val)
    98  				dest.Metadata[graph.HealthData] = health
    99  			case graph.NodeTypeWorkload:
   100  				health := dest.Metadata[graph.HealthData].(*models.WorkloadHealth)
   101  				addValueToRequests(health.Requests.Inbound, protocol, code, val)
   102  				dest.Metadata[graph.HealthData] = health
   103  			case graph.NodeTypeApp:
   104  				health := dest.Metadata[graph.HealthData].(*models.AppHealth)
   105  				addValueToRequests(health.Requests.Inbound, protocol, code, val)
   106  				dest.Metadata[graph.HealthData] = health
   107  				health = dest.Metadata[graph.HealthDataApp].(*models.AppHealth)
   108  				addValueToRequests(health.Requests.Inbound, protocol, code, val)
   109  				dest.Metadata[graph.HealthDataApp] = health
   110  			}
   111  		}
   112  	}
   113  }
   114  
   115  func initHealthData(node *graph.Node) {
   116  	if _, ok := node.Metadata[graph.HealthData]; !ok {
   117  		if node.NodeType == graph.NodeTypeService {
   118  			m := models.EmptyServiceHealth()
   119  			node.Metadata[graph.HealthData] = &m
   120  		} else if node.NodeType == graph.NodeTypeWorkload {
   121  			m := models.EmptyWorkloadHealth()
   122  			node.Metadata[graph.HealthData] = m
   123  		} else if node.NodeType == graph.NodeTypeApp {
   124  			m := models.EmptyAppHealth()
   125  			mApp := models.EmptyAppHealth()
   126  			node.Metadata[graph.HealthData] = &m
   127  			node.Metadata[graph.HealthDataApp] = &mApp
   128  		}
   129  	}
   130  }
   131  
   132  func (a *HealthAppender) attachHealthConfig(trafficMap graph.TrafficMap, globalInfo *graph.AppenderGlobalInfo) {
   133  	for _, n := range trafficMap {
   134  		// skip health for inaccessible nodes.  For now, include health for outsider nodes because edge health
   135  		// may depend on any health config for those nodes.  And, users likely find the health useful.
   136  		if b, ok := n.Metadata[graph.IsInaccessible]; ok && b.(bool) {
   137  			continue
   138  		}
   139  
   140  		// for applicable node types, attach any custom health configuration.  additionally,
   141  		switch n.NodeType {
   142  		case graph.NodeTypeService:
   143  			if srv, found := getServiceDefinition(n.Cluster, n.Namespace, n.Service, globalInfo); found {
   144  				n.Metadata[graph.HasHealthConfig] = models.GetHealthAnnotation(srv.HealthAnnotations, models.GetHealthConfigAnnotation())
   145  			}
   146  		case graph.NodeTypeWorkload:
   147  			if workload, found := getWorkload(n.Cluster, n.Namespace, n.Workload, globalInfo); found {
   148  				n.Metadata[graph.HasHealthConfig] = models.GetHealthAnnotation(workload.HealthAnnotations, models.GetHealthConfigAnnotation())
   149  			}
   150  		default:
   151  			continue
   152  		}
   153  	}
   154  }
   155  
   156  func (a *HealthAppender) attachHealth(trafficMap graph.TrafficMap, globalInfo *graph.AppenderGlobalInfo) {
   157  	var nodesWithHealth []*graph.Node
   158  	type healthRequest struct {
   159  		app       bool
   160  		service   bool
   161  		workload  bool
   162  		cluster   string
   163  		namespace string
   164  	}
   165  
   166  	// Health requests are per namespace meaning if a single node in the namespace
   167  	// has health info then we send a namespace wide health request to fetch the
   168  	// health info for the whole namespace.
   169  	healthReqs := make(map[string]healthRequest)
   170  
   171  	// Limit health fetches to only the necessary namespaces for the necessary types
   172  	for _, n := range trafficMap {
   173  		// This also gets initialized when summarizing health data from the edges but
   174  		// not all nodes (idle nodes) have edges so we init the health data here as well.
   175  		// Frontend expects the health data to not be null and will fail if it is.
   176  		initHealthData(n)
   177  
   178  		// skip health for inaccessible nodes.  For now, include health for outsider nodes because edge health
   179  		// may depend on any health config for those nodes.  And, users likely find the health useful.
   180  		if b, ok := n.Metadata[graph.IsInaccessible]; ok && b.(bool) {
   181  			continue
   182  		}
   183  
   184  		var req healthRequest
   185  		var ok bool
   186  		if req, ok = healthReqs[n.Namespace+n.Cluster]; !ok {
   187  			req = healthRequest{}
   188  		}
   189  
   190  		switch n.NodeType {
   191  		case graph.NodeTypeApp:
   192  			// always get app health for app node (used for app box health)
   193  			req.app = true
   194  
   195  			// for versioned app node, get workload health as well (used for the versioned app node itself)
   196  			if graph.IsOK(n.Workload) {
   197  				req.workload = true
   198  			}
   199  		case graph.NodeTypeWorkload:
   200  			req.workload = true
   201  		case graph.NodeTypeService:
   202  			req.service = true
   203  		}
   204  
   205  		req.cluster = n.Cluster
   206  		req.namespace = n.Namespace
   207  
   208  		healthReqs[n.Namespace+n.Cluster] = req
   209  		nodesWithHealth = append(nodesWithHealth, n)
   210  	}
   211  
   212  	bs := globalInfo.Business
   213  	ctx := globalInfo.Context
   214  
   215  	var cancel context.CancelFunc
   216  	if ctx == nil {
   217  		ctx = context.Background()
   218  	}
   219  	// TODO: Decide if this should be the request duration. If so,
   220  	// then the user should be informed why the graph request failed
   221  	// so that they can increase the refresh interval.
   222  	const maxRequestDuration = time.Minute * 15
   223  	ctx, cancel = context.WithTimeout(ctx, maxRequestDuration)
   224  	defer cancel()
   225  
   226  	type result struct {
   227  		namespace        string
   228  		cluster          string
   229  		appNSHealth      models.NamespaceAppHealth
   230  		serviceNSHealth  models.NamespaceServiceHealth
   231  		workloadNSHealth models.NamespaceWorkloadHealth
   232  		err              error
   233  	}
   234  	resultsCh := make(chan result)
   235  	// Fetch all the health data in parallel. The health data will most likely be cached
   236  	// and no prom queries are performed.
   237  	go func(ctx context.Context) {
   238  		wg := &sync.WaitGroup{}
   239  		for _, req := range healthReqs {
   240  			if req.app {
   241  				wg.Add(1)
   242  				go func(ctx context.Context, namespace, cluster string) {
   243  					defer wg.Done()
   244  					h, err := bs.Health.GetNamespaceAppHealth(ctx, business.NamespaceHealthCriteria{Namespace: namespace, Cluster: cluster, IncludeMetrics: false})
   245  					resultsCh <- result{appNSHealth: h, namespace: namespace, err: err, cluster: cluster}
   246  				}(ctx, req.namespace, req.cluster)
   247  			}
   248  
   249  			if req.workload {
   250  				wg.Add(1)
   251  				go func(ctx context.Context, namespace, cluster string) {
   252  					defer wg.Done()
   253  					h, err := bs.Health.GetNamespaceWorkloadHealth(ctx, business.NamespaceHealthCriteria{Namespace: namespace, Cluster: cluster, IncludeMetrics: false})
   254  					resultsCh <- result{workloadNSHealth: h, namespace: namespace, err: err, cluster: cluster}
   255  				}(ctx, req.namespace, req.cluster)
   256  			}
   257  
   258  			if req.service {
   259  				wg.Add(1)
   260  				go func(ctx context.Context, namespace, cluster string) {
   261  					defer wg.Done()
   262  					s, err := bs.Health.GetNamespaceServiceHealth(ctx, business.NamespaceHealthCriteria{Namespace: namespace, Cluster: cluster, IncludeMetrics: false})
   263  					resultsCh <- result{serviceNSHealth: s, namespace: namespace, err: err, cluster: cluster}
   264  				}(ctx, req.namespace, req.cluster)
   265  			}
   266  		}
   267  		// Wait for all requests to finish sending before closing the channel.
   268  		wg.Wait()
   269  		close(resultsCh)
   270  	}(ctx)
   271  
   272  	// Note: these are key'd off of namespace+name instead of namespace to make lookups unique
   273  	// and keep the map flatter.
   274  	appHealth := make(map[string]*models.AppHealth)
   275  	serviceHealth := make(map[string]*models.ServiceHealth)
   276  	workloadHealth := make(map[string]*models.WorkloadHealth)
   277  	var errors []error
   278  	// This will block until all requests have finished.
   279  	for result := range resultsCh {
   280  		if result.err != nil {
   281  			errors = append(errors, result.err)
   282  			continue
   283  		}
   284  
   285  		if result.appNSHealth != nil {
   286  			for name, health := range result.appNSHealth {
   287  				appHealth[name+result.namespace+result.cluster] = health
   288  			}
   289  		} else if result.workloadNSHealth != nil {
   290  			for name, health := range result.workloadNSHealth {
   291  				workloadHealth[name+result.namespace+result.cluster] = health
   292  			}
   293  		} else if result.serviceNSHealth != nil {
   294  			for name, health := range result.serviceNSHealth {
   295  				serviceHealth[name+result.namespace+result.cluster] = health
   296  			}
   297  		}
   298  	}
   299  	if len(errors) > 0 {
   300  		// This just panics with the first error.
   301  		graph.CheckError(errors[0])
   302  	}
   303  
   304  	for _, e := range trafficMap.Edges() {
   305  		addEdgeTrafficToNodeHealth(e)
   306  	}
   307  
   308  	for _, n := range nodesWithHealth {
   309  		switch n.NodeType {
   310  		case graph.NodeTypeApp:
   311  			var key graph.MetadataKey
   312  			if graph.IsOK(n.Workload) {
   313  				key = graph.HealthDataApp
   314  			} else {
   315  				key = graph.HealthData
   316  			}
   317  
   318  			var health *models.AppHealth
   319  			if h, found := n.Metadata[key]; found {
   320  				health = h.(*models.AppHealth)
   321  			} else {
   322  				health = &models.AppHealth{}
   323  			}
   324  
   325  			if h, found := appHealth[n.App+n.Namespace+n.Cluster]; found {
   326  				health.WorkloadStatuses = h.WorkloadStatuses
   327  				health.Requests.HealthAnnotations = h.Requests.HealthAnnotations
   328  			}
   329  			n.Metadata[key] = health
   330  		case graph.NodeTypeService:
   331  			var health *models.ServiceHealth
   332  			if h, found := n.Metadata[graph.HealthData]; found {
   333  				health = h.(*models.ServiceHealth)
   334  			} else {
   335  				health = &models.ServiceHealth{}
   336  			}
   337  
   338  			if h, found := serviceHealth[n.Service+n.Namespace+n.Cluster]; found {
   339  				health.Requests.HealthAnnotations = h.Requests.HealthAnnotations
   340  			}
   341  			n.Metadata[graph.HealthData] = health
   342  		case graph.NodeTypeWorkload:
   343  			var health *models.WorkloadHealth
   344  			if h, found := n.Metadata[graph.HealthData]; found {
   345  				health = h.(*models.WorkloadHealth)
   346  			} else {
   347  				health = &models.WorkloadHealth{}
   348  			}
   349  
   350  			if h, found := workloadHealth[n.Workload+n.Namespace+n.Cluster]; found {
   351  				health.WorkloadStatus = h.WorkloadStatus
   352  				health.Requests.HealthAnnotations = h.Requests.HealthAnnotations
   353  			}
   354  			n.Metadata[graph.HealthData] = health
   355  		}
   356  	}
   357  }