github.com/kiali/kiali@v1.84.0/business/health.go (about)

     1  package business
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"time"
     7  
     8  	"github.com/prometheus/common/model"
     9  	"k8s.io/apimachinery/pkg/api/errors"
    10  
    11  	"github.com/kiali/kiali/kubernetes"
    12  	"github.com/kiali/kiali/models"
    13  	"github.com/kiali/kiali/observability"
    14  	"github.com/kiali/kiali/prometheus"
    15  )
    16  
// HealthService fetches health data from various sources and converts it to
// the Kiali model.
//
// It holds a Prometheus client (prom) used to query request rates, a reference
// to the business layer (businessLayer) used to reach the App, Namespace, Svc
// and Workload services, and a map of Kubernetes clients (userClients) that is
// looked up by cluster name to verify a cluster is known before querying it.
type HealthService struct {
	prom          prometheus.ClientInterface
	businessLayer *Layer
	userClients   map[string]kubernetes.ClientInterface
}
    23  
// NamespaceHealthCriteria groups the parameters of a namespace-wide health
// query.
//
// Namespace and Cluster select what is queried. QueryTime and RateInterval are
// forwarded to the Prometheus request-rate queries. When IncludeMetrics is
// false the Prometheus queries are skipped and the returned health carries no
// request rates.
type NamespaceHealthCriteria struct {
	IncludeMetrics bool
	Namespace      string
	Cluster        string
	QueryTime      time.Time
	RateInterval   string
}
    31  
// HealthAnnotation is the annotation-key filter passed to
// models.GetHealthAnnotation when extracting the health-related annotations
// of a workload.
var HealthAnnotation = []models.AnnotationKey{models.RateHealthAnnotation}
    34  
    35  // GetServiceHealth returns a service health (service request error rate)
    36  func (in *HealthService) GetServiceHealth(ctx context.Context, namespace, cluster, service, rateInterval string, queryTime time.Time, svc *models.Service) (models.ServiceHealth, error) {
    37  	var end observability.EndFunc
    38  	_, end = observability.StartSpan(ctx, "GetServiceHealth",
    39  		observability.Attribute("package", "business"),
    40  		observability.Attribute("namespace", namespace),
    41  		observability.Attribute("service", service),
    42  		observability.Attribute("rateInterval", rateInterval),
    43  		observability.Attribute("queryTime", queryTime),
    44  	)
    45  	defer end()
    46  
    47  	rqHealth, err := in.getServiceRequestsHealth(namespace, cluster, service, rateInterval, queryTime, svc)
    48  	return models.ServiceHealth{Requests: rqHealth}, err
    49  }
    50  
    51  // GetAppHealth returns an app health from just Namespace and app name (thus, it fetches data from K8S and Prometheus)
    52  func (in *HealthService) GetAppHealth(ctx context.Context, namespace, cluster, app, rateInterval string, queryTime time.Time, appD *appDetails) (models.AppHealth, error) {
    53  	var end observability.EndFunc
    54  	_, end = observability.StartSpan(ctx, "GetAppHealth",
    55  		observability.Attribute("package", "business"),
    56  		observability.Attribute("namespace", namespace),
    57  		observability.Attribute("cluster", cluster),
    58  		observability.Attribute("app", app),
    59  		observability.Attribute("rateInterval", rateInterval),
    60  		observability.Attribute("queryTime", queryTime),
    61  	)
    62  	defer end()
    63  
    64  	return in.getAppHealth(namespace, cluster, app, rateInterval, queryTime, appD.Workloads)
    65  }
    66  
    67  func (in *HealthService) getAppHealth(namespace, cluster, app, rateInterval string, queryTime time.Time, ws models.Workloads) (models.AppHealth, error) {
    68  	health := models.EmptyAppHealth()
    69  
    70  	// Perf: do not bother fetching request rate if there are no workloads or no workload has sidecar
    71  	hasSidecar := false
    72  	for _, w := range ws {
    73  		if w.IstioSidecar || w.IsGateway() {
    74  			hasSidecar = true
    75  			break
    76  		}
    77  	}
    78  
    79  	// Fetch services requests rates
    80  	var errRate error
    81  	if hasSidecar {
    82  		rate, err := in.getAppRequestsHealth(namespace, cluster, app, rateInterval, queryTime)
    83  		health.Requests = rate
    84  		errRate = err
    85  	}
    86  
    87  	// Deployment status
    88  	health.WorkloadStatuses = ws.CastWorkloadStatuses()
    89  
    90  	return health, errRate
    91  }
    92  
    93  // GetWorkloadHealth returns a workload health from just Namespace and workload (thus, it fetches data from K8S and Prometheus)
    94  func (in *HealthService) GetWorkloadHealth(ctx context.Context, namespace, cluster, workload, rateInterval string, queryTime time.Time, w *models.Workload) (models.WorkloadHealth, error) {
    95  	var end observability.EndFunc
    96  	_, end = observability.StartSpan(ctx, "GetWorkloadHealth",
    97  		observability.Attribute("package", "business"),
    98  		observability.Attribute("namespace", namespace),
    99  		observability.Attribute("workload", workload),
   100  		observability.Attribute("rateInterval", rateInterval),
   101  		observability.Attribute("queryTime", queryTime),
   102  	)
   103  	defer end()
   104  
   105  	// Perf: do not bother fetching request rate if workload has no sidecar
   106  	if !w.IstioSidecar && !w.IsGateway() {
   107  		return models.WorkloadHealth{
   108  			WorkloadStatus: w.CastWorkloadStatus(),
   109  			Requests:       models.NewEmptyRequestHealth(),
   110  		}, nil
   111  	}
   112  
   113  	// Add Telemetry info
   114  	rate, err := in.getWorkloadRequestsHealth(namespace, cluster, workload, rateInterval, queryTime, w)
   115  	return models.WorkloadHealth{
   116  		WorkloadStatus: w.CastWorkloadStatus(),
   117  		Requests:       rate,
   118  	}, err
   119  }
   120  
   121  // GetNamespaceAppHealth returns a health for all apps in given Namespace (thus, it fetches data from K8S and Prometheus)
   122  func (in *HealthService) GetNamespaceAppHealth(ctx context.Context, criteria NamespaceHealthCriteria) (models.NamespaceAppHealth, error) {
   123  	var end observability.EndFunc
   124  	ctx, end = observability.StartSpan(ctx, "GetNamespaceAppHealth",
   125  		observability.Attribute("package", "business"),
   126  		observability.Attribute("cluster", criteria.Cluster),
   127  		observability.Attribute("namespace", criteria.Namespace),
   128  		observability.Attribute("rateInterval", criteria.RateInterval),
   129  		observability.Attribute("queryTime", criteria.QueryTime),
   130  	)
   131  	defer end()
   132  
   133  	cluster := criteria.Cluster
   134  
   135  	if _, ok := in.userClients[cluster]; !ok {
   136  		return nil, fmt.Errorf("Cluster [%s] is not found or is not accessible for Kiali", cluster)
   137  	}
   138  
   139  	appEntities, err := in.businessLayer.App.fetchNamespaceApps(ctx, criteria.Namespace, cluster, "")
   140  	if err != nil {
   141  		return nil, err
   142  	}
   143  
   144  	return in.getNamespaceAppHealth(appEntities, criteria)
   145  }
   146  
   147  func (in *HealthService) getNamespaceAppHealth(appEntities namespaceApps, criteria NamespaceHealthCriteria) (models.NamespaceAppHealth, error) {
   148  	namespace := criteria.Namespace
   149  	queryTime := criteria.QueryTime
   150  	rateInterval := criteria.RateInterval
   151  	cluster := criteria.Cluster
   152  	allHealth := make(models.NamespaceAppHealth)
   153  
   154  	// Perf: do not bother fetching request rate if no workloads or no workload has sidecar
   155  	sidecarPresent := false
   156  	var appSidecars = make(map[string]bool)
   157  
   158  	// Prepare all data
   159  	for app, entities := range appEntities {
   160  		if app != "" {
   161  			h := models.EmptyAppHealth()
   162  			allHealth[app] = &h
   163  			if entities != nil {
   164  				h.WorkloadStatuses = entities.Workloads.CastWorkloadStatuses()
   165  				for _, w := range entities.Workloads {
   166  					if w.IstioSidecar || w.IsGateway() {
   167  						sidecarPresent = true
   168  						appSidecars[app] = true
   169  						break
   170  					}
   171  				}
   172  			}
   173  		}
   174  	}
   175  
   176  	if sidecarPresent && criteria.IncludeMetrics {
   177  		// Fetch services requests rates
   178  		rates, err := in.prom.GetAllRequestRates(namespace, cluster, rateInterval, queryTime)
   179  		if err != nil {
   180  			return allHealth, errors.NewServiceUnavailable(err.Error())
   181  		}
   182  		// Fill with collected request rates
   183  		fillAppRequestRates(allHealth, rates, appSidecars)
   184  	}
   185  
   186  	return allHealth, nil
   187  }
   188  
   189  // GetNamespaceServiceHealth returns a health for all services in given Namespace (thus, it fetches data from K8S and Prometheus)
   190  func (in *HealthService) GetNamespaceServiceHealth(ctx context.Context, criteria NamespaceHealthCriteria) (models.NamespaceServiceHealth, error) {
   191  	var end observability.EndFunc
   192  	ctx, end = observability.StartSpan(ctx, "GetNamespaceServiceHealth",
   193  		observability.Attribute("package", "business"),
   194  		observability.Attribute("namespace", criteria.Namespace),
   195  		observability.Attribute("cluster", criteria.Cluster),
   196  		observability.Attribute("rateInterval", criteria.RateInterval),
   197  		observability.Attribute("queryTime", criteria.QueryTime),
   198  	)
   199  	defer end()
   200  
   201  	namespace := criteria.Namespace
   202  	cluster := criteria.Cluster
   203  
   204  	if _, ok := in.userClients[cluster]; !ok {
   205  		return nil, fmt.Errorf("Cluster [%s] is not found or is not accessible for Kiali", cluster)
   206  	}
   207  
   208  	if _, err := in.businessLayer.Namespace.GetClusterNamespace(ctx, namespace, cluster); err != nil {
   209  		return nil, err
   210  	}
   211  
   212  	var services *models.ServiceList
   213  	var err error
   214  
   215  	svcCriteria := ServiceCriteria{
   216  		Cluster:                cluster,
   217  		Namespace:              namespace,
   218  		IncludeHealth:          false,
   219  		IncludeIstioResources:  false,
   220  		IncludeOnlyDefinitions: true,
   221  	}
   222  	services, err = in.businessLayer.Svc.GetServiceList(ctx, svcCriteria)
   223  	if err != nil {
   224  		return nil, err
   225  	}
   226  	return in.getNamespaceServiceHealth(services, criteria), nil
   227  }
   228  
   229  func (in *HealthService) getNamespaceServiceHealth(services *models.ServiceList, criteria NamespaceHealthCriteria) models.NamespaceServiceHealth {
   230  	namespace := criteria.Namespace
   231  	queryTime := criteria.QueryTime
   232  	rateInterval := criteria.RateInterval
   233  	cluster := criteria.Cluster
   234  
   235  	allHealth := make(models.NamespaceServiceHealth)
   236  
   237  	// Prepare all data (note that it's important to provide data for all services, even those which may not have any health, for overview cards)
   238  	if services != nil {
   239  		for _, service := range services.Services {
   240  			h := models.EmptyServiceHealth()
   241  			h.Requests.HealthAnnotations = service.HealthAnnotations
   242  			allHealth[service.Name] = &h
   243  		}
   244  	}
   245  
   246  	if criteria.IncludeMetrics {
   247  		// Fetch services requests rates
   248  		rates, _ := in.prom.GetNamespaceServicesRequestRates(namespace, cluster, rateInterval, queryTime)
   249  		// Fill with collected request rates
   250  		lblDestSvc := model.LabelName("destination_service_name")
   251  		for _, sample := range rates {
   252  			service := string(sample.Metric[lblDestSvc])
   253  			if health, ok := allHealth[service]; ok {
   254  				health.Requests.AggregateInbound(sample)
   255  			}
   256  		}
   257  		for _, health := range allHealth {
   258  			health.Requests.CombineReporters()
   259  		}
   260  	}
   261  	return allHealth
   262  }
   263  
   264  // GetNamespaceWorkloadHealth returns a health for all workloads in given Namespace (thus, it fetches data from K8S and Prometheus)
   265  func (in *HealthService) GetNamespaceWorkloadHealth(ctx context.Context, criteria NamespaceHealthCriteria) (models.NamespaceWorkloadHealth, error) {
   266  	namespace := criteria.Namespace
   267  	rateInterval := criteria.RateInterval
   268  	queryTime := criteria.QueryTime
   269  	cluster := criteria.Cluster
   270  	var end observability.EndFunc
   271  	ctx, end = observability.StartSpan(ctx, "GetNamespaceWorkloadHealth",
   272  		observability.Attribute("package", "business"),
   273  		observability.Attribute("namespace", namespace),
   274  		observability.Attribute("cluster", cluster),
   275  		observability.Attribute("rateInterval", rateInterval),
   276  		observability.Attribute("queryTime", queryTime),
   277  	)
   278  	defer end()
   279  
   280  	if _, ok := in.userClients[cluster]; !ok {
   281  		return nil, fmt.Errorf("Cluster [%s] is not found or is not accessible for Kiali", cluster)
   282  	}
   283  
   284  	if _, err := in.businessLayer.Namespace.GetClusterNamespace(ctx, namespace, cluster); err != nil {
   285  		return nil, err
   286  	}
   287  
   288  	wl, err := in.businessLayer.Workload.fetchWorkloadsFromCluster(ctx, cluster, namespace, "")
   289  	if err != nil {
   290  		return nil, err
   291  	}
   292  
   293  	return in.getNamespaceWorkloadHealth(wl, criteria)
   294  }
   295  
   296  func (in *HealthService) getNamespaceWorkloadHealth(ws models.Workloads, criteria NamespaceHealthCriteria) (models.NamespaceWorkloadHealth, error) {
   297  	// Perf: do not bother fetching request rate if no workloads or no workload has sidecar
   298  	hasSidecar := false
   299  	namespace := criteria.Namespace
   300  	rateInterval := criteria.RateInterval
   301  	queryTime := criteria.QueryTime
   302  	cluster := criteria.Cluster
   303  	var wlSidecars = make(map[string]bool)
   304  
   305  	allHealth := make(models.NamespaceWorkloadHealth)
   306  	for _, w := range ws {
   307  		allHealth[w.Name] = models.EmptyWorkloadHealth()
   308  		allHealth[w.Name].Requests.HealthAnnotations = models.GetHealthAnnotation(w.HealthAnnotations, HealthAnnotation)
   309  		allHealth[w.Name].WorkloadStatus = w.CastWorkloadStatus()
   310  		if w.IstioSidecar || w.IsGateway() {
   311  			hasSidecar = true
   312  			wlSidecars[w.Name] = true
   313  		}
   314  	}
   315  
   316  	if hasSidecar && criteria.IncludeMetrics {
   317  		// Fetch services requests rates
   318  		rates, err := in.prom.GetAllRequestRates(namespace, cluster, rateInterval, queryTime)
   319  		if err != nil {
   320  			return allHealth, errors.NewServiceUnavailable(err.Error())
   321  		}
   322  		// Fill with collected request rates
   323  		fillWorkloadRequestRates(allHealth, rates, wlSidecars)
   324  	}
   325  
   326  	return allHealth, nil
   327  }
   328  
   329  // fillAppRequestRates aggregates requests rates from metrics fetched from Prometheus, and stores the result in the health map.
   330  func fillAppRequestRates(allHealth models.NamespaceAppHealth, rates model.Vector, appSidecars map[string]bool) {
   331  	lblDest := model.LabelName("destination_canonical_service")
   332  	lblSrc := model.LabelName("source_canonical_service")
   333  
   334  	for _, sample := range rates {
   335  		name := string(sample.Metric[lblDest])
   336  		// include requests only to apps which have a sidecar
   337  		if _, ok := appSidecars[name]; ok {
   338  			if health, ok := allHealth[name]; ok {
   339  				health.Requests.AggregateInbound(sample)
   340  			}
   341  			name = string(sample.Metric[lblSrc])
   342  			if health, ok := allHealth[name]; ok {
   343  				health.Requests.AggregateOutbound(sample)
   344  			}
   345  		}
   346  	}
   347  	for _, health := range allHealth {
   348  		health.Requests.CombineReporters()
   349  	}
   350  }
   351  
   352  // fillWorkloadRequestRates aggregates requests rates from metrics fetched from Prometheus, and stores the result in the health map.
   353  func fillWorkloadRequestRates(allHealth models.NamespaceWorkloadHealth, rates model.Vector, wlSidecars map[string]bool) {
   354  	lblDest := model.LabelName("destination_workload")
   355  	lblSrc := model.LabelName("source_workload")
   356  	for _, sample := range rates {
   357  		name := string(sample.Metric[lblDest])
   358  		// include requests only to workloads which have a sidecar
   359  		if _, ok := wlSidecars[name]; ok {
   360  			if health, ok := allHealth[name]; ok {
   361  				health.Requests.AggregateInbound(sample)
   362  			}
   363  			name = string(sample.Metric[lblSrc])
   364  			if health, ok := allHealth[name]; ok {
   365  				health.Requests.AggregateOutbound(sample)
   366  			}
   367  		}
   368  	}
   369  	for _, health := range allHealth {
   370  		health.Requests.CombineReporters()
   371  	}
   372  }
   373  
   374  func (in *HealthService) getServiceRequestsHealth(namespace, cluster, service, rateInterval string, queryTime time.Time, svc *models.Service) (models.RequestHealth, error) {
   375  	rqHealth := models.NewEmptyRequestHealth()
   376  	if svc.Type == "External" {
   377  		// ServiceEntry from Istio Registry
   378  		// Telemetry doesn't collect a namespace
   379  		namespace = "unknown"
   380  	}
   381  	inbound, err := in.prom.GetServiceRequestRates(namespace, cluster, service, rateInterval, queryTime)
   382  	if err != nil {
   383  		return rqHealth, errors.NewServiceUnavailable(err.Error())
   384  	}
   385  	for _, sample := range inbound {
   386  		rqHealth.AggregateInbound(sample)
   387  	}
   388  	rqHealth.HealthAnnotations = svc.HealthAnnotations
   389  	rqHealth.CombineReporters()
   390  	return rqHealth, nil
   391  }
   392  
   393  func (in *HealthService) getAppRequestsHealth(namespace, cluster, app, rateInterval string, queryTime time.Time) (models.RequestHealth, error) {
   394  	rqHealth := models.NewEmptyRequestHealth()
   395  
   396  	inbound, outbound, err := in.prom.GetAppRequestRates(namespace, cluster, app, rateInterval, queryTime)
   397  	if err != nil {
   398  		return rqHealth, errors.NewServiceUnavailable(err.Error())
   399  	}
   400  	for _, sample := range inbound {
   401  		rqHealth.AggregateInbound(sample)
   402  	}
   403  	for _, sample := range outbound {
   404  		rqHealth.AggregateOutbound(sample)
   405  	}
   406  	rqHealth.CombineReporters()
   407  	return rqHealth, nil
   408  }
   409  
   410  func (in *HealthService) getWorkloadRequestsHealth(namespace, cluster, workload, rateInterval string, queryTime time.Time, w *models.Workload) (models.RequestHealth, error) {
   411  	rqHealth := models.NewEmptyRequestHealth()
   412  	// @TODO include w.Cluster into query
   413  	inbound, outbound, err := in.prom.GetWorkloadRequestRates(namespace, cluster, workload, rateInterval, queryTime)
   414  	if err != nil {
   415  		return rqHealth, err
   416  	}
   417  	for _, sample := range inbound {
   418  		rqHealth.AggregateInbound(sample)
   419  	}
   420  	for _, sample := range outbound {
   421  		rqHealth.AggregateOutbound(sample)
   422  	}
   423  	if len(w.Pods) > 0 {
   424  		rqHealth.HealthAnnotations = models.GetHealthAnnotation(w.HealthAnnotations, HealthAnnotation)
   425  	}
   426  	rqHealth.CombineReporters()
   427  	return rqHealth, err
   428  }