github.com/kiali/kiali@v1.84.0/graph/telemetry/istio/appender/aggregate_node.go (about)

     1  package appender
     2  
     3  import (
     4  	"fmt"
     5  	"time"
     6  
     7  	"github.com/prometheus/common/model"
     8  
     9  	"github.com/kiali/kiali/graph"
    10  	"github.com/kiali/kiali/graph/telemetry/istio/util"
    11  	"github.com/kiali/kiali/log"
    12  	"github.com/kiali/kiali/prometheus"
    13  )
    14  
    15  const (
    16  	AggregateNodeAppenderName = "aggregateNode"
    17  )
    18  
// AggregateNodeAppender is responsible for injecting aggregate nodes into the graph to gain
// visibility into traffic aggregations for a user-specified metric attribute.
// Note: Aggregate Nodes are supported only on Requests traffic (not TCP or gRPC-message traffic)
type AggregateNodeAppender struct {
	// Aggregate is the metric attribute to aggregate on. It is used verbatim as a label name in
	// the Prometheus queries and when reading labels from the returned time-series.
	Aggregate          string
	// AggregateValue is optional. When set, only time-series whose Aggregate label equals this
	// value are queried; when empty, all aggregate values other than "unknown" are considered.
	AggregateValue     string
	// GraphType is the type of graph being generated. Nothing is appended for graph.GraphTypeService.
	GraphType          string
	// InjectServiceNodes selects the destination the aggregate node is attached to: the service
	// node when true, otherwise the workload/app node.
	InjectServiceNodes bool
	// Namespaces is keyed by namespace name; the entry's Duration supplies the query range.
	Namespaces         map[string]graph.NamespaceInfo
	QueryTime          int64 // unix time in seconds
	// Rates holds the requested rates. Aggregates are injected only when the Grpc and/or Http
	// rate is graph.RateRequests.
	Rates              graph.RequestedRates
	// Service is optional and is consulted only when AggregateValue is set; it then restricts
	// the query to the given destination_service_name.
	Service            string
}
    32  
    33  // Name implements Appender
    34  func (a AggregateNodeAppender) Name() string {
    35  	return AggregateNodeAppenderName
    36  }
    37  
    38  // IsFinalizer implements Appender
    39  func (a AggregateNodeAppender) IsFinalizer() bool {
    40  	return false
    41  }
    42  
    43  // AppendGraph implements Appender
    44  func (a AggregateNodeAppender) AppendGraph(trafficMap graph.TrafficMap, globalInfo *graph.AppenderGlobalInfo, namespaceInfo *graph.AppenderNamespaceInfo) {
    45  	if len(trafficMap) == 0 {
    46  		return
    47  	}
    48  
    49  	// Aggregate Nodes are not applicable to Service Graphs
    50  	if a.GraphType == graph.GraphTypeService {
    51  		return
    52  	}
    53  
    54  	// Aggregate Nodes are currently supported only on Requests traffic (not TCP or gRPC-message traffic)
    55  	if a.Rates.Grpc != graph.RateRequests && a.Rates.Http != graph.RateRequests {
    56  		return
    57  	}
    58  
    59  	if globalInfo.PromClient == nil {
    60  		var err error
    61  		globalInfo.PromClient, err = prometheus.NewClient()
    62  		graph.CheckError(err)
    63  	}
    64  
    65  	if a.AggregateValue == "" {
    66  		a.appendGraph(trafficMap, namespaceInfo.Namespace, globalInfo.PromClient)
    67  	} else {
    68  		a.appendNodeGraph(trafficMap, namespaceInfo.Namespace, globalInfo.PromClient)
    69  	}
    70  }
    71  
    72  func (a AggregateNodeAppender) appendGraph(trafficMap graph.TrafficMap, namespace string, client *prometheus.Client) {
    73  	log.Tracef("Resolving request aggregates for namespace=[%s], aggregate=[%s]", namespace, a.Aggregate)
    74  	duration := a.Namespaces[namespace].Duration
    75  
    76  	// query prometheus for aggregate info in two queries (assume aggregation is typically request classification, so use dest telemetry):
    77  	//   note1: we want to only match the aggregate when it is set and not "unknown".  But in Prometheus a negative test on an unset label
    78  	//      matches everything, so using %s!=unknown means we still have to filter out unset time-series below...
    79  	//   note2: for now we will filter out aggregates with no traffic on the assumption that users probably don't want to
    80  	//      see them and it will just increase the graph density.  To change that behavior remove the "> 0" conditions.
    81  	// 1) query for requests originating from a workload outside the namespace.
    82  	groupBy := fmt.Sprintf("source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision,request_protocol,response_code,grpc_response_status,response_flags,%s", a.Aggregate)
    83  	httpQuery := fmt.Sprintf(`sum(rate(%s{reporter="destination",source_workload_namespace!="%s",destination_service_namespace="%v",%s!="unknown"}[%vs])) by (%s) > 0`,
    84  		"istio_requests_total",
    85  		namespace,
    86  		namespace,
    87  		a.Aggregate,
    88  		int(duration.Seconds()), // range duration for the query
    89  		groupBy)
    90  	query := httpQuery
    91  	vector := promQuery(query, time.Unix(a.QueryTime, 0), client.GetContext(), client.API(), a)
    92  	a.injectAggregates(trafficMap, &vector)
    93  
    94  	// 2) query for requests originating from a workload inside of the namespace
    95  	httpQuery = fmt.Sprintf(`sum(rate(%s{reporter="destination",source_workload_namespace="%s",%s!="unknown"}[%vs])) by (%s) > 0`,
    96  		"istio_requests_total",
    97  		namespace,
    98  		a.Aggregate,
    99  		int(duration.Seconds()), // range duration for the query
   100  		groupBy)
   101  	query = httpQuery
   102  	vector = promQuery(query, time.Unix(a.QueryTime, 0), client.GetContext(), client.API(), a)
   103  	a.injectAggregates(trafficMap, &vector)
   104  }
   105  
   106  func (a AggregateNodeAppender) appendNodeGraph(trafficMap graph.TrafficMap, namespace string, client *prometheus.Client) {
   107  	log.Tracef("Resolving node request aggregates for namespace=[%s], aggregate=[%s=%s]", namespace, a.Aggregate, a.AggregateValue)
   108  	duration := a.Namespaces[namespace].Duration
   109  
   110  	// query prometheus for aggregate info in a single query (assume aggregation is typically request classification, so use dest telemetry):
   111  	//   note1: for now we will filter out aggregates with no traffic on the assumption that users probably don't want to
   112  	//      see them and it will just increase the graph density.  To change that behavior remove the "> 0" conditions.
   113  	serviceFragment := ""
   114  	if a.Service != "" {
   115  		serviceFragment = fmt.Sprintf(`,destination_service_name="%s"`, a.Service)
   116  	}
   117  	groupBy := fmt.Sprintf("source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision,request_protocol,response_code,grpc_response_status,response_flags,%s", a.Aggregate)
   118  	httpQuery := fmt.Sprintf(`sum(rate(%s{reporter="destination",destination_service_namespace="%s",%s="%s"%s}[%vs])) by (%s) > 0`,
   119  		"istio_requests_total",
   120  		namespace,
   121  		a.Aggregate,
   122  		a.AggregateValue,
   123  		serviceFragment,
   124  		int(duration.Seconds()), // range duration for the query
   125  		groupBy)
   126  	query := httpQuery
   127  	vector := promQuery(query, time.Unix(a.QueryTime, 0), client.GetContext(), client.API(), a)
   128  	a.injectAggregates(trafficMap, &vector)
   129  }
   130  
   131  func (a AggregateNodeAppender) injectAggregates(trafficMap graph.TrafficMap, vector *model.Vector) {
   132  	skipRequestsGrpc := a.Rates.Grpc != graph.RateRequests
   133  	skipRequestsHttp := a.Rates.Http != graph.RateRequests
   134  
   135  	for _, s := range *vector {
   136  		m := s.Metric
   137  		lSourceCluster, sourceClusterOk := m["source_cluster"]
   138  		lSourceWlNs, sourceWlNsOk := m["source_workload_namespace"]
   139  		lSourceWl, sourceWlOk := m["source_workload"]
   140  		lSourceApp, sourceAppOk := m["source_canonical_service"]
   141  		lSourceVer, sourceVerOk := m["source_canonical_revision"]
   142  		lDestCluster, destClusterOk := m["destination_cluster"]
   143  		lDestSvcNs, destSvcNsOk := m["destination_service_namespace"]
   144  		lDestSvc, destSvcOk := m["destination_service"]
   145  		lDestSvcName, destSvcNameOk := m["destination_service_name"]
   146  		lDestWlNs, destWlNsOk := m["destination_workload_namespace"]
   147  		lDestWl, destWlOk := m["destination_workload"]
   148  		lDestApp, destAppOk := m["destination_canonical_service"]
   149  		lDestVer, destVerOk := m["destination_canonical_revision"]
   150  		lCode := m["response_code"]
   151  		lGrpc, grpcOk := m["grpc_response_status"] // will be missing for non-GRPC
   152  		lFlags, flagsOk := m["response_flags"]
   153  		lProtocol, protocolOk := m["request_protocol"]             // because currently we only support requests traffic the protocol should be set
   154  		lAggregate, aggregateOk := m[model.LabelName(a.Aggregate)] // may be unset, see note above
   155  
   156  		if !aggregateOk {
   157  			continue
   158  		}
   159  
   160  		if !sourceWlNsOk || !sourceWlOk || !sourceAppOk || !sourceVerOk || !destSvcNsOk || !destSvcOk || !destSvcNameOk || !destWlNsOk || !destWlOk || !destAppOk || !destVerOk || !flagsOk || !protocolOk {
   161  			log.Warningf("Skipping %v, missing expected labels", m.String())
   162  			continue
   163  		}
   164  
   165  		sourceWlNs := string(lSourceWlNs)
   166  		sourceWl := string(lSourceWl)
   167  		sourceApp := string(lSourceApp)
   168  		sourceVer := string(lSourceVer)
   169  		destSvc := string(lDestSvc)
   170  		code := string(lCode)
   171  		protocol := string(lProtocol)
   172  		flags := string(lFlags)
   173  		aggregate := string(lAggregate)
   174  
   175  		if (skipRequestsHttp && protocol == graph.HTTP.Name) || (skipRequestsGrpc && protocol == graph.GRPC.Name) {
   176  			continue
   177  		}
   178  
   179  		// handle clusters
   180  		sourceCluster, destCluster := util.HandleClusters(lSourceCluster, sourceClusterOk, lDestCluster, destClusterOk)
   181  
   182  		if util.IsBadSourceTelemetry(sourceCluster, sourceClusterOk, sourceWlNs, sourceWl, sourceApp) {
   183  			continue
   184  		}
   185  
   186  		if protocolOk {
   187  			// set response code in a backward compatible way
   188  			code = util.HandleResponseCode(protocol, code, grpcOk, string(lGrpc))
   189  		} else {
   190  			// because currently we only support requests traffic the protocol should be set
   191  			log.Warningf("Skipping %v, missing expected protocol label", m.String())
   192  			continue
   193  			// protocol = "tcp"
   194  		}
   195  
   196  		// handle unusual destinations
   197  		destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer, _ := util.HandleDestination(sourceCluster, sourceWlNs, sourceWl, destCluster, string(lDestSvcNs), string(lDestSvc), string(lDestSvcName), string(lDestWlNs), string(lDestWl), string(lDestApp), string(lDestVer))
   198  
   199  		if util.IsBadDestTelemetry(destCluster, destClusterOk, destSvcNs, destSvc, destSvcName, destWl) {
   200  			continue
   201  		}
   202  
   203  		// make code more readable by setting "host" because "destSvc" holds destination.service.host | request.host | "unknown"
   204  		host := destSvc
   205  
   206  		val := float64(s.Value)
   207  
   208  		// inject aggregate node between source and destination
   209  		sourceID, _, _ := graph.Id(sourceCluster, sourceWlNs, "", sourceWlNs, sourceWl, sourceApp, sourceVer, a.GraphType)
   210  		sourceNode, sourceFound := trafficMap[sourceID]
   211  		if !sourceFound {
   212  			log.Debugf("Expected source [%s] node not found in traffic map. Skipping aggregate injection [%s]", sourceID, aggregate)
   213  			continue
   214  		}
   215  
   216  		// if service nodes are injected show the service-related aggregation:
   217  		//   - use the service node as the dest
   218  		//   - associate aggregate node with the destSvcName and, if set, destApp
   219  		// else show the independent aggregation by using the workload/app node as the dest
   220  		destID := ""
   221  		if a.InjectServiceNodes {
   222  			destID, _, _ = graph.Id(destCluster, destSvcNs, destSvcName, "", "", "", "", a.GraphType) // service
   223  		} else {
   224  			destID, _, _ = graph.Id(destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer, a.GraphType) // wl/app
   225  		}
   226  		destNode, destFound := trafficMap[destID]
   227  		if !destFound {
   228  			log.Debugf("Expected dest [%s] node not found in traffic map. Skipping aggregate injection [%s]", destID, aggregate)
   229  			continue
   230  		}
   231  
   232  		var aggrNode *graph.Node
   233  		if a.InjectServiceNodes {
   234  			aggrNode, _ = addNode(trafficMap, destCluster, destSvcNs, a.Aggregate, aggregate, destSvcName, destApp)
   235  		} else {
   236  			aggrNode, _ = addNode(trafficMap, destCluster, destWlNs, a.Aggregate, aggregate, "", "")
   237  		}
   238  
   239  		// replace the non-classified edge (from source to dest) with the classified edges
   240  		// - note that if not every request has a classification match the traffic may be lower than actual, I
   241  		//   think this this OK, and if the user cares they should define a "catch-all" classification match
   242  		safeEdges := []*graph.Edge{}
   243  		for _, e := range sourceNode.Edges {
   244  			if e.Dest.ID != destID {
   245  				safeEdges = append(safeEdges, e)
   246  			}
   247  		}
   248  		sourceNode.Edges = safeEdges
   249  
   250  		addTraffic(val, protocol, code, flags, host, sourceNode, aggrNode)
   251  		addTraffic(val, protocol, code, flags, host, aggrNode, destNode)
   252  	}
   253  }
   254  
   255  func addTraffic(val float64, protocol, code, flags, host string, source, dest *graph.Node) {
   256  	var edge *graph.Edge
   257  	for _, e := range source.Edges {
   258  		if dest.ID == e.Dest.ID && e.Metadata[graph.ProtocolKey] == protocol {
   259  			edge = e
   260  			break
   261  		}
   262  	}
   263  	if nil == edge {
   264  		edge = source.AddEdge(dest)
   265  		edge.Metadata[graph.ProtocolKey] = protocol
   266  	}
   267  
   268  	// Only update traffic on the aggregate node and associated edges.  Remember that this is an appender and the
   269  	// in/out traffic is already set for the non-aggregate nodes.
   270  	var sourceMetadata graph.Metadata
   271  	var destMetadata graph.Metadata
   272  	if source.NodeType == graph.NodeTypeAggregate {
   273  		sourceMetadata = source.Metadata
   274  	} else {
   275  		destMetadata = dest.Metadata
   276  	}
   277  	graph.AddToMetadata(protocol, val, code, flags, host, sourceMetadata, destMetadata, edge.Metadata)
   278  }
   279  
   280  func addNode(trafficMap graph.TrafficMap, cluster, namespace, aggregate, aggregateVal, svcName, app string) (*graph.Node, bool) {
   281  	id := graph.AggregateID(cluster, namespace, aggregate, aggregateVal, svcName)
   282  	node, found := trafficMap[id]
   283  	if !found {
   284  		newNode := graph.NewAggregateNodeExplicit(id, cluster, namespace, aggregate, aggregateVal, svcName, app)
   285  		node = &newNode
   286  		trafficMap[id] = node
   287  	}
   288  	return node, found
   289  }