github.com/kiali/kiali@v1.84.0/graph/telemetry/istio/istio.go (about) 1 // Package istio provides the Istio implementation of graph/TelemetryProvider. 2 package istio 3 4 // Istio.go is responsible for generating TrafficMaps using Istio telemetry. It implements the 5 // TelemetryVendor interface. 6 // 7 // The algorithm: 8 // Step 1) For each namespace: 9 // a) Query Prometheus (istio-requests-total metric) to retrieve the source-destination 10 // dependencies. Build a traffic map to provide a full representation of nodes and edges. 11 // 12 // b) Apply any requested appenders to alter or append-to the namespace traffic-map. 13 // 14 // c) Merge the namespace traffic-map into the final traffic-map 15 // 16 // Step 2) For the global traffic map 17 // a) Apply standard and requested finalizers to alter or append-to the final traffic-map 18 // 19 // b) Convert the final traffic-map to the requested vendor configuration (i.e. Cytoscape) and return 20 // 21 // Supports three vendor-specific query parameters: 22 // aggregate: Must be a valid metric attribute (default: request_operation) 23 // responseTime: Must be one of: avg | 50 | 95 | 99 24 // throughputType: request | response (default: response) 25 // 26 import ( 27 "context" 28 "crypto/md5" 29 "fmt" 30 "regexp" 31 "strings" 32 "time" 33 34 prom_v1 "github.com/prometheus/client_golang/api/prometheus/v1" 35 "github.com/prometheus/common/model" 36 37 "github.com/kiali/kiali/graph" 38 "github.com/kiali/kiali/graph/telemetry" 39 "github.com/kiali/kiali/graph/telemetry/istio/appender" 40 "github.com/kiali/kiali/graph/telemetry/istio/util" 41 "github.com/kiali/kiali/log" 42 "github.com/kiali/kiali/observability" 43 "github.com/kiali/kiali/prometheus" 44 "github.com/kiali/kiali/prometheus/internalmetrics" 45 ) 46 47 const ( 48 tsHash graph.MetadataKey = "tsHash" 49 tsHashMap graph.MetadataKey = "tsHashMap" 50 ) 51 52 var grpcMetric = regexp.MustCompile(`istio_.*_messages`) 53 54 // BuildNamespacesTrafficMap is required by 
the graph/TelemetryVendor interface 55 func BuildNamespacesTrafficMap(ctx context.Context, o graph.TelemetryOptions, client *prometheus.Client, globalInfo *graph.AppenderGlobalInfo) graph.TrafficMap { 56 var end observability.EndFunc 57 ctx, end = observability.StartSpan(ctx, "BuildNamespacesTrafficMap", 58 observability.Attribute("package", "istio"), 59 ) 60 defer end() 61 62 log.Tracef("Build [%s] graph for [%d] namespaces [%v]", o.GraphType, len(o.Namespaces), o.Namespaces) 63 64 appenders, finalizers := appender.ParseAppenders(o) 65 trafficMap := graph.NewTrafficMap() 66 67 for _, namespace := range o.Namespaces { 68 log.Tracef("Build traffic map for namespace [%v]", namespace) 69 namespaceTrafficMap := buildNamespaceTrafficMap(ctx, namespace.Name, o, client) 70 71 // The appenders can add/remove/alter nodes for the namespace 72 namespaceInfo := graph.NewAppenderNamespaceInfo(namespace.Name) 73 for _, a := range appenders { 74 var appenderEnd observability.EndFunc 75 _, appenderEnd = observability.StartSpan(ctx, "Appender "+a.Name(), 76 observability.Attribute("package", "istio"), 77 observability.Attribute("namespace", namespace.Name), 78 ) 79 appenderTimer := internalmetrics.GetGraphAppenderTimePrometheusTimer(a.Name()) 80 a.AppendGraph(namespaceTrafficMap, globalInfo, namespaceInfo) 81 appenderTimer.ObserveDuration() 82 appenderEnd() 83 } 84 85 // Merge this namespace into the final TrafficMap 86 telemetry.MergeTrafficMaps(trafficMap, namespace.Name, namespaceTrafficMap) 87 } 88 89 // The finalizers can perform final manipulations on the complete graph 90 for _, f := range finalizers { 91 f.AppendGraph(trafficMap, globalInfo, nil) 92 } 93 94 if graph.GraphTypeService == o.GraphType { 95 trafficMap = telemetry.ReduceToServiceGraph(trafficMap) 96 } 97 98 return trafficMap 99 } 100 101 // buildNamespaceTrafficMap returns a map of all namespace nodes (key=id). All 102 // nodes either directly send and/or receive requests from a node in the namespace. 
103 func buildNamespaceTrafficMap(ctx context.Context, namespace string, o graph.TelemetryOptions, client *prometheus.Client) graph.TrafficMap { 104 var end observability.EndFunc 105 _, end = observability.StartSpan(ctx, "buildNamespaceTrafficMap", 106 observability.Attribute("package", "istio"), 107 observability.Attribute("namespace", namespace), 108 ) 109 defer end() 110 // create map to aggregate traffic by protocol and response code 111 trafficMap := graph.NewTrafficMap() 112 duration := o.Namespaces[namespace].Duration 113 idleCondition := "> 0" 114 if o.IncludeIdleEdges { 115 idleCondition = "" 116 } 117 118 // HTTP/GRPC request traffic 119 if o.Rates.Http == graph.RateRequests || o.Rates.Grpc == graph.RateRequests { 120 metric := "istio_requests_total" 121 groupBy := "source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision,request_protocol,response_code,grpc_response_status,response_flags" 122 123 // 0) Incoming: query source telemetry to capture unserviced namespace services' incoming traffic 124 query := fmt.Sprintf(`sum(rate(%s{reporter="source",source_workload_namespace!="%s",destination_workload_namespace="unknown",destination_workload="unknown",destination_service=~"^.+\\.%s\\..+$"} [%vs])) by (%s) %s`, 125 metric, 126 namespace, 127 namespace, 128 int(duration.Seconds()), // range duration for the query 129 groupBy, 130 idleCondition) 131 incomingVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 132 populateTrafficMap(trafficMap, &incomingVector, metric, o) 133 134 // 1) Incoming: query destination telemetry to capture namespace services' incoming traffic 135 query = fmt.Sprintf(`sum(rate(%s{reporter="destination",destination_workload_namespace="%s"} [%vs])) by (%s) %s`, 136 
metric, 137 namespace, 138 int(duration.Seconds()), // range duration for the query 139 groupBy, 140 idleCondition) 141 incomingVector = promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 142 populateTrafficMap(trafficMap, &incomingVector, metric, o) 143 144 // 2) Outgoing: query source telemetry to capture namespace workloads' outgoing traffic 145 query = fmt.Sprintf(`sum(rate(%s{reporter="source",source_workload_namespace="%s"} [%vs])) by (%s) %s`, 146 metric, 147 namespace, 148 int(duration.Seconds()), // range duration for the query 149 groupBy, 150 idleCondition) 151 outgoingVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 152 populateTrafficMap(trafficMap, &outgoingVector, metric, o) 153 } 154 155 // GRPC Message traffic 156 if o.Rates.Grpc != graph.RateNone && o.Rates.Grpc != graph.RateRequests { 157 var metrics []string 158 groupBy := "source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision" 159 160 switch o.Rates.Grpc { 161 case graph.RateReceived: 162 metrics = []string{"istio_response_messages_total"} 163 case graph.RateSent: 164 metrics = []string{"istio_request_messages_total"} 165 case graph.RateTotal: 166 metrics = []string{"istio_request_messages_total", "istio_response_messages_total"} 167 default: 168 metrics = []string{} 169 } 170 171 for _, metric := range metrics { 172 // 0) Incoming: query source telemetry to capture unserviced namespace services' incoming traffic 173 query := fmt.Sprintf(`sum(rate(%s{reporter="source",source_workload_namespace!="%s",destination_workload_namespace="unknown",destination_workload="unknown",destination_service=~"^.+\\.%s\\..+$"} [%vs])) by (%s) %s`, 174 metric, 175 namespace, 176 namespace, 177 int(duration.Seconds()), // range 
duration for the query 178 groupBy, 179 idleCondition) 180 incomingVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 181 populateTrafficMap(trafficMap, &incomingVector, metric, o) 182 183 // 1) Incoming: query destination telemetry to capture namespace services' incoming traffic query = fmt.Sprintf(`sum(rate(%s{reporter="destination",destination_service_namespace="%s"} [%vs])) by (%s) %s`, 184 query = fmt.Sprintf(`sum(rate(%s{reporter="destination",destination_workload_namespace="%s"} [%vs])) by (%s) %s`, 185 metric, 186 namespace, 187 int(duration.Seconds()), // range duration for the query 188 groupBy, 189 idleCondition) 190 incomingVector = promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 191 populateTrafficMap(trafficMap, &incomingVector, metric, o) 192 193 // 2) Outgoing: query source telemetry to capture namespace workloads' outgoing traffic 194 query = fmt.Sprintf(`sum(rate(%s{reporter="source",source_workload_namespace="%s"} [%vs])) by (%s) %s`, 195 metric, 196 namespace, 197 int(duration.Seconds()), // range duration for the query 198 groupBy, 199 idleCondition) 200 outgoingVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 201 populateTrafficMap(trafficMap, &outgoingVector, metric, o) 202 } 203 } 204 205 // TCP Byte traffic 206 if o.Rates.Tcp != graph.RateNone { 207 var metrics []string 208 groupBy := "source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision,response_flags" 209 210 // L4 telemetry is backwards, see https://github.com/istio/istio/issues/32399 211 switch o.Rates.Tcp { 212 case graph.RateReceived: 213 metrics = []string{"istio_tcp_sent_bytes_total"} 214 case graph.RateSent: 215 metrics = []string{"istio_tcp_received_bytes_total"} 216 case 
graph.RateTotal: 217 metrics = []string{"istio_tcp_received_bytes_total", "istio_tcp_sent_bytes_total"} 218 default: 219 metrics = []string{} 220 } 221 222 for _, metric := range metrics { 223 // 0) Incoming: query source telemetry to capture unserviced namespace services' incoming traffic 224 query := fmt.Sprintf(`sum(rate(%s{reporter="source",source_workload_namespace!="%s",destination_workload_namespace="unknown",destination_workload="unknown",destination_service=~"^.+\\.%s\\..+$"} [%vs])) by (%s) %s`, 225 metric, 226 namespace, 227 namespace, 228 int(duration.Seconds()), // range duration for the query 229 groupBy, 230 idleCondition) 231 incomingVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 232 populateTrafficMap(trafficMap, &incomingVector, metric, o) 233 234 // 1) Incoming: query destination telemetry to capture namespace services' incoming traffic query = fmt.Sprintf(`sum(rate(%s{reporter="destination",destination_service_namespace="%s"} [%vs])) by (%s) %s`, 235 query = fmt.Sprintf(`sum(rate(%s{reporter="destination",destination_workload_namespace="%s"} [%vs])) by (%s) %s`, 236 metric, 237 namespace, 238 int(duration.Seconds()), // range duration for the query 239 groupBy, 240 idleCondition) 241 incomingVector = promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 242 populateTrafficMap(trafficMap, &incomingVector, metric, o) 243 244 // 2) Outgoing: query source telemetry to capture namespace workloads' outgoing traffic 245 query = fmt.Sprintf(`sum(rate(%s{reporter="source",source_workload_namespace="%s"} [%vs])) by (%s) %s`, 246 metric, 247 namespace, 248 int(duration.Seconds()), // range duration for the query 249 groupBy, 250 idleCondition) 251 outgoingVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 252 populateTrafficMap(trafficMap, &outgoingVector, metric, o) 253 } 254 } 255 256 return trafficMap 257 } 258 259 func populateTrafficMap(trafficMap graph.TrafficMap, vector *model.Vector, metric string, o 
graph.TelemetryOptions) { 260 isRequests := true 261 protocol := "" 262 switch { 263 case grpcMetric.MatchString(metric): 264 isRequests = false 265 protocol = graph.GRPC.Name 266 case strings.HasPrefix(metric, "istio_tcp"): 267 isRequests = false 268 protocol = graph.TCP.Name 269 } 270 skipRequestsGrpc := isRequests && o.Rates.Grpc != graph.RateRequests 271 skipRequestsHttp := isRequests && o.Rates.Http != graph.RateRequests 272 273 for _, s := range *vector { 274 val := float64(s.Value) 275 276 m := s.Metric 277 lSourceCluster, sourceClusterOk := m["source_cluster"] 278 lSourceWlNs, sourceWlNsOk := m["source_workload_namespace"] 279 lSourceWl, sourceWlOk := m["source_workload"] 280 lSourceApp, sourceAppOk := m["source_canonical_service"] 281 lSourceVer, sourceVerOk := m["source_canonical_revision"] 282 lDestCluster, destClusterOk := m["destination_cluster"] 283 lDestSvcNs, destSvcNsOk := m["destination_service_namespace"] 284 lDestSvc, destSvcOk := m["destination_service"] 285 lDestSvcName, destSvcNameOk := m["destination_service_name"] 286 lDestWlNs, destWlNsOk := m["destination_workload_namespace"] 287 lDestWl, destWlOk := m["destination_workload"] 288 lDestApp, destAppOk := m["destination_canonical_service"] 289 lDestVer, destVerOk := m["destination_canonical_revision"] 290 291 if !sourceWlNsOk || !sourceWlOk || !sourceAppOk || !sourceVerOk || !destSvcNsOk || !destSvcOk || !destSvcNameOk || !destWlNsOk || !destWlOk || !destAppOk || !destVerOk { 292 log.Warningf("Skipping %s, missing expected TS labels", m.String()) 293 continue 294 } 295 296 sourceWlNs := string(lSourceWlNs) 297 sourceWl := string(lSourceWl) 298 sourceApp := string(lSourceApp) 299 sourceVer := string(lSourceVer) 300 destSvc := string(lDestSvc) 301 302 flags := "" 303 if isRequests || protocol == graph.TCP.Name { 304 lFlags, flagsOk := m["response_flags"] 305 if !flagsOk { 306 log.Warningf("Skipping %s, missing expected TS labels", m.String()) 307 continue 308 } 309 flags = string(lFlags) 310 } 
311 312 // handle clusters 313 sourceCluster, destCluster := util.HandleClusters(lSourceCluster, sourceClusterOk, lDestCluster, destClusterOk) 314 315 if util.IsBadSourceTelemetry(sourceCluster, sourceClusterOk, sourceWlNs, sourceWl, sourceApp) { 316 continue 317 } 318 319 // handle unusual destinations 320 destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer, _ := util.HandleDestination(sourceCluster, sourceWlNs, sourceWl, destCluster, string(lDestSvcNs), string(lDestSvc), string(lDestSvcName), string(lDestWlNs), string(lDestWl), string(lDestApp), string(lDestVer)) 321 322 if util.IsBadDestTelemetry(destCluster, destClusterOk, destSvcNs, destSvc, destSvcName, destWl) { 323 continue 324 } 325 326 var code string 327 if isRequests { 328 lProtocol, protocolOk := m["request_protocol"] 329 lCode, codeOk := m["response_code"] 330 lGrpc, grpcOk := m["grpc_response_status"] 331 332 if !protocolOk || !codeOk { 333 log.Warningf("Skipping %s, missing expected HTTP/GRPC TS labels", m.String()) 334 continue 335 } 336 337 protocol = string(lProtocol) 338 if skipRequestsGrpc && protocol == graph.GRPC.Name || skipRequestsHttp && protocol == graph.HTTP.Name { 339 continue 340 } 341 342 // set response code in a backward compatible way 343 code = util.HandleResponseCode(protocol, string(lCode), grpcOk, string(lGrpc)) 344 } 345 346 // make code more readable by setting "host" because "destSvc" holds destination.service.host | request.host | "unknown" 347 host := destSvc 348 349 // don't inject a service node if any of: 350 // - destSvcName is not set 351 // - destSvcName is PassthroughCluster (see https://github.com/kiali/kiali/issues/4488) 352 // - dest node is already a service node 353 inject := false 354 if o.InjectServiceNodes && graph.IsOK(destSvcName) && destSvcName != graph.PassthroughCluster { 355 _, destNodeType, err := graph.Id(destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer, o.GraphType) 356 if err != nil { 357 
log.Warningf("Skipping %s, %s", m.String(), err) 358 continue 359 } 360 inject = (graph.NodeTypeService != destNodeType) 361 } 362 addTraffic(trafficMap, metric, inject, val, protocol, code, flags, host, sourceCluster, sourceWlNs, "", sourceWl, sourceApp, sourceVer, destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer, o) 363 } 364 } 365 366 func addTraffic(trafficMap graph.TrafficMap, metric string, inject bool, val float64, protocol, code, flags, host, sourceCluster, sourceNs, sourceSvc, sourceWl, sourceApp, sourceVer, destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer string, o graph.TelemetryOptions) { 367 source, _, err := addNode(trafficMap, sourceCluster, sourceNs, sourceSvc, sourceNs, sourceWl, sourceApp, sourceVer, o) 368 if err != nil { 369 log.Warningf("Skipping addTraffic (source), %s", err) 370 return 371 } 372 dest, _, err := addNode(trafficMap, destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer, o) 373 if err != nil { 374 log.Warningf("Skipping addTraffic (dest), %s", err) 375 return 376 } 377 378 // Istio can generate duplicate metrics by reporting from both the source and destination proxies. To avoid 379 // processing the same information twice we keep track of the time series applied to a particular edge. The 380 // edgeTSHash incorporates information about the time series' source, destination and metric information, 381 // and uses that unique TS has to protect against applying the same intomation twice. 
382 edgeTSHash := fmt.Sprintf("%x", md5.Sum([]byte(fmt.Sprintf("%s:%s:%s:%s:%s:%s", metric, source.Metadata[tsHash], dest.Metadata[tsHash], code, flags, host)))) 383 384 if inject { 385 injectedService, _, err := addNode(trafficMap, destCluster, destSvcNs, destSvcName, "", "", "", "", o) 386 if err != nil { 387 log.Warningf("Skipping addTraffic (inject), %s", err) 388 return 389 } 390 injectedService.Metadata[graph.IsInjected] = true 391 if addEdgeTraffic(trafficMap, val, protocol, code, flags, host, source, injectedService, edgeTSHash, o) { 392 addToDestServices(injectedService.Metadata, destCluster, destSvcNs, destSvcName) 393 394 addEdgeTraffic(trafficMap, val, protocol, code, flags, host, injectedService, dest, edgeTSHash, o) 395 addToDestServices(dest.Metadata, destCluster, destSvcNs, destSvcName) 396 } 397 } else { 398 if addEdgeTraffic(trafficMap, val, protocol, code, flags, host, source, dest, edgeTSHash, o) { 399 addToDestServices(dest.Metadata, destCluster, destSvcNs, destSvcName) 400 } 401 } 402 } 403 404 // addEdgeTraffic uses edgeTSHash that the metric information has not been applied to the edge. Returns true 405 // if the the metric information is applied, false if it determined to be a duplicate. 
406 func addEdgeTraffic(trafficMap graph.TrafficMap, val float64, protocol, code, flags, host string, source, dest *graph.Node, edgeTSHash string, o graph.TelemetryOptions) bool { 407 var edge *graph.Edge 408 for _, e := range source.Edges { 409 if dest.ID == e.Dest.ID && e.Metadata[graph.ProtocolKey] == protocol { 410 edge = e 411 break 412 } 413 } 414 if nil == edge { 415 edge = source.AddEdge(dest) 416 edge.Metadata[graph.ProtocolKey] = protocol 417 edge.Metadata[tsHashMap] = make(map[string]bool) 418 } 419 420 if _, ok := edge.Metadata[tsHashMap].(map[string]bool)[edgeTSHash]; !ok { 421 edge.Metadata[tsHashMap].(map[string]bool)[edgeTSHash] = true 422 graph.AddToMetadata(protocol, val, code, flags, host, source.Metadata, dest.Metadata, edge.Metadata) 423 return true 424 } 425 426 return false 427 } 428 429 func addToDestServices(md graph.Metadata, cluster, namespace, service string) { 430 if !graph.IsOK(service) { 431 return 432 } 433 destServices, ok := md[graph.DestServices] 434 if !ok { 435 destServices = graph.NewDestServicesMetadata() 436 md[graph.DestServices] = destServices 437 } 438 destService := graph.ServiceName{Cluster: cluster, Namespace: namespace, Name: service} 439 destServices.(graph.DestServicesMetadata)[destService.Key()] = destService 440 } 441 442 func addNode(trafficMap graph.TrafficMap, cluster, serviceNs, service, workloadNs, workload, app, version string, o graph.TelemetryOptions) (*graph.Node, bool, error) { 443 id, nodeType, err := graph.Id(cluster, serviceNs, service, workloadNs, workload, app, version, o.GraphType) 444 if err != nil { 445 return nil, false, err 446 } 447 node, found := trafficMap[id] 448 if !found { 449 namespace := workloadNs 450 if !graph.IsOK(namespace) { 451 namespace = serviceNs 452 } 453 newNode := graph.NewNodeExplicit(id, cluster, namespace, workload, app, version, service, nodeType, o.GraphType) 454 node = newNode 455 trafficMap[id] = node 456 } 457 node.Metadata["tsHash"] = timeSeriesHash(cluster, 
// timeSeriesHash returns the lower-case hexadecimal MD5 digest of the seven node attributes
// joined with ":" in the order cluster, serviceNs, service, workloadNs, workload, app, version.
func timeSeriesHash(cluster, serviceNs, service, workloadNs, workload, app, version string) string {
	key := strings.Join([]string{cluster, serviceNs, service, workloadNs, workload, app, version}, ":")
	digest := md5.Sum([]byte(key))
	return fmt.Sprintf("%x", digest)
}
498 499 return trafficMap, nil 500 } 501 502 // buildNodeTrafficMap returns a map of all nodes requesting or requested by the target node (key=id). Node graphs 503 // are from the perspective of the node, as such we use destination telemetry for incoming traffic and source telemetry 504 // for outgoing traffic. 505 func buildNodeTrafficMap(cluster, namespace string, n *graph.Node, o graph.TelemetryOptions, client *prometheus.Client) graph.TrafficMap { 506 // create map to aggregate traffic by protocol and response code 507 trafficMap := graph.NewTrafficMap() 508 duration := o.Namespaces[namespace].Duration 509 idleCondition := "> 0" 510 if o.IncludeIdleEdges { 511 idleCondition = "" 512 } 513 514 // only narrow by cluster if it is set on the target node 515 var sourceCluster, destCluster string 516 if cluster != graph.Unknown { 517 sourceCluster = fmt.Sprintf(`,source_cluster="%s"`, cluster) 518 destCluster = fmt.Sprintf(`,destination_cluster="%s"`, cluster) 519 } 520 521 // HTTP/GRPC Traffic 522 if o.Rates.Http == graph.RateRequests || o.Rates.Grpc == graph.RateRequests { 523 metric := "istio_requests_total" 524 groupBy := "source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision,request_protocol,response_code,grpc_response_status,response_flags" 525 526 // query prometheus for request traffic in two queries: 527 // 1) query for incoming traffic 528 var query string 529 switch n.NodeType { 530 case graph.NodeTypeWorkload: 531 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_workload_namespace="%s",destination_workload="%s"} [%vs])) by (%s) %s`, 532 metric, 533 destCluster, 534 namespace, 535 n.Workload, 536 int(duration.Seconds()), // range duration for the query 537 groupBy, 538 
idleCondition) 539 case graph.NodeTypeApp: 540 if graph.IsOK(n.Version) { 541 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_service_namespace="%s",destination_canonical_service="%s",destination_canonical_revision="%s"} [%vs])) by (%s) %s`, 542 metric, 543 destCluster, 544 namespace, 545 n.App, 546 n.Version, 547 int(duration.Seconds()), // range duration for the query 548 groupBy, 549 idleCondition) 550 } else { 551 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_service_namespace="%s",destination_canonical_service="%s"} [%vs])) by (%s) %s`, 552 metric, 553 destCluster, 554 namespace, 555 n.App, 556 int(duration.Seconds()), // range duration for the query 557 groupBy, 558 idleCondition) 559 } 560 case graph.NodeTypeService: 561 // Service nodes require two queries for incoming 562 // 1.a) query source telemetry for requests to the service that could not be serviced 563 query = fmt.Sprintf(`sum(rate(%s{reporter="source"%s,destination_workload="unknown",destination_service=~"^%s\\.%s\\..*$"} [%vs])) by (%s) %s`, 564 metric, 565 destCluster, 566 n.Service, 567 namespace, 568 int(duration.Seconds()), // range duration for the query 569 groupBy, 570 idleCondition) 571 vector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 572 populateTrafficMap(trafficMap, &vector, metric, o) 573 574 // 1.b) query dest telemetry for requests to the service, serviced by service workloads 575 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_service_namespace="%s",destination_service=~"^%s\\.%s\\..*$"} [%vs])) by (%s) %s`, 576 metric, 577 destCluster, 578 namespace, 579 n.Service, 580 namespace, 581 int(duration.Seconds()), // range duration for the query 582 groupBy, 583 idleCondition) 584 default: 585 graph.Error(fmt.Sprintf("NodeType [%s] not supported", n.NodeType)) 586 } 587 inVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 588 populateTrafficMap(trafficMap, &inVector, metric, o) 589 
590 // 2) query for outbound traffic 591 switch n.NodeType { 592 case graph.NodeTypeWorkload: 593 query = fmt.Sprintf(`sum(rate(%s{reporter="source"%s,source_workload_namespace="%s",source_workload="%s"} [%vs])) by (%s) %s`, 594 metric, 595 sourceCluster, 596 namespace, 597 n.Workload, 598 int(duration.Seconds()), // range duration for the query 599 groupBy, 600 idleCondition) 601 case graph.NodeTypeApp: 602 if graph.IsOK(n.Version) { 603 query = fmt.Sprintf(`sum(rate(%s{reporter="source"%s,source_workload_namespace="%s",source_canonical_service="%s",source_canonical_revision="%s"} [%vs])) by (%s) %s`, 604 metric, 605 sourceCluster, 606 namespace, 607 n.App, 608 n.Version, 609 int(duration.Seconds()), // range duration for the query 610 groupBy, 611 idleCondition) 612 } else { 613 query = fmt.Sprintf(`sum(rate(%s{reporter="source"%s,source_workload_namespace="%s",source_canonical_service="%s"} [%vs])) by (%s) %s`, 614 metric, 615 sourceCluster, 616 namespace, 617 n.App, 618 int(duration.Seconds()), // range duration for the query 619 groupBy, 620 idleCondition) 621 } 622 case graph.NodeTypeService: 623 query = "" 624 default: 625 graph.Error(fmt.Sprintf("NodeType [%s] not supported", n.NodeType)) 626 } 627 outVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 628 populateTrafficMap(trafficMap, &outVector, metric, o) 629 } 630 631 // gRPC message traffic 632 if o.Rates.Grpc != graph.RateNone && o.Rates.Grpc != graph.RateRequests { 633 var metrics []string 634 groupBy := "source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision" 635 636 switch o.Rates.Grpc { 637 case graph.RateReceived: 638 metrics = []string{"istio_response_messages_total"} 639 case graph.RateSent: 640 metrics = 
[]string{"istio_request_messages_total"} 641 case graph.RateTotal: 642 metrics = []string{"istio_request_messages_total", "istio_response_messages_total"} 643 default: 644 metrics = []string{} 645 } 646 647 for _, metric := range metrics { 648 var query string 649 650 switch n.NodeType { 651 case graph.NodeTypeWorkload: 652 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_workload_namespace="%s",destination_workload="%s"} [%vs])) by (%s) %s`, 653 metric, 654 destCluster, 655 namespace, 656 n.Workload, 657 int(duration.Seconds()), // range duration for the query 658 groupBy, 659 idleCondition) 660 case graph.NodeTypeApp: 661 if graph.IsOK(n.Version) { 662 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_service_namespace="%s",destination_canonical_service="%s",destination_canonical_revision="%s"} [%vs])) by (%s) %s`, 663 metric, 664 destCluster, 665 namespace, 666 n.App, 667 n.Version, 668 int(duration.Seconds()), // range duration for the query 669 groupBy, 670 idleCondition) 671 } else { 672 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_service_namespace="%s",destination_canonical_service="%s"} [%vs])) by (%s) %s`, 673 metric, 674 destCluster, 675 namespace, 676 n.App, 677 int(duration.Seconds()), // range duration for the query 678 groupBy, 679 idleCondition) 680 } 681 case graph.NodeTypeService: 682 // TODO: Do we need to handle requests from unknown in a special way (like in HTTP above)? Not sure how gRPC-messages is reported from unknown. 
683 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_service_namespace="%s",destination_service=~"^%s\\.%s\\..*$"} [%vs])) by (%s) %s`, 684 metric, 685 destCluster, 686 namespace, 687 n.Service, 688 namespace, 689 int(duration.Seconds()), // range duration for the query 690 groupBy, 691 idleCondition) 692 default: 693 graph.Error(fmt.Sprintf("NodeType [%s] not supported", n.NodeType)) 694 } 695 incomingVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 696 populateTrafficMap(trafficMap, &incomingVector, metric, o) 697 698 // 2) query for outbound traffic 699 switch n.NodeType { 700 case graph.NodeTypeWorkload: 701 query = fmt.Sprintf(`sum(rate(%s{reporter="source"%s,source_workload_namespace="%s",source_workload="%s"} [%vs])) by (%s) %s`, 702 metric, 703 sourceCluster, 704 namespace, 705 n.Workload, 706 int(duration.Seconds()), // range duration for the query 707 groupBy, 708 idleCondition) 709 case graph.NodeTypeApp: 710 if graph.IsOK(n.Version) { 711 query = fmt.Sprintf(`sum(rate(%s{reporter="source"%s,source_workload_namespace="%s",source_canonical_service="%s",source_canonical_revision="%s"} [%vs])) by (%s) %s`, 712 metric, 713 sourceCluster, 714 namespace, 715 n.App, 716 n.Version, 717 int(duration.Seconds()), // range duration for the query 718 groupBy, 719 idleCondition) 720 } else { 721 query = fmt.Sprintf(`sum(rate(%s{reporter="source"%s,source_workload_namespace="%s",source_canonical_service="%s"} [%vs])) by (%s) %s`, 722 metric, 723 sourceCluster, 724 namespace, 725 n.App, 726 int(duration.Seconds()), // range duration for the query 727 groupBy, 728 idleCondition) 729 } 730 case graph.NodeTypeService: 731 query = "" 732 default: 733 graph.Error(fmt.Sprintf("NodeType [%s] not supported", n.NodeType)) 734 } 735 outgoingVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 736 populateTrafficMap(trafficMap, &outgoingVector, metric, o) 737 } 738 } 739 740 // TCP byte traffic 741 if o.Rates.Tcp != 
graph.RateNone { 742 var metrics []string 743 groupBy := "source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision,response_flags" 744 745 // L4 telemetry is backwards, see https://github.com/istio/istio/issues/32399 746 switch o.Rates.Tcp { 747 case graph.RateReceived: 748 metrics = []string{"istio_tcp_sent_bytes_total"} 749 case graph.RateSent: 750 metrics = []string{"istio_tcp_received_bytes_total"} 751 case graph.RateTotal: 752 metrics = []string{"istio_tcp_received_bytes_total", "istio_tcp_sent_bytes_total"} 753 default: 754 metrics = []string{} 755 } 756 757 for _, metric := range metrics { 758 var query string 759 760 switch n.NodeType { 761 case graph.NodeTypeWorkload: 762 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_workload_namespace="%s",destination_workload="%s"} [%vs])) by (%s) %s`, 763 metric, 764 destCluster, 765 namespace, 766 n.Workload, 767 int(duration.Seconds()), // range duration for the query 768 groupBy, 769 idleCondition) 770 case graph.NodeTypeApp: 771 if graph.IsOK(n.Version) { 772 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_service_namespace="%s",destination_canonical_service="%s",destination_canonical_revision="%s"} [%vs])) by (%s) %s`, 773 metric, 774 destCluster, 775 namespace, 776 n.App, 777 n.Version, 778 int(duration.Seconds()), // range duration for the query 779 groupBy, 780 idleCondition) 781 } else { 782 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_service_namespace="%s",destination_canonical_service="%s"} [%vs])) by (%s) %s`, 783 metric, 784 destCluster, 785 namespace, 786 n.App, 787 int(duration.Seconds()), // range duration for the query 788 groupBy, 789 idleCondition) 790 } 791 case 
graph.NodeTypeService: 792 // TODO: Do we need to handle requests from unknown in a special way (like in HTTP above)? Not sure how tcp is reported from unknown. 793 query = fmt.Sprintf(`sum(rate(%s{reporter="destination"%s,destination_service_namespace="%s",destination_service=~"^%s\\.%s\\..*$"} [%vs])) by (%s) %s`, 794 metric, 795 destCluster, 796 namespace, 797 n.Service, 798 namespace, 799 int(duration.Seconds()), // range duration for the query 800 groupBy, 801 idleCondition) 802 default: 803 graph.Error(fmt.Sprintf("NodeType [%s] not supported", n.NodeType)) 804 } 805 incomingVector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 806 populateTrafficMap(trafficMap, &incomingVector, metric, o) 807 808 // 2) query for outbound traffic 809 switch n.NodeType { 810 case graph.NodeTypeWorkload: 811 query = fmt.Sprintf(`sum(rate(%s{reporter="source"%s,source_workload_namespace="%s",source_workload="%s"} [%vs])) by (%s) %s`, 812 metric, 813 sourceCluster, 814 namespace, 815 n.Workload, 816 int(duration.Seconds()), // range duration for the query 817 groupBy, 818 idleCondition) 819 case graph.NodeTypeApp: 820 if graph.IsOK(n.Version) { 821 query = fmt.Sprintf(`sum(rate(%s{reporter="source"%s,source_workload_namespace="%s",source_canonical_service="%s",source_canonical_revision="%s"} [%vs])) by (%s) %s`, 822 metric, 823 sourceCluster, 824 namespace, 825 n.App, 826 n.Version, 827 int(duration.Seconds()), // range duration for the query 828 groupBy, 829 idleCondition) 830 } else { 831 query = fmt.Sprintf(`sum(rate(%s{reporter="source"%s,source_workload_namespace="%s",source_canonical_service="%s"} [%vs])) by (%s) %s`, 832 metric, 833 sourceCluster, 834 namespace, 835 n.App, 836 int(duration.Seconds()), // range duration for the query 837 groupBy, 838 idleCondition) 839 } 840 case graph.NodeTypeService: 841 query = "" 842 default: 843 graph.Error(fmt.Sprintf("NodeType [%s] not supported", n.NodeType)) 844 } 845 outgoingVector := promQuery(query, 
time.Unix(o.QueryTime, 0), client.API())
			populateTrafficMap(trafficMap, &outgoingVector, metric, o)
		}
	}

	return trafficMap
}

// handleAggregateNodeTrafficMap produces the traffic map for an aggregate-node graph. It creates the
// aggregate node described by o.NodeOptions, builds that node's traffic map, runs every parsed
// appender against it (observing an internalmetrics timer around each one), then runs the
// finalizers, and returns the resulting map.
func handleAggregateNodeTrafficMap(o graph.TelemetryOptions, client *prometheus.Client, globalInfo *graph.AppenderGlobalInfo) graph.TrafficMap {
	targetNamespace := o.NodeOptions.Namespace

	aggregateNode := graph.NewAggregateNode(
		o.NodeOptions.Cluster,
		targetNamespace,
		o.NodeOptions.Aggregate,
		o.NodeOptions.AggregateValue,
		o.NodeOptions.Service,
		o.NodeOptions.App,
	)
	log.Tracef("Build graph for aggregate node [%+v]", aggregateNode)

	// When the request did not ask for all appenders, add the aggregate-node appender by name
	// before the appender list is parsed.
	if !o.Appenders.All {
		o.Appenders.AppenderNames = append(o.Appenders.AppenderNames, appender.AggregateNodeAppenderName)
	}
	requestedAppenders, requestedFinalizers := appender.ParseAppenders(o)

	result := buildAggregateNodeTrafficMap(targetNamespace, aggregateNode, o, client)
	nsInfo := graph.NewAppenderNamespaceInfo(targetNamespace)

	// Appenders receive the namespace info; each run is bracketed by its own timer.
	for _, current := range requestedAppenders {
		timer := internalmetrics.GetGraphAppenderTimePrometheusTimer(current.Name())
		current.AppendGraph(result, globalInfo, nsInfo)
		timer.ObserveDuration()
	}

	// Finalizers get the last word on the complete graph and are given no namespace info.
	for _, current := range requestedFinalizers {
		current.AppendGraph(result, globalInfo, nil)
	}

	return result
}

// buildAggregateNodeTrafficMap returns a map of all incoming and outgoing traffic from the perspective of the aggregate. Aggregates
// are always generated for serviced requests and therefore via destination telemetry.
882 func buildAggregateNodeTrafficMap(namespace string, n graph.Node, o graph.TelemetryOptions, client *prometheus.Client) graph.TrafficMap { 883 interval := o.Namespaces[namespace].Duration 884 885 // create map to aggregate traffic by response code 886 trafficMap := graph.NewTrafficMap() 887 888 // It takes only one prometheus query to get everything involving the target operation 889 serviceFragment := "" 890 if n.Service != "" { 891 serviceFragment = fmt.Sprintf(`,destination_service_name="%s"`, n.Service) 892 } 893 metric := "istio_requests_total" 894 groupBy := "source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision,request_protocol,response_code,grpc_response_status,response_flags" 895 httpQuery := fmt.Sprintf(`sum(rate(%s{reporter="destination",destination_service_namespace="%s",%s="%s"%s}[%vs])) by (%s) > 0`, 896 metric, 897 namespace, 898 n.Metadata[graph.Aggregate], 899 n.Metadata[graph.AggregateValue], 900 serviceFragment, 901 int(interval.Seconds()), // range duration for the query 902 groupBy) 903 /* It's not clear that request classification makes sense for TCP metrics. Because it costs us queries I'm 904 removing the support for now, we can add it back if someone presents a valid use case. 
(same for gRCP message metrics) 905 tcpQuery := fmt.Sprintf(`sum(rate(%s{reporter="destination",destination_service_namespace="%s",%s="%s"}[%vs])) by (%s) > 0`, 906 "istio_tcp_sent_bytes_total", 907 namespace, 908 n.Metadata[graph.Aggregate], 909 n.Metadata[graph.AggregateValue], 910 int(interval.Seconds()), // range duration for the query 911 groupBy) 912 query := fmt.Sprintf(`(%s) OR (%s)`, httpQuery, tcpQuery) 913 */ 914 query := httpQuery 915 vector := promQuery(query, time.Unix(o.QueryTime, 0), client.API()) 916 populateTrafficMap(trafficMap, &vector, metric, o) 917 918 return trafficMap 919 } 920 921 // TODO: Can this be combined with graph.telemetry.istio.appender.promQuery? 922 func promQuery(query string, queryTime time.Time, api prom_v1.API) model.Vector { 923 if query == "" { 924 return model.Vector{} 925 } 926 927 ctx, cancel := context.WithCancel(context.Background()) 928 defer cancel() 929 930 // add scope if necessary 931 query = util.AddQueryScope(query) 932 933 // wrap with a round() to be in line with metrics api 934 query = fmt.Sprintf("round(%s,0.001)", query) 935 log.Tracef("Graph query:\n%s@time=%v (now=%v, %v)\n", query, queryTime.Format(graph.TF), time.Now().Format(graph.TF), queryTime.Unix()) 936 937 promtimer := internalmetrics.GetPrometheusProcessingTimePrometheusTimer("Graph-Generation") 938 value, warnings, err := api.Query(ctx, query, queryTime) 939 940 if len(warnings) > 0 { 941 log.Warningf("promQuery. Prometheus Warnings: [%s]", strings.Join(warnings, ",")) 942 } 943 graph.CheckUnavailable(err) 944 promtimer.ObserveDuration() // notice we only collect metrics for successful prom queries 945 946 switch t := value.Type(); t { 947 case model.ValVector: // Instant Vector 948 return value.(model.Vector) 949 default: 950 graph.Error(fmt.Sprintf("No handling for type %v!\n", t)) 951 } 952 953 return nil 954 }