github.com/kiali/kiali@v1.84.0/graph/telemetry/istio/appender/response_time.go (about) 1 package appender 2 3 import ( 4 "fmt" 5 "math" 6 "time" 7 8 "github.com/prometheus/common/model" 9 10 "github.com/kiali/kiali/graph" 11 "github.com/kiali/kiali/graph/telemetry/istio/util" 12 "github.com/kiali/kiali/log" 13 "github.com/kiali/kiali/prometheus" 14 ) 15 16 const ( 17 // ResponseTimeAppenderName uniquely identifies the appender: responseTime 18 ResponseTimeAppenderName = "responseTime" 19 ) 20 21 // ResponseTimeAppender is responsible for adding responseTime information to the graph. ResponseTime 22 // is represented as a percentile value. The default is 95th percentile, which means that 23 // 95% of requests executed in no more than the resulting milliseconds. ResponseTime values are 24 // reported in milliseconds. 25 // Response Times are reported using destination proxy telemetry, when available, which should remove 26 // network latency fluctuations. 27 // TODO: Should we report both source and destination when possible (with and without latency)? 
28 // Name: responseTime 29 type ResponseTimeAppender struct { 30 GraphType string 31 InjectServiceNodes bool 32 Namespaces graph.NamespaceInfoMap 33 Quantile float64 34 QueryTime int64 // unix time in seconds 35 Rates graph.RequestedRates 36 } 37 38 // Name implements Appender 39 func (a ResponseTimeAppender) Name() string { 40 return ResponseTimeAppenderName 41 } 42 43 // IsFinalizer implements Appender 44 func (a ResponseTimeAppender) IsFinalizer() bool { 45 return false 46 } 47 48 // AppendGraph implements Appender 49 func (a ResponseTimeAppender) AppendGraph(trafficMap graph.TrafficMap, globalInfo *graph.AppenderGlobalInfo, namespaceInfo *graph.AppenderNamespaceInfo) { 50 if len(trafficMap) == 0 { 51 return 52 } 53 54 // Response times only apply to request traffic (not TCP or gRPC-message traffic) 55 if a.Rates.Grpc != graph.RateRequests && a.Rates.Http != graph.RateRequests { 56 return 57 } 58 59 if globalInfo.PromClient == nil { 60 var err error 61 globalInfo.PromClient, err = prometheus.NewClient() 62 graph.CheckError(err) 63 } 64 65 a.appendGraph(trafficMap, namespaceInfo.Namespace, globalInfo.PromClient) 66 } 67 68 func (a ResponseTimeAppender) appendGraph(trafficMap graph.TrafficMap, namespace string, client *prometheus.Client) { 69 // create map to quickly look up responseTime 70 responseTimeMap := make(map[string]float64) 71 duration := a.Namespaces[namespace].Duration 72 73 quantile := a.Quantile 74 if a.Quantile == 0.0 { 75 log.Tracef("Generating average responseTime; namespace = %v", namespace) 76 77 // query prometheus for the responseTime info in two queries: 78 groupBy := "source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision,request_protocol" 79 80 // 1) Incoming: query destination 
telemetry to capture namespace services' incoming traffic 81 // note - the query order is important as both queries may have overlapping results for edges within 82 // the namespace. This query uses destination proxy and so must come first. 83 query := fmt.Sprintf(`sum(rate(%s{reporter="destination",destination_service_namespace="%s"}[%vs])) by (%s) / sum(rate(%s{reporter="destination",destination_service_namespace="%s"}[%vs])) by (%s) > 0`, 84 "istio_request_duration_milliseconds_sum", 85 namespace, 86 int(duration.Seconds()), // range duration for the query 87 groupBy, 88 "istio_request_duration_milliseconds_count", 89 namespace, 90 int(duration.Seconds()), // range duration for the query 91 groupBy) 92 incomingVector := promQuery(query, time.Unix(a.QueryTime, 0), client.GetContext(), client.API(), a) 93 a.populateResponseTimeMap(responseTimeMap, &incomingVector) 94 95 // 2) Outgoing: query source telemetry to capture namespace workloads' outgoing traffic 96 query = fmt.Sprintf(`sum(rate(%s{reporter="source",source_workload_namespace="%s"}[%vs])) by (%s) / sum(rate(%s{reporter="source",source_workload_namespace="%s"}[%vs])) by (%s) > 0`, 97 "istio_request_duration_milliseconds_sum", 98 namespace, 99 int(duration.Seconds()), // range duration for the query 100 groupBy, 101 "istio_request_duration_milliseconds_count", 102 namespace, 103 int(duration.Seconds()), // range duration for the query 104 groupBy) 105 outgoingVector := promQuery(query, time.Unix(a.QueryTime, 0), client.GetContext(), client.API(), a) 106 a.populateResponseTimeMap(responseTimeMap, &outgoingVector) 107 108 } else { 109 log.Tracef("Generating responseTime for quantile [%.2f]; namespace = %v", quantile, namespace) 110 111 // query prometheus for the responseTime info in two queries: 112 groupBy := 
"le,source_cluster,source_workload_namespace,source_workload,source_canonical_service,source_canonical_revision,destination_cluster,destination_service_namespace,destination_service,destination_service_name,destination_workload_namespace,destination_workload,destination_canonical_service,destination_canonical_revision,request_protocol" 113 114 // 1) Incoming: query destination telemetry to capture namespace services' incoming traffic 115 // note - the query order is important as both queries may have overlapping results for edges within 116 // the namespace. This query uses destination proxy and so must come first. 117 query := fmt.Sprintf(`histogram_quantile(%.2f, sum(rate(%s{reporter="destination",destination_service_namespace="%s"}[%vs])) by (%s)) > 0`, 118 quantile, 119 "istio_request_duration_milliseconds_bucket", 120 namespace, 121 int(duration.Seconds()), // range duration for the query 122 groupBy) 123 incomingVector := promQuery(query, time.Unix(a.QueryTime, 0), client.GetContext(), client.API(), a) 124 a.populateResponseTimeMap(responseTimeMap, &incomingVector) 125 126 // 2) Outgoing: query source telemetry to capture namespace workloads' outgoing traffic 127 query = fmt.Sprintf(`histogram_quantile(%.2f, sum(rate(%s{reporter="source",source_workload_namespace="%s"}[%vs])) by (%s)) > 0`, 128 quantile, 129 "istio_request_duration_milliseconds_bucket", 130 namespace, 131 int(duration.Seconds()), // range duration for the query 132 groupBy) 133 outgoingVector := promQuery(query, time.Unix(a.QueryTime, 0), client.GetContext(), client.API(), a) 134 a.populateResponseTimeMap(responseTimeMap, &outgoingVector) 135 } 136 137 applyResponseTime(trafficMap, responseTimeMap) 138 } 139 140 func applyResponseTime(trafficMap graph.TrafficMap, responseTimeMap map[string]float64) { 141 for _, n := range trafficMap { 142 for _, e := range n.Edges { 143 key := fmt.Sprintf("%s %s %s", e.Source.ID, e.Dest.ID, e.Metadata[graph.ProtocolKey].(string)) 144 if val, ok := 
responseTimeMap[key]; ok { 145 e.Metadata[graph.ResponseTime] = val 146 } 147 } 148 } 149 } 150 151 func (a ResponseTimeAppender) populateResponseTimeMap(responseTimeMap map[string]float64, vector *model.Vector) { 152 skipRequestsGrpc := a.Rates.Grpc != graph.RateRequests 153 skipRequestsHttp := a.Rates.Http != graph.RateRequests 154 155 for _, s := range *vector { 156 m := s.Metric 157 lSourceCluster, sourceClusterOk := m["source_cluster"] 158 lSourceWlNs, sourceWlNsOk := m["source_workload_namespace"] 159 lSourceWl, sourceWlOk := m["source_workload"] 160 lSourceApp, sourceAppOk := m["source_canonical_service"] 161 lSourceVer, sourceVerOk := m["source_canonical_revision"] 162 lDestCluster, destClusterOk := m["destination_cluster"] 163 lDestSvcNs, destSvcNsOk := m["destination_service_namespace"] 164 lDestSvc, destSvcOk := m["destination_service"] 165 lDestSvcName, destSvcNameOk := m["destination_service_name"] 166 lDestWlNs, destWlNsOk := m["destination_workload_namespace"] 167 lDestWl, destWlOk := m["destination_workload"] 168 lDestApp, destAppOk := m["destination_canonical_service"] 169 lDestVer, destVerOk := m["destination_canonical_revision"] 170 lProtocol, protocolOk := m["request_protocol"] 171 172 if !sourceWlNsOk || !sourceWlOk || !sourceAppOk || !sourceVerOk || !destSvcNsOk || !destSvcNameOk || !destSvcOk || !destWlNsOk || !destWlOk || !destAppOk || !destVerOk || !protocolOk { 173 log.Warningf("populateResponseTimeMap: Skipping %s, missing expected labels", m.String()) 174 continue 175 } 176 177 sourceWlNs := string(lSourceWlNs) 178 sourceWl := string(lSourceWl) 179 sourceApp := string(lSourceApp) 180 sourceVer := string(lSourceVer) 181 destSvc := string(lDestSvc) 182 protocol := string(lProtocol) 183 184 if (skipRequestsHttp && protocol == graph.HTTP.Name) || (skipRequestsGrpc && protocol == graph.GRPC.Name) { 185 continue 186 } 187 188 // handle clusters 189 sourceCluster, destCluster := util.HandleClusters(lSourceCluster, sourceClusterOk, lDestCluster, 
destClusterOk) 190 191 if util.IsBadSourceTelemetry(sourceCluster, sourceClusterOk, sourceWlNs, sourceWl, sourceApp) { 192 continue 193 } 194 195 val := float64(s.Value) 196 197 // handle unusual destinations 198 destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer, _ := util.HandleDestination(sourceCluster, sourceWlNs, sourceWl, destCluster, string(lDestSvcNs), string(lDestSvc), string(lDestSvcName), string(lDestWlNs), string(lDestWl), string(lDestApp), string(lDestVer)) 199 200 if util.IsBadDestTelemetry(destCluster, destClusterOk, destSvcNs, destSvc, destSvcName, destWl) { 201 continue 202 } 203 204 // Should not happen but if NaN for any reason, Just skip it 205 if math.IsNaN(val) { 206 continue 207 } 208 209 // don't inject a service node if any of: 210 // - destSvcName is not set 211 // - destSvcName is PassthroughCluster (see https://github.com/kiali/kiali/issues/4488) 212 // - dest node is already a service node 213 inject := false 214 if a.InjectServiceNodes && graph.IsOK(destSvcName) && destSvcName != graph.PassthroughCluster { 215 _, destNodeType, err := graph.Id(destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer, a.GraphType) 216 if err != nil { 217 log.Warningf("Skipping (rt) %s, %s", m.String(), err) 218 continue 219 } 220 inject = (graph.NodeTypeService != destNodeType) 221 } 222 223 if inject { 224 // Only set response time on the outgoing edge. 
On the incoming edge, we can't validly aggregate response times of the outgoing edges (kiali-2297) 225 a.addResponseTime(responseTimeMap, val, protocol, destCluster, destSvcNs, destSvcName, "", "", "", destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer) 226 } else { 227 a.addResponseTime(responseTimeMap, val, protocol, sourceCluster, sourceWlNs, "", sourceWl, sourceApp, sourceVer, destCluster, destSvcNs, destSvcName, destWlNs, destWl, destApp, destVer) 228 } 229 } 230 } 231 232 func (a ResponseTimeAppender) addResponseTime(responseTimeMap map[string]float64, val float64, protocol, sourceCluster, sourceNs, sourceSvc, sourceWl, sourceApp, sourceVer, destCluster, destSvcNs, destSvc, destWlNs, destWl, destApp, destVer string) { 233 sourceID, _, err := graph.Id(sourceCluster, sourceNs, sourceSvc, sourceNs, sourceWl, sourceApp, sourceVer, a.GraphType) 234 if err != nil { 235 log.Warningf("Skipping addResponseTime (source), %s", err) 236 return 237 } 238 destID, _, err := graph.Id(destCluster, destSvcNs, destSvc, destWlNs, destWl, destApp, destVer, a.GraphType) 239 if err != nil { 240 log.Warningf("Skipping addResponseTime (dest), %s", err) 241 return 242 } 243 244 key := fmt.Sprintf("%s %s %s", sourceID, destID, protocol) 245 246 // For edges within the namespace we may get a responseTime reported from both the incoming and outgoing 247 // traffic queries. We assume here the first reported value is preferred (i.e. defer to query order) 248 if _, found := responseTimeMap[key]; !found { 249 responseTimeMap[key] = val 250 } 251 }