github.com/kiali/kiali@v1.84.0/business/tracing.go (about) 1 package business 2 3 import ( 4 "context" 5 "fmt" 6 "strings" 7 "sync" 8 "time" 9 10 "github.com/kiali/kiali/config" 11 "github.com/kiali/kiali/log" 12 "github.com/kiali/kiali/models" 13 "github.com/kiali/kiali/observability" 14 "github.com/kiali/kiali/tracing" 15 "github.com/kiali/kiali/tracing/jaeger/model" 16 jaegerModels "github.com/kiali/kiali/tracing/jaeger/model/json" 17 ) 18 19 type ( 20 SpanFilter = func(span *jaegerModels.Span) bool 21 ) 22 23 type TracingService struct { 24 conf *config.Config 25 svc *SvcService 26 tracing tracing.ClientInterface 27 workload *WorkloadService 28 } 29 30 func NewTracingService(conf *config.Config, tracing tracing.ClientInterface, svcService *SvcService, workloadService *WorkloadService) TracingService { 31 return TracingService{ 32 conf: conf, 33 svc: svcService, 34 tracing: tracing, 35 workload: workloadService, 36 } 37 } 38 39 func (in *TracingService) client() (tracing.ClientInterface, error) { 40 if !in.conf.ExternalServices.Tracing.Enabled { 41 return nil, fmt.Errorf("Tracing is not enabled") 42 } 43 44 if in.tracing == nil { 45 return nil, fmt.Errorf("Tracing client is not initialized") 46 } 47 48 return in.tracing, nil 49 } 50 51 func (in *TracingService) getFilteredSpans(ns, app string, query models.TracingQuery, filter SpanFilter) ([]model.TracingSpan, error) { 52 // This is info needed for Tempo as it is not in the results by default 53 if in.conf.ExternalServices.Tracing.Provider == config.TempoProvider { 54 query.Tags["http.method"] = ".*" 55 } 56 r, err := in.GetAppTraces(ns, app, query) 57 if err != nil { 58 return []model.TracingSpan{}, err 59 } 60 spans := tracesToSpans(app, r, filter, in.conf) 61 return spans, nil 62 } 63 64 func mergeResponses(dest *model.TracingResponse, src *model.TracingResponse) { 65 dest.TracingServiceName = src.TracingServiceName 66 dest.Errors = append(dest.Errors, src.Errors...) 67 traceIds := make(map[jaegerModels.TraceID]bool) 68 for _, prev := range dest.Data { 69 traceIds[prev.TraceID] = true 70 } 71 for _, trace := range src.Data { 72 if _, ok := traceIds[trace.TraceID]; !ok { 73 dest.Data = append(dest.Data, trace) 74 traceIds[trace.TraceID] = true 75 } 76 } 77 } 78 79 func (in *TracingService) GetAppSpans(ns, app string, query models.TracingQuery) ([]model.TracingSpan, error) { 80 return in.getFilteredSpans(ns, app, query, nil /*no post-filtering for apps*/) 81 } 82 83 func (in *TracingService) GetServiceSpans(ctx context.Context, ns, service string, query models.TracingQuery) ([]model.TracingSpan, error) { 84 var end observability.EndFunc 85 ctx, end = observability.StartSpan(ctx, "GetServiceSpans", 86 observability.Attribute("package", "business"), 87 observability.Attribute("cluster", query.Cluster), 88 observability.Attribute("namespace", ns), 89 observability.Attribute("service", service), 90 ) 91 defer end() 92 93 app, err := in.svc.GetServiceAppName(ctx, query.Cluster, ns, service) 94 if err != nil { 95 return nil, err 96 } 97 var postFilter SpanFilter 98 // Run post-filter only for service != app 99 if app != service { 100 postFilter = operationSpanFilter(ns, service) 101 } 102 return in.getFilteredSpans(ns, app, query, postFilter) 103 } 104 105 func operationSpanFilter(ns, service string) SpanFilter { 106 fqService := service + "." + ns 107 // Filter out app spans based on operation name. 108 // For envoy traces, operation name is like "service-name.namespace.svc.cluster.local:8000/*" 109 return func(span *jaegerModels.Span) bool { 110 return strings.HasPrefix(span.OperationName, fqService) 111 } 112 } 113 114 func (in *TracingService) GetWorkloadSpans(ctx context.Context, ns, workload string, query models.TracingQuery) ([]model.TracingSpan, error) { 115 var end observability.EndFunc 116 ctx, end = observability.StartSpan(ctx, "GetWorkloadSpans", 117 observability.Attribute("package", "business"), 118 observability.Attribute("cluster", query.Cluster), 119 observability.Attribute("namespace", ns), 120 observability.Attribute("workload", workload), 121 ) 122 defer end() 123 124 app, err := in.workload.GetWorkloadAppName(ctx, query.Cluster, ns, workload) 125 if err != nil { 126 return nil, err 127 } 128 return in.getFilteredSpans(ns, app, query, wkdSpanFilter(ns, workload)) 129 } 130 131 func wkdSpanFilter(ns, workload string) SpanFilter { 132 // Filter out app traces based on the node_id tag, that contains workload information. 133 return func(span *jaegerModels.Span) bool { 134 return spanMatchesWorkload(span, ns, workload) 135 } 136 } 137 138 func (in *TracingService) GetAppTraces(ns, app string, query models.TracingQuery) (*model.TracingResponse, error) { 139 client, err := in.client() 140 if err != nil { 141 return nil, err 142 } 143 r, err := client.GetAppTraces(ns, app, query) 144 if err != nil { 145 return nil, err 146 } 147 148 if len(r.Data) == query.Limit { 149 // Reached the limit, use split & join mode to spread traces over the requested interval 150 log.Trace("Limit of traces was reached, using split & join mode") 151 more, err := in.getAppTracesSlicedInterval(ns, app, query) 152 if err != nil { 153 // Log error but continue to process results (might still have some data fetched) 154 log.Errorf("Traces split & join failed: %v", err) 155 } 156 if more != nil { 157 mergeResponses(r, more) 158 } 159 } 160 return r, nil 161 } 162 163 // GetServiceTraces returns traces involving the requested service. Note that because the tracing API pulls traces by "App", only a 164 // subset of the traces may actually involve the requested service. Callers may need to upwardly adjust TracingQuery.Limit to get back 165 // the number of desired traces. It depends on the number of services backing the app. For example, if there are 2 services for the 166 // app, if evenly distributed, a query limit of 20 may return only 10 traces. The ratio is typically not as bad as it is with 167 // GetWorkloadTraces. 168 func (in *TracingService) GetServiceTraces(ctx context.Context, ns, service string, query models.TracingQuery) (*model.TracingResponse, error) { 169 var end observability.EndFunc 170 ctx, end = observability.StartSpan(ctx, "GetServiceTraces", 171 observability.Attribute("package", "business"), 172 observability.Attribute("cluster", query.Cluster), 173 observability.Attribute("namespace", ns), 174 observability.Attribute("service", service), 175 ) 176 defer end() 177 178 app, err := in.svc.GetServiceAppName(ctx, query.Cluster, ns, service) 179 if err != nil { 180 return nil, err 181 } 182 if app == service { 183 // No post-filtering 184 return in.GetAppTraces(ns, app, query) 185 } 186 187 r, err := in.GetAppTraces(ns, app, query) 188 if r != nil && err == nil { 189 // Filter out app traces based on operation name. 190 // For envoy traces, operation name is like "service-name.namespace.svc.cluster.local:8000/*" 191 filter := operationSpanFilter(ns, service) 192 traces := []jaegerModels.Trace{} 193 for _, trace := range r.Data { 194 for _, span := range trace.Spans { 195 if filter(&span) { 196 traces = append(traces, trace) 197 break 198 } 199 } 200 } 201 r.Data = traces 202 } 203 return r, err 204 } 205 206 // GetWorkloadTraces returns traces involving the requested workload. Note that because the tracing API pulls traces by "App", only 207 // a subset of the traces may actually involve the requested workload. Callers may need to upwardly adjust TracingQuery.Limit to get back 208 // the number of desired traces. It depends on the number of workloads backing the app. For example, if there are 5 workloads for the 209 // app, if evenly distributed, a query limit of 25 may return only 5 traces. 210 func (in *TracingService) GetWorkloadTraces(ctx context.Context, ns, workload string, query models.TracingQuery) (*model.TracingResponse, error) { 211 var end observability.EndFunc 212 ctx, end = observability.StartSpan(ctx, "GetWorkloadTraces", 213 observability.Attribute("package", "business"), 214 observability.Attribute("cluster", query.Cluster), 215 observability.Attribute("namespace", ns), 216 observability.Attribute("workload", workload), 217 ) 218 defer end() 219 220 app, err := in.workload.GetWorkloadAppName(ctx, query.Cluster, ns, workload) 221 if err != nil { 222 return nil, err 223 } 224 225 r, err := in.GetAppTraces(ns, app, query) 226 // Filter out app traces based on the node_id tag, that contains workload information. 227 if r != nil && err == nil { 228 traces := []jaegerModels.Trace{} 229 for _, trace := range r.Data { 230 if matchesWorkload(&trace, ns, workload) { 231 traces = append(traces, trace) 232 } 233 } 234 r.Data = traces 235 } 236 return r, err 237 } 238 239 func (in *TracingService) getAppTracesSlicedInterval(ns, app string, query models.TracingQuery) (*model.TracingResponse, error) { 240 client, err := in.client() 241 if err != nil { 242 return nil, err 243 } 244 // Spread queries over 10 interval slices 245 nSlices := 10 246 limit := query.Limit / nSlices 247 if limit == 0 { 248 limit = 1 249 } 250 diff := query.End.Sub(query.Start) 251 duration := diff / time.Duration(nSlices) 252 253 type tracesChanResult struct { 254 resp *model.TracingResponse 255 err error 256 } 257 tracesChan := make(chan tracesChanResult, nSlices) 258 var wg sync.WaitGroup 259 260 for i := 0; i < nSlices; i++ { 261 q := query 262 q.Limit = limit 263 q.Start = query.Start.Add(duration * time.Duration(i)) 264 q.End = q.Start.Add(duration) 265 wg.Add(1) 266 go func(q models.TracingQuery) { 267 defer wg.Done() 268 r, err := client.GetAppTraces(ns, app, q) 269 tracesChan <- tracesChanResult{resp: r, err: err} 270 }(q) 271 } 272 wg.Wait() 273 // All slices are fetched, close channel 274 close(tracesChan) 275 merged := &model.TracingResponse{} 276 for r := range tracesChan { 277 if r.err != nil { 278 err = r.err 279 continue 280 } 281 mergeResponses(merged, r.resp) 282 } 283 return merged, err 284 } 285 286 func (in *TracingService) GetTraceDetail(traceID string) (trace *model.TracingSingleTrace, err error) { 287 client, err := in.client() 288 if err != nil { 289 return nil, err 290 } 291 return client.GetTraceDetail(traceID) 292 } 293 294 func (in *TracingService) GetErrorTraces(ns, app string, duration time.Duration) (errorTraces int, err error) { 295 client, err := in.client() 296 if err != nil { 297 return 0, err 298 } 299 return client.GetErrorTraces(ns, app, duration) 300 } 301 302 func (in *TracingService) GetStatus() (accessible bool, err error) { 303 client, err := in.client() 304 if err != nil { 305 return false, err 306 } 307 return client.GetServiceStatus() 308 } 309 310 func matchesWorkload(trace *jaegerModels.Trace, namespace, workload string) bool { 311 for _, span := range trace.Spans { 312 if process, ok := trace.Processes[span.ProcessID]; ok { 313 span.Process = &process 314 } 315 if spanMatchesWorkload(&span, namespace, workload) { 316 return true 317 } 318 } 319 return false 320 } 321 322 func spanMatchesWorkload(span *jaegerModels.Span, namespace, workload string) bool { 323 // For envoy traces, with a workload named "ai-locals", node_id is like: 324 // sidecar~172.17.0.20~ai-locals-6d8996bff-ztg6z.default~default.svc.cluster.local 325 for _, tag := range span.Tags { 326 if tag.Key == "node_id" { 327 if v, ok := tag.Value.(string); ok { 328 parts := strings.Split(v, "~") 329 if len(parts) >= 3 && strings.HasPrefix(parts[2], workload) && strings.HasSuffix(parts[2], namespace) { 330 return true 331 } 332 } 333 } 334 // For Tempo Traces 335 if tag.Key == "hostname" { 336 if v, ok := tag.Value.(string); ok { 337 if strings.HasPrefix(v, workload) { 338 return true 339 } 340 } 341 } 342 } 343 // Tag not found => try with 'hostname' in process' tags 344 if span.Process != nil { 345 for _, tag := range span.Process.Tags { 346 if tag.Key == "hostname" { 347 if v, ok := tag.Value.(string); ok { 348 if strings.HasPrefix(v, workload) { 349 return true 350 } 351 } 352 } 353 } 354 } 355 return false 356 } 357 358 func tracesToSpans(app string, r *model.TracingResponse, filter SpanFilter, conf *config.Config) []model.TracingSpan { 359 spans := []model.TracingSpan{} 360 for _, trace := range r.Data { 361 // Diferent for Tempo & Jaeger 362 // For Tempo the proccess matched with the service name of the trace batch 363 // So t is already filtered in the query 364 if conf.ExternalServices.Tracing.Provider == config.TempoProvider { 365 // Second, find spans for these processes 366 for _, span := range trace.Spans { 367 if span.Process.ServiceName == r.TracingServiceName { 368 if filter == nil || filter(&span) { 369 spans = append(spans, model.TracingSpan{ 370 Span: span, 371 TraceSize: len(trace.Spans), 372 }) 373 } 374 } 375 } 376 } else { 377 // First, get the desired processes for our service 378 processes := make(map[jaegerModels.ProcessID]jaegerModels.Process) 379 for pId, process := range trace.Processes { 380 if process.ServiceName == app || process.ServiceName == r.TracingServiceName { 381 processes[pId] = process 382 } 383 } 384 // Second, find spans for these processes 385 for _, span := range trace.Spans { 386 if p, ok := processes[span.ProcessID]; ok { 387 span.Process = &p 388 if filter == nil || filter(&span) { 389 spans = append(spans, model.TracingSpan{ 390 Span: span, 391 TraceSize: len(trace.Spans), 392 }) 393 } 394 } 395 } 396 } 397 } 398 log.Tracef("Found %d spans in the %d traces for app %s", len(spans), len(r.Data), app) 399 return spans 400 }