github.com/kiali/kiali@v1.84.0/business/tracing.go

package business

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/kiali/kiali/config"
	"github.com/kiali/kiali/log"
	"github.com/kiali/kiali/models"
	"github.com/kiali/kiali/observability"
	"github.com/kiali/kiali/tracing"
	"github.com/kiali/kiali/tracing/jaeger/model"
	jaegerModels "github.com/kiali/kiali/tracing/jaeger/model/json"
)

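// SpanFilter is a predicate applied to spans fetched from the tracing backend;
// spans for which it returns false are filtered out.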
type (
	SpanFilter = func(span *jaegerModels.Span) bool
)

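// TracingService provides business-layer access to traces and spans from the configured tracing backend.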
type TracingService struct {
	conf     *config.Config
	svc      *SvcService
	tracing  tracing.ClientInterface
	workload *WorkloadService
}

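// NewTracingService creates a TracingService from the given configuration, tracing client,
// and the service and workload services used to resolve app names.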
func NewTracingService(conf *config.Config, tracing tracing.ClientInterface, svcService *SvcService, workloadService *WorkloadService) TracingService {
	return TracingService{
		conf:     conf,
		svc:      svcService,
		tracing:  tracing,
		workload: workloadService,
	}
}

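// client returns the tracing client, or an error if tracing is disabled or the client is not initialized.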
func (in *TracingService) client() (tracing.ClientInterface, error) {
	if !in.conf.ExternalServices.Tracing.Enabled {
		return nil, fmt.Errorf("tracing is not enabled")
	}

	if in.tracing == nil {
		return nil, fmt.Errorf("tracing client is not initialized")
	}

	return in.tracing, nil
}

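// getFilteredSpans fetches the traces of the given app and flattens them into spans,
// applying the optional post-filter.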
func (in *TracingService) getFilteredSpans(ns, app string, query models.TracingQuery, filter SpanFilter) ([]model.TracingSpan, error) {
	// Tempo needs this tag in the query, as the info is not in the results by default
	if in.conf.ExternalServices.Tracing.Provider == config.TempoProvider {
		query.Tags["http.method"] = ".*"
	}
	r, err := in.GetAppTraces(ns, app, query)
	if err != nil {
		return []model.TracingSpan{}, err
	}
	spans := tracesToSpans(app, r, filter, in.conf)
	return spans, nil
}

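// mergeResponses appends the errors and traces of src to dest, skipping traces whose
// TraceID is already present in dest.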
func mergeResponses(dest *model.TracingResponse, src *model.TracingResponse) {
	dest.TracingServiceName = src.TracingServiceName
	dest.Errors = append(dest.Errors, src.Errors...)
	traceIds := make(map[jaegerModels.TraceID]bool)
	for _, prev := range dest.Data {
		traceIds[prev.TraceID] = true
	}
	for _, trace := range src.Data {
		if _, ok := traceIds[trace.TraceID]; !ok {
			dest.Data = append(dest.Data, trace)
			traceIds[trace.TraceID] = true
		}
	}
}

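// GetAppSpans returns the spans for the given app, with no post-filtering.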
func (in *TracingService) GetAppSpans(ns, app string, query models.TracingQuery) ([]model.TracingSpan, error) {
	return in.getFilteredSpans(ns, app, query, nil /*no post-filtering for apps*/)
}

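// GetServiceSpans returns the spans for the given service. Traces are fetched for the app
// backing the service; when the service name differs from the app name, spans are
// post-filtered by operation name.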
func (in *TracingService) GetServiceSpans(ctx context.Context, ns, service string, query models.TracingQuery) ([]model.TracingSpan, error) {
	var end observability.EndFunc
	ctx, end = observability.StartSpan(ctx, "GetServiceSpans",
		observability.Attribute("package", "business"),
		observability.Attribute("cluster", query.Cluster),
		observability.Attribute("namespace", ns),
		observability.Attribute("service", service),
	)
	defer end()

	app, err := in.svc.GetServiceAppName(ctx, query.Cluster, ns, service)
	if err != nil {
		return nil, err
	}
	var postFilter SpanFilter
	// Run the post-filter only when the service name differs from the app name
	if app != service {
		postFilter = operationSpanFilter(ns, service)
	}
	return in.getFilteredSpans(ns, app, query, postFilter)
}

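// operationSpanFilter builds a SpanFilter that keeps spans whose operation name starts
// with the fully-qualified service name ("service.namespace").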
func operationSpanFilter(ns, service string) SpanFilter {
	fqService := service + "." + ns
	// Filter out app spans based on operation name.
	// For envoy traces, operation name is like "service-name.namespace.svc.cluster.local:8000/*"
	return func(span *jaegerModels.Span) bool {
		return strings.HasPrefix(span.OperationName, fqService)
	}
}

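// GetWorkloadSpans returns the spans for the given workload. Traces are fetched for the app
// backing the workload, then spans are post-filtered by workload.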
func (in *TracingService) GetWorkloadSpans(ctx context.Context, ns, workload string, query models.TracingQuery) ([]model.TracingSpan, error) {
	var end observability.EndFunc
	ctx, end = observability.StartSpan(ctx, "GetWorkloadSpans",
		observability.Attribute("package", "business"),
		observability.Attribute("cluster", query.Cluster),
		observability.Attribute("namespace", ns),
		observability.Attribute("workload", workload),
	)
	defer end()

	app, err := in.workload.GetWorkloadAppName(ctx, query.Cluster, ns, workload)
	if err != nil {
		return nil, err
	}
	return in.getFilteredSpans(ns, app, query, wkdSpanFilter(ns, workload))
}

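// wkdSpanFilter builds a SpanFilter that keeps spans matching the given workload.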
func wkdSpanFilter(ns, workload string) SpanFilter {
	// Filter out app traces based on the node_id tag, which contains workload information.
	return func(span *jaegerModels.Span) bool {
		return spanMatchesWorkload(span, ns, workload)
	}
}

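// GetAppTraces returns the traces for the given app. If the backend returns as many traces
// as the query limit, the query is re-run over sliced sub-intervals to spread the results
// over the whole requested interval.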
func (in *TracingService) GetAppTraces(ns, app string, query models.TracingQuery) (*model.TracingResponse, error) {
	client, err := in.client()
	if err != nil {
		return nil, err
	}
	r, err := client.GetAppTraces(ns, app, query)
	if err != nil {
		return nil, err
	}

	if len(r.Data) == query.Limit {
		// Reached the limit; use split & join mode to spread traces over the requested interval
		log.Trace("Limit of traces was reached, using split & join mode")
		more, err := in.getAppTracesSlicedInterval(ns, app, query)
		if err != nil {
			// Log the error but continue to process results (we might still have some data fetched)
			log.Errorf("Traces split & join failed: %v", err)
		}
		if more != nil {
			mergeResponses(r, more)
		}
	}
	return r, nil
}

// GetServiceTraces returns traces involving the requested service. Note that because the tracing API pulls traces by "App", only a
// subset of the traces may actually involve the requested service. Callers may need to adjust TracingQuery.Limit upward to get back
// the desired number of traces; it depends on the number of services backing the app. For example, if there are 2 services for the
// app and traffic is evenly distributed, a query limit of 20 may return only 10 traces. The ratio is typically not as bad as it is with
// GetWorkloadTraces.
func (in *TracingService) GetServiceTraces(ctx context.Context, ns, service string, query models.TracingQuery) (*model.TracingResponse, error) {
	var end observability.EndFunc
	ctx, end = observability.StartSpan(ctx, "GetServiceTraces",
		observability.Attribute("package", "business"),
		observability.Attribute("cluster", query.Cluster),
		observability.Attribute("namespace", ns),
		observability.Attribute("service", service),
	)
	defer end()

	app, err := in.svc.GetServiceAppName(ctx, query.Cluster, ns, service)
	if err != nil {
		return nil, err
	}
	if app == service {
		// No post-filtering
		return in.GetAppTraces(ns, app, query)
	}

	r, err := in.GetAppTraces(ns, app, query)
	if r != nil && err == nil {
		// Filter out app traces based on operation name.
		// For envoy traces, operation name is like "service-name.namespace.svc.cluster.local:8000/*"
		filter := operationSpanFilter(ns, service)
		traces := []jaegerModels.Trace{}
		for _, trace := range r.Data {
			for _, span := range trace.Spans {
				if filter(&span) {
					traces = append(traces, trace)
					break
				}
			}
		}
		r.Data = traces
	}
	return r, err
}

// GetWorkloadTraces returns traces involving the requested workload. Note that because the tracing API pulls traces by "App", only
// a subset of the traces may actually involve the requested workload. Callers may need to adjust TracingQuery.Limit upward to get back
// the desired number of traces; it depends on the number of workloads backing the app. For example, if there are 5 workloads for the
// app and traffic is evenly distributed, a query limit of 25 may return only 5 traces.
func (in *TracingService) GetWorkloadTraces(ctx context.Context, ns, workload string, query models.TracingQuery) (*model.TracingResponse, error) {
	var end observability.EndFunc
	ctx, end = observability.StartSpan(ctx, "GetWorkloadTraces",
		observability.Attribute("package", "business"),
		observability.Attribute("cluster", query.Cluster),
		observability.Attribute("namespace", ns),
		observability.Attribute("workload", workload),
	)
	defer end()

	app, err := in.workload.GetWorkloadAppName(ctx, query.Cluster, ns, workload)
	if err != nil {
		return nil, err
	}

	r, err := in.GetAppTraces(ns, app, query)
	// Filter out app traces based on the node_id tag, which contains workload information.
	if r != nil && err == nil {
		traces := []jaegerModels.Trace{}
		for _, trace := range r.Data {
			if matchesWorkload(&trace, ns, workload) {
				traces = append(traces, trace)
			}
		}
		r.Data = traces
	}
	return r, err
}

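// getAppTracesSlicedInterval slices the query interval, fetches each slice concurrently with
// a proportionally reduced limit, and merges the results. On partial failure, it returns the
// merged data together with the last error encountered.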
func (in *TracingService) getAppTracesSlicedInterval(ns, app string, query models.TracingQuery) (*model.TracingResponse, error) {
	client, err := in.client()
	if err != nil {
		return nil, err
	}
	// Spread queries over 10 interval slices
	nSlices := 10
	limit := query.Limit / nSlices
	if limit == 0 {
		limit = 1
	}
	diff := query.End.Sub(query.Start)
	duration := diff / time.Duration(nSlices)

	type tracesChanResult struct {
		resp *model.TracingResponse
		err  error
	}
	tracesChan := make(chan tracesChanResult, nSlices)
	var wg sync.WaitGroup

	for i := 0; i < nSlices; i++ {
		q := query
		q.Limit = limit
		q.Start = query.Start.Add(duration * time.Duration(i))
		q.End = q.Start.Add(duration)
		wg.Add(1)
		go func(q models.TracingQuery) {
			defer wg.Done()
			r, err := client.GetAppTraces(ns, app, q)
			tracesChan <- tracesChanResult{resp: r, err: err}
		}(q)
	}
	wg.Wait()
	// All slices are fetched; close the channel
	close(tracesChan)
	merged := &model.TracingResponse{}
	for r := range tracesChan {
		if r.err != nil {
			err = r.err
			continue
		}
		mergeResponses(merged, r.resp)
	}
	return merged, err
}

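// GetTraceDetail returns a single trace given its trace ID.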
func (in *TracingService) GetTraceDetail(traceID string) (trace *model.TracingSingleTrace, err error) {
	client, err := in.client()
	if err != nil {
		return nil, err
	}
	return client.GetTraceDetail(traceID)
}

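// GetErrorTraces returns the number of error traces for the given app over the given duration.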
func (in *TracingService) GetErrorTraces(ns, app string, duration time.Duration) (errorTraces int, err error) {
	client, err := in.client()
	if err != nil {
		return 0, err
	}
	return client.GetErrorTraces(ns, app, duration)
}

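// GetStatus reports whether the tracing backend is accessible.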
func (in *TracingService) GetStatus() (accessible bool, err error) {
	client, err := in.client()
	if err != nil {
		return false, err
	}
	return client.GetServiceStatus()
}

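// matchesWorkload returns true if any span of the trace belongs to the given workload.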
func matchesWorkload(trace *jaegerModels.Trace, namespace, workload string) bool {
	for _, span := range trace.Spans {
		if process, ok := trace.Processes[span.ProcessID]; ok {
			span.Process = &process
		}
		if spanMatchesWorkload(&span, namespace, workload) {
			return true
		}
	}
	return false
}

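// spanMatchesWorkload returns true if the span belongs to the given workload, based on the
// "node_id" span tag, or on the "hostname" tag found on the span or its process.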
func spanMatchesWorkload(span *jaegerModels.Span, namespace, workload string) bool {
	// For envoy traces, with a workload named "ai-locals", node_id is like:
	// sidecar~172.17.0.20~ai-locals-6d8996bff-ztg6z.default~default.svc.cluster.local
	for _, tag := range span.Tags {
		if tag.Key == "node_id" {
			if v, ok := tag.Value.(string); ok {
				parts := strings.Split(v, "~")
				if len(parts) >= 3 && strings.HasPrefix(parts[2], workload) && strings.HasSuffix(parts[2], namespace) {
					return true
				}
			}
		}
		// For Tempo traces
		if tag.Key == "hostname" {
			if v, ok := tag.Value.(string); ok {
				if strings.HasPrefix(v, workload) {
					return true
				}
			}
		}
	}
	// Tag not found => fall back to the 'hostname' tag in the span's process tags
	if span.Process != nil {
		for _, tag := range span.Process.Tags {
			if tag.Key == "hostname" {
				if v, ok := tag.Value.(string); ok {
					if strings.HasPrefix(v, workload) {
						return true
					}
				}
			}
		}
	}
	return false
}

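// tracesToSpans flattens the traces in r into the spans belonging to the given app, applying
// the optional filter. For Tempo, spans are matched by the trace's service name; for Jaeger,
// by the processes matching the app.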
func tracesToSpans(app string, r *model.TracingResponse, filter SpanFilter, conf *config.Config) []model.TracingSpan {
	spans := []model.TracingSpan{}
	for _, trace := range r.Data {
		// This differs between Tempo & Jaeger.
		// For Tempo, the process is matched with the service name of the trace batch,
		// so it is already filtered in the query.
		if conf.ExternalServices.Tracing.Provider == config.TempoProvider {
			// Find the spans that match the trace's service name
			for _, span := range trace.Spans {
				if span.Process.ServiceName == r.TracingServiceName {
					if filter == nil || filter(&span) {
						spans = append(spans, model.TracingSpan{
							Span:      span,
							TraceSize: len(trace.Spans),
						})
					}
				}
			}
		} else {
			// First, get the desired processes for our service
			processes := make(map[jaegerModels.ProcessID]jaegerModels.Process)
			for pId, process := range trace.Processes {
				if process.ServiceName == app || process.ServiceName == r.TracingServiceName {
					processes[pId] = process
				}
			}
			// Second, find spans for these processes
			for _, span := range trace.Spans {
				if p, ok := processes[span.ProcessID]; ok {
					span.Process = &p
					if filter == nil || filter(&span) {
						spans = append(spans, model.TracingSpan{
							Span:      span,
							TraceSize: len(trace.Spans),
						})
					}
				}
			}
		}
	}
	log.Tracef("Found %d spans in the %d traces for app %s", len(spans), len(r.Data), app)
	return spans
}