github.com/grafana/pyroscope@v1.18.0/cmd/profilecli/query-tracer.go (about)

     1  package main
     2  
     3  import (
     4  	"context"
     5  	"encoding/json"
     6  	"fmt"
     7  	"io"
     8  	"net/http"
     9  	"net/url"
    10  	"os"
    11  	"path/filepath"
    12  	"runtime"
    13  	"runtime/debug"
    14  	"sort"
    15  	"strconv"
    16  	"strings"
    17  	"sync"
    18  	"time"
    19  
    20  	"github.com/go-kit/log/level"
    21  	tempopb "github.com/simonswine/tempopb"
    22  	commonv1 "go.opentelemetry.io/proto/otlp/common/v1"
    23  	tracev1 "go.opentelemetry.io/proto/otlp/trace/v1"
    24  	"golang.org/x/sync/errgroup"
    25  )
    26  
    27  var userAgent string = "profilecli/unknown"
    28  
    29  func init() {
    30  	bi, ok := debug.ReadBuildInfo()
    31  	if !ok {
    32  		return
    33  	}
    34  	userAgent = fmt.Sprintf("profilecli/%s", bi.Main.Version)
    35  }
    36  
    37  type queryTracerParams struct {
    38  	File     []string
    39  	Delay    time.Duration
    40  	Interval time.Duration
    41  	TempoURL string
    42  }
    43  
    44  func addQueryTracerParams(cmd commander) *queryTracerParams {
    45  	params := &queryTracerParams{}
    46  	cmd.Flag("interval", "Interval for how often to query Tempo").Default("5m").DurationVar(&params.Interval)
    47  	cmd.Flag("delay", "Allow for a delay between ingestion and availability in the query path.").Default("0s").DurationVar(&params.Delay)
    48  	cmd.Flag("tempo-url", "Tempo URL").Default("http://localhost:3100/").StringVar(&params.TempoURL)
    49  	cmd.Flag("filter-sensitive", "Filters potentially sensitive fields from the output").Default("false").BoolVar(&filterSensitive)
    50  	cmd.Flag("file", "File to read traces in protobuf directly from").StringsVar(&params.File)
    51  	return params
    52  }
    53  
    54  var filterSensitive bool
    55  
    56  type Duration time.Duration
    57  
    58  func (d Duration) MarshalJSON() ([]byte, error) {
    59  	return json.Marshal(time.Duration(d).String())
    60  }
    61  
    62  type SensitiveString string
    63  
    64  func (s SensitiveString) MarshalJSON() ([]byte, error) {
    65  	if !filterSensitive {
    66  		return json.Marshal(string(s))
    67  	}
    68  	if len(s) == 0 {
    69  		return json.Marshal("")
    70  	}
    71  	return json.Marshal("***")
    72  }
    73  
    74  type TraceResult struct {
    75  	Namespace    string    `json:"namespace,omitempty"`
    76  	Cluster      string    `json:"cluster,omitempty"`
    77  	TraceID      string    `json:"trace_id,omitempty"`
    78  	StartTime    time.Time `json:"request_start_time,omitempty"`
    79  	EndTime      time.Time `json:"request_end_time,omitempty"`
    80  	Duration     Duration  `json:"duration,omitempty"`
    81  	Status       int       `json:"status,omitempty"`
    82  	ResponseSize int       `json:"response_size,omitempty"`
    83  	Query        struct {
    84  		Method      string          `json:"method,omitempty"`
    85  		StartTime   time.Time       `json:"start_time,omitempty"`
    86  		EndTime     time.Time       `json:"end_time,omitempty"`
    87  		Duration    Duration        `json:"duration,omitempty"`
    88  		Selector    SensitiveString `json:"selector,omitempty"`
    89  		ProfileType string          `json:"profile_type,omitempty"`
    90  	} `json:"query,omitempty"`
    91  	Stats struct {
    92  		Profiles        int `json:"profiles,omitempty"`
    93  		ProfilesFetched int `json:"profiles_fetched,omitempty"`
    94  	} `json:"stats,omitempty"`
    95  	Grafana struct {
    96  		TenantID   SensitiveString `json:"tenant_id,omitempty"`
    97  		User       SensitiveString `json:"user,omitempty"`
    98  		Datasource SensitiveString `json:"datasource,omitempty"`
    99  		Plugin     string          `json:"plugin,omitempty"`
   100  	} `json:"grafana,omitempty"`
   101  }
   102  
   103  func (r *TraceResult) LogFields(fields []interface{}) []interface{} {
   104  	d, err := json.Marshal(r)
   105  	if err != nil {
   106  		panic(err)
   107  	}
   108  	var m map[string]interface{}
   109  	if err := json.Unmarshal(d, &m); err != nil {
   110  		panic(err)
   111  	}
   112  	return mapToLogFields(m, fields, "")
   113  }
   114  
   115  func mapToLogFields(m map[string]interface{}, fields []interface{}, prefix string) []interface{} {
   116  	keys := make([]string, 0, len(m))
   117  	for k := range m {
   118  		keys = append(keys, k)
   119  	}
   120  	sort.Strings(keys)
   121  	for _, k := range keys {
   122  		switch m[k].(type) {
   123  		case map[string]interface{}:
   124  			fields = mapToLogFields(m[k].(map[string]interface{}), fields, prefix+k+"_")
   125  		default:
   126  			fields = append(fields, prefix+k, m[k])
   127  		}
   128  	}
   129  	return fields
   130  }
   131  
   132  func httpGetOrPost(spanName string) bool {
   133  	return strings.HasPrefix(spanName, "HTTP GET - ") || strings.HasPrefix(spanName, "HTTP POST - ")
   134  }
   135  
   136  func parseTrace(ctx context.Context, traceID string, data []byte) (*TraceResult, error) {
   137  	var traceResult TraceResult
   138  	traceResult.TraceID = traceID
   139  
   140  	// Read the existing address book.
   141  	trace := &tempopb.Trace{}
   142  
   143  	if err := trace.UnmarshalVT(data); err != nil {
   144  		return nil, fmt.Errorf("failed to parse trace: %w", err)
   145  	}
   146  
   147  	setK8SMeta := func(attrs []*commonv1.KeyValue) {
   148  		for _, attr := range attrs {
   149  			if len(traceResult.Cluster) == 0 && attr.Key == "cluster" {
   150  				traceResult.Cluster = attr.Value.GetStringValue()
   151  			}
   152  			if len(traceResult.Namespace) == 0 && attr.Key == "namespace" {
   153  				traceResult.Namespace = attr.Value.GetStringValue()
   154  			}
   155  		}
   156  	}
   157  
   158  	var spansByServiceName = map[string][]*tracev1.Span{
   159  		"grafana":                  nil,
   160  		"pyroscope-query-frontend": nil,
   161  		"pyroscope-gateway":        nil,
   162  		"pyroscope-ingester":       nil,
   163  		"pyroscope-store-gateway":  nil,
   164  	}
   165  	for _, batch := range trace.Batches {
   166  		if ctx.Err() != nil {
   167  			return nil, ctx.Err()
   168  		}
   169  		for _, attr := range batch.Resource.Attributes {
   170  			if attr.Key == "service.name" {
   171  				// check by service.name map
   172  				serviceName := attr.Value.GetStringValue()
   173  				_, ok := spansByServiceName[attr.Value.GetStringValue()]
   174  				if ok {
   175  					// add spans to the map
   176  					for _, span := range batch.ScopeSpans {
   177  						spansByServiceName[serviceName] = append(spansByServiceName[serviceName], span.Spans...)
   178  					}
   179  				}
   180  
   181  				if serviceName == "pyroscope-gateway" {
   182  					setK8SMeta(batch.Resource.Attributes)
   183  				}
   184  			}
   185  		}
   186  	}
   187  
   188  	// find gateway span for duration, status code and response size
   189  	firstFound := false
   190  	for _, span := range spansByServiceName["pyroscope-gateway"] {
   191  		if ctx.Err() != nil {
   192  			return nil, ctx.Err()
   193  		}
   194  		if !firstFound && httpGetOrPost(span.Name) {
   195  			//traceResult.TraceID = span.TraceId
   196  			traceResult.StartTime = time.Unix(0, int64(span.StartTimeUnixNano))
   197  			traceResult.EndTime = time.Unix(0, int64(span.EndTimeUnixNano))
   198  			traceResult.Duration = Duration(traceResult.EndTime.Sub(traceResult.StartTime))
   199  
   200  			for _, attr := range span.Attributes {
   201  				if attr.Key == "http.status_code" {
   202  					traceResult.Status = int(attr.Value.GetIntValue())
   203  				}
   204  				if attr.Key == "http.response_size" {
   205  					traceResult.ResponseSize = int(attr.Value.GetIntValue())
   206  				}
   207  			}
   208  			firstFound = true
   209  			continue
   210  		}
   211  		if span.Name == "Auth/Authenticate" {
   212  			for _, attr := range span.Attributes {
   213  				if attr.Key == "instance_id" {
   214  					traceResult.Grafana.TenantID = SensitiveString(attr.Value.GetStringValue())
   215  				}
   216  			}
   217  		}
   218  	}
   219  
   220  	// get the query parameters
   221  	firstFound = false
   222  	timeFormat := "2006-01-02 15:04:05.999 -0700 MST"
   223  	for _, span := range spansByServiceName["pyroscope-query-frontend"] {
   224  		if ctx.Err() != nil {
   225  			return nil, ctx.Err()
   226  		}
   227  		if !firstFound && httpGetOrPost(span.Name) {
   228  			for _, attr := range span.Attributes {
   229  				if attr.Key == "http.url" {
   230  					method := strings.SplitN(attr.Value.GetStringValue(), "?", 2)
   231  					traceResult.Query.Method = method[0]
   232  				}
   233  				if attr.Key == "start" {
   234  					traceResult.Query.StartTime, _ = time.Parse(timeFormat, attr.Value.GetStringValue())
   235  				}
   236  				if attr.Key == "end" {
   237  					traceResult.Query.EndTime, _ = time.Parse(timeFormat, attr.Value.GetStringValue())
   238  				}
   239  				if attr.Key == "selector" {
   240  					traceResult.Query.Selector = SensitiveString(attr.Value.GetStringValue())
   241  				}
   242  				if attr.Key == "profile_type" {
   243  					traceResult.Query.ProfileType = attr.Value.GetStringValue()
   244  				}
   245  			}
   246  			firstFound = true
   247  		}
   248  	}
   249  	if !traceResult.Query.StartTime.IsZero() && !traceResult.Query.EndTime.IsZero() {
   250  		traceResult.Query.Duration = Duration(traceResult.Query.EndTime.Sub(traceResult.Query.StartTime))
   251  	}
   252  
   253  	// get the user name from grafana
   254  	for _, span := range spansByServiceName["grafana"] {
   255  		if ctx.Err() != nil {
   256  			return nil, ctx.Err()
   257  		}
   258  		if span.Name == "PluginClient.queryData" || span.Name == "PluginClient.callResource" {
   259  			for _, attr := range span.Attributes {
   260  				if attr.Key == "user" {
   261  					traceResult.Grafana.User = SensitiveString(attr.Value.GetStringValue())
   262  				}
   263  				if attr.Key == "plugin_id" {
   264  					traceResult.Grafana.Plugin = attr.Value.GetStringValue()
   265  				}
   266  				if attr.Key == "datasource_name" {
   267  					traceResult.Grafana.Datasource = SensitiveString(attr.Value.GetStringValue())
   268  				}
   269  			}
   270  			break
   271  		}
   272  	}
   273  
   274  	storeSpans := append(spansByServiceName["pyroscope-ingester"], spansByServiceName["pyroscope-store-gateway"]...)
   275  	found := false
   276  	for _, span := range storeSpans {
   277  		if ctx.Err() != nil {
   278  			return nil, ctx.Err()
   279  		}
   280  		if span.Name == "RepeatedRowColumnIterator" {
   281  			found = false
   282  			for _, attr := range span.Attributes {
   283  				if attr.Key == "column" && attr.Value.GetStringValue() == "Samples.list.element.Value" {
   284  					found = true
   285  					break
   286  				}
   287  			}
   288  			if !found {
   289  				continue
   290  			}
   291  			for _, ev := range span.Events {
   292  				for _, attr := range ev.Attributes {
   293  					if attr.Key == "rows_fetched" {
   294  						traceResult.Stats.ProfilesFetched += int(attr.Value.GetIntValue())
   295  					}
   296  					if attr.Key == "rows_read" {
   297  						traceResult.Stats.Profiles += int(attr.Value.GetIntValue())
   298  					}
   299  				}
   300  			}
   301  		}
   302  	}
   303  
   304  	return &traceResult, nil
   305  }
   306  
   307  func queryTracer(ctx context.Context, params *queryTracerParams) (err error) {
   308  	if len(params.File) > 0 {
   309  		fields := make([]interface{}, 0, 64)
   310  
   311  		for _, f := range params.File {
   312  			data, err := os.ReadFile(f)
   313  			if err != nil {
   314  				return err
   315  			}
   316  			result, err := parseTrace(ctx, "", data)
   317  			if err != nil {
   318  				return err
   319  			}
   320  			b, _ := json.Marshal(result)
   321  			mapD := map[string]interface{}{}
   322  			if err := json.Unmarshal(b, &mapD); err != nil {
   323  				return err
   324  			}
   325  
   326  			fields = fields[:4]
   327  			fields[0] = "msg"
   328  			fields[1] = "query successfully traced"
   329  			fields[2] = "file"
   330  			fields[3] = f
   331  			fields = result.LogFields(fields)
   332  			level.Info(logger).Log(fields...)
   333  		}
   334  		return nil
   335  	}
   336  
   337  	client := http.DefaultClient
   338  	end := time.Now().Add(-params.Delay)
   339  	start := end.Add(-params.Interval)
   340  
   341  	traceQL := `{
   342      ( span.http.url= "/querier.v1.QuerierService/SelectMergeStacktraces" ||
   343        span.http.url= "/querier.v1.QuerierService/SelectMergeProfile" || 
   344        name= "HTTP GET - pyroscope_render_diff" ||
   345        name= "HTTP GET - pyroscope_render"
   346      ) && span.profile_type!="deadmans_switch:made_up:profilos:made_up:profilos"
   347   }`
   348  
   349  	tempoURL, err := url.Parse(params.TempoURL)
   350  	if err != nil {
   351  		return err
   352  	}
   353  	originalPath := tempoURL.Path
   354  
   355  	tempoURL.Path = filepath.Join(originalPath, "/api/search")
   356  
   357  	level.Info(logger).Log("msg", "search for matching traces", "start", start, "end", end)
   358  
   359  	tempoParams := url.Values{}
   360  	tempoParams.Add("start", strconv.FormatInt(start.Unix(), 10))
   361  	tempoParams.Add("end", strconv.FormatInt(end.Unix(), 10))
   362  	tempoParams.Add("limit", "10000")
   363  	tempoParams.Add("q", traceQL)
   364  	tempoURL.RawQuery = tempoParams.Encode()
   365  
   366  	req, err := http.NewRequestWithContext(ctx, "GET", tempoURL.String(), nil)
   367  	if err != nil {
   368  		return err
   369  	}
   370  	req.Header.Set("User-Agent", userAgent)
   371  
   372  	resp, err := client.Do(req)
   373  	if err != nil {
   374  		return err
   375  	}
   376  	defer resp.Body.Close()
   377  
   378  	if resp.StatusCode != http.StatusOK {
   379  		body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
   380  		return fmt.Errorf("tempo returned status code [%d]: %s ", resp.StatusCode, body)
   381  	}
   382  
   383  	var respBody struct {
   384  		Traces []struct {
   385  			TraceID           string `json:"traceID"`
   386  			RootServiceName   string `json:"rootServiceName"`
   387  			RootTraceName     string `json:"rootTraceName"`
   388  			StartTimeUnixNano string `json:"startTimeUnixNano"`
   389  			DurationMs        int64  `json:"durationMs"`
   390  		} `json:"traces"`
   391  	}
   392  
   393  	if err := json.NewDecoder(resp.Body).Decode(&respBody); err != nil {
   394  		return err
   395  	}
   396  
   397  	level.Info(logger).Log("msg", fmt.Sprintf("found %d matching traces", len(respBody.Traces)))
   398  
   399  	// now fetch traces in parallel
   400  	g, gctx := errgroup.WithContext(ctx)
   401  	g.SetLimit(runtime.GOMAXPROCS(-1))
   402  
   403  	results := make(chan *TraceResult)
   404  	var wg sync.WaitGroup
   405  	wg.Add(1)
   406  	go func() {
   407  		defer wg.Done()
   408  		fields := make([]interface{}, 0, 64)
   409  		for r := range results {
   410  			fields = fields[:2]
   411  			fields[0] = "msg"
   412  			fields[1] = "query successfully traced"
   413  			fields = r.LogFields(fields)
   414  			level.Info(logger).Log(fields...)
   415  		}
   416  	}()
   417  	defer wg.Wait()
   418  	defer close(results)
   419  
   420  	for _, t := range respBody.Traces {
   421  		startTimeUnixNanos, err := strconv.ParseInt(t.StartTimeUnixNano, 10, 64)
   422  		if err != nil {
   423  			return err
   424  		}
   425  
   426  		start = time.Unix(0, startTimeUnixNanos)
   427  		end = start.Add(time.Duration(t.DurationMs) * time.Millisecond)
   428  
   429  		tempoURL.Path = filepath.Join(originalPath, "/api/traces", t.TraceID)
   430  		tempoParams := url.Values{}
   431  		tempoParams.Add("start", strconv.FormatInt(start.Unix(), 10))
   432  		tempoParams.Add("end", strconv.FormatInt(end.Unix()+1, 10))
   433  		tempoParams.Add("limit", "10000")
   434  		tempoParams.Add("q", traceQL)
   435  		tempoURL.RawQuery = tempoParams.Encode()
   436  
   437  		var (
   438  			url     = tempoURL.String()
   439  			traceID = t.TraceID
   440  		)
   441  		g.Go(func() error {
   442  			req, err := http.NewRequestWithContext(gctx, "GET", url, nil)
   443  			if err != nil {
   444  				return err
   445  			}
   446  			req.Header.Set("Accept", "application/protobuf")
   447  			req.Header.Set("User-Agent", userAgent)
   448  
   449  			resp, err := client.Do(req)
   450  			if err != nil {
   451  				return err
   452  			}
   453  			defer resp.Body.Close()
   454  
   455  			if resp.StatusCode != http.StatusOK {
   456  				body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024))
   457  				return fmt.Errorf("tempo returned status code [%d]: %s ", resp.StatusCode, body)
   458  			}
   459  
   460  			data, err := io.ReadAll(resp.Body)
   461  			if err != nil {
   462  				return err
   463  			}
   464  
   465  			result, err := parseTrace(gctx, traceID, data)
   466  			if err != nil {
   467  				level.Error(logger).Log("msg", "failed to parse trace", "trace_id", traceID, "err", err)
   468  			}
   469  
   470  			results <- result
   471  
   472  			return nil
   473  		})
   474  	}
   475  
   476  	return g.Wait()
   477  }