github.com/grafana/pyroscope@v1.18.0/cmd/profilecli/query-tracer.go (about) 1 package main 2 3 import ( 4 "context" 5 "encoding/json" 6 "fmt" 7 "io" 8 "net/http" 9 "net/url" 10 "os" 11 "path/filepath" 12 "runtime" 13 "runtime/debug" 14 "sort" 15 "strconv" 16 "strings" 17 "sync" 18 "time" 19 20 "github.com/go-kit/log/level" 21 tempopb "github.com/simonswine/tempopb" 22 commonv1 "go.opentelemetry.io/proto/otlp/common/v1" 23 tracev1 "go.opentelemetry.io/proto/otlp/trace/v1" 24 "golang.org/x/sync/errgroup" 25 ) 26 27 var userAgent string = "profilecli/unknown" 28 29 func init() { 30 bi, ok := debug.ReadBuildInfo() 31 if !ok { 32 return 33 } 34 userAgent = fmt.Sprintf("profilecli/%s", bi.Main.Version) 35 } 36 37 type queryTracerParams struct { 38 File []string 39 Delay time.Duration 40 Interval time.Duration 41 TempoURL string 42 } 43 44 func addQueryTracerParams(cmd commander) *queryTracerParams { 45 params := &queryTracerParams{} 46 cmd.Flag("interval", "Interval for how often to query Tempo").Default("5m").DurationVar(¶ms.Interval) 47 cmd.Flag("delay", "Allow for a delay between ingestion and availability in the query path.").Default("0s").DurationVar(¶ms.Delay) 48 cmd.Flag("tempo-url", "Tempo URL").Default("http://localhost:3100/").StringVar(¶ms.TempoURL) 49 cmd.Flag("filter-sensitive", "Filters potentially sensitive fields from the output").Default("false").BoolVar(&filterSensitive) 50 cmd.Flag("file", "File to read traces in protobuf directly from").StringsVar(¶ms.File) 51 return params 52 } 53 54 var filterSensitive bool 55 56 type Duration time.Duration 57 58 func (d Duration) MarshalJSON() ([]byte, error) { 59 return json.Marshal(time.Duration(d).String()) 60 } 61 62 type SensitiveString string 63 64 func (s SensitiveString) MarshalJSON() ([]byte, error) { 65 if !filterSensitive { 66 return json.Marshal(string(s)) 67 } 68 if len(s) == 0 { 69 return json.Marshal("") 70 } 71 return json.Marshal("***") 72 } 73 74 type TraceResult struct { 75 Namespace string `json:"namespace,omitempty"` 76 Cluster string `json:"cluster,omitempty"` 77 TraceID string `json:"trace_id,omitempty"` 78 StartTime time.Time `json:"request_start_time,omitempty"` 79 EndTime time.Time `json:"request_end_time,omitempty"` 80 Duration Duration `json:"duration,omitempty"` 81 Status int `json:"status,omitempty"` 82 ResponseSize int `json:"response_size,omitempty"` 83 Query struct { 84 Method string `json:"method,omitempty"` 85 StartTime time.Time `json:"start_time,omitempty"` 86 EndTime time.Time `json:"end_time,omitempty"` 87 Duration Duration `json:"duration,omitempty"` 88 Selector SensitiveString `json:"selector,omitempty"` 89 ProfileType string `json:"profile_type,omitempty"` 90 } `json:"query,omitempty"` 91 Stats struct { 92 Profiles int `json:"profiles,omitempty"` 93 ProfilesFetched int `json:"profiles_fetched,omitempty"` 94 } `json:"stats,omitempty"` 95 Grafana struct { 96 TenantID SensitiveString `json:"tenant_id,omitempty"` 97 User SensitiveString `json:"user,omitempty"` 98 Datasource SensitiveString `json:"datasource,omitempty"` 99 Plugin string `json:"plugin,omitempty"` 100 } `json:"grafana,omitempty"` 101 } 102 103 func (r *TraceResult) LogFields(fields []interface{}) []interface{} { 104 d, err := json.Marshal(r) 105 if err != nil { 106 panic(err) 107 } 108 var m map[string]interface{} 109 if err := json.Unmarshal(d, &m); err != nil { 110 panic(err) 111 } 112 return mapToLogFields(m, fields, "") 113 } 114 115 func mapToLogFields(m map[string]interface{}, fields []interface{}, prefix string) []interface{} { 116 keys := make([]string, 0, len(m)) 117 for k := range m { 118 keys = append(keys, k) 119 } 120 sort.Strings(keys) 121 for _, k := range keys { 122 switch m[k].(type) { 123 case map[string]interface{}: 124 fields = mapToLogFields(m[k].(map[string]interface{}), fields, prefix+k+"_") 125 default: 126 fields = append(fields, prefix+k, m[k]) 127 } 128 } 129 return fields 130 } 131 132 func httpGetOrPost(spanName string) bool { 133 return strings.HasPrefix(spanName, "HTTP GET - ") || strings.HasPrefix(spanName, "HTTP POST - ") 134 } 135 136 func parseTrace(ctx context.Context, traceID string, data []byte) (*TraceResult, error) { 137 var traceResult TraceResult 138 traceResult.TraceID = traceID 139 140 // Read the existing address book. 141 trace := &tempopb.Trace{} 142 143 if err := trace.UnmarshalVT(data); err != nil { 144 return nil, fmt.Errorf("failed to parse trace: %w", err) 145 } 146 147 setK8SMeta := func(attrs []*commonv1.KeyValue) { 148 for _, attr := range attrs { 149 if len(traceResult.Cluster) == 0 && attr.Key == "cluster" { 150 traceResult.Cluster = attr.Value.GetStringValue() 151 } 152 if len(traceResult.Namespace) == 0 && attr.Key == "namespace" { 153 traceResult.Namespace = attr.Value.GetStringValue() 154 } 155 } 156 } 157 158 var spansByServiceName = map[string][]*tracev1.Span{ 159 "grafana": nil, 160 "pyroscope-query-frontend": nil, 161 "pyroscope-gateway": nil, 162 "pyroscope-ingester": nil, 163 "pyroscope-store-gateway": nil, 164 } 165 for _, batch := range trace.Batches { 166 if ctx.Err() != nil { 167 return nil, ctx.Err() 168 } 169 for _, attr := range batch.Resource.Attributes { 170 if attr.Key == "service.name" { 171 // check by service.name map 172 serviceName := attr.Value.GetStringValue() 173 _, ok := spansByServiceName[attr.Value.GetStringValue()] 174 if ok { 175 // add spans to the map 176 for _, span := range batch.ScopeSpans { 177 spansByServiceName[serviceName] = append(spansByServiceName[serviceName], span.Spans...) 178 } 179 } 180 181 if serviceName == "pyroscope-gateway" { 182 setK8SMeta(batch.Resource.Attributes) 183 } 184 } 185 } 186 } 187 188 // find gateway span for duration, status code and response size 189 firstFound := false 190 for _, span := range spansByServiceName["pyroscope-gateway"] { 191 if ctx.Err() != nil { 192 return nil, ctx.Err() 193 } 194 if !firstFound && httpGetOrPost(span.Name) { 195 //traceResult.TraceID = span.TraceId 196 traceResult.StartTime = time.Unix(0, int64(span.StartTimeUnixNano)) 197 traceResult.EndTime = time.Unix(0, int64(span.EndTimeUnixNano)) 198 traceResult.Duration = Duration(traceResult.EndTime.Sub(traceResult.StartTime)) 199 200 for _, attr := range span.Attributes { 201 if attr.Key == "http.status_code" { 202 traceResult.Status = int(attr.Value.GetIntValue()) 203 } 204 if attr.Key == "http.response_size" { 205 traceResult.ResponseSize = int(attr.Value.GetIntValue()) 206 } 207 } 208 firstFound = true 209 continue 210 } 211 if span.Name == "Auth/Authenticate" { 212 for _, attr := range span.Attributes { 213 if attr.Key == "instance_id" { 214 traceResult.Grafana.TenantID = SensitiveString(attr.Value.GetStringValue()) 215 } 216 } 217 } 218 } 219 220 // get the query parameters 221 firstFound = false 222 timeFormat := "2006-01-02 15:04:05.999 -0700 MST" 223 for _, span := range spansByServiceName["pyroscope-query-frontend"] { 224 if ctx.Err() != nil { 225 return nil, ctx.Err() 226 } 227 if !firstFound && httpGetOrPost(span.Name) { 228 for _, attr := range span.Attributes { 229 if attr.Key == "http.url" { 230 method := strings.SplitN(attr.Value.GetStringValue(), "?", 2) 231 traceResult.Query.Method = method[0] 232 } 233 if attr.Key == "start" { 234 traceResult.Query.StartTime, _ = time.Parse(timeFormat, attr.Value.GetStringValue()) 235 } 236 if attr.Key == "end" { 237 traceResult.Query.EndTime, _ = time.Parse(timeFormat, attr.Value.GetStringValue()) 238 } 239 if attr.Key == "selector" { 240 traceResult.Query.Selector = SensitiveString(attr.Value.GetStringValue()) 241 } 242 if attr.Key == "profile_type" { 243 traceResult.Query.ProfileType = attr.Value.GetStringValue() 244 } 245 } 246 firstFound = true 247 } 248 } 249 if !traceResult.Query.StartTime.IsZero() && !traceResult.Query.EndTime.IsZero() { 250 traceResult.Query.Duration = Duration(traceResult.Query.EndTime.Sub(traceResult.Query.StartTime)) 251 } 252 253 // get the user name from grafana 254 for _, span := range spansByServiceName["grafana"] { 255 if ctx.Err() != nil { 256 return nil, ctx.Err() 257 } 258 if span.Name == "PluginClient.queryData" || span.Name == "PluginClient.callResource" { 259 for _, attr := range span.Attributes { 260 if attr.Key == "user" { 261 traceResult.Grafana.User = SensitiveString(attr.Value.GetStringValue()) 262 } 263 if attr.Key == "plugin_id" { 264 traceResult.Grafana.Plugin = attr.Value.GetStringValue() 265 } 266 if attr.Key == "datasource_name" { 267 traceResult.Grafana.Datasource = SensitiveString(attr.Value.GetStringValue()) 268 } 269 } 270 break 271 } 272 } 273 274 storeSpans := append(spansByServiceName["pyroscope-ingester"], spansByServiceName["pyroscope-store-gateway"]...) 275 found := false 276 for _, span := range storeSpans { 277 if ctx.Err() != nil { 278 return nil, ctx.Err() 279 } 280 if span.Name == "RepeatedRowColumnIterator" { 281 found = false 282 for _, attr := range span.Attributes { 283 if attr.Key == "column" && attr.Value.GetStringValue() == "Samples.list.element.Value" { 284 found = true 285 break 286 } 287 } 288 if !found { 289 continue 290 } 291 for _, ev := range span.Events { 292 for _, attr := range ev.Attributes { 293 if attr.Key == "rows_fetched" { 294 traceResult.Stats.ProfilesFetched += int(attr.Value.GetIntValue()) 295 } 296 if attr.Key == "rows_read" { 297 traceResult.Stats.Profiles += int(attr.Value.GetIntValue()) 298 } 299 } 300 } 301 } 302 } 303 304 return &traceResult, nil 305 } 306 307 func queryTracer(ctx context.Context, params *queryTracerParams) (err error) { 308 if len(params.File) > 0 { 309 fields := make([]interface{}, 0, 64) 310 311 for _, f := range params.File { 312 data, err := os.ReadFile(f) 313 if err != nil { 314 return err 315 } 316 result, err := parseTrace(ctx, "", data) 317 if err != nil { 318 return err 319 } 320 b, _ := json.Marshal(result) 321 mapD := map[string]interface{}{} 322 if err := json.Unmarshal(b, &mapD); err != nil { 323 return err 324 } 325 326 fields = fields[:4] 327 fields[0] = "msg" 328 fields[1] = "query successfully traced" 329 fields[2] = "file" 330 fields[3] = f 331 fields = result.LogFields(fields) 332 level.Info(logger).Log(fields...) 333 } 334 return nil 335 } 336 337 client := http.DefaultClient 338 end := time.Now().Add(-params.Delay) 339 start := end.Add(-params.Interval) 340 341 traceQL := `{ 342 ( span.http.url= "/querier.v1.QuerierService/SelectMergeStacktraces" || 343 span.http.url= "/querier.v1.QuerierService/SelectMergeProfile" || 344 name= "HTTP GET - pyroscope_render_diff" || 345 name= "HTTP GET - pyroscope_render" 346 ) && span.profile_type!="deadmans_switch:made_up:profilos:made_up:profilos" 347 }` 348 349 tempoURL, err := url.Parse(params.TempoURL) 350 if err != nil { 351 return err 352 } 353 originalPath := tempoURL.Path 354 355 tempoURL.Path = filepath.Join(originalPath, "/api/search") 356 357 level.Info(logger).Log("msg", "search for matching traces", "start", start, "end", end) 358 359 tempoParams := url.Values{} 360 tempoParams.Add("start", strconv.FormatInt(start.Unix(), 10)) 361 tempoParams.Add("end", strconv.FormatInt(end.Unix(), 10)) 362 tempoParams.Add("limit", "10000") 363 tempoParams.Add("q", traceQL) 364 tempoURL.RawQuery = tempoParams.Encode() 365 366 req, err := http.NewRequestWithContext(ctx, "GET", tempoURL.String(), nil) 367 if err != nil { 368 return err 369 } 370 req.Header.Set("User-Agent", userAgent) 371 372 resp, err := client.Do(req) 373 if err != nil { 374 return err 375 } 376 defer resp.Body.Close() 377 378 if resp.StatusCode != http.StatusOK { 379 body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) 380 return fmt.Errorf("tempo returned status code [%d]: %s ", resp.StatusCode, body) 381 } 382 383 var respBody struct { 384 Traces []struct { 385 TraceID string `json:"traceID"` 386 RootServiceName string `json:"rootServiceName"` 387 RootTraceName string `json:"rootTraceName"` 388 StartTimeUnixNano string `json:"startTimeUnixNano"` 389 DurationMs int64 `json:"durationMs"` 390 } `json:"traces"` 391 } 392 393 if err := json.NewDecoder(resp.Body).Decode(&respBody); err != nil { 394 return err 395 } 396 397 level.Info(logger).Log("msg", fmt.Sprintf("found %d matching traces", len(respBody.Traces))) 398 399 // now fetch traces in parallel 400 g, gctx := errgroup.WithContext(ctx) 401 g.SetLimit(runtime.GOMAXPROCS(-1)) 402 403 results := make(chan *TraceResult) 404 var wg sync.WaitGroup 405 wg.Add(1) 406 go func() { 407 defer wg.Done() 408 fields := make([]interface{}, 0, 64) 409 for r := range results { 410 fields = fields[:2] 411 fields[0] = "msg" 412 fields[1] = "query successfully traced" 413 fields = r.LogFields(fields) 414 level.Info(logger).Log(fields...) 415 } 416 }() 417 defer wg.Wait() 418 defer close(results) 419 420 for _, t := range respBody.Traces { 421 startTimeUnixNanos, err := strconv.ParseInt(t.StartTimeUnixNano, 10, 64) 422 if err != nil { 423 return err 424 } 425 426 start = time.Unix(0, startTimeUnixNanos) 427 end = start.Add(time.Duration(t.DurationMs) * time.Millisecond) 428 429 tempoURL.Path = filepath.Join(originalPath, "/api/traces", t.TraceID) 430 tempoParams := url.Values{} 431 tempoParams.Add("start", strconv.FormatInt(start.Unix(), 10)) 432 tempoParams.Add("end", strconv.FormatInt(end.Unix()+1, 10)) 433 tempoParams.Add("limit", "10000") 434 tempoParams.Add("q", traceQL) 435 tempoURL.RawQuery = tempoParams.Encode() 436 437 var ( 438 url = tempoURL.String() 439 traceID = t.TraceID 440 ) 441 g.Go(func() error { 442 req, err := http.NewRequestWithContext(gctx, "GET", url, nil) 443 if err != nil { 444 return err 445 } 446 req.Header.Set("Accept", "application/protobuf") 447 req.Header.Set("User-Agent", userAgent) 448 449 resp, err := client.Do(req) 450 if err != nil { 451 return err 452 } 453 defer resp.Body.Close() 454 455 if resp.StatusCode != http.StatusOK { 456 body, _ := io.ReadAll(io.LimitReader(resp.Body, 1024)) 457 return fmt.Errorf("tempo returned status code [%d]: %s ", resp.StatusCode, body) 458 } 459 460 data, err := io.ReadAll(resp.Body) 461 if err != nil { 462 return err 463 } 464 465 result, err := parseTrace(gctx, traceID, data) 466 if err != nil { 467 level.Error(logger).Log("msg", "failed to parse trace", "trace_id", traceID, "err", err) 468 } 469 470 results <- result 471 472 return nil 473 }) 474 } 475 476 return g.Wait() 477 }