github.com/grafana/pyroscope@v1.18.0/pkg/querier/select_merge.go (about) 1 package querier 2 3 import ( 4 "context" 5 "fmt" 6 "math" 7 "sync" 8 "time" 9 10 "github.com/grafana/dskit/multierror" 11 "github.com/opentracing/opentracing-go" 12 otlog "github.com/opentracing/opentracing-go/log" 13 "github.com/samber/lo" 14 "golang.org/x/sync/errgroup" 15 16 googlev1 "github.com/grafana/pyroscope/api/gen/proto/go/google/v1" 17 ingestv1 "github.com/grafana/pyroscope/api/gen/proto/go/ingester/v1" 18 typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1" 19 "github.com/grafana/pyroscope/pkg/clientpool" 20 "github.com/grafana/pyroscope/pkg/iter" 21 phlaremodel "github.com/grafana/pyroscope/pkg/model" 22 "github.com/grafana/pyroscope/pkg/pprof" 23 "github.com/grafana/pyroscope/pkg/util" 24 "github.com/grafana/pyroscope/pkg/util/loser" 25 ) 26 27 type ProfileWithLabels struct { 28 Timestamp int64 29 Fingerprint uint64 30 IngesterAddr string 31 phlaremodel.Labels 32 } 33 34 type BidiClientMerge[Req any, Res any] interface { 35 Send(Req) error 36 Receive() (Res, error) 37 CloseRequest() error 38 CloseResponse() error 39 } 40 41 type Request interface { 42 *ingestv1.MergeProfilesStacktracesRequest | 43 *ingestv1.MergeProfilesLabelsRequest | 44 *ingestv1.MergeProfilesPprofRequest | 45 *ingestv1.MergeSpanProfileRequest 46 } 47 48 type Response interface { 49 *ingestv1.MergeProfilesStacktracesResponse | 50 *ingestv1.MergeProfilesLabelsResponse | 51 *ingestv1.MergeProfilesPprofResponse | 52 *ingestv1.MergeSpanProfileResponse 53 } 54 55 type MergeResult[R any] interface { 56 Result() (R, error) 57 } 58 type MergeIterator interface { 59 iter.Iterator[*ProfileWithLabels] 60 Keep() 61 } 62 63 type keepResponse struct { 64 *ingestv1.MergeProfilesStacktracesRequest 65 *ingestv1.MergeProfilesLabelsRequest 66 *ingestv1.MergeProfilesPprofRequest 67 *ingestv1.MergeSpanProfileRequest 68 } 69 type mergeIterator[R any, Req Request, Res Response] struct { 70 ctx context.Context 71 bidi BidiClientMerge[Req, Res] 72 73 err error 74 curr *ingestv1.ProfileSets 75 currIdx int 76 keep []bool 77 keepSent bool // keepSent is true if we have sent the keep request to the ingester. 78 79 currentProfile *ProfileWithLabels 80 81 response keepResponse 82 } 83 84 // NewMergeIterator return a new iterator that stream profiles and allows to filter them using `Keep` to keep 85 // only a subset of the profiles for an aggregation result. 86 // Merging or querying profiles sample values is expensive, we only merge the sample of the profiles that are kept. 87 // On creating the iterator, we send a request to ingesters to fetch the first batch. 88 func NewMergeIterator[ 89 R any, 90 Req Request, 91 Res Response, 92 ](ctx context.Context, r ResponseFromReplica[BidiClientMerge[Req, Res]], 93 ) *mergeIterator[R, Req, Res] { 94 it := &mergeIterator[R, Req, Res]{ 95 bidi: r.response, 96 keepSent: true, // at the start we don't send a keep request. 97 ctx: ctx, 98 currentProfile: &ProfileWithLabels{IngesterAddr: r.addr}, 99 currIdx: -1, 100 response: keepResponse{ 101 MergeProfilesStacktracesRequest: &ingestv1.MergeProfilesStacktracesRequest{}, 102 MergeProfilesLabelsRequest: &ingestv1.MergeProfilesLabelsRequest{}, 103 MergeProfilesPprofRequest: &ingestv1.MergeProfilesPprofRequest{}, 104 MergeSpanProfileRequest: &ingestv1.MergeSpanProfileRequest{}, 105 }, 106 } 107 it.fetchBatch() 108 return it 109 } 110 111 func (s *mergeIterator[R, Req, Res]) Next() bool { 112 if s.curr == nil || len(s.curr.Profiles) == 0 { 113 return false 114 } 115 if s.currIdx >= len(s.curr.Profiles)-1 { 116 if !s.keepSent { 117 var err error 118 switch bidi := (s.bidi).(type) { 119 case BidiClientMerge[*ingestv1.MergeProfilesStacktracesRequest, *ingestv1.MergeProfilesStacktracesResponse]: 120 s.response.MergeProfilesStacktracesRequest.Profiles = s.keep 121 err = bidi.Send(s.response.MergeProfilesStacktracesRequest) 122 case BidiClientMerge[*ingestv1.MergeProfilesLabelsRequest, *ingestv1.MergeProfilesLabelsResponse]: 123 s.response.MergeProfilesLabelsRequest.Profiles = s.keep 124 err = bidi.Send(s.response.MergeProfilesLabelsRequest) 125 case BidiClientMerge[*ingestv1.MergeProfilesPprofRequest, *ingestv1.MergeProfilesPprofResponse]: 126 s.response.MergeProfilesPprofRequest.Profiles = s.keep 127 err = bidi.Send(s.response.MergeProfilesPprofRequest) 128 case BidiClientMerge[*ingestv1.MergeSpanProfileRequest, *ingestv1.MergeSpanProfileResponse]: 129 s.response.MergeSpanProfileRequest.Profiles = s.keep 130 err = bidi.Send(s.response.MergeSpanProfileRequest) 131 } 132 if err != nil { 133 s.err = err 134 return false 135 } 136 } 137 s.fetchBatch() 138 if s.curr == nil || len(s.curr.Profiles) == 0 { 139 return false 140 } 141 s.currIdx = 0 142 s.setCurrentProfile() 143 return true 144 } 145 s.currIdx++ 146 s.setCurrentProfile() 147 return true 148 } 149 150 func (s *mergeIterator[R, Req, Res]) setCurrentProfile() { 151 p := s.curr.Profiles[s.currIdx] 152 s.currentProfile.Timestamp = p.Timestamp 153 if len(s.curr.LabelsSets) > 0 { 154 s.currentProfile.Labels = s.curr.LabelsSets[p.LabelIndex].Labels 155 } 156 if len(s.curr.Fingerprints) > 0 { 157 s.currentProfile.Fingerprint = s.curr.Fingerprints[p.LabelIndex] 158 } 159 } 160 161 func (s *mergeIterator[R, Req, Res]) fetchBatch() { 162 var selectedProfiles *ingestv1.ProfileSets 163 switch bidi := (s.bidi).(type) { 164 case BidiClientMerge[*ingestv1.MergeProfilesStacktracesRequest, *ingestv1.MergeProfilesStacktracesResponse]: 165 res, err := bidi.Receive() 166 if err != nil { 167 s.err = err 168 return 169 } 170 selectedProfiles = res.SelectedProfiles 171 case BidiClientMerge[*ingestv1.MergeProfilesLabelsRequest, *ingestv1.MergeProfilesLabelsResponse]: 172 res, err := bidi.Receive() 173 if err != nil { 174 s.err = err 175 return 176 } 177 selectedProfiles = res.SelectedProfiles 178 case BidiClientMerge[*ingestv1.MergeProfilesPprofRequest, *ingestv1.MergeProfilesPprofResponse]: 179 res, err := bidi.Receive() 180 if err != nil { 181 s.err = err 182 return 183 } 184 selectedProfiles = res.SelectedProfiles 185 case BidiClientMerge[*ingestv1.MergeSpanProfileRequest, *ingestv1.MergeSpanProfileResponse]: 186 res, err := bidi.Receive() 187 if err != nil { 188 s.err = err 189 return 190 } 191 selectedProfiles = res.SelectedProfiles 192 } 193 s.curr = selectedProfiles 194 if s.curr == nil { 195 return 196 } 197 if len(s.curr.Profiles) > cap(s.keep) { 198 s.keep = make([]bool, len(s.curr.Profiles)) 199 } 200 s.keep = s.keep[:len(s.curr.Profiles)] 201 // reset selections to none 202 for i := range s.keep { 203 s.keep[i] = false 204 } 205 s.keepSent = false 206 } 207 208 func (s *mergeIterator[R, Req, Res]) Keep() { 209 s.keep[s.currIdx] = true 210 } 211 212 func (s *mergeIterator[R, Req, Res]) At() *ProfileWithLabels { 213 return s.currentProfile 214 } 215 216 func (s *mergeIterator[R, Req, Res]) Result() (R, error) { 217 res, err := s.bidi.Receive() 218 if err != nil { 219 s.err = err 220 return *new(R), err 221 } 222 switch result := any(res).(type) { 223 case *ingestv1.MergeProfilesStacktracesResponse: 224 return any(result.Result).(R), nil 225 case *ingestv1.MergeProfilesLabelsResponse: 226 return any(result.Series).(R), nil 227 case *ingestv1.MergeProfilesPprofResponse: 228 return any(result.Result).(R), nil 229 case *ingestv1.MergeSpanProfileResponse: 230 return any(result.Result).(R), nil 231 default: 232 return *new(R), fmt.Errorf("unexpected response type %T", result) 233 } 234 } 235 236 func (s *mergeIterator[R, Req, Res]) Err() error { 237 return s.err 238 } 239 240 func (s *mergeIterator[R, Req, Res]) Close() error { 241 // Only close the Send side since we need to get the final result. 242 var errs multierror.MultiError 243 if err := s.bidi.CloseRequest(); err != nil { 244 errs = append(errs, err) 245 } 246 return errs.Err() 247 } 248 249 // skipDuplicates iterates through the iterator and skip duplicates. 250 func skipDuplicates(ctx context.Context, its []MergeIterator) error { 251 span, _ := opentracing.StartSpanFromContext(ctx, "skipDuplicates") 252 defer span.Finish() 253 var errors multierror.MultiError 254 tree := loser.New(its, 255 &ProfileWithLabels{ 256 Timestamp: math.MaxInt64, 257 }, 258 func(s MergeIterator) *ProfileWithLabels { 259 return s.At() 260 }, 261 func(p1, p2 *ProfileWithLabels) bool { 262 return p1.Timestamp <= p2.Timestamp 263 }, 264 func(s MergeIterator) { 265 if err := s.Close(); err != nil { 266 errors.Add(err) 267 } 268 }) 269 270 defer tree.Close() 271 // We rely on the fact that profiles are ordered by timestamp. 272 // In order to deduplicate profiles, we only keep the first profile 273 // with a given fingerprint for a given timestamp. 274 fingerprints := newTimestampedFingerprints() 275 duplicates := 0 276 total := 0 277 for tree.Next() { 278 next := tree.Winner() 279 profile := next.At() 280 total++ 281 fingerprint := profile.Fingerprint 282 if fingerprint == 0 && len(profile.Labels) > 0 { 283 fingerprint = profile.Hash() 284 } 285 if fingerprints.keep(profile.Timestamp, fingerprint) { 286 next.Keep() 287 continue 288 } 289 duplicates++ 290 } 291 span.LogFields(otlog.Int("duplicates", duplicates)) 292 span.LogFields(otlog.Int("total", total)) 293 if err := tree.Err(); err != nil { 294 errors.Add(err) 295 } 296 297 return errors.Err() 298 } 299 300 func newTimestampedFingerprints() *timestampedFingerprints { 301 return ×tampedFingerprints{ 302 timestamp: math.MaxInt64, 303 fingerprints: make(map[uint64]struct{}), 304 } 305 } 306 307 type timestampedFingerprints struct { 308 timestamp int64 309 fingerprints map[uint64]struct{} 310 } 311 312 // keep reports whether the profile has unique fingerprint for the timestamp. 313 func (p *timestampedFingerprints) keep(ts int64, fingerprint uint64) bool { 314 if p.timestamp != ts { 315 p.reset(ts, fingerprint) 316 return true 317 } 318 return !p.fingerprintSeen(fingerprint) 319 } 320 321 func (p *timestampedFingerprints) reset(ts int64, fingerprint uint64) { 322 p.timestamp = ts 323 clear(p.fingerprints) 324 p.fingerprints[fingerprint] = struct{}{} 325 } 326 327 func (p *timestampedFingerprints) fingerprintSeen(fingerprint uint64) (seen bool) { 328 _, seen = p.fingerprints[fingerprint] 329 if seen { 330 return true 331 } 332 p.fingerprints[fingerprint] = struct{}{} 333 return false 334 } 335 336 // selectMergeTree selects the profile from each ingester by deduping them and 337 // returns merge of stacktrace samples represented as a tree. 338 func selectMergeTree(ctx context.Context, responses []ResponseFromReplica[clientpool.BidiClientMergeProfilesStacktraces]) (*phlaremodel.Tree, error) { 339 span, ctx := opentracing.StartSpanFromContext(ctx, "selectMergeTree") 340 defer span.Finish() 341 342 mergeResults := make([]MergeResult[*ingestv1.MergeProfilesStacktracesResult], len(responses)) 343 iters := make([]MergeIterator, len(responses)) 344 var wg sync.WaitGroup 345 for i, resp := range responses { 346 wg.Add(1) 347 go func(i int, resp ResponseFromReplica[clientpool.BidiClientMergeProfilesStacktraces]) { 348 defer wg.Done() 349 it := NewMergeIterator[*ingestv1.MergeProfilesStacktracesResult]( 350 ctx, ResponseFromReplica[BidiClientMerge[*ingestv1.MergeProfilesStacktracesRequest, *ingestv1.MergeProfilesStacktracesResponse]]{ 351 addr: resp.addr, 352 response: resp.response, 353 }) 354 iters[i] = it 355 mergeResults[i] = it 356 }(i, resp) 357 } 358 wg.Wait() 359 360 if err := skipDuplicates(ctx, iters); err != nil { 361 return nil, err 362 } 363 364 // Collects the results in parallel. 365 span.LogFields(otlog.String("msg", "collecting merge results")) 366 g, _ := errgroup.WithContext(ctx) 367 m := phlaremodel.NewTreeMerger() 368 sm := phlaremodel.NewStackTraceMerger() 369 for _, iter := range mergeResults { 370 iter := iter 371 g.Go(util.RecoverPanic(func() error { 372 result, err := iter.Result() 373 if err != nil || result == nil { 374 return err 375 } 376 switch result.Format { 377 default: 378 return fmt.Errorf("unknown merge result format") 379 case ingestv1.StacktracesMergeFormat_MERGE_FORMAT_STACKTRACES: 380 sm.MergeStackTraces(result.Stacktraces, result.FunctionNames) 381 case ingestv1.StacktracesMergeFormat_MERGE_FORMAT_TREE: 382 err = m.MergeTreeBytes(result.TreeBytes) 383 } 384 return err 385 })) 386 } 387 if err := g.Wait(); err != nil { 388 return nil, err 389 } 390 if sm.Size() > 0 { 391 // For backward compatibility: during a rollout, multiple formats 392 // may coexist for some period of time (efficiency is not a concern). 393 if err := m.MergeTreeBytes(sm.TreeBytes(-1)); err != nil { 394 return nil, err 395 } 396 } 397 398 span.LogFields(otlog.String("msg", "building tree")) 399 return m.Tree(), nil 400 } 401 402 // selectMergePprofProfile selects the profile from each ingester by deduping them and request merges of stacktraces in the pprof format. 403 func selectMergePprofProfile(ctx context.Context, ty *typesv1.ProfileType, responses []ResponseFromReplica[clientpool.BidiClientMergeProfilesPprof]) (*googlev1.Profile, error) { 404 mergeResults := make([]MergeResult[[]byte], len(responses)) 405 iters := make([]MergeIterator, len(responses)) 406 var wg sync.WaitGroup 407 for i, resp := range responses { 408 wg.Add(1) 409 go func(i int, resp ResponseFromReplica[clientpool.BidiClientMergeProfilesPprof]) { 410 defer wg.Done() 411 it := NewMergeIterator[[]byte]( 412 ctx, ResponseFromReplica[BidiClientMerge[*ingestv1.MergeProfilesPprofRequest, *ingestv1.MergeProfilesPprofResponse]]{ 413 addr: resp.addr, 414 response: resp.response, 415 }) 416 iters[i] = it 417 mergeResults[i] = it 418 }(i, resp) 419 } 420 wg.Wait() 421 422 if err := skipDuplicates(ctx, iters); err != nil { 423 return nil, err 424 } 425 426 span := opentracing.SpanFromContext(ctx) 427 var pprofMerge pprof.ProfileMerge 428 g, _ := errgroup.WithContext(ctx) 429 for _, iter := range mergeResults { 430 iter := iter 431 g.Go(util.RecoverPanic(func() error { 432 start := time.Now() 433 result, err := iter.Result() 434 if err != nil || result == nil { 435 return err 436 } 437 if span != nil { 438 span.LogFields( 439 otlog.Int("profile_size", len(result)), 440 otlog.Int64("took_ms", time.Since(start).Milliseconds()), 441 ) 442 } 443 var p googlev1.Profile 444 if err = pprof.Unmarshal(result, &p); err != nil { 445 return err 446 } 447 return pprofMerge.Merge(&p, true) 448 })) 449 } 450 if err := g.Wait(); err != nil { 451 return nil, err 452 } 453 454 p := pprofMerge.Profile() 455 if len(p.Sample) == 0 { 456 pprof.SetProfileMetadata(p, ty, 0, 0) 457 } 458 return p, nil 459 } 460 461 // selectMergeSeries selects the profile from each ingester by deduping them and request merges of total values. 462 func selectMergeSeries(ctx context.Context, aggregation *typesv1.TimeSeriesAggregationType, responses []ResponseFromReplica[clientpool.BidiClientMergeProfilesLabels]) (iter.Iterator[phlaremodel.TimeSeriesValue], error) { 463 mergeResults := make([]MergeResult[[]*typesv1.Series], len(responses)) 464 iters := make([]MergeIterator, len(responses)) 465 var wg sync.WaitGroup 466 for i, resp := range responses { 467 wg.Add(1) 468 go func(i int, resp ResponseFromReplica[clientpool.BidiClientMergeProfilesLabels]) { 469 defer wg.Done() 470 it := NewMergeIterator[[]*typesv1.Series]( 471 ctx, ResponseFromReplica[BidiClientMerge[*ingestv1.MergeProfilesLabelsRequest, *ingestv1.MergeProfilesLabelsResponse]]{ 472 addr: resp.addr, 473 response: resp.response, 474 }) 475 iters[i] = it 476 mergeResults[i] = it 477 }(i, resp) 478 } 479 wg.Wait() 480 481 if err := skipDuplicates(ctx, iters); err != nil { 482 return nil, err 483 } 484 485 // Collects the results in parallel. 486 results := make([][]*typesv1.Series, 0, len(iters)) 487 s := lo.Synchronize() 488 g, _ := errgroup.WithContext(ctx) 489 for _, iter := range mergeResults { 490 iter := iter 491 g.Go(util.RecoverPanic(func() error { 492 result, err := iter.Result() 493 if err != nil || result == nil { 494 return err 495 } 496 s.Do(func() { 497 results = append(results, result) 498 }) 499 return nil 500 })) 501 } 502 if err := g.Wait(); err != nil { 503 return nil, err 504 } 505 var series = phlaremodel.MergeSeries(aggregation, results...) 506 507 seriesIters := make([]iter.Iterator[phlaremodel.TimeSeriesValue], 0, len(series)) 508 for _, s := range series { 509 s := s 510 seriesIters = append(seriesIters, phlaremodel.NewSeriesIterator(s.Labels, s.Points)) 511 } 512 return phlaremodel.NewMergeIterator(phlaremodel.TimeSeriesValue{Ts: math.MaxInt64}, false, seriesIters...), nil 513 } 514 515 // selectMergeSpanProfile selects the profile from each ingester by deduping them and 516 // returns merge of stacktrace samples represented as a tree. 517 func selectMergeSpanProfile(ctx context.Context, responses []ResponseFromReplica[clientpool.BidiClientMergeSpanProfile]) (*phlaremodel.Tree, error) { 518 span, ctx := opentracing.StartSpanFromContext(ctx, "selectMergeSpanProfile") 519 defer span.Finish() 520 521 mergeResults := make([]MergeResult[*ingestv1.MergeSpanProfileResult], len(responses)) 522 iters := make([]MergeIterator, len(responses)) 523 var wg sync.WaitGroup 524 for i, resp := range responses { 525 wg.Add(1) 526 go func(i int, resp ResponseFromReplica[clientpool.BidiClientMergeSpanProfile]) { 527 defer wg.Done() 528 it := NewMergeIterator[*ingestv1.MergeSpanProfileResult]( 529 ctx, ResponseFromReplica[BidiClientMerge[*ingestv1.MergeSpanProfileRequest, *ingestv1.MergeSpanProfileResponse]]{ 530 addr: resp.addr, 531 response: resp.response, 532 }) 533 iters[i] = it 534 mergeResults[i] = it 535 }(i, resp) 536 } 537 wg.Wait() 538 539 if err := skipDuplicates(ctx, iters); err != nil { 540 return nil, err 541 } 542 543 // Collects the results in parallel. 544 span.LogFields(otlog.String("msg", "collecting merge results")) 545 g, _ := errgroup.WithContext(ctx) 546 m := phlaremodel.NewTreeMerger() 547 for _, iter := range mergeResults { 548 iter := iter 549 g.Go(util.RecoverPanic(func() error { 550 result, err := iter.Result() 551 if err != nil || result == nil { 552 return err 553 } 554 return m.MergeTreeBytes(result.TreeBytes) 555 })) 556 } 557 if err := g.Wait(); err != nil { 558 return nil, err 559 } 560 561 span.LogFields(otlog.String("msg", "building tree")) 562 return m.Tree(), nil 563 }