github.com/grafana/pyroscope@v1.18.0/pkg/querier/select_merge.go (about)

     1  package querier
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"math"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/grafana/dskit/multierror"
    11  	"github.com/opentracing/opentracing-go"
    12  	otlog "github.com/opentracing/opentracing-go/log"
    13  	"github.com/samber/lo"
    14  	"golang.org/x/sync/errgroup"
    15  
    16  	googlev1 "github.com/grafana/pyroscope/api/gen/proto/go/google/v1"
    17  	ingestv1 "github.com/grafana/pyroscope/api/gen/proto/go/ingester/v1"
    18  	typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1"
    19  	"github.com/grafana/pyroscope/pkg/clientpool"
    20  	"github.com/grafana/pyroscope/pkg/iter"
    21  	phlaremodel "github.com/grafana/pyroscope/pkg/model"
    22  	"github.com/grafana/pyroscope/pkg/pprof"
    23  	"github.com/grafana/pyroscope/pkg/util"
    24  	"github.com/grafana/pyroscope/pkg/util/loser"
    25  )
    26  
// ProfileWithLabels is the value exposed by a MergeIterator for the profile
// at its current position.
//
// Timestamp, Fingerprint and the embedded Labels are copied from the batch
// received from the replica; IngesterAddr is the address of the replica the
// iterator was created for. skipDuplicates identifies a profile by its
// Timestamp and Fingerprint, falling back to a hash of the labels when the
// fingerprint is zero and labels are present.
type ProfileWithLabels struct {
	Timestamp    int64
	Fingerprint  uint64
	IngesterAddr string
	phlaremodel.Labels
}
    33  
// BidiClientMerge is the client side of a bidirectional merge stream.
//
// In this file Receive is used to read both the profile batches and the final
// merge result, Send is used to send back the keep selection for a batch, and
// CloseRequest is used to close the sending side once iteration is done.
type BidiClientMerge[Req any, Res any] interface {
	Send(Req) error
	Receive() (Res, error)
	CloseRequest() error
	CloseResponse() error
}
    40  
// Request is the type constraint listing the merge request messages that
// mergeIterator knows how to send.
type Request interface {
	*ingestv1.MergeProfilesStacktracesRequest |
		*ingestv1.MergeProfilesLabelsRequest |
		*ingestv1.MergeProfilesPprofRequest |
		*ingestv1.MergeSpanProfileRequest
}
    47  
// Response is the type constraint listing the merge response messages that
// mergeIterator knows how to receive.
type Response interface {
	*ingestv1.MergeProfilesStacktracesResponse |
		*ingestv1.MergeProfilesLabelsResponse |
		*ingestv1.MergeProfilesPprofResponse |
		*ingestv1.MergeSpanProfileResponse
}
    54  
// MergeResult gives access to the final result of a merge stream.
// mergeIterator implements it by receiving one more message from the stream.
type MergeResult[R any] interface {
	Result() (R, error)
}
// MergeIterator iterates over profiles and lets the caller select, with Keep,
// the profile at the current position.
type MergeIterator interface {
	iter.Iterator[*ProfileWithLabels]
	Keep()
}
    62  
// keepResponse holds one reusable request message per supported stream type.
// Despite its name it contains requests: mergeIterator.Next fills the
// Profiles field of the message matching the stream with the keep selection
// and sends it.
type keepResponse struct {
	*ingestv1.MergeProfilesStacktracesRequest
	*ingestv1.MergeProfilesLabelsRequest
	*ingestv1.MergeProfilesPprofRequest
	*ingestv1.MergeSpanProfileRequest
}
// mergeIterator iterates over the profile batches received from a single
// merge stream and records which profiles of the current batch are kept.
//
// R is the type of the final merge result returned by Result; Req and Res are
// the request and response message types of the underlying stream.
type mergeIterator[R any, Req Request, Res Response] struct {
	ctx  context.Context
	bidi BidiClientMerge[Req, Res]

	err      error                // last error hit while sending or receiving; returned by Err.
	curr     *ingestv1.ProfileSets // batch currently being iterated; nil when no batch was received.
	currIdx  int                  // position in curr.Profiles; -1 until the first call to Next.
	keep     []bool               // keep[i] is true when curr.Profiles[i] has been selected with Keep.
	keepSent bool // keepSent is true if we have sent the keep request to the ingester.

	// currentProfile is the value returned by At; it is updated in place
	// every time the iterator advances.
	currentProfile *ProfileWithLabels

	// response holds the reusable request messages used to send keep.
	response keepResponse
}
    83  
    84  // NewMergeIterator return a new iterator that stream profiles and allows to filter them using `Keep` to keep
    85  // only a subset of the profiles for an aggregation result.
    86  // Merging or querying profiles sample values is expensive, we only merge the sample of the profiles that are kept.
    87  // On creating the iterator, we send a request to ingesters to fetch the first batch.
    88  func NewMergeIterator[
    89  	R any,
    90  	Req Request,
    91  	Res Response,
    92  ](ctx context.Context, r ResponseFromReplica[BidiClientMerge[Req, Res]],
    93  ) *mergeIterator[R, Req, Res] {
    94  	it := &mergeIterator[R, Req, Res]{
    95  		bidi:           r.response,
    96  		keepSent:       true, // at the start we don't send a keep request.
    97  		ctx:            ctx,
    98  		currentProfile: &ProfileWithLabels{IngesterAddr: r.addr},
    99  		currIdx:        -1,
   100  		response: keepResponse{
   101  			MergeProfilesStacktracesRequest: &ingestv1.MergeProfilesStacktracesRequest{},
   102  			MergeProfilesLabelsRequest:      &ingestv1.MergeProfilesLabelsRequest{},
   103  			MergeProfilesPprofRequest:       &ingestv1.MergeProfilesPprofRequest{},
   104  			MergeSpanProfileRequest:         &ingestv1.MergeSpanProfileRequest{},
   105  		},
   106  	}
   107  	it.fetchBatch()
   108  	return it
   109  }
   110  
   111  func (s *mergeIterator[R, Req, Res]) Next() bool {
   112  	if s.curr == nil || len(s.curr.Profiles) == 0 {
   113  		return false
   114  	}
   115  	if s.currIdx >= len(s.curr.Profiles)-1 {
   116  		if !s.keepSent {
   117  			var err error
   118  			switch bidi := (s.bidi).(type) {
   119  			case BidiClientMerge[*ingestv1.MergeProfilesStacktracesRequest, *ingestv1.MergeProfilesStacktracesResponse]:
   120  				s.response.MergeProfilesStacktracesRequest.Profiles = s.keep
   121  				err = bidi.Send(s.response.MergeProfilesStacktracesRequest)
   122  			case BidiClientMerge[*ingestv1.MergeProfilesLabelsRequest, *ingestv1.MergeProfilesLabelsResponse]:
   123  				s.response.MergeProfilesLabelsRequest.Profiles = s.keep
   124  				err = bidi.Send(s.response.MergeProfilesLabelsRequest)
   125  			case BidiClientMerge[*ingestv1.MergeProfilesPprofRequest, *ingestv1.MergeProfilesPprofResponse]:
   126  				s.response.MergeProfilesPprofRequest.Profiles = s.keep
   127  				err = bidi.Send(s.response.MergeProfilesPprofRequest)
   128  			case BidiClientMerge[*ingestv1.MergeSpanProfileRequest, *ingestv1.MergeSpanProfileResponse]:
   129  				s.response.MergeSpanProfileRequest.Profiles = s.keep
   130  				err = bidi.Send(s.response.MergeSpanProfileRequest)
   131  			}
   132  			if err != nil {
   133  				s.err = err
   134  				return false
   135  			}
   136  		}
   137  		s.fetchBatch()
   138  		if s.curr == nil || len(s.curr.Profiles) == 0 {
   139  			return false
   140  		}
   141  		s.currIdx = 0
   142  		s.setCurrentProfile()
   143  		return true
   144  	}
   145  	s.currIdx++
   146  	s.setCurrentProfile()
   147  	return true
   148  }
   149  
   150  func (s *mergeIterator[R, Req, Res]) setCurrentProfile() {
   151  	p := s.curr.Profiles[s.currIdx]
   152  	s.currentProfile.Timestamp = p.Timestamp
   153  	if len(s.curr.LabelsSets) > 0 {
   154  		s.currentProfile.Labels = s.curr.LabelsSets[p.LabelIndex].Labels
   155  	}
   156  	if len(s.curr.Fingerprints) > 0 {
   157  		s.currentProfile.Fingerprint = s.curr.Fingerprints[p.LabelIndex]
   158  	}
   159  }
   160  
   161  func (s *mergeIterator[R, Req, Res]) fetchBatch() {
   162  	var selectedProfiles *ingestv1.ProfileSets
   163  	switch bidi := (s.bidi).(type) {
   164  	case BidiClientMerge[*ingestv1.MergeProfilesStacktracesRequest, *ingestv1.MergeProfilesStacktracesResponse]:
   165  		res, err := bidi.Receive()
   166  		if err != nil {
   167  			s.err = err
   168  			return
   169  		}
   170  		selectedProfiles = res.SelectedProfiles
   171  	case BidiClientMerge[*ingestv1.MergeProfilesLabelsRequest, *ingestv1.MergeProfilesLabelsResponse]:
   172  		res, err := bidi.Receive()
   173  		if err != nil {
   174  			s.err = err
   175  			return
   176  		}
   177  		selectedProfiles = res.SelectedProfiles
   178  	case BidiClientMerge[*ingestv1.MergeProfilesPprofRequest, *ingestv1.MergeProfilesPprofResponse]:
   179  		res, err := bidi.Receive()
   180  		if err != nil {
   181  			s.err = err
   182  			return
   183  		}
   184  		selectedProfiles = res.SelectedProfiles
   185  	case BidiClientMerge[*ingestv1.MergeSpanProfileRequest, *ingestv1.MergeSpanProfileResponse]:
   186  		res, err := bidi.Receive()
   187  		if err != nil {
   188  			s.err = err
   189  			return
   190  		}
   191  		selectedProfiles = res.SelectedProfiles
   192  	}
   193  	s.curr = selectedProfiles
   194  	if s.curr == nil {
   195  		return
   196  	}
   197  	if len(s.curr.Profiles) > cap(s.keep) {
   198  		s.keep = make([]bool, len(s.curr.Profiles))
   199  	}
   200  	s.keep = s.keep[:len(s.curr.Profiles)]
   201  	// reset selections to none
   202  	for i := range s.keep {
   203  		s.keep[i] = false
   204  	}
   205  	s.keepSent = false
   206  }
   207  
// Keep marks the profile at the current position as selected in the keep
// mask of the current batch. Next sends the mask once the batch is consumed.
// The position is -1 until the first call to Next, so Keep must not be called
// before Next.
func (s *mergeIterator[R, Req, Res]) Keep() {
	s.keep[s.currIdx] = true
}
   211  
// At returns the profile at the current position. The same ProfileWithLabels
// value is returned on every call and is updated in place when the iterator
// advances.
func (s *mergeIterator[R, Req, Res]) At() *ProfileWithLabels {
	return s.currentProfile
}
   215  
   216  func (s *mergeIterator[R, Req, Res]) Result() (R, error) {
   217  	res, err := s.bidi.Receive()
   218  	if err != nil {
   219  		s.err = err
   220  		return *new(R), err
   221  	}
   222  	switch result := any(res).(type) {
   223  	case *ingestv1.MergeProfilesStacktracesResponse:
   224  		return any(result.Result).(R), nil
   225  	case *ingestv1.MergeProfilesLabelsResponse:
   226  		return any(result.Series).(R), nil
   227  	case *ingestv1.MergeProfilesPprofResponse:
   228  		return any(result.Result).(R), nil
   229  	case *ingestv1.MergeSpanProfileResponse:
   230  		return any(result.Result).(R), nil
   231  	default:
   232  		return *new(R), fmt.Errorf("unexpected response type %T", result)
   233  	}
   234  }
   235  
// Err returns the last error recorded while sending to or receiving from the
// stream, or nil if there was none.
func (s *mergeIterator[R, Req, Res]) Err() error {
	return s.err
}
   239  
   240  func (s *mergeIterator[R, Req, Res]) Close() error {
   241  	// Only close the Send side since we need to get the final result.
   242  	var errs multierror.MultiError
   243  	if err := s.bidi.CloseRequest(); err != nil {
   244  		errs = append(errs, err)
   245  	}
   246  	return errs.Err()
   247  }
   248  
   249  // skipDuplicates iterates through the iterator and skip duplicates.
   250  func skipDuplicates(ctx context.Context, its []MergeIterator) error {
   251  	span, _ := opentracing.StartSpanFromContext(ctx, "skipDuplicates")
   252  	defer span.Finish()
   253  	var errors multierror.MultiError
   254  	tree := loser.New(its,
   255  		&ProfileWithLabels{
   256  			Timestamp: math.MaxInt64,
   257  		},
   258  		func(s MergeIterator) *ProfileWithLabels {
   259  			return s.At()
   260  		},
   261  		func(p1, p2 *ProfileWithLabels) bool {
   262  			return p1.Timestamp <= p2.Timestamp
   263  		},
   264  		func(s MergeIterator) {
   265  			if err := s.Close(); err != nil {
   266  				errors.Add(err)
   267  			}
   268  		})
   269  
   270  	defer tree.Close()
   271  	// We rely on the fact that profiles are ordered by timestamp.
   272  	// In order to deduplicate profiles, we only keep the first profile
   273  	// with a given fingerprint for a given timestamp.
   274  	fingerprints := newTimestampedFingerprints()
   275  	duplicates := 0
   276  	total := 0
   277  	for tree.Next() {
   278  		next := tree.Winner()
   279  		profile := next.At()
   280  		total++
   281  		fingerprint := profile.Fingerprint
   282  		if fingerprint == 0 && len(profile.Labels) > 0 {
   283  			fingerprint = profile.Hash()
   284  		}
   285  		if fingerprints.keep(profile.Timestamp, fingerprint) {
   286  			next.Keep()
   287  			continue
   288  		}
   289  		duplicates++
   290  	}
   291  	span.LogFields(otlog.Int("duplicates", duplicates))
   292  	span.LogFields(otlog.Int("total", total))
   293  	if err := tree.Err(); err != nil {
   294  		errors.Add(err)
   295  	}
   296  
   297  	return errors.Err()
   298  }
   299  
   300  func newTimestampedFingerprints() *timestampedFingerprints {
   301  	return &timestampedFingerprints{
   302  		timestamp:    math.MaxInt64,
   303  		fingerprints: make(map[uint64]struct{}),
   304  	}
   305  }
   306  
   307  type timestampedFingerprints struct {
   308  	timestamp    int64
   309  	fingerprints map[uint64]struct{}
   310  }
   311  
   312  // keep reports whether the profile has unique fingerprint for the timestamp.
   313  func (p *timestampedFingerprints) keep(ts int64, fingerprint uint64) bool {
   314  	if p.timestamp != ts {
   315  		p.reset(ts, fingerprint)
   316  		return true
   317  	}
   318  	return !p.fingerprintSeen(fingerprint)
   319  }
   320  
   321  func (p *timestampedFingerprints) reset(ts int64, fingerprint uint64) {
   322  	p.timestamp = ts
   323  	clear(p.fingerprints)
   324  	p.fingerprints[fingerprint] = struct{}{}
   325  }
   326  
   327  func (p *timestampedFingerprints) fingerprintSeen(fingerprint uint64) (seen bool) {
   328  	_, seen = p.fingerprints[fingerprint]
   329  	if seen {
   330  		return true
   331  	}
   332  	p.fingerprints[fingerprint] = struct{}{}
   333  	return false
   334  }
   335  
   336  // selectMergeTree selects the  profile from each ingester by deduping them and
   337  // returns merge of stacktrace samples represented as a tree.
   338  func selectMergeTree(ctx context.Context, responses []ResponseFromReplica[clientpool.BidiClientMergeProfilesStacktraces]) (*phlaremodel.Tree, error) {
   339  	span, ctx := opentracing.StartSpanFromContext(ctx, "selectMergeTree")
   340  	defer span.Finish()
   341  
   342  	mergeResults := make([]MergeResult[*ingestv1.MergeProfilesStacktracesResult], len(responses))
   343  	iters := make([]MergeIterator, len(responses))
   344  	var wg sync.WaitGroup
   345  	for i, resp := range responses {
   346  		wg.Add(1)
   347  		go func(i int, resp ResponseFromReplica[clientpool.BidiClientMergeProfilesStacktraces]) {
   348  			defer wg.Done()
   349  			it := NewMergeIterator[*ingestv1.MergeProfilesStacktracesResult](
   350  				ctx, ResponseFromReplica[BidiClientMerge[*ingestv1.MergeProfilesStacktracesRequest, *ingestv1.MergeProfilesStacktracesResponse]]{
   351  					addr:     resp.addr,
   352  					response: resp.response,
   353  				})
   354  			iters[i] = it
   355  			mergeResults[i] = it
   356  		}(i, resp)
   357  	}
   358  	wg.Wait()
   359  
   360  	if err := skipDuplicates(ctx, iters); err != nil {
   361  		return nil, err
   362  	}
   363  
   364  	// Collects the results in parallel.
   365  	span.LogFields(otlog.String("msg", "collecting merge results"))
   366  	g, _ := errgroup.WithContext(ctx)
   367  	m := phlaremodel.NewTreeMerger()
   368  	sm := phlaremodel.NewStackTraceMerger()
   369  	for _, iter := range mergeResults {
   370  		iter := iter
   371  		g.Go(util.RecoverPanic(func() error {
   372  			result, err := iter.Result()
   373  			if err != nil || result == nil {
   374  				return err
   375  			}
   376  			switch result.Format {
   377  			default:
   378  				return fmt.Errorf("unknown merge result format")
   379  			case ingestv1.StacktracesMergeFormat_MERGE_FORMAT_STACKTRACES:
   380  				sm.MergeStackTraces(result.Stacktraces, result.FunctionNames)
   381  			case ingestv1.StacktracesMergeFormat_MERGE_FORMAT_TREE:
   382  				err = m.MergeTreeBytes(result.TreeBytes)
   383  			}
   384  			return err
   385  		}))
   386  	}
   387  	if err := g.Wait(); err != nil {
   388  		return nil, err
   389  	}
   390  	if sm.Size() > 0 {
   391  		// For backward compatibility: during a rollout, multiple formats
   392  		// may coexist for some period of time (efficiency is not a concern).
   393  		if err := m.MergeTreeBytes(sm.TreeBytes(-1)); err != nil {
   394  			return nil, err
   395  		}
   396  	}
   397  
   398  	span.LogFields(otlog.String("msg", "building tree"))
   399  	return m.Tree(), nil
   400  }
   401  
   402  // selectMergePprofProfile selects the  profile from each ingester by deduping them and request merges of stacktraces in the pprof format.
   403  func selectMergePprofProfile(ctx context.Context, ty *typesv1.ProfileType, responses []ResponseFromReplica[clientpool.BidiClientMergeProfilesPprof]) (*googlev1.Profile, error) {
   404  	mergeResults := make([]MergeResult[[]byte], len(responses))
   405  	iters := make([]MergeIterator, len(responses))
   406  	var wg sync.WaitGroup
   407  	for i, resp := range responses {
   408  		wg.Add(1)
   409  		go func(i int, resp ResponseFromReplica[clientpool.BidiClientMergeProfilesPprof]) {
   410  			defer wg.Done()
   411  			it := NewMergeIterator[[]byte](
   412  				ctx, ResponseFromReplica[BidiClientMerge[*ingestv1.MergeProfilesPprofRequest, *ingestv1.MergeProfilesPprofResponse]]{
   413  					addr:     resp.addr,
   414  					response: resp.response,
   415  				})
   416  			iters[i] = it
   417  			mergeResults[i] = it
   418  		}(i, resp)
   419  	}
   420  	wg.Wait()
   421  
   422  	if err := skipDuplicates(ctx, iters); err != nil {
   423  		return nil, err
   424  	}
   425  
   426  	span := opentracing.SpanFromContext(ctx)
   427  	var pprofMerge pprof.ProfileMerge
   428  	g, _ := errgroup.WithContext(ctx)
   429  	for _, iter := range mergeResults {
   430  		iter := iter
   431  		g.Go(util.RecoverPanic(func() error {
   432  			start := time.Now()
   433  			result, err := iter.Result()
   434  			if err != nil || result == nil {
   435  				return err
   436  			}
   437  			if span != nil {
   438  				span.LogFields(
   439  					otlog.Int("profile_size", len(result)),
   440  					otlog.Int64("took_ms", time.Since(start).Milliseconds()),
   441  				)
   442  			}
   443  			var p googlev1.Profile
   444  			if err = pprof.Unmarshal(result, &p); err != nil {
   445  				return err
   446  			}
   447  			return pprofMerge.Merge(&p, true)
   448  		}))
   449  	}
   450  	if err := g.Wait(); err != nil {
   451  		return nil, err
   452  	}
   453  
   454  	p := pprofMerge.Profile()
   455  	if len(p.Sample) == 0 {
   456  		pprof.SetProfileMetadata(p, ty, 0, 0)
   457  	}
   458  	return p, nil
   459  }
   460  
   461  // selectMergeSeries selects the  profile from each ingester by deduping them and request merges of total values.
   462  func selectMergeSeries(ctx context.Context, aggregation *typesv1.TimeSeriesAggregationType, responses []ResponseFromReplica[clientpool.BidiClientMergeProfilesLabels]) (iter.Iterator[phlaremodel.TimeSeriesValue], error) {
   463  	mergeResults := make([]MergeResult[[]*typesv1.Series], len(responses))
   464  	iters := make([]MergeIterator, len(responses))
   465  	var wg sync.WaitGroup
   466  	for i, resp := range responses {
   467  		wg.Add(1)
   468  		go func(i int, resp ResponseFromReplica[clientpool.BidiClientMergeProfilesLabels]) {
   469  			defer wg.Done()
   470  			it := NewMergeIterator[[]*typesv1.Series](
   471  				ctx, ResponseFromReplica[BidiClientMerge[*ingestv1.MergeProfilesLabelsRequest, *ingestv1.MergeProfilesLabelsResponse]]{
   472  					addr:     resp.addr,
   473  					response: resp.response,
   474  				})
   475  			iters[i] = it
   476  			mergeResults[i] = it
   477  		}(i, resp)
   478  	}
   479  	wg.Wait()
   480  
   481  	if err := skipDuplicates(ctx, iters); err != nil {
   482  		return nil, err
   483  	}
   484  
   485  	// Collects the results in parallel.
   486  	results := make([][]*typesv1.Series, 0, len(iters))
   487  	s := lo.Synchronize()
   488  	g, _ := errgroup.WithContext(ctx)
   489  	for _, iter := range mergeResults {
   490  		iter := iter
   491  		g.Go(util.RecoverPanic(func() error {
   492  			result, err := iter.Result()
   493  			if err != nil || result == nil {
   494  				return err
   495  			}
   496  			s.Do(func() {
   497  				results = append(results, result)
   498  			})
   499  			return nil
   500  		}))
   501  	}
   502  	if err := g.Wait(); err != nil {
   503  		return nil, err
   504  	}
   505  	var series = phlaremodel.MergeSeries(aggregation, results...)
   506  
   507  	seriesIters := make([]iter.Iterator[phlaremodel.TimeSeriesValue], 0, len(series))
   508  	for _, s := range series {
   509  		s := s
   510  		seriesIters = append(seriesIters, phlaremodel.NewSeriesIterator(s.Labels, s.Points))
   511  	}
   512  	return phlaremodel.NewMergeIterator(phlaremodel.TimeSeriesValue{Ts: math.MaxInt64}, false, seriesIters...), nil
   513  }
   514  
   515  // selectMergeSpanProfile selects the  profile from each ingester by deduping them and
   516  // returns merge of stacktrace samples represented as a tree.
   517  func selectMergeSpanProfile(ctx context.Context, responses []ResponseFromReplica[clientpool.BidiClientMergeSpanProfile]) (*phlaremodel.Tree, error) {
   518  	span, ctx := opentracing.StartSpanFromContext(ctx, "selectMergeSpanProfile")
   519  	defer span.Finish()
   520  
   521  	mergeResults := make([]MergeResult[*ingestv1.MergeSpanProfileResult], len(responses))
   522  	iters := make([]MergeIterator, len(responses))
   523  	var wg sync.WaitGroup
   524  	for i, resp := range responses {
   525  		wg.Add(1)
   526  		go func(i int, resp ResponseFromReplica[clientpool.BidiClientMergeSpanProfile]) {
   527  			defer wg.Done()
   528  			it := NewMergeIterator[*ingestv1.MergeSpanProfileResult](
   529  				ctx, ResponseFromReplica[BidiClientMerge[*ingestv1.MergeSpanProfileRequest, *ingestv1.MergeSpanProfileResponse]]{
   530  					addr:     resp.addr,
   531  					response: resp.response,
   532  				})
   533  			iters[i] = it
   534  			mergeResults[i] = it
   535  		}(i, resp)
   536  	}
   537  	wg.Wait()
   538  
   539  	if err := skipDuplicates(ctx, iters); err != nil {
   540  		return nil, err
   541  	}
   542  
   543  	// Collects the results in parallel.
   544  	span.LogFields(otlog.String("msg", "collecting merge results"))
   545  	g, _ := errgroup.WithContext(ctx)
   546  	m := phlaremodel.NewTreeMerger()
   547  	for _, iter := range mergeResults {
   548  		iter := iter
   549  		g.Go(util.RecoverPanic(func() error {
   550  			result, err := iter.Result()
   551  			if err != nil || result == nil {
   552  				return err
   553  			}
   554  			return m.MergeTreeBytes(result.TreeBytes)
   555  		}))
   556  	}
   557  	if err := g.Wait(); err != nil {
   558  		return nil, err
   559  	}
   560  
   561  	span.LogFields(otlog.String("msg", "building tree"))
   562  	return m.Tree(), nil
   563  }