github.com/grafana/pyroscope@v1.18.0/pkg/phlaredb/block_querier.go (about)

     1  package phlaredb
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"fmt"
     7  	"io"
     8  	"math"
     9  	"path/filepath"
    10  	"slices"
    11  	"sort"
    12  	"strings"
    13  	"sync"
    14  	"time"
    15  
    16  	"connectrpc.com/connect"
    17  	"github.com/go-kit/log"
    18  	"github.com/go-kit/log/level"
    19  	"github.com/gogo/status"
    20  	"github.com/grafana/dskit/multierror"
    21  	"github.com/grafana/dskit/runutil"
    22  	"github.com/oklog/ulid/v2"
    23  	"github.com/opentracing/opentracing-go"
    24  	otlog "github.com/opentracing/opentracing-go/log"
    25  	"github.com/parquet-go/parquet-go"
    26  	"github.com/pkg/errors"
    27  	"github.com/prometheus/common/model"
    28  	"github.com/prometheus/prometheus/promql/parser"
    29  	"github.com/prometheus/prometheus/storage"
    30  	"github.com/samber/lo"
    31  	"golang.org/x/sync/errgroup"
    32  	"google.golang.org/grpc/codes"
    33  
    34  	profilev1 "github.com/grafana/pyroscope/api/gen/proto/go/google/v1"
    35  	ingestv1 "github.com/grafana/pyroscope/api/gen/proto/go/ingester/v1"
    36  	typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1"
    37  	"github.com/grafana/pyroscope/pkg/iter"
    38  	phlaremodel "github.com/grafana/pyroscope/pkg/model"
    39  	phlareobj "github.com/grafana/pyroscope/pkg/objstore"
    40  	parquetobj "github.com/grafana/pyroscope/pkg/objstore/parquet"
    41  	"github.com/grafana/pyroscope/pkg/phlaredb/block"
    42  	"github.com/grafana/pyroscope/pkg/phlaredb/query"
    43  	schemav1 "github.com/grafana/pyroscope/pkg/phlaredb/schemas/v1"
    44  	"github.com/grafana/pyroscope/pkg/phlaredb/symdb"
    45  	"github.com/grafana/pyroscope/pkg/phlaredb/tsdb/index"
    46  	"github.com/grafana/pyroscope/pkg/pprof"
    47  	phlarecontext "github.com/grafana/pyroscope/pkg/pyroscope/context"
    48  	"github.com/grafana/pyroscope/pkg/util"
    49  )
    50  
const (
	// defaultBatchSize is the batch size passed to filterProfiles when
	// streaming selected profiles to the client over a bidi stream.
	defaultBatchSize = 64 << 10

	// This controls the buffer size for reads to a parquet io.Reader. This value should be small for memory or
	// disk backed readers, but when the reader is backed by network storage a larger size will be advantageous.
	//
	// The chosen value should be larger than the page size. Page sizes depend on the write buffer size as well as
	// on how well the data is encoded. In practice, they tend to be around 1MB.
	parquetReadBufferSize = 2 << 20
)
    61  
// tableReader is a parquet table reader that can be lazily opened against a
// bucket and must be closed after use.
type tableReader interface {
	open(ctx context.Context, bucketReader phlareobj.BucketReader) error
	io.Closer
}
    66  
// BlockQuerier manages the set of singleBlockQuerier instances for the blocks
// currently present in the bucket, keeping them sorted by meta.MinTime.
type BlockQuerier struct {
	phlarectx context.Context // context carrying the block metrics (see NewBlockQuerier)
	logger    log.Logger

	bkt phlareobj.Bucket

	queriers     []*singleBlockQuerier // sorted ascending by meta.MinTime
	queriersLock sync.RWMutex          // guards queriers
}
    76  
    77  func NewBlockQuerier(phlarectx context.Context, bucketReader phlareobj.Bucket) *BlockQuerier {
    78  	return &BlockQuerier{
    79  		phlarectx: ContextWithBlockMetrics(phlarectx,
    80  			NewBlocksMetrics(
    81  				phlarecontext.Registry(phlarectx),
    82  			),
    83  		),
    84  		logger: phlarecontext.Logger(phlarectx),
    85  		bkt:    bucketReader,
    86  	}
    87  }
    88  
    89  func (b *BlockQuerier) Queriers() Queriers {
    90  	b.queriersLock.RLock()
    91  	defer b.queriersLock.RUnlock()
    92  
    93  	res := make([]Querier, 0, len(b.queriers))
    94  	for _, q := range b.queriers {
    95  		res = append(res, q)
    96  	}
    97  	return res
    98  }
    99  
   100  func (b *BlockQuerier) BlockMetas(ctx context.Context) (metas []*block.Meta, _ error) {
   101  	var names []ulid.ULID
   102  	if err := b.bkt.Iter(ctx, "", func(n string) error {
   103  		ulid, ok := block.IsBlockDir(n)
   104  		if !ok {
   105  			return nil
   106  		}
   107  		names = append(names, ulid)
   108  		return nil
   109  	}); err != nil {
   110  		return nil, err
   111  	}
   112  
   113  	g, ctx := errgroup.WithContext(ctx)
   114  	g.SetLimit(16)
   115  	metas = make([]*block.Meta, len(names))
   116  	for pos := range names {
   117  		func(pos int) {
   118  			g.Go(util.RecoverPanic(func() error {
   119  				path := filepath.Join(names[pos].String(), block.MetaFilename)
   120  				metaReader, err := b.bkt.Get(ctx, path)
   121  				if err != nil {
   122  					level.Error(b.logger).Log("msg", "error reading block meta", "block", path, "err", err)
   123  					return nil
   124  				}
   125  
   126  				metas[pos], err = block.Read(metaReader)
   127  				if err != nil {
   128  					level.Error(b.logger).Log("msg", "error parsing block meta", "block", path, "err", err)
   129  					return nil
   130  				}
   131  				return nil
   132  			}))
   133  		}(pos)
   134  	}
   135  
   136  	if err := g.Wait(); err != nil {
   137  		return nil, err
   138  	}
   139  
   140  	// sort slice and make sure nils are last
   141  	sort.Slice(metas, func(i, j int) bool {
   142  		if metas[i] == nil {
   143  			return false
   144  		}
   145  		if metas[j] == nil {
   146  			return true
   147  		}
   148  		return metas[i].MinTime < metas[j].MinTime
   149  	})
   150  
   151  	// iterate from the end and cut of till the first non-nil
   152  	var pos int
   153  	for pos = len(metas) - 1; pos >= 0; pos-- {
   154  		if metas[pos] != nil {
   155  			break
   156  		}
   157  	}
   158  
   159  	return metas[0 : pos+1], nil
   160  }
   161  
   162  func (b *BlockQuerier) BlockMeta(ctx context.Context, name string) (meta *block.Meta, _ error) {
   163  	path := filepath.Join(name, block.MetaFilename)
   164  	metaReader, err := b.bkt.Get(ctx, path)
   165  	if err != nil {
   166  		level.Error(b.logger).Log("msg", "error reading block meta", "block", path, "err", err)
   167  		return nil, err
   168  	}
   169  
   170  	meta, err = block.Read(metaReader)
   171  	if err != nil {
   172  		level.Error(b.logger).Log("msg", "error parsing block meta", "block", path, "err", err)
   173  		return nil, err
   174  	}
   175  
   176  	return meta, nil
   177  }
   178  
   179  // Sync gradually scans the available blocks. If there are any changes to the
   180  // last run it will Open/Close new/no longer existing ones.
   181  func (b *BlockQuerier) Sync(ctx context.Context) error {
   182  	observedMetas, err := b.BlockMetas(ctx)
   183  	if err != nil {
   184  		return err
   185  	}
   186  
   187  	// hold write lock to queriers
   188  	b.queriersLock.Lock()
   189  
   190  	// build lookup map
   191  
   192  	querierByULID := make(map[ulid.ULID]*singleBlockQuerier)
   193  
   194  	for pos := range b.queriers {
   195  		querierByULID[b.queriers[pos].meta.ULID] = b.queriers[pos]
   196  	}
   197  
   198  	// ensure queries has the right length
   199  	lenQueriers := len(observedMetas)
   200  	if cap(b.queriers) < lenQueriers {
   201  		b.queriers = make([]*singleBlockQuerier, lenQueriers)
   202  	} else {
   203  		b.queriers = b.queriers[:lenQueriers]
   204  	}
   205  
   206  	for pos, m := range observedMetas {
   207  
   208  		q, ok := querierByULID[m.ULID]
   209  		if ok {
   210  			b.queriers[pos] = q
   211  			delete(querierByULID, m.ULID)
   212  			continue
   213  		}
   214  
   215  		b.queriers[pos] = NewSingleBlockQuerierFromMeta(b.phlarectx, b.bkt, m)
   216  	}
   217  	// ensure queriers are in ascending order.
   218  	sort.Slice(b.queriers, func(i, j int) bool {
   219  		return b.queriers[i].meta.MinTime < b.queriers[j].meta.MinTime
   220  	})
   221  	b.queriersLock.Unlock()
   222  
   223  	// now close no longer available queries
   224  	for _, q := range querierByULID {
   225  		if err := q.Close(); err != nil {
   226  			return err
   227  		}
   228  	}
   229  
   230  	return nil
   231  }
   232  
   233  func (b *BlockQuerier) AddBlockQuerierByMeta(m *block.Meta) {
   234  	q := NewSingleBlockQuerierFromMeta(b.phlarectx, b.bkt, m)
   235  	b.queriersLock.Lock()
   236  	defer b.queriersLock.Unlock()
   237  	i := sort.Search(len(b.queriers), func(i int) bool {
   238  		return b.queriers[i].meta.MinTime >= m.MinTime
   239  	})
   240  	if i < len(b.queriers) && b.queriers[i].meta.ULID == m.ULID {
   241  		// Block with this meta is already present, skipping.
   242  		return
   243  	}
   244  	b.queriers = append(b.queriers, q) // Ensure we have enough capacity.
   245  	copy(b.queriers[i+1:], b.queriers[i:])
   246  	b.queriers[i] = q
   247  }
   248  
   249  // evict removes the block with the given ULID from the querier.
   250  func (b *BlockQuerier) evict(blockID ulid.ULID) (bool, error) {
   251  	b.queriersLock.Lock()
   252  	// N.B: queriers are sorted by meta.MinTime.
   253  	j := -1
   254  	for i, q := range b.queriers {
   255  		if q.meta.ULID.Compare(blockID) == 0 {
   256  			j = i
   257  			break
   258  		}
   259  	}
   260  	if j < 0 {
   261  		b.queriersLock.Unlock()
   262  		return false, nil
   263  	}
   264  	blockQuerier := b.queriers[j]
   265  	// Delete the querier from the slice and make it eligible for GC.
   266  	copy(b.queriers[j:], b.queriers[j+1:])
   267  	b.queriers[len(b.queriers)-1] = nil
   268  	b.queriers = b.queriers[:len(b.queriers)-1]
   269  	b.queriersLock.Unlock()
   270  	return true, blockQuerier.Close()
   271  }
   272  
   273  func (b *BlockQuerier) Close() error {
   274  	b.queriersLock.Lock()
   275  	defer b.queriersLock.Unlock()
   276  
   277  	errs := multierror.New()
   278  	for pos := range b.queriers {
   279  		if err := b.queriers[pos].Close(); err != nil {
   280  			errs.Add(err)
   281  		}
   282  	}
   283  	return errs.Err()
   284  }
   285  
// TableInfo summarizes the size of a single parquet table within a block.
type TableInfo struct {
	Rows      uint64
	RowGroups uint64
	Bytes     uint64
}
   291  
// BlockInfo describes a block: its identity, time range, and per-table
// statistics.
type BlockInfo struct {
	ID          ulid.ULID
	MinTime     model.Time
	MaxTime     model.Time
	Profiles    TableInfo
	Stacktraces TableInfo
	Locations   TableInfo
	Functions   TableInfo
	Mappings    TableInfo
	Strings     TableInfo
	Series      uint64
}
   304  
// singleBlockQuerier provides access to a single block in object storage:
// its TSDB index, profile parquet tables, and symbols.
type singleBlockQuerier struct {
	logger  log.Logger
	metrics *BlocksMetrics

	bucket phlareobj.Bucket // bucket prefixed with this block's ULID
	meta   *block.Meta

	tables []tableReader // all table readers; closed together on Close

	queries  sync.WaitGroup // in-flight queries; Close waits for them to finish
	openLock sync.Mutex     // guards opened and open/close transitions
	opened   bool
	index    *index.Reader
	profiles map[profileTableKey]*parquetReader[*schemav1.ProfilePersister]
	symbols  symbolsResolver
}
   321  
// profileTableKey identifies a profile parquet table within a block; it is
// parsed from the table's file name (see parseProfileTableName).
type profileTableKey struct {
	resolution  time.Duration
	aggregation string
}
   326  
   327  func NewSingleBlockQuerierFromMeta(phlarectx context.Context, bucketReader phlareobj.Bucket, meta *block.Meta) *singleBlockQuerier {
   328  	q := &singleBlockQuerier{
   329  		logger:   phlarecontext.Logger(phlarectx),
   330  		metrics:  blockMetricsFromContext(phlarectx),
   331  		profiles: make(map[profileTableKey]*parquetReader[*schemav1.ProfilePersister], 3),
   332  		bucket:   phlareobj.NewPrefixedBucket(bucketReader, meta.ULID.String()),
   333  		meta:     meta,
   334  	}
   335  	for _, f := range meta.Files {
   336  		k, ok := parseProfileTableName(f.RelPath)
   337  		if ok {
   338  			r := &parquetReader[*schemav1.ProfilePersister]{meta: f}
   339  			q.profiles[k] = r
   340  			q.tables = append(q.tables, r)
   341  		}
   342  	}
   343  	return q
   344  }
   345  
// Profiles returns the parquet file reader of the block's source profile table.
func (b *singleBlockQuerier) Profiles() ProfileReader {
	return b.profileSourceTable().file
}
   349  
// Index returns the block's TSDB index reader.
func (b *singleBlockQuerier) Index() IndexReader {
	return b.index
}
   353  
// Symbols returns the block's symbols reader.
func (b *singleBlockQuerier) Symbols() symdb.SymbolsReader {
	return b.symbols
}
   357  
   358  func (b *singleBlockQuerier) Meta() block.Meta {
   359  	if b.meta == nil {
   360  		return block.Meta{}
   361  	}
   362  	return *b.meta
   363  }
   364  
   365  func (b *singleBlockQuerier) ProfileTypes(ctx context.Context, req *connect.Request[ingestv1.ProfileTypesRequest]) (*connect.Response[ingestv1.ProfileTypesResponse], error) {
   366  	sp, ctx := opentracing.StartSpanFromContext(ctx, "ProfileTypes Block")
   367  	defer sp.Finish()
   368  
   369  	if err := b.Open(ctx); err != nil {
   370  		return nil, err
   371  	}
   372  	b.queries.Add(1)
   373  	defer b.queries.Done()
   374  
   375  	values, err := b.index.LabelValues(phlaremodel.LabelNameProfileType)
   376  	if err != nil {
   377  		return nil, err
   378  	}
   379  	slices.Sort(values)
   380  
   381  	types := make([]*typesv1.ProfileType, len(values))
   382  	for i, value := range values {
   383  		typ, err := phlaremodel.ParseProfileTypeSelector(value)
   384  		if err != nil {
   385  			return nil, err
   386  		}
   387  		types[i] = typ
   388  	}
   389  
   390  	return connect.NewResponse(&ingestv1.ProfileTypesResponse{
   391  		ProfileTypes: types,
   392  	}), nil
   393  }
   394  
   395  func (b *singleBlockQuerier) LabelValues(ctx context.Context, req *connect.Request[typesv1.LabelValuesRequest]) (*connect.Response[typesv1.LabelValuesResponse], error) {
   396  	sp, ctx := opentracing.StartSpanFromContext(ctx, "LabelValues Block")
   397  	defer sp.Finish()
   398  
   399  	params := req.Msg
   400  
   401  	if err := b.Open(ctx); err != nil {
   402  		return nil, err
   403  	}
   404  	b.queries.Add(1)
   405  	defer b.queries.Done()
   406  
   407  	names, err := b.index.LabelNames()
   408  	if err != nil {
   409  		return nil, err
   410  	}
   411  	if !slices.Contains(names, req.Msg.Name) {
   412  		return connect.NewResponse(&typesv1.LabelValuesResponse{
   413  			Names: []string{},
   414  		}), nil
   415  	}
   416  
   417  	selectors, err := parseSelectors(params.Matchers)
   418  	if err != nil {
   419  		return nil, err
   420  	}
   421  
   422  	iters := make([]index.Postings, 0, 1)
   423  	if selectors.matchesAll() {
   424  		k, v := index.AllPostingsKey()
   425  		iter, err := b.index.Postings(k, nil, v)
   426  		if err != nil {
   427  			return nil, err
   428  		}
   429  		iters = append(iters, iter)
   430  	} else {
   431  		for _, matchers := range selectors {
   432  			iter, err := PostingsForMatchers(b.index, nil, matchers...)
   433  			if err != nil {
   434  				return nil, err
   435  			}
   436  			iters = append(iters, iter)
   437  		}
   438  	}
   439  
   440  	valueSet := make(map[string]struct{})
   441  	iter := index.Intersect(iters...)
   442  	for iter.Next() {
   443  		value, err := b.index.LabelValueFor(iter.At(), req.Msg.Name)
   444  		if err != nil {
   445  			if err == storage.ErrNotFound {
   446  				continue
   447  			}
   448  			return nil, err
   449  		}
   450  		valueSet[value] = struct{}{}
   451  	}
   452  
   453  	values := make([]string, 0, len(valueSet))
   454  	for value := range valueSet {
   455  		values = append(values, value)
   456  	}
   457  	slices.Sort(values)
   458  	return connect.NewResponse(&typesv1.LabelValuesResponse{
   459  		Names: values,
   460  	}), nil
   461  }
   462  
   463  func (b *singleBlockQuerier) LabelNames(ctx context.Context, req *connect.Request[typesv1.LabelNamesRequest]) (*connect.Response[typesv1.LabelNamesResponse], error) {
   464  	sp, ctx := opentracing.StartSpanFromContext(ctx, "LabelNames Block")
   465  	defer sp.Finish()
   466  
   467  	params := req.Msg
   468  
   469  	if err := b.Open(ctx); err != nil {
   470  		return nil, err
   471  	}
   472  	b.queries.Add(1)
   473  	defer b.queries.Done()
   474  
   475  	selectors, err := parseSelectors(params.Matchers)
   476  	if err != nil {
   477  		return nil, err
   478  	}
   479  
   480  	if selectors.matchesAll() {
   481  		names, err := b.index.LabelNames()
   482  		if err != nil {
   483  			return nil, err
   484  		}
   485  		return connect.NewResponse(&typesv1.LabelNamesResponse{
   486  			Names: names,
   487  		}), nil
   488  	}
   489  
   490  	var iters []index.Postings
   491  	for _, matchers := range selectors {
   492  		iter, err := PostingsForMatchers(b.index, nil, matchers...)
   493  		if err != nil {
   494  			return nil, err
   495  		}
   496  		iters = append(iters, iter)
   497  	}
   498  
   499  	nameSet := make(map[string]struct{})
   500  	iter := index.Intersect(iters...)
   501  	for iter.Next() {
   502  		names, err := b.index.LabelNamesFor(iter.At())
   503  		if err != nil {
   504  			if err == storage.ErrNotFound {
   505  				continue
   506  			}
   507  			return nil, err
   508  		}
   509  
   510  		for _, name := range names {
   511  			nameSet[name] = struct{}{}
   512  		}
   513  	}
   514  
   515  	names := make([]string, 0, len(nameSet))
   516  	for name := range nameSet {
   517  		names = append(names, name)
   518  	}
   519  	slices.Sort(names)
   520  	return connect.NewResponse(&typesv1.LabelNamesResponse{
   521  		Names: names,
   522  	}), nil
   523  }
   524  
// BlockID returns the block's ULID as a string.
func (b *singleBlockQuerier) BlockID() string {
	return b.meta.ULID.String()
}
   528  
   529  func (b *singleBlockQuerier) Close() error {
   530  	b.openLock.Lock()
   531  	defer func() {
   532  		b.openLock.Unlock()
   533  		b.metrics.blockOpened.Dec()
   534  	}()
   535  
   536  	if !b.opened {
   537  		return nil
   538  	}
   539  	b.queries.Wait()
   540  
   541  	errs := multierror.New()
   542  	if b.index != nil {
   543  		err := b.index.Close()
   544  		b.index = nil
   545  		if err != nil {
   546  			errs.Add(err)
   547  		}
   548  	}
   549  	for _, t := range b.tables {
   550  		if err := t.Close(); err != nil {
   551  			errs.Add(err)
   552  		}
   553  	}
   554  	if b.symbols != nil {
   555  		if err := b.symbols.Close(); err != nil {
   556  			errs.Add(err)
   557  		}
   558  	}
   559  	b.opened = false
   560  	return errs.Err()
   561  }
   562  
// Bounds returns the block's MinTime and MaxTime from its meta.
func (b *singleBlockQuerier) Bounds() (model.Time, model.Time) {
	return b.meta.MinTime, b.meta.MaxTime
}
   566  
// GetMetaStats returns statistics derived from the block meta.
func (b *singleBlockQuerier) GetMetaStats() block.MetaStats {
	return b.meta.GetStats()
}
   570  
// Profile is the minimal view of a stored profile needed to select, sort,
// and merge profiles across blocks.
type Profile interface {
	RowNumber() int64
	StacktracePartition() uint64
	Timestamp() model.Time
	Fingerprint() model.Fingerprint
	Labels() phlaremodel.Labels
}
   578  
// Querier abstracts a profile store (e.g. a single block) that can be opened,
// have profiles selected from it, and have selections merged in various
// output shapes (tree, series, pprof).
type Querier interface {
	// BlockID returns the block ID of the querier, when it is representing a single block.
	BlockID() string
	// Bounds returns the time range covered by the store.
	Bounds() (model.Time, model.Time)
	// Open prepares the store's underlying readers for querying.
	Open(ctx context.Context) error
	// Sort orders profiles for better read locality.
	Sort([]Profile) []Profile

	// Merge* operate on rows previously obtained via SelectMatchingProfiles.
	MergeByStacktraces(ctx context.Context, rows iter.Iterator[Profile], maxNodes int64) (*phlaremodel.Tree, error)
	MergeBySpans(ctx context.Context, rows iter.Iterator[Profile], spans phlaremodel.SpanSelector) (*phlaremodel.Tree, error)
	MergeByLabels(ctx context.Context, rows iter.Iterator[Profile], s *typesv1.StackTraceSelector, by ...string) ([]*typesv1.Series, error)
	MergePprof(ctx context.Context, rows iter.Iterator[Profile], maxNodes int64, s *typesv1.StackTraceSelector) (*profilev1.Profile, error)
	Series(ctx context.Context, params *ingestv1.SeriesRequest) ([]*typesv1.Labels, error)

	// SelectMerge* combine selection and merging in a single call.
	SelectMatchingProfiles(ctx context.Context, params *ingestv1.SelectProfilesRequest) (iter.Iterator[Profile], error)
	SelectMergeByStacktraces(ctx context.Context, params *ingestv1.SelectProfilesRequest, maxNodes int64) (*phlaremodel.Tree, error)
	SelectMergeByLabels(ctx context.Context, params *ingestv1.SelectProfilesRequest, s *typesv1.StackTraceSelector, by ...string) ([]*typesv1.Series, error)
	SelectMergeBySpans(ctx context.Context, params *ingestv1.SelectSpanProfileRequest) (*phlaremodel.Tree, error)
	SelectMergePprof(ctx context.Context, params *ingestv1.SelectProfilesRequest, maxNodes int64, s *typesv1.StackTraceSelector) (*profilev1.Profile, error)

	// Metadata queries.
	ProfileTypes(context.Context, *connect.Request[ingestv1.ProfileTypesRequest]) (*connect.Response[ingestv1.ProfileTypesResponse], error)
	LabelValues(ctx context.Context, req *connect.Request[typesv1.LabelValuesRequest]) (*connect.Response[typesv1.LabelValuesResponse], error)
	LabelNames(ctx context.Context, req *connect.Request[typesv1.LabelNamesRequest]) (*connect.Response[typesv1.LabelNamesResponse], error)
}
   602  
// TimeBounded is implemented by stores covering a fixed time range.
type TimeBounded interface {
	Bounds() (model.Time, model.Time)
}
   606  
   607  func InRange(q TimeBounded, start, end model.Time) bool {
   608  	min, max := q.Bounds()
   609  	if start > max {
   610  		return false
   611  	}
   612  	if end < min {
   613  		return false
   614  	}
   615  	return true
   616  }
   617  
// ReadAPI is the read path served over connect: metadata queries plus the
// bidi-streaming merge endpoints.
type ReadAPI interface {
	LabelValues(context.Context, *connect.Request[typesv1.LabelValuesRequest]) (*connect.Response[typesv1.LabelValuesResponse], error)
	LabelNames(context.Context, *connect.Request[typesv1.LabelNamesRequest]) (*connect.Response[typesv1.LabelNamesResponse], error)
	ProfileTypes(context.Context, *connect.Request[ingestv1.ProfileTypesRequest]) (*connect.Response[ingestv1.ProfileTypesResponse], error)
	Series(context.Context, *connect.Request[ingestv1.SeriesRequest]) (*connect.Response[ingestv1.SeriesResponse], error)
	MergeProfilesStacktraces(context.Context, *connect.BidiStream[ingestv1.MergeProfilesStacktracesRequest, ingestv1.MergeProfilesStacktracesResponse]) error
	MergeProfilesLabels(context.Context, *connect.BidiStream[ingestv1.MergeProfilesLabelsRequest, ingestv1.MergeProfilesLabelsResponse]) error
	MergeProfilesPprof(context.Context, *connect.BidiStream[ingestv1.MergeProfilesPprofRequest, ingestv1.MergeProfilesPprofResponse]) error
	MergeSpanProfile(context.Context, *connect.BidiStream[ingestv1.MergeSpanProfileRequest, ingestv1.MergeSpanProfileResponse]) error
}
   628  
// Compile-time assertion that Queriers implements the full read API.
var _ ReadAPI = make(Queriers, 0)

// Queriers is a collection of Querier instances queried as one.
type Queriers []Querier
   632  
   633  func (queriers Queriers) Open(ctx context.Context) error {
   634  	g, ctx := errgroup.WithContext(ctx)
   635  	g.SetLimit(128)
   636  	for _, q := range queriers {
   637  		q := q
   638  		g.Go(func() error {
   639  			if err := q.Open(ctx); err != nil {
   640  				return err
   641  			}
   642  			return nil
   643  		})
   644  	}
   645  	return g.Wait()
   646  }
   647  
   648  func (queriers Queriers) SelectMatchingProfiles(ctx context.Context, params *ingestv1.SelectProfilesRequest) (iter.Iterator[Profile], error) {
   649  	iters, err := SelectMatchingProfiles(ctx, params, queriers)
   650  	if err != nil {
   651  		return nil, err
   652  	}
   653  	return phlaremodel.NewMergeIterator(maxBlockProfile, true, iters...), nil
   654  }
   655  
   656  func (queriers Queriers) LabelValues(ctx context.Context, req *connect.Request[typesv1.LabelValuesRequest]) (*connect.Response[typesv1.LabelValuesResponse], error) {
   657  	blockGetter := queriers.ForTimeRange
   658  	_, hasTimeRange := phlaremodel.GetTimeRange(req.Msg)
   659  	if !hasTimeRange {
   660  		blockGetter = func(_ context.Context, _, _ model.Time, _ *ingestv1.Hints) (Queriers, error) {
   661  			return queriers, nil
   662  		}
   663  	}
   664  	res, err := LabelValues(ctx, req, blockGetter)
   665  	if err != nil {
   666  		return nil, err
   667  	}
   668  	return connect.NewResponse(res), nil
   669  }
   670  
   671  func (queriers Queriers) LabelNames(ctx context.Context, req *connect.Request[typesv1.LabelNamesRequest]) (*connect.Response[typesv1.LabelNamesResponse], error) {
   672  	blockGetter := queriers.ForTimeRange
   673  	_, hasTimeRange := phlaremodel.GetTimeRange(req.Msg)
   674  	if !hasTimeRange {
   675  		blockGetter = func(_ context.Context, _, _ model.Time, _ *ingestv1.Hints) (Queriers, error) {
   676  			return queriers, nil
   677  		}
   678  	}
   679  	res, err := LabelNames(ctx, req, blockGetter)
   680  	if err != nil {
   681  		return nil, err
   682  	}
   683  	return connect.NewResponse(res), nil
   684  }
   685  
   686  func (queriers Queriers) ProfileTypes(ctx context.Context, req *connect.Request[ingestv1.ProfileTypesRequest]) (*connect.Response[ingestv1.ProfileTypesResponse], error) {
   687  	blockGetter := queriers.ForTimeRange
   688  	_, hasTimeRange := phlaremodel.GetTimeRange(req.Msg)
   689  	if !hasTimeRange {
   690  		blockGetter = func(_ context.Context, _, _ model.Time, _ *ingestv1.Hints) (Queriers, error) {
   691  			return queriers, nil
   692  		}
   693  	}
   694  	res, err := ProfileTypes(ctx, req, blockGetter)
   695  	if err != nil {
   696  		return nil, err
   697  	}
   698  	return res, nil
   699  }
   700  
   701  func (queriers Queriers) Series(ctx context.Context, req *connect.Request[ingestv1.SeriesRequest]) (*connect.Response[ingestv1.SeriesResponse], error) {
   702  	// todo: verify empty timestamp request should return all series
   703  	blockGetter := queriers.ForTimeRange
   704  	// Legacy Series queries without a range should return all series from all head blocks.
   705  	if req.Msg.Start == 0 || req.Msg.End == 0 {
   706  		blockGetter = func(_ context.Context, _, _ model.Time, _ *ingestv1.Hints) (Queriers, error) {
   707  			return queriers, nil
   708  		}
   709  	}
   710  	res, err := Series(ctx, req.Msg, blockGetter)
   711  	if err != nil {
   712  		return nil, err
   713  	}
   714  	return connect.NewResponse(res), nil
   715  }
   716  
// MergeProfilesStacktraces implements ReadAPI by delegating to the
// package-level MergeProfilesStacktraces with a time-range block getter.
func (queriers Queriers) MergeProfilesStacktraces(ctx context.Context, stream *connect.BidiStream[ingestv1.MergeProfilesStacktracesRequest, ingestv1.MergeProfilesStacktracesResponse]) error {
	return MergeProfilesStacktraces(ctx, stream, queriers.ForTimeRange)
}
   720  
// MergeProfilesLabels implements ReadAPI by delegating to the package-level
// MergeProfilesLabels with a time-range block getter.
func (queriers Queriers) MergeProfilesLabels(ctx context.Context, stream *connect.BidiStream[ingestv1.MergeProfilesLabelsRequest, ingestv1.MergeProfilesLabelsResponse]) error {
	return MergeProfilesLabels(ctx, stream, queriers.ForTimeRange)
}
   724  
// MergeProfilesPprof implements ReadAPI by delegating to the package-level
// MergeProfilesPprof with a time-range block getter.
func (queriers Queriers) MergeProfilesPprof(ctx context.Context, stream *connect.BidiStream[ingestv1.MergeProfilesPprofRequest, ingestv1.MergeProfilesPprofResponse]) error {
	return MergeProfilesPprof(ctx, stream, queriers.ForTimeRange)
}
   728  
// MergeSpanProfile implements ReadAPI by delegating to the package-level
// MergeSpanProfile with a time-range block getter.
func (queriers Queriers) MergeSpanProfile(ctx context.Context, stream *connect.BidiStream[ingestv1.MergeSpanProfileRequest, ingestv1.MergeSpanProfileResponse]) error {
	return MergeSpanProfile(ctx, stream, queriers.ForTimeRange)
}
   732  
   733  type BlockGetter func(ctx context.Context, start, end model.Time, hints *ingestv1.Hints) (Queriers, error)
   734  
   735  func (queriers Queriers) ForTimeRange(_ context.Context, start, end model.Time, hints *ingestv1.Hints) (Queriers, error) {
   736  	skipBlock := HintsToBlockSkipper(hints)
   737  
   738  	result := make(Queriers, 0, len(queriers))
   739  	for _, q := range queriers {
   740  		if !InRange(q, start, end) {
   741  			continue
   742  		}
   743  
   744  		if skipBlock(q.BlockID()) {
   745  			continue
   746  		}
   747  
   748  		result = append(result, q)
   749  	}
   750  	return result, nil
   751  }
   752  
   753  func HintsToBlockSkipper(hints *ingestv1.Hints) func(ulid string) bool {
   754  	if hints != nil && hints.Block != nil {
   755  		m := make(map[string]struct{})
   756  		for _, blockID := range hints.Block.Ulids {
   757  			m[blockID] = struct{}{}
   758  		}
   759  		return func(ulid string) bool {
   760  			_, exists := m[ulid]
   761  			return !exists
   762  		}
   763  	}
   764  
   765  	// without hints do not skip any block
   766  	return func(ulid string) bool { return false }
   767  }
   768  
// SelectMatchingProfiles returns a list iterator of profiles matching the given request.
// One iterator is returned per querier, positionally aligned with queriers;
// blocks excluded by hints get an empty iterator so alignment is preserved.
// On error, any iterators already created are closed before returning.
func SelectMatchingProfiles(ctx context.Context, request *ingestv1.SelectProfilesRequest, queriers Queriers) ([]iter.Iterator[Profile], error) {
	g, ctx := errgroup.WithContext(ctx)
	iters := make([]iter.Iterator[Profile], len(queriers))

	skipBlock := HintsToBlockSkipper(request.Hints)

	for i, querier := range queriers {
		if skipBlock(querier.BlockID()) {
			// Excluded by hints: keep the slot with an empty iterator.
			iters[i] = iter.NewEmptyIterator[Profile]()
			continue
		}
		i := i // capture loop variables for the goroutine (pre-Go 1.22 semantics)
		querier := querier
		g.Go(util.RecoverPanic(func() error {
			profiles, err := querier.SelectMatchingProfiles(ctx, request)
			if err != nil {
				return err
			}
			// Buffer to decouple block reads from the consumer.
			iters[i] = iter.NewBufferedIterator(profiles, 1024)
			return nil
		}))
	}

	if err := g.Wait(); err != nil {
		// Release any iterators created before the failure.
		for _, it := range iters {
			if it != nil {
				runutil.CloseWithLogOnErr(util.Logger, it, "closing buffered iterator")
			}
		}
		return nil, err
	}
	return iters, nil
}
   803  
   804  func MergeProfilesStacktraces(ctx context.Context, stream *connect.BidiStream[ingestv1.MergeProfilesStacktracesRequest, ingestv1.MergeProfilesStacktracesResponse], blockGetter BlockGetter) error {
   805  	sp, ctx := opentracing.StartSpanFromContext(ctx, "MergeProfilesStacktraces")
   806  	defer sp.Finish()
   807  
   808  	r, err := stream.Receive()
   809  	if err != nil {
   810  		if errors.Is(err, io.EOF) {
   811  			return connect.NewError(connect.CodeCanceled, errors.New("client closed stream"))
   812  		}
   813  		return err
   814  	}
   815  
   816  	if r.Request == nil {
   817  		return connect.NewError(connect.CodeInvalidArgument, errors.New("missing initial select request"))
   818  	}
   819  	request := r.Request
   820  	sp.LogFields(
   821  		otlog.String("start", model.Time(request.Start).Time().String()),
   822  		otlog.String("end", model.Time(request.End).Time().String()),
   823  		otlog.String("selector", request.LabelSelector),
   824  		otlog.String("profile_id", request.Type.ID),
   825  		otlog.Object("hints", request.Hints),
   826  	)
   827  
   828  	queriers, err := blockGetter(ctx, model.Time(request.Start), model.Time(request.End), request.Hints)
   829  	if err != nil {
   830  		return err
   831  	}
   832  
   833  	deduplicationNeeded := true
   834  	if request.Hints != nil && request.Hints.Block != nil {
   835  		deduplicationNeeded = request.Hints.Block.Deduplication
   836  	}
   837  
   838  	var m sync.Mutex
   839  	t := new(phlaremodel.Tree)
   840  	g, ctx := errgroup.WithContext(ctx)
   841  
   842  	// depending on if new need deduplication or not there are two different code paths.
   843  	if !deduplicationNeeded {
   844  		// signal the end of the profile streaming by sending an empty response.
   845  		sp.LogFields(otlog.String("msg", "no profile streaming as no deduplication needed"))
   846  		if err = stream.Send(&ingestv1.MergeProfilesStacktracesResponse{}); err != nil {
   847  			return err
   848  		}
   849  
   850  		// in this path we can just merge the profiles from each block and send the result to the client.
   851  		for _, querier := range queriers {
   852  			querier := querier
   853  			g.Go(util.RecoverPanic(func() error {
   854  				// TODO(simonswine): Split profiles per row group and run the MergeByStacktraces in parallel.
   855  				merge, err := querier.SelectMergeByStacktraces(ctx, request, r.GetMaxNodes())
   856  				if err != nil {
   857  					return err
   858  				}
   859  
   860  				m.Lock()
   861  				t.Merge(merge)
   862  				m.Unlock()
   863  				return nil
   864  			}))
   865  		}
   866  	} else {
   867  		// in this path we have to go thorugh every profile and deduplicate them.
   868  		iters, err := SelectMatchingProfiles(ctx, request, queriers)
   869  		if err != nil {
   870  			return err
   871  		}
   872  
   873  		// send batches of profiles to client and filter via bidi stream.
   874  		selectedProfiles, err := filterProfiles[
   875  			BidiServerMerge[*ingestv1.MergeProfilesStacktracesResponse, *ingestv1.MergeProfilesStacktracesRequest],
   876  			*ingestv1.MergeProfilesStacktracesResponse,
   877  			*ingestv1.MergeProfilesStacktracesRequest](ctx, iters, defaultBatchSize, stream)
   878  		if err != nil {
   879  			return err
   880  		}
   881  
   882  		for i, querier := range queriers {
   883  			querier := querier
   884  			i := i
   885  			if len(selectedProfiles[i]) == 0 {
   886  				continue
   887  			}
   888  			// Sort profiles for better read locality.
   889  			// Merge async the result so we can continue streaming profiles.
   890  			g.Go(util.RecoverPanic(func() error {
   891  				merge, err := querier.MergeByStacktraces(ctx, iter.NewSliceIterator(querier.Sort(selectedProfiles[i])), r.GetMaxNodes())
   892  				if err != nil {
   893  					return err
   894  				}
   895  				m.Lock()
   896  				t.Merge(merge)
   897  				m.Unlock()
   898  				return nil
   899  			}))
   900  		}
   901  
   902  		// Signals the end of the profile streaming by sending an empty response.
   903  		// This allows the client to not block other streaming ingesters.
   904  		sp.LogFields(otlog.String("msg", "signaling the end of the profile streaming"))
   905  		if err = stream.Send(&ingestv1.MergeProfilesStacktracesResponse{}); err != nil {
   906  			return err
   907  		}
   908  	}
   909  
   910  	if err = g.Wait(); err != nil {
   911  		return err
   912  	}
   913  
   914  	// sends the final result to the client.
   915  	treeBytes := t.Bytes(r.GetMaxNodes())
   916  	sp.LogFields(
   917  		otlog.String("msg", "sending the final result to the client"),
   918  		otlog.Int("tree_bytes", len(treeBytes)),
   919  	)
   920  	err = stream.Send(&ingestv1.MergeProfilesStacktracesResponse{
   921  		Result: &ingestv1.MergeProfilesStacktracesResult{
   922  			Format:    ingestv1.StacktracesMergeFormat_MERGE_FORMAT_TREE,
   923  			TreeBytes: treeBytes,
   924  		},
   925  	})
   926  	if err != nil {
   927  		if errors.Is(err, io.EOF) {
   928  			return connect.NewError(connect.CodeCanceled, errors.New("client closed stream"))
   929  		}
   930  		return err
   931  	}
   932  
   933  	return nil
   934  }
   935  
// MergeSpanProfile merges profiles matching the initial select request and the
// given span selector into a single tree, sent to the client as the final
// response on the bidi stream. When deduplication is requested via the query
// hints, matching profiles are first streamed to the client so it can filter
// out profiles already seen on other ingesters/block stores.
func MergeSpanProfile(ctx context.Context, stream *connect.BidiStream[ingestv1.MergeSpanProfileRequest, ingestv1.MergeSpanProfileResponse], blockGetter BlockGetter) error {
	sp, ctx := opentracing.StartSpanFromContext(ctx, "MergeSpanProfile")
	defer sp.Finish()

	// The first message on the stream carries the select request.
	r, err := stream.Receive()
	if err != nil {
		if errors.Is(err, io.EOF) {
			return connect.NewError(connect.CodeCanceled, errors.New("client closed stream"))
		}
		return err
	}

	if r.Request == nil {
		return connect.NewError(connect.CodeInvalidArgument, errors.New("missing initial select request"))
	}
	request := r.Request
	sp.LogFields(
		otlog.String("start", model.Time(request.Start).Time().String()),
		otlog.String("end", model.Time(request.End).Time().String()),
		otlog.String("selector", request.LabelSelector),
		otlog.String("profile_type_id", request.Type.ID),
		otlog.Object("hints", request.Hints),
	)

	spanSelector, err := phlaremodel.NewSpanSelector(request.SpanSelector)
	if err != nil {
		return err
	}

	queriers, err := blockGetter(ctx, model.Time(request.Start), model.Time(request.End), request.Hints)
	if err != nil {
		return err
	}

	// Deduplication is on by default and can only be disabled via query hints.
	deduplicationNeeded := true
	if request.Hints != nil && request.Hints.Block != nil {
		deduplicationNeeded = request.Hints.Block.Deduplication
	}

	// m guards t, the tree all per-querier merge results are folded into.
	var m sync.Mutex
	t := new(phlaremodel.Tree)
	g, ctx := errgroup.WithContext(ctx)

	// Depending on whether we need deduplication or not there are two different code paths.
	if !deduplicationNeeded {
		// signal the end of the profile streaming by sending an empty response.
		sp.LogFields(otlog.String("msg", "no profile streaming as no deduplication needed"))
		if err = stream.Send(&ingestv1.MergeSpanProfileResponse{}); err != nil {
			return err
		}

		// in this path we can just merge the profiles from each block and send the result to the client.
		for _, querier := range queriers {
			querier := querier
			g.Go(util.RecoverPanic(func() error {
				// TODO(simonswine): Split profiles per row group and run the MergeByStacktraces in parallel.
				merge, err := querier.SelectMergeBySpans(ctx, request)
				if err != nil {
					return err
				}

				m.Lock()
				t.Merge(merge)
				m.Unlock()
				return nil
			}))
		}
	} else {
		// in this path we have to go through every profile and deduplicate them.
		iters, err := SelectMatchingProfiles(ctx, &ingestv1.SelectProfilesRequest{
			LabelSelector: request.LabelSelector,
			Type:          request.Type,
			Start:         request.Start,
			End:           request.End,
			Hints:         request.Hints,
		}, queriers)
		if err != nil {
			return err
		}

		// send batches of profiles to client and filter via bidi stream.
		selectedProfiles, err := filterProfiles[
			BidiServerMerge[*ingestv1.MergeSpanProfileResponse, *ingestv1.MergeSpanProfileRequest],
			*ingestv1.MergeSpanProfileResponse,
			*ingestv1.MergeSpanProfileRequest](ctx, iters, defaultBatchSize, stream)
		if err != nil {
			return err
		}

		for i, querier := range queriers {
			querier := querier
			i := i
			if len(selectedProfiles[i]) == 0 {
				continue
			}
			// Sort profiles for better read locality.
			// Merge async the result so we can continue streaming profiles.
			g.Go(util.RecoverPanic(func() error {
				merge, err := querier.MergeBySpans(ctx, iter.NewSliceIterator(querier.Sort(selectedProfiles[i])), spanSelector)
				if err != nil {
					return err
				}
				m.Lock()
				t.Merge(merge)
				m.Unlock()
				return nil
			}))
		}

		// Signals the end of the profile streaming by sending an empty response.
		// This allows the client to not block other streaming ingesters.
		sp.LogFields(otlog.String("msg", "signaling the end of the profile streaming"))
		if err = stream.Send(&ingestv1.MergeSpanProfileResponse{}); err != nil {
			return err
		}
	}

	if err = g.Wait(); err != nil {
		return err
	}

	// sends the final result to the client.
	treeBytes := t.Bytes(r.GetMaxNodes())
	sp.LogFields(
		otlog.String("msg", "sending the final result to the client"),
		otlog.Int("tree_bytes", len(treeBytes)),
	)
	err = stream.Send(&ingestv1.MergeSpanProfileResponse{
		Result: &ingestv1.MergeSpanProfileResult{
			TreeBytes: treeBytes,
		},
	})
	if err != nil {
		if errors.Is(err, io.EOF) {
			return connect.NewError(connect.CodeCanceled, errors.New("client closed stream"))
		}
		return err
	}

	return nil
}
  1077  
  1078  func MergeProfilesLabels(ctx context.Context, stream *connect.BidiStream[ingestv1.MergeProfilesLabelsRequest, ingestv1.MergeProfilesLabelsResponse], blockGetter BlockGetter) error {
  1079  	sp, ctx := opentracing.StartSpanFromContext(ctx, "MergeProfilesLabels")
  1080  	defer sp.Finish()
  1081  
  1082  	r, err := stream.Receive()
  1083  	if err != nil {
  1084  		if errors.Is(err, io.EOF) {
  1085  			return connect.NewError(connect.CodeCanceled, errors.New("client closed stream"))
  1086  		}
  1087  		return err
  1088  	}
  1089  
  1090  	if r.Request == nil {
  1091  		return connect.NewError(connect.CodeInvalidArgument, errors.New("missing initial select request"))
  1092  	}
  1093  	request := r.Request
  1094  	by := r.By
  1095  	sort.Strings(by)
  1096  	sp.LogFields(
  1097  		otlog.String("start", model.Time(request.Start).Time().String()),
  1098  		otlog.String("end", model.Time(request.End).Time().String()),
  1099  		otlog.String("selector", request.LabelSelector),
  1100  		otlog.String("profile_id", request.Type.ID),
  1101  		otlog.String("by", strings.Join(by, ",")),
  1102  	)
  1103  
  1104  	queriers, err := blockGetter(ctx, model.Time(request.Start), model.Time(request.End), request.Hints)
  1105  	if err != nil {
  1106  		return err
  1107  	}
  1108  	result := make([][]*typesv1.Series, 0, len(queriers))
  1109  	g, ctx := errgroup.WithContext(ctx)
  1110  	sync := lo.Synchronize()
  1111  
  1112  	deduplicationNeeded := true
  1113  	if request.Hints != nil && request.Hints.Block != nil {
  1114  		deduplicationNeeded = request.Hints.Block.Deduplication
  1115  	}
  1116  
  1117  	if !deduplicationNeeded {
  1118  		// signal the end of the profile streaming by sending an empty response.
  1119  		sp.LogFields(otlog.String("msg", "no profile streaming as no deduplication needed"))
  1120  		if err = stream.Send(&ingestv1.MergeProfilesLabelsResponse{}); err != nil {
  1121  			return err
  1122  		}
  1123  		// in this path we can just merge the profiles from each block and send the result to the client.
  1124  		for _, querier := range queriers {
  1125  			querier := querier
  1126  			g.Go(util.RecoverPanic(func() error {
  1127  				merge, err := querier.SelectMergeByLabels(ctx, request, r.StackTraceSelector, by...)
  1128  				if err != nil {
  1129  					return err
  1130  				}
  1131  
  1132  				sync.Do(func() {
  1133  					result = append(result, merge)
  1134  				})
  1135  				return nil
  1136  			}))
  1137  		}
  1138  	} else {
  1139  		iters, err := SelectMatchingProfiles(ctx, request, queriers)
  1140  		if err != nil {
  1141  			return err
  1142  		}
  1143  		// send batches of profiles to client and filter via bidi stream.
  1144  		selectedProfiles, err := filterProfiles[
  1145  			BidiServerMerge[*ingestv1.MergeProfilesLabelsResponse, *ingestv1.MergeProfilesLabelsRequest],
  1146  			*ingestv1.MergeProfilesLabelsResponse,
  1147  			*ingestv1.MergeProfilesLabelsRequest](ctx, iters, defaultBatchSize, stream)
  1148  		if err != nil {
  1149  			return err
  1150  		}
  1151  
  1152  		// Signals the end of the profile streaming by sending an empty request.
  1153  		// This allows the client to not block other streaming ingesters.
  1154  		if err := stream.Send(&ingestv1.MergeProfilesLabelsResponse{}); err != nil {
  1155  			return err
  1156  		}
  1157  		for i, querier := range queriers {
  1158  			i := i
  1159  			querier := querier
  1160  			if len(selectedProfiles[i]) == 0 {
  1161  				continue
  1162  			}
  1163  			// Sort profiles for better read locality.
  1164  			// And merge async the result for each queriers.
  1165  			g.Go(util.RecoverPanic(func() error {
  1166  				merge, err := querier.MergeByLabels(ctx,
  1167  					iter.NewSliceIterator(querier.Sort(selectedProfiles[i])),
  1168  					r.StackTraceSelector,
  1169  					by...)
  1170  				if err != nil {
  1171  					return err
  1172  				}
  1173  				sync.Do(func() {
  1174  					result = append(result, merge)
  1175  				})
  1176  
  1177  				return nil
  1178  			}))
  1179  		}
  1180  	}
  1181  
  1182  	if err := g.Wait(); err != nil {
  1183  		return err
  1184  	}
  1185  
  1186  	// sends the final result to the client.
  1187  	err = stream.Send(&ingestv1.MergeProfilesLabelsResponse{
  1188  		Series: phlaremodel.MergeSeries(request.Aggregation, result...),
  1189  	})
  1190  	if err != nil {
  1191  		if errors.Is(err, io.EOF) {
  1192  			return connect.NewError(connect.CodeCanceled, errors.New("client closed stream"))
  1193  		}
  1194  		return err
  1195  	}
  1196  
  1197  	return nil
  1198  }
  1199  
// MergeProfilesPprof merges profiles matching the initial select request into
// a single pprof profile, sent to the client as the final response on the
// bidi stream. When deduplication is requested via the query hints, matching
// profiles are first streamed to the client so it can filter out profiles
// already seen elsewhere.
func MergeProfilesPprof(ctx context.Context, stream *connect.BidiStream[ingestv1.MergeProfilesPprofRequest, ingestv1.MergeProfilesPprofResponse], blockGetter BlockGetter) error {
	sp, ctx := opentracing.StartSpanFromContext(ctx, "MergeProfilesPprof")
	defer sp.Finish()

	// The first message on the stream carries the select request.
	r, err := stream.Receive()
	if err != nil {
		if errors.Is(err, io.EOF) {
			return connect.NewError(connect.CodeCanceled, errors.New("client closed stream"))
		}
		return err
	}

	if r.Request == nil {
		return connect.NewError(connect.CodeInvalidArgument, errors.New("missing initial select request"))
	}

	request := r.Request
	sp.SetTag("start", model.Time(request.Start).Time().String()).
		SetTag("end", model.Time(request.End).Time().String()).
		SetTag("selector", request.LabelSelector).
		SetTag("profile_type", request.Type.ID).
		SetTag("max_nodes", r.GetMaxNodes())
	sp.LogFields(otlog.Object("hints", request.Hints))

	queriers, err := blockGetter(ctx, model.Time(request.Start), model.Time(request.End), request.Hints)
	if err != nil {
		return err
	}

	// Deduplication is on by default and can only be disabled via query hints.
	deduplicationNeeded := true
	if request.Hints != nil && request.Hints.Block != nil {
		deduplicationNeeded = request.Hints.Block.Deduplication
	}

	// NOTE(review): result.Merge is invoked from multiple goroutines below;
	// assumes pprof.ProfileMerge is safe for concurrent use — verify.
	var result pprof.ProfileMerge
	g, ctx := errgroup.WithContext(ctx)

	// Depending on whether we need deduplication or not there are two different code paths.
	if !deduplicationNeeded {
		// signal the end of the profile streaming by sending an empty response.
		sp.LogFields(otlog.String("msg", "no profile streaming as no deduplication needed"))
		if err = stream.Send(&ingestv1.MergeProfilesPprofResponse{}); err != nil {
			return err
		}

		// in this path we can just merge the profiles from each block and send the result to the client.
		for _, querier := range queriers {
			querier := querier
			g.Go(util.RecoverPanic(func() error {
				p, err := querier.SelectMergePprof(ctx, request, r.GetMaxNodes(), r.StackTraceSelector)
				if err != nil {
					return err
				}
				return result.Merge(p, true)
			}))
		}
	} else {
		// in this path we have to go through every profile and deduplicate them.
		iters, err := SelectMatchingProfiles(ctx, request, queriers)
		if err != nil {
			return err
		}

		// send batches of profiles to client and filter via bidi stream.
		selectedProfiles, err := filterProfiles[
			BidiServerMerge[*ingestv1.MergeProfilesPprofResponse, *ingestv1.MergeProfilesPprofRequest],
			*ingestv1.MergeProfilesPprofResponse,
			*ingestv1.MergeProfilesPprofRequest](ctx, iters, defaultBatchSize, stream)
		if err != nil {
			return err
		}

		for i, querier := range queriers {
			querier := querier
			i := i
			if len(selectedProfiles[i]) == 0 {
				continue
			}
			// Sort profiles for better read locality.
			// Merge async the result so we can continue streaming profiles.
			g.Go(util.RecoverPanic(func() error {
				p, err := querier.MergePprof(ctx,
					iter.NewSliceIterator(querier.Sort(selectedProfiles[i])),
					r.GetMaxNodes(), r.StackTraceSelector)
				if err != nil {
					return err
				}
				return result.Merge(p, true)
			}))
		}

		// Signals the end of the profile streaming by sending an empty response.
		// This allows the client to not block other streaming ingesters.
		sp.LogFields(otlog.String("msg", "signaling the end of the profile streaming"))
		if err = stream.Send(&ingestv1.MergeProfilesPprofResponse{}); err != nil {
			return err
		}
	}

	if err = g.Wait(); err != nil {
		return err
	}

	sp.LogFields(otlog.String("msg", "building pprof bytes"))
	mergedProfile := result.Profile()
	pprof.SetProfileMetadata(mergedProfile, request.Type, model.Time(r.Request.End).UnixNano(), 0)

	// connect go already handles compression.
	pprofBytes, err := pprof.Marshal(mergedProfile, false)
	if err != nil {
		return err
	}
	// sends the final result to the client.
	// NOTE(review): the log field is named "tree_bytes" but holds the pprof
	// payload size here — presumably inherited from the tree endpoints.
	sp.LogFields(
		otlog.String("msg", "sending the final result to the client"),
		otlog.Int("tree_bytes", len(pprofBytes)),
	)
	err = stream.Send(&ingestv1.MergeProfilesPprofResponse{Result: pprofBytes})
	if err != nil {
		if errors.Is(err, io.EOF) {
			return connect.NewError(connect.CodeCanceled, errors.New("client closed stream"))
		}
		return err
	}

	return nil
}
  1327  
  1328  func ProfileTypes(ctx context.Context, req *connect.Request[ingestv1.ProfileTypesRequest], blockGetter BlockGetter) (*connect.Response[ingestv1.ProfileTypesResponse], error) {
  1329  	queriers, err := blockGetter(ctx, model.Time(req.Msg.Start), model.Time(req.Msg.End), nil)
  1330  	if err != nil {
  1331  		return nil, err
  1332  	}
  1333  
  1334  	g, ctx := errgroup.WithContext(ctx)
  1335  	uniqTypes := make(map[string]*typesv1.ProfileType)
  1336  	lock := sync.Mutex{}
  1337  
  1338  	for _, q := range queriers {
  1339  		q := q
  1340  		g.Go(func() error {
  1341  			res, err := q.ProfileTypes(ctx, req)
  1342  			if err != nil {
  1343  				return err
  1344  			}
  1345  
  1346  			lock.Lock()
  1347  			defer lock.Unlock()
  1348  			for _, t := range res.Msg.ProfileTypes {
  1349  				uniqTypes[t.ID] = t.CloneVT()
  1350  			}
  1351  			return nil
  1352  		})
  1353  	}
  1354  	if err := g.Wait(); err != nil {
  1355  		return nil, err
  1356  	}
  1357  	types := lo.Values(uniqTypes)
  1358  	sort.Slice(types, func(i, j int) bool {
  1359  		return types[i].ID < types[j].ID
  1360  	})
  1361  	return connect.NewResponse(&ingestv1.ProfileTypesResponse{
  1362  		ProfileTypes: types,
  1363  	}), nil
  1364  }
  1365  
  1366  func LabelValues(ctx context.Context, req *connect.Request[typesv1.LabelValuesRequest], blockGetter BlockGetter) (*typesv1.LabelValuesResponse, error) {
  1367  	queriers, err := blockGetter(ctx, model.Time(req.Msg.Start), model.Time(req.Msg.End), nil)
  1368  	if err != nil {
  1369  		return nil, err
  1370  	}
  1371  
  1372  	var values []string
  1373  	var lock sync.Mutex
  1374  	group, ctx := errgroup.WithContext(ctx)
  1375  
  1376  	const concurrentQueryLimit = 50
  1377  	group.SetLimit(concurrentQueryLimit)
  1378  
  1379  	for _, q := range queriers {
  1380  		group.Go(util.RecoverPanic(func() error {
  1381  			res, err := q.LabelValues(ctx, req)
  1382  			if err != nil {
  1383  				return err
  1384  			}
  1385  
  1386  			lock.Lock()
  1387  			values = append(values, res.Msg.Names...)
  1388  			lock.Unlock()
  1389  			return nil
  1390  		}))
  1391  	}
  1392  	err = group.Wait()
  1393  	if err != nil {
  1394  		return nil, err
  1395  	}
  1396  
  1397  	slices.Sort(values)
  1398  	return &typesv1.LabelValuesResponse{Names: lo.Uniq(values)}, nil
  1399  }
  1400  
  1401  func LabelNames(ctx context.Context, req *connect.Request[typesv1.LabelNamesRequest], blockGetter BlockGetter) (*typesv1.LabelNamesResponse, error) {
  1402  	queriers, err := blockGetter(ctx, model.Time(req.Msg.Start), model.Time(req.Msg.End), nil)
  1403  	if err != nil {
  1404  		return nil, err
  1405  	}
  1406  
  1407  	var labelNames []string
  1408  	var lock sync.Mutex
  1409  	group, ctx := errgroup.WithContext(ctx)
  1410  
  1411  	const concurrentQueryLimit = 50
  1412  	group.SetLimit(concurrentQueryLimit)
  1413  
  1414  	for _, q := range queriers {
  1415  		group.Go(util.RecoverPanic(func() error {
  1416  			res, err := q.LabelNames(ctx, req)
  1417  			if err != nil {
  1418  				return err
  1419  			}
  1420  
  1421  			lock.Lock()
  1422  			labelNames = append(labelNames, res.Msg.Names...)
  1423  			lock.Unlock()
  1424  			return nil
  1425  		}))
  1426  	}
  1427  	err = group.Wait()
  1428  	if err != nil {
  1429  		return nil, err
  1430  	}
  1431  
  1432  	slices.Sort(labelNames)
  1433  	return &typesv1.LabelNamesResponse{
  1434  		Names: lo.Uniq(labelNames),
  1435  	}, nil
  1436  }
  1437  
  1438  func Series(ctx context.Context, req *ingestv1.SeriesRequest, blockGetter BlockGetter) (*ingestv1.SeriesResponse, error) {
  1439  	queriers, err := blockGetter(ctx, model.Time(req.Start), model.Time(req.End), nil)
  1440  	if err != nil {
  1441  		return nil, err
  1442  	}
  1443  
  1444  	var labelsSet []*typesv1.Labels
  1445  	var lock sync.Mutex
  1446  	group, ctx := errgroup.WithContext(ctx)
  1447  
  1448  	// TODO(bryan) Verify this limit is ok
  1449  	const concurrentQueryLimit = 50
  1450  	group.SetLimit(concurrentQueryLimit)
  1451  
  1452  	for _, q := range queriers {
  1453  		q := q
  1454  		group.Go(util.RecoverPanic(func() error {
  1455  			labels, err := q.Series(ctx, req)
  1456  			if err != nil {
  1457  				return err
  1458  			}
  1459  
  1460  			lock.Lock()
  1461  			labelsSet = append(labelsSet, labels...)
  1462  			lock.Unlock()
  1463  			return nil
  1464  		}))
  1465  	}
  1466  	err = group.Wait()
  1467  	if err != nil {
  1468  		return nil, err
  1469  	}
  1470  
  1471  	sort.Slice(labelsSet, func(i, j int) bool {
  1472  		return phlaremodel.CompareLabelPairs(labelsSet[i].Labels, labelsSet[j].Labels) < 0
  1473  	})
  1474  	return &ingestv1.SeriesResponse{
  1475  		LabelsSet: lo.UniqBy(labelsSet, func(set *typesv1.Labels) uint64 {
  1476  			return phlaremodel.Labels(set.Labels).Hash()
  1477  		}),
  1478  	}, nil
  1479  }
  1480  
// maxBlockProfile is the sentinel passed to the merge iterator in
// SelectMatchingProfiles: its timestamp sorts after any real profile.
var maxBlockProfile Profile = BlockProfile{
	timestamp: model.Time(math.MaxInt64),
}
  1484  
// BlockProfile is a lightweight reference to a single profile row in a
// block's parquet profiles table, carrying just enough metadata (series
// labels, fingerprint, timestamp, stacktrace partition) to sort, merge, and
// later re-read the full row by its row number.
type BlockProfile struct {
	rowNum      int64 // row number in the profiles parquet table
	timestamp   model.Time
	fingerprint model.Fingerprint
	labels      phlaremodel.Labels
	partition   uint64 // stacktrace partition; 0 when the block has no partition column
}

// StacktracePartition returns the partition the profile's stacktraces belong to.
func (p BlockProfile) StacktracePartition() uint64 {
	return p.partition
}

// RowNumber returns the profile's row number in the parquet profiles table.
func (p BlockProfile) RowNumber() int64 {
	return p.rowNum
}

// Labels returns the series labels of the profile.
func (p BlockProfile) Labels() phlaremodel.Labels {
	return p.labels
}

// Timestamp returns the profile's timestamp.
func (p BlockProfile) Timestamp() model.Time {
	return p.timestamp
}

// Fingerprint returns the series fingerprint of the profile.
func (p BlockProfile) Fingerprint() model.Fingerprint {
	return p.fingerprint
}
  1512  
  1513  func retrieveStacktracePartition(buf [][]parquet.Value, pos int) uint64 {
  1514  	if len(buf) > pos && len(buf[pos]) == 1 {
  1515  		return buf[pos][0].Uint64()
  1516  	}
  1517  
  1518  	// return 0 stacktrace partition
  1519  	return uint64(0)
  1520  }
  1521  
// labelsInfo pairs a series fingerprint with its (possibly projected) label set.
type labelsInfo struct {
	fp  model.Fingerprint
	lbs phlaremodel.Labels
}
  1526  
// SelectMatchingProfiles returns an iterator over all profiles in this block
// matching the label selector, profile type, and time range of params. The
// profiles are grouped per series and merged into a single timestamp-ordered
// iterator of lightweight BlockProfile references.
func (b *singleBlockQuerier) SelectMatchingProfiles(ctx context.Context, params *ingestv1.SelectProfilesRequest) (iter.Iterator[Profile], error) {
	sp, ctx := opentracing.StartSpanFromContext(ctx, "SelectMatchingProfiles - Block")
	defer sp.Finish()
	sp.SetTag("block ULID", b.meta.ULID.String())

	if err := b.Open(ctx); err != nil {
		return nil, err
	}
	// Track the in-flight query so block close can wait for it.
	b.queries.Add(1)
	defer b.queries.Done()

	matchers, err := parser.ParseMetricSelector(params.LabelSelector)
	if err != nil {
		return nil, status.Error(codes.InvalidArgument, "failed to parse label selectors: "+err.Error())
	}
	if params.Type == nil {
		return nil, errors.New("no profileType given")
	}
	matchers = append(matchers, phlaremodel.SelectorFromProfileType(params.Type))

	postings, err := PostingsForMatchers(b.index, nil, matchers...)
	if err != nil {
		return nil, err
	}

	var (
		lbls       = make(phlaremodel.Labels, 0, 6)
		chks       = make([]index.ChunkMeta, 1)
		lblsPerRef = make(map[int64]labelsInfo)
	)

	// get all relevant labels/fingerprints, keyed by series index
	for postings.Next() {
		fp, err := b.index.Series(postings.At(), &lbls, &chks)
		if err != nil {
			return nil, err
		}
		if _, exists := lblsPerRef[int64(chks[0].SeriesIndex)]; exists {
			continue
		}
		info := labelsInfo{
			fp:  model.Fingerprint(fp),
			lbs: make(phlaremodel.Labels, len(lbls)),
		}
		// Copy because lbls is reused as a scratch buffer on the next iteration.
		copy(info.lbs, lbls)
		lblsPerRef[int64(chks[0].SeriesIndex)] = info

	}

	var buf [][]parquet.Value

	profiles := b.profileSourceTable()
	// Join profile rows whose SeriesIndex is relevant with rows inside the time range.
	pIt := query.NewBinaryJoinIterator(
		0,
		profiles.columnIter(ctx, "SeriesIndex", query.NewMapPredicate(lblsPerRef), "SeriesIndex"),
		profiles.columnIter(ctx, "TimeNanos", query.NewIntBetweenPredicate(model.Time(params.Start).UnixNano(), model.Time(params.End).UnixNano()), "TimeNanos"),
	)

	// The StacktracePartition column only exists from block version 2 on.
	if b.meta.Version >= 2 {
		pIt = query.NewBinaryJoinIterator(
			0,
			pIt,
			profiles.columnIter(ctx, "StacktracePartition", nil, "StacktracePartition"),
		)
		buf = make([][]parquet.Value, 3)
	} else {
		buf = make([][]parquet.Value, 2)
	}

	iters := make([]iter.Iterator[Profile], 0, len(lblsPerRef))
	defer pIt.Close()

	// Rows arrive ordered by row number, so profiles of the same series are
	// contiguous: cut a new slice whenever the series index changes.
	currSeriesIndex := int64(-1)
	var currentSeriesSlice []Profile
	for pIt.Next() {
		res := pIt.At()
		buf = res.Columns(buf, "SeriesIndex", "TimeNanos", "StacktracePartition")
		seriesIndex := buf[0][0].Int64()
		if seriesIndex != currSeriesIndex {
			currSeriesIndex = seriesIndex
			if len(currentSeriesSlice) > 0 {
				iters = append(iters, iter.NewSliceIterator(currentSeriesSlice))
			}
			currentSeriesSlice = make([]Profile, 0, 100)
		}

		currentSeriesSlice = append(currentSeriesSlice, BlockProfile{
			labels:      lblsPerRef[seriesIndex].lbs,
			fingerprint: lblsPerRef[seriesIndex].fp,
			timestamp:   model.TimeFromUnixNano(buf[1][0].Int64()),
			partition:   retrieveStacktracePartition(buf, 2),
			rowNum:      res.RowNumber[0],
		})
	}
	// Flush the last series.
	if len(currentSeriesSlice) > 0 {
		iters = append(iters, iter.NewSliceIterator(currentSeriesSlice))
	}

	return phlaremodel.NewMergeIterator(maxBlockProfile, false, iters...), nil
}
  1627  
// SelectMergeByLabels selects profiles in this block matching the request and
// merges their values into per-label-set time series, grouped by the given
// label names. When a stack trace selector with a call site is provided, only
// values attributed to that call site are aggregated (block version >= 2 only;
// older blocks return nil).
func (b *singleBlockQuerier) SelectMergeByLabels(
	ctx context.Context,
	params *ingestv1.SelectProfilesRequest,
	sts *typesv1.StackTraceSelector,
	by ...string,
) ([]*typesv1.Series, error) {
	sp, ctx := opentracing.StartSpanFromContext(ctx, "SelectMergeByLabels - Block")
	defer sp.Finish()
	sp.SetTag("block ULID", b.meta.ULID.String())
	ctx = query.AddMetricsToContext(ctx, b.metrics.query)

	if err := b.Open(ctx); err != nil {
		return nil, err
	}
	// Track the in-flight query so block close can wait for it.
	b.queries.Add(1)
	defer b.queries.Done()

	matchers, err := parser.ParseMetricSelector(params.LabelSelector)
	if err != nil {
		return nil, status.Error(codes.InvalidArgument, "failed to parse label selectors: "+err.Error())
	}
	if params.Type == nil {
		return nil, errors.New("no profileType given")
	}
	matchers = append(matchers, phlaremodel.SelectorFromProfileType(params.Type))

	postings, err := PostingsForMatchers(b.index, nil, matchers...)
	if err != nil {
		return nil, err
	}
	var (
		chks       = make([]index.ChunkMeta, 1)
		lblsPerRef = make(map[int64]labelsInfo)
		lbls       = make(phlaremodel.Labels, 0, 6)
	)
	// get all relevant labels/fingerprints, projected to the "by" label names
	for postings.Next() {
		fp, err := b.index.SeriesBy(postings.At(), &lbls, &chks, by...)
		if err != nil {
			return nil, err
		}

		_, ok := lblsPerRef[int64(chks[0].SeriesIndex)]
		if !ok {
			info := labelsInfo{
				fp:  model.Fingerprint(fp),
				lbs: make(phlaremodel.Labels, len(lbls)),
			}
			// Copy because lbls is reused as a scratch buffer on the next iteration.
			copy(info.lbs, lbls)
			lblsPerRef[int64(chks[0].SeriesIndex)] = info
		}
	}

	profiles := b.profileSourceTable()
	// Join profile rows whose SeriesIndex is relevant with rows inside the time range.
	it := query.NewBinaryJoinIterator(
		0,
		profiles.columnIter(ctx, "SeriesIndex", query.NewMapPredicate(lblsPerRef), "SeriesIndex"),
		profiles.columnIter(ctx, "TimeNanos", query.NewIntBetweenPredicate(model.Time(params.Start).UnixNano(), model.Time(params.End).UnixNano()), "TimeNanos"),
	)

	if len(sts.GetCallSite()) == 0 {
		// No call-site filtering: aggregate whole-profile values.
		columnName := "TotalValue"
		if b.meta.Version == 1 {
			// v1 blocks read values from the samples list instead.
			columnName = "Samples.list.element.Value"
		}
		rows := profileBatchIteratorBySeriesIndex(it, lblsPerRef)
		defer rows.Close()
		return mergeByLabels[Profile](ctx, profiles.file, columnName, rows, by...)
	}

	// Call-site filtering needs the StacktracePartition column, which only
	// exists from block version 2 on.
	if b.meta.Version < 2 {
		return nil, nil
	}

	r := symdb.NewResolver(ctx, b.symbols,
		symdb.WithResolverStackTraceSelector(sts))
	defer r.Release()

	it = query.NewBinaryJoinIterator(0, it, profiles.columnIter(ctx, "StacktracePartition", nil, "StacktracePartition"))
	rows := profileBatchIteratorBySeriesIndex(it, lblsPerRef)
	defer rows.Close()

	return mergeByLabelsWithStackTraceSelector[Profile](ctx, profiles.file, rows, r, by...)
}
  1712  
  1713  func (b *singleBlockQuerier) SelectMergeByStacktraces(ctx context.Context, params *ingestv1.SelectProfilesRequest, maxNodes int64) (tree *phlaremodel.Tree, err error) {
  1714  	sp, ctx := opentracing.StartSpanFromContext(ctx, "SelectMergeByStacktraces - Block")
  1715  	defer sp.Finish()
  1716  	sp.SetTag("block ULID", b.meta.ULID.String())
  1717  	ctx = query.AddMetricsToContext(ctx, b.metrics.query)
  1718  
  1719  	if err := b.Open(ctx); err != nil {
  1720  		return nil, err
  1721  	}
  1722  	b.queries.Add(1)
  1723  	defer b.queries.Done()
  1724  
  1725  	matchers, err := parser.ParseMetricSelector(params.LabelSelector)
  1726  	if err != nil {
  1727  		return nil, status.Error(codes.InvalidArgument, "failed to parse label selectors: "+err.Error())
  1728  	}
  1729  	if params.Type == nil {
  1730  		return nil, errors.New("no profileType given")
  1731  	}
  1732  	matchers = append(matchers, phlaremodel.SelectorFromProfileType(params.Type))
  1733  
  1734  	postings, err := PostingsForMatchers(b.index, nil, matchers...)
  1735  	if err != nil {
  1736  		return nil, err
  1737  	}
  1738  
  1739  	var (
  1740  		chks       = make([]index.ChunkMeta, 1)
  1741  		lblsPerRef = make(map[int64]struct{})
  1742  	)
  1743  
  1744  	// get all relevant labels/fingerprints
  1745  	for postings.Next() {
  1746  		_, err := b.index.Series(postings.At(), nil, &chks)
  1747  		if err != nil {
  1748  			return nil, err
  1749  		}
  1750  		lblsPerRef[int64(chks[0].SeriesIndex)] = struct{}{}
  1751  	}
  1752  	r := symdb.NewResolver(ctx, b.symbols, symdb.WithResolverMaxNodes(maxNodes))
  1753  	defer r.Release()
  1754  
  1755  	g, ctx := errgroup.WithContext(ctx)
  1756  	util.SplitTimeRangeByResolution(time.UnixMilli(params.Start), time.UnixMilli(params.End), b.downsampleResolutions(), func(tr util.TimeRange) {
  1757  		g.Go(func() error {
  1758  			profiles := b.profileTable(tr.Resolution, params.GetAggregation())
  1759  			it := query.NewBinaryJoinIterator(
  1760  				0,
  1761  				profiles.columnIter(ctx, "SeriesIndex", query.NewMapPredicate(lblsPerRef), ""),
  1762  				profiles.columnIter(ctx, "TimeNanos", query.NewIntBetweenPredicate(tr.Start.UnixNano(), tr.End.UnixNano()), ""),
  1763  			)
  1764  
  1765  			if b.meta.Version >= 2 {
  1766  				it = query.NewBinaryJoinIterator(0,
  1767  					it,
  1768  					profiles.columnIter(ctx, "StacktracePartition", nil, "StacktracePartition"),
  1769  				)
  1770  			}
  1771  			rows := profileRowBatchIterator(it)
  1772  			defer rows.Close()
  1773  			return mergeByStacktraces(ctx, profiles.file, rows, r)
  1774  		})
  1775  	})
  1776  	if err = g.Wait(); err != nil {
  1777  		return nil, err
  1778  	}
  1779  	return r.Tree()
  1780  }
  1781  
  1782  func (b *singleBlockQuerier) SelectMergeBySpans(ctx context.Context, params *ingestv1.SelectSpanProfileRequest) (*phlaremodel.Tree, error) {
  1783  	sp, ctx := opentracing.StartSpanFromContext(ctx, "SelectMergeBySpans - Block")
  1784  	defer sp.Finish()
  1785  	sp.SetTag("block ULID", b.meta.ULID.String())
  1786  	ctx = query.AddMetricsToContext(ctx, b.metrics.query)
  1787  
  1788  	if err := b.Open(ctx); err != nil {
  1789  		return nil, err
  1790  	}
  1791  	b.queries.Add(1)
  1792  	defer b.queries.Done()
  1793  
  1794  	matchers, err := parser.ParseMetricSelector(params.LabelSelector)
  1795  	if err != nil {
  1796  		return nil, status.Error(codes.InvalidArgument, "failed to parse label selectors: "+err.Error())
  1797  	}
  1798  	if params.Type == nil {
  1799  		return nil, errors.New("no profileType given")
  1800  	}
  1801  	spans, err := phlaremodel.NewSpanSelector(params.SpanSelector)
  1802  	if err != nil {
  1803  		return nil, err
  1804  	}
  1805  	matchers = append(matchers, phlaremodel.SelectorFromProfileType(params.Type))
  1806  
  1807  	postings, err := PostingsForMatchers(b.index, nil, matchers...)
  1808  	if err != nil {
  1809  		return nil, err
  1810  	}
  1811  
  1812  	var (
  1813  		chks       = make([]index.ChunkMeta, 1)
  1814  		lblsPerRef = make(map[int64]struct{})
  1815  	)
  1816  
  1817  	// get all relevant labels/fingerprints
  1818  	for postings.Next() {
  1819  		_, err := b.index.Series(postings.At(), nil, &chks)
  1820  		if err != nil {
  1821  			return nil, err
  1822  		}
  1823  		lblsPerRef[int64(chks[0].SeriesIndex)] = struct{}{}
  1824  	}
  1825  	r := symdb.NewResolver(ctx, b.symbols)
  1826  	defer r.Release()
  1827  
  1828  	profiles := b.profileSourceTable()
  1829  	it := query.NewBinaryJoinIterator(
  1830  		0,
  1831  		profiles.columnIter(ctx, "SeriesIndex", query.NewMapPredicate(lblsPerRef), ""),
  1832  		profiles.columnIter(ctx, "TimeNanos", query.NewIntBetweenPredicate(model.Time(params.Start).UnixNano(), model.Time(params.End).UnixNano()), ""),
  1833  	)
  1834  
  1835  	if b.meta.Version >= 2 {
  1836  		it = query.NewBinaryJoinIterator(0,
  1837  			it,
  1838  			profiles.columnIter(ctx, "StacktracePartition", nil, "StacktracePartition"),
  1839  		)
  1840  	}
  1841  
  1842  	rows := profileRowBatchIterator(it)
  1843  	defer rows.Close()
  1844  	if err = mergeBySpans[rowProfile](ctx, profiles.file, rows, r, spans); err != nil {
  1845  		return nil, err
  1846  	}
  1847  	return r.Tree()
  1848  }
  1849  
  1850  func (b *singleBlockQuerier) SelectMergePprof(ctx context.Context, params *ingestv1.SelectProfilesRequest, maxNodes int64, sts *typesv1.StackTraceSelector) (*profilev1.Profile, error) {
  1851  	sp, ctx := opentracing.StartSpanFromContext(ctx, "SelectMergePprof - Block")
  1852  	defer sp.Finish()
  1853  	sp.SetTag("block ULID", b.meta.ULID.String())
  1854  	ctx = query.AddMetricsToContext(ctx, b.metrics.query)
  1855  
  1856  	if err := b.Open(ctx); err != nil {
  1857  		return nil, err
  1858  	}
  1859  	b.queries.Add(1)
  1860  	defer b.queries.Done()
  1861  
  1862  	matchers, err := parser.ParseMetricSelector(params.LabelSelector)
  1863  	if err != nil {
  1864  		return nil, status.Error(codes.InvalidArgument, "failed to parse label selectors: "+err.Error())
  1865  	}
  1866  	if params.Type == nil {
  1867  		return nil, errors.New("no profileType given")
  1868  	}
  1869  	matchers = append(matchers, phlaremodel.SelectorFromProfileType(params.Type))
  1870  
  1871  	postings, err := PostingsForMatchers(b.index, nil, matchers...)
  1872  	if err != nil {
  1873  		return nil, err
  1874  	}
  1875  
  1876  	var (
  1877  		chks       = make([]index.ChunkMeta, 1)
  1878  		lblsPerRef = make(map[int64]struct{})
  1879  	)
  1880  
  1881  	// get all relevant labels/fingerprints
  1882  	for postings.Next() {
  1883  		_, err := b.index.Series(postings.At(), nil, &chks)
  1884  		if err != nil {
  1885  			return nil, err
  1886  		}
  1887  		lblsPerRef[int64(chks[0].SeriesIndex)] = struct{}{}
  1888  	}
  1889  	r := symdb.NewResolver(ctx, b.symbols,
  1890  		symdb.WithResolverMaxNodes(maxNodes),
  1891  		symdb.WithResolverStackTraceSelector(sts))
  1892  	defer r.Release()
  1893  
  1894  	g, ctx := errgroup.WithContext(ctx)
  1895  	util.SplitTimeRangeByResolution(time.UnixMilli(params.Start), time.UnixMilli(params.End), b.downsampleResolutions(), func(tr util.TimeRange) {
  1896  		g.Go(func() error {
  1897  			profiles := b.profileTable(tr.Resolution, params.GetAggregation())
  1898  			it := query.NewBinaryJoinIterator(
  1899  				0,
  1900  				profiles.columnIter(ctx, "SeriesIndex", query.NewMapPredicate(lblsPerRef), ""),
  1901  				profiles.columnIter(ctx, "TimeNanos", query.NewIntBetweenPredicate(tr.Start.UnixNano(), tr.End.UnixNano()), ""),
  1902  			)
  1903  
  1904  			if b.meta.Version >= 2 {
  1905  				it = query.NewBinaryJoinIterator(0,
  1906  					it,
  1907  					profiles.columnIter(ctx, "StacktracePartition", nil, "StacktracePartition"),
  1908  				)
  1909  			}
  1910  			rows := profileRowBatchIterator(it)
  1911  			defer rows.Close()
  1912  			return mergeByStacktraces[rowProfile](ctx, profiles.file, rows, r)
  1913  		})
  1914  	})
  1915  	if err = g.Wait(); err != nil {
  1916  		return nil, err
  1917  	}
  1918  	return r.Pprof()
  1919  }
  1920  
  1921  // Series selects the series labels from this block.
  1922  //
  1923  // Note: It will select ALL the labels in the block, not necessarily just the
  1924  // subset in the time range SeriesRequest.Start to SeriesRequest.End.
  1925  func (b *singleBlockQuerier) Series(ctx context.Context, params *ingestv1.SeriesRequest) ([]*typesv1.Labels, error) {
  1926  	sp, ctx := opentracing.StartSpanFromContext(ctx, "Series Block")
  1927  	defer sp.Finish()
  1928  
  1929  	if err := b.Open(ctx); err != nil {
  1930  		return nil, err
  1931  	}
  1932  	b.queries.Add(1)
  1933  	defer b.queries.Done()
  1934  
  1935  	selectors, err := parseSelectors(params.Matchers)
  1936  	if err != nil {
  1937  		return nil, err
  1938  	}
  1939  
  1940  	names, err := b.index.LabelNames()
  1941  	if err != nil {
  1942  		return nil, err
  1943  	}
  1944  
  1945  	if len(params.LabelNames) > 0 {
  1946  		labelNamesFilter := make(map[string]struct{}, len(params.LabelNames))
  1947  		for _, n := range params.LabelNames {
  1948  			labelNamesFilter[n] = struct{}{}
  1949  		}
  1950  		names = lo.Filter(names, func(name string, _ int) bool {
  1951  			_, ok := labelNamesFilter[name]
  1952  			return ok
  1953  		})
  1954  	}
  1955  
  1956  	var labelsSets []*typesv1.Labels
  1957  	fingerprints := make(map[uint64]struct{})
  1958  	if selectors.matchesAll() {
  1959  		k, v := index.AllPostingsKey()
  1960  		iter, err := b.index.Postings(k, nil, v)
  1961  		if err != nil {
  1962  			return nil, err
  1963  		}
  1964  
  1965  		sets, err := b.getUniqueLabelsSets(iter, names, &fingerprints)
  1966  		if err != nil {
  1967  			return nil, err
  1968  		}
  1969  		labelsSets = append(labelsSets, sets...)
  1970  	} else {
  1971  		for _, matchers := range selectors {
  1972  			iter, err := PostingsForMatchers(b.index, nil, matchers...)
  1973  			if err != nil {
  1974  				return nil, err
  1975  			}
  1976  
  1977  			sets, err := b.getUniqueLabelsSets(iter, names, &fingerprints)
  1978  			if err != nil {
  1979  				return nil, err
  1980  			}
  1981  			labelsSets = append(labelsSets, sets...)
  1982  		}
  1983  	}
  1984  	return labelsSets, nil
  1985  }
  1986  
  1987  func (b *singleBlockQuerier) getUniqueLabelsSets(postings index.Postings, names []string, fingerprints *map[uint64]struct{}) ([]*typesv1.Labels, error) {
  1988  	var labelsSets []*typesv1.Labels
  1989  
  1990  	// This memory will be re-used between posting iterations to avoid
  1991  	// re-allocating many *typesv1.LabelPair objects.
  1992  	matchedLabelsPool := make(phlaremodel.Labels, len(names))
  1993  	for i := range matchedLabelsPool {
  1994  		matchedLabelsPool[i] = &typesv1.LabelPair{}
  1995  	}
  1996  
  1997  	for postings.Next() {
  1998  		// Reset the pool.
  1999  		matchedLabelsPool = matchedLabelsPool[:0]
  2000  
  2001  		for _, name := range names {
  2002  			value, err := b.index.LabelValueFor(postings.At(), name)
  2003  			if err != nil {
  2004  				if err == storage.ErrNotFound {
  2005  					continue
  2006  				}
  2007  				return nil, err
  2008  			}
  2009  
  2010  			// Expand the pool's length and add this label to the end. The pool
  2011  			// will always have enough capacity for all the labels.
  2012  			matchedLabelsPool = matchedLabelsPool[:len(matchedLabelsPool)+1]
  2013  			matchedLabelsPool[len(matchedLabelsPool)-1].Name = name
  2014  			matchedLabelsPool[len(matchedLabelsPool)-1].Value = value
  2015  		}
  2016  
  2017  		fp := matchedLabelsPool.Hash()
  2018  		_, ok := (*fingerprints)[fp]
  2019  		if ok {
  2020  			continue
  2021  		}
  2022  		(*fingerprints)[fp] = struct{}{}
  2023  
  2024  		// Copy every element from the pool to a new slice.
  2025  		labels := &typesv1.Labels{
  2026  			Labels: make([]*typesv1.LabelPair, 0, len(matchedLabelsPool)),
  2027  		}
  2028  		for _, label := range matchedLabelsPool {
  2029  			labels.Labels = append(labels.Labels, label.CloneVT())
  2030  		}
  2031  		labelsSets = append(labelsSets, labels)
  2032  	}
  2033  	return labelsSets, nil
  2034  }
  2035  
  2036  func (b *singleBlockQuerier) Sort(in []Profile) []Profile {
  2037  	// Sort by RowNumber to avoid seeking back and forth in the file.
  2038  	sort.Slice(in, func(i, j int) bool {
  2039  		return in[i].(BlockProfile).rowNum < in[j].(BlockProfile).rowNum
  2040  	})
  2041  	return in
  2042  }
  2043  
  2044  func (q *singleBlockQuerier) openTSDBIndex(ctx context.Context) error {
  2045  	f, err := q.bucket.Get(ctx, block.IndexFilename)
  2046  	if err != nil {
  2047  		return fmt.Errorf("opening index.tsdb file: %w", err)
  2048  	}
  2049  	defer func() {
  2050  		_ = f.Close()
  2051  	}()
  2052  	var buf []byte
  2053  	var tsdbIndexFile block.File
  2054  	for _, mf := range q.meta.Files {
  2055  		if mf.RelPath == block.IndexFilename {
  2056  			tsdbIndexFile = mf
  2057  			break
  2058  		}
  2059  	}
  2060  	if tsdbIndexFile.SizeBytes > 0 {
  2061  		// If index size is known beforehand, we can allocate
  2062  		// a buffer of the exact size to save some space.
  2063  		buf = make([]byte, tsdbIndexFile.SizeBytes)
  2064  		_, err = io.ReadFull(f, buf)
  2065  	} else {
  2066  		// 32KB is the default buf size of io.Copy.
  2067  		// It's unlikely that a tsdb index is less than that.
  2068  		b := bytes.NewBuffer(make([]byte, 0, 32<<10))
  2069  		_, err = io.Copy(b, f)
  2070  		buf = b.Bytes()
  2071  	}
  2072  	if err != nil {
  2073  		return fmt.Errorf("reading tsdb index: %w", err)
  2074  	}
  2075  
  2076  	q.index, err = index.NewReader(index.RealByteSlice(buf))
  2077  	if err != nil {
  2078  		return fmt.Errorf("opening tsdb index: %w", err)
  2079  	}
  2080  	return nil
  2081  }
  2082  
  2083  func (q *singleBlockQuerier) Open(ctx context.Context) error {
  2084  	q.openLock.Lock()
  2085  	defer q.openLock.Unlock()
  2086  	if !q.opened {
  2087  		if err := q.openFiles(ctx); err != nil {
  2088  			return err
  2089  		}
  2090  	}
  2091  	q.metrics.blockOpened.Inc()
  2092  	q.opened = true
  2093  	return nil
  2094  }
  2095  
// openFiles opens the parquet and tsdb files so they are ready for usage.
// All files are opened concurrently; the first failure (panics are converted
// to errors by util.RecoverPanic) cancels the remaining work through the
// errgroup context. Opening latency is recorded regardless of outcome.
func (q *singleBlockQuerier) openFiles(ctx context.Context) error {
	start := time.Now()
	sp, ctx := opentracing.StartSpanFromContext(ctx, "BlockQuerier - open")
	defer func() {
		q.metrics.blockOpeningLatency.Observe(time.Since(start).Seconds())
		sp.LogFields(
			otlog.String("block_ulid", q.meta.ULID.String()),
		)
		sp.Finish()
	}()

	ctx = ContextWithBlockMetrics(ctx, q.metrics)
	g, ctx := errgroup.WithContext(ctx)
	g.Go(util.RecoverPanic(func() error {
		return q.openTSDBIndex(ctx)
	}))

	// open parquet files
	for _, tableReader := range q.tables {
		tableReader := tableReader // per-iteration copy for the goroutine closure
		g.Go(util.RecoverPanic(func() error {
			return tableReader.open(ctx, q.bucket)
		}))
	}

	// Open the symbols database using the reader matching the block version.
	g.Go(util.RecoverPanic(func() (err error) {
		switch q.meta.Version {
		case block.MetaVersion1:
			q.symbols, err = newSymbolsResolverV1(ctx, q.bucket, q.meta)
		case block.MetaVersion2:
			q.symbols, err = newSymbolsResolverV2(ctx, q.bucket, q.meta)
		case block.MetaVersion3:
			q.symbols, err = symdb.Open(ctx, q.bucket, q.meta)
		default:
			// Unknown versions cannot be read; RecoverPanic turns this panic
			// into an error returned from the group.
			panic(fmt.Errorf("unsupported block version %d id %s", q.meta.Version, q.meta.ULID.String()))
		}
		return err
	}))

	return g.Wait()
}
  2138  
// profileSourceTable returns the reader for the profiles table at the
// original (non-downsampled) resolution, stored under the zero table key.
func (b *singleBlockQuerier) profileSourceTable() *parquetReader[*schemav1.ProfilePersister] {
	return b.profiles[profileTableKey{}]
}
  2142  
  2143  func (b *singleBlockQuerier) profileTable(resolution time.Duration, aggregation typesv1.TimeSeriesAggregationType) (t *parquetReader[*schemav1.ProfilePersister]) {
  2144  	defer func() {
  2145  		if t != nil {
  2146  			b.metrics.profileTableAccess.WithLabelValues(t.meta.RelPath).Inc()
  2147  		}
  2148  	}()
  2149  	var ok bool
  2150  	t, ok = b.profiles[profileTableKey{
  2151  		resolution:  resolution,
  2152  		aggregation: downsampleAggregation(aggregation),
  2153  	}]
  2154  	if ok {
  2155  		return t
  2156  	}
  2157  	return b.profiles[profileTableKey{}]
  2158  }
  2159  
  2160  func (b *singleBlockQuerier) downsampleResolutions() []time.Duration {
  2161  	if len(b.profiles) < 2 {
  2162  		// b.profiles contains only the table of original resolution.
  2163  		return nil
  2164  	}
  2165  	resolutions := make([]time.Duration, 0, len(b.profiles)-1)
  2166  	for k := range b.profiles {
  2167  		if k.resolution > 0 {
  2168  			resolutions = append(resolutions, k.resolution)
  2169  		}
  2170  	}
  2171  	return resolutions
  2172  }
  2173  
  2174  func downsampleAggregation(v typesv1.TimeSeriesAggregationType) string {
  2175  	switch v {
  2176  	case typesv1.TimeSeriesAggregationType_TIME_SERIES_AGGREGATION_TYPE_SUM:
  2177  		return "sum"
  2178  	}
  2179  	return ""
  2180  }
  2181  
// profileTableName is the base name of the parquet table storing profiles;
// downsampled variants are named "profiles_<resolution>_<aggregation>".
const profileTableName = "profiles"
  2183  
  2184  func parseProfileTableName(n string) (profileTableKey, bool) {
  2185  	if n == profileTableName+block.ParquetSuffix {
  2186  		return profileTableKey{}, true
  2187  	}
  2188  	parts := strings.Split(strings.TrimSuffix(n, block.ParquetSuffix), "_")
  2189  	if len(parts) != 3 || parts[0] != profileTableName {
  2190  		return profileTableKey{}, false
  2191  	}
  2192  	r, err := time.ParseDuration(parts[1])
  2193  	if err != nil {
  2194  		return profileTableKey{}, false
  2195  	}
  2196  	return profileTableKey{
  2197  		resolution:  r,
  2198  		aggregation: parts[2],
  2199  	}, true
  2200  }
  2201  
// parquetReader provides access to a single parquet table of a block.
type parquetReader[P schemav1.PersisterName] struct {
	persister P               // supplies the table name (see relPath)
	file      parquetobj.File // handle to the parquet file in object storage
	meta      block.File      // metadata entry for this file within the block
	metrics   *BlocksMetrics  // taken from the context when the file is opened
}
  2208  
  2209  func (r *parquetReader[P]) open(ctx context.Context, bucketReader phlareobj.BucketReader) error {
  2210  	r.metrics = blockMetricsFromContext(ctx)
  2211  	return r.file.Open(
  2212  		ctx,
  2213  		bucketReader,
  2214  		r.meta,
  2215  		parquet.SkipBloomFilters(true), // we don't use bloom filters
  2216  		parquet.FileReadMode(parquet.ReadModeAsync),
  2217  		parquet.ReadBufferSize(parquetReadBufferSize),
  2218  	)
  2219  }
  2220  
// Close releases the underlying parquet file resources.
func (r *parquetReader[P]) Close() error {
	return r.file.Close()
}
  2224  
// relPath returns the table's file name within the block, derived from the
// persister's name plus the parquet suffix.
func (r *parquetReader[P]) relPath() string {
	return r.persister.Name() + block.ParquetSuffix
}
  2228  
  2229  func (r *parquetReader[P]) columnIter(ctx context.Context, columnName string, predicate query.Predicate, alias string) query.Iterator {
  2230  	index, _ := query.GetColumnIndexByPath(r.file.Root(), columnName)
  2231  	if index == -1 {
  2232  		return query.NewErrIterator(fmt.Errorf("column '%s' not found in parquet file '%s'", columnName, r.relPath()))
  2233  	}
  2234  	ctx = query.AddMetricsToContext(ctx, r.metrics.query)
  2235  	return query.NewSyncIterator(ctx, r.file.RowGroups(), index, columnName, 1000, predicate, alias)
  2236  }