
     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     4  package store
     6  import (
     7  	"bufio"
     8  	"bytes"
     9  	"context"
    10  	"encoding/binary"
    11  	"fmt"
    12  	"hash"
    13  	"io"
    14  	"math"
    15  	"os"
    16  	"path"
    17  	"path/filepath"
    18  	"sort"
    19  	"strings"
    20  	"sync"
    21  	"time"
    23  	""
    24  	""
    25  	""
    26  	""
    27  	""
    28  	""
    29  	""
    30  	""
    31  	""
    32  	""
    33  	""
    34  	""
    35  	""
    36  	""
    37  	""
    38  	""
    39  	""
    40  	""
    41  	""
    42  	""
    43  	""
    45  	""
    47  	""
    48  	""
    49  	""
    50  	""
    51  	""
    52  	""
    53  	""
    54  	""
    55  	""
    56  	""
    57  	""
    58  	storecache ""
    59  	""
    60  	""
    61  	""
    62  	""
    63  	""
    64  	""
    65  )
const (
	// MaxSamplesPerChunk is approximately the max number of samples that we may have in any given chunk. This is needed
	// for precalculating the number of samples that we may have to retrieve and decode for any given query
	// without downloading them. Please take a look at https://github.com/prometheus/tsdb/pull/397 to know
	// where this number comes from. Long story short: TSDB is made in such a way, and it is made in such a way
	// because you barely get any improvements in compression when the number of samples is beyond this.
	// Take a look at Figure 6 in this whitepaper: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf.
	MaxSamplesPerChunk = 120
	// EstimatedMaxChunkSize is the estimated upper bound, in bytes, of a typical chunk. This can be exceeded
	// though in very rare (valid) cases.
	// EstimatedMaxSeriesSize (64 KiB) is presumably the analogous estimate for a single series entry in the
	// index -- NOTE(review): confirm against the code that consumes it.
	EstimatedMaxChunkSize  = 16000
	EstimatedMaxSeriesSize = 64 * 1024
	// Bounds of the chunk bytes pool buckets.
	// Relatively large in order to reduce memory waste, yet small enough to avoid excessive allocations.
	chunkBytesPoolMinSize = 64 * 1024        // 64 KiB
	chunkBytesPoolMaxSize = 64 * 1024 * 1024 // 64 MiB

	// CompatibilityTypeLabelName is an artificial label that Store Gateway can optionally advertise. This is required for compatibility
	// with pre v0.8.0 Queriers. Previous Queriers were strict about duplicated external labels of all StoreAPIs that had any labels.
	// Now, with newer Store Gateways advertising all the external labels they have access to, there was a simple case where
	// the Querier was blocking the Store Gateway as a duplicate of a sidecar.
	//
	// Newer Queriers are not strict; the duplicated external labels check is not there anymore.
	// Additionally, newer Queriers remove/ignore this exact label from UI and querying.
	//
	// This label name is intentionally against Prometheus label style.
	// TODO(bwplotka): Remove it at some point.
	CompatibilityTypeLabelName = "@thanos_compatibility_store_type"

	// DefaultPostingOffsetInMemorySampling represents the default value for --store.index-header-posting-offsets-in-mem-sampling.
	// The value 32 is chosen as it's a good balance for common setups: sampling that is not too large (too many CPU cycles) and
	// not too small (too much memory).
	DefaultPostingOffsetInMemorySampling = 32

	// PartitionerMaxGapSize is 512 KiB.
	// NOTE(review): presumably the largest gap between byte ranges that a Partitioner still merges into one
	// request -- confirm against the Partitioner implementation.
	PartitionerMaxGapSize = 512 * 1024

	// Labels for metrics.
	labelEncode = "encode"
	labelDecode = "decode"

	// minBlockSyncConcurrency is the lowest block sync concurrency accepted by BucketStore.validate.
	minBlockSyncConcurrency = 1

	// enableChunkHashCalculation is the default NewBucketStore assigns to BucketStore.enableChunkHashCalculation;
	// it can be overridden with WithChunkHashCalculation.
	enableChunkHashCalculation = true

	// SeriesBatchSize is the default batch size when fetching series from object storage.
	SeriesBatchSize = 10000
)
var (
	// errBlockSyncConcurrencyNotValid is returned by BucketStore.validate when the configured
	// block sync concurrency is lower than minBlockSyncConcurrency.
	errBlockSyncConcurrencyNotValid = errors.New("the block sync concurrency must be equal or greater than 1.")
	// hashPool hands out xxhash hashers so that they can be reused instead of being allocated per use.
	hashPool                        = sync.Pool{New: func() interface{} { return xxhash.New() }}
)
// bucketStoreMetrics groups all Prometheus collectors exposed by BucketStore.
// The collectors are created and registered by newBucketStoreMetrics.
type bucketStoreMetrics struct {
	// Block lifecycle, per-request series statistics and refetch counters.
	blocksLoaded          prometheus.Gauge
	blockLoads            prometheus.Counter
	blockLoadFailures     prometheus.Counter
	lastLoadedBlock       prometheus.Gauge
	blockDrops            prometheus.Counter
	blockDropFailures     prometheus.Counter
	seriesDataTouched     *prometheus.HistogramVec
	seriesDataFetched     *prometheus.HistogramVec
	seriesDataSizeTouched *prometheus.HistogramVec
	seriesDataSizeFetched *prometheus.HistogramVec
	seriesBlocksQueried   prometheus.Histogram
	seriesGetAllDuration  prometheus.Histogram
	seriesMergeDuration   prometheus.Histogram
	resultSeriesCount     prometheus.Histogram
	chunkSizeBytes        prometheus.Histogram
	postingsSizeBytes     prometheus.Histogram
	queriesDropped        *prometheus.CounterVec
	seriesRefetches       prometheus.Counter
	chunkRefetches        prometheus.Counter
	emptyPostingCount     prometheus.Counter

	// Postings compression for the index cache. The vectors are labelled by "op"
	// (labelEncode / labelDecode).
	cachedPostingsCompressions           *prometheus.CounterVec
	cachedPostingsCompressionErrors      *prometheus.CounterVec
	cachedPostingsCompressionTimeSeconds *prometheus.CounterVec
	cachedPostingsOriginalSizeBytes      prometheus.Counter
	cachedPostingsCompressedSizeBytes    prometheus.Counter

	// Fetch latencies for series, postings and chunks.
	seriesFetchDuration   prometheus.Histogram
	postingsFetchDuration prometheus.Histogram
	chunkFetchDuration    prometheus.Histogram
}
   151  func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics {
   152  	var m bucketStoreMetrics
   154  	m.blockLoads = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   155  		Name: "thanos_bucket_store_block_loads_total",
   156  		Help: "Total number of remote block loading attempts.",
   157  	})
   158  	m.blockLoadFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   159  		Name: "thanos_bucket_store_block_load_failures_total",
   160  		Help: "Total number of failed remote block loading attempts.",
   161  	})
   162  	m.blockDrops = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   163  		Name: "thanos_bucket_store_block_drops_total",
   164  		Help: "Total number of local blocks that were dropped.",
   165  	})
   166  	m.blockDropFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   167  		Name: "thanos_bucket_store_block_drop_failures_total",
   168  		Help: "Total number of local blocks that failed to be dropped.",
   169  	})
   170  	m.blocksLoaded = promauto.With(reg).NewGauge(prometheus.GaugeOpts{
   171  		Name: "thanos_bucket_store_blocks_loaded",
   172  		Help: "Number of currently loaded blocks.",
   173  	})
   174  	m.lastLoadedBlock = promauto.With(reg).NewGauge(prometheus.GaugeOpts{
   175  		Name: "thanos_bucket_store_blocks_last_loaded_timestamp_seconds",
   176  		Help: "Timestamp when last block got loaded.",
   177  	})
   179  	m.seriesDataTouched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
   180  		Name:    "thanos_bucket_store_series_data_touched",
   181  		Help:    "Number of items of a data type touched to fulfill a single Store API series request.",
   182  		Buckets: prometheus.ExponentialBuckets(200, 2, 15),
   183  	}, []string{"data_type"})
   184  	m.seriesDataFetched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
   185  		Name:    "thanos_bucket_store_series_data_fetched",
   186  		Help:    "Number of items of a data type retrieved to fulfill a single Store API series request.",
   187  		Buckets: prometheus.ExponentialBuckets(200, 2, 15),
   188  	}, []string{"data_type"})
   190  	m.seriesDataSizeTouched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
   191  		Name:    "thanos_bucket_store_series_data_size_touched_bytes",
   192  		Help:    "Total size of items of a data type touched to fulfill a single Store API series request in Bytes.",
   193  		Buckets: prometheus.ExponentialBuckets(1024, 2, 15),
   194  	}, []string{"data_type"})
   195  	m.seriesDataSizeFetched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
   196  		Name:    "thanos_bucket_store_series_data_size_fetched_bytes",
   197  		Help:    "Total size of items of a data type fetched to fulfill a single Store API series request in Bytes.",
   198  		Buckets: prometheus.ExponentialBuckets(1024, 2, 15),
   199  	}, []string{"data_type"})
   201  	m.seriesBlocksQueried = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   202  		Name:    "thanos_bucket_store_series_blocks_queried",
   203  		Help:    "Number of blocks in a bucket store that were touched to satisfy a query.",
   204  		Buckets: prometheus.ExponentialBuckets(1, 2, 10),
   205  	})
   206  	m.seriesGetAllDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   207  		Name:    "thanos_bucket_store_series_get_all_duration_seconds",
   208  		Help:    "Time it takes until all per-block prepares and loads for a query are finished.",
   209  		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
   210  	})
   211  	m.seriesMergeDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   212  		Name:    "thanos_bucket_store_series_merge_duration_seconds",
   213  		Help:    "Time it takes to merge sub-results from all queried blocks into a single result.",
   214  		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
   215  	})
   216  	m.resultSeriesCount = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   217  		Name:    "thanos_bucket_store_series_result_series",
   218  		Help:    "Number of series observed in the final result of a query.",
   219  		Buckets: prometheus.ExponentialBuckets(1, 2, 15),
   220  	})
   222  	m.chunkSizeBytes = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   223  		Name: "thanos_bucket_store_sent_chunk_size_bytes",
   224  		Help: "Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier.",
   225  		Buckets: []float64{
   226  			32, 256, 512, 1024, 32 * 1024, 256 * 1024, 512 * 1024, 1024 * 1024, 32 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024,
   227  		},
   228  	})
   230  	m.postingsSizeBytes = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   231  		Name: "thanos_bucket_store_postings_size_bytes",
   232  		Help: "Size in bytes of the postings for a single series call.",
   233  		Buckets: []float64{
   234  			32, 256, 512, 1024, 32 * 1024, 256 * 1024, 512 * 1024, 1024 * 1024, 32 * 1024 * 1024, 128 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024, 768 * 1024 * 1024, 1024 * 1024 * 1024,
   235  		},
   236  	})
   238  	m.queriesDropped = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   239  		Name: "thanos_bucket_store_queries_dropped_total",
   240  		Help: "Number of queries that were dropped due to the limit.",
   241  	}, []string{"reason"})
   242  	m.seriesRefetches = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   243  		Name: "thanos_bucket_store_series_refetches_total",
   244  		Help: "Total number of cases where configured estimated series bytes was not enough was to fetch series from index, resulting in refetch.",
   245  	})
   246  	m.chunkRefetches = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   247  		Name: "thanos_bucket_store_chunk_refetches_total",
   248  		Help: "Total number of cases where configured estimated chunk bytes was not enough was to fetch chunks from object store, resulting in refetch.",
   249  	})
   251  	m.cachedPostingsCompressions = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   252  		Name: "thanos_bucket_store_cached_postings_compressions_total",
   253  		Help: "Number of postings compressions before storing to index cache.",
   254  	}, []string{"op"})
   255  	m.cachedPostingsCompressions.WithLabelValues(labelEncode)
   256  	m.cachedPostingsCompressions.WithLabelValues(labelDecode)
   258  	m.cachedPostingsCompressionErrors = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   259  		Name: "thanos_bucket_store_cached_postings_compression_errors_total",
   260  		Help: "Number of postings compression errors.",
   261  	}, []string{"op"})
   262  	m.cachedPostingsCompressionErrors.WithLabelValues(labelEncode)
   263  	m.cachedPostingsCompressionErrors.WithLabelValues(labelDecode)
   265  	m.cachedPostingsCompressionTimeSeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   266  		Name: "thanos_bucket_store_cached_postings_compression_time_seconds_total",
   267  		Help: "Time spent compressing postings before storing them into postings cache.",
   268  	}, []string{"op"})
   269  	m.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelEncode)
   270  	m.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelDecode)
   272  	m.cachedPostingsOriginalSizeBytes = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   273  		Name: "thanos_bucket_store_cached_postings_original_size_bytes_total",
   274  		Help: "Original size of postings stored into cache.",
   275  	})
   276  	m.cachedPostingsCompressedSizeBytes = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   277  		Name: "thanos_bucket_store_cached_postings_compressed_size_bytes_total",
   278  		Help: "Compressed size of postings stored into cache.",
   279  	})
   281  	m.seriesFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   282  		Name:    "thanos_bucket_store_series_fetch_duration_seconds",
   283  		Help:    "The time it takes to fetch series to respond to a request sent to a store gateway. It includes both the time to fetch it from the cache and from storage in case of cache misses.",
   284  		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
   285  	})
   287  	m.postingsFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   288  		Name:    "thanos_bucket_store_postings_fetch_duration_seconds",
   289  		Help:    "The time it takes to fetch postings to respond to a request sent to a store gateway. It includes both the time to fetch it from the cache and from storage in case of cache misses.",
   290  		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
   291  	})
   293  	m.chunkFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   294  		Name:    "thanos_bucket_store_chunks_fetch_duration_seconds",
   295  		Help:    "The total time spent fetching chunks within a single request a store gateway.",
   296  		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
   297  	})
   299  	m.emptyPostingCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   300  		Name: "thanos_bucket_store_empty_postings_total",
   301  		Help: "Total number of empty postings when fetching block series.",
   302  	})
   304  	return &m
   305  }
   307  // FilterConfig is a configuration, which Store uses for filtering metrics based on time.
   308  type FilterConfig struct {
   309  	MinTime, MaxTime model.TimeOrDurationValue
   310  }
// BlockEstimator computes a uint64 estimate from a block's metadata.
// BucketStore carries two of them (blockEstimatedMaxSeriesFunc and blockEstimatedMaxChunkFunc)
// and hands both to newBucketBlock for every block it loads.
type BlockEstimator func(meta metadata.Meta) uint64
// BucketStore implements the store API backed by a bucket. It loads all index
// files to local disk.
//
// NOTE: Bucket store reencodes postings using diff+varint+snappy when storing to cache.
// This makes them smaller, but takes extra CPU and memory.
// When used with in-memory cache, memory usage should decrease overall, thanks to postings being smaller.
type BucketStore struct {
	// Core dependencies. dir is the local root directory under which a
	// sub-directory per block ULID is kept; an empty dir disables local persistence.
	logger          log.Logger
	reg             prometheus.Registerer // TODO(metalmatze) remove and add via BucketStoreOption
	metrics         *bucketStoreMetrics
	bkt             objstore.InstrumentedBucketReader
	fetcher         block.MetadataFetcher
	dir             string
	indexCache      storecache.IndexCache
	indexReaderPool *indexheader.ReaderPool
	buffers         sync.Pool
	chunkPool       pool.Bytes
	seriesBatchSize int

	// mtx guards blocks, blockSets and advLabelSets.
	// blockSets holds sets of blocks that have the same labels. They are indexed by a hash over their label set.
	mtx       sync.RWMutex
	blocks    map[ulid.ULID]*bucketBlock
	blockSets map[uint64]*bucketBlockSet

	// debugLogging enables additional verbose logging.
	debugLogging bool
	// Number of goroutines to use when syncing blocks from object storage.
	blockSyncConcurrency int

	// Query gate which limits the maximum amount of concurrent queries.
	queryGate gate.Gate

	// chunksLimiterFactory creates a new limiter used to limit the number of chunks fetched by each Series() call.
	chunksLimiterFactory ChunksLimiterFactory
	// seriesLimiterFactory creates a new limiter used to limit the number of touched series by each Series() call,
	// or LabelName and LabelValues calls when used with matchers.
	seriesLimiterFactory SeriesLimiterFactory

	// bytesLimiterFactory creates a new limiter used to limit the amount of bytes fetched/touched by each Series() call.
	bytesLimiterFactory BytesLimiterFactory
	partitioner         Partitioner

	// filterConfig optionally clamps the advertised time range (may be nil).
	// advLabelSets is the sorted list of advertised label sets, rebuilt at the end of SyncBlocks.
	// enableCompatibilityLabel makes LabelSet append the CompatibilityTypeLabelName label set.
	filterConfig             *FilterConfig
	advLabelSets             []labelpb.ZLabelSet
	enableCompatibilityLabel bool

	// Sampling rate of posting offset entries kept in heap memory: one entry out of every
	// postingOffsetsInMemSampling is retained. Default in Prometheus is 32.
	postingOffsetsInMemSampling int

	// Enables hints in the Series() response.
	enableSeriesResponseHints bool

	// enableChunkHashCalculation defaults to the package constant of the same name and can be
	// overridden with WithChunkHashCalculation.
	// NOTE(review): presumably toggles hashing of returned chunks -- confirm at the use site.
	enableChunkHashCalculation bool

	// Optional estimators passed to every bucketBlock created by addBlock.
	blockEstimatedMaxSeriesFunc BlockEstimator
	blockEstimatedMaxChunkFunc  BlockEstimator
}
   372  func (s *BucketStore) validate() error {
   373  	if s.blockSyncConcurrency < minBlockSyncConcurrency {
   374  		return errBlockSyncConcurrencyNotValid
   375  	}
   376  	return nil
   377  }
   379  type noopCache struct{}
   381  func (noopCache) StorePostings(ulid.ULID, labels.Label, []byte) {}
   382  func (noopCache) FetchMultiPostings(_ context.Context, _ ulid.ULID, keys []labels.Label) (map[labels.Label][]byte, []labels.Label) {
   383  	return map[labels.Label][]byte{}, keys
   384  }
   386  func (noopCache) StoreExpandedPostings(_ ulid.ULID, _ []*labels.Matcher, _ []byte) {}
   387  func (noopCache) FetchExpandedPostings(_ context.Context, _ ulid.ULID, _ []*labels.Matcher) ([]byte, bool) {
   388  	return []byte{}, false
   389  }
   391  func (noopCache) StoreSeries(ulid.ULID, storage.SeriesRef, []byte) {}
   392  func (noopCache) FetchMultiSeries(_ context.Context, _ ulid.ULID, ids []storage.SeriesRef) (map[storage.SeriesRef][]byte, []storage.SeriesRef) {
   393  	return map[storage.SeriesRef][]byte{}, ids
   394  }
   396  // BucketStoreOption are functions that configure BucketStore.
   397  type BucketStoreOption func(s *BucketStore)
   399  // WithLogger sets the BucketStore logger to the one you pass.
   400  func WithLogger(logger log.Logger) BucketStoreOption {
   401  	return func(s *BucketStore) {
   402  		s.logger = logger
   403  	}
   404  }
   406  // WithRegistry sets a registry that BucketStore uses to register metrics with.
   407  func WithRegistry(reg prometheus.Registerer) BucketStoreOption {
   408  	return func(s *BucketStore) {
   409  		s.reg = reg
   410  	}
   411  }
   413  // WithIndexCache sets a indexCache to use instead of a noopCache.
   414  func WithIndexCache(cache storecache.IndexCache) BucketStoreOption {
   415  	return func(s *BucketStore) {
   416  		s.indexCache = cache
   417  	}
   418  }
   420  // WithQueryGate sets a queryGate to use instead of a noopGate.
   421  func WithQueryGate(queryGate gate.Gate) BucketStoreOption {
   422  	return func(s *BucketStore) {
   423  		s.queryGate = queryGate
   424  	}
   425  }
   427  // WithChunkPool sets a pool.Bytes to use for chunks.
   428  func WithChunkPool(chunkPool pool.Bytes) BucketStoreOption {
   429  	return func(s *BucketStore) {
   430  		s.chunkPool = chunkPool
   431  	}
   432  }
   434  // WithFilterConfig sets a filter which Store uses for filtering metrics based on time.
   435  func WithFilterConfig(filter *FilterConfig) BucketStoreOption {
   436  	return func(s *BucketStore) {
   437  		s.filterConfig = filter
   438  	}
   439  }
   441  // WithDebugLogging enables debug logging.
   442  func WithDebugLogging() BucketStoreOption {
   443  	return func(s *BucketStore) {
   444  		s.debugLogging = true
   445  	}
   446  }
   448  func WithChunkHashCalculation(enableChunkHashCalculation bool) BucketStoreOption {
   449  	return func(s *BucketStore) {
   450  		s.enableChunkHashCalculation = enableChunkHashCalculation
   451  	}
   452  }
   454  func WithSeriesBatchSize(seriesBatchSize int) BucketStoreOption {
   455  	return func(s *BucketStore) {
   456  		s.seriesBatchSize = seriesBatchSize
   457  	}
   458  }
   460  func WithBlockEstimatedMaxSeriesFunc(f BlockEstimator) BucketStoreOption {
   461  	return func(s *BucketStore) {
   462  		s.blockEstimatedMaxSeriesFunc = f
   463  	}
   464  }
   466  func WithBlockEstimatedMaxChunkFunc(f BlockEstimator) BucketStoreOption {
   467  	return func(s *BucketStore) {
   468  		s.blockEstimatedMaxChunkFunc = f
   469  	}
   470  }
   472  // NewBucketStore creates a new bucket backed store that implements the store API against
   473  // an object store bucket. It is optimized to work against high latency backends.
   474  func NewBucketStore(
   475  	bkt objstore.InstrumentedBucketReader,
   476  	fetcher block.MetadataFetcher,
   477  	dir string,
   478  	chunksLimiterFactory ChunksLimiterFactory,
   479  	seriesLimiterFactory SeriesLimiterFactory,
   480  	bytesLimiterFactory BytesLimiterFactory,
   481  	partitioner Partitioner,
   482  	blockSyncConcurrency int,
   483  	enableCompatibilityLabel bool,
   484  	postingOffsetsInMemSampling int,
   485  	enableSeriesResponseHints bool, // TODO(pracucci) Thanos 0.12 and below doesn't gracefully handle new fields in SeriesResponse. Drop this flag and always enable hints once we can drop backward compatibility.
   486  	lazyIndexReaderEnabled bool,
   487  	lazyIndexReaderIdleTimeout time.Duration,
   488  	options ...BucketStoreOption,
   489  ) (*BucketStore, error) {
   490  	s := &BucketStore{
   491  		logger:     log.NewNopLogger(),
   492  		bkt:        bkt,
   493  		fetcher:    fetcher,
   494  		dir:        dir,
   495  		indexCache: noopCache{},
   496  		buffers: sync.Pool{New: func() interface{} {
   497  			b := make([]byte, 0, initialBufSize)
   498  			return &b
   499  		}},
   500  		chunkPool:                   pool.NoopBytes{},
   501  		blocks:                      map[ulid.ULID]*bucketBlock{},
   502  		blockSets:                   map[uint64]*bucketBlockSet{},
   503  		blockSyncConcurrency:        blockSyncConcurrency,
   504  		queryGate:                   gate.NewNoop(),
   505  		chunksLimiterFactory:        chunksLimiterFactory,
   506  		seriesLimiterFactory:        seriesLimiterFactory,
   507  		bytesLimiterFactory:         bytesLimiterFactory,
   508  		partitioner:                 partitioner,
   509  		enableCompatibilityLabel:    enableCompatibilityLabel,
   510  		postingOffsetsInMemSampling: postingOffsetsInMemSampling,
   511  		enableSeriesResponseHints:   enableSeriesResponseHints,
   512  		enableChunkHashCalculation:  enableChunkHashCalculation,
   513  		seriesBatchSize:             SeriesBatchSize,
   514  	}
   516  	for _, option := range options {
   517  		option(s)
   518  	}
   520  	// Depend on the options
   521  	indexReaderPoolMetrics := indexheader.NewReaderPoolMetrics(extprom.WrapRegistererWithPrefix("thanos_bucket_store_", s.reg))
   522  	s.indexReaderPool = indexheader.NewReaderPool(s.logger, lazyIndexReaderEnabled, lazyIndexReaderIdleTimeout, indexReaderPoolMetrics)
   523  	s.metrics = newBucketStoreMetrics(s.reg) // TODO(metalmatze): Might be possible via Option too
   525  	if err := s.validate(); err != nil {
   526  		return nil, errors.Wrap(err, "validate config")
   527  	}
   529  	if dir == "" {
   530  		return s, nil
   531  	}
   533  	if err := os.MkdirAll(dir, 0750); err != nil {
   534  		return nil, errors.Wrap(err, "create dir")
   535  	}
   537  	return s, nil
   538  }
   540  // Close the store.
   541  func (s *BucketStore) Close() (err error) {
   542  	s.mtx.Lock()
   543  	defer s.mtx.Unlock()
   545  	for _, b := range s.blocks {
   546  		runutil.CloseWithErrCapture(&err, b, "closing Bucket Block")
   547  	}
   549  	s.indexReaderPool.Close()
   550  	return err
   551  }
   553  // SyncBlocks synchronizes the stores state with the Bucket bucket.
   554  // It will reuse disk space as persistent cache based on s.dir param.
   555  func (s *BucketStore) SyncBlocks(ctx context.Context) error {
   556  	metas, _, metaFetchErr := s.fetcher.Fetch(ctx)
   557  	// For partial view allow adding new blocks at least.
   558  	if metaFetchErr != nil && metas == nil {
   559  		return metaFetchErr
   560  	}
   562  	var wg sync.WaitGroup
   563  	blockc := make(chan *metadata.Meta)
   565  	for i := 0; i < s.blockSyncConcurrency; i++ {
   566  		wg.Add(1)
   567  		go func() {
   568  			for meta := range blockc {
   569  				if err := s.addBlock(ctx, meta); err != nil {
   570  					continue
   571  				}
   572  			}
   573  			wg.Done()
   574  		}()
   575  	}
   577  	for id, meta := range metas {
   578  		if b := s.getBlock(id); b != nil {
   579  			continue
   580  		}
   581  		select {
   582  		case <-ctx.Done():
   583  		case blockc <- meta:
   584  		}
   585  	}
   587  	close(blockc)
   588  	wg.Wait()
   590  	if metaFetchErr != nil {
   591  		return metaFetchErr
   592  	}
   594  	// Drop all blocks that are no longer present in the bucket.
   595  	for id := range s.blocks {
   596  		if _, ok := metas[id]; ok {
   597  			continue
   598  		}
   599  		if err := s.removeBlock(id); err != nil {
   600  			level.Warn(s.logger).Log("msg", "drop of outdated block failed", "block", id, "err", err)
   601  			s.metrics.blockDropFailures.Inc()
   602  		}
   603  		level.Info(s.logger).Log("msg", "dropped outdated block", "block", id)
   604  		s.metrics.blockDrops.Inc()
   605  	}
   607  	// Sync advertise labels.
   608  	var storeLabels labels.Labels
   609  	s.mtx.Lock()
   610  	s.advLabelSets = make([]labelpb.ZLabelSet, 0, len(s.advLabelSets))
   611  	for _, bs := range s.blockSets {
   612  		storeLabels = storeLabels[:0]
   613  		s.advLabelSets = append(s.advLabelSets, labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(append(storeLabels, bs.labels...))})
   614  	}
   615  	sort.Slice(s.advLabelSets, func(i, j int) bool {
   616  		return strings.Compare(s.advLabelSets[i].String(), s.advLabelSets[j].String()) < 0
   617  	})
   618  	s.mtx.Unlock()
   619  	return nil
   620  }
   622  // InitialSync perform blocking sync with extra step at the end to delete locally saved blocks that are no longer
   623  // present in the bucket. The mismatch of these can only happen between restarts, so we can do that only once per startup.
   624  func (s *BucketStore) InitialSync(ctx context.Context) error {
   625  	if err := s.SyncBlocks(ctx); err != nil {
   626  		return errors.Wrap(err, "sync block")
   627  	}
   629  	if s.dir == "" {
   630  		return nil
   631  	}
   633  	fis, err := os.ReadDir(s.dir)
   634  	if err != nil {
   635  		return errors.Wrap(err, "read dir")
   636  	}
   637  	names := make([]string, 0, len(fis))
   638  	for _, fi := range fis {
   639  		names = append(names, fi.Name())
   640  	}
   641  	for _, n := range names {
   642  		id, ok := block.IsBlockDir(n)
   643  		if !ok {
   644  			continue
   645  		}
   646  		if b := s.getBlock(id); b != nil {
   647  			continue
   648  		}
   650  		// No such block loaded, remove the local dir.
   651  		if err := os.RemoveAll(path.Join(s.dir, id.String())); err != nil {
   652  			level.Warn(s.logger).Log("msg", "failed to remove block which is not needed", "err", err)
   653  		}
   654  	}
   656  	return nil
   657  }
   659  func (s *BucketStore) getBlock(id ulid.ULID) *bucketBlock {
   660  	s.mtx.RLock()
   661  	defer s.mtx.RUnlock()
   662  	return s.blocks[id]
   663  }
   665  func (s *BucketStore) addBlock(ctx context.Context, meta *metadata.Meta) (err error) {
   666  	var dir string
   667  	if s.dir != "" {
   668  		dir = filepath.Join(s.dir, meta.ULID.String())
   669  	}
   670  	start := time.Now()
   672  	level.Debug(s.logger).Log("msg", "loading new block", "id", meta.ULID)
   673  	defer func() {
   674  		if err != nil {
   675  			s.metrics.blockLoadFailures.Inc()
   676  			if dir != "" {
   677  				if err2 := os.RemoveAll(dir); err2 != nil {
   678  					level.Warn(s.logger).Log("msg", "failed to remove block we cannot load", "err", err2)
   679  				}
   680  			}
   681  			level.Warn(s.logger).Log("msg", "loading block failed", "elapsed", time.Since(start), "id", meta.ULID, "err", err)
   682  		} else {
   683  			level.Info(s.logger).Log("msg", "loaded new block", "elapsed", time.Since(start), "id", meta.ULID)
   684  		}
   685  	}()
   686  	s.metrics.blockLoads.Inc()
   688  	lset := labels.FromMap(meta.Thanos.Labels)
   689  	h := lset.Hash()
   691  	indexHeaderReader, err := s.indexReaderPool.NewBinaryReader(
   692  		ctx,
   693  		s.logger,
   694  		s.bkt,
   695  		s.dir,
   696  		meta.ULID,
   697  		s.postingOffsetsInMemSampling,
   698  	)
   699  	if err != nil {
   700  		return errors.Wrap(err, "create index header reader")
   701  	}
   702  	defer func() {
   703  		if err != nil {
   704  			runutil.CloseWithErrCapture(&err, indexHeaderReader, "index-header")
   705  		}
   706  	}()
   708  	b, err := newBucketBlock(
   709  		ctx,
   710  		log.With(s.logger, "block", meta.ULID),
   711  		s.metrics,
   712  		meta,
   713  		s.bkt,
   714  		dir,
   715  		s.indexCache,
   716  		s.chunkPool,
   717  		indexHeaderReader,
   718  		s.partitioner,
   719  		s.blockEstimatedMaxSeriesFunc,
   720  		s.blockEstimatedMaxChunkFunc,
   721  	)
   722  	if err != nil {
   723  		return errors.Wrap(err, "new bucket block")
   724  	}
   725  	defer func() {
   726  		if err != nil {
   727  			runutil.CloseWithErrCapture(&err, b, "index-header")
   728  		}
   729  	}()
   731  	s.mtx.Lock()
   732  	defer s.mtx.Unlock()
   734  	sort.Sort(lset)
   736  	set, ok := s.blockSets[h]
   737  	if !ok {
   738  		set = newBucketBlockSet(lset)
   739  		s.blockSets[h] = set
   740  	}
   742  	if err = set.add(b); err != nil {
   743  		return errors.Wrap(err, "add block to set")
   744  	}
   745  	s.blocks[b.meta.ULID] = b
   747  	s.metrics.blocksLoaded.Inc()
   748  	s.metrics.lastLoadedBlock.SetToCurrentTime()
   749  	return nil
   750  }
   752  func (s *BucketStore) removeBlock(id ulid.ULID) error {
   753  	s.mtx.Lock()
   754  	b, ok := s.blocks[id]
   755  	if ok {
   756  		lset := labels.FromMap(b.meta.Thanos.Labels)
   757  		s.blockSets[lset.Hash()].remove(id)
   758  		delete(s.blocks, id)
   759  	}
   760  	s.mtx.Unlock()
   762  	if !ok {
   763  		return nil
   764  	}
   766  	s.metrics.blocksLoaded.Dec()
   767  	if err := b.Close(); err != nil {
   768  		return errors.Wrap(err, "close block")
   769  	}
   771  	if b.dir == "" {
   772  		return nil
   773  	}
   775  	return os.RemoveAll(b.dir)
   776  }
   778  // TimeRange returns the minimum and maximum timestamp of data available in the store.
   779  func (s *BucketStore) TimeRange() (mint, maxt int64) {
   780  	s.mtx.RLock()
   781  	defer s.mtx.RUnlock()
   783  	mint = math.MaxInt64
   784  	maxt = math.MinInt64
   786  	for _, b := range s.blocks {
   787  		if b.meta.MinTime < mint {
   788  			mint = b.meta.MinTime
   789  		}
   790  		if b.meta.MaxTime > maxt {
   791  			maxt = b.meta.MaxTime
   792  		}
   793  	}
   795  	mint = s.limitMinTime(mint)
   796  	maxt = s.limitMaxTime(maxt)
   798  	return mint, maxt
   799  }
   801  // TSDBInfos returns a list of infopb.TSDBInfos for blocks in the bucket store.
   802  func (s *BucketStore) TSDBInfos() []infopb.TSDBInfo {
   803  	s.mtx.RLock()
   804  	defer s.mtx.RUnlock()
   806  	infos := make([]infopb.TSDBInfo, 0, len(s.blocks))
   807  	for _, b := range s.blocks {
   808  		infos = append(infos, infopb.TSDBInfo{
   809  			Labels: labelpb.ZLabelSet{
   810  				Labels: labelpb.ZLabelsFromPromLabels(labels.FromMap(b.meta.Thanos.Labels)),
   811  			},
   812  			MinTime: b.meta.MinTime,
   813  			MaxTime: b.meta.MaxTime,
   814  		})
   815  	}
   817  	return infos
   818  }
   820  func (s *BucketStore) LabelSet() []labelpb.ZLabelSet {
   821  	s.mtx.RLock()
   822  	labelSets := s.advLabelSets
   823  	s.mtx.RUnlock()
   825  	if s.enableCompatibilityLabel && len(labelSets) > 0 {
   826  		labelSets = append(labelSets, labelpb.ZLabelSet{Labels: []labelpb.ZLabel{{Name: CompatibilityTypeLabelName, Value: "store"}}})
   827  	}
   829  	return labelSets
   830  }
   832  // Info implements the storepb.StoreServer interface.
   833  func (s *BucketStore) Info(context.Context, *storepb.InfoRequest) (*storepb.InfoResponse, error) {
   834  	mint, maxt := s.TimeRange()
   835  	res := &storepb.InfoResponse{
   836  		StoreType: component.Store.ToProto(),
   837  		MinTime:   mint,
   838  		MaxTime:   maxt,
   839  		LabelSets: s.LabelSet(),
   840  	}
   842  	return res, nil
   843  }
   845  func (s *BucketStore) limitMinTime(mint int64) int64 {
   846  	if s.filterConfig == nil {
   847  		return mint
   848  	}
   850  	filterMinTime := s.filterConfig.MinTime.PrometheusTimestamp()
   852  	if mint < filterMinTime {
   853  		return filterMinTime
   854  	}
   856  	return mint
   857  }
   859  func (s *BucketStore) limitMaxTime(maxt int64) int64 {
   860  	if s.filterConfig == nil {
   861  		return maxt
   862  	}
   864  	filterMaxTime := s.filterConfig.MaxTime.PrometheusTimestamp()
   866  	if maxt > filterMaxTime {
   867  		maxt = filterMaxTime
   868  	}
   870  	return maxt
   871  }
// seriesEntry is a single series produced by a blockSeriesClient batch.
type seriesEntry struct {
	// lset is the series' label set as built in nextBatch (series labels
	// extended with the block's external labels).
	lset labels.Labels
	// refs holds the chunk references scheduled for loading; nextBatch appends
	// to refs and chks in lockstep, so index j refers to the same chunk in both.
	refs []chunks.ChunkRef
	// chks holds the chunks returned to the caller. nextBatch pre-fills each
	// element's MinTime/MaxTime from the chunk meta.
	chks []storepb.AggrChunk
}
// blockSeriesClient is a storepb.Store_SeriesClient for a
// single TSDB block in object storage.
//
// Usage as seen in this file: construct it with newBlockSeriesClient, call
// ExpandPostings once, then call Recv until it returns io.EOF, and finally Close.
type blockSeriesClient struct {
	grpc.ClientStream
	// ctx is used for index/chunk reads and is checked for cancellation while
	// iterating a batch.
	ctx             context.Context
	logger          log.Logger
	// extLset is the block's external label set, with extLsetToRemove already
	// stripped when that set is non-nil.
	extLset         labels.Labels
	// extLsetToRemove names labels that are removed from every returned series.
	extLsetToRemove map[string]struct{}

	// mint and maxt are taken from the request's MinTime/MaxTime.
	mint           int64
	maxt           int64
	indexr         *bucketIndexReader
	// chunkr is nil when skipChunks is set.
	chunkr         *bucketChunkReader
	loadAggregates []storepb.Aggr
	chunksLimiter  ChunksLimiter
	bytesLimiter   BytesLimiter

	// skipChunks makes the client return labels only; no chunk reader is used.
	skipChunks         bool
	shardMatcher       *storepb.ShardMatcher
	calculateChunkHash bool
	// chunkFetchDuration is observed once, when Recv reaches io.EOF with a
	// chunk reader present.
	chunkFetchDuration prometheus.Histogram

	// Internal state.
	// i is the offset into postings at which the next batch starts.
	i               uint64
	// postings is the expanded postings list set by ExpandPostings.
	postings        []storage.SeriesRef
	// chkMetas, lset and symbolizedLset are scratch buffers that nextBatch hands
	// to the index reader for every series.
	chkMetas        []chunks.Meta
	lset            labels.Labels
	symbolizedLset  []symbolizedLabel
	// entries holds the series of the current batch not yet returned by Recv.
	entries         []seriesEntry
	// hasMorePostings is cleared once nextBatch finds no postings left.
	hasMorePostings bool
	// batchSize is the requested number of series per batch; ExpandPostings
	// clamps it to the number of postings and uses it to size entries.
	batchSize       int
}
   912  func newBlockSeriesClient(
   913  	ctx context.Context,
   914  	logger log.Logger,
   915  	b *bucketBlock,
   916  	req *storepb.SeriesRequest,
   917  	limiter ChunksLimiter,
   918  	bytesLimiter BytesLimiter,
   919  	shardMatcher *storepb.ShardMatcher,
   920  	calculateChunkHash bool,
   921  	batchSize int,
   922  	chunkFetchDuration prometheus.Histogram,
   923  	extLsetToRemove map[string]struct{},
   924  ) *blockSeriesClient {
   925  	var chunkr *bucketChunkReader
   926  	if !req.SkipChunks {
   927  		chunkr = b.chunkReader()
   928  	}
   930  	extLset := b.extLset
   931  	if extLsetToRemove != nil {
   932  		extLset = rmLabels(extLset.Copy(), extLsetToRemove)
   933  	}
   935  	return &blockSeriesClient{
   936  		ctx:             ctx,
   937  		logger:          logger,
   938  		extLset:         extLset,
   939  		extLsetToRemove: extLsetToRemove,
   941  		mint:               req.MinTime,
   942  		maxt:               req.MaxTime,
   943  		indexr:             b.indexReader(),
   944  		chunkr:             chunkr,
   945  		chunksLimiter:      limiter,
   946  		bytesLimiter:       bytesLimiter,
   947  		skipChunks:         req.SkipChunks,
   948  		chunkFetchDuration: chunkFetchDuration,
   950  		loadAggregates:     req.Aggregates,
   951  		shardMatcher:       shardMatcher,
   952  		calculateChunkHash: calculateChunkHash,
   953  		hasMorePostings:    true,
   954  		batchSize:          batchSize,
   955  	}
   956  }
   958  func (b *blockSeriesClient) Close() {
   959  	if !b.skipChunks {
   960  		runutil.CloseWithLogOnErr(b.logger, b.chunkr, "series block")
   961  	}
   963  	runutil.CloseWithLogOnErr(b.logger, b.indexr, "series block")
   964  }
   966  func (b *blockSeriesClient) MergeStats(stats *queryStats) *queryStats {
   967  	stats = stats.merge(b.indexr.stats)
   968  	if !b.skipChunks {
   969  		stats = stats.merge(b.chunkr.stats)
   970  	}
   971  	return stats
   972  }
   974  type sortedMatchers []*labels.Matcher
   976  func newSortedMatchers(matchers []*labels.Matcher) sortedMatchers {
   977  	sort.Slice(matchers, func(i, j int) bool {
   978  		if matchers[i].Type == matchers[j].Type {
   979  			if matchers[i].Name == matchers[j].Name {
   980  				return matchers[i].Value < matchers[j].Value
   981  			}
   982  			return matchers[i].Name < matchers[j].Name
   983  		}
   984  		return matchers[i].Type < matchers[j].Type
   985  	})
   987  	return matchers
   988  }
   990  func (b *blockSeriesClient) ExpandPostings(
   991  	matchers sortedMatchers,
   992  	seriesLimiter SeriesLimiter,
   993  ) error {
   994  	ps, err := b.indexr.ExpandedPostings(b.ctx, matchers, b.bytesLimiter)
   995  	if err != nil {
   996  		return errors.Wrap(err, "expanded matching posting")
   997  	}
   999  	if len(ps) == 0 {
  1000  		return nil
  1001  	}
  1003  	if err := seriesLimiter.Reserve(uint64(len(ps))); err != nil {
  1004  		return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded series limit: %s", err)
  1005  	}
  1007  	b.postings = ps
  1008  	if b.batchSize > len(ps) {
  1009  		b.batchSize = len(ps)
  1010  	}
  1011  	b.entries = make([]seriesEntry, 0, b.batchSize)
  1012  	return nil
  1013  }
  1015  func (b *blockSeriesClient) Recv() (*storepb.SeriesResponse, error) {
  1016  	for len(b.entries) == 0 && b.hasMorePostings {
  1017  		if err := b.nextBatch(); err != nil {
  1018  			return nil, err
  1019  		}
  1020  	}
  1022  	if len(b.entries) == 0 {
  1023  		if b.chunkr != nil {
  1024  			b.chunkFetchDuration.Observe(b.chunkr.stats.ChunksFetchDurationSum.Seconds())
  1025  		}
  1026  		return nil, io.EOF
  1027  	}
  1029  	next := b.entries[0]
  1030  	b.entries = b.entries[1:]
  1032  	return storepb.NewSeriesResponse(&storepb.Series{
  1033  		Labels: labelpb.ZLabelsFromPromLabels(next.lset),
  1034  		Chunks: next.chks,
  1035  	}), nil
  1036  }
  1038  func (b *blockSeriesClient) nextBatch() error {
  1039  	start := b.i
  1040  	end := start + SeriesBatchSize
  1041  	if end > uint64(len(b.postings)) {
  1042  		end = uint64(len(b.postings))
  1043  	}
  1044  	b.i = end
  1046  	postingsBatch := b.postings[start:end]
  1047  	if len(postingsBatch) == 0 {
  1048  		b.hasMorePostings = false
  1049  		return nil
  1050  	}
  1052  	b.indexr.reset()
  1053  	if !b.skipChunks {
  1054  		b.chunkr.reset()
  1055  	}
  1057  	if err := b.indexr.PreloadSeries(b.ctx, postingsBatch, b.bytesLimiter); err != nil {
  1058  		return errors.Wrap(err, "preload series")
  1059  	}
  1061  	b.entries = b.entries[:0]
  1062  	for i := 0; i < len(postingsBatch); i++ {
  1063  		if err := b.ctx.Err(); err != nil {
  1064  			return err
  1065  		}
  1066  		ok, err := b.indexr.LoadSeriesForTime(postingsBatch[i], &b.symbolizedLset, &b.chkMetas, b.skipChunks,, b.maxt)
  1067  		if err != nil {
  1068  			return errors.Wrap(err, "read series")
  1069  		}
  1070  		if !ok {
  1071  			continue
  1072  		}
  1074  		if err := b.indexr.LookupLabelsSymbols(b.symbolizedLset, &b.lset); err != nil {
  1075  			return errors.Wrap(err, "Lookup labels symbols")
  1076  		}
  1078  		completeLabelset := labelpb.ExtendSortedLabels(b.lset, b.extLset)
  1079  		if b.extLsetToRemove != nil {
  1080  			completeLabelset = rmLabels(completeLabelset, b.extLsetToRemove)
  1081  		}
  1083  		if !b.shardMatcher.MatchesLabels(completeLabelset) {
  1084  			continue
  1085  		}
  1087  		s := seriesEntry{lset: completeLabelset}
  1088  		if b.skipChunks {
  1089  			b.entries = append(b.entries, s)
  1090  			continue
  1091  		}
  1093  		// Schedule loading chunks.
  1094  		s.refs = make([]chunks.ChunkRef, 0, len(b.chkMetas))
  1095  		s.chks = make([]storepb.AggrChunk, 0, len(b.chkMetas))
  1097  		for j, meta := range b.chkMetas {
  1098  			if err := b.chunkr.addLoad(meta.Ref, len(b.entries), j); err != nil {
  1099  				return errors.Wrap(err, "add chunk load")
  1100  			}
  1101  			s.chks = append(s.chks, storepb.AggrChunk{
  1102  				MinTime: meta.MinTime,
  1103  				MaxTime: meta.MaxTime,
  1104  			})
  1105  			s.refs = append(s.refs, meta.Ref)
  1106  		}
  1108  		// Ensure sample limit through chunksLimiter if we return chunks.
  1109  		if err := b.chunksLimiter.Reserve(uint64(len(b.chkMetas))); err != nil {
  1110  			return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded chunks limit: %s", err)
  1111  		}
  1113  		b.entries = append(b.entries, s)
  1114  	}
  1116  	if !b.skipChunks {
  1117  		if err := b.chunkr.load(b.ctx, b.entries, b.loadAggregates, b.calculateChunkHash, b.bytesLimiter); err != nil {
  1118  			return errors.Wrap(err, "load chunks")
  1119  		}
  1120  	}
  1122  	return nil
  1123  }
  1125  func populateChunk(out *storepb.AggrChunk, in chunkenc.Chunk, aggrs []storepb.Aggr, save func([]byte) ([]byte, error), calculateChecksum bool) error {
  1126  	hasher := hashPool.Get().(hash.Hash64)
  1127  	defer hashPool.Put(hasher)
  1129  	if in.Encoding() == chunkenc.EncXOR || in.Encoding() == chunkenc.EncHistogram {
  1130  		b, err := save(in.Bytes())
  1131  		if err != nil {
  1132  			return err
  1133  		}
  1134  		out.Raw = &storepb.Chunk{
  1135  			Data: b,
  1136  			Type: storepb.Chunk_Encoding(in.Encoding() - 1),
  1137  			Hash: hashChunk(hasher, b, calculateChecksum),
  1138  		}
  1139  		return nil
  1140  	}
  1142  	if in.Encoding() != downsample.ChunkEncAggr {
  1143  		return errors.Errorf("unsupported chunk encoding %d", in.Encoding())
  1144  	}
  1146  	ac := downsample.AggrChunk(in.Bytes())
  1148  	for _, at := range aggrs {
  1149  		switch at {
  1150  		case storepb.Aggr_COUNT:
  1151  			x, err := ac.Get(downsample.AggrCount)
  1152  			if err != nil {
  1153  				return errors.Errorf("aggregate %s does not exist", downsample.AggrCount)
  1154  			}
  1155  			b, err := save(x.Bytes())
  1156  			if err != nil {
  1157  				return err
  1158  			}
  1159  			out.Count = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)}
  1160  		case storepb.Aggr_SUM:
  1161  			x, err := ac.Get(downsample.AggrSum)
  1162  			if err != nil {
  1163  				return errors.Errorf("aggregate %s does not exist", downsample.AggrSum)
  1164  			}
  1165  			b, err := save(x.Bytes())
  1166  			if err != nil {
  1167  				return err
  1168  			}
  1169  			out.Sum = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)}
  1170  		case storepb.Aggr_MIN:
  1171  			x, err := ac.Get(downsample.AggrMin)
  1172  			if err != nil {
  1173  				return errors.Errorf("aggregate %s does not exist", downsample.AggrMin)
  1174  			}
  1175  			b, err := save(x.Bytes())
  1176  			if err != nil {
  1177  				return err
  1178  			}
  1179  			out.Min = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)}
  1180  		case storepb.Aggr_MAX:
  1181  			x, err := ac.Get(downsample.AggrMax)
  1182  			if err != nil {
  1183  				return errors.Errorf("aggregate %s does not exist", downsample.AggrMax)
  1184  			}
  1185  			b, err := save(x.Bytes())
  1186  			if err != nil {
  1187  				return err
  1188  			}
  1189  			out.Max = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)}
  1190  		case storepb.Aggr_COUNTER:
  1191  			x, err := ac.Get(downsample.AggrCounter)
  1192  			if err != nil {
  1193  				return errors.Errorf("aggregate %s does not exist", downsample.AggrCounter)
  1194  			}
  1195  			b, err := save(x.Bytes())
  1196  			if err != nil {
  1197  				return err
  1198  			}
  1199  			out.Counter = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)}
  1200  		}
  1201  	}
  1202  	return nil
  1203  }
  1205  func hashChunk(hasher hash.Hash64, b []byte, doHash bool) uint64 {
  1206  	if !doHash {
  1207  		return 0
  1208  	}
  1209  	hasher.Reset()
  1210  	// Write never returns an error on the hasher implementation
  1211  	_, _ = hasher.Write(b)
  1212  	return hasher.Sum64()
  1213  }
  1215  // debugFoundBlockSetOverview logs on debug level what exactly blocks we used for query in terms of
  1216  // labels and resolution. This is important because we allow mixed resolution results, so it is quite crucial
  1217  // to be aware what exactly resolution we see on query.
  1218  // TODO(bplotka): Consider adding resolution label to all results to propagate that info to UI and Query API.
  1219  func debugFoundBlockSetOverview(logger log.Logger, mint, maxt, maxResolutionMillis int64, lset labels.Labels, bs []*bucketBlock) {
  1220  	if len(bs) == 0 {
  1221  		level.Debug(logger).Log("msg", "No block found", "mint", mint, "maxt", maxt, "lset", lset.String())
  1222  		return
  1223  	}
  1225  	var (
  1226  		parts            []string
  1227  		currRes          = int64(-1)
  1228  		currMin, currMax int64
  1229  	)
  1230  	for _, b := range bs {
  1231  		if currRes == b.meta.Thanos.Downsample.Resolution {
  1232  			currMax = b.meta.MaxTime
  1233  			continue
  1234  		}
  1236  		if currRes != -1 {
  1237  			parts = append(parts, fmt.Sprintf("Range: %d-%d Resolution: %d", currMin, currMax, currRes))
  1238  		}
  1240  		currRes = b.meta.Thanos.Downsample.Resolution
  1241  		currMin = b.meta.MinTime
  1242  		currMax = b.meta.MaxTime
  1243  	}
  1245  	parts = append(parts, fmt.Sprintf("Range: %d-%d Resolution: %d", currMin, currMax, currRes))
  1247  	level.Debug(logger).Log("msg", "Blocks source resolutions", "blocks", len(bs), "Maximum Resolution", maxResolutionMillis, "mint", mint, "maxt", maxt, "lset", lset.String(), "spans", strings.Join(parts, "\n"))
  1248  }
  1250  // Series implements the storepb.StoreServer interface.
  1251  func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store_SeriesServer) (err error) {
  1252  	srv := newFlushableServer(seriesSrv, sortingStrategyNone)
  1254  	if s.queryGate != nil {
  1255  		tracing.DoInSpan(srv.Context(), "store_query_gate_ismyturn", func(ctx context.Context) {
  1256  			err = s.queryGate.Start(srv.Context())
  1257  		})
  1258  		if err != nil {
  1259  			return errors.Wrapf(err, "failed to wait for turn")
  1260  		}
  1262  		defer s.queryGate.Done()
  1263  	}
  1265  	tenant, _ := tenancy.GetTenantFromGRPCMetadata(srv.Context())
  1266  	level.Debug(s.logger).Log("msg", "Tenant for Series request", "tenant", tenant)
  1268  	matchers, err := storepb.MatchersToPromMatchers(req.Matchers...)
  1269  	if err != nil {
  1270  		return status.Error(codes.InvalidArgument, err.Error())
  1271  	}
  1272  	req.MinTime = s.limitMinTime(req.MinTime)
  1273  	req.MaxTime = s.limitMaxTime(req.MaxTime)
  1275  	var (
  1276  		bytesLimiter     = s.bytesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("bytes"))
  1277  		ctx              = srv.Context()
  1278  		stats            = &queryStats{}
  1279  		respSets         []respSet
  1280  		mtx              sync.Mutex
  1281  		g, gctx          = errgroup.WithContext(ctx)
  1282  		resHints         = &hintspb.SeriesResponseHints{}
  1283  		reqBlockMatchers []*labels.Matcher
  1284  		chunksLimiter    = s.chunksLimiterFactory(s.metrics.queriesDropped.WithLabelValues("chunks"))
  1285  		seriesLimiter    = s.seriesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("series"))
  1287  		queryStatsEnabled = false
  1288  	)
  1290  	if req.Hints != nil {
  1291  		reqHints := &hintspb.SeriesRequestHints{}
  1292  		if err := types.UnmarshalAny(req.Hints, reqHints); err != nil {
  1293  			return status.Error(codes.InvalidArgument, errors.Wrap(err, "unmarshal series request hints").Error())
  1294  		}
  1295  		queryStatsEnabled = reqHints.EnableQueryStats
  1297  		reqBlockMatchers, err = storepb.MatchersToPromMatchers(reqHints.BlockMatchers...)
  1298  		if err != nil {
  1299  			return status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request hints labels matchers").Error())
  1300  		}
  1301  	}
  1303  	var extLsetToRemove map[string]struct{}
  1304  	if len(req.WithoutReplicaLabels) > 0 {
  1305  		extLsetToRemove = make(map[string]struct{})
  1306  		for _, l := range req.WithoutReplicaLabels {
  1307  			extLsetToRemove[l] = struct{}{}
  1308  		}
  1309  	}
  1311  	s.mtx.RLock()
  1312  	for _, bs := range s.blockSets {
  1313  		blockMatchers, ok := bs.labelMatchers(matchers...)
  1314  		if !ok {
  1315  			continue
  1316  		}
  1318  		sortedBlockMatchers := newSortedMatchers(blockMatchers)
  1320  		blocks := bs.getFor(req.MinTime, req.MaxTime, req.MaxResolutionWindow, reqBlockMatchers)
  1322  		if s.debugLogging {
  1323  			debugFoundBlockSetOverview(s.logger, req.MinTime, req.MaxTime, req.MaxResolutionWindow, bs.labels, blocks)
  1324  		}
  1326  		for _, b := range blocks {
  1327  			blk := b
  1328  			gctx := gctx
  1330  			if s.enableSeriesResponseHints {
  1331  				// Keep track of queried blocks.
  1332  				resHints.AddQueriedBlock(blk.meta.ULID)
  1333  			}
  1335  			shardMatcher := req.ShardInfo.Matcher(&s.buffers)
  1337  			blockClient := newBlockSeriesClient(
  1338  				srv.Context(),
  1339  				s.logger,
  1340  				blk,
  1341  				req,
  1342  				chunksLimiter,
  1343  				bytesLimiter,
  1344  				shardMatcher,
  1345  				s.enableChunkHashCalculation,
  1346  				s.seriesBatchSize,
  1347  				s.metrics.chunkFetchDuration,
  1348  				extLsetToRemove,
  1349  			)
  1351  			defer blockClient.Close()
  1353  			g.Go(func() error {
  1355  				span, _ := tracing.StartSpan(gctx, "bucket_store_block_series", tracing.Tags{
  1356  					"":         blk.meta.ULID,
  1357  					"":       blk.meta.MinTime,
  1358  					"block.maxt":       blk.meta.MaxTime,
  1359  					"block.resolution": blk.meta.Thanos.Downsample.Resolution,
  1360  				})
  1362  				onClose := func() {
  1363  					mtx.Lock()
  1364  					stats = blockClient.MergeStats(stats)
  1365  					mtx.Unlock()
  1366  				}
  1368  				if err := blockClient.ExpandPostings(sortedBlockMatchers, seriesLimiter); err != nil {
  1369  					onClose()
  1370  					span.Finish()
  1371  					return errors.Wrapf(err, "fetch postings for block %s", blk.meta.ULID)
  1372  				}
  1374  				resp := newEagerRespSet(
  1375  					srv.Context(),
  1376  					span,
  1377  					10*time.Minute,
  1378  					blk.meta.ULID.String(),
  1379  					[]labels.Labels{blk.extLset},
  1380  					onClose,
  1381  					blockClient,
  1382  					shardMatcher,
  1383  					false,
  1384  					s.metrics.emptyPostingCount,
  1385  					nil,
  1386  				)
  1388  				mtx.Lock()
  1389  				respSets = append(respSets, resp)
  1390  				mtx.Unlock()
  1392  				return nil
  1393  			})
  1394  		}
  1395  	}
  1397  	s.mtx.RUnlock()
  1399  	defer func() {
  1400  		s.metrics.seriesDataTouched.WithLabelValues("postings").Observe(float64(stats.postingsTouched))
  1401  		s.metrics.seriesDataFetched.WithLabelValues("postings").Observe(float64(stats.postingsFetched))
  1402  		s.metrics.seriesDataSizeTouched.WithLabelValues("postings").Observe(float64(stats.PostingsTouchedSizeSum))
  1403  		s.metrics.seriesDataSizeFetched.WithLabelValues("postings").Observe(float64(stats.PostingsFetchedSizeSum))
  1404  		s.metrics.seriesDataTouched.WithLabelValues("series").Observe(float64(stats.seriesTouched))
  1405  		s.metrics.seriesDataFetched.WithLabelValues("series").Observe(float64(stats.seriesFetched))
  1406  		s.metrics.seriesDataSizeTouched.WithLabelValues("series").Observe(float64(stats.SeriesTouchedSizeSum))
  1407  		s.metrics.seriesDataSizeFetched.WithLabelValues("series").Observe(float64(stats.SeriesFetchedSizeSum))
  1408  		s.metrics.seriesDataTouched.WithLabelValues("chunks").Observe(float64(stats.chunksTouched))
  1409  		s.metrics.seriesDataFetched.WithLabelValues("chunks").Observe(float64(stats.chunksFetched))
  1410  		s.metrics.seriesDataSizeTouched.WithLabelValues("chunks").Observe(float64(stats.ChunksTouchedSizeSum))
  1411  		s.metrics.seriesDataSizeFetched.WithLabelValues("chunks").Observe(float64(stats.ChunksFetchedSizeSum))
  1412  		s.metrics.resultSeriesCount.Observe(float64(stats.mergedSeriesCount))
  1413  		s.metrics.cachedPostingsCompressions.WithLabelValues(labelEncode).Add(float64(stats.cachedPostingsCompressions))
  1414  		s.metrics.cachedPostingsCompressions.WithLabelValues(labelDecode).Add(float64(stats.cachedPostingsDecompressions))
  1415  		s.metrics.cachedPostingsCompressionErrors.WithLabelValues(labelEncode).Add(float64(stats.cachedPostingsCompressionErrors))
  1416  		s.metrics.cachedPostingsCompressionErrors.WithLabelValues(labelDecode).Add(float64(stats.cachedPostingsDecompressionErrors))
  1417  		s.metrics.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelEncode).Add(stats.CachedPostingsCompressionTimeSum.Seconds())
  1418  		s.metrics.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelDecode).Add(stats.CachedPostingsDecompressionTimeSum.Seconds())
  1419  		s.metrics.cachedPostingsOriginalSizeBytes.Add(float64(stats.CachedPostingsOriginalSizeSum))
  1420  		s.metrics.cachedPostingsCompressedSizeBytes.Add(float64(stats.CachedPostingsCompressedSizeSum))
  1421  		s.metrics.postingsSizeBytes.Observe(float64(int(stats.PostingsFetchedSizeSum) + int(stats.PostingsTouchedSizeSum)))
  1423  		level.Debug(s.logger).Log("msg", "stats query processed",
  1424  			"request", req,
  1425  			"stats", fmt.Sprintf("%+v", stats), "err", err)
  1426  	}()
  1428  	// Concurrently get data from all blocks.
  1429  	{
  1430  		begin := time.Now()
  1431  		tracing.DoInSpan(ctx, "bucket_store_preload_all", func(_ context.Context) {
  1432  			err = g.Wait()
  1433  		})
  1434  		if err != nil {
  1435  			code := codes.Aborted
  1436  			if s, ok := status.FromError(errors.Cause(err)); ok {
  1437  				code = s.Code()
  1438  			}
  1439  			return status.Error(code, err.Error())
  1440  		}
  1441  		stats.blocksQueried = len(respSets)
  1442  		stats.GetAllDuration = time.Since(begin)
  1443  		s.metrics.seriesGetAllDuration.Observe(stats.GetAllDuration.Seconds())
  1444  		s.metrics.seriesBlocksQueried.Observe(float64(stats.blocksQueried))
  1445  	}
  1447  	// Merge the sub-results from each selected block.
  1448  	tracing.DoInSpan(ctx, "bucket_store_merge_all", func(ctx context.Context) {
  1449  		defer func() {
  1450  			for _, resp := range respSets {
  1451  				resp.Close()
  1452  			}
  1453  		}()
  1454  		begin := time.Now()
  1455  		set := NewDedupResponseHeap(NewProxyResponseHeap(respSets...))
  1456  		for set.Next() {
  1457  			at := set.At()
  1458  			warn := at.GetWarning()
  1459  			if warn != "" {
  1460  				// TODO(fpetkovski): Consider deprecating string based warnings in favor of a
  1461  				// separate protobuf message containing the grpc code and
  1462  				// a human readable error message.
  1463  				err = status.Error(storepb.GRPCCodeFromWarn(warn), at.GetWarning())
  1464  				return
  1465  			}
  1467  			series := at.GetSeries()
  1468  			if series != nil {
  1469  				stats.mergedSeriesCount++
  1470  				if !req.SkipChunks {
  1471  					stats.mergedChunksCount += len(series.Chunks)
  1472  					s.metrics.chunkSizeBytes.Observe(float64(chunksSize(series.Chunks)))
  1473  				}
  1474  			}
  1475  			if err = srv.Send(at); err != nil {
  1476  				err = status.Error(codes.Unknown, errors.Wrap(err, "send series response").Error())
  1477  				return
  1478  			}
  1479  		}
  1480  		stats.MergeDuration = time.Since(begin)
  1481  		s.metrics.seriesMergeDuration.Observe(stats.MergeDuration.Seconds())
  1483  		err = nil
  1484  	})
  1485  	if err != nil {
  1486  		return err
  1487  	}
  1489  	if s.enableSeriesResponseHints {
  1490  		var anyHints *types.Any
  1492  		if queryStatsEnabled {
  1493  			resHints.QueryStats = stats.toHints()
  1494  		}
  1495  		if anyHints, err = types.MarshalAny(resHints); err != nil {
  1496  			err = status.Error(codes.Unknown, errors.Wrap(err, "marshal series response hints").Error())
  1497  			return
  1498  		}
  1500  		if err = srv.Send(storepb.NewHintsSeriesResponse(anyHints)); err != nil {
  1501  			err = status.Error(codes.Unknown, errors.Wrap(err, "send series response hints").Error())
  1502  			return
  1503  		}
  1504  	}
  1506  	if err != nil {
  1507  		return err
  1508  	}
  1509  	return srv.Flush()
  1510  }
  1512  func chunksSize(chks []storepb.AggrChunk) (size int) {
  1513  	for _, chk := range chks {
  1514  		size += chk.Size() // This gets the encoded proto size.
  1515  	}
  1516  	return size
  1517  }
  1519  // LabelNames implements the storepb.StoreServer interface.
  1520  func (s *BucketStore) LabelNames(ctx context.Context, req *storepb.LabelNamesRequest) (*storepb.LabelNamesResponse, error) {
  1521  	reqSeriesMatchers, err := storepb.MatchersToPromMatchers(req.Matchers...)
  1522  	if err != nil {
  1523  		return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request labels matchers").Error())
  1524  	}
  1526  	tenant, _ := tenancy.GetTenantFromGRPCMetadata(ctx)
  1527  	level.Debug(s.logger).Log("msg", "Tenant for LabelNames request", "tenant", tenant)
  1529  	resHints := &hintspb.LabelNamesResponseHints{}
  1531  	var reqBlockMatchers []*labels.Matcher
  1532  	if req.Hints != nil {
  1533  		reqHints := &hintspb.LabelNamesRequestHints{}
  1534  		err := types.UnmarshalAny(req.Hints, reqHints)
  1535  		if err != nil {
  1536  			return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "unmarshal label names request hints").Error())
  1537  		}
  1539  		reqBlockMatchers, err = storepb.MatchersToPromMatchers(reqHints.BlockMatchers...)
  1540  		if err != nil {
  1541  			return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request hints labels matchers").Error())
  1542  		}
  1543  	}
  1545  	g, gctx := errgroup.WithContext(ctx)
  1547  	s.mtx.RLock()
  1549  	var mtx sync.Mutex
  1550  	var sets [][]string
  1551  	var seriesLimiter = s.seriesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("series"))
  1552  	var bytesLimiter = s.bytesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("bytes"))
  1554  	for _, b := range s.blocks {
  1555  		b := b
  1556  		gctx := gctx
  1558  		if !b.overlapsClosedInterval(req.Start, req.End) {
  1559  			continue
  1560  		}
  1561  		if len(reqBlockMatchers) > 0 && !b.matchRelabelLabels(reqBlockMatchers) {
  1562  			continue
  1563  		}
  1564  		// Filter external labels from matchers.
  1565  		reqSeriesMatchersNoExtLabels, ok := b.FilterExtLabelsMatchers(reqSeriesMatchers)
  1566  		if !ok {
  1567  			continue
  1568  		}
  1570  		sortedReqSeriesMatchersNoExtLabels := newSortedMatchers(reqSeriesMatchersNoExtLabels)
  1572  		resHints.AddQueriedBlock(b.meta.ULID)
  1574  		indexr := b.indexReader()
  1576  		g.Go(func() error {
  1577  			span, newCtx := tracing.StartSpan(gctx, "bucket_store_block_series", tracing.Tags{
  1578  				"":         b.meta.ULID,
  1579  				"":       b.meta.MinTime,
  1580  				"block.maxt":       b.meta.MaxTime,
  1581  				"block.resolution": b.meta.Thanos.Downsample.Resolution,
  1582  			})
  1583  			defer span.Finish()
  1584  			defer runutil.CloseWithLogOnErr(s.logger, indexr, "label names")
  1586  			var result []string
  1587  			if len(reqSeriesMatchersNoExtLabels) == 0 {
  1588  				// Do it via index reader to have pending reader registered correctly.
  1589  				// LabelNames are already sorted.
  1590  				res, err := indexr.block.indexHeaderReader.LabelNames()
  1591  				if err != nil {
  1592  					return errors.Wrapf(err, "label names for block %s", b.meta.ULID)
  1593  				}
  1595  				// Add  a set for the external labels as well.
  1596  				// We're not adding them directly to refs because there could be duplicates.
  1597  				// b.extLset is already sorted by label name, no need to sort it again.
  1598  				extRes := make([]string, 0, len(b.extLset))
  1599  				for _, l := range b.extLset {
  1600  					extRes = append(extRes, l.Name)
  1601  				}
  1603  				result = strutil.MergeSlices(res, extRes)
  1604  			} else {
  1605  				seriesReq := &storepb.SeriesRequest{
  1606  					MinTime:    req.Start,
  1607  					MaxTime:    req.End,
  1608  					SkipChunks: true,
  1609  				}
  1610  				blockClient := newBlockSeriesClient(
  1611  					newCtx,
  1612  					s.logger,
  1613  					b,
  1614  					seriesReq,
  1615  					nil,
  1616  					bytesLimiter,
  1617  					nil,
  1618  					true,
  1619  					SeriesBatchSize,
  1620  					s.metrics.chunkFetchDuration,
  1621  					nil,
  1622  				)
  1623  				defer blockClient.Close()
  1625  				if err := blockClient.ExpandPostings(
  1626  					sortedReqSeriesMatchersNoExtLabels,
  1627  					seriesLimiter,
  1628  				); err != nil {
  1629  					return err
  1630  				}
  1632  				// Extract label names from all series. Many label names will be the same, so we need to deduplicate them.
  1633  				// Note that label names will already include external labels (passed to blockSeries), so we don't need
  1634  				// to add them again.
  1635  				labelNames := map[string]struct{}{}
  1636  				for {
  1637  					ls, err := blockClient.Recv()
  1638  					if err == io.EOF {
  1639  						break
  1640  					}
  1641  					if err != nil {
  1642  						return errors.Wrapf(err, "iterate series for block %s", b.meta.ULID)
  1643  					}
  1645  					if ls.GetWarning() != "" {
  1646  						return errors.Wrapf(errors.New(ls.GetWarning()), "iterate series for block %s", b.meta.ULID)
  1647  					}
  1648  					if ls.GetSeries() == nil {
  1649  						continue
  1650  					}
  1651  					for _, l := range ls.GetSeries().Labels {
  1652  						labelNames[l.Name] = struct{}{}
  1653  					}
  1654  				}
  1656  				result = make([]string, 0, len(labelNames))
  1657  				for n := range labelNames {
  1658  					result = append(result, n)
  1659  				}
  1660  				sort.Strings(result)
  1661  			}
  1663  			if len(result) > 0 {
  1664  				mtx.Lock()
  1665  				sets = append(sets, result)
  1666  				mtx.Unlock()
  1667  			}
  1669  			return nil
  1670  		})
  1671  	}
  1673  	s.mtx.RUnlock()
  1675  	if err := g.Wait(); err != nil {
  1676  		code := codes.Internal
  1677  		if s, ok := status.FromError(errors.Cause(err)); ok {
  1678  			code = s.Code()
  1679  		}
  1680  		return nil, status.Error(code, err.Error())
  1681  	}
  1683  	anyHints, err := types.MarshalAny(resHints)
  1684  	if err != nil {
  1685  		return nil, status.Error(codes.Unknown, errors.Wrap(err, "marshal label names response hints").Error())
  1686  	}
  1688  	return &storepb.LabelNamesResponse{
  1689  		Names: strutil.MergeSlices(sets...),
  1690  		Hints: anyHints,
  1691  	}, nil
  1692  }
  1694  func (b *bucketBlock) FilterExtLabelsMatchers(matchers []*labels.Matcher) ([]*labels.Matcher, bool) {
  1695  	// We filter external labels from matchers so we won't try to match series on them.
  1696  	var result []*labels.Matcher
  1697  	for _, m := range matchers {
  1698  		// Get value of external label from block.
  1699  		v := b.extLset.Get(m.Name)
  1700  		// If value is empty string the matcher is a valid one since it's not part of external labels.
  1701  		if v == "" {
  1702  			result = append(result, m)
  1703  		} else if v != "" && v != m.Value {
  1704  			// If matcher is external label but value is different we don't want to look in block anyway.
  1705  			return []*labels.Matcher{}, false
  1706  		}
  1707  	}
  1709  	return result, true
  1710  }
  1712  // LabelValues implements the storepb.StoreServer interface.
  1713  func (s *BucketStore) LabelValues(ctx context.Context, req *storepb.LabelValuesRequest) (*storepb.LabelValuesResponse, error) {
  1714  	reqSeriesMatchers, err := storepb.MatchersToPromMatchers(req.Matchers...)
  1715  	if err != nil {
  1716  		return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request labels matchers").Error())
  1717  	}
  1719  	tenant, _ := tenancy.GetTenantFromGRPCMetadata(ctx)
  1720  	level.Debug(s.logger).Log("msg", "Tenant for LabelValues request", "tenant", tenant)
  1722  	resHints := &hintspb.LabelValuesResponseHints{}
  1724  	g, gctx := errgroup.WithContext(ctx)
  1726  	var reqBlockMatchers []*labels.Matcher
  1727  	if req.Hints != nil {
  1728  		reqHints := &hintspb.LabelValuesRequestHints{}
  1729  		err := types.UnmarshalAny(req.Hints, reqHints)
  1730  		if err != nil {
  1731  			return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "unmarshal label values request hints").Error())
  1732  		}
  1734  		reqBlockMatchers, err = storepb.MatchersToPromMatchers(reqHints.BlockMatchers...)
  1735  		if err != nil {
  1736  			return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request hints labels matchers").Error())
  1737  		}
  1738  	}
  1740  	s.mtx.RLock()
  1742  	var mtx sync.Mutex
  1743  	var sets [][]string
  1744  	var seriesLimiter = s.seriesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("series"))
  1745  	var bytesLimiter = s.bytesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("bytes"))
  1747  	for _, b := range s.blocks {
  1748  		b := b
  1750  		if !b.overlapsClosedInterval(req.Start, req.End) {
  1751  			continue
  1752  		}
  1753  		if len(reqBlockMatchers) > 0 && !b.matchRelabelLabels(reqBlockMatchers) {
  1754  			continue
  1755  		}
  1756  		// Filter external labels from matchers.
  1757  		reqSeriesMatchersNoExtLabels, ok := b.FilterExtLabelsMatchers(reqSeriesMatchers)
  1758  		if !ok {
  1759  			continue
  1760  		}
  1762  		// If we have series matchers, add <labelName> != "" matcher, to only select series that have given label name.
  1763  		if len(reqSeriesMatchersNoExtLabels) > 0 {
  1764  			m, err := labels.NewMatcher(labels.MatchNotEqual, req.Label, "")
  1765  			if err != nil {
  1766  				return nil, status.Error(codes.InvalidArgument, err.Error())
  1767  			}
  1769  			reqSeriesMatchersNoExtLabels = append(reqSeriesMatchersNoExtLabels, m)
  1770  		}
  1772  		sortedReqSeriesMatchersNoExtLabels := newSortedMatchers(reqSeriesMatchersNoExtLabels)
  1774  		resHints.AddQueriedBlock(b.meta.ULID)
  1776  		indexr := b.indexReader()
  1777  		g.Go(func() error {
  1778  			span, newCtx := tracing.StartSpan(gctx, "bucket_store_block_series", tracing.Tags{
  1779  				"":         b.meta.ULID,
  1780  				"":       b.meta.MinTime,
  1781  				"block.maxt":       b.meta.MaxTime,
  1782  				"block.resolution": b.meta.Thanos.Downsample.Resolution,
  1783  			})
  1784  			defer span.Finish()
  1785  			defer runutil.CloseWithLogOnErr(s.logger, indexr, "label values")
  1787  			var result []string
  1788  			if len(reqSeriesMatchersNoExtLabels) == 0 {
  1789  				// Do it via index reader to have pending reader registered correctly.
  1790  				res, err := indexr.block.indexHeaderReader.LabelValues(req.Label)
  1791  				if err != nil {
  1792  					return errors.Wrapf(err, "index header label values for block %s", b.meta.ULID)
  1793  				}
  1795  				// Add the external label value as well.
  1796  				if extLabelValue := b.extLset.Get(req.Label); extLabelValue != "" {
  1797  					res = strutil.MergeSlices(res, []string{extLabelValue})
  1798  				}
  1799  				result = res
  1800  			} else {
  1801  				seriesReq := &storepb.SeriesRequest{
  1802  					MinTime:    req.Start,
  1803  					MaxTime:    req.End,
  1804  					SkipChunks: true,
  1805  				}
  1806  				blockClient := newBlockSeriesClient(
  1807  					newCtx,
  1808  					s.logger,
  1809  					b,
  1810  					seriesReq,
  1811  					nil,
  1812  					bytesLimiter,
  1813  					nil,
  1814  					true,
  1815  					SeriesBatchSize,
  1816  					s.metrics.chunkFetchDuration,
  1817  					nil,
  1818  				)
  1819  				defer blockClient.Close()
  1821  				if err := blockClient.ExpandPostings(
  1822  					sortedReqSeriesMatchersNoExtLabels,
  1823  					seriesLimiter,
  1824  				); err != nil {
  1825  					return err
  1826  				}
  1828  				// Extract given label's value from all series and deduplicate them.
  1829  				// We don't need to deal with external labels, since they are already added by blockSeries.
  1830  				values := map[string]struct{}{}
  1831  				for {
  1832  					ls, err := blockClient.Recv()
  1833  					if err == io.EOF {
  1834  						break
  1835  					}
  1836  					if err != nil {
  1837  						return errors.Wrapf(err, "iterate series for block %s", b.meta.ULID)
  1838  					}
  1840  					if ls.GetWarning() != "" {
  1841  						return errors.Wrapf(errors.New(ls.GetWarning()), "iterate series for block %s", b.meta.ULID)
  1842  					}
  1843  					if ls.GetSeries() == nil {
  1844  						continue
  1845  					}
  1847  					val := labelpb.ZLabelsToPromLabels(ls.GetSeries().Labels).Get(req.Label)
  1848  					if val != "" { // Should never be empty since we added labelName!="" matcher to the list of matchers.
  1849  						values[val] = struct{}{}
  1850  					}
  1851  				}
  1853  				result = make([]string, 0, len(values))
  1854  				for n := range values {
  1855  					result = append(result, n)
  1856  				}
  1857  				sort.Strings(result)
  1858  			}
  1860  			if len(result) > 0 {
  1861  				mtx.Lock()
  1862  				sets = append(sets, result)
  1863  				mtx.Unlock()
  1864  			}
  1866  			return nil
  1867  		})
  1868  	}
  1870  	s.mtx.RUnlock()
  1872  	if err := g.Wait(); err != nil {
  1873  		code := codes.Internal
  1874  		if s, ok := status.FromError(errors.Cause(err)); ok {
  1875  			code = s.Code()
  1876  		}
  1877  		return nil, status.Error(code, err.Error())
  1878  	}
  1880  	anyHints, err := types.MarshalAny(resHints)
  1881  	if err != nil {
  1882  		return nil, status.Error(codes.Unknown, errors.Wrap(err, "marshal label values response hints").Error())
  1883  	}
  1885  	return &storepb.LabelValuesResponse{
  1886  		Values: strutil.MergeSlices(sets...),
  1887  		Hints:  anyHints,
  1888  	}, nil
  1889  }
// bucketBlockSet holds all blocks of an equal label set. It internally splits
// them up by downsampling resolution and allows querying.
//
// blocks is indexed in parallel with resolutions: blocks[i] holds the blocks whose
// downsampling resolution equals resolutions[i], kept sorted by min time and then max time
// (see add). mtx is taken for writing by add and remove and for reading by getFor.
type bucketBlockSet struct {
	labels      labels.Labels
	mtx         sync.RWMutex
	resolutions []int64          // Available resolution, high to low (in milliseconds).
	blocks      [][]*bucketBlock // Ordered buckets for the existing resolutions.
}
  1900  // newBucketBlockSet initializes a new set with the known downsampling windows hard-configured.
  1901  // The set currently does not support arbitrary ranges.
  1902  func newBucketBlockSet(lset labels.Labels) *bucketBlockSet {
  1903  	return &bucketBlockSet{
  1904  		labels:      lset,
  1905  		resolutions: []int64{downsample.ResLevel2, downsample.ResLevel1, downsample.ResLevel0},
  1906  		blocks:      make([][]*bucketBlock, 3),
  1907  	}
  1908  }
  1910  func (s *bucketBlockSet) add(b *bucketBlock) error {
  1911  	if !labels.Equal(s.labels, labels.FromMap(b.meta.Thanos.Labels)) {
  1912  		return errors.New("block's label set does not match set")
  1913  	}
  1914  	s.mtx.Lock()
  1915  	defer s.mtx.Unlock()
  1917  	i := int64index(s.resolutions, b.meta.Thanos.Downsample.Resolution)
  1918  	if i < 0 {
  1919  		return errors.Errorf("unsupported downsampling resolution %d", b.meta.Thanos.Downsample.Resolution)
  1920  	}
  1921  	bs := append(s.blocks[i], b)
  1922  	s.blocks[i] = bs
  1924  	// Always sort blocks by min time, then max time.
  1925  	sort.Slice(bs, func(j, k int) bool {
  1926  		if bs[j].meta.MinTime == bs[k].meta.MinTime {
  1927  			return bs[j].meta.MaxTime < bs[k].meta.MaxTime
  1928  		}
  1929  		return bs[j].meta.MinTime < bs[k].meta.MinTime
  1930  	})
  1931  	return nil
  1932  }
  1934  func (s *bucketBlockSet) remove(id ulid.ULID) {
  1935  	s.mtx.Lock()
  1936  	defer s.mtx.Unlock()
  1938  	for i, bs := range s.blocks {
  1939  		for j, b := range bs {
  1940  			if b.meta.ULID != id {
  1941  				continue
  1942  			}
  1943  			s.blocks[i] = append(bs[:j], bs[j+1:]...)
  1944  			return
  1945  		}
  1946  	}
  1947  }
  1949  func int64index(s []int64, x int64) int {
  1950  	for i, v := range s {
  1951  		if v == x {
  1952  			return i
  1953  		}
  1954  	}
  1955  	return -1
  1956  }
  1958  // getFor returns a time-ordered list of blocks that cover date between mint and maxt.
  1959  // Blocks with the biggest resolution possible but not bigger than the given max resolution are returned.
  1960  // It supports overlapping blocks.
  1961  //
  1962  // NOTE: s.blocks are expected to be sorted in minTime order.
  1963  func (s *bucketBlockSet) getFor(mint, maxt, maxResolutionMillis int64, blockMatchers []*labels.Matcher) (bs []*bucketBlock) {
  1964  	if mint > maxt {
  1965  		return nil
  1966  	}
  1968  	s.mtx.RLock()
  1969  	defer s.mtx.RUnlock()
  1971  	// Find first matching resolution.
  1972  	i := 0
  1973  	for ; i < len(s.resolutions) && s.resolutions[i] > maxResolutionMillis; i++ {
  1974  	}
  1976  	// Fill the given interval with the blocks for the current resolution.
  1977  	// Our current resolution might not cover all data, so recursively fill the gaps with higher resolution blocks
  1978  	// if there is any.
  1979  	start := mint
  1980  	for _, b := range s.blocks[i] {
  1981  		if b.meta.MaxTime <= mint {
  1982  			continue
  1983  		}
  1984  		// NOTE: Block intervals are half-open: [b.MinTime, b.MaxTime).
  1985  		if b.meta.MinTime > maxt {
  1986  			break
  1987  		}
  1989  		if i+1 < len(s.resolutions) {
  1990  			bs = append(bs, s.getFor(start, b.meta.MinTime-1, s.resolutions[i+1], blockMatchers)...)
  1991  		}
  1993  		// Include the block in the list of matching ones only if there are no block-level matchers
  1994  		// or they actually match.
  1995  		if len(blockMatchers) == 0 || b.matchRelabelLabels(blockMatchers) {
  1996  			bs = append(bs, b)
  1997  		}
  1999  		start = b.meta.MaxTime
  2000  	}
  2002  	if i+1 < len(s.resolutions) {
  2003  		bs = append(bs, s.getFor(start, maxt, s.resolutions[i+1], blockMatchers)...)
  2004  	}
  2005  	return bs
  2006  }
  2008  // labelMatchers verifies whether the block set matches the given matchers and returns a new
  2009  // set of matchers that is equivalent when querying data within the block.
  2010  func (s *bucketBlockSet) labelMatchers(matchers ...*labels.Matcher) ([]*labels.Matcher, bool) {
  2011  	res := make([]*labels.Matcher, 0, len(matchers))
  2013  	for _, m := range matchers {
  2014  		v := s.labels.Get(m.Name)
  2015  		if v == "" {
  2016  			res = append(res, m)
  2017  			continue
  2018  		}
  2019  		if !m.Matches(v) {
  2020  			return nil, false
  2021  		}
  2022  	}
  2023  	return res, true
  2024  }
// bucketBlock represents a block that is located in a bucket. It holds intermediate
// state for the block on local disk.
//
// Instances are created with newBucketBlock. extLset is the sorted label set built from
// meta.Thanos.Labels.
type bucketBlock struct {
	logger     log.Logger
	metrics    *bucketStoreMetrics
	bkt        objstore.BucketReader
	meta       *metadata.Meta
	dir        string
	indexCache storecache.IndexCache
	chunkPool  pool.Bytes
	extLset    labels.Labels

	indexHeaderReader indexheader.Reader

	// Object names of the block's chunk segment files. readChunkRange and chunkRangeReader
	// index this slice by segment sequence number.
	chunkObjs []string

	// Incremented by indexReader and chunkReader; Close waits for it to drain.
	pendingReaders sync.WaitGroup

	partitioner Partitioner

	// Block's labels used by block-level matchers to filter blocks to query. These are used to select blocks using
	// request hints' BlockMatchers.
	relabelLabels labels.Labels

	// Size estimates set by newBucketBlock, either from the supplied estimator functions or
	// from the EstimatedMaxChunkSize / EstimatedMaxSeriesSize defaults.
	estimatedMaxChunkSize  int
	estimatedMaxSeriesSize int
}
  2054  func newBucketBlock(
  2055  	ctx context.Context,
  2056  	logger log.Logger,
  2057  	metrics *bucketStoreMetrics,
  2058  	meta *metadata.Meta,
  2059  	bkt objstore.BucketReader,
  2060  	dir string,
  2061  	indexCache storecache.IndexCache,
  2062  	chunkPool pool.Bytes,
  2063  	indexHeadReader indexheader.Reader,
  2064  	p Partitioner,
  2065  	maxSeriesSizeFunc BlockEstimator,
  2066  	maxChunkSizeFunc BlockEstimator,
  2067  ) (b *bucketBlock, err error) {
  2068  	maxSeriesSize := EstimatedMaxSeriesSize
  2069  	if maxSeriesSizeFunc != nil {
  2070  		maxSeriesSize = int(maxSeriesSizeFunc(*meta))
  2071  	}
  2072  	maxChunkSize := EstimatedMaxChunkSize
  2073  	if maxChunkSizeFunc != nil {
  2074  		maxChunkSize = int(maxChunkSizeFunc(*meta))
  2075  	}
  2076  	b = &bucketBlock{
  2077  		logger:            logger,
  2078  		metrics:           metrics,
  2079  		bkt:               bkt,
  2080  		indexCache:        indexCache,
  2081  		chunkPool:         chunkPool,
  2082  		dir:               dir,
  2083  		partitioner:       p,
  2084  		meta:              meta,
  2085  		indexHeaderReader: indexHeadReader,
  2086  		extLset:           labels.FromMap(meta.Thanos.Labels),
  2087  		// Translate the block's labels and inject the block ID as a label
  2088  		// to allow to match blocks also by ID.
  2089  		relabelLabels: append(labels.FromMap(meta.Thanos.Labels), labels.Label{
  2090  			Name:  block.BlockIDLabel,
  2091  			Value: meta.ULID.String(),
  2092  		}),
  2093  		estimatedMaxSeriesSize: maxSeriesSize,
  2094  		estimatedMaxChunkSize:  maxChunkSize,
  2095  	}
  2096  	sort.Sort(b.extLset)
  2097  	sort.Sort(b.relabelLabels)
  2099  	// Get object handles for all chunk files (segment files) from meta.json, if available.
  2100  	if len(meta.Thanos.SegmentFiles) > 0 {
  2101  		b.chunkObjs = make([]string, 0, len(meta.Thanos.SegmentFiles))
  2103  		for _, sf := range meta.Thanos.SegmentFiles {
  2104  			b.chunkObjs = append(b.chunkObjs, path.Join(meta.ULID.String(), block.ChunksDirname, sf))
  2105  		}
  2106  		return b, nil
  2107  	}
  2109  	// Get object handles for all chunk files from storage.
  2110  	if err = bkt.Iter(ctx, path.Join(meta.ULID.String(), block.ChunksDirname), func(n string) error {
  2111  		b.chunkObjs = append(b.chunkObjs, n)
  2112  		return nil
  2113  	}); err != nil {
  2114  		return nil, errors.Wrap(err, "list chunk files")
  2115  	}
  2116  	return b, nil
  2117  }
  2119  func (b *bucketBlock) indexFilename() string {
  2120  	return path.Join(b.meta.ULID.String(), block.IndexFilename)
  2121  }
  2123  func (b *bucketBlock) readIndexRange(ctx context.Context, off, length int64) ([]byte, error) {
  2124  	r, err := b.bkt.GetRange(ctx, b.indexFilename(), off, length)
  2125  	if err != nil {
  2126  		return nil, errors.Wrap(err, "get range reader")
  2127  	}
  2128  	defer runutil.CloseWithLogOnErr(b.logger, r, "readIndexRange close range reader")
  2130  	// Preallocate the buffer with the exact size so we don't waste allocations
  2131  	// while progressively growing an initial small buffer. The buffer capacity
  2132  	// is increased by MinRead to avoid extra allocations due to how ReadFrom()
  2133  	// internally works.
  2134  	buf := bytes.NewBuffer(make([]byte, 0, length+bytes.MinRead))
  2135  	if _, err := buf.ReadFrom(r); err != nil {
  2136  		return nil, errors.Wrap(err, "read range")
  2137  	}
  2138  	return buf.Bytes(), nil
  2139  }
  2141  func (b *bucketBlock) readChunkRange(ctx context.Context, seq int, off, length int64, chunkRanges byteRanges) (*[]byte, error) {
  2142  	if seq < 0 || seq >= len(b.chunkObjs) {
  2143  		return nil, errors.Errorf("unknown segment file for index %d", seq)
  2144  	}
  2146  	// Get a reader for the required range.
  2147  	reader, err := b.bkt.GetRange(ctx, b.chunkObjs[seq], off, length)
  2148  	if err != nil {
  2149  		return nil, errors.Wrap(err, "get range reader")
  2150  	}
  2151  	defer runutil.CloseWithLogOnErr(b.logger, reader, "readChunkRange close range reader")
  2153  	// Get a buffer from the pool.
  2154  	chunkBuffer, err := b.chunkPool.Get(chunkRanges.size())
  2155  	if err != nil {
  2156  		return nil, errors.Wrap(err, "allocate chunk bytes")
  2157  	}
  2159  	*chunkBuffer, err = readByteRanges(reader, *chunkBuffer, chunkRanges)
  2160  	if err != nil {
  2161  		return nil, err
  2162  	}
  2164  	return chunkBuffer, nil
  2165  }
  2167  func (b *bucketBlock) chunkRangeReader(ctx context.Context, seq int, off, length int64) (io.ReadCloser, error) {
  2168  	if seq < 0 || seq >= len(b.chunkObjs) {
  2169  		return nil, errors.Errorf("unknown segment file for index %d", seq)
  2170  	}
  2172  	return b.bkt.GetRange(ctx, b.chunkObjs[seq], off, length)
  2173  }
  2175  func (b *bucketBlock) indexReader() *bucketIndexReader {
  2176  	b.pendingReaders.Add(1)
  2177  	return newBucketIndexReader(b)
  2178  }
  2180  func (b *bucketBlock) chunkReader() *bucketChunkReader {
  2181  	b.pendingReaders.Add(1)
  2182  	return newBucketChunkReader(b)
  2183  }
  2185  // matchRelabelLabels verifies whether the block matches the given matchers.
  2186  func (b *bucketBlock) matchRelabelLabels(matchers []*labels.Matcher) bool {
  2187  	for _, m := range matchers {
  2188  		if !m.Matches(b.relabelLabels.Get(m.Name)) {
  2189  			return false
  2190  		}
  2191  	}
  2192  	return true
  2193  }
  2195  // overlapsClosedInterval returns true if the block overlaps [mint, maxt).
  2196  func (b *bucketBlock) overlapsClosedInterval(mint, maxt int64) bool {
  2197  	// The block itself is a half-open interval
  2198  	// [b.meta.MinTime, b.meta.MaxTime).
  2199  	return b.meta.MinTime <= maxt && mint < b.meta.MaxTime
  2200  }
  2202  // Close waits for all pending readers to finish and then closes all underlying resources.
  2203  func (b *bucketBlock) Close() error {
  2204  	b.pendingReaders.Wait()
  2205  	return b.indexHeaderReader.Close()
  2206  }
// bucketIndexReader is a custom index reader (not conforming index.Reader interface) that reads index that is stored in
// object storage without having to fully download it.
//
// dec resolves symbols through the block's index header reader (see newBucketIndexReader).
// loadedSeries maps series references to raw bytes and is emptied by reset.
// NOTE(review): mtx presumably guards loadedSeries; confirm against the methods that use it.
type bucketIndexReader struct {
	block *bucketBlock
	dec   *index.Decoder
	stats *queryStats

	mtx          sync.Mutex
	loadedSeries map[storage.SeriesRef][]byte
}
  2219  func newBucketIndexReader(block *bucketBlock) *bucketIndexReader {
  2220  	r := &bucketIndexReader{
  2221  		block: block,
  2222  		dec: &index.Decoder{
  2223  			LookupSymbol: block.indexHeaderReader.LookupSymbol,
  2224  		},
  2225  		stats:        &queryStats{},
  2226  		loadedSeries: map[storage.SeriesRef][]byte{},
  2227  	}
  2228  	return r
  2229  }
  2230  func (r *bucketIndexReader) reset() {
  2231  	r.loadedSeries = map[storage.SeriesRef][]byte{}
  2232  }
  2234  // ExpandedPostings returns postings in expanded list instead of index.Postings.
  2235  // This is because we need to have them buffered anyway to perform efficient lookup
  2236  // on object storage.
  2237  // Found posting IDs (ps) are not strictly required to point to a valid Series, e.g. during
  2238  // background garbage collections.
  2239  //
  2240  // Reminder: A posting is a reference (represented as a uint64) to a series reference, which in turn points to the first
  2241  // chunk where the series contains the matching label-value pair for a given block of data. Postings can be fetched by
  2242  // single label name=value.
  2243  func (r *bucketIndexReader) ExpandedPostings(ctx context.Context, ms sortedMatchers, bytesLimiter BytesLimiter) ([]storage.SeriesRef, error) {
  2244  	// Shortcut the case of `len(postingGroups) == 0`. It will only happen when no
  2245  	// matchers specified, and we don't need to fetch expanded postings from cache.
  2246  	if len(ms) == 0 {
  2247  		return nil, nil
  2248  	}
  2250  	hit, postings, err := r.fetchExpandedPostingsFromCache(ctx, ms, bytesLimiter)
  2251  	if err != nil {
  2252  		return nil, err
  2253  	}
  2254  	if hit {
  2255  		return postings, nil
  2256  	}
  2257  	var (
  2258  		allRequested = false
  2259  		hasAdds      = false
  2260  		keys         []labels.Label
  2261  	)
  2263  	postingGroups, err := matchersToPostingGroups(ctx, r.block.indexHeaderReader.LabelValues, ms)
  2264  	if err != nil {
  2265  		return nil, errors.Wrap(err, "matchersToPostingGroups")
  2266  	}
  2267  	if postingGroups == nil {
  2268  		r.storeExpandedPostingsToCache(ms, index.EmptyPostings(), 0)
  2269  		return nil, nil
  2270  	}
  2271  	for _, pg := range postingGroups {
  2272  		allRequested = allRequested || pg.addAll
  2273  		hasAdds = hasAdds || len(pg.addKeys) > 0
  2275  		// Postings returned by fetchPostings will be in the same order as keys
  2276  		// so it's important that we iterate them in the same order later.
  2277  		// We don't have any other way of pairing keys and fetched postings.
  2278  		for _, key := range pg.addKeys {
  2279  			keys = append(keys, labels.Label{Name:, Value: key})
  2280  		}
  2281  		for _, key := range pg.removeKeys {
  2282  			keys = append(keys, labels.Label{Name:, Value: key})
  2283  		}
  2284  	}
  2286  	// We only need special All postings if there are no other adds. If there are, we can skip fetching
  2287  	// special All postings completely.
  2288  	if allRequested && !hasAdds {
  2289  		// add group with label to fetch "special All postings".
  2290  		name, value := index.AllPostingsKey()
  2291  		allPostingsLabel := labels.Label{Name: name, Value: value}
  2293  		postingGroups = append(postingGroups, newPostingGroup(true, name, []string{value}, nil))
  2294  		keys = append(keys, allPostingsLabel)
  2295  	}
  2297  	fetchedPostings, closeFns, err := r.fetchPostings(ctx, keys, bytesLimiter)
  2298  	defer func() {
  2299  		for _, closeFn := range closeFns {
  2300  			closeFn()
  2301  		}
  2302  	}()
  2303  	if err != nil {
  2304  		return nil, errors.Wrap(err, "get postings")
  2305  	}
  2307  	// Get "add" and "remove" postings from groups. We iterate over postingGroups and their keys
  2308  	// again, and this is exactly the same order as before (when building the groups), so we can simply
  2309  	// use one incrementing index to fetch postings from returned slice.
  2310  	postingIndex := 0
  2312  	var groupAdds, groupRemovals []index.Postings
  2313  	for _, g := range postingGroups {
  2314  		// We cannot add empty set to groupAdds, since they are intersected.
  2315  		if len(g.addKeys) > 0 {
  2316  			toMerge := make([]index.Postings, 0, len(g.addKeys))
  2317  			for _, l := range g.addKeys {
  2318  				toMerge = append(toMerge, checkNilPosting(, l, fetchedPostings[postingIndex]))
  2319  				postingIndex++
  2320  			}
  2322  			groupAdds = append(groupAdds, index.Merge(toMerge...))
  2323  		}
  2325  		for _, l := range g.removeKeys {
  2326  			groupRemovals = append(groupRemovals, checkNilPosting(, l, fetchedPostings[postingIndex]))
  2327  			postingIndex++
  2328  		}
  2329  	}
  2331  	result := index.Without(index.Intersect(groupAdds...), index.Merge(groupRemovals...))
  2332  	ps, err := ExpandPostingsWithContext(ctx, result)
  2333  	if err != nil {
  2334  		return nil, errors.Wrap(err, "expand")
  2335  	}
  2336  	r.storeExpandedPostingsToCache(ms, index.NewListPostings(ps), len(ps))
  2338  	if len(ps) > 0 {
  2339  		// As of version two all series entries are 16 byte padded. All references
  2340  		// we get have to account for that to get the correct offset.
  2341  		version, err := r.block.indexHeaderReader.IndexVersion()
  2342  		if err != nil {
  2343  			return nil, errors.Wrap(err, "get index version")
  2344  		}
  2345  		if version >= 2 {
  2346  			for i, id := range ps {
  2347  				ps[i] = id * 16
  2348  			}
  2349  		}
  2350  	}
  2351  	return ps, nil
  2352  }
  2354  // ExpandPostingsWithContext returns the postings expanded as a slice and considers context.
  2355  func ExpandPostingsWithContext(ctx context.Context, p index.Postings) (res []storage.SeriesRef, err error) {
  2356  	for p.Next() {
  2357  		if ctx.Err() != nil {
  2358  			return nil, ctx.Err()
  2359  		}
  2360  		res = append(res, p.At())
  2361  	}
  2362  	return res, p.Err()
  2363  }
  2365  // postingGroup keeps posting keys for one or more matchers with the same label name. Logical result of the group is:
  2366  // If addAll is set: special All postings minus postings for removeKeys labels. No need to merge postings for addKeys in this case.
  2367  // If addAll is not set: Merge of postings for "addKeys" labels minus postings for removeKeys labels
  2368  // This computation happens in ExpandedPostings.
  2369  type postingGroup struct {
  2370  	addAll     bool
  2371  	name       string
  2372  	addKeys    []string
  2373  	removeKeys []string
  2374  }
  2376  func newPostingGroup(addAll bool, name string, addKeys, removeKeys []string) *postingGroup {
  2377  	return &postingGroup{
  2378  		addAll:     addAll,
  2379  		name:       name,
  2380  		addKeys:    addKeys,
  2381  		removeKeys: removeKeys,
  2382  	}
  2383  }
  2385  func (pg postingGroup) merge(other *postingGroup) *postingGroup {
  2386  	if other == nil {
  2387  		return &pg
  2388  	}
  2389  	// This shouldn't happen, but add this as a safeguard.
  2390  	if != {
  2391  		return nil
  2392  	}
  2393  	var i, j int
  2394  	// Both add all, merge remove keys.
  2395  	if pg.addAll && other.addAll {
  2396  		// Fast path to not allocate output slice if no remove keys are specified.
  2397  		// This is possible when matcher is `=~".*"`.
  2398  		if len(pg.removeKeys) == 0 {
  2399  			pg.removeKeys = other.removeKeys
  2400  			return &pg
  2401  		} else if len(other.removeKeys) == 0 {
  2402  			return &pg
  2403  		}
  2404  		output := make([]string, 0, len(pg.removeKeys)+len(other.removeKeys))
  2405  		for i < len(pg.removeKeys) && j < len(other.removeKeys) {
  2406  			if pg.removeKeys[i] < other.removeKeys[j] {
  2407  				output = append(output, pg.removeKeys[i])
  2408  				i++
  2409  			} else if pg.removeKeys[i] > other.removeKeys[j] {
  2410  				output = append(output, other.removeKeys[j])
  2411  				j++
  2412  			} else {
  2413  				output = append(output, pg.removeKeys[i])
  2414  				i++
  2415  				j++
  2416  			}
  2417  		}
  2418  		if i < len(pg.removeKeys) {
  2419  			output = append(output, pg.removeKeys[i:len(pg.removeKeys)]...)
  2420  		}
  2421  		if j < len(other.removeKeys) {
  2422  			output = append(output, other.removeKeys[j:len(other.removeKeys)]...)
  2423  		}
  2424  		pg.removeKeys = output
  2425  	} else if pg.addAll || other.addAll {
  2426  		// Subtract the remove keys.
  2427  		toRemove := other
  2428  		toAdd := &pg
  2429  		if pg.addAll {
  2430  			toRemove = &pg
  2431  			toAdd = other
  2432  		}
  2433  		var k int
  2434  		for i < len(toAdd.addKeys) && j < len(toRemove.removeKeys) {
  2435  			if toAdd.addKeys[i] < toRemove.removeKeys[j] {
  2436  				toAdd.addKeys[k] = toAdd.addKeys[i]
  2437  				k++
  2438  				i++
  2439  			} else if toAdd.addKeys[i] > toRemove.removeKeys[j] {
  2440  				j++
  2441  			} else {
  2442  				i++
  2443  				j++
  2444  			}
  2445  		}
  2446  		for i < len(toAdd.addKeys) {
  2447  			toAdd.addKeys[k] = toAdd.addKeys[i]
  2448  			i++
  2449  			k++
  2450  		}
  2451  		pg.addKeys = toAdd.addKeys[:k]
  2452  		pg.addAll = false
  2453  		pg.removeKeys = nil
  2454  	} else {
  2455  		addKeys := make([]string, 0, len(pg.addKeys)+len(other.addKeys))
  2456  		for i < len(pg.addKeys) && j < len(other.addKeys) {
  2457  			if pg.addKeys[i] == other.addKeys[j] {
  2458  				addKeys = append(addKeys, pg.addKeys[i])
  2459  				i++
  2460  				j++
  2461  			} else if pg.addKeys[i] < other.addKeys[j] {
  2462  				i++
  2463  			} else {
  2464  				j++
  2465  			}
  2466  		}
  2467  		pg.addKeys = addKeys
  2468  	}
  2469  	return &pg
  2470  }
  2472  func checkNilPosting(name, value string, p index.Postings) index.Postings {
  2473  	if p == nil {
  2474  		// This should not happen. Debug for
  2475  		return index.ErrPostings(errors.Errorf("postings is nil for {%s=%s}. It was never fetched.", name, value))
  2476  	}
  2477  	return p
  2478  }
  2480  func matchersToPostingGroups(ctx context.Context, lvalsFn func(name string) ([]string, error), ms []*labels.Matcher) ([]*postingGroup, error) {
  2481  	matchersMap := make(map[string][]*labels.Matcher)
  2482  	for _, m := range ms {
  2483  		matchersMap[m.Name] = append(matchersMap[m.Name], m)
  2484  	}
  2486  	pgs := make([]*postingGroup, 0)
  2487  	// NOTE: Derived from tsdb.PostingsForMatchers.
  2488  	for _, values := range matchersMap {
  2489  		var (
  2490  			mergedPG     *postingGroup
  2491  			pg           *postingGroup
  2492  			vals         []string
  2493  			err          error
  2494  			valuesCached bool
  2495  		)
  2496  		lvalsFunc := lvalsFn
  2497  		// Merge PostingGroups with the same matcher into 1 to
  2498  		//  avoid fetching duplicate postings.
  2499  		for _, val := range values {
  2500  			pg, vals, err = toPostingGroup(ctx, lvalsFunc, val)
  2501  			if err != nil {
  2502  				return nil, errors.Wrap(err, "toPostingGroup")
  2503  			}
  2504  			// Cache label values because label name is the same.
  2505  			if !valuesCached && vals != nil {
  2506  				lvalsFunc = func(_ string) ([]string, error) {
  2507  					return vals, nil
  2508  				}
  2509  				valuesCached = true
  2510  			}
  2512  			// If this groups adds nothing, it's an empty group. We can shortcut this, since intersection with empty
  2513  			// postings would return no postings anyway.
  2514  			// E.g. label="non-existing-value" returns empty group.
  2515  			if !pg.addAll && len(pg.addKeys) == 0 {
  2516  				return nil, nil
  2517  			}
  2518  			if mergedPG == nil {
  2519  				mergedPG = pg
  2520  			} else {
  2521  				mergedPG = mergedPG.merge(pg)
  2522  			}
  2524  			// If this groups adds nothing, it's an empty group. We can shortcut this, since intersection with empty
  2525  			// postings would return no postings anyway.
  2526  			// E.g. label="non-existing-value" returns empty group.
  2527  			if !mergedPG.addAll && len(mergedPG.addKeys) == 0 {
  2528  				return nil, nil
  2529  			}
  2530  		}
  2531  		pgs = append(pgs, mergedPG)
  2532  	}
  2533  	slices.SortFunc(pgs, func(a, b *postingGroup) bool {
  2534  		return <
  2535  	})
  2536  	return pgs, nil
  2537  }
  2539  // NOTE: Derived from tsdb.postingsForMatcher. index.Merge is equivalent to map duplication.
  2540  func toPostingGroup(ctx context.Context, lvalsFn func(name string) ([]string, error), m *labels.Matcher) (*postingGroup, []string, error) {
  2541  	// If the matcher selects an empty value, it selects all the series which don't
  2542  	// have the label name set too. See:
  2543  	// and
  2544  	if m.Matches("") {
  2545  		var toRemove []string
  2547  		// Fast-path for MatchNotRegexp matching.
  2548  		// Inverse of a MatchNotRegexp is MatchRegexp (double negation).
  2549  		// Fast-path for set matching.
  2550  		if m.Type == labels.MatchNotRegexp {
  2551  			if vals := findSetMatches(m.Value); len(vals) > 0 {
  2552  				sort.Strings(vals)
  2553  				return newPostingGroup(true, m.Name, nil, vals), nil, nil
  2554  			}
  2555  		}
  2557  		// Fast-path for MatchNotEqual matching.
  2558  		// Inverse of a MatchNotEqual is MatchEqual (double negation).
  2559  		if m.Type == labels.MatchNotEqual {
  2560  			return newPostingGroup(true, m.Name, nil, []string{m.Value}), nil, nil
  2561  		}
  2563  		vals, err := lvalsFn(m.Name)
  2564  		if err != nil {
  2565  			return nil, nil, err
  2566  		}
  2568  		for _, val := range vals {
  2569  			if ctx.Err() != nil {
  2570  				return nil, nil, ctx.Err()
  2571  			}
  2572  			if !m.Matches(val) {
  2573  				toRemove = append(toRemove, val)
  2574  			}
  2575  		}
  2577  		return newPostingGroup(true, m.Name, nil, toRemove), vals, nil
  2578  	}
  2579  	if m.Type == labels.MatchRegexp {
  2580  		if vals := findSetMatches(m.Value); len(vals) > 0 {
  2581  			sort.Strings(vals)
  2582  			return newPostingGroup(false, m.Name, vals, nil), nil, nil
  2583  		}
  2584  	}
  2586  	// Fast-path for equal matching.
  2587  	if m.Type == labels.MatchEqual {
  2588  		return newPostingGroup(false, m.Name, []string{m.Value}, nil), nil, nil
  2589  	}
  2591  	vals, err := lvalsFn(m.Name)
  2592  	if err != nil {
  2593  		return nil, nil, err
  2594  	}
  2596  	var toAdd []string
  2597  	for _, val := range vals {
  2598  		if ctx.Err() != nil {
  2599  			return nil, nil, ctx.Err()
  2600  		}
  2601  		if m.Matches(val) {
  2602  			toAdd = append(toAdd, val)
  2603  		}
  2604  	}
  2606  	return newPostingGroup(false, m.Name, toAdd, nil), vals, nil
  2607  }
  2609  type postingPtr struct {
  2610  	keyID int
  2611  	ptr   index.Range
  2612  }
  2614  func (r *bucketIndexReader) fetchExpandedPostingsFromCache(ctx context.Context, ms []*labels.Matcher, bytesLimiter BytesLimiter) (bool, []storage.SeriesRef, error) {
  2615  	dataFromCache, hit := r.block.indexCache.FetchExpandedPostings(ctx, r.block.meta.ULID, ms)
  2616  	if !hit {
  2617  		return false, nil, nil
  2618  	}
  2619  	if err := bytesLimiter.Reserve(uint64(len(dataFromCache))); err != nil {
  2620  		return false, nil, httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while loading expanded postings from index cache: %s", err)
  2621  	}
  2622  	r.stats.DataDownloadedSizeSum += units.Base2Bytes(len(dataFromCache))
  2623  	r.stats.postingsTouched++
  2624  	r.stats.PostingsTouchedSizeSum += units.Base2Bytes(len(dataFromCache))
  2625  	p, closeFns, err := r.decodeCachedPostings(dataFromCache)
  2626  	defer func() {
  2627  		for _, closeFn := range closeFns {
  2628  			closeFn()
  2629  		}
  2630  	}()
  2631  	// If failed to decode or expand cached postings, return and expand postings again.
  2632  	if err != nil {
  2633  		level.Error(r.block.logger).Log("msg", "failed to decode cached expanded postings, refetch postings", "id", r.block.meta.ULID.String(), "err", err)
  2634  		return false, nil, nil
  2635  	}
  2637  	ps, err := ExpandPostingsWithContext(ctx, p)
  2638  	if err != nil {
  2639  		level.Error(r.block.logger).Log("msg", "failed to expand cached expanded postings, refetch postings", "id", r.block.meta.ULID.String(), "err", err)
  2640  		return false, nil, nil
  2641  	}
  2643  	if len(ps) > 0 {
  2644  		// As of version two all series entries are 16 byte padded. All references
  2645  		// we get have to account for that to get the correct offset.
  2646  		version, err := r.block.indexHeaderReader.IndexVersion()
  2647  		if err != nil {
  2648  			return false, nil, errors.Wrap(err, "get index version")
  2649  		}
  2650  		if version >= 2 {
  2651  			for i, id := range ps {
  2652  				ps[i] = id * 16
  2653  			}
  2654  		}
  2655  	}
  2656  	return true, ps, nil
  2657  }
  2659  func (r *bucketIndexReader) storeExpandedPostingsToCache(ms []*labels.Matcher, ps index.Postings, length int) {
  2660  	// Encode postings to cache. We compress and cache postings before adding
  2661  	// 16 bytes padding in order to make compressed size smaller.
  2662  	dataToCache, compressionDuration, compressionErrors, compressedSize := r.encodePostingsToCache(ps, length)
  2663  	r.stats.cachedPostingsCompressions++
  2664  	r.stats.cachedPostingsCompressionErrors += compressionErrors
  2665  	r.stats.CachedPostingsCompressionTimeSum += compressionDuration
  2666  	r.stats.CachedPostingsCompressedSizeSum += units.Base2Bytes(compressedSize)
  2667  	r.stats.CachedPostingsOriginalSizeSum += units.Base2Bytes(length * 4) // Estimate the posting list size.
  2668  	r.block.indexCache.StoreExpandedPostings(r.block.meta.ULID, ms, dataToCache)
  2669  }
  2671  var bufioReaderPool = sync.Pool{
  2672  	New: func() any {
  2673  		return bufio.NewReader(nil)
  2674  	},
  2675  }
  2677  // fetchPostings fill postings requested by posting groups.
  2678  // It returns one posting for each key, in the same order.
  2679  // If postings for given key is not fetched, entry at given index will be nil.
  2680  func (r *bucketIndexReader) fetchPostings(ctx context.Context, keys []labels.Label, bytesLimiter BytesLimiter) ([]index.Postings, []func(), error) {
  2681  	var closeFns []func()
  2683  	timer := prometheus.NewTimer(r.block.metrics.postingsFetchDuration)
  2684  	defer timer.ObserveDuration()
  2686  	var ptrs []postingPtr
  2688  	output := make([]index.Postings, len(keys))
  2690  	// Fetch postings from the cache with a single call.
  2691  	fromCache, _ := r.block.indexCache.FetchMultiPostings(ctx, r.block.meta.ULID, keys)
  2692  	for _, dataFromCache := range fromCache {
  2693  		if err := bytesLimiter.Reserve(uint64(len(dataFromCache))); err != nil {
  2694  			return nil, closeFns, httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while loading postings from index cache: %s", err)
  2695  		}
  2696  		r.stats.DataDownloadedSizeSum += units.Base2Bytes(len(dataFromCache))
  2697  	}
  2699  	// Iterate over all groups and fetch posting from cache.
  2700  	// If we have a miss, mark key to be fetched in `ptrs` slice.
  2701  	// Overlaps are well handled by partitioner, so we don't need to deduplicate keys.
  2702  	for ix, key := range keys {
  2703  		if err := ctx.Err(); err != nil {
  2704  			return nil, closeFns, err
  2705  		}
  2706  		// Get postings for the given key from cache first.
  2707  		if b, ok := fromCache[key]; ok {
  2708  			r.stats.postingsTouched++
  2709  			r.stats.PostingsTouchedSizeSum += units.Base2Bytes(len(b))
  2711  			l, closer, err := r.decodeCachedPostings(b)
  2712  			if err != nil {
  2713  				return nil, closeFns, errors.Wrap(err, "decode postings")
  2714  			}
  2715  			output[ix] = l
  2716  			closeFns = append(closeFns, closer...)
  2717  			continue
  2718  		}
  2720  		// Cache miss; save pointer for actual posting in index stored in object store.
  2721  		ptr, err := r.block.indexHeaderReader.PostingsOffset(key.Name, key.Value)
  2722  		if err == indexheader.NotFoundRangeErr {
  2723  			// This block does not have any posting for given key.
  2724  			output[ix] = index.EmptyPostings()
  2725  			continue
  2726  		}
  2728  		if err != nil {
  2729  			return nil, closeFns, errors.Wrap(err, "index header PostingsOffset")
  2730  		}
  2732  		r.stats.postingsToFetch++
  2733  		ptrs = append(ptrs, postingPtr{ptr: ptr, keyID: ix})
  2734  	}
  2736  	sort.Slice(ptrs, func(i, j int) bool {
  2737  		return ptrs[i].ptr.Start < ptrs[j].ptr.Start
  2738  	})
  2740  	// TODO(bwplotka): Asses how large in worst case scenario this can be. (e.g fetch for AllPostingsKeys)
  2741  	// Consider sub split if too big.
  2742  	parts := r.block.partitioner.Partition(len(ptrs), func(i int) (start, end uint64) {
  2743  		return uint64(ptrs[i].ptr.Start), uint64(ptrs[i].ptr.End)
  2744  	})
  2746  	for _, part := range parts {
  2747  		start := int64(part.Start)
  2748  		length := int64(part.End) - start
  2750  		if err := bytesLimiter.Reserve(uint64(length)); err != nil {
  2751  			return nil, closeFns, httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching postings: %s", err)
  2752  		}
  2753  		r.stats.DataDownloadedSizeSum += units.Base2Bytes(length)
  2754  	}
  2756  	g, ctx := errgroup.WithContext(ctx)
  2757  	for _, part := range parts {
  2758  		i, j := part.ElemRng[0], part.ElemRng[1]
  2760  		start := int64(part.Start)
  2761  		// We assume index does not have any ptrs that has 0 length.
  2762  		length := int64(part.End) - start
  2764  		// Fetch from object storage concurrently and update stats and posting list.
  2765  		g.Go(func() error {
  2766  			begin := time.Now()
  2768  			brdr := bufioReaderPool.Get().(*bufio.Reader)
  2769  			defer bufioReaderPool.Put(brdr)
  2771  			partReader, err := r.block.bkt.GetRange(ctx, r.block.indexFilename(), start, length)
  2772  			if err != nil {
  2773  				return errors.Wrap(err, "read postings range")
  2774  			}
  2775  			defer runutil.CloseWithLogOnErr(r.block.logger, partReader, "readIndexRange close range reader")
  2776  			brdr.Reset(partReader)
  2778  			rdr := newPostingsReaderBuilder(ctx, brdr, ptrs[i:j], start, length)
  2780  			r.mtx.Lock()
  2781  			r.stats.postingsFetchCount++
  2782  			r.stats.postingsFetched += j - i
  2783  			r.stats.PostingsFetchedSizeSum += units.Base2Bytes(int(length))
  2784  			r.mtx.Unlock()
  2786  			for rdr.Next() {
  2787  				diffVarintPostings, postingsCount, keyID := rdr.AtDiffVarint()
  2789  				output[keyID] = newDiffVarintPostings(diffVarintPostings, nil)
  2791  				startCompression := time.Now()
  2792  				dataToCache, err := snappyStreamedEncode(int(postingsCount), diffVarintPostings)
  2793  				if err != nil {
  2794  					r.mtx.Lock()
  2795  					r.stats.cachedPostingsCompressionErrors += 1
  2796  					r.mtx.Unlock()
  2797  					return errors.Wrap(err, "encoding with snappy")
  2798  				}
  2800  				r.mtx.Lock()
  2801  				r.stats.postingsTouched++
  2802  				r.stats.PostingsTouchedSizeSum += units.Base2Bytes(int(len(diffVarintPostings)))
  2803  				r.stats.cachedPostingsCompressions += 1
  2804  				r.stats.CachedPostingsOriginalSizeSum += units.Base2Bytes(len(diffVarintPostings))
  2805  				r.stats.CachedPostingsCompressedSizeSum += units.Base2Bytes(len(dataToCache))
  2806  				r.stats.CachedPostingsCompressionTimeSum += time.Since(startCompression)
  2807  				r.mtx.Unlock()
  2809  				r.block.indexCache.StorePostings(r.block.meta.ULID, keys[keyID], dataToCache)
  2810  			}
  2812  			r.mtx.Lock()
  2813  			r.stats.PostingsFetchDurationSum += time.Since(begin)
  2814  			r.mtx.Unlock()
  2816  			if err := rdr.Error(); err != nil {
  2817  				return errors.Wrap(err, "reading postings")
  2818  			}
  2819  			return nil
  2820  		})
  2821  	}
  2823  	return output, closeFns, g.Wait()
  2824  }
  2826  func (r *bucketIndexReader) decodeCachedPostings(b []byte) (index.Postings, []func(), error) {
  2827  	// Even if this instance is not using compression, there may be compressed
  2828  	// entries in the cache written by other stores.
  2829  	var (
  2830  		l        index.Postings
  2831  		err      error
  2832  		closeFns []func()
  2833  	)
  2834  	if isDiffVarintSnappyEncodedPostings(b) || isDiffVarintSnappyStreamedEncodedPostings(b) {
  2835  		s := time.Now()
  2836  		l, err = decodePostings(b)
  2837  		r.stats.cachedPostingsDecompressions += 1
  2838  		r.stats.CachedPostingsDecompressionTimeSum += time.Since(s)
  2839  		if err != nil {
  2840  			r.stats.cachedPostingsDecompressionErrors += 1
  2841  		} else {
  2842  			closeFns = append(closeFns, l.(closeablePostings).close)
  2843  		}
  2844  	} else {
  2845  		_, l, err = r.dec.Postings(b)
  2846  	}
  2847  	return l, closeFns, err
  2848  }
  2850  func (r *bucketIndexReader) encodePostingsToCache(p index.Postings, length int) ([]byte, time.Duration, int, int) {
  2851  	var dataToCache []byte
  2852  	compressionTime := time.Duration(0)
  2853  	compressionErrors, compressedSize := 0, 0
  2854  	s := time.Now()
  2855  	data, err := diffVarintSnappyStreamedEncode(p, length)
  2856  	compressionTime = time.Since(s)
  2857  	if err == nil {
  2858  		dataToCache = data
  2859  		compressedSize = len(data)
  2860  	} else {
  2861  		compressionErrors = 1
  2862  	}
  2863  	return dataToCache, compressionTime, compressionErrors, compressedSize
  2864  }
  2866  // bigEndianPostings implements the Postings interface over a byte stream of
  2867  // big endian numbers.
  2868  type bigEndianPostings struct {
  2869  	list []byte
  2870  	cur  uint32
  2871  }
  2873  // TODO(bwplotka): Expose those inside Prometheus.
  2874  func newBigEndianPostings(list []byte) *bigEndianPostings {
  2875  	return &bigEndianPostings{list: list}
  2876  }
  2878  func (it *bigEndianPostings) At() storage.SeriesRef {
  2879  	return storage.SeriesRef(it.cur)
  2880  }
  2882  func (it *bigEndianPostings) Next() bool {
  2883  	if len(it.list) >= 4 {
  2884  		it.cur = binary.BigEndian.Uint32(it.list)
  2885  		it.list = it.list[4:]
  2886  		return true
  2887  	}
  2888  	return false
  2889  }
  2891  func (it *bigEndianPostings) Seek(x storage.SeriesRef) bool {
  2892  	if storage.SeriesRef(it.cur) >= x {
  2893  		return true
  2894  	}
  2896  	num := len(it.list) / 4
  2897  	// Do binary search between current position and end.
  2898  	i := sort.Search(num, func(i int) bool {
  2899  		return binary.BigEndian.Uint32(it.list[i*4:]) >= uint32(x)
  2900  	})
  2901  	if i < num {
  2902  		j := i * 4
  2903  		it.cur = binary.BigEndian.Uint32(it.list[j:])
  2904  		it.list = it.list[j+4:]
  2905  		return true
  2906  	}
  2907  	it.list = nil
  2908  	return false
  2909  }
  2911  func (it *bigEndianPostings) Err() error {
  2912  	return nil
  2913  }
  2915  // Returns number of remaining postings values.
  2916  func (it *bigEndianPostings) length() int {
  2917  	return len(it.list) / 4
  2918  }
  2920  func (r *bucketIndexReader) PreloadSeries(ctx context.Context, ids []storage.SeriesRef, bytesLimiter BytesLimiter) error {
  2921  	timer := prometheus.NewTimer(r.block.metrics.seriesFetchDuration)
  2922  	defer timer.ObserveDuration()
  2924  	// Load series from cache, overwriting the list of ids to preload
  2925  	// with the missing ones.
  2926  	fromCache, ids := r.block.indexCache.FetchMultiSeries(ctx, r.block.meta.ULID, ids)
  2927  	for id, b := range fromCache {
  2928  		r.loadedSeries[id] = b
  2929  		if err := bytesLimiter.Reserve(uint64(len(b))); err != nil {
  2930  			return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while loading series from index cache: %s", err)
  2931  		}
  2932  		r.stats.DataDownloadedSizeSum += units.Base2Bytes(len(b))
  2933  	}
  2935  	parts := r.block.partitioner.Partition(len(ids), func(i int) (start, end uint64) {
  2936  		return uint64(ids[i]), uint64(ids[i]) + uint64(r.block.estimatedMaxSeriesSize)
  2937  	})
  2939  	g, ctx := errgroup.WithContext(ctx)
  2940  	for _, p := range parts {
  2941  		s, e := p.Start, p.End
  2942  		i, j := p.ElemRng[0], p.ElemRng[1]
  2944  		g.Go(func() error {
  2945  			return r.loadSeries(ctx, ids[i:j], false, s, e, bytesLimiter)
  2946  		})
  2947  	}
  2948  	return g.Wait()
  2949  }
  2951  func (r *bucketIndexReader) loadSeries(ctx context.Context, ids []storage.SeriesRef, refetch bool, start, end uint64, bytesLimiter BytesLimiter) error {
  2952  	begin := time.Now()
  2954  	if bytesLimiter != nil {
  2955  		if err := bytesLimiter.Reserve(uint64(end - start)); err != nil {
  2956  			return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching series: %s", err)
  2957  		}
  2958  		r.mtx.Lock()
  2959  		r.stats.DataDownloadedSizeSum += units.Base2Bytes(end - start)
  2960  		r.mtx.Unlock()
  2961  	}
  2963  	b, err := r.block.readIndexRange(ctx, int64(start), int64(end-start))
  2964  	if err != nil {
  2965  		return errors.Wrap(err, "read series range")
  2966  	}
  2968  	r.mtx.Lock()
  2969  	r.stats.seriesFetchCount++
  2970  	r.stats.seriesFetched += len(ids)
  2971  	r.stats.SeriesFetchDurationSum += time.Since(begin)
  2972  	r.stats.SeriesFetchedSizeSum += units.Base2Bytes(int(end - start))
  2973  	r.mtx.Unlock()
  2975  	for i, id := range ids {
  2976  		c := b[uint64(id)-start:]
  2978  		l, n := binary.Uvarint(c)
  2979  		if n < 1 {
  2980  			return errors.New("reading series length failed")
  2981  		}
  2982  		if len(c) < n+int(l) {
  2983  			if i == 0 && refetch {
  2984  				return errors.Errorf("invalid remaining size, even after refetch, remaining: %d, expected %d", len(c), n+int(l))
  2985  			}
  2987  			// Inefficient, but should be rare.
  2988  			r.block.metrics.seriesRefetches.Inc()
  2989  			level.Warn(r.block.logger).Log("msg", "series size exceeded expected size; refetching", "id", id, "series length", n+int(l), "maxSeriesSize", r.block.estimatedMaxSeriesSize)
  2991  			// Fetch plus to get the size of next one if exists.
  2992  			return r.loadSeries(ctx, ids[i:], true, uint64(id), uint64(id)+uint64(n+int(l)+1), bytesLimiter)
  2993  		}
  2994  		c = c[n : n+int(l)]
  2995  		r.mtx.Lock()
  2996  		r.loadedSeries[id] = c
  2997  		r.block.indexCache.StoreSeries(r.block.meta.ULID, id, c)
  2998  		r.mtx.Unlock()
  2999  	}
  3000  	return nil
  3001  }
  3003  type Part struct {
  3004  	Start uint64
  3005  	End   uint64
  3007  	ElemRng [2]int
  3008  }
  3010  type Partitioner interface {
  3011  	// Partition partitions length entries into n <= length ranges that cover all
  3012  	// input ranges
  3013  	// It supports overlapping ranges.
  3014  	// NOTE: It expects range to be sorted by start time.
  3015  	Partition(length int, rng func(int) (uint64, uint64)) []Part
  3016  }
  3018  type gapBasedPartitioner struct {
  3019  	maxGapSize uint64
  3020  }
  3022  func NewGapBasedPartitioner(maxGapSize uint64) Partitioner {
  3023  	return gapBasedPartitioner{
  3024  		maxGapSize: maxGapSize,
  3025  	}
  3026  }
  3028  // Partition partitions length entries into n <= length ranges that cover all
  3029  // input ranges by combining entries that are separated by reasonably small gaps.
  3030  // It is used to combine multiple small ranges from object storage into bigger, more efficient/cheaper ones.
  3031  func (g gapBasedPartitioner) Partition(length int, rng func(int) (uint64, uint64)) (parts []Part) {
  3032  	j := 0
  3033  	k := 0
  3034  	for k < length {
  3035  		j = k
  3036  		k++
  3038  		p := Part{}
  3039  		p.Start, p.End = rng(j)
  3041  		// Keep growing the range until the end or we encounter a large gap.
  3042  		for ; k < length; k++ {
  3043  			s, e := rng(k)
  3045  			if p.End+g.maxGapSize < s {
  3046  				break
  3047  			}
  3049  			if p.End <= e {
  3050  				p.End = e
  3051  			}
  3052  		}
  3053  		p.ElemRng = [2]int{j, k}
  3054  		parts = append(parts, p)
  3055  	}
  3056  	return parts
  3057  }
  3059  type symbolizedLabel struct {
  3060  	name, value uint32
  3061  }
  3063  // LoadSeriesForTime populates the given symbolized labels for the series identified by the reference if at least one chunk is within
  3064  // time selection.
  3065  // LoadSeriesForTime also populates chunk metas slices if skipChunks if set to false. Chunks are also limited by the given time selection.
  3066  // LoadSeriesForTime returns false, when there are no series data for given time range.
  3067  //
  3068  // Error is returned on decoding error or if the reference does not resolve to a known series.
  3069  func (r *bucketIndexReader) LoadSeriesForTime(ref storage.SeriesRef, lset *[]symbolizedLabel, chks *[]chunks.Meta, skipChunks bool, mint, maxt int64) (ok bool, err error) {
  3070  	b, ok := r.loadedSeries[ref]
  3071  	if !ok {
  3072  		return false, errors.Errorf("series %d not found", ref)
  3073  	}
  3075  	r.stats.seriesTouched++
  3076  	r.stats.SeriesTouchedSizeSum += units.Base2Bytes(len(b))
  3077  	return decodeSeriesForTime(b, lset, chks, skipChunks, mint, maxt)
  3078  }
  3080  // Close released the underlying resources of the reader.
  3081  func (r *bucketIndexReader) Close() error {
  3082  	r.block.pendingReaders.Done()
  3083  	return nil
  3084  }
  3086  // LookupLabelsSymbols allows populates label set strings from symbolized label set.
  3087  func (r *bucketIndexReader) LookupLabelsSymbols(symbolized []symbolizedLabel, lbls *labels.Labels) error {
  3088  	*lbls = (*lbls)[:0]
  3089  	for _, s := range symbolized {
  3090  		ln, err := r.dec.LookupSymbol(
  3091  		if err != nil {
  3092  			return errors.Wrap(err, "lookup label name")
  3093  		}
  3094  		lv, err := r.dec.LookupSymbol(s.value)
  3095  		if err != nil {
  3096  			return errors.Wrap(err, "lookup label value")
  3097  		}
  3098  		*lbls = append(*lbls, labels.Label{Name: ln, Value: lv})
  3099  	}
  3100  	return nil
  3101  }
  3103  // decodeSeriesForTime decodes a series entry from the given byte slice decoding only chunk metas that are within given min and max time.
  3104  // If skipChunks is specified decodeSeriesForTime does not return any chunks, but only labels and only if at least single chunk is within time range.
  3105  // decodeSeriesForTime returns false, when there are no series data for given time range.
  3106  func decodeSeriesForTime(b []byte, lset *[]symbolizedLabel, chks *[]chunks.Meta, skipChunks bool, selectMint, selectMaxt int64) (ok bool, err error) {
  3107  	*lset = (*lset)[:0]
  3108  	*chks = (*chks)[:0]
  3110  	d := encoding.Decbuf{B: b}
  3112  	// Read labels without looking up symbols.
  3113  	k := d.Uvarint()
  3114  	for i := 0; i < k; i++ {
  3115  		lno := uint32(d.Uvarint())
  3116  		lvo := uint32(d.Uvarint())
  3117  		*lset = append(*lset, symbolizedLabel{name: lno, value: lvo})
  3118  	}
  3119  	// Read the chunks meta data.
  3120  	k = d.Uvarint()
  3121  	if k == 0 {
  3122  		return false, d.Err()
  3123  	}
  3125  	// First t0 is absolute, rest is just diff so different type is used (Uvarint64).
  3126  	mint := d.Varint64()
  3127  	maxt := int64(d.Uvarint64()) + mint
  3128  	// Similar for first ref.
  3129  	ref := int64(d.Uvarint64())
  3131  	for i := 0; i < k; i++ {
  3132  		if i > 0 {
  3133  			mint += int64(d.Uvarint64())
  3134  			maxt = int64(d.Uvarint64()) + mint
  3135  			ref += d.Varint64()
  3136  		}
  3138  		if mint > selectMaxt {
  3139  			break
  3140  		}
  3142  		if maxt >= selectMint {
  3143  			// Found a chunk.
  3144  			if skipChunks {
  3145  				// We are not interested in chunks and we know there is at least one, that's enough to return series.
  3146  				return true, nil
  3147  			}
  3149  			*chks = append(*chks, chunks.Meta{
  3150  				Ref:     chunks.ChunkRef(ref),
  3151  				MinTime: mint,
  3152  				MaxTime: maxt,
  3153  			})
  3154  		}
  3156  		mint = maxt
  3157  	}
  3158  	return len(*chks) > 0, d.Err()
  3159  }
  3161  type loadIdx struct {
  3162  	offset uint32
  3163  	// Indices, not actual entries and chunks.
  3164  	seriesEntry int
  3165  	chunk       int
  3166  }
  3168  type bucketChunkReader struct {
  3169  	block *bucketBlock
  3171  	toLoad [][]loadIdx
  3173  	// Mutex protects access to following fields, when updated from chunks-loading goroutines.
  3174  	// After chunks are loaded, mutex is no longer used.
  3175  	mtx        sync.Mutex
  3176  	stats      *queryStats
  3177  	chunkBytes []*[]byte // Byte slice to return to the chunk pool on close.
  3179  	loadingChunksMtx  sync.Mutex
  3180  	loadingChunks     bool
  3181  	finishLoadingChks chan struct{}
  3182  }
  3184  func newBucketChunkReader(block *bucketBlock) *bucketChunkReader {
  3185  	return &bucketChunkReader{
  3186  		block:  block,
  3187  		stats:  &queryStats{},
  3188  		toLoad: make([][]loadIdx, len(block.chunkObjs)),
  3189  	}
  3190  }
  3192  func (r *bucketChunkReader) reset() {
  3193  	for i := range r.toLoad {
  3194  		r.toLoad[i] = r.toLoad[i][:0]
  3195  	}
  3196  	r.loadingChunksMtx.Lock()
  3197  	r.loadingChunks = false
  3198  	r.finishLoadingChks = make(chan struct{})
  3199  	r.loadingChunksMtx.Unlock()
  3200  }
  3202  func (r *bucketChunkReader) Close() error {
  3203  	// NOTE(GiedriusS): we need to wait until loading chunks because loading
  3204  	// chunks modifies r.block.chunkPool.
  3205  	r.loadingChunksMtx.Lock()
  3206  	loadingChks := r.loadingChunks
  3207  	r.loadingChunksMtx.Unlock()
  3209  	if loadingChks {
  3210  		<-r.finishLoadingChks
  3211  	}
  3212  	r.block.pendingReaders.Done()
  3214  	for _, b := range r.chunkBytes {
  3215  		r.block.chunkPool.Put(b)
  3216  	}
  3217  	return nil
  3218  }
  3220  // addLoad adds the chunk with id to the data set to be fetched.
  3221  // Chunk will be fetched and saved to refs[seriesEntry][chunk] upon r.load(refs, <...>) call.
  3222  func (r *bucketChunkReader) addLoad(id chunks.ChunkRef, seriesEntry, chunk int) error {
  3223  	var (
  3224  		seq = int(id >> 32)
  3225  		off = uint32(id)
  3226  	)
  3227  	if seq >= len(r.toLoad) {
  3228  		return errors.Errorf("reference sequence %d out of range", seq)
  3229  	}
  3230  	r.toLoad[seq] = append(r.toLoad[seq], loadIdx{off, seriesEntry, chunk})
  3231  	return nil
  3232  }
  3234  // load loads all added chunks and saves resulting aggrs to refs.
  3235  func (r *bucketChunkReader) load(ctx context.Context, res []seriesEntry, aggrs []storepb.Aggr, calculateChunkChecksum bool, bytesLimiter BytesLimiter) error {
  3236  	r.loadingChunksMtx.Lock()
  3237  	r.loadingChunks = true
  3238  	r.loadingChunksMtx.Unlock()
  3240  	defer func() {
  3241  		r.loadingChunksMtx.Lock()
  3242  		r.loadingChunks = false
  3243  		r.loadingChunksMtx.Unlock()
  3245  		close(r.finishLoadingChks)
  3246  	}()
  3248  	g, ctx := errgroup.WithContext(ctx)
  3250  	for seq, pIdxs := range r.toLoad {
  3251  		sort.Slice(pIdxs, func(i, j int) bool {
  3252  			return pIdxs[i].offset < pIdxs[j].offset
  3253  		})
  3254  		parts := r.block.partitioner.Partition(len(pIdxs), func(i int) (start, end uint64) {
  3255  			return uint64(pIdxs[i].offset), uint64(pIdxs[i].offset) + uint64(r.block.estimatedMaxChunkSize)
  3256  		})
  3258  		for _, p := range parts {
  3259  			if err := bytesLimiter.Reserve(uint64(p.End - p.Start)); err != nil {
  3260  				return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching chunks: %s", err)
  3261  			}
  3262  			r.stats.DataDownloadedSizeSum += units.Base2Bytes(p.End - p.Start)
  3263  		}
  3265  		for _, p := range parts {
  3266  			seq := seq
  3267  			p := p
  3268  			indices := pIdxs[p.ElemRng[0]:p.ElemRng[1]]
  3269  			g.Go(func() error {
  3270  				return r.loadChunks(ctx, res, aggrs, seq, p, indices, calculateChunkChecksum, bytesLimiter)
  3271  			})
  3272  		}
  3273  	}
  3274  	return g.Wait()
  3275  }
  3277  // loadChunks will read range [start, end] from the segment file with sequence number seq.
  3278  // This data range covers chunks starting at supplied offsets.
  3279  func (r *bucketChunkReader) loadChunks(ctx context.Context, res []seriesEntry, aggrs []storepb.Aggr, seq int, part Part, pIdxs []loadIdx, calculateChunkChecksum bool, bytesLimiter BytesLimiter) error {
  3280  	var locked bool
  3281  	fetchBegin := time.Now()
  3282  	defer func() {
  3283  		if !locked {
  3284  			r.mtx.Lock()
  3285  		}
  3286  		r.stats.ChunksFetchDurationSum += time.Since(fetchBegin)
  3287  		r.mtx.Unlock()
  3288  	}()
  3290  	// Get a reader for the required range.
  3291  	reader, err := r.block.chunkRangeReader(ctx, seq, int64(part.Start), int64(part.End-part.Start))
  3292  	if err != nil {
  3293  		return errors.Wrap(err, "get range reader")
  3294  	}
  3295  	defer runutil.CloseWithLogOnErr(r.block.logger, reader, "readChunkRange close range reader")
  3296  	bufReader := bufio.NewReaderSize(reader, r.block.estimatedMaxChunkSize)
  3298  	locked = true
  3299  	r.mtx.Lock()
  3301  	r.stats.chunksFetchCount++
  3302  	r.stats.chunksFetched += len(pIdxs)
  3303  	r.stats.ChunksFetchedSizeSum += units.Base2Bytes(int(part.End - part.Start))
  3305  	var (
  3306  		buf        []byte
  3307  		readOffset = int(pIdxs[0].offset)
  3309  		// Save a few allocations.
  3310  		written  int
  3311  		diff     uint32
  3312  		chunkLen int
  3313  		n        int
  3314  	)
  3316  	bufPooled, err := r.block.chunkPool.Get(r.block.estimatedMaxChunkSize)
  3317  	if err == nil {
  3318  		buf = *bufPooled
  3319  	} else {
  3320  		buf = make([]byte, r.block.estimatedMaxChunkSize)
  3321  	}
  3322  	defer r.block.chunkPool.Put(&buf)
  3324  	for i, pIdx := range pIdxs {
  3325  		// Fast forward range reader to the next chunk start in case of sparse (for our purposes) byte range.
  3326  		for readOffset < int(pIdx.offset) {
  3327  			written, err = bufReader.Discard(int(pIdx.offset) - int(readOffset))
  3328  			if err != nil {
  3329  				return errors.Wrap(err, "fast forward range reader")
  3330  			}
  3331  			readOffset += written
  3332  		}
  3333  		// Presume chunk length to be reasonably large for common use cases.
  3334  		// However, declaration for EstimatedMaxChunkSize warns us some chunks could be larger in some rare cases.
  3335  		// This is handled further down below.
  3336  		chunkLen = r.block.estimatedMaxChunkSize
  3337  		if i+1 < len(pIdxs) {
  3338  			if diff = pIdxs[i+1].offset - pIdx.offset; int(diff) < chunkLen {
  3339  				chunkLen = int(diff)
  3340  			}
  3341  		}
  3342  		cb := buf[:chunkLen]
  3343  		n, err = io.ReadFull(bufReader, cb)
  3344  		readOffset += n
  3345  		// Unexpected EOF for last chunk could be a valid case. Any other errors are definitely real.
  3346  		if err != nil && !(errors.Is(err, io.ErrUnexpectedEOF) && i == len(pIdxs)-1) {
  3347  			return errors.Wrapf(err, "read range for seq %d offset %x", seq, pIdx.offset)
  3348  		}
  3350  		chunkDataLen, n := binary.Uvarint(cb)
  3351  		if n < 1 {
  3352  			return errors.New("reading chunk length failed")
  3353  		}
  3355  		// Chunk length is n (number of bytes used to encode chunk data), 1 for chunk encoding and chunkDataLen for actual chunk data.
  3356  		// There is also crc32 after the chunk, but we ignore that.
  3357  		chunkLen = n + 1 + int(chunkDataLen)
  3358  		if chunkLen <= len(cb) {
  3359  			err = populateChunk(&(res[pIdx.seriesEntry].chks[pIdx.chunk]), rawChunk(cb[n:chunkLen]), aggrs,, calculateChunkChecksum)
  3360  			if err != nil {
  3361  				return errors.Wrap(err, "populate chunk")
  3362  			}
  3363  			r.stats.chunksTouched++
  3364  			r.stats.ChunksTouchedSizeSum += units.Base2Bytes(int(chunkDataLen))
  3365  			continue
  3366  		}
  3368  		r.block.metrics.chunkRefetches.Inc()
  3369  		// If we didn't fetch enough data for the chunk, fetch more.
  3370  		fetchBegin = time.Now()
  3371  		// Read entire chunk into new buffer.
  3372  		// TODO: readChunkRange call could be avoided for any chunk but last in this particular part.
  3373  		if err := bytesLimiter.Reserve(uint64(chunkLen)); err != nil {
  3374  			return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching chunks: %s", err)
  3375  		}
  3376  		r.stats.DataDownloadedSizeSum += units.Base2Bytes(chunkLen)
  3377  		r.mtx.Unlock()
  3378  		locked = false
  3380  		nb, err := r.block.readChunkRange(ctx, seq, int64(pIdx.offset), int64(chunkLen), []byteRange{{offset: 0, length: chunkLen}})
  3381  		if err != nil {
  3382  			return errors.Wrapf(err, "preloaded chunk too small, expecting %d, and failed to fetch full chunk", chunkLen)
  3383  		}
  3384  		if len(*nb) != chunkLen {
  3385  			return errors.Errorf("preloaded chunk too small, expecting %d", chunkLen)
  3386  		}
  3388  		r.mtx.Lock()
  3389  		locked = true
  3391  		r.stats.chunksFetchCount++
  3392  		r.stats.ChunksFetchedSizeSum += units.Base2Bytes(len(*nb))
  3393  		err = populateChunk(&(res[pIdx.seriesEntry].chks[pIdx.chunk]), rawChunk((*nb)[n:]), aggrs,, calculateChunkChecksum)
  3394  		if err != nil {
  3395  			r.block.chunkPool.Put(nb)
  3396  			return errors.Wrap(err, "populate chunk")
  3397  		}
  3398  		r.stats.chunksTouched++
  3399  		r.stats.ChunksTouchedSizeSum += units.Base2Bytes(int(chunkDataLen))
  3401  		r.block.chunkPool.Put(nb)
  3402  	}
  3403  	return nil
  3404  }
  3406  // save saves a copy of b's payload to a memory pool of its own and returns a new byte slice referencing said copy.
  3407  // Returned slice becomes invalid once r.block.chunkPool.Put() is called.
  3408  func (r *bucketChunkReader) save(b []byte) ([]byte, error) {
  3409  	// Ensure we never grow slab beyond original capacity.
  3410  	if len(r.chunkBytes) == 0 ||
  3411  		cap(*r.chunkBytes[len(r.chunkBytes)-1])-len(*r.chunkBytes[len(r.chunkBytes)-1]) < len(b) {
  3412  		s, err := r.block.chunkPool.Get(len(b))
  3413  		if err != nil {
  3414  			return nil, errors.Wrap(err, "allocate chunk bytes")
  3415  		}
  3416  		r.chunkBytes = append(r.chunkBytes, s)
  3417  	}
  3418  	slab := r.chunkBytes[len(r.chunkBytes)-1]
  3419  	*slab = append(*slab, b...)
  3420  	return (*slab)[len(*slab)-len(b):], nil
  3421  }
  3423  // rawChunk is a helper type that wraps a chunk's raw bytes and implements the chunkenc.Chunk
  3424  // interface over it.
  3425  // It is used to Store API responses which don't need to introspect and validate the chunk's contents.
  3426  type rawChunk []byte
  3428  func (b rawChunk) Encoding() chunkenc.Encoding {
  3429  	return chunkenc.Encoding(b[0])
  3430  }
  3432  func (b rawChunk) Bytes() []byte {
  3433  	return b[1:]
  3434  }
  3435  func (b rawChunk) Compact() {}
  3437  func (b rawChunk) Iterator(_ chunkenc.Iterator) chunkenc.Iterator {
  3438  	panic("invalid call")
  3439  }
  3441  func (b rawChunk) Appender() (chunkenc.Appender, error) {
  3442  	panic("invalid call")
  3443  }
  3445  func (b rawChunk) NumSamples() int {
  3446  	panic("invalid call")
  3447  }
// queryStats accumulates counters, byte totals and durations collected while
// serving a query. Values are combined field-by-field by merge, and a subset
// (no durations, no cachedPostings* fields) is exported by toHints.
type queryStats struct {
	blocksQueried int

	// Postings statistics.
	postingsTouched          int
	PostingsTouchedSizeSum   units.Base2Bytes
	postingsToFetch          int
	postingsFetched          int
	PostingsFetchedSizeSum   units.Base2Bytes
	postingsFetchCount       int
	PostingsFetchDurationSum time.Duration

	// Cached postings compression / decompression statistics.
	cachedPostingsCompressions         int
	cachedPostingsCompressionErrors    int
	CachedPostingsOriginalSizeSum      units.Base2Bytes
	CachedPostingsCompressedSizeSum    units.Base2Bytes
	CachedPostingsCompressionTimeSum   time.Duration
	cachedPostingsDecompressions       int
	cachedPostingsDecompressionErrors  int
	CachedPostingsDecompressionTimeSum time.Duration

	// Series statistics.
	seriesTouched          int
	SeriesTouchedSizeSum   units.Base2Bytes
	seriesFetched          int
	SeriesFetchedSizeSum   units.Base2Bytes
	seriesFetchCount       int
	SeriesFetchDurationSum time.Duration

	// Chunk statistics. The chunk reader bumps chunksTouched and
	// ChunksTouchedSizeSum for every populated chunk, and chunksFetchCount and
	// ChunksFetchedSizeSum when a chunk has to be re-read in full.
	chunksTouched          int
	ChunksTouchedSizeSum   units.Base2Bytes
	chunksFetched          int
	ChunksFetchedSizeSum   units.Base2Bytes
	chunksFetchCount       int
	ChunksFetchDurationSum time.Duration

	// Overall retrieval and merge statistics.
	GetAllDuration    time.Duration
	mergedSeriesCount int
	mergedChunksCount int
	MergeDuration     time.Duration

	// Total size of downloaded data; the chunk reader adds the length of each
	// re-fetched chunk range here.
	DataDownloadedSizeSum units.Base2Bytes
}
  3491  func (s queryStats) merge(o *queryStats) *queryStats {
  3492  	s.blocksQueried += o.blocksQueried
  3494  	s.postingsToFetch += o.postingsToFetch
  3495  	s.postingsTouched += o.postingsTouched
  3496  	s.PostingsTouchedSizeSum += o.PostingsTouchedSizeSum
  3497  	s.postingsFetched += o.postingsFetched
  3498  	s.PostingsFetchedSizeSum += o.PostingsFetchedSizeSum
  3499  	s.postingsFetchCount += o.postingsFetchCount
  3500  	s.PostingsFetchDurationSum += o.PostingsFetchDurationSum
  3502  	s.cachedPostingsCompressions += o.cachedPostingsCompressions
  3503  	s.cachedPostingsCompressionErrors += o.cachedPostingsCompressionErrors
  3504  	s.CachedPostingsOriginalSizeSum += o.CachedPostingsOriginalSizeSum
  3505  	s.CachedPostingsCompressedSizeSum += o.CachedPostingsCompressedSizeSum
  3506  	s.CachedPostingsCompressionTimeSum += o.CachedPostingsCompressionTimeSum
  3507  	s.cachedPostingsDecompressions += o.cachedPostingsDecompressions
  3508  	s.cachedPostingsDecompressionErrors += o.cachedPostingsDecompressionErrors
  3509  	s.CachedPostingsDecompressionTimeSum += o.CachedPostingsDecompressionTimeSum
  3511  	s.seriesTouched += o.seriesTouched
  3512  	s.SeriesTouchedSizeSum += o.SeriesTouchedSizeSum
  3513  	s.seriesFetched += o.seriesFetched
  3514  	s.SeriesFetchedSizeSum += o.SeriesFetchedSizeSum
  3515  	s.seriesFetchCount += o.seriesFetchCount
  3516  	s.SeriesFetchDurationSum += o.SeriesFetchDurationSum
  3518  	s.chunksTouched += o.chunksTouched
  3519  	s.ChunksTouchedSizeSum += o.ChunksTouchedSizeSum
  3520  	s.chunksFetched += o.chunksFetched
  3521  	s.ChunksFetchedSizeSum += o.ChunksFetchedSizeSum
  3522  	s.chunksFetchCount += o.chunksFetchCount
  3523  	s.ChunksFetchDurationSum += o.ChunksFetchDurationSum
  3525  	s.GetAllDuration += o.GetAllDuration
  3526  	s.mergedSeriesCount += o.mergedSeriesCount
  3527  	s.mergedChunksCount += o.mergedChunksCount
  3528  	s.MergeDuration += o.MergeDuration
  3530  	s.DataDownloadedSizeSum += o.DataDownloadedSizeSum
  3532  	return &s
  3533  }
  3535  func (s queryStats) toHints() *hintspb.QueryStats {
  3536  	return &hintspb.QueryStats{
  3537  		BlocksQueried:          int64(s.blocksQueried),
  3538  		PostingsTouched:        int64(s.postingsTouched),
  3539  		PostingsTouchedSizeSum: int64(s.PostingsTouchedSizeSum),
  3540  		PostingsToFetch:        int64(s.postingsToFetch),
  3541  		PostingsFetched:        int64(s.postingsFetched),
  3542  		PostingsFetchedSizeSum: int64(s.PostingsFetchedSizeSum),
  3543  		PostingsFetchCount:     int64(s.postingsFetchCount),
  3544  		SeriesTouched:          int64(s.seriesTouched),
  3545  		SeriesTouchedSizeSum:   int64(s.SeriesTouchedSizeSum),
  3546  		SeriesFetched:          int64(s.seriesFetched),
  3547  		SeriesFetchedSizeSum:   int64(s.SeriesFetchedSizeSum),
  3548  		SeriesFetchCount:       int64(s.seriesFetchCount),
  3549  		ChunksTouched:          int64(s.chunksTouched),
  3550  		ChunksTouchedSizeSum:   int64(s.ChunksTouchedSizeSum),
  3551  		ChunksFetched:          int64(s.chunksFetched),
  3552  		ChunksFetchedSizeSum:   int64(s.ChunksFetchedSizeSum),
  3553  		ChunksFetchCount:       int64(s.chunksFetchCount),
  3554  		MergedSeriesCount:      int64(s.mergedSeriesCount),
  3555  		MergedChunksCount:      int64(s.mergedChunksCount),
  3556  		DataDownloadedSizeSum:  int64(s.DataDownloadedSizeSum),
  3557  	}
  3558  }
  3560  // NewDefaultChunkBytesPool returns a chunk bytes pool with default settings.
  3561  func NewDefaultChunkBytesPool(maxChunkPoolBytes uint64) (pool.Bytes, error) {
  3562  	return pool.NewBucketedBytes(chunkBytesPoolMinSize, chunkBytesPoolMaxSize, 2, maxChunkPoolBytes)
  3563  }