github.com/thanos-io/thanos@v0.32.5/pkg/store/bucket.go

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package store
     5  
     6  import (
     7  	"bufio"
     8  	"bytes"
     9  	"context"
    10  	"encoding/binary"
    11  	"fmt"
    12  	"hash"
    13  	"io"
    14  	"math"
    15  	"os"
    16  	"path"
    17  	"path/filepath"
    18  	"sort"
    19  	"strings"
    20  	"sync"
    21  	"time"
    22  
    23  	"github.com/alecthomas/units"
    24  	"github.com/cespare/xxhash"
    25  	"github.com/go-kit/log"
    26  	"github.com/go-kit/log/level"
    27  	"github.com/gogo/protobuf/types"
    28  	"github.com/oklog/ulid"
    29  	"github.com/pkg/errors"
    30  	"github.com/prometheus/client_golang/prometheus"
    31  	"github.com/prometheus/client_golang/prometheus/promauto"
    32  	"github.com/prometheus/prometheus/model/labels"
    33  	"github.com/prometheus/prometheus/storage"
    34  	"github.com/prometheus/prometheus/tsdb/chunkenc"
    35  	"github.com/prometheus/prometheus/tsdb/chunks"
    36  	"github.com/prometheus/prometheus/tsdb/encoding"
    37  	"github.com/prometheus/prometheus/tsdb/index"
    38  	"github.com/weaveworks/common/httpgrpc"
    39  	"golang.org/x/exp/slices"
    40  	"golang.org/x/sync/errgroup"
    41  	"google.golang.org/grpc"
    42  	"google.golang.org/grpc/codes"
    43  	"google.golang.org/grpc/status"
    44  
    45  	"github.com/thanos-io/objstore"
    46  
    47  	"github.com/thanos-io/thanos/pkg/block"
    48  	"github.com/thanos-io/thanos/pkg/block/indexheader"
    49  	"github.com/thanos-io/thanos/pkg/block/metadata"
    50  	"github.com/thanos-io/thanos/pkg/compact/downsample"
    51  	"github.com/thanos-io/thanos/pkg/component"
    52  	"github.com/thanos-io/thanos/pkg/extprom"
    53  	"github.com/thanos-io/thanos/pkg/gate"
    54  	"github.com/thanos-io/thanos/pkg/info/infopb"
    55  	"github.com/thanos-io/thanos/pkg/model"
    56  	"github.com/thanos-io/thanos/pkg/pool"
    57  	"github.com/thanos-io/thanos/pkg/runutil"
    58  	storecache "github.com/thanos-io/thanos/pkg/store/cache"
    59  	"github.com/thanos-io/thanos/pkg/store/hintspb"
    60  	"github.com/thanos-io/thanos/pkg/store/labelpb"
    61  	"github.com/thanos-io/thanos/pkg/store/storepb"
    62  	"github.com/thanos-io/thanos/pkg/strutil"
    63  	"github.com/thanos-io/thanos/pkg/tenancy"
    64  	"github.com/thanos-io/thanos/pkg/tracing"
    65  )
    66  
    67  const (
    68  	// MaxSamplesPerChunk is approximately the max number of samples that we may have in any given chunk. This is needed
    69  	// for precalculating the number of samples that we may have to retrieve and decode for any given query
    70  	// without downloading them. Please take a look at https://github.com/prometheus/tsdb/pull/397 to know
    71  	// where this number comes from. Long story short: TSDB is made in such a way, and it is made in such a way
    72  	// where this number comes from. Long story short: TSDB is built this way because you barely get any
    73  	// improvement in compression once the number of samples per chunk goes beyond this.
    74  	MaxSamplesPerChunk = 120
    75  	// EstimatedMaxChunkSize is the average maximum chunk size. It can be exceeded in very rare (valid) cases.
    76  	EstimatedMaxChunkSize  = 16000
    77  	EstimatedMaxSeriesSize = 64 * 1024
    78  	// Relatively large in order to reduce memory waste, yet small enough to avoid excessive allocations.
    79  	chunkBytesPoolMinSize = 64 * 1024        // 64 KiB
    80  	chunkBytesPoolMaxSize = 64 * 1024 * 1024 // 64 MiB
    81  
    82  	// CompatibilityTypeLabelName is an artificial label that Store Gateway can optionally advertise. This is required for compatibility
    83  	// with pre v0.8.0 Queriers. Previous Queriers were strict about duplicated external labels of all StoreAPIs that had any labels.
    84  	// With newer Store Gateways advertising all the external labels they have access to, there was a simple case where
    85  	// the Querier was blocking the Store Gateway as a duplicate of the sidecar.
    86  	//
    87  	// Newer Queriers are not strict; the duplicated external labels check is no longer there.
    88  	// Additionally, newer Queriers remove/ignore this exact label from the UI and querying.
    89  	//
    90  	// This label name is intentionally against Prometheus label style.
    91  	// TODO(bwplotka): Remove it at some point.
    92  	CompatibilityTypeLabelName = "@thanos_compatibility_store_type"
    93  
    94  	// DefaultPostingOffsetInMemorySampling represents default value for --store.index-header-posting-offsets-in-mem-sampling.
    95  	// The value 32 is chosen as a good balance for common setups: sampling that is not too large (too many CPU cycles) and
    96  	// not too small (too much memory).
    97  	DefaultPostingOffsetInMemorySampling = 32
    98  
    99  	PartitionerMaxGapSize = 512 * 1024
   100  
   101  	// Labels for metrics.
   102  	labelEncode = "encode"
   103  	labelDecode = "decode"
   104  
   105  	minBlockSyncConcurrency = 1
   106  
   107  	enableChunkHashCalculation = true
   108  
   109  	// SeriesBatchSize is the default batch size when fetching series from object storage.
   110  	SeriesBatchSize = 10000
   111  )
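
// The constants above are rough planning figures. The helper below is a hypothetical sketch (not
// part of Thanos) showing how MaxSamplesPerChunk and EstimatedMaxChunkSize could be used to
// pre-estimate how many chunks and bytes a query over a time range might touch, assuming the
// caller supplies a series count and scrape interval.
func estimateSeriesFetchCost(seriesCount int, rangeMillis, scrapeIntervalMillis int64) (chunkCount int, byteCount int64) {
	if scrapeIntervalMillis <= 0 || rangeMillis <= 0 || seriesCount <= 0 {
		return 0, 0
	}
	samplesPerSeries := rangeMillis / scrapeIntervalMillis
	// Each chunk holds roughly MaxSamplesPerChunk samples, so round up per series.
	chunksPerSeries := int((samplesPerSeries + MaxSamplesPerChunk - 1) / MaxSamplesPerChunk)
	chunkCount = chunksPerSeries * seriesCount
	// EstimatedMaxChunkSize is pessimistic per chunk, so this is an upper-bound estimate.
	byteCount = int64(chunkCount) * EstimatedMaxChunkSize
	return chunkCount, byteCount
}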
   112  
   113  var (
   114  	errBlockSyncConcurrencyNotValid = errors.New("the block sync concurrency must be equal or greater than 1.")
   115  	hashPool                        = sync.Pool{New: func() interface{} { return xxhash.New() }}
   116  )
   117  
   118  type bucketStoreMetrics struct {
   119  	blocksLoaded          prometheus.Gauge
   120  	blockLoads            prometheus.Counter
   121  	blockLoadFailures     prometheus.Counter
   122  	lastLoadedBlock       prometheus.Gauge
   123  	blockDrops            prometheus.Counter
   124  	blockDropFailures     prometheus.Counter
   125  	seriesDataTouched     *prometheus.HistogramVec
   126  	seriesDataFetched     *prometheus.HistogramVec
   127  	seriesDataSizeTouched *prometheus.HistogramVec
   128  	seriesDataSizeFetched *prometheus.HistogramVec
   129  	seriesBlocksQueried   prometheus.Histogram
   130  	seriesGetAllDuration  prometheus.Histogram
   131  	seriesMergeDuration   prometheus.Histogram
   132  	resultSeriesCount     prometheus.Histogram
   133  	chunkSizeBytes        prometheus.Histogram
   134  	postingsSizeBytes     prometheus.Histogram
   135  	queriesDropped        *prometheus.CounterVec
   136  	seriesRefetches       prometheus.Counter
   137  	chunkRefetches        prometheus.Counter
   138  	emptyPostingCount     prometheus.Counter
   139  
   140  	cachedPostingsCompressions           *prometheus.CounterVec
   141  	cachedPostingsCompressionErrors      *prometheus.CounterVec
   142  	cachedPostingsCompressionTimeSeconds *prometheus.CounterVec
   143  	cachedPostingsOriginalSizeBytes      prometheus.Counter
   144  	cachedPostingsCompressedSizeBytes    prometheus.Counter
   145  
   146  	seriesFetchDuration   prometheus.Histogram
   147  	postingsFetchDuration prometheus.Histogram
   148  	chunkFetchDuration    prometheus.Histogram
   149  }
   150  
   151  func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics {
   152  	var m bucketStoreMetrics
   153  
   154  	m.blockLoads = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   155  		Name: "thanos_bucket_store_block_loads_total",
   156  		Help: "Total number of remote block loading attempts.",
   157  	})
   158  	m.blockLoadFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   159  		Name: "thanos_bucket_store_block_load_failures_total",
   160  		Help: "Total number of failed remote block loading attempts.",
   161  	})
   162  	m.blockDrops = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   163  		Name: "thanos_bucket_store_block_drops_total",
   164  		Help: "Total number of local blocks that were dropped.",
   165  	})
   166  	m.blockDropFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   167  		Name: "thanos_bucket_store_block_drop_failures_total",
   168  		Help: "Total number of local blocks that failed to be dropped.",
   169  	})
   170  	m.blocksLoaded = promauto.With(reg).NewGauge(prometheus.GaugeOpts{
   171  		Name: "thanos_bucket_store_blocks_loaded",
   172  		Help: "Number of currently loaded blocks.",
   173  	})
   174  	m.lastLoadedBlock = promauto.With(reg).NewGauge(prometheus.GaugeOpts{
   175  		Name: "thanos_bucket_store_blocks_last_loaded_timestamp_seconds",
   176  		Help: "Timestamp when last block got loaded.",
   177  	})
   178  
   179  	m.seriesDataTouched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
   180  		Name:    "thanos_bucket_store_series_data_touched",
   181  		Help:    "Number of items of a data type touched to fulfill a single Store API series request.",
   182  		Buckets: prometheus.ExponentialBuckets(200, 2, 15),
   183  	}, []string{"data_type"})
   184  	m.seriesDataFetched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
   185  		Name:    "thanos_bucket_store_series_data_fetched",
   186  		Help:    "Number of items of a data type retrieved to fulfill a single Store API series request.",
   187  		Buckets: prometheus.ExponentialBuckets(200, 2, 15),
   188  	}, []string{"data_type"})
   189  
   190  	m.seriesDataSizeTouched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
   191  		Name:    "thanos_bucket_store_series_data_size_touched_bytes",
   192  		Help:    "Total size of items of a data type touched to fulfill a single Store API series request in Bytes.",
   193  		Buckets: prometheus.ExponentialBuckets(1024, 2, 15),
   194  	}, []string{"data_type"})
   195  	m.seriesDataSizeFetched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
   196  		Name:    "thanos_bucket_store_series_data_size_fetched_bytes",
   197  		Help:    "Total size of items of a data type fetched to fulfill a single Store API series request in Bytes.",
   198  		Buckets: prometheus.ExponentialBuckets(1024, 2, 15),
   199  	}, []string{"data_type"})
   200  
   201  	m.seriesBlocksQueried = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   202  		Name:    "thanos_bucket_store_series_blocks_queried",
   203  		Help:    "Number of blocks in a bucket store that were touched to satisfy a query.",
   204  		Buckets: prometheus.ExponentialBuckets(1, 2, 10),
   205  	})
   206  	m.seriesGetAllDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   207  		Name:    "thanos_bucket_store_series_get_all_duration_seconds",
   208  		Help:    "Time it takes until all per-block prepares and loads for a query are finished.",
   209  		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
   210  	})
   211  	m.seriesMergeDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   212  		Name:    "thanos_bucket_store_series_merge_duration_seconds",
   213  		Help:    "Time it takes to merge sub-results from all queried blocks into a single result.",
   214  		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
   215  	})
   216  	m.resultSeriesCount = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   217  		Name:    "thanos_bucket_store_series_result_series",
   218  		Help:    "Number of series observed in the final result of a query.",
   219  		Buckets: prometheus.ExponentialBuckets(1, 2, 15),
   220  	})
   221  
   222  	m.chunkSizeBytes = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   223  		Name: "thanos_bucket_store_sent_chunk_size_bytes",
   224  		Help: "Size in bytes of the chunks for a single series, which is equivalent to the gRPC message size sent to the querier.",
   225  		Buckets: []float64{
   226  			32, 256, 512, 1024, 32 * 1024, 256 * 1024, 512 * 1024, 1024 * 1024, 32 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024,
   227  		},
   228  	})
   229  
   230  	m.postingsSizeBytes = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   231  		Name: "thanos_bucket_store_postings_size_bytes",
   232  		Help: "Size in bytes of the postings for a single series call.",
   233  		Buckets: []float64{
   234  			32, 256, 512, 1024, 32 * 1024, 256 * 1024, 512 * 1024, 1024 * 1024, 32 * 1024 * 1024, 128 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024, 768 * 1024 * 1024, 1024 * 1024 * 1024,
   235  		},
   236  	})
   237  
   238  	m.queriesDropped = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   239  		Name: "thanos_bucket_store_queries_dropped_total",
   240  		Help: "Number of queries that were dropped due to the limit.",
   241  	}, []string{"reason"})
   242  	m.seriesRefetches = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   243  		Name: "thanos_bucket_store_series_refetches_total",
   244  		Help: "Total number of cases where the configured estimated series bytes was not enough to fetch series from the index, resulting in a refetch.",
   245  	})
   246  	m.chunkRefetches = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   247  		Name: "thanos_bucket_store_chunk_refetches_total",
   248  		Help: "Total number of cases where the configured estimated chunk bytes was not enough to fetch chunks from the object store, resulting in a refetch.",
   249  	})
   250  
   251  	m.cachedPostingsCompressions = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   252  		Name: "thanos_bucket_store_cached_postings_compressions_total",
   253  		Help: "Number of postings compressions before storing to index cache.",
   254  	}, []string{"op"})
   255  	m.cachedPostingsCompressions.WithLabelValues(labelEncode)
   256  	m.cachedPostingsCompressions.WithLabelValues(labelDecode)
   257  
   258  	m.cachedPostingsCompressionErrors = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   259  		Name: "thanos_bucket_store_cached_postings_compression_errors_total",
   260  		Help: "Number of postings compression errors.",
   261  	}, []string{"op"})
   262  	m.cachedPostingsCompressionErrors.WithLabelValues(labelEncode)
   263  	m.cachedPostingsCompressionErrors.WithLabelValues(labelDecode)
   264  
   265  	m.cachedPostingsCompressionTimeSeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
   266  		Name: "thanos_bucket_store_cached_postings_compression_time_seconds_total",
   267  		Help: "Time spent compressing postings before storing them into postings cache.",
   268  	}, []string{"op"})
   269  	m.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelEncode)
   270  	m.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelDecode)
   271  
   272  	m.cachedPostingsOriginalSizeBytes = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   273  		Name: "thanos_bucket_store_cached_postings_original_size_bytes_total",
   274  		Help: "Original size of postings stored into cache.",
   275  	})
   276  	m.cachedPostingsCompressedSizeBytes = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   277  		Name: "thanos_bucket_store_cached_postings_compressed_size_bytes_total",
   278  		Help: "Compressed size of postings stored into cache.",
   279  	})
   280  
   281  	m.seriesFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   282  		Name:    "thanos_bucket_store_series_fetch_duration_seconds",
   283  		Help:    "The time it takes to fetch series to respond to a request sent to a store gateway. It includes both the time to fetch it from the cache and from storage in case of cache misses.",
   284  		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
   285  	})
   286  
   287  	m.postingsFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   288  		Name:    "thanos_bucket_store_postings_fetch_duration_seconds",
   289  		Help:    "The time it takes to fetch postings to respond to a request sent to a store gateway. It includes both the time to fetch it from the cache and from storage in case of cache misses.",
   290  		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
   291  	})
   292  
   293  	m.chunkFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
   294  		Name:    "thanos_bucket_store_chunks_fetch_duration_seconds",
   295  		Help:    "The total time spent fetching chunks within a single request to a store gateway.",
   296  		Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120},
   297  	})
   298  
   299  	m.emptyPostingCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{
   300  		Name: "thanos_bucket_store_empty_postings_total",
   301  		Help: "Total number of empty postings when fetching block series.",
   302  	})
   303  
   304  	return &m
   305  }
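
// A minimal usage sketch (illustrative only, not part of the original file): every collector above
// is registered via promauto against whatever registerer is passed in, so a caller only needs to
// hand over a registry. The function name below is hypothetical.
func exampleBucketStoreMetrics() *bucketStoreMetrics {
	reg := prometheus.NewRegistry()
	// All counters, gauges and histograms declared in newBucketStoreMetrics are now registered on
	// reg and will be exposed by any handler that serves this registry.
	return newBucketStoreMetrics(reg)
}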
   306  
   307  // FilterConfig is a configuration which Store uses for filtering metrics based on time.
   308  type FilterConfig struct {
   309  	MinTime, MaxTime model.TimeOrDurationValue
   310  }
   311  
   312  type BlockEstimator func(meta metadata.Meta) uint64
   313  
   314  // BucketStore implements the store API backed by a bucket. It loads all index
   315  // files to local disk.
   316  //
   317  // NOTE: Bucket store reencodes postings using diff+varint+snappy when storing to cache.
   318  // This makes them smaller, but takes extra CPU and memory.
   319  // When used with in-memory cache, memory usage should decrease overall, thanks to postings being smaller.
   320  type BucketStore struct {
   321  	logger          log.Logger
   322  	reg             prometheus.Registerer // TODO(metalmatze) remove and add via BucketStoreOption
   323  	metrics         *bucketStoreMetrics
   324  	bkt             objstore.InstrumentedBucketReader
   325  	fetcher         block.MetadataFetcher
   326  	dir             string
   327  	indexCache      storecache.IndexCache
   328  	indexReaderPool *indexheader.ReaderPool
   329  	buffers         sync.Pool
   330  	chunkPool       pool.Bytes
   331  	seriesBatchSize int
   332  
   333  	// Sets of blocks that have the same labels. They are indexed by a hash over their label set.
   334  	mtx       sync.RWMutex
   335  	blocks    map[ulid.ULID]*bucketBlock
   336  	blockSets map[uint64]*bucketBlockSet
   337  
   338  	// debugLogging enables verbose additional logging.
   339  	debugLogging bool
   340  	// Number of goroutines to use when syncing blocks from object storage.
   341  	blockSyncConcurrency int
   342  
   343  	// Query gate which limits the maximum amount of concurrent queries.
   344  	queryGate gate.Gate
   345  
   346  	// chunksLimiterFactory creates a new limiter used to limit the number of chunks fetched by each Series() call.
   347  	chunksLimiterFactory ChunksLimiterFactory
   348  	// seriesLimiterFactory creates a new limiter used to limit the number of touched series by each Series() call,
   349  	// or LabelName and LabelValues calls when used with matchers.
   350  	seriesLimiterFactory SeriesLimiterFactory
   351  
   352  	// bytesLimiterFactory creates a new limiter used to limit the amount of bytes fetched/touched by each Series() call.
   353  	bytesLimiterFactory BytesLimiterFactory
   354  	partitioner         Partitioner
   355  
   356  	filterConfig             *FilterConfig
   357  	advLabelSets             []labelpb.ZLabelSet
   358  	enableCompatibilityLabel bool
   359  
   360  	// postingOffsetsInMemSampling controls how many posting offset entries are kept in heap memory: every n-th entry is sampled. Default in Prometheus is 32.
   361  	postingOffsetsInMemSampling int
   362  
   363  	// Enables hints in the Series() response.
   364  	enableSeriesResponseHints bool
   365  
   366  	enableChunkHashCalculation bool
   367  
   368  	blockEstimatedMaxSeriesFunc BlockEstimator
   369  	blockEstimatedMaxChunkFunc  BlockEstimator
   370  }
   371  
   372  func (s *BucketStore) validate() error {
   373  	if s.blockSyncConcurrency < minBlockSyncConcurrency {
   374  		return errBlockSyncConcurrencyNotValid
   375  	}
   376  	return nil
   377  }
   378  
   379  type noopCache struct{}
   380  
   381  func (noopCache) StorePostings(ulid.ULID, labels.Label, []byte) {}
   382  func (noopCache) FetchMultiPostings(_ context.Context, _ ulid.ULID, keys []labels.Label) (map[labels.Label][]byte, []labels.Label) {
   383  	return map[labels.Label][]byte{}, keys
   384  }
   385  
   386  func (noopCache) StoreExpandedPostings(_ ulid.ULID, _ []*labels.Matcher, _ []byte) {}
   387  func (noopCache) FetchExpandedPostings(_ context.Context, _ ulid.ULID, _ []*labels.Matcher) ([]byte, bool) {
   388  	return []byte{}, false
   389  }
   390  
   391  func (noopCache) StoreSeries(ulid.ULID, storage.SeriesRef, []byte) {}
   392  func (noopCache) FetchMultiSeries(_ context.Context, _ ulid.ULID, ids []storage.SeriesRef) (map[storage.SeriesRef][]byte, []storage.SeriesRef) {
   393  	return map[storage.SeriesRef][]byte{}, ids
   394  }
   395  
   396  // BucketStoreOption are functions that configure BucketStore.
   397  type BucketStoreOption func(s *BucketStore)
   398  
   399  // WithLogger sets the BucketStore logger to the one you pass.
   400  func WithLogger(logger log.Logger) BucketStoreOption {
   401  	return func(s *BucketStore) {
   402  		s.logger = logger
   403  	}
   404  }
   405  
   406  // WithRegistry sets a registry that BucketStore uses to register metrics with.
   407  func WithRegistry(reg prometheus.Registerer) BucketStoreOption {
   408  	return func(s *BucketStore) {
   409  		s.reg = reg
   410  	}
   411  }
   412  
   413  // WithIndexCache sets an indexCache to use instead of a noopCache.
   414  func WithIndexCache(cache storecache.IndexCache) BucketStoreOption {
   415  	return func(s *BucketStore) {
   416  		s.indexCache = cache
   417  	}
   418  }
   419  
   420  // WithQueryGate sets a queryGate to use instead of a noopGate.
   421  func WithQueryGate(queryGate gate.Gate) BucketStoreOption {
   422  	return func(s *BucketStore) {
   423  		s.queryGate = queryGate
   424  	}
   425  }
   426  
   427  // WithChunkPool sets a pool.Bytes to use for chunks.
   428  func WithChunkPool(chunkPool pool.Bytes) BucketStoreOption {
   429  	return func(s *BucketStore) {
   430  		s.chunkPool = chunkPool
   431  	}
   432  }
   433  
   434  // WithFilterConfig sets a filter which Store uses for filtering metrics based on time.
   435  func WithFilterConfig(filter *FilterConfig) BucketStoreOption {
   436  	return func(s *BucketStore) {
   437  		s.filterConfig = filter
   438  	}
   439  }
   440  
   441  // WithDebugLogging enables debug logging.
   442  func WithDebugLogging() BucketStoreOption {
   443  	return func(s *BucketStore) {
   444  		s.debugLogging = true
   445  	}
   446  }
   447  
   448  func WithChunkHashCalculation(enableChunkHashCalculation bool) BucketStoreOption {
   449  	return func(s *BucketStore) {
   450  		s.enableChunkHashCalculation = enableChunkHashCalculation
   451  	}
   452  }
   453  
   454  func WithSeriesBatchSize(seriesBatchSize int) BucketStoreOption {
   455  	return func(s *BucketStore) {
   456  		s.seriesBatchSize = seriesBatchSize
   457  	}
   458  }
   459  
   460  func WithBlockEstimatedMaxSeriesFunc(f BlockEstimator) BucketStoreOption {
   461  	return func(s *BucketStore) {
   462  		s.blockEstimatedMaxSeriesFunc = f
   463  	}
   464  }
   465  
   466  func WithBlockEstimatedMaxChunkFunc(f BlockEstimator) BucketStoreOption {
   467  	return func(s *BucketStore) {
   468  		s.blockEstimatedMaxChunkFunc = f
   469  	}
   470  }
   471  
   472  // NewBucketStore creates a new bucket backed store that implements the store API against
   473  // an object store bucket. It is optimized to work against high latency backends.
   474  func NewBucketStore(
   475  	bkt objstore.InstrumentedBucketReader,
   476  	fetcher block.MetadataFetcher,
   477  	dir string,
   478  	chunksLimiterFactory ChunksLimiterFactory,
   479  	seriesLimiterFactory SeriesLimiterFactory,
   480  	bytesLimiterFactory BytesLimiterFactory,
   481  	partitioner Partitioner,
   482  	blockSyncConcurrency int,
   483  	enableCompatibilityLabel bool,
   484  	postingOffsetsInMemSampling int,
   485  	enableSeriesResponseHints bool, // TODO(pracucci) Thanos 0.12 and below doesn't gracefully handle new fields in SeriesResponse. Drop this flag and always enable hints once we can drop backward compatibility.
   486  	lazyIndexReaderEnabled bool,
   487  	lazyIndexReaderIdleTimeout time.Duration,
   488  	options ...BucketStoreOption,
   489  ) (*BucketStore, error) {
   490  	s := &BucketStore{
   491  		logger:     log.NewNopLogger(),
   492  		bkt:        bkt,
   493  		fetcher:    fetcher,
   494  		dir:        dir,
   495  		indexCache: noopCache{},
   496  		buffers: sync.Pool{New: func() interface{} {
   497  			b := make([]byte, 0, initialBufSize)
   498  			return &b
   499  		}},
   500  		chunkPool:                   pool.NoopBytes{},
   501  		blocks:                      map[ulid.ULID]*bucketBlock{},
   502  		blockSets:                   map[uint64]*bucketBlockSet{},
   503  		blockSyncConcurrency:        blockSyncConcurrency,
   504  		queryGate:                   gate.NewNoop(),
   505  		chunksLimiterFactory:        chunksLimiterFactory,
   506  		seriesLimiterFactory:        seriesLimiterFactory,
   507  		bytesLimiterFactory:         bytesLimiterFactory,
   508  		partitioner:                 partitioner,
   509  		enableCompatibilityLabel:    enableCompatibilityLabel,
   510  		postingOffsetsInMemSampling: postingOffsetsInMemSampling,
   511  		enableSeriesResponseHints:   enableSeriesResponseHints,
   512  		enableChunkHashCalculation:  enableChunkHashCalculation,
   513  		seriesBatchSize:             SeriesBatchSize,
   514  	}
   515  
   516  	for _, option := range options {
   517  		option(s)
   518  	}
   519  
   520  	// The following depend on the options set above.
   521  	indexReaderPoolMetrics := indexheader.NewReaderPoolMetrics(extprom.WrapRegistererWithPrefix("thanos_bucket_store_", s.reg))
   522  	s.indexReaderPool = indexheader.NewReaderPool(s.logger, lazyIndexReaderEnabled, lazyIndexReaderIdleTimeout, indexReaderPoolMetrics)
   523  	s.metrics = newBucketStoreMetrics(s.reg) // TODO(metalmatze): Might be possible via Option too
   524  
   525  	if err := s.validate(); err != nil {
   526  		return nil, errors.Wrap(err, "validate config")
   527  	}
   528  
   529  	if dir == "" {
   530  		return s, nil
   531  	}
   532  
   533  	if err := os.MkdirAll(dir, 0750); err != nil {
   534  		return nil, errors.Wrap(err, "create dir")
   535  	}
   536  
   537  	return s, nil
   538  }
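
// A construction sketch (illustrative, not part of Thanos): it shows how the required dependencies
// and the functional options fit together. The limiter factories and partitioner are taken as
// parameters here; in a real caller they would come from this package's constructors. The function
// name, directory and literal values below are hypothetical.
func exampleNewBucketStore(
	bkt objstore.InstrumentedBucketReader,
	fetcher block.MetadataFetcher,
	chunksLimiterFactory ChunksLimiterFactory,
	seriesLimiterFactory SeriesLimiterFactory,
	bytesLimiterFactory BytesLimiterFactory,
	partitioner Partitioner,
) (*BucketStore, error) {
	return NewBucketStore(
		bkt,
		fetcher,
		"/var/thanos/store", // Local dir reused as a persistent cache for index headers and blocks.
		chunksLimiterFactory,
		seriesLimiterFactory,
		bytesLimiterFactory,
		partitioner,
		20,    // blockSyncConcurrency: goroutines used when syncing blocks.
		false, // enableCompatibilityLabel: only needed for pre v0.8.0 Queriers.
		DefaultPostingOffsetInMemorySampling,
		true,          // enableSeriesResponseHints.
		false,         // lazyIndexReaderEnabled.
		5*time.Minute, // lazyIndexReaderIdleTimeout.
		WithLogger(log.NewNopLogger()),
		WithDebugLogging(),
	)
}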
   539  
   540  // Close the store.
   541  func (s *BucketStore) Close() (err error) {
   542  	s.mtx.Lock()
   543  	defer s.mtx.Unlock()
   544  
   545  	for _, b := range s.blocks {
   546  		runutil.CloseWithErrCapture(&err, b, "closing Bucket Block")
   547  	}
   548  
   549  	s.indexReaderPool.Close()
   550  	return err
   551  }
   552  
   553  // SyncBlocks synchronizes the store's state with the blocks in the bucket.
   554  // It will reuse disk space as a persistent cache based on the s.dir param.
   555  func (s *BucketStore) SyncBlocks(ctx context.Context) error {
   556  	metas, _, metaFetchErr := s.fetcher.Fetch(ctx)
   557  	// For a partial view, at least allow adding new blocks.
   558  	if metaFetchErr != nil && metas == nil {
   559  		return metaFetchErr
   560  	}
   561  
   562  	var wg sync.WaitGroup
   563  	blockc := make(chan *metadata.Meta)
   564  
   565  	for i := 0; i < s.blockSyncConcurrency; i++ {
   566  		wg.Add(1)
   567  		go func() {
   568  			for meta := range blockc {
   569  				if err := s.addBlock(ctx, meta); err != nil {
   570  					continue
   571  				}
   572  			}
   573  			wg.Done()
   574  		}()
   575  	}
   576  
   577  	for id, meta := range metas {
   578  		if b := s.getBlock(id); b != nil {
   579  			continue
   580  		}
   581  		select {
   582  		case <-ctx.Done():
   583  		case blockc <- meta:
   584  		}
   585  	}
   586  
   587  	close(blockc)
   588  	wg.Wait()
   589  
   590  	if metaFetchErr != nil {
   591  		return metaFetchErr
   592  	}
   593  
   594  	// Drop all blocks that are no longer present in the bucket.
   595  	for id := range s.blocks {
   596  		if _, ok := metas[id]; ok {
   597  			continue
   598  		}
   599  		if err := s.removeBlock(id); err != nil {
   600  			level.Warn(s.logger).Log("msg", "drop of outdated block failed", "block", id, "err", err)
   601  			s.metrics.blockDropFailures.Inc()
   602  		}
   603  		level.Info(s.logger).Log("msg", "dropped outdated block", "block", id)
   604  		s.metrics.blockDrops.Inc()
   605  	}
   606  
   607  	// Sync advertise labels.
   608  	var storeLabels labels.Labels
   609  	s.mtx.Lock()
   610  	s.advLabelSets = make([]labelpb.ZLabelSet, 0, len(s.advLabelSets))
   611  	for _, bs := range s.blockSets {
   612  		storeLabels = storeLabels[:0]
   613  		s.advLabelSets = append(s.advLabelSets, labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(append(storeLabels, bs.labels...))})
   614  	}
   615  	sort.Slice(s.advLabelSets, func(i, j int) bool {
   616  		return strings.Compare(s.advLabelSets[i].String(), s.advLabelSets[j].String()) < 0
   617  	})
   618  	s.mtx.Unlock()
   619  	return nil
   620  }
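
// A usage sketch (illustrative): callers typically run SyncBlocks periodically. This assumes
// runutil.Repeat(interval, stopc, fn) from this repository's pkg/runutil, which re-runs fn until
// stopc is closed or fn returns an error; the interval below is an arbitrary example.
func exampleRunPeriodicSync(ctx context.Context, s *BucketStore, stopc <-chan struct{}) error {
	return runutil.Repeat(3*time.Minute, stopc, func() error {
		if err := s.SyncBlocks(ctx); err != nil {
			// Keep the loop alive on sync errors; missing blocks are retried on the next tick.
			level.Warn(s.logger).Log("msg", "syncing blocks failed", "err", err)
		}
		return nil
	})
}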
   621  
   622  // InitialSync performs a blocking sync with an extra step at the end to delete locally saved blocks that are no longer
   623  // present in the bucket. A mismatch of these can only happen between restarts, so we can do that only once per startup.
   624  func (s *BucketStore) InitialSync(ctx context.Context) error {
   625  	if err := s.SyncBlocks(ctx); err != nil {
   626  		return errors.Wrap(err, "sync block")
   627  	}
   628  
   629  	if s.dir == "" {
   630  		return nil
   631  	}
   632  
   633  	fis, err := os.ReadDir(s.dir)
   634  	if err != nil {
   635  		return errors.Wrap(err, "read dir")
   636  	}
   637  	names := make([]string, 0, len(fis))
   638  	for _, fi := range fis {
   639  		names = append(names, fi.Name())
   640  	}
   641  	for _, n := range names {
   642  		id, ok := block.IsBlockDir(n)
   643  		if !ok {
   644  			continue
   645  		}
   646  		if b := s.getBlock(id); b != nil {
   647  			continue
   648  		}
   649  
   650  		// No such block loaded, remove the local dir.
   651  		if err := os.RemoveAll(path.Join(s.dir, id.String())); err != nil {
   652  			level.Warn(s.logger).Log("msg", "failed to remove block which is not needed", "err", err)
   653  		}
   654  	}
   655  
   656  	return nil
   657  }
   658  
   659  func (s *BucketStore) getBlock(id ulid.ULID) *bucketBlock {
   660  	s.mtx.RLock()
   661  	defer s.mtx.RUnlock()
   662  	return s.blocks[id]
   663  }
   664  
   665  func (s *BucketStore) addBlock(ctx context.Context, meta *metadata.Meta) (err error) {
   666  	var dir string
   667  	if s.dir != "" {
   668  		dir = filepath.Join(s.dir, meta.ULID.String())
   669  	}
   670  	start := time.Now()
   671  
   672  	level.Debug(s.logger).Log("msg", "loading new block", "id", meta.ULID)
   673  	defer func() {
   674  		if err != nil {
   675  			s.metrics.blockLoadFailures.Inc()
   676  			if dir != "" {
   677  				if err2 := os.RemoveAll(dir); err2 != nil {
   678  					level.Warn(s.logger).Log("msg", "failed to remove block we cannot load", "err", err2)
   679  				}
   680  			}
   681  			level.Warn(s.logger).Log("msg", "loading block failed", "elapsed", time.Since(start), "id", meta.ULID, "err", err)
   682  		} else {
   683  			level.Info(s.logger).Log("msg", "loaded new block", "elapsed", time.Since(start), "id", meta.ULID)
   684  		}
   685  	}()
   686  	s.metrics.blockLoads.Inc()
   687  
   688  	lset := labels.FromMap(meta.Thanos.Labels)
   689  	h := lset.Hash()
   690  
   691  	indexHeaderReader, err := s.indexReaderPool.NewBinaryReader(
   692  		ctx,
   693  		s.logger,
   694  		s.bkt,
   695  		s.dir,
   696  		meta.ULID,
   697  		s.postingOffsetsInMemSampling,
   698  	)
   699  	if err != nil {
   700  		return errors.Wrap(err, "create index header reader")
   701  	}
   702  	defer func() {
   703  		if err != nil {
   704  			runutil.CloseWithErrCapture(&err, indexHeaderReader, "index-header")
   705  		}
   706  	}()
   707  
   708  	b, err := newBucketBlock(
   709  		ctx,
   710  		log.With(s.logger, "block", meta.ULID),
   711  		s.metrics,
   712  		meta,
   713  		s.bkt,
   714  		dir,
   715  		s.indexCache,
   716  		s.chunkPool,
   717  		indexHeaderReader,
   718  		s.partitioner,
   719  		s.blockEstimatedMaxSeriesFunc,
   720  		s.blockEstimatedMaxChunkFunc,
   721  	)
   722  	if err != nil {
   723  		return errors.Wrap(err, "new bucket block")
   724  	}
   725  	defer func() {
   726  		if err != nil {
   727  			runutil.CloseWithErrCapture(&err, b, "index-header")
   728  		}
   729  	}()
   730  
   731  	s.mtx.Lock()
   732  	defer s.mtx.Unlock()
   733  
   734  	sort.Sort(lset)
   735  
   736  	set, ok := s.blockSets[h]
   737  	if !ok {
   738  		set = newBucketBlockSet(lset)
   739  		s.blockSets[h] = set
   740  	}
   741  
   742  	if err = set.add(b); err != nil {
   743  		return errors.Wrap(err, "add block to set")
   744  	}
   745  	s.blocks[b.meta.ULID] = b
   746  
   747  	s.metrics.blocksLoaded.Inc()
   748  	s.metrics.lastLoadedBlock.SetToCurrentTime()
   749  	return nil
   750  }
   751  
   752  func (s *BucketStore) removeBlock(id ulid.ULID) error {
   753  	s.mtx.Lock()
   754  	b, ok := s.blocks[id]
   755  	if ok {
   756  		lset := labels.FromMap(b.meta.Thanos.Labels)
   757  		s.blockSets[lset.Hash()].remove(id)
   758  		delete(s.blocks, id)
   759  	}
   760  	s.mtx.Unlock()
   761  
   762  	if !ok {
   763  		return nil
   764  	}
   765  
   766  	s.metrics.blocksLoaded.Dec()
   767  	if err := b.Close(); err != nil {
   768  		return errors.Wrap(err, "close block")
   769  	}
   770  
   771  	if b.dir == "" {
   772  		return nil
   773  	}
   774  
   775  	return os.RemoveAll(b.dir)
   776  }
   777  
   778  // TimeRange returns the minimum and maximum timestamp of data available in the store.
   779  func (s *BucketStore) TimeRange() (mint, maxt int64) {
   780  	s.mtx.RLock()
   781  	defer s.mtx.RUnlock()
   782  
   783  	mint = math.MaxInt64
   784  	maxt = math.MinInt64
   785  
   786  	for _, b := range s.blocks {
   787  		if b.meta.MinTime < mint {
   788  			mint = b.meta.MinTime
   789  		}
   790  		if b.meta.MaxTime > maxt {
   791  			maxt = b.meta.MaxTime
   792  		}
   793  	}
   794  
   795  	mint = s.limitMinTime(mint)
   796  	maxt = s.limitMaxTime(maxt)
   797  
   798  	return mint, maxt
   799  }
   800  
   801  // TSDBInfos returns a list of infopb.TSDBInfos for blocks in the bucket store.
   802  func (s *BucketStore) TSDBInfos() []infopb.TSDBInfo {
   803  	s.mtx.RLock()
   804  	defer s.mtx.RUnlock()
   805  
   806  	infos := make([]infopb.TSDBInfo, 0, len(s.blocks))
   807  	for _, b := range s.blocks {
   808  		infos = append(infos, infopb.TSDBInfo{
   809  			Labels: labelpb.ZLabelSet{
   810  				Labels: labelpb.ZLabelsFromPromLabels(labels.FromMap(b.meta.Thanos.Labels)),
   811  			},
   812  			MinTime: b.meta.MinTime,
   813  			MaxTime: b.meta.MaxTime,
   814  		})
   815  	}
   816  
   817  	return infos
   818  }
   819  
   820  func (s *BucketStore) LabelSet() []labelpb.ZLabelSet {
   821  	s.mtx.RLock()
   822  	labelSets := s.advLabelSets
   823  	s.mtx.RUnlock()
   824  
   825  	if s.enableCompatibilityLabel && len(labelSets) > 0 {
   826  		labelSets = append(labelSets, labelpb.ZLabelSet{Labels: []labelpb.ZLabel{{Name: CompatibilityTypeLabelName, Value: "store"}}})
   827  	}
   828  
   829  	return labelSets
   830  }
   831  
   832  // Info implements the storepb.StoreServer interface.
   833  func (s *BucketStore) Info(context.Context, *storepb.InfoRequest) (*storepb.InfoResponse, error) {
   834  	mint, maxt := s.TimeRange()
   835  	res := &storepb.InfoResponse{
   836  		StoreType: component.Store.ToProto(),
   837  		MinTime:   mint,
   838  		MaxTime:   maxt,
   839  		LabelSets: s.LabelSet(),
   840  	}
   841  
   842  	return res, nil
   843  }
   844  
   845  func (s *BucketStore) limitMinTime(mint int64) int64 {
   846  	if s.filterConfig == nil {
   847  		return mint
   848  	}
   849  
   850  	filterMinTime := s.filterConfig.MinTime.PrometheusTimestamp()
   851  
   852  	if mint < filterMinTime {
   853  		return filterMinTime
   854  	}
   855  
   856  	return mint
   857  }
   858  
   859  func (s *BucketStore) limitMaxTime(maxt int64) int64 {
   860  	if s.filterConfig == nil {
   861  		return maxt
   862  	}
   863  
   864  	filterMaxTime := s.filterConfig.MaxTime.PrometheusTimestamp()
   865  
   866  	if maxt > filterMaxTime {
   867  		maxt = filterMaxTime
   868  	}
   869  
   870  	return maxt
   871  }
   872  
   873  type seriesEntry struct {
   874  	lset labels.Labels
   875  	refs []chunks.ChunkRef
   876  	chks []storepb.AggrChunk
   877  }
   878  
   879  // blockSeriesClient is a storepb.Store_SeriesClient for a
   880  // single TSDB block in object storage.
   881  type blockSeriesClient struct {
   882  	grpc.ClientStream
   883  	ctx             context.Context
   884  	logger          log.Logger
   885  	extLset         labels.Labels
   886  	extLsetToRemove map[string]struct{}
   887  
   888  	mint           int64
   889  	maxt           int64
   890  	indexr         *bucketIndexReader
   891  	chunkr         *bucketChunkReader
   892  	loadAggregates []storepb.Aggr
   893  	chunksLimiter  ChunksLimiter
   894  	bytesLimiter   BytesLimiter
   895  
   896  	skipChunks         bool
   897  	shardMatcher       *storepb.ShardMatcher
   898  	calculateChunkHash bool
   899  	chunkFetchDuration prometheus.Histogram
   900  
   901  	// Internal state.
   902  	i               uint64
   903  	postings        []storage.SeriesRef
   904  	chkMetas        []chunks.Meta
   905  	lset            labels.Labels
   906  	symbolizedLset  []symbolizedLabel
   907  	entries         []seriesEntry
   908  	hasMorePostings bool
   909  	batchSize       int
   910  }
   911  
   912  func newBlockSeriesClient(
   913  	ctx context.Context,
   914  	logger log.Logger,
   915  	b *bucketBlock,
   916  	req *storepb.SeriesRequest,
   917  	limiter ChunksLimiter,
   918  	bytesLimiter BytesLimiter,
   919  	shardMatcher *storepb.ShardMatcher,
   920  	calculateChunkHash bool,
   921  	batchSize int,
   922  	chunkFetchDuration prometheus.Histogram,
   923  	extLsetToRemove map[string]struct{},
   924  ) *blockSeriesClient {
   925  	var chunkr *bucketChunkReader
   926  	if !req.SkipChunks {
   927  		chunkr = b.chunkReader()
   928  	}
   929  
   930  	extLset := b.extLset
   931  	if extLsetToRemove != nil {
   932  		extLset = rmLabels(extLset.Copy(), extLsetToRemove)
   933  	}
   934  
   935  	return &blockSeriesClient{
   936  		ctx:             ctx,
   937  		logger:          logger,
   938  		extLset:         extLset,
   939  		extLsetToRemove: extLsetToRemove,
   940  
   941  		mint:               req.MinTime,
   942  		maxt:               req.MaxTime,
   943  		indexr:             b.indexReader(),
   944  		chunkr:             chunkr,
   945  		chunksLimiter:      limiter,
   946  		bytesLimiter:       bytesLimiter,
   947  		skipChunks:         req.SkipChunks,
   948  		chunkFetchDuration: chunkFetchDuration,
   949  
   950  		loadAggregates:     req.Aggregates,
   951  		shardMatcher:       shardMatcher,
   952  		calculateChunkHash: calculateChunkHash,
   953  		hasMorePostings:    true,
   954  		batchSize:          batchSize,
   955  	}
   956  }
   957  
   958  func (b *blockSeriesClient) Close() {
   959  	if !b.skipChunks {
   960  		runutil.CloseWithLogOnErr(b.logger, b.chunkr, "series block")
   961  	}
   962  
   963  	runutil.CloseWithLogOnErr(b.logger, b.indexr, "series block")
   964  }
   965  
   966  func (b *blockSeriesClient) MergeStats(stats *queryStats) *queryStats {
   967  	stats = stats.merge(b.indexr.stats)
   968  	if !b.skipChunks {
   969  		stats = stats.merge(b.chunkr.stats)
   970  	}
   971  	return stats
   972  }
   973  
   974  type sortedMatchers []*labels.Matcher
   975  
   976  func newSortedMatchers(matchers []*labels.Matcher) sortedMatchers {
   977  	sort.Slice(matchers, func(i, j int) bool {
   978  		if matchers[i].Type == matchers[j].Type {
   979  			if matchers[i].Name == matchers[j].Name {
   980  				return matchers[i].Value < matchers[j].Value
   981  			}
   982  			return matchers[i].Name < matchers[j].Name
   983  		}
   984  		return matchers[i].Type < matchers[j].Type
   985  	})
   986  
   987  	return matchers
   988  }
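
// A small illustrative sketch (not part of the original file): sorting matchers gives a
// deterministic order (by type, then name, then value), which makes matcher sets comparable and
// usable as stable cache keys. The matchers below are arbitrary examples.
func exampleSortedMatchers() sortedMatchers {
	m := newSortedMatchers([]*labels.Matcher{
		labels.MustNewMatcher(labels.MatchRegexp, "job", "thanos.*"),
		labels.MustNewMatcher(labels.MatchEqual, "__name__", "up"),
		labels.MustNewMatcher(labels.MatchEqual, "cluster", "eu1"),
	})
	// m now holds the two MatchEqual matchers first (sorted by name: __name__, then cluster),
	// followed by the MatchRegexp matcher, because MatchEqual sorts before MatchRegexp by type.
	return m
}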
   989  
   990  func (b *blockSeriesClient) ExpandPostings(
   991  	matchers sortedMatchers,
   992  	seriesLimiter SeriesLimiter,
   993  ) error {
   994  	ps, err := b.indexr.ExpandedPostings(b.ctx, matchers, b.bytesLimiter)
   995  	if err != nil {
   996  		return errors.Wrap(err, "expanded matching posting")
   997  	}
   998  
   999  	if len(ps) == 0 {
  1000  		return nil
  1001  	}
  1002  
  1003  	if err := seriesLimiter.Reserve(uint64(len(ps))); err != nil {
  1004  		return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded series limit: %s", err)
  1005  	}
  1006  
  1007  	b.postings = ps
  1008  	if b.batchSize > len(ps) {
  1009  		b.batchSize = len(ps)
  1010  	}
  1011  	b.entries = make([]seriesEntry, 0, b.batchSize)
  1012  	return nil
  1013  }
  1014  
  1015  func (b *blockSeriesClient) Recv() (*storepb.SeriesResponse, error) {
  1016  	for len(b.entries) == 0 && b.hasMorePostings {
  1017  		if err := b.nextBatch(); err != nil {
  1018  			return nil, err
  1019  		}
  1020  	}
  1021  
  1022  	if len(b.entries) == 0 {
  1023  		if b.chunkr != nil {
  1024  			b.chunkFetchDuration.Observe(b.chunkr.stats.ChunksFetchDurationSum.Seconds())
  1025  		}
  1026  		return nil, io.EOF
  1027  	}
  1028  
  1029  	next := b.entries[0]
  1030  	b.entries = b.entries[1:]
  1031  
  1032  	return storepb.NewSeriesResponse(&storepb.Series{
  1033  		Labels: labelpb.ZLabelsFromPromLabels(next.lset),
  1034  		Chunks: next.chks,
  1035  	}), nil
  1036  }
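
// A consumption sketch (illustrative): blockSeriesClient behaves like a storepb.Store_SeriesClient,
// so after ExpandPostings a caller drains it with Recv until io.EOF. The helper below is
// hypothetical and only shows the shape of that loop.
func drainBlockSeriesClient(b *blockSeriesClient) ([]*storepb.SeriesResponse, error) {
	var out []*storepb.SeriesResponse
	for {
		resp, err := b.Recv()
		if err == io.EOF {
			// io.EOF signals that all batches have been loaded and returned.
			return out, nil
		}
		if err != nil {
			return nil, errors.Wrap(err, "receive series from block")
		}
		out = append(out, resp)
	}
}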
  1037  
  1038  func (b *blockSeriesClient) nextBatch() error {
  1039  	start := b.i
  1040  	end := start + SeriesBatchSize
  1041  	if end > uint64(len(b.postings)) {
  1042  		end = uint64(len(b.postings))
  1043  	}
  1044  	b.i = end
  1045  
  1046  	postingsBatch := b.postings[start:end]
  1047  	if len(postingsBatch) == 0 {
  1048  		b.hasMorePostings = false
  1049  		return nil
  1050  	}
  1051  
  1052  	b.indexr.reset()
  1053  	if !b.skipChunks {
  1054  		b.chunkr.reset()
  1055  	}
  1056  
  1057  	if err := b.indexr.PreloadSeries(b.ctx, postingsBatch, b.bytesLimiter); err != nil {
  1058  		return errors.Wrap(err, "preload series")
  1059  	}
  1060  
  1061  	b.entries = b.entries[:0]
  1062  	for i := 0; i < len(postingsBatch); i++ {
  1063  		if err := b.ctx.Err(); err != nil {
  1064  			return err
  1065  		}
  1066  		ok, err := b.indexr.LoadSeriesForTime(postingsBatch[i], &b.symbolizedLset, &b.chkMetas, b.skipChunks, b.mint, b.maxt)
  1067  		if err != nil {
  1068  			return errors.Wrap(err, "read series")
  1069  		}
  1070  		if !ok {
  1071  			continue
  1072  		}
  1073  
  1074  		if err := b.indexr.LookupLabelsSymbols(b.symbolizedLset, &b.lset); err != nil {
  1075  			return errors.Wrap(err, "Lookup labels symbols")
  1076  		}
  1077  
  1078  		completeLabelset := labelpb.ExtendSortedLabels(b.lset, b.extLset)
  1079  		if b.extLsetToRemove != nil {
  1080  			completeLabelset = rmLabels(completeLabelset, b.extLsetToRemove)
  1081  		}
  1082  
  1083  		if !b.shardMatcher.MatchesLabels(completeLabelset) {
  1084  			continue
  1085  		}
  1086  
  1087  		s := seriesEntry{lset: completeLabelset}
  1088  		if b.skipChunks {
  1089  			b.entries = append(b.entries, s)
  1090  			continue
  1091  		}
  1092  
  1093  		// Schedule loading chunks.
  1094  		s.refs = make([]chunks.ChunkRef, 0, len(b.chkMetas))
  1095  		s.chks = make([]storepb.AggrChunk, 0, len(b.chkMetas))
  1096  
  1097  		for j, meta := range b.chkMetas {
  1098  			if err := b.chunkr.addLoad(meta.Ref, len(b.entries), j); err != nil {
  1099  				return errors.Wrap(err, "add chunk load")
  1100  			}
  1101  			s.chks = append(s.chks, storepb.AggrChunk{
  1102  				MinTime: meta.MinTime,
  1103  				MaxTime: meta.MaxTime,
  1104  			})
  1105  			s.refs = append(s.refs, meta.Ref)
  1106  		}
  1107  
  1108  		// Ensure sample limit through chunksLimiter if we return chunks.
  1109  		if err := b.chunksLimiter.Reserve(uint64(len(b.chkMetas))); err != nil {
  1110  			return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded chunks limit: %s", err)
  1111  		}
  1112  
  1113  		b.entries = append(b.entries, s)
  1114  	}
  1115  
  1116  	if !b.skipChunks {
  1117  		if err := b.chunkr.load(b.ctx, b.entries, b.loadAggregates, b.calculateChunkHash, b.bytesLimiter); err != nil {
  1118  			return errors.Wrap(err, "load chunks")
  1119  		}
  1120  	}
  1121  
  1122  	return nil
  1123  }
  1124  
  1125  func populateChunk(out *storepb.AggrChunk, in chunkenc.Chunk, aggrs []storepb.Aggr, save func([]byte) ([]byte, error), calculateChecksum bool) error {
  1126  	hasher := hashPool.Get().(hash.Hash64)
  1127  	defer hashPool.Put(hasher)
  1128  
  1129  	if in.Encoding() == chunkenc.EncXOR || in.Encoding() == chunkenc.EncHistogram {
  1130  		b, err := save(in.Bytes())
  1131  		if err != nil {
  1132  			return err
  1133  		}
  1134  		out.Raw = &storepb.Chunk{
  1135  			Data: b,
  1136  			Type: storepb.Chunk_Encoding(in.Encoding() - 1),
  1137  			Hash: hashChunk(hasher, b, calculateChecksum),
  1138  		}
  1139  		return nil
  1140  	}
  1141  
  1142  	if in.Encoding() != downsample.ChunkEncAggr {
  1143  		return errors.Errorf("unsupported chunk encoding %d", in.Encoding())
  1144  	}
  1145  
  1146  	ac := downsample.AggrChunk(in.Bytes())
  1147  
  1148  	for _, at := range aggrs {
  1149  		switch at {
  1150  		case storepb.Aggr_COUNT:
  1151  			x, err := ac.Get(downsample.AggrCount)
  1152  			if err != nil {
  1153  				return errors.Errorf("aggregate %s does not exist", downsample.AggrCount)
  1154  			}
  1155  			b, err := save(x.Bytes())
  1156  			if err != nil {
  1157  				return err
  1158  			}
  1159  			out.Count = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)}
  1160  		case storepb.Aggr_SUM:
  1161  			x, err := ac.Get(downsample.AggrSum)
  1162  			if err != nil {
  1163  				return errors.Errorf("aggregate %s does not exist", downsample.AggrSum)
  1164  			}
  1165  			b, err := save(x.Bytes())
  1166  			if err != nil {
  1167  				return err
  1168  			}
  1169  			out.Sum = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)}
  1170  		case storepb.Aggr_MIN:
  1171  			x, err := ac.Get(downsample.AggrMin)
  1172  			if err != nil {
  1173  				return errors.Errorf("aggregate %s does not exist", downsample.AggrMin)
  1174  			}
  1175  			b, err := save(x.Bytes())
  1176  			if err != nil {
  1177  				return err
  1178  			}
  1179  			out.Min = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)}
  1180  		case storepb.Aggr_MAX:
  1181  			x, err := ac.Get(downsample.AggrMax)
  1182  			if err != nil {
  1183  				return errors.Errorf("aggregate %s does not exist", downsample.AggrMax)
  1184  			}
  1185  			b, err := save(x.Bytes())
  1186  			if err != nil {
  1187  				return err
  1188  			}
  1189  			out.Max = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)}
  1190  		case storepb.Aggr_COUNTER:
  1191  			x, err := ac.Get(downsample.AggrCounter)
  1192  			if err != nil {
  1193  				return errors.Errorf("aggregate %s does not exist", downsample.AggrCounter)
  1194  			}
  1195  			b, err := save(x.Bytes())
  1196  			if err != nil {
  1197  				return err
  1198  			}
  1199  			out.Counter = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)}
  1200  		}
  1201  	}
  1202  	return nil
  1203  }
  1204  
  1205  func hashChunk(hasher hash.Hash64, b []byte, doHash bool) uint64 {
  1206  	if !doHash {
  1207  		return 0
  1208  	}
  1209  	hasher.Reset()
  1210  	// Write never returns an error on the hasher implementation
  1211  	_, _ = hasher.Write(b)
  1212  	return hasher.Sum64()
  1213  }
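
// A verification sketch (illustrative): because hashChunk uses xxhash over the raw chunk bytes,
// a receiver of a storepb.Chunk can recompute the hash and compare it with the transmitted one.
// This helper is hypothetical and mirrors the hashPool usage in populateChunk above.
func verifyChunkHash(c *storepb.Chunk) bool {
	if c == nil || c.Hash == 0 {
		// Hash calculation may be disabled, in which case the field is zero and cannot be checked.
		return true
	}
	hasher := hashPool.Get().(hash.Hash64)
	defer hashPool.Put(hasher)
	return hashChunk(hasher, c.Data, true) == c.Hash
}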
  1214  
  1215  // debugFoundBlockSetOverview logs at debug level exactly which blocks we used for the query in terms of
  1216  // labels and resolution. This is important because we allow mixed resolution results, so it is quite crucial
  1217  // to be aware of exactly which resolution we see for a query.
  1218  // TODO(bplotka): Consider adding resolution label to all results to propagate that info to UI and Query API.
  1219  func debugFoundBlockSetOverview(logger log.Logger, mint, maxt, maxResolutionMillis int64, lset labels.Labels, bs []*bucketBlock) {
  1220  	if len(bs) == 0 {
  1221  		level.Debug(logger).Log("msg", "No block found", "mint", mint, "maxt", maxt, "lset", lset.String())
  1222  		return
  1223  	}
  1224  
  1225  	var (
  1226  		parts            []string
  1227  		currRes          = int64(-1)
  1228  		currMin, currMax int64
  1229  	)
  1230  	for _, b := range bs {
  1231  		if currRes == b.meta.Thanos.Downsample.Resolution {
  1232  			currMax = b.meta.MaxTime
  1233  			continue
  1234  		}
  1235  
  1236  		if currRes != -1 {
  1237  			parts = append(parts, fmt.Sprintf("Range: %d-%d Resolution: %d", currMin, currMax, currRes))
  1238  		}
  1239  
  1240  		currRes = b.meta.Thanos.Downsample.Resolution
  1241  		currMin = b.meta.MinTime
  1242  		currMax = b.meta.MaxTime
  1243  	}
  1244  
  1245  	parts = append(parts, fmt.Sprintf("Range: %d-%d Resolution: %d", currMin, currMax, currRes))
  1246  
  1247  	level.Debug(logger).Log("msg", "Blocks source resolutions", "blocks", len(bs), "Maximum Resolution", maxResolutionMillis, "mint", mint, "maxt", maxt, "lset", lset.String(), "spans", strings.Join(parts, "\n"))
  1248  }
  1249  
  1250  // Series implements the storepb.StoreServer interface.
  1251  func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store_SeriesServer) (err error) {
  1252  	srv := newFlushableServer(seriesSrv, sortingStrategyNone)
  1253  
  1254  	if s.queryGate != nil {
  1255  		tracing.DoInSpan(srv.Context(), "store_query_gate_ismyturn", func(ctx context.Context) {
  1256  			err = s.queryGate.Start(srv.Context())
  1257  		})
  1258  		if err != nil {
  1259  			return errors.Wrapf(err, "failed to wait for turn")
  1260  		}
  1261  
  1262  		defer s.queryGate.Done()
  1263  	}
  1264  
  1265  	tenant, _ := tenancy.GetTenantFromGRPCMetadata(srv.Context())
  1266  	level.Debug(s.logger).Log("msg", "Tenant for Series request", "tenant", tenant)
  1267  
  1268  	matchers, err := storepb.MatchersToPromMatchers(req.Matchers...)
  1269  	if err != nil {
  1270  		return status.Error(codes.InvalidArgument, err.Error())
  1271  	}
  1272  	req.MinTime = s.limitMinTime(req.MinTime)
  1273  	req.MaxTime = s.limitMaxTime(req.MaxTime)
  1274  
  1275  	var (
  1276  		bytesLimiter     = s.bytesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("bytes"))
  1277  		ctx              = srv.Context()
  1278  		stats            = &queryStats{}
  1279  		respSets         []respSet
  1280  		mtx              sync.Mutex
  1281  		g, gctx          = errgroup.WithContext(ctx)
  1282  		resHints         = &hintspb.SeriesResponseHints{}
  1283  		reqBlockMatchers []*labels.Matcher
  1284  		chunksLimiter    = s.chunksLimiterFactory(s.metrics.queriesDropped.WithLabelValues("chunks"))
  1285  		seriesLimiter    = s.seriesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("series"))
  1286  
  1287  		queryStatsEnabled = false
  1288  	)
  1289  
  1290  	if req.Hints != nil {
  1291  		reqHints := &hintspb.SeriesRequestHints{}
  1292  		if err := types.UnmarshalAny(req.Hints, reqHints); err != nil {
  1293  			return status.Error(codes.InvalidArgument, errors.Wrap(err, "unmarshal series request hints").Error())
  1294  		}
  1295  		queryStatsEnabled = reqHints.EnableQueryStats
  1296  
  1297  		reqBlockMatchers, err = storepb.MatchersToPromMatchers(reqHints.BlockMatchers...)
  1298  		if err != nil {
  1299  			return status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request hints labels matchers").Error())
  1300  		}
  1301  	}
  1302  
  1303  	var extLsetToRemove map[string]struct{}
  1304  	if len(req.WithoutReplicaLabels) > 0 {
  1305  		extLsetToRemove = make(map[string]struct{})
  1306  		for _, l := range req.WithoutReplicaLabels {
  1307  			extLsetToRemove[l] = struct{}{}
  1308  		}
  1309  	}
  1310  
  1311  	s.mtx.RLock()
  1312  	for _, bs := range s.blockSets {
  1313  		blockMatchers, ok := bs.labelMatchers(matchers...)
  1314  		if !ok {
  1315  			continue
  1316  		}
  1317  
  1318  		sortedBlockMatchers := newSortedMatchers(blockMatchers)
  1319  
  1320  		blocks := bs.getFor(req.MinTime, req.MaxTime, req.MaxResolutionWindow, reqBlockMatchers)
  1321  
  1322  		if s.debugLogging {
  1323  			debugFoundBlockSetOverview(s.logger, req.MinTime, req.MaxTime, req.MaxResolutionWindow, bs.labels, blocks)
  1324  		}
  1325  
  1326  		for _, b := range blocks {
  1327  			blk := b
  1328  			gctx := gctx
  1329  
  1330  			if s.enableSeriesResponseHints {
  1331  				// Keep track of queried blocks.
  1332  				resHints.AddQueriedBlock(blk.meta.ULID)
  1333  			}
  1334  
  1335  			shardMatcher := req.ShardInfo.Matcher(&s.buffers)
  1336  
  1337  			blockClient := newBlockSeriesClient(
  1338  				srv.Context(),
  1339  				s.logger,
  1340  				blk,
  1341  				req,
  1342  				chunksLimiter,
  1343  				bytesLimiter,
  1344  				shardMatcher,
  1345  				s.enableChunkHashCalculation,
  1346  				s.seriesBatchSize,
  1347  				s.metrics.chunkFetchDuration,
  1348  				extLsetToRemove,
  1349  			)
  1350  
  1351  			defer blockClient.Close()
  1352  
  1353  			g.Go(func() error {
  1354  
  1355  				span, _ := tracing.StartSpan(gctx, "bucket_store_block_series", tracing.Tags{
  1356  					"block.id":         blk.meta.ULID,
  1357  					"block.mint":       blk.meta.MinTime,
  1358  					"block.maxt":       blk.meta.MaxTime,
  1359  					"block.resolution": blk.meta.Thanos.Downsample.Resolution,
  1360  				})
  1361  
  1362  				onClose := func() {
  1363  					mtx.Lock()
  1364  					stats = blockClient.MergeStats(stats)
  1365  					mtx.Unlock()
  1366  				}
  1367  
  1368  				if err := blockClient.ExpandPostings(sortedBlockMatchers, seriesLimiter); err != nil {
  1369  					onClose()
  1370  					span.Finish()
  1371  					return errors.Wrapf(err, "fetch postings for block %s", blk.meta.ULID)
  1372  				}
  1373  
  1374  				resp := newEagerRespSet(
  1375  					srv.Context(),
  1376  					span,
  1377  					10*time.Minute,
  1378  					blk.meta.ULID.String(),
  1379  					[]labels.Labels{blk.extLset},
  1380  					onClose,
  1381  					blockClient,
  1382  					shardMatcher,
  1383  					false,
  1384  					s.metrics.emptyPostingCount,
  1385  					nil,
  1386  				)
  1387  
  1388  				mtx.Lock()
  1389  				respSets = append(respSets, resp)
  1390  				mtx.Unlock()
  1391  
  1392  				return nil
  1393  			})
  1394  		}
  1395  	}
  1396  
  1397  	s.mtx.RUnlock()
  1398  
  1399  	defer func() {
  1400  		s.metrics.seriesDataTouched.WithLabelValues("postings").Observe(float64(stats.postingsTouched))
  1401  		s.metrics.seriesDataFetched.WithLabelValues("postings").Observe(float64(stats.postingsFetched))
  1402  		s.metrics.seriesDataSizeTouched.WithLabelValues("postings").Observe(float64(stats.PostingsTouchedSizeSum))
  1403  		s.metrics.seriesDataSizeFetched.WithLabelValues("postings").Observe(float64(stats.PostingsFetchedSizeSum))
  1404  		s.metrics.seriesDataTouched.WithLabelValues("series").Observe(float64(stats.seriesTouched))
  1405  		s.metrics.seriesDataFetched.WithLabelValues("series").Observe(float64(stats.seriesFetched))
  1406  		s.metrics.seriesDataSizeTouched.WithLabelValues("series").Observe(float64(stats.SeriesTouchedSizeSum))
  1407  		s.metrics.seriesDataSizeFetched.WithLabelValues("series").Observe(float64(stats.SeriesFetchedSizeSum))
  1408  		s.metrics.seriesDataTouched.WithLabelValues("chunks").Observe(float64(stats.chunksTouched))
  1409  		s.metrics.seriesDataFetched.WithLabelValues("chunks").Observe(float64(stats.chunksFetched))
  1410  		s.metrics.seriesDataSizeTouched.WithLabelValues("chunks").Observe(float64(stats.ChunksTouchedSizeSum))
  1411  		s.metrics.seriesDataSizeFetched.WithLabelValues("chunks").Observe(float64(stats.ChunksFetchedSizeSum))
  1412  		s.metrics.resultSeriesCount.Observe(float64(stats.mergedSeriesCount))
  1413  		s.metrics.cachedPostingsCompressions.WithLabelValues(labelEncode).Add(float64(stats.cachedPostingsCompressions))
  1414  		s.metrics.cachedPostingsCompressions.WithLabelValues(labelDecode).Add(float64(stats.cachedPostingsDecompressions))
  1415  		s.metrics.cachedPostingsCompressionErrors.WithLabelValues(labelEncode).Add(float64(stats.cachedPostingsCompressionErrors))
  1416  		s.metrics.cachedPostingsCompressionErrors.WithLabelValues(labelDecode).Add(float64(stats.cachedPostingsDecompressionErrors))
  1417  		s.metrics.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelEncode).Add(stats.CachedPostingsCompressionTimeSum.Seconds())
  1418  		s.metrics.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelDecode).Add(stats.CachedPostingsDecompressionTimeSum.Seconds())
  1419  		s.metrics.cachedPostingsOriginalSizeBytes.Add(float64(stats.CachedPostingsOriginalSizeSum))
  1420  		s.metrics.cachedPostingsCompressedSizeBytes.Add(float64(stats.CachedPostingsCompressedSizeSum))
  1421  		s.metrics.postingsSizeBytes.Observe(float64(int(stats.PostingsFetchedSizeSum) + int(stats.PostingsTouchedSizeSum)))
  1422  
  1423  		level.Debug(s.logger).Log("msg", "stats query processed",
  1424  			"request", req,
  1425  			"stats", fmt.Sprintf("%+v", stats), "err", err)
  1426  	}()
  1427  
  1428  	// Concurrently get data from all blocks.
  1429  	{
  1430  		begin := time.Now()
  1431  		tracing.DoInSpan(ctx, "bucket_store_preload_all", func(_ context.Context) {
  1432  			err = g.Wait()
  1433  		})
  1434  		if err != nil {
  1435  			code := codes.Aborted
  1436  			if s, ok := status.FromError(errors.Cause(err)); ok {
  1437  				code = s.Code()
  1438  			}
  1439  			return status.Error(code, err.Error())
  1440  		}
  1441  		stats.blocksQueried = len(respSets)
  1442  		stats.GetAllDuration = time.Since(begin)
  1443  		s.metrics.seriesGetAllDuration.Observe(stats.GetAllDuration.Seconds())
  1444  		s.metrics.seriesBlocksQueried.Observe(float64(stats.blocksQueried))
  1445  	}
  1446  
  1447  	// Merge the sub-results from each selected block.
  1448  	tracing.DoInSpan(ctx, "bucket_store_merge_all", func(ctx context.Context) {
  1449  		defer func() {
  1450  			for _, resp := range respSets {
  1451  				resp.Close()
  1452  			}
  1453  		}()
  1454  		begin := time.Now()
  1455  		set := NewDedupResponseHeap(NewProxyResponseHeap(respSets...))
  1456  		for set.Next() {
  1457  			at := set.At()
  1458  			warn := at.GetWarning()
  1459  			if warn != "" {
  1460  				// TODO(fpetkovski): Consider deprecating string based warnings in favor of a
  1461  				// separate protobuf message containing the grpc code and
  1462  				// a human readable error message.
  1463  				err = status.Error(storepb.GRPCCodeFromWarn(warn), at.GetWarning())
  1464  				return
  1465  			}
  1466  
  1467  			series := at.GetSeries()
  1468  			if series != nil {
  1469  				stats.mergedSeriesCount++
  1470  				if !req.SkipChunks {
  1471  					stats.mergedChunksCount += len(series.Chunks)
  1472  					s.metrics.chunkSizeBytes.Observe(float64(chunksSize(series.Chunks)))
  1473  				}
  1474  			}
  1475  			if err = srv.Send(at); err != nil {
  1476  				err = status.Error(codes.Unknown, errors.Wrap(err, "send series response").Error())
  1477  				return
  1478  			}
  1479  		}
  1480  		stats.MergeDuration = time.Since(begin)
  1481  		s.metrics.seriesMergeDuration.Observe(stats.MergeDuration.Seconds())
  1482  
  1483  		err = nil
  1484  	})
  1485  	if err != nil {
  1486  		return err
  1487  	}
  1488  
  1489  	if s.enableSeriesResponseHints {
  1490  		var anyHints *types.Any
  1491  
  1492  		if queryStatsEnabled {
  1493  			resHints.QueryStats = stats.toHints()
  1494  		}
  1495  		if anyHints, err = types.MarshalAny(resHints); err != nil {
  1496  			err = status.Error(codes.Unknown, errors.Wrap(err, "marshal series response hints").Error())
  1497  			return
  1498  		}
  1499  
  1500  		if err = srv.Send(storepb.NewHintsSeriesResponse(anyHints)); err != nil {
  1501  			err = status.Error(codes.Unknown, errors.Wrap(err, "send series response hints").Error())
  1502  			return
  1503  		}
  1504  	}
  1505  
  1506  	if err != nil {
  1507  		return err
  1508  	}
  1509  	return srv.Flush()
  1510  }
  1511  
  1512  func chunksSize(chks []storepb.AggrChunk) (size int) {
  1513  	for _, chk := range chks {
  1514  		size += chk.Size() // This gets the encoded proto size.
  1515  	}
  1516  	return size
  1517  }
  1518  
  1519  // LabelNames implements the storepb.StoreServer interface.
  1520  func (s *BucketStore) LabelNames(ctx context.Context, req *storepb.LabelNamesRequest) (*storepb.LabelNamesResponse, error) {
  1521  	reqSeriesMatchers, err := storepb.MatchersToPromMatchers(req.Matchers...)
  1522  	if err != nil {
  1523  		return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request labels matchers").Error())
  1524  	}
  1525  
  1526  	tenant, _ := tenancy.GetTenantFromGRPCMetadata(ctx)
  1527  	level.Debug(s.logger).Log("msg", "Tenant for LabelNames request", "tenant", tenant)
  1528  
  1529  	resHints := &hintspb.LabelNamesResponseHints{}
  1530  
  1531  	var reqBlockMatchers []*labels.Matcher
  1532  	if req.Hints != nil {
  1533  		reqHints := &hintspb.LabelNamesRequestHints{}
  1534  		err := types.UnmarshalAny(req.Hints, reqHints)
  1535  		if err != nil {
  1536  			return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "unmarshal label names request hints").Error())
  1537  		}
  1538  
  1539  		reqBlockMatchers, err = storepb.MatchersToPromMatchers(reqHints.BlockMatchers...)
  1540  		if err != nil {
  1541  			return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request hints labels matchers").Error())
  1542  		}
  1543  	}
  1544  
  1545  	g, gctx := errgroup.WithContext(ctx)
  1546  
  1547  	s.mtx.RLock()
  1548  
  1549  	var mtx sync.Mutex
  1550  	var sets [][]string
  1551  	var seriesLimiter = s.seriesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("series"))
  1552  	var bytesLimiter = s.bytesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("bytes"))
  1553  
  1554  	for _, b := range s.blocks {
  1555  		b := b
  1556  		gctx := gctx
  1557  
  1558  		if !b.overlapsClosedInterval(req.Start, req.End) {
  1559  			continue
  1560  		}
  1561  		if len(reqBlockMatchers) > 0 && !b.matchRelabelLabels(reqBlockMatchers) {
  1562  			continue
  1563  		}
  1564  		// Filter external labels from matchers.
  1565  		reqSeriesMatchersNoExtLabels, ok := b.FilterExtLabelsMatchers(reqSeriesMatchers)
  1566  		if !ok {
  1567  			continue
  1568  		}
  1569  
  1570  		sortedReqSeriesMatchersNoExtLabels := newSortedMatchers(reqSeriesMatchersNoExtLabels)
  1571  
  1572  		resHints.AddQueriedBlock(b.meta.ULID)
  1573  
  1574  		indexr := b.indexReader()
  1575  
  1576  		g.Go(func() error {
  1577  			span, newCtx := tracing.StartSpan(gctx, "bucket_store_block_series", tracing.Tags{
  1578  				"block.id":         b.meta.ULID,
  1579  				"block.mint":       b.meta.MinTime,
  1580  				"block.maxt":       b.meta.MaxTime,
  1581  				"block.resolution": b.meta.Thanos.Downsample.Resolution,
  1582  			})
  1583  			defer span.Finish()
  1584  			defer runutil.CloseWithLogOnErr(s.logger, indexr, "label names")
  1585  
  1586  			var result []string
  1587  			if len(reqSeriesMatchersNoExtLabels) == 0 {
  1588  				// Do it via index reader to have pending reader registered correctly.
  1589  				// LabelNames are already sorted.
  1590  				res, err := indexr.block.indexHeaderReader.LabelNames()
  1591  				if err != nil {
  1592  					return errors.Wrapf(err, "label names for block %s", b.meta.ULID)
  1593  				}
  1594  
  1595  				// Add a set for the external labels as well.
  1596  				// We're not adding them directly to refs because there could be duplicates.
  1597  				// b.extLset is already sorted by label name, no need to sort it again.
  1598  				extRes := make([]string, 0, len(b.extLset))
  1599  				for _, l := range b.extLset {
  1600  					extRes = append(extRes, l.Name)
  1601  				}
  1602  
  1603  				result = strutil.MergeSlices(res, extRes)
  1604  			} else {
  1605  				seriesReq := &storepb.SeriesRequest{
  1606  					MinTime:    req.Start,
  1607  					MaxTime:    req.End,
  1608  					SkipChunks: true,
  1609  				}
  1610  				blockClient := newBlockSeriesClient(
  1611  					newCtx,
  1612  					s.logger,
  1613  					b,
  1614  					seriesReq,
  1615  					nil,
  1616  					bytesLimiter,
  1617  					nil,
  1618  					true,
  1619  					SeriesBatchSize,
  1620  					s.metrics.chunkFetchDuration,
  1621  					nil,
  1622  				)
  1623  				defer blockClient.Close()
  1624  
  1625  				if err := blockClient.ExpandPostings(
  1626  					sortedReqSeriesMatchersNoExtLabels,
  1627  					seriesLimiter,
  1628  				); err != nil {
  1629  					return err
  1630  				}
  1631  
  1632  				// Extract label names from all series. Many label names will be the same, so we need to deduplicate them.
  1633  				// Note that label names will already include external labels (passed to blockSeries), so we don't need
  1634  				// to add them again.
  1635  				labelNames := map[string]struct{}{}
  1636  				for {
  1637  					ls, err := blockClient.Recv()
  1638  					if err == io.EOF {
  1639  						break
  1640  					}
  1641  					if err != nil {
  1642  						return errors.Wrapf(err, "iterate series for block %s", b.meta.ULID)
  1643  					}
  1644  
  1645  					if ls.GetWarning() != "" {
  1646  						return errors.Wrapf(errors.New(ls.GetWarning()), "iterate series for block %s", b.meta.ULID)
  1647  					}
  1648  					if ls.GetSeries() == nil {
  1649  						continue
  1650  					}
  1651  					for _, l := range ls.GetSeries().Labels {
  1652  						labelNames[l.Name] = struct{}{}
  1653  					}
  1654  				}
  1655  
  1656  				result = make([]string, 0, len(labelNames))
  1657  				for n := range labelNames {
  1658  					result = append(result, n)
  1659  				}
  1660  				sort.Strings(result)
  1661  			}
  1662  
  1663  			if len(result) > 0 {
  1664  				mtx.Lock()
  1665  				sets = append(sets, result)
  1666  				mtx.Unlock()
  1667  			}
  1668  
  1669  			return nil
  1670  		})
  1671  	}
  1672  
  1673  	s.mtx.RUnlock()
  1674  
  1675  	if err := g.Wait(); err != nil {
  1676  		code := codes.Internal
  1677  		if s, ok := status.FromError(errors.Cause(err)); ok {
  1678  			code = s.Code()
  1679  		}
  1680  		return nil, status.Error(code, err.Error())
  1681  	}
  1682  
  1683  	anyHints, err := types.MarshalAny(resHints)
  1684  	if err != nil {
  1685  		return nil, status.Error(codes.Unknown, errors.Wrap(err, "marshal label names response hints").Error())
  1686  	}
  1687  
  1688  	return &storepb.LabelNamesResponse{
  1689  		Names: strutil.MergeSlices(sets...),
  1690  		Hints: anyHints,
  1691  	}, nil
  1692  }
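
// labelNamesRequestSketch is a minimal illustrative sketch of calling
// LabelNames on this store (the helper name, time range and matcher are
// assumptions, and nothing else uses it): the matcher narrows the result to
// label names of series matching job="api" within [mint, maxt], while the
// returned hints list the blocks that were queried.
func labelNamesRequestSketch(ctx context.Context, s *BucketStore, mint, maxt int64) ([]string, error) {
	resp, err := s.LabelNames(ctx, &storepb.LabelNamesRequest{
		Start:    mint,
		End:      maxt,
		Matchers: []storepb.LabelMatcher{{Type: storepb.LabelMatcher_EQ, Name: "job", Value: "api"}},
	})
	if err != nil {
		return nil, err
	}
	return resp.Names, nil
}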
  1693  
  1694  func (b *bucketBlock) FilterExtLabelsMatchers(matchers []*labels.Matcher) ([]*labels.Matcher, bool) {
  1695  	// We filter external labels from matchers so we won't try to match series on them.
  1696  	var result []*labels.Matcher
  1697  	for _, m := range matchers {
  1698  		// Get value of external label from block.
  1699  		v := b.extLset.Get(m.Name)
  1700  		// If value is empty string the matcher is a valid one since it's not part of external labels.
  1701  		if v == "" {
  1702  			result = append(result, m)
  1703  		} else if v != "" && v != m.Value {
  1704  			// If matcher is external label but value is different we don't want to look in block anyway.
  1705  			return []*labels.Matcher{}, false
  1706  		}
  1707  	}
  1708  
  1709  	return result, true
  1710  }
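
// filterExtLabelsMatchersSketch is a minimal illustrative sketch (the helper
// name and label values are assumptions, nothing calls it) of the behaviour
// above for a block whose external labels include cluster="eu1": the matching
// cluster matcher is dropped because the block already implies it, unrelated
// matchers are kept, and a conflicting cluster value would instead return an
// empty matcher set and false, causing the block to be skipped.
func filterExtLabelsMatchersSketch(b *bucketBlock) ([]*labels.Matcher, bool) {
	ms := []*labels.Matcher{
		labels.MustNewMatcher(labels.MatchEqual, "cluster", "eu1"), // Dropped when b.extLset has cluster="eu1".
		labels.MustNewMatcher(labels.MatchEqual, "job", "api"),     // Kept: "job" is not an external label here.
	}
	return b.FilterExtLabelsMatchers(ms)
}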
  1711  
  1712  // LabelValues implements the storepb.StoreServer interface.
  1713  func (s *BucketStore) LabelValues(ctx context.Context, req *storepb.LabelValuesRequest) (*storepb.LabelValuesResponse, error) {
  1714  	reqSeriesMatchers, err := storepb.MatchersToPromMatchers(req.Matchers...)
  1715  	if err != nil {
  1716  		return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request labels matchers").Error())
  1717  	}
  1718  
  1719  	tenant, _ := tenancy.GetTenantFromGRPCMetadata(ctx)
  1720  	level.Debug(s.logger).Log("msg", "Tenant for LabelValues request", "tenant", tenant)
  1721  
  1722  	resHints := &hintspb.LabelValuesResponseHints{}
  1723  
  1724  	g, gctx := errgroup.WithContext(ctx)
  1725  
  1726  	var reqBlockMatchers []*labels.Matcher
  1727  	if req.Hints != nil {
  1728  		reqHints := &hintspb.LabelValuesRequestHints{}
  1729  		err := types.UnmarshalAny(req.Hints, reqHints)
  1730  		if err != nil {
  1731  			return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "unmarshal label values request hints").Error())
  1732  		}
  1733  
  1734  		reqBlockMatchers, err = storepb.MatchersToPromMatchers(reqHints.BlockMatchers...)
  1735  		if err != nil {
  1736  			return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request hints labels matchers").Error())
  1737  		}
  1738  	}
  1739  
  1740  	s.mtx.RLock()
  1741  
  1742  	var mtx sync.Mutex
  1743  	var sets [][]string
  1744  	var seriesLimiter = s.seriesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("series"))
  1745  	var bytesLimiter = s.bytesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("bytes"))
  1746  
  1747  	for _, b := range s.blocks {
  1748  		b := b
  1749  
  1750  		if !b.overlapsClosedInterval(req.Start, req.End) {
  1751  			continue
  1752  		}
  1753  		if len(reqBlockMatchers) > 0 && !b.matchRelabelLabels(reqBlockMatchers) {
  1754  			continue
  1755  		}
  1756  		// Filter external labels from matchers.
  1757  		reqSeriesMatchersNoExtLabels, ok := b.FilterExtLabelsMatchers(reqSeriesMatchers)
  1758  		if !ok {
  1759  			continue
  1760  		}
  1761  
  1762  		// If we have series matchers, add a <labelName> != "" matcher to select only series that have the given label name.
  1763  		if len(reqSeriesMatchersNoExtLabels) > 0 {
  1764  			m, err := labels.NewMatcher(labels.MatchNotEqual, req.Label, "")
  1765  			if err != nil {
  1766  				return nil, status.Error(codes.InvalidArgument, err.Error())
  1767  			}
  1768  
  1769  			reqSeriesMatchersNoExtLabels = append(reqSeriesMatchersNoExtLabels, m)
  1770  		}
  1771  
  1772  		sortedReqSeriesMatchersNoExtLabels := newSortedMatchers(reqSeriesMatchersNoExtLabels)
  1773  
  1774  		resHints.AddQueriedBlock(b.meta.ULID)
  1775  
  1776  		indexr := b.indexReader()
  1777  		g.Go(func() error {
  1778  			span, newCtx := tracing.StartSpan(gctx, "bucket_store_block_series", tracing.Tags{
  1779  				"block.id":         b.meta.ULID,
  1780  				"block.mint":       b.meta.MinTime,
  1781  				"block.maxt":       b.meta.MaxTime,
  1782  				"block.resolution": b.meta.Thanos.Downsample.Resolution,
  1783  			})
  1784  			defer span.Finish()
  1785  			defer runutil.CloseWithLogOnErr(s.logger, indexr, "label values")
  1786  
  1787  			var result []string
  1788  			if len(reqSeriesMatchersNoExtLabels) == 0 {
  1789  				// Do it via index reader to have pending reader registered correctly.
  1790  				res, err := indexr.block.indexHeaderReader.LabelValues(req.Label)
  1791  				if err != nil {
  1792  					return errors.Wrapf(err, "index header label values for block %s", b.meta.ULID)
  1793  				}
  1794  
  1795  				// Add the external label value as well.
  1796  				if extLabelValue := b.extLset.Get(req.Label); extLabelValue != "" {
  1797  					res = strutil.MergeSlices(res, []string{extLabelValue})
  1798  				}
  1799  				result = res
  1800  			} else {
  1801  				seriesReq := &storepb.SeriesRequest{
  1802  					MinTime:    req.Start,
  1803  					MaxTime:    req.End,
  1804  					SkipChunks: true,
  1805  				}
  1806  				blockClient := newBlockSeriesClient(
  1807  					newCtx,
  1808  					s.logger,
  1809  					b,
  1810  					seriesReq,
  1811  					nil,
  1812  					bytesLimiter,
  1813  					nil,
  1814  					true,
  1815  					SeriesBatchSize,
  1816  					s.metrics.chunkFetchDuration,
  1817  					nil,
  1818  				)
  1819  				defer blockClient.Close()
  1820  
  1821  				if err := blockClient.ExpandPostings(
  1822  					sortedReqSeriesMatchersNoExtLabels,
  1823  					seriesLimiter,
  1824  				); err != nil {
  1825  					return err
  1826  				}
  1827  
  1828  				// Extract given label's value from all series and deduplicate them.
  1829  				// We don't need to deal with external labels, since they are already added by blockSeries.
  1830  				values := map[string]struct{}{}
  1831  				for {
  1832  					ls, err := blockClient.Recv()
  1833  					if err == io.EOF {
  1834  						break
  1835  					}
  1836  					if err != nil {
  1837  						return errors.Wrapf(err, "iterate series for block %s", b.meta.ULID)
  1838  					}
  1839  
  1840  					if ls.GetWarning() != "" {
  1841  						return errors.Wrapf(errors.New(ls.GetWarning()), "iterate series for block %s", b.meta.ULID)
  1842  					}
  1843  					if ls.GetSeries() == nil {
  1844  						continue
  1845  					}
  1846  
  1847  					val := labelpb.ZLabelsToPromLabels(ls.GetSeries().Labels).Get(req.Label)
  1848  					if val != "" { // Should never be empty since we added labelName!="" matcher to the list of matchers.
  1849  						values[val] = struct{}{}
  1850  					}
  1851  				}
  1852  
  1853  				result = make([]string, 0, len(values))
  1854  				for n := range values {
  1855  					result = append(result, n)
  1856  				}
  1857  				sort.Strings(result)
  1858  			}
  1859  
  1860  			if len(result) > 0 {
  1861  				mtx.Lock()
  1862  				sets = append(sets, result)
  1863  				mtx.Unlock()
  1864  			}
  1865  
  1866  			return nil
  1867  		})
  1868  	}
  1869  
  1870  	s.mtx.RUnlock()
  1871  
  1872  	if err := g.Wait(); err != nil {
  1873  		code := codes.Internal
  1874  		if s, ok := status.FromError(errors.Cause(err)); ok {
  1875  			code = s.Code()
  1876  		}
  1877  		return nil, status.Error(code, err.Error())
  1878  	}
  1879  
  1880  	anyHints, err := types.MarshalAny(resHints)
  1881  	if err != nil {
  1882  		return nil, status.Error(codes.Unknown, errors.Wrap(err, "marshal label values response hints").Error())
  1883  	}
  1884  
  1885  	return &storepb.LabelValuesResponse{
  1886  		Values: strutil.MergeSlices(sets...),
  1887  		Hints:  anyHints,
  1888  	}, nil
  1889  }
  1890  
  1891  // bucketBlockSet holds all blocks of an equal label set. It internally splits
  1892  // them up by downsampling resolution and allows querying.
  1893  type bucketBlockSet struct {
  1894  	labels      labels.Labels
  1895  	mtx         sync.RWMutex
  1896  	resolutions []int64          // Available resolutions, high to low (in milliseconds).
  1897  	blocks      [][]*bucketBlock // Ordered buckets for the existing resolutions.
  1898  }
  1899  
  1900  // newBucketBlockSet initializes a new set with the known downsampling windows hard-configured.
  1901  // The set currently does not support arbitrary ranges.
  1902  func newBucketBlockSet(lset labels.Labels) *bucketBlockSet {
  1903  	return &bucketBlockSet{
  1904  		labels:      lset,
  1905  		resolutions: []int64{downsample.ResLevel2, downsample.ResLevel1, downsample.ResLevel0},
  1906  		blocks:      make([][]*bucketBlock, 3),
  1907  	}
  1908  }
  1909  
  1910  func (s *bucketBlockSet) add(b *bucketBlock) error {
  1911  	if !labels.Equal(s.labels, labels.FromMap(b.meta.Thanos.Labels)) {
  1912  		return errors.New("block's label set does not match set")
  1913  	}
  1914  	s.mtx.Lock()
  1915  	defer s.mtx.Unlock()
  1916  
  1917  	i := int64index(s.resolutions, b.meta.Thanos.Downsample.Resolution)
  1918  	if i < 0 {
  1919  		return errors.Errorf("unsupported downsampling resolution %d", b.meta.Thanos.Downsample.Resolution)
  1920  	}
  1921  	bs := append(s.blocks[i], b)
  1922  	s.blocks[i] = bs
  1923  
  1924  	// Always sort blocks by min time, then max time.
  1925  	sort.Slice(bs, func(j, k int) bool {
  1926  		if bs[j].meta.MinTime == bs[k].meta.MinTime {
  1927  			return bs[j].meta.MaxTime < bs[k].meta.MaxTime
  1928  		}
  1929  		return bs[j].meta.MinTime < bs[k].meta.MinTime
  1930  	})
  1931  	return nil
  1932  }
  1933  
  1934  func (s *bucketBlockSet) remove(id ulid.ULID) {
  1935  	s.mtx.Lock()
  1936  	defer s.mtx.Unlock()
  1937  
  1938  	for i, bs := range s.blocks {
  1939  		for j, b := range bs {
  1940  			if b.meta.ULID != id {
  1941  				continue
  1942  			}
  1943  			s.blocks[i] = append(bs[:j], bs[j+1:]...)
  1944  			return
  1945  		}
  1946  	}
  1947  }
  1948  
  1949  func int64index(s []int64, x int64) int {
  1950  	for i, v := range s {
  1951  		if v == x {
  1952  			return i
  1953  		}
  1954  	}
  1955  	return -1
  1956  }
  1957  
  1958  // getFor returns a time-ordered list of blocks that cover data between mint and maxt.
  1959  // Blocks with the coarsest resolution possible, but no coarser than the given max resolution, are returned.
  1960  // It supports overlapping blocks.
  1961  //
  1962  // NOTE: s.blocks are expected to be sorted in minTime order.
  1963  func (s *bucketBlockSet) getFor(mint, maxt, maxResolutionMillis int64, blockMatchers []*labels.Matcher) (bs []*bucketBlock) {
  1964  	if mint > maxt {
  1965  		return nil
  1966  	}
  1967  
  1968  	s.mtx.RLock()
  1969  	defer s.mtx.RUnlock()
  1970  
  1971  	// Find first matching resolution.
  1972  	i := 0
  1973  	for ; i < len(s.resolutions) && s.resolutions[i] > maxResolutionMillis; i++ {
  1974  	}
  1975  
  1976  	// Fill the given interval with the blocks for the current resolution.
  1977  	// Our current resolution might not cover all data, so recursively fill the gaps with higher resolution blocks
  1978  	// if there is any.
  1979  	start := mint
  1980  	for _, b := range s.blocks[i] {
  1981  		if b.meta.MaxTime <= mint {
  1982  			continue
  1983  		}
  1984  		// NOTE: Block intervals are half-open: [b.MinTime, b.MaxTime).
  1985  		if b.meta.MinTime > maxt {
  1986  			break
  1987  		}
  1988  
  1989  		if i+1 < len(s.resolutions) {
  1990  			bs = append(bs, s.getFor(start, b.meta.MinTime-1, s.resolutions[i+1], blockMatchers)...)
  1991  		}
  1992  
  1993  		// Include the block in the list of matching ones only if there are no block-level matchers
  1994  		// or they actually match.
  1995  		if len(blockMatchers) == 0 || b.matchRelabelLabels(blockMatchers) {
  1996  			bs = append(bs, b)
  1997  		}
  1998  
  1999  		start = b.meta.MaxTime
  2000  	}
  2001  
  2002  	if i+1 < len(s.resolutions) {
  2003  		bs = append(bs, s.getFor(start, maxt, s.resolutions[i+1], blockMatchers)...)
  2004  	}
  2005  	return bs
  2006  }
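
// getForSketch is a minimal illustrative sketch (the helper is an assumption,
// not used by the store): with a 5m max resolution window, getFor prefers 5m
// blocks and recursively fills any part of [mint, maxt] they do not cover with
// raw-resolution blocks, so the returned list stays time-ordered.
func getForSketch(s *bucketBlockSet, mint, maxt int64) []*bucketBlock {
	// downsample.ResLevel1 is the 5m (300 000 ms) downsampling resolution.
	return s.getFor(mint, maxt, downsample.ResLevel1, nil)
}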
  2007  
  2008  // labelMatchers verifies whether the block set matches the given matchers and returns a new
  2009  // set of matchers that is equivalent when querying data within the block.
  2010  func (s *bucketBlockSet) labelMatchers(matchers ...*labels.Matcher) ([]*labels.Matcher, bool) {
  2011  	res := make([]*labels.Matcher, 0, len(matchers))
  2012  
  2013  	for _, m := range matchers {
  2014  		v := s.labels.Get(m.Name)
  2015  		if v == "" {
  2016  			res = append(res, m)
  2017  			continue
  2018  		}
  2019  		if !m.Matches(v) {
  2020  			return nil, false
  2021  		}
  2022  	}
  2023  	return res, true
  2024  }
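
// labelMatchersSketch is a minimal illustrative sketch, assuming a set whose
// labels include cluster="eu1": the cluster="eu1" matcher is satisfied by the
// set itself and therefore dropped, the job matcher is passed through, and a
// matcher contradicting the set's labels would make the whole set non-matching.
func labelMatchersSketch(s *bucketBlockSet) ([]*labels.Matcher, bool) {
	return s.labelMatchers(
		labels.MustNewMatcher(labels.MatchEqual, "cluster", "eu1"),
		labels.MustNewMatcher(labels.MatchEqual, "job", "api"),
	)
}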
  2025  
  2026  // bucketBlock represents a block that is located in a bucket. It holds intermediate
  2027  // state for the block on local disk.
  2028  type bucketBlock struct {
  2029  	logger     log.Logger
  2030  	metrics    *bucketStoreMetrics
  2031  	bkt        objstore.BucketReader
  2032  	meta       *metadata.Meta
  2033  	dir        string
  2034  	indexCache storecache.IndexCache
  2035  	chunkPool  pool.Bytes
  2036  	extLset    labels.Labels
  2037  
  2038  	indexHeaderReader indexheader.Reader
  2039  
  2040  	chunkObjs []string
  2041  
  2042  	pendingReaders sync.WaitGroup
  2043  
  2044  	partitioner Partitioner
  2045  
  2046  	// Block's labels used by block-level matchers to filter blocks to query. These are used to select blocks using
  2047  	// request hints' BlockMatchers.
  2048  	relabelLabels labels.Labels
  2049  
  2050  	estimatedMaxChunkSize  int
  2051  	estimatedMaxSeriesSize int
  2052  }
  2053  
  2054  func newBucketBlock(
  2055  	ctx context.Context,
  2056  	logger log.Logger,
  2057  	metrics *bucketStoreMetrics,
  2058  	meta *metadata.Meta,
  2059  	bkt objstore.BucketReader,
  2060  	dir string,
  2061  	indexCache storecache.IndexCache,
  2062  	chunkPool pool.Bytes,
  2063  	indexHeadReader indexheader.Reader,
  2064  	p Partitioner,
  2065  	maxSeriesSizeFunc BlockEstimator,
  2066  	maxChunkSizeFunc BlockEstimator,
  2067  ) (b *bucketBlock, err error) {
  2068  	maxSeriesSize := EstimatedMaxSeriesSize
  2069  	if maxSeriesSizeFunc != nil {
  2070  		maxSeriesSize = int(maxSeriesSizeFunc(*meta))
  2071  	}
  2072  	maxChunkSize := EstimatedMaxChunkSize
  2073  	if maxChunkSizeFunc != nil {
  2074  		maxChunkSize = int(maxChunkSizeFunc(*meta))
  2075  	}
  2076  	b = &bucketBlock{
  2077  		logger:            logger,
  2078  		metrics:           metrics,
  2079  		bkt:               bkt,
  2080  		indexCache:        indexCache,
  2081  		chunkPool:         chunkPool,
  2082  		dir:               dir,
  2083  		partitioner:       p,
  2084  		meta:              meta,
  2085  		indexHeaderReader: indexHeadReader,
  2086  		extLset:           labels.FromMap(meta.Thanos.Labels),
  2087  		// Translate the block's labels and inject the block ID as a label
  2088  		// to allow matching blocks also by ID.
  2089  		relabelLabels: append(labels.FromMap(meta.Thanos.Labels), labels.Label{
  2090  			Name:  block.BlockIDLabel,
  2091  			Value: meta.ULID.String(),
  2092  		}),
  2093  		estimatedMaxSeriesSize: maxSeriesSize,
  2094  		estimatedMaxChunkSize:  maxChunkSize,
  2095  	}
  2096  	sort.Sort(b.extLset)
  2097  	sort.Sort(b.relabelLabels)
  2098  
  2099  	// Get object handles for all chunk files (segment files) from meta.json, if available.
  2100  	if len(meta.Thanos.SegmentFiles) > 0 {
  2101  		b.chunkObjs = make([]string, 0, len(meta.Thanos.SegmentFiles))
  2102  
  2103  		for _, sf := range meta.Thanos.SegmentFiles {
  2104  			b.chunkObjs = append(b.chunkObjs, path.Join(meta.ULID.String(), block.ChunksDirname, sf))
  2105  		}
  2106  		return b, nil
  2107  	}
  2108  
  2109  	// Get object handles for all chunk files from storage.
  2110  	if err = bkt.Iter(ctx, path.Join(meta.ULID.String(), block.ChunksDirname), func(n string) error {
  2111  		b.chunkObjs = append(b.chunkObjs, n)
  2112  		return nil
  2113  	}); err != nil {
  2114  		return nil, errors.Wrap(err, "list chunk files")
  2115  	}
  2116  	return b, nil
  2117  }
  2118  
  2119  func (b *bucketBlock) indexFilename() string {
  2120  	return path.Join(b.meta.ULID.String(), block.IndexFilename)
  2121  }
  2122  
  2123  func (b *bucketBlock) readIndexRange(ctx context.Context, off, length int64) ([]byte, error) {
  2124  	r, err := b.bkt.GetRange(ctx, b.indexFilename(), off, length)
  2125  	if err != nil {
  2126  		return nil, errors.Wrap(err, "get range reader")
  2127  	}
  2128  	defer runutil.CloseWithLogOnErr(b.logger, r, "readIndexRange close range reader")
  2129  
  2130  	// Preallocate the buffer with the exact size so we don't waste allocations
  2131  	// while progressively growing an initial small buffer. The buffer capacity
  2132  	// is increased by MinRead to avoid extra allocations due to how ReadFrom()
  2133  	// internally works.
  2134  	buf := bytes.NewBuffer(make([]byte, 0, length+bytes.MinRead))
  2135  	if _, err := buf.ReadFrom(r); err != nil {
  2136  		return nil, errors.Wrap(err, "read range")
  2137  	}
  2138  	return buf.Bytes(), nil
  2139  }
  2140  
  2141  func (b *bucketBlock) readChunkRange(ctx context.Context, seq int, off, length int64, chunkRanges byteRanges) (*[]byte, error) {
  2142  	if seq < 0 || seq >= len(b.chunkObjs) {
  2143  		return nil, errors.Errorf("unknown segment file for index %d", seq)
  2144  	}
  2145  
  2146  	// Get a reader for the required range.
  2147  	reader, err := b.bkt.GetRange(ctx, b.chunkObjs[seq], off, length)
  2148  	if err != nil {
  2149  		return nil, errors.Wrap(err, "get range reader")
  2150  	}
  2151  	defer runutil.CloseWithLogOnErr(b.logger, reader, "readChunkRange close range reader")
  2152  
  2153  	// Get a buffer from the pool.
  2154  	chunkBuffer, err := b.chunkPool.Get(chunkRanges.size())
  2155  	if err != nil {
  2156  		return nil, errors.Wrap(err, "allocate chunk bytes")
  2157  	}
  2158  
  2159  	*chunkBuffer, err = readByteRanges(reader, *chunkBuffer, chunkRanges)
  2160  	if err != nil {
  2161  		return nil, err
  2162  	}
  2163  
  2164  	return chunkBuffer, nil
  2165  }
  2166  
  2167  func (b *bucketBlock) chunkRangeReader(ctx context.Context, seq int, off, length int64) (io.ReadCloser, error) {
  2168  	if seq < 0 || seq >= len(b.chunkObjs) {
  2169  		return nil, errors.Errorf("unknown segment file for index %d", seq)
  2170  	}
  2171  
  2172  	return b.bkt.GetRange(ctx, b.chunkObjs[seq], off, length)
  2173  }
  2174  
  2175  func (b *bucketBlock) indexReader() *bucketIndexReader {
  2176  	b.pendingReaders.Add(1)
  2177  	return newBucketIndexReader(b)
  2178  }
  2179  
  2180  func (b *bucketBlock) chunkReader() *bucketChunkReader {
  2181  	b.pendingReaders.Add(1)
  2182  	return newBucketChunkReader(b)
  2183  }
  2184  
  2185  // matchRelabelLabels verifies whether the block matches the given matchers.
  2186  func (b *bucketBlock) matchRelabelLabels(matchers []*labels.Matcher) bool {
  2187  	for _, m := range matchers {
  2188  		if !m.Matches(b.relabelLabels.Get(m.Name)) {
  2189  			return false
  2190  		}
  2191  	}
  2192  	return true
  2193  }
  2194  
  2195  // overlapsClosedInterval returns true if the block overlaps the closed interval [mint, maxt].
  2196  func (b *bucketBlock) overlapsClosedInterval(mint, maxt int64) bool {
  2197  	// The block itself is a half-open interval
  2198  	// [b.meta.MinTime, b.meta.MaxTime).
  2199  	return b.meta.MinTime <= maxt && mint < b.meta.MaxTime
  2200  }
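
// overlapsSketch is a minimal illustrative sketch of the check above, assuming
// a block b with meta.MinTime=1000 and meta.MaxTime=2000 (covering the half-open
// interval [1000, 2000)): the request [1999, 3000] overlaps it, while
// [2000, 3000] does not because the block's max time is exclusive.
func overlapsSketch(b *bucketBlock) (overlapping, nonOverlapping bool) {
	return b.overlapsClosedInterval(1999, 3000), b.overlapsClosedInterval(2000, 3000)
}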
  2201  
  2202  // Close waits for all pending readers to finish and then closes all underlying resources.
  2203  func (b *bucketBlock) Close() error {
  2204  	b.pendingReaders.Wait()
  2205  	return b.indexHeaderReader.Close()
  2206  }
  2207  
  2208  // bucketIndexReader is a custom index reader (not conforming to the index.Reader interface) that reads the index
  2209  // stored in object storage without having to fully download it.
  2210  type bucketIndexReader struct {
  2211  	block *bucketBlock
  2212  	dec   *index.Decoder
  2213  	stats *queryStats
  2214  
  2215  	mtx          sync.Mutex
  2216  	loadedSeries map[storage.SeriesRef][]byte
  2217  }
  2218  
  2219  func newBucketIndexReader(block *bucketBlock) *bucketIndexReader {
  2220  	r := &bucketIndexReader{
  2221  		block: block,
  2222  		dec: &index.Decoder{
  2223  			LookupSymbol: block.indexHeaderReader.LookupSymbol,
  2224  		},
  2225  		stats:        &queryStats{},
  2226  		loadedSeries: map[storage.SeriesRef][]byte{},
  2227  	}
  2228  	return r
  2229  }
  2230  func (r *bucketIndexReader) reset() {
  2231  	r.loadedSeries = map[storage.SeriesRef][]byte{}
  2232  }
  2233  
  2234  // ExpandedPostings returns postings as an expanded list instead of an index.Postings iterator.
  2235  // This is because we need to have them buffered anyway to perform efficient lookup
  2236  // on object storage.
  2237  // Found posting IDs (ps) are not strictly required to point to a valid Series, e.g. during
  2238  // background garbage collections.
  2239  //
  2240  // Reminder: A posting is a reference (represented as a uint64) to a series reference, which in turn points to the first
  2241  // chunk where the series contains the matching label-value pair for a given block of data. Postings can be fetched by
  2242  // single label name=value.
  2243  func (r *bucketIndexReader) ExpandedPostings(ctx context.Context, ms sortedMatchers, bytesLimiter BytesLimiter) ([]storage.SeriesRef, error) {
  2244  	// Shortcut the case of `len(postingGroups) == 0`. It only happens when no
  2245  	// matchers are specified, and we don't need to fetch expanded postings from cache.
  2246  	if len(ms) == 0 {
  2247  		return nil, nil
  2248  	}
  2249  
  2250  	hit, postings, err := r.fetchExpandedPostingsFromCache(ctx, ms, bytesLimiter)
  2251  	if err != nil {
  2252  		return nil, err
  2253  	}
  2254  	if hit {
  2255  		return postings, nil
  2256  	}
  2257  	var (
  2258  		allRequested = false
  2259  		hasAdds      = false
  2260  		keys         []labels.Label
  2261  	)
  2262  
  2263  	postingGroups, err := matchersToPostingGroups(ctx, r.block.indexHeaderReader.LabelValues, ms)
  2264  	if err != nil {
  2265  		return nil, errors.Wrap(err, "matchersToPostingGroups")
  2266  	}
  2267  	if postingGroups == nil {
  2268  		r.storeExpandedPostingsToCache(ms, index.EmptyPostings(), 0)
  2269  		return nil, nil
  2270  	}
  2271  	for _, pg := range postingGroups {
  2272  		allRequested = allRequested || pg.addAll
  2273  		hasAdds = hasAdds || len(pg.addKeys) > 0
  2274  
  2275  		// Postings returned by fetchPostings will be in the same order as keys
  2276  		// so it's important that we iterate them in the same order later.
  2277  		// We don't have any other way of pairing keys and fetched postings.
  2278  		for _, key := range pg.addKeys {
  2279  			keys = append(keys, labels.Label{Name: pg.name, Value: key})
  2280  		}
  2281  		for _, key := range pg.removeKeys {
  2282  			keys = append(keys, labels.Label{Name: pg.name, Value: key})
  2283  		}
  2284  	}
  2285  
  2286  	// We only need special All postings if there are no other adds. If there are, we can skip fetching
  2287  	// special All postings completely.
  2288  	if allRequested && !hasAdds {
  2289  		// add group with label to fetch "special All postings".
  2290  		name, value := index.AllPostingsKey()
  2291  		allPostingsLabel := labels.Label{Name: name, Value: value}
  2292  
  2293  		postingGroups = append(postingGroups, newPostingGroup(true, name, []string{value}, nil))
  2294  		keys = append(keys, allPostingsLabel)
  2295  	}
  2296  
  2297  	fetchedPostings, closeFns, err := r.fetchPostings(ctx, keys, bytesLimiter)
  2298  	defer func() {
  2299  		for _, closeFn := range closeFns {
  2300  			closeFn()
  2301  		}
  2302  	}()
  2303  	if err != nil {
  2304  		return nil, errors.Wrap(err, "get postings")
  2305  	}
  2306  
  2307  	// Get "add" and "remove" postings from groups. We iterate over postingGroups and their keys
  2308  	// again, and this is exactly the same order as before (when building the groups), so we can simply
  2309  	// use one incrementing index to fetch postings from returned slice.
  2310  	postingIndex := 0
  2311  
  2312  	var groupAdds, groupRemovals []index.Postings
  2313  	for _, g := range postingGroups {
  2314  		// We cannot add empty set to groupAdds, since they are intersected.
  2315  		if len(g.addKeys) > 0 {
  2316  			toMerge := make([]index.Postings, 0, len(g.addKeys))
  2317  			for _, l := range g.addKeys {
  2318  				toMerge = append(toMerge, checkNilPosting(g.name, l, fetchedPostings[postingIndex]))
  2319  				postingIndex++
  2320  			}
  2321  
  2322  			groupAdds = append(groupAdds, index.Merge(toMerge...))
  2323  		}
  2324  
  2325  		for _, l := range g.removeKeys {
  2326  			groupRemovals = append(groupRemovals, checkNilPosting(g.name, l, fetchedPostings[postingIndex]))
  2327  			postingIndex++
  2328  		}
  2329  	}
  2330  
  2331  	result := index.Without(index.Intersect(groupAdds...), index.Merge(groupRemovals...))
  2332  	ps, err := ExpandPostingsWithContext(ctx, result)
  2333  	if err != nil {
  2334  		return nil, errors.Wrap(err, "expand")
  2335  	}
  2336  	r.storeExpandedPostingsToCache(ms, index.NewListPostings(ps), len(ps))
  2337  
  2338  	if len(ps) > 0 {
  2339  		// As of version two all series entries are 16 byte padded. All references
  2340  		// we get have to account for that to get the correct offset.
  2341  		version, err := r.block.indexHeaderReader.IndexVersion()
  2342  		if err != nil {
  2343  			return nil, errors.Wrap(err, "get index version")
  2344  		}
  2345  		if version >= 2 {
  2346  			for i, id := range ps {
  2347  				ps[i] = id * 16
  2348  			}
  2349  		}
  2350  	}
  2351  	return ps, nil
  2352  }
  2353  
  2354  // ExpandPostingsWithContext returns the postings expanded as a slice and considers context.
  2355  func ExpandPostingsWithContext(ctx context.Context, p index.Postings) (res []storage.SeriesRef, err error) {
  2356  	for p.Next() {
  2357  		if ctx.Err() != nil {
  2358  			return nil, ctx.Err()
  2359  		}
  2360  		res = append(res, p.At())
  2361  	}
  2362  	return res, p.Err()
  2363  }
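
// expandPostingsSketch is a minimal illustrative sketch (not used elsewhere):
// it expands a small in-memory posting list via ExpandPostingsWithContext and
// yields the series refs 1, 3 and 5, or the context error if ctx is cancelled.
func expandPostingsSketch(ctx context.Context) ([]storage.SeriesRef, error) {
	p := index.NewListPostings([]storage.SeriesRef{1, 3, 5})
	return ExpandPostingsWithContext(ctx, p)
}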
  2364  
  2365  // postingGroup keeps posting keys for one or more matchers with the same label name. Logical result of the group is:
  2366  // If addAll is set: special All postings minus postings for removeKeys labels. No need to merge postings for addKeys in this case.
  2367  // If addAll is not set: merge of postings for addKeys labels minus postings for removeKeys labels.
  2368  // This computation happens in ExpandedPostings.
  2369  type postingGroup struct {
  2370  	addAll     bool
  2371  	name       string
  2372  	addKeys    []string
  2373  	removeKeys []string
  2374  }
  2375  
  2376  func newPostingGroup(addAll bool, name string, addKeys, removeKeys []string) *postingGroup {
  2377  	return &postingGroup{
  2378  		addAll:     addAll,
  2379  		name:       name,
  2380  		addKeys:    addKeys,
  2381  		removeKeys: removeKeys,
  2382  	}
  2383  }
  2384  
  2385  func (pg postingGroup) merge(other *postingGroup) *postingGroup {
  2386  	if other == nil {
  2387  		return &pg
  2388  	}
  2389  	// This shouldn't happen, but add this as a safeguard.
  2390  	if pg.name != other.name {
  2391  		return nil
  2392  	}
  2393  	var i, j int
  2394  	// Both add all, merge remove keys.
  2395  	if pg.addAll && other.addAll {
  2396  		// Fast path to not allocate output slice if no remove keys are specified.
  2397  		// This is possible when matcher is `=~".*"`.
  2398  		if len(pg.removeKeys) == 0 {
  2399  			pg.removeKeys = other.removeKeys
  2400  			return &pg
  2401  		} else if len(other.removeKeys) == 0 {
  2402  			return &pg
  2403  		}
  2404  		output := make([]string, 0, len(pg.removeKeys)+len(other.removeKeys))
  2405  		for i < len(pg.removeKeys) && j < len(other.removeKeys) {
  2406  			if pg.removeKeys[i] < other.removeKeys[j] {
  2407  				output = append(output, pg.removeKeys[i])
  2408  				i++
  2409  			} else if pg.removeKeys[i] > other.removeKeys[j] {
  2410  				output = append(output, other.removeKeys[j])
  2411  				j++
  2412  			} else {
  2413  				output = append(output, pg.removeKeys[i])
  2414  				i++
  2415  				j++
  2416  			}
  2417  		}
  2418  		if i < len(pg.removeKeys) {
  2419  			output = append(output, pg.removeKeys[i:]...)
  2420  		}
  2421  		if j < len(other.removeKeys) {
  2422  			output = append(output, other.removeKeys[j:]...)
  2423  		}
  2424  		pg.removeKeys = output
  2425  	} else if pg.addAll || other.addAll {
  2426  		// Subtract the remove keys.
  2427  		toRemove := other
  2428  		toAdd := &pg
  2429  		if pg.addAll {
  2430  			toRemove = &pg
  2431  			toAdd = other
  2432  		}
  2433  		var k int
  2434  		for i < len(toAdd.addKeys) && j < len(toRemove.removeKeys) {
  2435  			if toAdd.addKeys[i] < toRemove.removeKeys[j] {
  2436  				toAdd.addKeys[k] = toAdd.addKeys[i]
  2437  				k++
  2438  				i++
  2439  			} else if toAdd.addKeys[i] > toRemove.removeKeys[j] {
  2440  				j++
  2441  			} else {
  2442  				i++
  2443  				j++
  2444  			}
  2445  		}
  2446  		for i < len(toAdd.addKeys) {
  2447  			toAdd.addKeys[k] = toAdd.addKeys[i]
  2448  			i++
  2449  			k++
  2450  		}
  2451  		pg.addKeys = toAdd.addKeys[:k]
  2452  		pg.addAll = false
  2453  		pg.removeKeys = nil
  2454  	} else {
  2455  		addKeys := make([]string, 0, len(pg.addKeys)+len(other.addKeys))
  2456  		for i < len(pg.addKeys) && j < len(other.addKeys) {
  2457  			if pg.addKeys[i] == other.addKeys[j] {
  2458  				addKeys = append(addKeys, pg.addKeys[i])
  2459  				i++
  2460  				j++
  2461  			} else if pg.addKeys[i] < other.addKeys[j] {
  2462  				i++
  2463  			} else {
  2464  				j++
  2465  			}
  2466  		}
  2467  		pg.addKeys = addKeys
  2468  	}
  2469  	return &pg
  2470  }
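
// mergeSketch is a minimal illustrative sketch (the helper and values are
// assumptions) of merging two groups on the same label name: an add group for
// job="api" merged with the negated group job!="worker" subtracts the remove
// keys from the add keys, yielding addAll=false and addKeys=["api"].
func mergeSketch() *postingGroup {
	add := newPostingGroup(false, "job", []string{"api"}, nil)
	neg := newPostingGroup(true, "job", nil, []string{"worker"})
	return add.merge(neg)
}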
  2471  
  2472  func checkNilPosting(name, value string, p index.Postings) index.Postings {
  2473  	if p == nil {
  2474  		// This should not happen. Debug for https://github.com/thanos-io/thanos/issues/874.
  2475  		return index.ErrPostings(errors.Errorf("postings is nil for {%s=%s}. It was never fetched.", name, value))
  2476  	}
  2477  	return p
  2478  }
  2479  
  2480  func matchersToPostingGroups(ctx context.Context, lvalsFn func(name string) ([]string, error), ms []*labels.Matcher) ([]*postingGroup, error) {
  2481  	matchersMap := make(map[string][]*labels.Matcher)
  2482  	for _, m := range ms {
  2483  		matchersMap[m.Name] = append(matchersMap[m.Name], m)
  2484  	}
  2485  
  2486  	pgs := make([]*postingGroup, 0)
  2487  	// NOTE: Derived from tsdb.PostingsForMatchers.
  2488  	for _, values := range matchersMap {
  2489  		var (
  2490  			mergedPG     *postingGroup
  2491  			pg           *postingGroup
  2492  			vals         []string
  2493  			err          error
  2494  			valuesCached bool
  2495  		)
  2496  		lvalsFunc := lvalsFn
  2497  		// Merge posting groups for matchers on the same label name into one
  2498  		// to avoid fetching duplicate postings.
  2499  		for _, val := range values {
  2500  			pg, vals, err = toPostingGroup(ctx, lvalsFunc, val)
  2501  			if err != nil {
  2502  				return nil, errors.Wrap(err, "toPostingGroup")
  2503  			}
  2504  			// Cache label values because label name is the same.
  2505  			if !valuesCached && vals != nil {
  2506  				lvalsFunc = func(_ string) ([]string, error) {
  2507  					return vals, nil
  2508  				}
  2509  				valuesCached = true
  2510  			}
  2511  
  2512  			// If this group adds nothing, it's an empty group. We can shortcut this, since intersection with empty
  2513  			// postings would return no postings anyway.
  2514  			// E.g. label="non-existing-value" returns empty group.
  2515  			if !pg.addAll && len(pg.addKeys) == 0 {
  2516  				return nil, nil
  2517  			}
  2518  			if mergedPG == nil {
  2519  				mergedPG = pg
  2520  			} else {
  2521  				mergedPG = mergedPG.merge(pg)
  2522  			}
  2523  
  2524  			// If this group adds nothing, it's an empty group. We can shortcut this, since intersection with empty
  2525  			// postings would return no postings anyway.
  2526  			// E.g. label="non-existing-value" returns empty group.
  2527  			if !mergedPG.addAll && len(mergedPG.addKeys) == 0 {
  2528  				return nil, nil
  2529  			}
  2530  		}
  2531  		pgs = append(pgs, mergedPG)
  2532  	}
  2533  	slices.SortFunc(pgs, func(a, b *postingGroup) bool {
  2534  		return a.name < b.name
  2535  	})
  2536  	return pgs, nil
  2537  }
  2538  
  2539  // NOTE: Derived from tsdb.postingsForMatcher. index.Merge is equivalent to map duplication.
  2540  func toPostingGroup(ctx context.Context, lvalsFn func(name string) ([]string, error), m *labels.Matcher) (*postingGroup, []string, error) {
  2541  	// If the matcher selects an empty value, it selects all the series which don't
  2542  	// have the label name set too. See: https://github.com/prometheus/prometheus/issues/3575
  2543  	// and https://github.com/prometheus/prometheus/pull/3578#issuecomment-351653555.
  2544  	if m.Matches("") {
  2545  		var toRemove []string
  2546  
  2547  		// Fast-path for MatchNotRegexp matching.
  2548  		// Inverse of a MatchNotRegexp is MatchRegexp (double negation).
  2549  		// Fast-path for set matching.
  2550  		if m.Type == labels.MatchNotRegexp {
  2551  			if vals := findSetMatches(m.Value); len(vals) > 0 {
  2552  				sort.Strings(vals)
  2553  				return newPostingGroup(true, m.Name, nil, vals), nil, nil
  2554  			}
  2555  		}
  2556  
  2557  		// Fast-path for MatchNotEqual matching.
  2558  		// Inverse of a MatchNotEqual is MatchEqual (double negation).
  2559  		if m.Type == labels.MatchNotEqual {
  2560  			return newPostingGroup(true, m.Name, nil, []string{m.Value}), nil, nil
  2561  		}
  2562  
  2563  		vals, err := lvalsFn(m.Name)
  2564  		if err != nil {
  2565  			return nil, nil, err
  2566  		}
  2567  
  2568  		for _, val := range vals {
  2569  			if ctx.Err() != nil {
  2570  				return nil, nil, ctx.Err()
  2571  			}
  2572  			if !m.Matches(val) {
  2573  				toRemove = append(toRemove, val)
  2574  			}
  2575  		}
  2576  
  2577  		return newPostingGroup(true, m.Name, nil, toRemove), vals, nil
  2578  	}
  2579  	if m.Type == labels.MatchRegexp {
  2580  		if vals := findSetMatches(m.Value); len(vals) > 0 {
  2581  			sort.Strings(vals)
  2582  			return newPostingGroup(false, m.Name, vals, nil), nil, nil
  2583  		}
  2584  	}
  2585  
  2586  	// Fast-path for equal matching.
  2587  	if m.Type == labels.MatchEqual {
  2588  		return newPostingGroup(false, m.Name, []string{m.Value}, nil), nil, nil
  2589  	}
  2590  
  2591  	vals, err := lvalsFn(m.Name)
  2592  	if err != nil {
  2593  		return nil, nil, err
  2594  	}
  2595  
  2596  	var toAdd []string
  2597  	for _, val := range vals {
  2598  		if ctx.Err() != nil {
  2599  			return nil, nil, ctx.Err()
  2600  		}
  2601  		if m.Matches(val) {
  2602  			toAdd = append(toAdd, val)
  2603  		}
  2604  	}
  2605  
  2606  	return newPostingGroup(false, m.Name, toAdd, nil), vals, nil
  2607  }
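
// toPostingGroupSketch is a minimal illustrative sketch (the helper and label
// values are assumptions) of how single matchers map to posting groups:
//
//	job="api"       -> addAll=false, addKeys=["api"]        (equality fast-path)
//	job!="api"      -> addAll=true,  removeKeys=["api"]     (negation fast-path)
//	job=~"api|web"  -> addAll=false, addKeys=["api", "web"] (set-matching fast-path)
//
// It builds the middle case; lvalsFn is never consulted for these fast-paths.
func toPostingGroupSketch(ctx context.Context) (*postingGroup, error) {
	lvals := func(string) ([]string, error) { return []string{"api", "worker"}, nil }
	pg, _, err := toPostingGroup(ctx, lvals, labels.MustNewMatcher(labels.MatchNotEqual, "job", "api"))
	return pg, err
}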
  2608  
  2609  type postingPtr struct {
  2610  	keyID int
  2611  	ptr   index.Range
  2612  }
  2613  
  2614  func (r *bucketIndexReader) fetchExpandedPostingsFromCache(ctx context.Context, ms []*labels.Matcher, bytesLimiter BytesLimiter) (bool, []storage.SeriesRef, error) {
  2615  	dataFromCache, hit := r.block.indexCache.FetchExpandedPostings(ctx, r.block.meta.ULID, ms)
  2616  	if !hit {
  2617  		return false, nil, nil
  2618  	}
  2619  	if err := bytesLimiter.Reserve(uint64(len(dataFromCache))); err != nil {
  2620  		return false, nil, httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while loading expanded postings from index cache: %s", err)
  2621  	}
  2622  	r.stats.DataDownloadedSizeSum += units.Base2Bytes(len(dataFromCache))
  2623  	r.stats.postingsTouched++
  2624  	r.stats.PostingsTouchedSizeSum += units.Base2Bytes(len(dataFromCache))
  2625  	p, closeFns, err := r.decodeCachedPostings(dataFromCache)
  2626  	defer func() {
  2627  		for _, closeFn := range closeFns {
  2628  			closeFn()
  2629  		}
  2630  	}()
  2631  	// If we failed to decode or expand the cached postings, treat it as a cache miss and expand postings again.
  2632  	if err != nil {
  2633  		level.Error(r.block.logger).Log("msg", "failed to decode cached expanded postings, refetch postings", "id", r.block.meta.ULID.String(), "err", err)
  2634  		return false, nil, nil
  2635  	}
  2636  
  2637  	ps, err := ExpandPostingsWithContext(ctx, p)
  2638  	if err != nil {
  2639  		level.Error(r.block.logger).Log("msg", "failed to expand cached expanded postings, refetch postings", "id", r.block.meta.ULID.String(), "err", err)
  2640  		return false, nil, nil
  2641  	}
  2642  
  2643  	if len(ps) > 0 {
  2644  		// As of version two all series entries are 16 byte padded. All references
  2645  		// we get have to account for that to get the correct offset.
  2646  		version, err := r.block.indexHeaderReader.IndexVersion()
  2647  		if err != nil {
  2648  			return false, nil, errors.Wrap(err, "get index version")
  2649  		}
  2650  		if version >= 2 {
  2651  			for i, id := range ps {
  2652  				ps[i] = id * 16
  2653  			}
  2654  		}
  2655  	}
  2656  	return true, ps, nil
  2657  }
  2658  
  2659  func (r *bucketIndexReader) storeExpandedPostingsToCache(ms []*labels.Matcher, ps index.Postings, length int) {
  2660  	// Encode postings to cache. We compress and cache postings before applying the
  2661  	// 16-byte padding to series references in order to make the compressed size smaller.
  2662  	dataToCache, compressionDuration, compressionErrors, compressedSize := r.encodePostingsToCache(ps, length)
  2663  	r.stats.cachedPostingsCompressions++
  2664  	r.stats.cachedPostingsCompressionErrors += compressionErrors
  2665  	r.stats.CachedPostingsCompressionTimeSum += compressionDuration
  2666  	r.stats.CachedPostingsCompressedSizeSum += units.Base2Bytes(compressedSize)
  2667  	r.stats.CachedPostingsOriginalSizeSum += units.Base2Bytes(length * 4) // Estimate the posting list size.
  2668  	r.block.indexCache.StoreExpandedPostings(r.block.meta.ULID, ms, dataToCache)
  2669  }
  2670  
  2671  var bufioReaderPool = sync.Pool{
  2672  	New: func() any {
  2673  		return bufio.NewReader(nil)
  2674  	},
  2675  }
  2676  
  2677  // fetchPostings fills postings requested by posting groups.
  2678  // It returns one postings list for each key, in the same order.
  2679  // If postings for a given key are not fetched, the entry at that index will be nil.
  2680  func (r *bucketIndexReader) fetchPostings(ctx context.Context, keys []labels.Label, bytesLimiter BytesLimiter) ([]index.Postings, []func(), error) {
  2681  	var closeFns []func()
  2682  
  2683  	timer := prometheus.NewTimer(r.block.metrics.postingsFetchDuration)
  2684  	defer timer.ObserveDuration()
  2685  
  2686  	var ptrs []postingPtr
  2687  
  2688  	output := make([]index.Postings, len(keys))
  2689  
  2690  	// Fetch postings from the cache with a single call.
  2691  	fromCache, _ := r.block.indexCache.FetchMultiPostings(ctx, r.block.meta.ULID, keys)
  2692  	for _, dataFromCache := range fromCache {
  2693  		if err := bytesLimiter.Reserve(uint64(len(dataFromCache))); err != nil {
  2694  			return nil, closeFns, httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while loading postings from index cache: %s", err)
  2695  		}
  2696  		r.stats.DataDownloadedSizeSum += units.Base2Bytes(len(dataFromCache))
  2697  	}
  2698  
  2699  	// Iterate over all groups and fetch postings from cache.
  2700  	// If we have a miss, mark key to be fetched in `ptrs` slice.
  2701  	// Overlaps are well handled by partitioner, so we don't need to deduplicate keys.
  2702  	for ix, key := range keys {
  2703  		if err := ctx.Err(); err != nil {
  2704  			return nil, closeFns, err
  2705  		}
  2706  		// Get postings for the given key from cache first.
  2707  		if b, ok := fromCache[key]; ok {
  2708  			r.stats.postingsTouched++
  2709  			r.stats.PostingsTouchedSizeSum += units.Base2Bytes(len(b))
  2710  
  2711  			l, closer, err := r.decodeCachedPostings(b)
  2712  			if err != nil {
  2713  				return nil, closeFns, errors.Wrap(err, "decode postings")
  2714  			}
  2715  			output[ix] = l
  2716  			closeFns = append(closeFns, closer...)
  2717  			continue
  2718  		}
  2719  
  2720  		// Cache miss; save pointer for actual posting in index stored in object store.
  2721  		ptr, err := r.block.indexHeaderReader.PostingsOffset(key.Name, key.Value)
  2722  		if err == indexheader.NotFoundRangeErr {
  2723  			// This block does not have any posting for given key.
  2724  			output[ix] = index.EmptyPostings()
  2725  			continue
  2726  		}
  2727  
  2728  		if err != nil {
  2729  			return nil, closeFns, errors.Wrap(err, "index header PostingsOffset")
  2730  		}
  2731  
  2732  		r.stats.postingsToFetch++
  2733  		ptrs = append(ptrs, postingPtr{ptr: ptr, keyID: ix})
  2734  	}
  2735  
  2736  	sort.Slice(ptrs, func(i, j int) bool {
  2737  		return ptrs[i].ptr.Start < ptrs[j].ptr.Start
  2738  	})
  2739  
  2740  	// TODO(bwplotka): Assess how large this can be in the worst-case scenario (e.g. a fetch for AllPostingsKeys).
  2741  	// Consider sub-splitting if too big.
  2742  	parts := r.block.partitioner.Partition(len(ptrs), func(i int) (start, end uint64) {
  2743  		return uint64(ptrs[i].ptr.Start), uint64(ptrs[i].ptr.End)
  2744  	})
  2745  
  2746  	for _, part := range parts {
  2747  		start := int64(part.Start)
  2748  		length := int64(part.End) - start
  2749  
  2750  		if err := bytesLimiter.Reserve(uint64(length)); err != nil {
  2751  			return nil, closeFns, httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching postings: %s", err)
  2752  		}
  2753  		r.stats.DataDownloadedSizeSum += units.Base2Bytes(length)
  2754  	}
  2755  
  2756  	g, ctx := errgroup.WithContext(ctx)
  2757  	for _, part := range parts {
  2758  		i, j := part.ElemRng[0], part.ElemRng[1]
  2759  
  2760  		start := int64(part.Start)
  2761  		// We assume the index does not have any ptrs of zero length.
  2762  		length := int64(part.End) - start
  2763  
  2764  		// Fetch from object storage concurrently and update stats and posting list.
  2765  		g.Go(func() error {
  2766  			begin := time.Now()
  2767  
  2768  			brdr := bufioReaderPool.Get().(*bufio.Reader)
  2769  			defer bufioReaderPool.Put(brdr)
  2770  
  2771  			partReader, err := r.block.bkt.GetRange(ctx, r.block.indexFilename(), start, length)
  2772  			if err != nil {
  2773  				return errors.Wrap(err, "read postings range")
  2774  			}
  2775  			defer runutil.CloseWithLogOnErr(r.block.logger, partReader, "readIndexRange close range reader")
  2776  			brdr.Reset(partReader)
  2777  
  2778  			rdr := newPostingsReaderBuilder(ctx, brdr, ptrs[i:j], start, length)
  2779  
  2780  			r.mtx.Lock()
  2781  			r.stats.postingsFetchCount++
  2782  			r.stats.postingsFetched += j - i
  2783  			r.stats.PostingsFetchedSizeSum += units.Base2Bytes(int(length))
  2784  			r.mtx.Unlock()
  2785  
  2786  			for rdr.Next() {
  2787  				diffVarintPostings, postingsCount, keyID := rdr.AtDiffVarint()
  2788  
  2789  				output[keyID] = newDiffVarintPostings(diffVarintPostings, nil)
  2790  
  2791  				startCompression := time.Now()
  2792  				dataToCache, err := snappyStreamedEncode(int(postingsCount), diffVarintPostings)
  2793  				if err != nil {
  2794  					r.mtx.Lock()
  2795  					r.stats.cachedPostingsCompressionErrors += 1
  2796  					r.mtx.Unlock()
  2797  					return errors.Wrap(err, "encoding with snappy")
  2798  				}
  2799  
  2800  				r.mtx.Lock()
  2801  				r.stats.postingsTouched++
  2802  				r.stats.PostingsTouchedSizeSum += units.Base2Bytes(int(len(diffVarintPostings)))
  2803  				r.stats.cachedPostingsCompressions += 1
  2804  				r.stats.CachedPostingsOriginalSizeSum += units.Base2Bytes(len(diffVarintPostings))
  2805  				r.stats.CachedPostingsCompressedSizeSum += units.Base2Bytes(len(dataToCache))
  2806  				r.stats.CachedPostingsCompressionTimeSum += time.Since(startCompression)
  2807  				r.mtx.Unlock()
  2808  
  2809  				r.block.indexCache.StorePostings(r.block.meta.ULID, keys[keyID], dataToCache)
  2810  			}
  2811  
  2812  			r.mtx.Lock()
  2813  			r.stats.PostingsFetchDurationSum += time.Since(begin)
  2814  			r.mtx.Unlock()
  2815  
  2816  			if err := rdr.Error(); err != nil {
  2817  				return errors.Wrap(err, "reading postings")
  2818  			}
  2819  			return nil
  2820  		})
  2821  	}
  2822  
  2823  	return output, closeFns, g.Wait()
  2824  }
  2825  
  2826  func (r *bucketIndexReader) decodeCachedPostings(b []byte) (index.Postings, []func(), error) {
  2827  	// Even if this instance is not using compression, there may be compressed
  2828  	// entries in the cache written by other stores.
  2829  	var (
  2830  		l        index.Postings
  2831  		err      error
  2832  		closeFns []func()
  2833  	)
  2834  	if isDiffVarintSnappyEncodedPostings(b) || isDiffVarintSnappyStreamedEncodedPostings(b) {
  2835  		s := time.Now()
  2836  		l, err = decodePostings(b)
  2837  		r.stats.cachedPostingsDecompressions += 1
  2838  		r.stats.CachedPostingsDecompressionTimeSum += time.Since(s)
  2839  		if err != nil {
  2840  			r.stats.cachedPostingsDecompressionErrors += 1
  2841  		} else {
  2842  			closeFns = append(closeFns, l.(closeablePostings).close)
  2843  		}
  2844  	} else {
  2845  		_, l, err = r.dec.Postings(b)
  2846  	}
  2847  	return l, closeFns, err
  2848  }
  2849  
  2850  func (r *bucketIndexReader) encodePostingsToCache(p index.Postings, length int) ([]byte, time.Duration, int, int) {
  2851  	var dataToCache []byte
  2852  	compressionTime := time.Duration(0)
  2853  	compressionErrors, compressedSize := 0, 0
  2854  	s := time.Now()
  2855  	data, err := diffVarintSnappyStreamedEncode(p, length)
  2856  	compressionTime = time.Since(s)
  2857  	if err == nil {
  2858  		dataToCache = data
  2859  		compressedSize = len(data)
  2860  	} else {
  2861  		compressionErrors = 1
  2862  	}
  2863  	return dataToCache, compressionTime, compressionErrors, compressedSize
  2864  }
  2865  
  2866  // bigEndianPostings implements the Postings interface over a byte stream of
  2867  // big endian numbers.
  2868  type bigEndianPostings struct {
  2869  	list []byte
  2870  	cur  uint32
  2871  }
  2872  
  2873  // TODO(bwplotka): Expose those inside Prometheus.
  2874  func newBigEndianPostings(list []byte) *bigEndianPostings {
  2875  	return &bigEndianPostings{list: list}
  2876  }
  2877  
  2878  func (it *bigEndianPostings) At() storage.SeriesRef {
  2879  	return storage.SeriesRef(it.cur)
  2880  }
  2881  
  2882  func (it *bigEndianPostings) Next() bool {
  2883  	if len(it.list) >= 4 {
  2884  		it.cur = binary.BigEndian.Uint32(it.list)
  2885  		it.list = it.list[4:]
  2886  		return true
  2887  	}
  2888  	return false
  2889  }
  2890  
  2891  func (it *bigEndianPostings) Seek(x storage.SeriesRef) bool {
  2892  	if storage.SeriesRef(it.cur) >= x {
  2893  		return true
  2894  	}
  2895  
  2896  	num := len(it.list) / 4
  2897  	// Do binary search between current position and end.
  2898  	i := sort.Search(num, func(i int) bool {
  2899  		return binary.BigEndian.Uint32(it.list[i*4:]) >= uint32(x)
  2900  	})
  2901  	if i < num {
  2902  		j := i * 4
  2903  		it.cur = binary.BigEndian.Uint32(it.list[j:])
  2904  		it.list = it.list[j+4:]
  2905  		return true
  2906  	}
  2907  	it.list = nil
  2908  	return false
  2909  }
  2910  
  2911  func (it *bigEndianPostings) Err() error {
  2912  	return nil
  2913  }
  2914  
  2915  // length returns the number of remaining postings values.
  2916  func (it *bigEndianPostings) length() int {
  2917  	return len(it.list) / 4
  2918  }
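
        // Illustrative sketch, not part of the original file; the three values are made up.
        // It shows how bigEndianPostings walks a buffer of 4-byte big-endian series references.
        func exampleBigEndianPostings() {
        	buf := make([]byte, 3*4)
        	for i, v := range []uint32{10, 42, 100} {
        		binary.BigEndian.PutUint32(buf[i*4:], v)
        	}

        	p := newBigEndianPostings(buf)
        	for p.Next() {
        		_ = p.At() // storage.SeriesRef 10, then 42, then 100.
        	}

        	q := newBigEndianPostings(buf)
        	_ = q.Seek(50) // true; q.At() is now 100, the first value >= 50.
        }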
  2919  
  2920  func (r *bucketIndexReader) PreloadSeries(ctx context.Context, ids []storage.SeriesRef, bytesLimiter BytesLimiter) error {
  2921  	timer := prometheus.NewTimer(r.block.metrics.seriesFetchDuration)
  2922  	defer timer.ObserveDuration()
  2923  
  2924  	// Load series from cache, overwriting the list of ids to preload
  2925  	// with the missing ones.
  2926  	fromCache, ids := r.block.indexCache.FetchMultiSeries(ctx, r.block.meta.ULID, ids)
  2927  	for id, b := range fromCache {
  2928  		r.loadedSeries[id] = b
  2929  		if err := bytesLimiter.Reserve(uint64(len(b))); err != nil {
  2930  			return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while loading series from index cache: %s", err)
  2931  		}
  2932  		r.stats.DataDownloadedSizeSum += units.Base2Bytes(len(b))
  2933  	}
  2934  
  2935  	parts := r.block.partitioner.Partition(len(ids), func(i int) (start, end uint64) {
  2936  		return uint64(ids[i]), uint64(ids[i]) + uint64(r.block.estimatedMaxSeriesSize)
  2937  	})
  2938  
  2939  	g, ctx := errgroup.WithContext(ctx)
  2940  	for _, p := range parts {
  2941  		s, e := p.Start, p.End
  2942  		i, j := p.ElemRng[0], p.ElemRng[1]
  2943  
  2944  		g.Go(func() error {
  2945  			return r.loadSeries(ctx, ids[i:j], false, s, e, bytesLimiter)
  2946  		})
  2947  	}
  2948  	return g.Wait()
  2949  }
  2950  
  2951  func (r *bucketIndexReader) loadSeries(ctx context.Context, ids []storage.SeriesRef, refetch bool, start, end uint64, bytesLimiter BytesLimiter) error {
  2952  	begin := time.Now()
  2953  
  2954  	if bytesLimiter != nil {
  2955  		if err := bytesLimiter.Reserve(uint64(end - start)); err != nil {
  2956  			return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching series: %s", err)
  2957  		}
  2958  		r.mtx.Lock()
  2959  		r.stats.DataDownloadedSizeSum += units.Base2Bytes(end - start)
  2960  		r.mtx.Unlock()
  2961  	}
  2962  
  2963  	b, err := r.block.readIndexRange(ctx, int64(start), int64(end-start))
  2964  	if err != nil {
  2965  		return errors.Wrap(err, "read series range")
  2966  	}
  2967  
  2968  	r.mtx.Lock()
  2969  	r.stats.seriesFetchCount++
  2970  	r.stats.seriesFetched += len(ids)
  2971  	r.stats.SeriesFetchDurationSum += time.Since(begin)
  2972  	r.stats.SeriesFetchedSizeSum += units.Base2Bytes(int(end - start))
  2973  	r.mtx.Unlock()
  2974  
  2975  	for i, id := range ids {
  2976  		c := b[uint64(id)-start:]
  2977  
  2978  		l, n := binary.Uvarint(c)
  2979  		if n < 1 {
  2980  			return errors.New("reading series length failed")
  2981  		}
  2982  		if len(c) < n+int(l) {
  2983  			if i == 0 && refetch {
  2984  				return errors.Errorf("invalid remaining size, even after refetch, remaining: %d, expected %d", len(c), n+int(l))
  2985  			}
  2986  
  2987  			// Inefficient, but should be rare.
  2988  			r.block.metrics.seriesRefetches.Inc()
  2989  			level.Warn(r.block.logger).Log("msg", "series size exceeded expected size; refetching", "id", id, "series length", n+int(l), "maxSeriesSize", r.block.estimatedMaxSeriesSize)
  2990  
  2991  			// Fetch one extra byte so we can read the size of the next series, if it exists.
  2992  			return r.loadSeries(ctx, ids[i:], true, uint64(id), uint64(id)+uint64(n+int(l)+1), bytesLimiter)
  2993  		}
  2994  		c = c[n : n+int(l)]
  2995  		r.mtx.Lock()
  2996  		r.loadedSeries[id] = c
  2997  		r.block.indexCache.StoreSeries(r.block.meta.ULID, id, c)
  2998  		r.mtx.Unlock()
  2999  	}
  3000  	return nil
  3001  }
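
        // Illustrative sketch, not part of the original file: the layout loadSeries parses above.
        // Each series entry in the index is <uvarint length><length bytes of content>, followed by
        // a crc32 that is not needed here. The helper name is made up for this example.
        func exampleParseSeriesEntry(c []byte) ([]byte, error) {
        	l, n := binary.Uvarint(c)
        	if n < 1 {
        		return nil, errors.New("reading series length failed")
        	}
        	if len(c) < n+int(l) {
        		return nil, errors.Errorf("truncated series entry: have %d bytes, need %d", len(c), n+int(l))
        	}
        	return c[n : n+int(l)], nil
        }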
  3002  
  3003  type Part struct {
  3004  	Start uint64
  3005  	End   uint64
  3006  
  3007  	ElemRng [2]int
  3008  }
  3009  
  3010  type Partitioner interface {
  3011  	// Partition partitions length entries into n <= length ranges that cover all
  3012  	// input ranges.
  3013  	// It supports overlapping ranges.
  3014  	// NOTE: It expects ranges to be sorted by start time.
  3015  	Partition(length int, rng func(int) (uint64, uint64)) []Part
  3016  }
  3017  
  3018  type gapBasedPartitioner struct {
  3019  	maxGapSize uint64
  3020  }
  3021  
  3022  func NewGapBasedPartitioner(maxGapSize uint64) Partitioner {
  3023  	return gapBasedPartitioner{
  3024  		maxGapSize: maxGapSize,
  3025  	}
  3026  }
  3027  
  3028  // Partition partitions length entries into n <= length ranges that cover all
  3029  // input ranges by combining entries that are separated by reasonably small gaps.
  3030  // It is used to combine multiple small ranges from object storage into bigger, more efficient/cheaper ones.
  3031  func (g gapBasedPartitioner) Partition(length int, rng func(int) (uint64, uint64)) (parts []Part) {
  3032  	j := 0
  3033  	k := 0
  3034  	for k < length {
  3035  		j = k
  3036  		k++
  3037  
  3038  		p := Part{}
  3039  		p.Start, p.End = rng(j)
  3040  
  3041  		// Keep growing the range until the end or until we encounter a large gap.
  3042  		for ; k < length; k++ {
  3043  			s, e := rng(k)
  3044  
  3045  			if p.End+g.maxGapSize < s {
  3046  				break
  3047  			}
  3048  
  3049  			if p.End <= e {
  3050  				p.End = e
  3051  			}
  3052  		}
  3053  		p.ElemRng = [2]int{j, k}
  3054  		parts = append(parts, p)
  3055  	}
  3056  	return parts
  3057  }
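
        // Illustrative sketch, not part of the original file; the offsets and gap size are made up.
        // It shows how the gap-based partitioner merges nearby ranges into one object storage read.
        func exampleGapBasedPartition() {
        	offsets := []uint64{0, 100, 100000}
        	p := NewGapBasedPartitioner(512)
        	parts := p.Partition(len(offsets), func(i int) (uint64, uint64) {
        		return offsets[i], offsets[i] + 16
        	})
        	// parts[0] covers the first two offsets (gap of 84 <= 512): Start=0, End=116, ElemRng=[0,2].
        	// parts[1] covers the last offset alone: Start=100000, End=100016, ElemRng=[2,3].
        	_ = parts
        }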
  3058  
  3059  type symbolizedLabel struct {
  3060  	name, value uint32
  3061  }
  3062  
  3063  // LoadSeriesForTime populates the given symbolized labels for the series identified by the reference if at least one chunk is within
  3064  // time selection.
  3065  // LoadSeriesForTime also populates the chunk metas slice if skipChunks is set to false. Chunks are also limited by the given time selection.
  3066  // LoadSeriesForTime returns false when there is no series data for the given time range.
  3067  //
  3068  // Error is returned on decoding error or if the reference does not resolve to a known series.
  3069  func (r *bucketIndexReader) LoadSeriesForTime(ref storage.SeriesRef, lset *[]symbolizedLabel, chks *[]chunks.Meta, skipChunks bool, mint, maxt int64) (ok bool, err error) {
  3070  	b, ok := r.loadedSeries[ref]
  3071  	if !ok {
  3072  		return false, errors.Errorf("series %d not found", ref)
  3073  	}
  3074  
  3075  	r.stats.seriesTouched++
  3076  	r.stats.SeriesTouchedSizeSum += units.Base2Bytes(len(b))
  3077  	return decodeSeriesForTime(b, lset, chks, skipChunks, mint, maxt)
  3078  }
  3079  
  3080  // Close releases the underlying resources of the reader.
  3081  func (r *bucketIndexReader) Close() error {
  3082  	r.block.pendingReaders.Done()
  3083  	return nil
  3084  }
  3085  
  3086  // LookupLabelsSymbols populates the label set strings from the symbolized label set.
  3087  func (r *bucketIndexReader) LookupLabelsSymbols(symbolized []symbolizedLabel, lbls *labels.Labels) error {
  3088  	*lbls = (*lbls)[:0]
  3089  	for _, s := range symbolized {
  3090  		ln, err := r.dec.LookupSymbol(s.name)
  3091  		if err != nil {
  3092  			return errors.Wrap(err, "lookup label name")
  3093  		}
  3094  		lv, err := r.dec.LookupSymbol(s.value)
  3095  		if err != nil {
  3096  			return errors.Wrap(err, "lookup label value")
  3097  		}
  3098  		*lbls = append(*lbls, labels.Label{Name: ln, Value: lv})
  3099  	}
  3100  	return nil
  3101  }
  3102  
  3103  // decodeSeriesForTime decodes a series entry from the given byte slice, decoding only chunk metas that are within the given min and max time.
  3104  // If skipChunks is specified, decodeSeriesForTime does not return any chunks, but only labels, and only if at least a single chunk is within the time range.
  3105  // decodeSeriesForTime returns false when there is no series data for the given time range.
  3106  func decodeSeriesForTime(b []byte, lset *[]symbolizedLabel, chks *[]chunks.Meta, skipChunks bool, selectMint, selectMaxt int64) (ok bool, err error) {
  3107  	*lset = (*lset)[:0]
  3108  	*chks = (*chks)[:0]
  3109  
  3110  	d := encoding.Decbuf{B: b}
  3111  
  3112  	// Read labels without looking up symbols.
  3113  	k := d.Uvarint()
  3114  	for i := 0; i < k; i++ {
  3115  		lno := uint32(d.Uvarint())
  3116  		lvo := uint32(d.Uvarint())
  3117  		*lset = append(*lset, symbolizedLabel{name: lno, value: lvo})
  3118  	}
  3119  	// Read the chunks meta data.
  3120  	k = d.Uvarint()
  3121  	if k == 0 {
  3122  		return false, d.Err()
  3123  	}
  3124  
  3125  	// The first mint is absolute; the rest are just diffs, so a different type is used (Uvarint64).
  3126  	mint := d.Varint64()
  3127  	maxt := int64(d.Uvarint64()) + mint
  3128  	// Similar for first ref.
  3129  	ref := int64(d.Uvarint64())
  3130  
  3131  	for i := 0; i < k; i++ {
  3132  		if i > 0 {
  3133  			mint += int64(d.Uvarint64())
  3134  			maxt = int64(d.Uvarint64()) + mint
  3135  			ref += d.Varint64()
  3136  		}
  3137  
  3138  		if mint > selectMaxt {
  3139  			break
  3140  		}
  3141  
  3142  		if maxt >= selectMint {
  3143  			// Found a chunk.
  3144  			if skipChunks {
  3145  				// We are not interested in chunks, and we know there is at least one; that's enough to return the series.
  3146  				return true, nil
  3147  			}
  3148  
  3149  			*chks = append(*chks, chunks.Meta{
  3150  				Ref:     chunks.ChunkRef(ref),
  3151  				MinTime: mint,
  3152  				MaxTime: maxt,
  3153  			})
  3154  		}
  3155  
  3156  		mint = maxt
  3157  	}
  3158  	return len(*chks) > 0, d.Err()
  3159  }
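
        // Illustrative sketch, not part of the original file; all timestamps and refs are made up.
        // It encodes the delta layout decodeSeriesForTime expects: the first chunk carries an
        // absolute mint, and every later field is a delta from the previous chunk.
        func exampleEncodeSeriesEntry() []byte {
        	var e encoding.Encbuf
        	e.PutUvarint(0) // Zero label symbol references, just to keep the sketch short.
        	e.PutUvarint(2) // Two chunk metas follow.

        	e.PutVarint64(1000) // Chunk 0 mint (absolute).
        	e.PutUvarint64(500) // Chunk 0 maxt = 1000 + 500 = 1500.
        	e.PutUvarint64(8)   // Chunk 0 ref (absolute for the first chunk).

        	e.PutUvarint64(10)  // Chunk 1 mint = previous maxt (1500) + 10 = 1510.
        	e.PutUvarint64(490) // Chunk 1 maxt = 1510 + 490 = 2000.
        	e.PutVarint64(100)  // Chunk 1 ref = 8 + 100 = 108.
        	return e.Get()
        }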
  3160  
  3161  type loadIdx struct {
  3162  	offset uint32
  3163  	// Indices, not actual entries and chunks.
  3164  	seriesEntry int
  3165  	chunk       int
  3166  }
  3167  
  3168  type bucketChunkReader struct {
  3169  	block *bucketBlock
  3170  
  3171  	toLoad [][]loadIdx
  3172  
  3173  	// Mutex protects access to the following fields when they are updated from chunk-loading goroutines.
  3174  	// After chunks are loaded, the mutex is no longer used.
  3175  	mtx        sync.Mutex
  3176  	stats      *queryStats
  3177  	chunkBytes []*[]byte // Byte slice to return to the chunk pool on close.
  3178  
  3179  	loadingChunksMtx  sync.Mutex
  3180  	loadingChunks     bool
  3181  	finishLoadingChks chan struct{}
  3182  }
  3183  
  3184  func newBucketChunkReader(block *bucketBlock) *bucketChunkReader {
  3185  	return &bucketChunkReader{
  3186  		block:  block,
  3187  		stats:  &queryStats{},
  3188  		toLoad: make([][]loadIdx, len(block.chunkObjs)),
  3189  	}
  3190  }
  3191  
  3192  func (r *bucketChunkReader) reset() {
  3193  	for i := range r.toLoad {
  3194  		r.toLoad[i] = r.toLoad[i][:0]
  3195  	}
  3196  	r.loadingChunksMtx.Lock()
  3197  	r.loadingChunks = false
  3198  	r.finishLoadingChks = make(chan struct{})
  3199  	r.loadingChunksMtx.Unlock()
  3200  }
  3201  
  3202  func (r *bucketChunkReader) Close() error {
  3203  	// NOTE(GiedriusS): we need to wait until chunk loading has finished because
  3204  	// loading chunks modifies r.block.chunkPool.
  3205  	r.loadingChunksMtx.Lock()
  3206  	loadingChks := r.loadingChunks
  3207  	r.loadingChunksMtx.Unlock()
  3208  
  3209  	if loadingChks {
  3210  		<-r.finishLoadingChks
  3211  	}
  3212  	r.block.pendingReaders.Done()
  3213  
  3214  	for _, b := range r.chunkBytes {
  3215  		r.block.chunkPool.Put(b)
  3216  	}
  3217  	return nil
  3218  }
  3219  
  3220  // addLoad adds the chunk with id to the data set to be fetched.
  3221  // The chunk will be fetched and saved to res[seriesEntry].chks[chunk] upon the r.load(res, <...>) call.
  3222  func (r *bucketChunkReader) addLoad(id chunks.ChunkRef, seriesEntry, chunk int) error {
  3223  	var (
  3224  		seq = int(id >> 32)
  3225  		off = uint32(id)
  3226  	)
  3227  	if seq >= len(r.toLoad) {
  3228  		return errors.Errorf("reference sequence %d out of range", seq)
  3229  	}
  3230  	r.toLoad[seq] = append(r.toLoad[seq], loadIdx{off, seriesEntry, chunk})
  3231  	return nil
  3232  }
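
        // Illustrative sketch, not part of the original file; the numbers are made up.
        // A chunks.ChunkRef packs the segment file sequence in its upper 32 bits and the byte
        // offset within that file in its lower 32 bits, which is how addLoad splits it above.
        func exampleSplitChunkRef() {
        	ref := chunks.ChunkRef(uint64(3)<<32 | 123456)
        	seq := int(ref >> 32) // 3: segment file sequence number.
        	off := uint32(ref)    // 123456: byte offset inside that segment file.
        	_, _ = seq, off
        }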
  3233  
  3234  // load loads all added chunks and saves the resulting aggrs to res.
  3235  func (r *bucketChunkReader) load(ctx context.Context, res []seriesEntry, aggrs []storepb.Aggr, calculateChunkChecksum bool, bytesLimiter BytesLimiter) error {
  3236  	r.loadingChunksMtx.Lock()
  3237  	r.loadingChunks = true
  3238  	r.loadingChunksMtx.Unlock()
  3239  
  3240  	defer func() {
  3241  		r.loadingChunksMtx.Lock()
  3242  		r.loadingChunks = false
  3243  		r.loadingChunksMtx.Unlock()
  3244  
  3245  		close(r.finishLoadingChks)
  3246  	}()
  3247  
  3248  	g, ctx := errgroup.WithContext(ctx)
  3249  
  3250  	for seq, pIdxs := range r.toLoad {
  3251  		sort.Slice(pIdxs, func(i, j int) bool {
  3252  			return pIdxs[i].offset < pIdxs[j].offset
  3253  		})
  3254  		parts := r.block.partitioner.Partition(len(pIdxs), func(i int) (start, end uint64) {
  3255  			return uint64(pIdxs[i].offset), uint64(pIdxs[i].offset) + uint64(r.block.estimatedMaxChunkSize)
  3256  		})
  3257  
  3258  		for _, p := range parts {
  3259  			if err := bytesLimiter.Reserve(uint64(p.End - p.Start)); err != nil {
  3260  				return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching chunks: %s", err)
  3261  			}
  3262  			r.stats.DataDownloadedSizeSum += units.Base2Bytes(p.End - p.Start)
  3263  		}
  3264  
  3265  		for _, p := range parts {
  3266  			seq := seq
  3267  			p := p
  3268  			indices := pIdxs[p.ElemRng[0]:p.ElemRng[1]]
  3269  			g.Go(func() error {
  3270  				return r.loadChunks(ctx, res, aggrs, seq, p, indices, calculateChunkChecksum, bytesLimiter)
  3271  			})
  3272  		}
  3273  	}
  3274  	return g.Wait()
  3275  }
  3276  
  3277  // loadChunks will read the byte range [part.Start, part.End) from the segment file with sequence number seq.
  3278  // This data range covers chunks starting at supplied offsets.
  3279  func (r *bucketChunkReader) loadChunks(ctx context.Context, res []seriesEntry, aggrs []storepb.Aggr, seq int, part Part, pIdxs []loadIdx, calculateChunkChecksum bool, bytesLimiter BytesLimiter) error {
  3280  	var locked bool
  3281  	fetchBegin := time.Now()
  3282  	defer func() {
  3283  		if !locked {
  3284  			r.mtx.Lock()
  3285  		}
  3286  		r.stats.ChunksFetchDurationSum += time.Since(fetchBegin)
  3287  		r.mtx.Unlock()
  3288  	}()
  3289  
  3290  	// Get a reader for the required range.
  3291  	reader, err := r.block.chunkRangeReader(ctx, seq, int64(part.Start), int64(part.End-part.Start))
  3292  	if err != nil {
  3293  		return errors.Wrap(err, "get range reader")
  3294  	}
  3295  	defer runutil.CloseWithLogOnErr(r.block.logger, reader, "readChunkRange close range reader")
  3296  	bufReader := bufio.NewReaderSize(reader, r.block.estimatedMaxChunkSize)
  3297  
  3298  	locked = true
  3299  	r.mtx.Lock()
  3300  
  3301  	r.stats.chunksFetchCount++
  3302  	r.stats.chunksFetched += len(pIdxs)
  3303  	r.stats.ChunksFetchedSizeSum += units.Base2Bytes(int(part.End - part.Start))
  3304  
  3305  	var (
  3306  		buf        []byte
  3307  		readOffset = int(pIdxs[0].offset)
  3308  
  3309  		// Save a few allocations.
  3310  		written  int
  3311  		diff     uint32
  3312  		chunkLen int
  3313  		n        int
  3314  	)
  3315  
  3316  	bufPooled, err := r.block.chunkPool.Get(r.block.estimatedMaxChunkSize)
  3317  	if err == nil {
  3318  		buf = *bufPooled
  3319  	} else {
  3320  		buf = make([]byte, r.block.estimatedMaxChunkSize)
  3321  	}
  3322  	defer r.block.chunkPool.Put(&buf)
  3323  
  3324  	for i, pIdx := range pIdxs {
  3325  		// Fast forward range reader to the next chunk start in case of sparse (for our purposes) byte range.
  3326  		for readOffset < int(pIdx.offset) {
  3327  			written, err = bufReader.Discard(int(pIdx.offset) - int(readOffset))
  3328  			if err != nil {
  3329  				return errors.Wrap(err, "fast forward range reader")
  3330  			}
  3331  			readOffset += written
  3332  		}
  3333  		// Presume the chunk length to be reasonably large for common use cases.
  3334  		// However, the declaration of EstimatedMaxChunkSize warns us that some chunks could be larger in rare cases.
  3335  		// That case is handled further below.
  3336  		chunkLen = r.block.estimatedMaxChunkSize
  3337  		if i+1 < len(pIdxs) {
  3338  			if diff = pIdxs[i+1].offset - pIdx.offset; int(diff) < chunkLen {
  3339  				chunkLen = int(diff)
  3340  			}
  3341  		}
  3342  		cb := buf[:chunkLen]
  3343  		n, err = io.ReadFull(bufReader, cb)
  3344  		readOffset += n
  3345  		// Unexpected EOF for last chunk could be a valid case. Any other errors are definitely real.
  3346  		if err != nil && !(errors.Is(err, io.ErrUnexpectedEOF) && i == len(pIdxs)-1) {
  3347  			return errors.Wrapf(err, "read range for seq %d offset %x", seq, pIdx.offset)
  3348  		}
  3349  
  3350  		chunkDataLen, n := binary.Uvarint(cb)
  3351  		if n < 1 {
  3352  			return errors.New("reading chunk length failed")
  3353  		}
  3354  
  3355  		// The chunk length is n (the number of bytes used to encode the chunk data length), plus 1 byte for the chunk encoding and chunkDataLen bytes of actual chunk data.
  3356  		// There is also a crc32 after the chunk, but we ignore that.
  3357  		chunkLen = n + 1 + int(chunkDataLen)
  3358  		if chunkLen <= len(cb) {
  3359  			err = populateChunk(&(res[pIdx.seriesEntry].chks[pIdx.chunk]), rawChunk(cb[n:chunkLen]), aggrs, r.save, calculateChunkChecksum)
  3360  			if err != nil {
  3361  				return errors.Wrap(err, "populate chunk")
  3362  			}
  3363  			r.stats.chunksTouched++
  3364  			r.stats.ChunksTouchedSizeSum += units.Base2Bytes(int(chunkDataLen))
  3365  			continue
  3366  		}
  3367  
  3368  		r.block.metrics.chunkRefetches.Inc()
  3369  		// If we didn't fetch enough data for the chunk, fetch more.
  3370  		fetchBegin = time.Now()
  3371  		// Read entire chunk into new buffer.
  3372  		// TODO: readChunkRange call could be avoided for any chunk but last in this particular part.
  3373  		if err := bytesLimiter.Reserve(uint64(chunkLen)); err != nil {
  3374  			return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching chunks: %s", err)
  3375  		}
  3376  		r.stats.DataDownloadedSizeSum += units.Base2Bytes(chunkLen)
  3377  		r.mtx.Unlock()
  3378  		locked = false
  3379  
  3380  		nb, err := r.block.readChunkRange(ctx, seq, int64(pIdx.offset), int64(chunkLen), []byteRange{{offset: 0, length: chunkLen}})
  3381  		if err != nil {
  3382  			return errors.Wrapf(err, "preloaded chunk too small, expecting %d, and failed to fetch full chunk", chunkLen)
  3383  		}
  3384  		if len(*nb) != chunkLen {
  3385  			return errors.Errorf("preloaded chunk too small, expecting %d", chunkLen)
  3386  		}
  3387  
  3388  		r.mtx.Lock()
  3389  		locked = true
  3390  
  3391  		r.stats.chunksFetchCount++
  3392  		r.stats.ChunksFetchedSizeSum += units.Base2Bytes(len(*nb))
  3393  		err = populateChunk(&(res[pIdx.seriesEntry].chks[pIdx.chunk]), rawChunk((*nb)[n:]), aggrs, r.save, calculateChunkChecksum)
  3394  		if err != nil {
  3395  			r.block.chunkPool.Put(nb)
  3396  			return errors.Wrap(err, "populate chunk")
  3397  		}
  3398  		r.stats.chunksTouched++
  3399  		r.stats.ChunksTouchedSizeSum += units.Base2Bytes(int(chunkDataLen))
  3400  
  3401  		r.block.chunkPool.Put(nb)
  3402  	}
  3403  	return nil
  3404  }
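
        // Illustrative sketch, not part of the original file: the size computation loadChunks
        // performs above. A chunk entry in a segment file is laid out as
        // <uvarint data length><1 byte encoding><data><4 byte crc32>; the crc32 is ignored here,
        // just as it is above. The helper name is made up for this example.
        func exampleChunkEntryLen(cb []byte) (int, error) {
        	dataLen, n := binary.Uvarint(cb)
        	if n < 1 {
        		return 0, errors.New("reading chunk length failed")
        	}
        	return n + 1 + int(dataLen), nil
        }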
  3405  
  3406  // save saves a copy of b's payload to a memory pool of its own and returns a new byte slice referencing said copy.
  3407  // The returned slice becomes invalid once r.block.chunkPool.Put() is called.
  3408  func (r *bucketChunkReader) save(b []byte) ([]byte, error) {
  3409  	// Ensure we never grow slab beyond original capacity.
  3410  	if len(r.chunkBytes) == 0 ||
  3411  		cap(*r.chunkBytes[len(r.chunkBytes)-1])-len(*r.chunkBytes[len(r.chunkBytes)-1]) < len(b) {
  3412  		s, err := r.block.chunkPool.Get(len(b))
  3413  		if err != nil {
  3414  			return nil, errors.Wrap(err, "allocate chunk bytes")
  3415  		}
  3416  		r.chunkBytes = append(r.chunkBytes, s)
  3417  	}
  3418  	slab := r.chunkBytes[len(r.chunkBytes)-1]
  3419  	*slab = append(*slab, b...)
  3420  	return (*slab)[len(*slab)-len(b):], nil
  3421  }
  3422  
  3423  // rawChunk is a helper type that wraps a chunk's raw bytes and implements the chunkenc.Chunk
  3424  // interface over it.
  3425  // It is used for Store API responses, which don't need to introspect and validate the chunk's contents.
  3426  type rawChunk []byte
  3427  
  3428  func (b rawChunk) Encoding() chunkenc.Encoding {
  3429  	return chunkenc.Encoding(b[0])
  3430  }
  3431  
  3432  func (b rawChunk) Bytes() []byte {
  3433  	return b[1:]
  3434  }
  3435  func (b rawChunk) Compact() {}
  3436  
  3437  func (b rawChunk) Iterator(_ chunkenc.Iterator) chunkenc.Iterator {
  3438  	panic("invalid call")
  3439  }
  3440  
  3441  func (b rawChunk) Appender() (chunkenc.Appender, error) {
  3442  	panic("invalid call")
  3443  }
  3444  
  3445  func (b rawChunk) NumSamples() int {
  3446  	panic("invalid call")
  3447  }
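
        // Illustrative sketch, not part of the original file; the payload bytes are made up.
        // rawChunk treats its first byte as the chunkenc encoding and the rest as the raw payload.
        func exampleRawChunk() {
        	c := rawChunk([]byte{byte(chunkenc.EncXOR), 0x01, 0x02})
        	_ = c.Encoding() // chunkenc.EncXOR
        	_ = c.Bytes()    // []byte{0x01, 0x02}
        }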
  3448  
  3449  type queryStats struct {
  3450  	blocksQueried int
  3451  
  3452  	postingsTouched          int
  3453  	PostingsTouchedSizeSum   units.Base2Bytes
  3454  	postingsToFetch          int
  3455  	postingsFetched          int
  3456  	PostingsFetchedSizeSum   units.Base2Bytes
  3457  	postingsFetchCount       int
  3458  	PostingsFetchDurationSum time.Duration
  3459  
  3460  	cachedPostingsCompressions         int
  3461  	cachedPostingsCompressionErrors    int
  3462  	CachedPostingsOriginalSizeSum      units.Base2Bytes
  3463  	CachedPostingsCompressedSizeSum    units.Base2Bytes
  3464  	CachedPostingsCompressionTimeSum   time.Duration
  3465  	cachedPostingsDecompressions       int
  3466  	cachedPostingsDecompressionErrors  int
  3467  	CachedPostingsDecompressionTimeSum time.Duration
  3468  
  3469  	seriesTouched          int
  3470  	SeriesTouchedSizeSum   units.Base2Bytes
  3471  	seriesFetched          int
  3472  	SeriesFetchedSizeSum   units.Base2Bytes
  3473  	seriesFetchCount       int
  3474  	SeriesFetchDurationSum time.Duration
  3475  
  3476  	chunksTouched          int
  3477  	ChunksTouchedSizeSum   units.Base2Bytes
  3478  	chunksFetched          int
  3479  	ChunksFetchedSizeSum   units.Base2Bytes
  3480  	chunksFetchCount       int
  3481  	ChunksFetchDurationSum time.Duration
  3482  
  3483  	GetAllDuration    time.Duration
  3484  	mergedSeriesCount int
  3485  	mergedChunksCount int
  3486  	MergeDuration     time.Duration
  3487  
  3488  	DataDownloadedSizeSum units.Base2Bytes
  3489  }
  3490  
  3491  func (s queryStats) merge(o *queryStats) *queryStats {
  3492  	s.blocksQueried += o.blocksQueried
  3493  
  3494  	s.postingsToFetch += o.postingsToFetch
  3495  	s.postingsTouched += o.postingsTouched
  3496  	s.PostingsTouchedSizeSum += o.PostingsTouchedSizeSum
  3497  	s.postingsFetched += o.postingsFetched
  3498  	s.PostingsFetchedSizeSum += o.PostingsFetchedSizeSum
  3499  	s.postingsFetchCount += o.postingsFetchCount
  3500  	s.PostingsFetchDurationSum += o.PostingsFetchDurationSum
  3501  
  3502  	s.cachedPostingsCompressions += o.cachedPostingsCompressions
  3503  	s.cachedPostingsCompressionErrors += o.cachedPostingsCompressionErrors
  3504  	s.CachedPostingsOriginalSizeSum += o.CachedPostingsOriginalSizeSum
  3505  	s.CachedPostingsCompressedSizeSum += o.CachedPostingsCompressedSizeSum
  3506  	s.CachedPostingsCompressionTimeSum += o.CachedPostingsCompressionTimeSum
  3507  	s.cachedPostingsDecompressions += o.cachedPostingsDecompressions
  3508  	s.cachedPostingsDecompressionErrors += o.cachedPostingsDecompressionErrors
  3509  	s.CachedPostingsDecompressionTimeSum += o.CachedPostingsDecompressionTimeSum
  3510  
  3511  	s.seriesTouched += o.seriesTouched
  3512  	s.SeriesTouchedSizeSum += o.SeriesTouchedSizeSum
  3513  	s.seriesFetched += o.seriesFetched
  3514  	s.SeriesFetchedSizeSum += o.SeriesFetchedSizeSum
  3515  	s.seriesFetchCount += o.seriesFetchCount
  3516  	s.SeriesFetchDurationSum += o.SeriesFetchDurationSum
  3517  
  3518  	s.chunksTouched += o.chunksTouched
  3519  	s.ChunksTouchedSizeSum += o.ChunksTouchedSizeSum
  3520  	s.chunksFetched += o.chunksFetched
  3521  	s.ChunksFetchedSizeSum += o.ChunksFetchedSizeSum
  3522  	s.chunksFetchCount += o.chunksFetchCount
  3523  	s.ChunksFetchDurationSum += o.ChunksFetchDurationSum
  3524  
  3525  	s.GetAllDuration += o.GetAllDuration
  3526  	s.mergedSeriesCount += o.mergedSeriesCount
  3527  	s.mergedChunksCount += o.mergedChunksCount
  3528  	s.MergeDuration += o.MergeDuration
  3529  
  3530  	s.DataDownloadedSizeSum += o.DataDownloadedSizeSum
  3531  
  3532  	return &s
  3533  }
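
        // Illustrative sketch, not part of the original file; the counters are made up.
        // merge uses a value receiver, so both inputs are left untouched and the sums land in a
        // fresh copy.
        func exampleMergeQueryStats() {
        	a := queryStats{blocksQueried: 1, postingsFetched: 10}
        	b := queryStats{blocksQueried: 2, postingsFetched: 5}
        	total := a.merge(&b)
        	_ = total.blocksQueried // 3, while a.blocksQueried is still 1.
        }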
  3534  
  3535  func (s queryStats) toHints() *hintspb.QueryStats {
  3536  	return &hintspb.QueryStats{
  3537  		BlocksQueried:          int64(s.blocksQueried),
  3538  		PostingsTouched:        int64(s.postingsTouched),
  3539  		PostingsTouchedSizeSum: int64(s.PostingsTouchedSizeSum),
  3540  		PostingsToFetch:        int64(s.postingsToFetch),
  3541  		PostingsFetched:        int64(s.postingsFetched),
  3542  		PostingsFetchedSizeSum: int64(s.PostingsFetchedSizeSum),
  3543  		PostingsFetchCount:     int64(s.postingsFetchCount),
  3544  		SeriesTouched:          int64(s.seriesTouched),
  3545  		SeriesTouchedSizeSum:   int64(s.SeriesTouchedSizeSum),
  3546  		SeriesFetched:          int64(s.seriesFetched),
  3547  		SeriesFetchedSizeSum:   int64(s.SeriesFetchedSizeSum),
  3548  		SeriesFetchCount:       int64(s.seriesFetchCount),
  3549  		ChunksTouched:          int64(s.chunksTouched),
  3550  		ChunksTouchedSizeSum:   int64(s.ChunksTouchedSizeSum),
  3551  		ChunksFetched:          int64(s.chunksFetched),
  3552  		ChunksFetchedSizeSum:   int64(s.ChunksFetchedSizeSum),
  3553  		ChunksFetchCount:       int64(s.chunksFetchCount),
  3554  		MergedSeriesCount:      int64(s.mergedSeriesCount),
  3555  		MergedChunksCount:      int64(s.mergedChunksCount),
  3556  		DataDownloadedSizeSum:  int64(s.DataDownloadedSizeSum),
  3557  	}
  3558  }
  3559  
  3560  // NewDefaultChunkBytesPool returns a chunk bytes pool with default settings.
  3561  func NewDefaultChunkBytesPool(maxChunkPoolBytes uint64) (pool.Bytes, error) {
  3562  	return pool.NewBucketedBytes(chunkBytesPoolMinSize, chunkBytesPoolMaxSize, 2, maxChunkPoolBytes)
  3563  }
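
        // Illustrative sketch, not part of the original file; the 512 MiB cap is made up.
        // Typical use of the default chunk bytes pool: get a pooled buffer of roughly one
        // chunk's worth of bytes and return it when done.
        func exampleChunkBytesPool() error {
        	p, err := NewDefaultChunkBytesPool(512 * 1024 * 1024)
        	if err != nil {
        		return err
        	}
        	b, err := p.Get(EstimatedMaxChunkSize)
        	if err != nil {
        		return err
        	}
        	p.Put(b)
        	return nil
        }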