github.com/thanos-io/thanos@v0.32.5/pkg/store/bucket.go (about) 1 // Copyright (c) The Thanos Authors. 2 // Licensed under the Apache License 2.0. 3 4 package store 5 6 import ( 7 "bufio" 8 "bytes" 9 "context" 10 "encoding/binary" 11 "fmt" 12 "hash" 13 "io" 14 "math" 15 "os" 16 "path" 17 "path/filepath" 18 "sort" 19 "strings" 20 "sync" 21 "time" 22 23 "github.com/alecthomas/units" 24 "github.com/cespare/xxhash" 25 "github.com/go-kit/log" 26 "github.com/go-kit/log/level" 27 "github.com/gogo/protobuf/types" 28 "github.com/oklog/ulid" 29 "github.com/pkg/errors" 30 "github.com/prometheus/client_golang/prometheus" 31 "github.com/prometheus/client_golang/prometheus/promauto" 32 "github.com/prometheus/prometheus/model/labels" 33 "github.com/prometheus/prometheus/storage" 34 "github.com/prometheus/prometheus/tsdb/chunkenc" 35 "github.com/prometheus/prometheus/tsdb/chunks" 36 "github.com/prometheus/prometheus/tsdb/encoding" 37 "github.com/prometheus/prometheus/tsdb/index" 38 "github.com/weaveworks/common/httpgrpc" 39 "golang.org/x/exp/slices" 40 "golang.org/x/sync/errgroup" 41 "google.golang.org/grpc" 42 "google.golang.org/grpc/codes" 43 "google.golang.org/grpc/status" 44 45 "github.com/thanos-io/objstore" 46 47 "github.com/thanos-io/thanos/pkg/block" 48 "github.com/thanos-io/thanos/pkg/block/indexheader" 49 "github.com/thanos-io/thanos/pkg/block/metadata" 50 "github.com/thanos-io/thanos/pkg/compact/downsample" 51 "github.com/thanos-io/thanos/pkg/component" 52 "github.com/thanos-io/thanos/pkg/extprom" 53 "github.com/thanos-io/thanos/pkg/gate" 54 "github.com/thanos-io/thanos/pkg/info/infopb" 55 "github.com/thanos-io/thanos/pkg/model" 56 "github.com/thanos-io/thanos/pkg/pool" 57 "github.com/thanos-io/thanos/pkg/runutil" 58 storecache "github.com/thanos-io/thanos/pkg/store/cache" 59 "github.com/thanos-io/thanos/pkg/store/hintspb" 60 "github.com/thanos-io/thanos/pkg/store/labelpb" 61 "github.com/thanos-io/thanos/pkg/store/storepb" 62 "github.com/thanos-io/thanos/pkg/strutil" 63 "github.com/thanos-io/thanos/pkg/tenancy" 64 "github.com/thanos-io/thanos/pkg/tracing" 65 ) 66 67 const ( 68 // MaxSamplesPerChunk is approximately the max number of samples that we may have in any given chunk. This is needed 69 // for precalculating the number of samples that we may have to retrieve and decode for any given query 70 // without downloading them. Please take a look at https://github.com/prometheus/tsdb/pull/397 to know 71 // where this number comes from. Long story short: TSDB is made in such a way, and it is made in such a way 72 // because you barely get any improvements in compression when the number of samples is beyond this. 73 // Take a look at Figure 6 in this whitepaper http://www.vldb.org/pvldb/vol8/p1816-teller.pdf. 74 MaxSamplesPerChunk = 120 75 // EstimatedMaxChunkSize is average max of chunk size. This can be exceeded though in very rare (valid) cases. 76 EstimatedMaxChunkSize = 16000 77 EstimatedMaxSeriesSize = 64 * 1024 78 // Relatively large in order to reduce memory waste, yet small enough to avoid excessive allocations. 79 chunkBytesPoolMinSize = 64 * 1024 // 64 KiB 80 chunkBytesPoolMaxSize = 64 * 1024 * 1024 // 64 MiB 81 82 // CompatibilityTypeLabelName is an artificial label that Store Gateway can optionally advertise. This is required for compatibility 83 // with pre v0.8.0 Querier. Previous Queriers was strict about duplicated external labels of all StoreAPIs that had any labels. 
84 // Now with newer Store Gateway advertising all the external labels it has access to, there was simple case where 85 // Querier was blocking Store Gateway as duplicate with sidecar. 86 // 87 // Newer Queriers are not strict, no duplicated external labels check is there anymore. 88 // Additionally newer Queriers removes/ignore this exact labels from UI and querying. 89 // 90 // This label name is intentionally against Prometheus label style. 91 // TODO(bwplotka): Remove it at some point. 92 CompatibilityTypeLabelName = "@thanos_compatibility_store_type" 93 94 // DefaultPostingOffsetInMemorySampling represents default value for --store.index-header-posting-offsets-in-mem-sampling. 95 // 32 value is chosen as it's a good balance for common setups. Sampling that is not too large (too many CPU cycles) and 96 // not too small (too much memory). 97 DefaultPostingOffsetInMemorySampling = 32 98 99 PartitionerMaxGapSize = 512 * 1024 100 101 // Labels for metrics. 102 labelEncode = "encode" 103 labelDecode = "decode" 104 105 minBlockSyncConcurrency = 1 106 107 enableChunkHashCalculation = true 108 109 // SeriesBatchSize is the default batch size when fetching series from object storage. 110 SeriesBatchSize = 10000 111 ) 112 113 var ( 114 errBlockSyncConcurrencyNotValid = errors.New("the block sync concurrency must be equal or greater than 1.") 115 hashPool = sync.Pool{New: func() interface{} { return xxhash.New() }} 116 ) 117 118 type bucketStoreMetrics struct { 119 blocksLoaded prometheus.Gauge 120 blockLoads prometheus.Counter 121 blockLoadFailures prometheus.Counter 122 lastLoadedBlock prometheus.Gauge 123 blockDrops prometheus.Counter 124 blockDropFailures prometheus.Counter 125 seriesDataTouched *prometheus.HistogramVec 126 seriesDataFetched *prometheus.HistogramVec 127 seriesDataSizeTouched *prometheus.HistogramVec 128 seriesDataSizeFetched *prometheus.HistogramVec 129 seriesBlocksQueried prometheus.Histogram 130 seriesGetAllDuration prometheus.Histogram 131 seriesMergeDuration prometheus.Histogram 132 resultSeriesCount prometheus.Histogram 133 chunkSizeBytes prometheus.Histogram 134 postingsSizeBytes prometheus.Histogram 135 queriesDropped *prometheus.CounterVec 136 seriesRefetches prometheus.Counter 137 chunkRefetches prometheus.Counter 138 emptyPostingCount prometheus.Counter 139 140 cachedPostingsCompressions *prometheus.CounterVec 141 cachedPostingsCompressionErrors *prometheus.CounterVec 142 cachedPostingsCompressionTimeSeconds *prometheus.CounterVec 143 cachedPostingsOriginalSizeBytes prometheus.Counter 144 cachedPostingsCompressedSizeBytes prometheus.Counter 145 146 seriesFetchDuration prometheus.Histogram 147 postingsFetchDuration prometheus.Histogram 148 chunkFetchDuration prometheus.Histogram 149 } 150 151 func newBucketStoreMetrics(reg prometheus.Registerer) *bucketStoreMetrics { 152 var m bucketStoreMetrics 153 154 m.blockLoads = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 155 Name: "thanos_bucket_store_block_loads_total", 156 Help: "Total number of remote block loading attempts.", 157 }) 158 m.blockLoadFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 159 Name: "thanos_bucket_store_block_load_failures_total", 160 Help: "Total number of failed remote block loading attempts.", 161 }) 162 m.blockDrops = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 163 Name: "thanos_bucket_store_block_drops_total", 164 Help: "Total number of local blocks that were dropped.", 165 }) 166 m.blockDropFailures = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 167 
Name: "thanos_bucket_store_block_drop_failures_total", 168 Help: "Total number of local blocks that failed to be dropped.", 169 }) 170 m.blocksLoaded = promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 171 Name: "thanos_bucket_store_blocks_loaded", 172 Help: "Number of currently loaded blocks.", 173 }) 174 m.lastLoadedBlock = promauto.With(reg).NewGauge(prometheus.GaugeOpts{ 175 Name: "thanos_bucket_store_blocks_last_loaded_timestamp_seconds", 176 Help: "Timestamp when last block got loaded.", 177 }) 178 179 m.seriesDataTouched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ 180 Name: "thanos_bucket_store_series_data_touched", 181 Help: "Number of items of a data type touched to fulfill a single Store API series request.", 182 Buckets: prometheus.ExponentialBuckets(200, 2, 15), 183 }, []string{"data_type"}) 184 m.seriesDataFetched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ 185 Name: "thanos_bucket_store_series_data_fetched", 186 Help: "Number of items of a data type retrieved to fulfill a single Store API series request.", 187 Buckets: prometheus.ExponentialBuckets(200, 2, 15), 188 }, []string{"data_type"}) 189 190 m.seriesDataSizeTouched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ 191 Name: "thanos_bucket_store_series_data_size_touched_bytes", 192 Help: "Total size of items of a data type touched to fulfill a single Store API series request in Bytes.", 193 Buckets: prometheus.ExponentialBuckets(1024, 2, 15), 194 }, []string{"data_type"}) 195 m.seriesDataSizeFetched = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ 196 Name: "thanos_bucket_store_series_data_size_fetched_bytes", 197 Help: "Total size of items of a data type fetched to fulfill a single Store API series request in Bytes.", 198 Buckets: prometheus.ExponentialBuckets(1024, 2, 15), 199 }, []string{"data_type"}) 200 201 m.seriesBlocksQueried = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 202 Name: "thanos_bucket_store_series_blocks_queried", 203 Help: "Number of blocks in a bucket store that were touched to satisfy a query.", 204 Buckets: prometheus.ExponentialBuckets(1, 2, 10), 205 }) 206 m.seriesGetAllDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 207 Name: "thanos_bucket_store_series_get_all_duration_seconds", 208 Help: "Time it takes until all per-block prepares and loads for a query are finished.", 209 Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, 210 }) 211 m.seriesMergeDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 212 Name: "thanos_bucket_store_series_merge_duration_seconds", 213 Help: "Time it takes to merge sub-results from all queried blocks into a single result.", 214 Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, 215 }) 216 m.resultSeriesCount = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 217 Name: "thanos_bucket_store_series_result_series", 218 Help: "Number of series observed in the final result of a query.", 219 Buckets: prometheus.ExponentialBuckets(1, 2, 15), 220 }) 221 222 m.chunkSizeBytes = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 223 Name: "thanos_bucket_store_sent_chunk_size_bytes", 224 Help: "Size in bytes of the chunks for the single series, which is adequate to the gRPC message size sent to querier.", 225 Buckets: []float64{ 226 32, 256, 512, 1024, 32 * 1024, 256 * 1024, 512 * 1024, 1024 * 1024, 32 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024, 227 }, 228 }) 229 230 
m.postingsSizeBytes = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 231 Name: "thanos_bucket_store_postings_size_bytes", 232 Help: "Size in bytes of the postings for a single series call.", 233 Buckets: []float64{ 234 32, 256, 512, 1024, 32 * 1024, 256 * 1024, 512 * 1024, 1024 * 1024, 32 * 1024 * 1024, 128 * 1024 * 1024, 256 * 1024 * 1024, 512 * 1024 * 1024, 768 * 1024 * 1024, 1024 * 1024 * 1024, 235 }, 236 }) 237 238 m.queriesDropped = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 239 Name: "thanos_bucket_store_queries_dropped_total", 240 Help: "Number of queries that were dropped due to the limit.", 241 }, []string{"reason"}) 242 m.seriesRefetches = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 243 Name: "thanos_bucket_store_series_refetches_total", 244 Help: "Total number of cases where configured estimated series bytes was not enough was to fetch series from index, resulting in refetch.", 245 }) 246 m.chunkRefetches = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 247 Name: "thanos_bucket_store_chunk_refetches_total", 248 Help: "Total number of cases where configured estimated chunk bytes was not enough was to fetch chunks from object store, resulting in refetch.", 249 }) 250 251 m.cachedPostingsCompressions = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 252 Name: "thanos_bucket_store_cached_postings_compressions_total", 253 Help: "Number of postings compressions before storing to index cache.", 254 }, []string{"op"}) 255 m.cachedPostingsCompressions.WithLabelValues(labelEncode) 256 m.cachedPostingsCompressions.WithLabelValues(labelDecode) 257 258 m.cachedPostingsCompressionErrors = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 259 Name: "thanos_bucket_store_cached_postings_compression_errors_total", 260 Help: "Number of postings compression errors.", 261 }, []string{"op"}) 262 m.cachedPostingsCompressionErrors.WithLabelValues(labelEncode) 263 m.cachedPostingsCompressionErrors.WithLabelValues(labelDecode) 264 265 m.cachedPostingsCompressionTimeSeconds = promauto.With(reg).NewCounterVec(prometheus.CounterOpts{ 266 Name: "thanos_bucket_store_cached_postings_compression_time_seconds_total", 267 Help: "Time spent compressing postings before storing them into postings cache.", 268 }, []string{"op"}) 269 m.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelEncode) 270 m.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelDecode) 271 272 m.cachedPostingsOriginalSizeBytes = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 273 Name: "thanos_bucket_store_cached_postings_original_size_bytes_total", 274 Help: "Original size of postings stored into cache.", 275 }) 276 m.cachedPostingsCompressedSizeBytes = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 277 Name: "thanos_bucket_store_cached_postings_compressed_size_bytes_total", 278 Help: "Compressed size of postings stored into cache.", 279 }) 280 281 m.seriesFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 282 Name: "thanos_bucket_store_series_fetch_duration_seconds", 283 Help: "The time it takes to fetch series to respond to a request sent to a store gateway. 
It includes both the time to fetch it from the cache and from storage in case of cache misses.", 284 Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, 285 }) 286 287 m.postingsFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 288 Name: "thanos_bucket_store_postings_fetch_duration_seconds", 289 Help: "The time it takes to fetch postings to respond to a request sent to a store gateway. It includes both the time to fetch it from the cache and from storage in case of cache misses.", 290 Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, 291 }) 292 293 m.chunkFetchDuration = promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 294 Name: "thanos_bucket_store_chunks_fetch_duration_seconds", 295 Help: "The total time spent fetching chunks within a single request a store gateway.", 296 Buckets: []float64{0.001, 0.01, 0.1, 0.3, 0.6, 1, 3, 6, 9, 20, 30, 60, 90, 120}, 297 }) 298 299 m.emptyPostingCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{ 300 Name: "thanos_bucket_store_empty_postings_total", 301 Help: "Total number of empty postings when fetching block series.", 302 }) 303 304 return &m 305 } 306 307 // FilterConfig is a configuration, which Store uses for filtering metrics based on time. 308 type FilterConfig struct { 309 MinTime, MaxTime model.TimeOrDurationValue 310 } 311 312 type BlockEstimator func(meta metadata.Meta) uint64 313 314 // BucketStore implements the store API backed by a bucket. It loads all index 315 // files to local disk. 316 // 317 // NOTE: Bucket store reencodes postings using diff+varint+snappy when storing to cache. 318 // This makes them smaller, but takes extra CPU and memory. 319 // When used with in-memory cache, memory usage should decrease overall, thanks to postings being smaller. 320 type BucketStore struct { 321 logger log.Logger 322 reg prometheus.Registerer // TODO(metalmatze) remove and add via BucketStoreOption 323 metrics *bucketStoreMetrics 324 bkt objstore.InstrumentedBucketReader 325 fetcher block.MetadataFetcher 326 dir string 327 indexCache storecache.IndexCache 328 indexReaderPool *indexheader.ReaderPool 329 buffers sync.Pool 330 chunkPool pool.Bytes 331 seriesBatchSize int 332 333 // Sets of blocks that have the same labels. They are indexed by a hash over their label set. 334 mtx sync.RWMutex 335 blocks map[ulid.ULID]*bucketBlock 336 blockSets map[uint64]*bucketBlockSet 337 338 // Verbose enabled additional logging. 339 debugLogging bool 340 // Number of goroutines to use when syncing blocks from object storage. 341 blockSyncConcurrency int 342 343 // Query gate which limits the maximum amount of concurrent queries. 344 queryGate gate.Gate 345 346 // chunksLimiterFactory creates a new limiter used to limit the number of chunks fetched by each Series() call. 347 chunksLimiterFactory ChunksLimiterFactory 348 // seriesLimiterFactory creates a new limiter used to limit the number of touched series by each Series() call, 349 // or LabelName and LabelValues calls when used with matchers. 350 seriesLimiterFactory SeriesLimiterFactory 351 352 // bytesLimiterFactory creates a new limiter used to limit the amount of bytes fetched/touched by each Series() call. 353 bytesLimiterFactory BytesLimiterFactory 354 partitioner Partitioner 355 356 filterConfig *FilterConfig 357 advLabelSets []labelpb.ZLabelSet 358 enableCompatibilityLabel bool 359 360 // Every how many posting offset entry we pool in heap memory. Default in Prometheus is 32. 
361 postingOffsetsInMemSampling int 362 363 // Enables hints in the Series() response. 364 enableSeriesResponseHints bool 365 366 enableChunkHashCalculation bool 367 368 blockEstimatedMaxSeriesFunc BlockEstimator 369 blockEstimatedMaxChunkFunc BlockEstimator 370 } 371 372 func (s *BucketStore) validate() error { 373 if s.blockSyncConcurrency < minBlockSyncConcurrency { 374 return errBlockSyncConcurrencyNotValid 375 } 376 return nil 377 } 378 379 type noopCache struct{} 380 381 func (noopCache) StorePostings(ulid.ULID, labels.Label, []byte) {} 382 func (noopCache) FetchMultiPostings(_ context.Context, _ ulid.ULID, keys []labels.Label) (map[labels.Label][]byte, []labels.Label) { 383 return map[labels.Label][]byte{}, keys 384 } 385 386 func (noopCache) StoreExpandedPostings(_ ulid.ULID, _ []*labels.Matcher, _ []byte) {} 387 func (noopCache) FetchExpandedPostings(_ context.Context, _ ulid.ULID, _ []*labels.Matcher) ([]byte, bool) { 388 return []byte{}, false 389 } 390 391 func (noopCache) StoreSeries(ulid.ULID, storage.SeriesRef, []byte) {} 392 func (noopCache) FetchMultiSeries(_ context.Context, _ ulid.ULID, ids []storage.SeriesRef) (map[storage.SeriesRef][]byte, []storage.SeriesRef) { 393 return map[storage.SeriesRef][]byte{}, ids 394 } 395 396 // BucketStoreOption are functions that configure BucketStore. 397 type BucketStoreOption func(s *BucketStore) 398 399 // WithLogger sets the BucketStore logger to the one you pass. 400 func WithLogger(logger log.Logger) BucketStoreOption { 401 return func(s *BucketStore) { 402 s.logger = logger 403 } 404 } 405 406 // WithRegistry sets a registry that BucketStore uses to register metrics with. 407 func WithRegistry(reg prometheus.Registerer) BucketStoreOption { 408 return func(s *BucketStore) { 409 s.reg = reg 410 } 411 } 412 413 // WithIndexCache sets a indexCache to use instead of a noopCache. 414 func WithIndexCache(cache storecache.IndexCache) BucketStoreOption { 415 return func(s *BucketStore) { 416 s.indexCache = cache 417 } 418 } 419 420 // WithQueryGate sets a queryGate to use instead of a noopGate. 421 func WithQueryGate(queryGate gate.Gate) BucketStoreOption { 422 return func(s *BucketStore) { 423 s.queryGate = queryGate 424 } 425 } 426 427 // WithChunkPool sets a pool.Bytes to use for chunks. 428 func WithChunkPool(chunkPool pool.Bytes) BucketStoreOption { 429 return func(s *BucketStore) { 430 s.chunkPool = chunkPool 431 } 432 } 433 434 // WithFilterConfig sets a filter which Store uses for filtering metrics based on time. 435 func WithFilterConfig(filter *FilterConfig) BucketStoreOption { 436 return func(s *BucketStore) { 437 s.filterConfig = filter 438 } 439 } 440 441 // WithDebugLogging enables debug logging. 
442 func WithDebugLogging() BucketStoreOption { 443 return func(s *BucketStore) { 444 s.debugLogging = true 445 } 446 } 447 448 func WithChunkHashCalculation(enableChunkHashCalculation bool) BucketStoreOption { 449 return func(s *BucketStore) { 450 s.enableChunkHashCalculation = enableChunkHashCalculation 451 } 452 } 453 454 func WithSeriesBatchSize(seriesBatchSize int) BucketStoreOption { 455 return func(s *BucketStore) { 456 s.seriesBatchSize = seriesBatchSize 457 } 458 } 459 460 func WithBlockEstimatedMaxSeriesFunc(f BlockEstimator) BucketStoreOption { 461 return func(s *BucketStore) { 462 s.blockEstimatedMaxSeriesFunc = f 463 } 464 } 465 466 func WithBlockEstimatedMaxChunkFunc(f BlockEstimator) BucketStoreOption { 467 return func(s *BucketStore) { 468 s.blockEstimatedMaxChunkFunc = f 469 } 470 } 471 472 // NewBucketStore creates a new bucket backed store that implements the store API against 473 // an object store bucket. It is optimized to work against high latency backends. 474 func NewBucketStore( 475 bkt objstore.InstrumentedBucketReader, 476 fetcher block.MetadataFetcher, 477 dir string, 478 chunksLimiterFactory ChunksLimiterFactory, 479 seriesLimiterFactory SeriesLimiterFactory, 480 bytesLimiterFactory BytesLimiterFactory, 481 partitioner Partitioner, 482 blockSyncConcurrency int, 483 enableCompatibilityLabel bool, 484 postingOffsetsInMemSampling int, 485 enableSeriesResponseHints bool, // TODO(pracucci) Thanos 0.12 and below doesn't gracefully handle new fields in SeriesResponse. Drop this flag and always enable hints once we can drop backward compatibility. 486 lazyIndexReaderEnabled bool, 487 lazyIndexReaderIdleTimeout time.Duration, 488 options ...BucketStoreOption, 489 ) (*BucketStore, error) { 490 s := &BucketStore{ 491 logger: log.NewNopLogger(), 492 bkt: bkt, 493 fetcher: fetcher, 494 dir: dir, 495 indexCache: noopCache{}, 496 buffers: sync.Pool{New: func() interface{} { 497 b := make([]byte, 0, initialBufSize) 498 return &b 499 }}, 500 chunkPool: pool.NoopBytes{}, 501 blocks: map[ulid.ULID]*bucketBlock{}, 502 blockSets: map[uint64]*bucketBlockSet{}, 503 blockSyncConcurrency: blockSyncConcurrency, 504 queryGate: gate.NewNoop(), 505 chunksLimiterFactory: chunksLimiterFactory, 506 seriesLimiterFactory: seriesLimiterFactory, 507 bytesLimiterFactory: bytesLimiterFactory, 508 partitioner: partitioner, 509 enableCompatibilityLabel: enableCompatibilityLabel, 510 postingOffsetsInMemSampling: postingOffsetsInMemSampling, 511 enableSeriesResponseHints: enableSeriesResponseHints, 512 enableChunkHashCalculation: enableChunkHashCalculation, 513 seriesBatchSize: SeriesBatchSize, 514 } 515 516 for _, option := range options { 517 option(s) 518 } 519 520 // Depend on the options 521 indexReaderPoolMetrics := indexheader.NewReaderPoolMetrics(extprom.WrapRegistererWithPrefix("thanos_bucket_store_", s.reg)) 522 s.indexReaderPool = indexheader.NewReaderPool(s.logger, lazyIndexReaderEnabled, lazyIndexReaderIdleTimeout, indexReaderPoolMetrics) 523 s.metrics = newBucketStoreMetrics(s.reg) // TODO(metalmatze): Might be possible via Option too 524 525 if err := s.validate(); err != nil { 526 return nil, errors.Wrap(err, "validate config") 527 } 528 529 if dir == "" { 530 return s, nil 531 } 532 533 if err := os.MkdirAll(dir, 0750); err != nil { 534 return nil, errors.Wrap(err, "create dir") 535 } 536 537 return s, nil 538 } 539 540 // Close the store. 
541 func (s *BucketStore) Close() (err error) { 542 s.mtx.Lock() 543 defer s.mtx.Unlock() 544 545 for _, b := range s.blocks { 546 runutil.CloseWithErrCapture(&err, b, "closing Bucket Block") 547 } 548 549 s.indexReaderPool.Close() 550 return err 551 } 552 553 // SyncBlocks synchronizes the stores state with the Bucket bucket. 554 // It will reuse disk space as persistent cache based on s.dir param. 555 func (s *BucketStore) SyncBlocks(ctx context.Context) error { 556 metas, _, metaFetchErr := s.fetcher.Fetch(ctx) 557 // For partial view allow adding new blocks at least. 558 if metaFetchErr != nil && metas == nil { 559 return metaFetchErr 560 } 561 562 var wg sync.WaitGroup 563 blockc := make(chan *metadata.Meta) 564 565 for i := 0; i < s.blockSyncConcurrency; i++ { 566 wg.Add(1) 567 go func() { 568 for meta := range blockc { 569 if err := s.addBlock(ctx, meta); err != nil { 570 continue 571 } 572 } 573 wg.Done() 574 }() 575 } 576 577 for id, meta := range metas { 578 if b := s.getBlock(id); b != nil { 579 continue 580 } 581 select { 582 case <-ctx.Done(): 583 case blockc <- meta: 584 } 585 } 586 587 close(blockc) 588 wg.Wait() 589 590 if metaFetchErr != nil { 591 return metaFetchErr 592 } 593 594 // Drop all blocks that are no longer present in the bucket. 595 for id := range s.blocks { 596 if _, ok := metas[id]; ok { 597 continue 598 } 599 if err := s.removeBlock(id); err != nil { 600 level.Warn(s.logger).Log("msg", "drop of outdated block failed", "block", id, "err", err) 601 s.metrics.blockDropFailures.Inc() 602 } 603 level.Info(s.logger).Log("msg", "dropped outdated block", "block", id) 604 s.metrics.blockDrops.Inc() 605 } 606 607 // Sync advertise labels. 608 var storeLabels labels.Labels 609 s.mtx.Lock() 610 s.advLabelSets = make([]labelpb.ZLabelSet, 0, len(s.advLabelSets)) 611 for _, bs := range s.blockSets { 612 storeLabels = storeLabels[:0] 613 s.advLabelSets = append(s.advLabelSets, labelpb.ZLabelSet{Labels: labelpb.ZLabelsFromPromLabels(append(storeLabels, bs.labels...))}) 614 } 615 sort.Slice(s.advLabelSets, func(i, j int) bool { 616 return strings.Compare(s.advLabelSets[i].String(), s.advLabelSets[j].String()) < 0 617 }) 618 s.mtx.Unlock() 619 return nil 620 } 621 622 // InitialSync perform blocking sync with extra step at the end to delete locally saved blocks that are no longer 623 // present in the bucket. The mismatch of these can only happen between restarts, so we can do that only once per startup. 624 func (s *BucketStore) InitialSync(ctx context.Context) error { 625 if err := s.SyncBlocks(ctx); err != nil { 626 return errors.Wrap(err, "sync block") 627 } 628 629 if s.dir == "" { 630 return nil 631 } 632 633 fis, err := os.ReadDir(s.dir) 634 if err != nil { 635 return errors.Wrap(err, "read dir") 636 } 637 names := make([]string, 0, len(fis)) 638 for _, fi := range fis { 639 names = append(names, fi.Name()) 640 } 641 for _, n := range names { 642 id, ok := block.IsBlockDir(n) 643 if !ok { 644 continue 645 } 646 if b := s.getBlock(id); b != nil { 647 continue 648 } 649 650 // No such block loaded, remove the local dir. 
651 if err := os.RemoveAll(path.Join(s.dir, id.String())); err != nil { 652 level.Warn(s.logger).Log("msg", "failed to remove block which is not needed", "err", err) 653 } 654 } 655 656 return nil 657 } 658 659 func (s *BucketStore) getBlock(id ulid.ULID) *bucketBlock { 660 s.mtx.RLock() 661 defer s.mtx.RUnlock() 662 return s.blocks[id] 663 } 664 665 func (s *BucketStore) addBlock(ctx context.Context, meta *metadata.Meta) (err error) { 666 var dir string 667 if s.dir != "" { 668 dir = filepath.Join(s.dir, meta.ULID.String()) 669 } 670 start := time.Now() 671 672 level.Debug(s.logger).Log("msg", "loading new block", "id", meta.ULID) 673 defer func() { 674 if err != nil { 675 s.metrics.blockLoadFailures.Inc() 676 if dir != "" { 677 if err2 := os.RemoveAll(dir); err2 != nil { 678 level.Warn(s.logger).Log("msg", "failed to remove block we cannot load", "err", err2) 679 } 680 } 681 level.Warn(s.logger).Log("msg", "loading block failed", "elapsed", time.Since(start), "id", meta.ULID, "err", err) 682 } else { 683 level.Info(s.logger).Log("msg", "loaded new block", "elapsed", time.Since(start), "id", meta.ULID) 684 } 685 }() 686 s.metrics.blockLoads.Inc() 687 688 lset := labels.FromMap(meta.Thanos.Labels) 689 h := lset.Hash() 690 691 indexHeaderReader, err := s.indexReaderPool.NewBinaryReader( 692 ctx, 693 s.logger, 694 s.bkt, 695 s.dir, 696 meta.ULID, 697 s.postingOffsetsInMemSampling, 698 ) 699 if err != nil { 700 return errors.Wrap(err, "create index header reader") 701 } 702 defer func() { 703 if err != nil { 704 runutil.CloseWithErrCapture(&err, indexHeaderReader, "index-header") 705 } 706 }() 707 708 b, err := newBucketBlock( 709 ctx, 710 log.With(s.logger, "block", meta.ULID), 711 s.metrics, 712 meta, 713 s.bkt, 714 dir, 715 s.indexCache, 716 s.chunkPool, 717 indexHeaderReader, 718 s.partitioner, 719 s.blockEstimatedMaxSeriesFunc, 720 s.blockEstimatedMaxChunkFunc, 721 ) 722 if err != nil { 723 return errors.Wrap(err, "new bucket block") 724 } 725 defer func() { 726 if err != nil { 727 runutil.CloseWithErrCapture(&err, b, "index-header") 728 } 729 }() 730 731 s.mtx.Lock() 732 defer s.mtx.Unlock() 733 734 sort.Sort(lset) 735 736 set, ok := s.blockSets[h] 737 if !ok { 738 set = newBucketBlockSet(lset) 739 s.blockSets[h] = set 740 } 741 742 if err = set.add(b); err != nil { 743 return errors.Wrap(err, "add block to set") 744 } 745 s.blocks[b.meta.ULID] = b 746 747 s.metrics.blocksLoaded.Inc() 748 s.metrics.lastLoadedBlock.SetToCurrentTime() 749 return nil 750 } 751 752 func (s *BucketStore) removeBlock(id ulid.ULID) error { 753 s.mtx.Lock() 754 b, ok := s.blocks[id] 755 if ok { 756 lset := labels.FromMap(b.meta.Thanos.Labels) 757 s.blockSets[lset.Hash()].remove(id) 758 delete(s.blocks, id) 759 } 760 s.mtx.Unlock() 761 762 if !ok { 763 return nil 764 } 765 766 s.metrics.blocksLoaded.Dec() 767 if err := b.Close(); err != nil { 768 return errors.Wrap(err, "close block") 769 } 770 771 if b.dir == "" { 772 return nil 773 } 774 775 return os.RemoveAll(b.dir) 776 } 777 778 // TimeRange returns the minimum and maximum timestamp of data available in the store. 
779 func (s *BucketStore) TimeRange() (mint, maxt int64) { 780 s.mtx.RLock() 781 defer s.mtx.RUnlock() 782 783 mint = math.MaxInt64 784 maxt = math.MinInt64 785 786 for _, b := range s.blocks { 787 if b.meta.MinTime < mint { 788 mint = b.meta.MinTime 789 } 790 if b.meta.MaxTime > maxt { 791 maxt = b.meta.MaxTime 792 } 793 } 794 795 mint = s.limitMinTime(mint) 796 maxt = s.limitMaxTime(maxt) 797 798 return mint, maxt 799 } 800 801 // TSDBInfos returns a list of infopb.TSDBInfos for blocks in the bucket store. 802 func (s *BucketStore) TSDBInfos() []infopb.TSDBInfo { 803 s.mtx.RLock() 804 defer s.mtx.RUnlock() 805 806 infos := make([]infopb.TSDBInfo, 0, len(s.blocks)) 807 for _, b := range s.blocks { 808 infos = append(infos, infopb.TSDBInfo{ 809 Labels: labelpb.ZLabelSet{ 810 Labels: labelpb.ZLabelsFromPromLabels(labels.FromMap(b.meta.Thanos.Labels)), 811 }, 812 MinTime: b.meta.MinTime, 813 MaxTime: b.meta.MaxTime, 814 }) 815 } 816 817 return infos 818 } 819 820 func (s *BucketStore) LabelSet() []labelpb.ZLabelSet { 821 s.mtx.RLock() 822 labelSets := s.advLabelSets 823 s.mtx.RUnlock() 824 825 if s.enableCompatibilityLabel && len(labelSets) > 0 { 826 labelSets = append(labelSets, labelpb.ZLabelSet{Labels: []labelpb.ZLabel{{Name: CompatibilityTypeLabelName, Value: "store"}}}) 827 } 828 829 return labelSets 830 } 831 832 // Info implements the storepb.StoreServer interface. 833 func (s *BucketStore) Info(context.Context, *storepb.InfoRequest) (*storepb.InfoResponse, error) { 834 mint, maxt := s.TimeRange() 835 res := &storepb.InfoResponse{ 836 StoreType: component.Store.ToProto(), 837 MinTime: mint, 838 MaxTime: maxt, 839 LabelSets: s.LabelSet(), 840 } 841 842 return res, nil 843 } 844 845 func (s *BucketStore) limitMinTime(mint int64) int64 { 846 if s.filterConfig == nil { 847 return mint 848 } 849 850 filterMinTime := s.filterConfig.MinTime.PrometheusTimestamp() 851 852 if mint < filterMinTime { 853 return filterMinTime 854 } 855 856 return mint 857 } 858 859 func (s *BucketStore) limitMaxTime(maxt int64) int64 { 860 if s.filterConfig == nil { 861 return maxt 862 } 863 864 filterMaxTime := s.filterConfig.MaxTime.PrometheusTimestamp() 865 866 if maxt > filterMaxTime { 867 maxt = filterMaxTime 868 } 869 870 return maxt 871 } 872 873 type seriesEntry struct { 874 lset labels.Labels 875 refs []chunks.ChunkRef 876 chks []storepb.AggrChunk 877 } 878 879 // blockSeriesClient is a storepb.Store_SeriesClient for a 880 // single TSDB block in object storage. 881 type blockSeriesClient struct { 882 grpc.ClientStream 883 ctx context.Context 884 logger log.Logger 885 extLset labels.Labels 886 extLsetToRemove map[string]struct{} 887 888 mint int64 889 maxt int64 890 indexr *bucketIndexReader 891 chunkr *bucketChunkReader 892 loadAggregates []storepb.Aggr 893 chunksLimiter ChunksLimiter 894 bytesLimiter BytesLimiter 895 896 skipChunks bool 897 shardMatcher *storepb.ShardMatcher 898 calculateChunkHash bool 899 chunkFetchDuration prometheus.Histogram 900 901 // Internal state. 
902 i uint64 903 postings []storage.SeriesRef 904 chkMetas []chunks.Meta 905 lset labels.Labels 906 symbolizedLset []symbolizedLabel 907 entries []seriesEntry 908 hasMorePostings bool 909 batchSize int 910 } 911 912 func newBlockSeriesClient( 913 ctx context.Context, 914 logger log.Logger, 915 b *bucketBlock, 916 req *storepb.SeriesRequest, 917 limiter ChunksLimiter, 918 bytesLimiter BytesLimiter, 919 shardMatcher *storepb.ShardMatcher, 920 calculateChunkHash bool, 921 batchSize int, 922 chunkFetchDuration prometheus.Histogram, 923 extLsetToRemove map[string]struct{}, 924 ) *blockSeriesClient { 925 var chunkr *bucketChunkReader 926 if !req.SkipChunks { 927 chunkr = b.chunkReader() 928 } 929 930 extLset := b.extLset 931 if extLsetToRemove != nil { 932 extLset = rmLabels(extLset.Copy(), extLsetToRemove) 933 } 934 935 return &blockSeriesClient{ 936 ctx: ctx, 937 logger: logger, 938 extLset: extLset, 939 extLsetToRemove: extLsetToRemove, 940 941 mint: req.MinTime, 942 maxt: req.MaxTime, 943 indexr: b.indexReader(), 944 chunkr: chunkr, 945 chunksLimiter: limiter, 946 bytesLimiter: bytesLimiter, 947 skipChunks: req.SkipChunks, 948 chunkFetchDuration: chunkFetchDuration, 949 950 loadAggregates: req.Aggregates, 951 shardMatcher: shardMatcher, 952 calculateChunkHash: calculateChunkHash, 953 hasMorePostings: true, 954 batchSize: batchSize, 955 } 956 } 957 958 func (b *blockSeriesClient) Close() { 959 if !b.skipChunks { 960 runutil.CloseWithLogOnErr(b.logger, b.chunkr, "series block") 961 } 962 963 runutil.CloseWithLogOnErr(b.logger, b.indexr, "series block") 964 } 965 966 func (b *blockSeriesClient) MergeStats(stats *queryStats) *queryStats { 967 stats = stats.merge(b.indexr.stats) 968 if !b.skipChunks { 969 stats = stats.merge(b.chunkr.stats) 970 } 971 return stats 972 } 973 974 type sortedMatchers []*labels.Matcher 975 976 func newSortedMatchers(matchers []*labels.Matcher) sortedMatchers { 977 sort.Slice(matchers, func(i, j int) bool { 978 if matchers[i].Type == matchers[j].Type { 979 if matchers[i].Name == matchers[j].Name { 980 return matchers[i].Value < matchers[j].Value 981 } 982 return matchers[i].Name < matchers[j].Name 983 } 984 return matchers[i].Type < matchers[j].Type 985 }) 986 987 return matchers 988 } 989 990 func (b *blockSeriesClient) ExpandPostings( 991 matchers sortedMatchers, 992 seriesLimiter SeriesLimiter, 993 ) error { 994 ps, err := b.indexr.ExpandedPostings(b.ctx, matchers, b.bytesLimiter) 995 if err != nil { 996 return errors.Wrap(err, "expanded matching posting") 997 } 998 999 if len(ps) == 0 { 1000 return nil 1001 } 1002 1003 if err := seriesLimiter.Reserve(uint64(len(ps))); err != nil { 1004 return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded series limit: %s", err) 1005 } 1006 1007 b.postings = ps 1008 if b.batchSize > len(ps) { 1009 b.batchSize = len(ps) 1010 } 1011 b.entries = make([]seriesEntry, 0, b.batchSize) 1012 return nil 1013 } 1014 1015 func (b *blockSeriesClient) Recv() (*storepb.SeriesResponse, error) { 1016 for len(b.entries) == 0 && b.hasMorePostings { 1017 if err := b.nextBatch(); err != nil { 1018 return nil, err 1019 } 1020 } 1021 1022 if len(b.entries) == 0 { 1023 if b.chunkr != nil { 1024 b.chunkFetchDuration.Observe(b.chunkr.stats.ChunksFetchDurationSum.Seconds()) 1025 } 1026 return nil, io.EOF 1027 } 1028 1029 next := b.entries[0] 1030 b.entries = b.entries[1:] 1031 1032 return storepb.NewSeriesResponse(&storepb.Series{ 1033 Labels: labelpb.ZLabelsFromPromLabels(next.lset), 1034 Chunks: next.chks, 1035 }), nil 1036 } 1037 1038 func (b 
*blockSeriesClient) nextBatch() error { 1039 start := b.i 1040 end := start + SeriesBatchSize 1041 if end > uint64(len(b.postings)) { 1042 end = uint64(len(b.postings)) 1043 } 1044 b.i = end 1045 1046 postingsBatch := b.postings[start:end] 1047 if len(postingsBatch) == 0 { 1048 b.hasMorePostings = false 1049 return nil 1050 } 1051 1052 b.indexr.reset() 1053 if !b.skipChunks { 1054 b.chunkr.reset() 1055 } 1056 1057 if err := b.indexr.PreloadSeries(b.ctx, postingsBatch, b.bytesLimiter); err != nil { 1058 return errors.Wrap(err, "preload series") 1059 } 1060 1061 b.entries = b.entries[:0] 1062 for i := 0; i < len(postingsBatch); i++ { 1063 if err := b.ctx.Err(); err != nil { 1064 return err 1065 } 1066 ok, err := b.indexr.LoadSeriesForTime(postingsBatch[i], &b.symbolizedLset, &b.chkMetas, b.skipChunks, b.mint, b.maxt) 1067 if err != nil { 1068 return errors.Wrap(err, "read series") 1069 } 1070 if !ok { 1071 continue 1072 } 1073 1074 if err := b.indexr.LookupLabelsSymbols(b.symbolizedLset, &b.lset); err != nil { 1075 return errors.Wrap(err, "Lookup labels symbols") 1076 } 1077 1078 completeLabelset := labelpb.ExtendSortedLabels(b.lset, b.extLset) 1079 if b.extLsetToRemove != nil { 1080 completeLabelset = rmLabels(completeLabelset, b.extLsetToRemove) 1081 } 1082 1083 if !b.shardMatcher.MatchesLabels(completeLabelset) { 1084 continue 1085 } 1086 1087 s := seriesEntry{lset: completeLabelset} 1088 if b.skipChunks { 1089 b.entries = append(b.entries, s) 1090 continue 1091 } 1092 1093 // Schedule loading chunks. 1094 s.refs = make([]chunks.ChunkRef, 0, len(b.chkMetas)) 1095 s.chks = make([]storepb.AggrChunk, 0, len(b.chkMetas)) 1096 1097 for j, meta := range b.chkMetas { 1098 if err := b.chunkr.addLoad(meta.Ref, len(b.entries), j); err != nil { 1099 return errors.Wrap(err, "add chunk load") 1100 } 1101 s.chks = append(s.chks, storepb.AggrChunk{ 1102 MinTime: meta.MinTime, 1103 MaxTime: meta.MaxTime, 1104 }) 1105 s.refs = append(s.refs, meta.Ref) 1106 } 1107 1108 // Ensure sample limit through chunksLimiter if we return chunks. 
1109 if err := b.chunksLimiter.Reserve(uint64(len(b.chkMetas))); err != nil { 1110 return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded chunks limit: %s", err) 1111 } 1112 1113 b.entries = append(b.entries, s) 1114 } 1115 1116 if !b.skipChunks { 1117 if err := b.chunkr.load(b.ctx, b.entries, b.loadAggregates, b.calculateChunkHash, b.bytesLimiter); err != nil { 1118 return errors.Wrap(err, "load chunks") 1119 } 1120 } 1121 1122 return nil 1123 } 1124 1125 func populateChunk(out *storepb.AggrChunk, in chunkenc.Chunk, aggrs []storepb.Aggr, save func([]byte) ([]byte, error), calculateChecksum bool) error { 1126 hasher := hashPool.Get().(hash.Hash64) 1127 defer hashPool.Put(hasher) 1128 1129 if in.Encoding() == chunkenc.EncXOR || in.Encoding() == chunkenc.EncHistogram { 1130 b, err := save(in.Bytes()) 1131 if err != nil { 1132 return err 1133 } 1134 out.Raw = &storepb.Chunk{ 1135 Data: b, 1136 Type: storepb.Chunk_Encoding(in.Encoding() - 1), 1137 Hash: hashChunk(hasher, b, calculateChecksum), 1138 } 1139 return nil 1140 } 1141 1142 if in.Encoding() != downsample.ChunkEncAggr { 1143 return errors.Errorf("unsupported chunk encoding %d", in.Encoding()) 1144 } 1145 1146 ac := downsample.AggrChunk(in.Bytes()) 1147 1148 for _, at := range aggrs { 1149 switch at { 1150 case storepb.Aggr_COUNT: 1151 x, err := ac.Get(downsample.AggrCount) 1152 if err != nil { 1153 return errors.Errorf("aggregate %s does not exist", downsample.AggrCount) 1154 } 1155 b, err := save(x.Bytes()) 1156 if err != nil { 1157 return err 1158 } 1159 out.Count = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)} 1160 case storepb.Aggr_SUM: 1161 x, err := ac.Get(downsample.AggrSum) 1162 if err != nil { 1163 return errors.Errorf("aggregate %s does not exist", downsample.AggrSum) 1164 } 1165 b, err := save(x.Bytes()) 1166 if err != nil { 1167 return err 1168 } 1169 out.Sum = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)} 1170 case storepb.Aggr_MIN: 1171 x, err := ac.Get(downsample.AggrMin) 1172 if err != nil { 1173 return errors.Errorf("aggregate %s does not exist", downsample.AggrMin) 1174 } 1175 b, err := save(x.Bytes()) 1176 if err != nil { 1177 return err 1178 } 1179 out.Min = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)} 1180 case storepb.Aggr_MAX: 1181 x, err := ac.Get(downsample.AggrMax) 1182 if err != nil { 1183 return errors.Errorf("aggregate %s does not exist", downsample.AggrMax) 1184 } 1185 b, err := save(x.Bytes()) 1186 if err != nil { 1187 return err 1188 } 1189 out.Max = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)} 1190 case storepb.Aggr_COUNTER: 1191 x, err := ac.Get(downsample.AggrCounter) 1192 if err != nil { 1193 return errors.Errorf("aggregate %s does not exist", downsample.AggrCounter) 1194 } 1195 b, err := save(x.Bytes()) 1196 if err != nil { 1197 return err 1198 } 1199 out.Counter = &storepb.Chunk{Type: storepb.Chunk_XOR, Data: b, Hash: hashChunk(hasher, b, calculateChecksum)} 1200 } 1201 } 1202 return nil 1203 } 1204 1205 func hashChunk(hasher hash.Hash64, b []byte, doHash bool) uint64 { 1206 if !doHash { 1207 return 0 1208 } 1209 hasher.Reset() 1210 // Write never returns an error on the hasher implementation 1211 _, _ = hasher.Write(b) 1212 return hasher.Sum64() 1213 } 1214 1215 // debugFoundBlockSetOverview logs on debug level what exactly blocks we used for query in terms of 1216 // labels and resolution. 
This is important because we allow mixed resolution results, so it is quite crucial 1217 // to be aware what exactly resolution we see on query. 1218 // TODO(bplotka): Consider adding resolution label to all results to propagate that info to UI and Query API. 1219 func debugFoundBlockSetOverview(logger log.Logger, mint, maxt, maxResolutionMillis int64, lset labels.Labels, bs []*bucketBlock) { 1220 if len(bs) == 0 { 1221 level.Debug(logger).Log("msg", "No block found", "mint", mint, "maxt", maxt, "lset", lset.String()) 1222 return 1223 } 1224 1225 var ( 1226 parts []string 1227 currRes = int64(-1) 1228 currMin, currMax int64 1229 ) 1230 for _, b := range bs { 1231 if currRes == b.meta.Thanos.Downsample.Resolution { 1232 currMax = b.meta.MaxTime 1233 continue 1234 } 1235 1236 if currRes != -1 { 1237 parts = append(parts, fmt.Sprintf("Range: %d-%d Resolution: %d", currMin, currMax, currRes)) 1238 } 1239 1240 currRes = b.meta.Thanos.Downsample.Resolution 1241 currMin = b.meta.MinTime 1242 currMax = b.meta.MaxTime 1243 } 1244 1245 parts = append(parts, fmt.Sprintf("Range: %d-%d Resolution: %d", currMin, currMax, currRes)) 1246 1247 level.Debug(logger).Log("msg", "Blocks source resolutions", "blocks", len(bs), "Maximum Resolution", maxResolutionMillis, "mint", mint, "maxt", maxt, "lset", lset.String(), "spans", strings.Join(parts, "\n")) 1248 } 1249 1250 // Series implements the storepb.StoreServer interface. 1251 func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store_SeriesServer) (err error) { 1252 srv := newFlushableServer(seriesSrv, sortingStrategyNone) 1253 1254 if s.queryGate != nil { 1255 tracing.DoInSpan(srv.Context(), "store_query_gate_ismyturn", func(ctx context.Context) { 1256 err = s.queryGate.Start(srv.Context()) 1257 }) 1258 if err != nil { 1259 return errors.Wrapf(err, "failed to wait for turn") 1260 } 1261 1262 defer s.queryGate.Done() 1263 } 1264 1265 tenant, _ := tenancy.GetTenantFromGRPCMetadata(srv.Context()) 1266 level.Debug(s.logger).Log("msg", "Tenant for Series request", "tenant", tenant) 1267 1268 matchers, err := storepb.MatchersToPromMatchers(req.Matchers...) 1269 if err != nil { 1270 return status.Error(codes.InvalidArgument, err.Error()) 1271 } 1272 req.MinTime = s.limitMinTime(req.MinTime) 1273 req.MaxTime = s.limitMaxTime(req.MaxTime) 1274 1275 var ( 1276 bytesLimiter = s.bytesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("bytes")) 1277 ctx = srv.Context() 1278 stats = &queryStats{} 1279 respSets []respSet 1280 mtx sync.Mutex 1281 g, gctx = errgroup.WithContext(ctx) 1282 resHints = &hintspb.SeriesResponseHints{} 1283 reqBlockMatchers []*labels.Matcher 1284 chunksLimiter = s.chunksLimiterFactory(s.metrics.queriesDropped.WithLabelValues("chunks")) 1285 seriesLimiter = s.seriesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("series")) 1286 1287 queryStatsEnabled = false 1288 ) 1289 1290 if req.Hints != nil { 1291 reqHints := &hintspb.SeriesRequestHints{} 1292 if err := types.UnmarshalAny(req.Hints, reqHints); err != nil { 1293 return status.Error(codes.InvalidArgument, errors.Wrap(err, "unmarshal series request hints").Error()) 1294 } 1295 queryStatsEnabled = reqHints.EnableQueryStats 1296 1297 reqBlockMatchers, err = storepb.MatchersToPromMatchers(reqHints.BlockMatchers...) 
1298 if err != nil { 1299 return status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request hints labels matchers").Error()) 1300 } 1301 } 1302 1303 var extLsetToRemove map[string]struct{} 1304 if len(req.WithoutReplicaLabels) > 0 { 1305 extLsetToRemove = make(map[string]struct{}) 1306 for _, l := range req.WithoutReplicaLabels { 1307 extLsetToRemove[l] = struct{}{} 1308 } 1309 } 1310 1311 s.mtx.RLock() 1312 for _, bs := range s.blockSets { 1313 blockMatchers, ok := bs.labelMatchers(matchers...) 1314 if !ok { 1315 continue 1316 } 1317 1318 sortedBlockMatchers := newSortedMatchers(blockMatchers) 1319 1320 blocks := bs.getFor(req.MinTime, req.MaxTime, req.MaxResolutionWindow, reqBlockMatchers) 1321 1322 if s.debugLogging { 1323 debugFoundBlockSetOverview(s.logger, req.MinTime, req.MaxTime, req.MaxResolutionWindow, bs.labels, blocks) 1324 } 1325 1326 for _, b := range blocks { 1327 blk := b 1328 gctx := gctx 1329 1330 if s.enableSeriesResponseHints { 1331 // Keep track of queried blocks. 1332 resHints.AddQueriedBlock(blk.meta.ULID) 1333 } 1334 1335 shardMatcher := req.ShardInfo.Matcher(&s.buffers) 1336 1337 blockClient := newBlockSeriesClient( 1338 srv.Context(), 1339 s.logger, 1340 blk, 1341 req, 1342 chunksLimiter, 1343 bytesLimiter, 1344 shardMatcher, 1345 s.enableChunkHashCalculation, 1346 s.seriesBatchSize, 1347 s.metrics.chunkFetchDuration, 1348 extLsetToRemove, 1349 ) 1350 1351 defer blockClient.Close() 1352 1353 g.Go(func() error { 1354 1355 span, _ := tracing.StartSpan(gctx, "bucket_store_block_series", tracing.Tags{ 1356 "block.id": blk.meta.ULID, 1357 "block.mint": blk.meta.MinTime, 1358 "block.maxt": blk.meta.MaxTime, 1359 "block.resolution": blk.meta.Thanos.Downsample.Resolution, 1360 }) 1361 1362 onClose := func() { 1363 mtx.Lock() 1364 stats = blockClient.MergeStats(stats) 1365 mtx.Unlock() 1366 } 1367 1368 if err := blockClient.ExpandPostings(sortedBlockMatchers, seriesLimiter); err != nil { 1369 onClose() 1370 span.Finish() 1371 return errors.Wrapf(err, "fetch postings for block %s", blk.meta.ULID) 1372 } 1373 1374 resp := newEagerRespSet( 1375 srv.Context(), 1376 span, 1377 10*time.Minute, 1378 blk.meta.ULID.String(), 1379 []labels.Labels{blk.extLset}, 1380 onClose, 1381 blockClient, 1382 shardMatcher, 1383 false, 1384 s.metrics.emptyPostingCount, 1385 nil, 1386 ) 1387 1388 mtx.Lock() 1389 respSets = append(respSets, resp) 1390 mtx.Unlock() 1391 1392 return nil 1393 }) 1394 } 1395 } 1396 1397 s.mtx.RUnlock() 1398 1399 defer func() { 1400 s.metrics.seriesDataTouched.WithLabelValues("postings").Observe(float64(stats.postingsTouched)) 1401 s.metrics.seriesDataFetched.WithLabelValues("postings").Observe(float64(stats.postingsFetched)) 1402 s.metrics.seriesDataSizeTouched.WithLabelValues("postings").Observe(float64(stats.PostingsTouchedSizeSum)) 1403 s.metrics.seriesDataSizeFetched.WithLabelValues("postings").Observe(float64(stats.PostingsFetchedSizeSum)) 1404 s.metrics.seriesDataTouched.WithLabelValues("series").Observe(float64(stats.seriesTouched)) 1405 s.metrics.seriesDataFetched.WithLabelValues("series").Observe(float64(stats.seriesFetched)) 1406 s.metrics.seriesDataSizeTouched.WithLabelValues("series").Observe(float64(stats.SeriesTouchedSizeSum)) 1407 s.metrics.seriesDataSizeFetched.WithLabelValues("series").Observe(float64(stats.SeriesFetchedSizeSum)) 1408 s.metrics.seriesDataTouched.WithLabelValues("chunks").Observe(float64(stats.chunksTouched)) 1409 s.metrics.seriesDataFetched.WithLabelValues("chunks").Observe(float64(stats.chunksFetched)) 1410 
s.metrics.seriesDataSizeTouched.WithLabelValues("chunks").Observe(float64(stats.ChunksTouchedSizeSum)) 1411 s.metrics.seriesDataSizeFetched.WithLabelValues("chunks").Observe(float64(stats.ChunksFetchedSizeSum)) 1412 s.metrics.resultSeriesCount.Observe(float64(stats.mergedSeriesCount)) 1413 s.metrics.cachedPostingsCompressions.WithLabelValues(labelEncode).Add(float64(stats.cachedPostingsCompressions)) 1414 s.metrics.cachedPostingsCompressions.WithLabelValues(labelDecode).Add(float64(stats.cachedPostingsDecompressions)) 1415 s.metrics.cachedPostingsCompressionErrors.WithLabelValues(labelEncode).Add(float64(stats.cachedPostingsCompressionErrors)) 1416 s.metrics.cachedPostingsCompressionErrors.WithLabelValues(labelDecode).Add(float64(stats.cachedPostingsDecompressionErrors)) 1417 s.metrics.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelEncode).Add(stats.CachedPostingsCompressionTimeSum.Seconds()) 1418 s.metrics.cachedPostingsCompressionTimeSeconds.WithLabelValues(labelDecode).Add(stats.CachedPostingsDecompressionTimeSum.Seconds()) 1419 s.metrics.cachedPostingsOriginalSizeBytes.Add(float64(stats.CachedPostingsOriginalSizeSum)) 1420 s.metrics.cachedPostingsCompressedSizeBytes.Add(float64(stats.CachedPostingsCompressedSizeSum)) 1421 s.metrics.postingsSizeBytes.Observe(float64(int(stats.PostingsFetchedSizeSum) + int(stats.PostingsTouchedSizeSum))) 1422 1423 level.Debug(s.logger).Log("msg", "stats query processed", 1424 "request", req, 1425 "stats", fmt.Sprintf("%+v", stats), "err", err) 1426 }() 1427 1428 // Concurrently get data from all blocks. 1429 { 1430 begin := time.Now() 1431 tracing.DoInSpan(ctx, "bucket_store_preload_all", func(_ context.Context) { 1432 err = g.Wait() 1433 }) 1434 if err != nil { 1435 code := codes.Aborted 1436 if s, ok := status.FromError(errors.Cause(err)); ok { 1437 code = s.Code() 1438 } 1439 return status.Error(code, err.Error()) 1440 } 1441 stats.blocksQueried = len(respSets) 1442 stats.GetAllDuration = time.Since(begin) 1443 s.metrics.seriesGetAllDuration.Observe(stats.GetAllDuration.Seconds()) 1444 s.metrics.seriesBlocksQueried.Observe(float64(stats.blocksQueried)) 1445 } 1446 1447 // Merge the sub-results from each selected block. 1448 tracing.DoInSpan(ctx, "bucket_store_merge_all", func(ctx context.Context) { 1449 defer func() { 1450 for _, resp := range respSets { 1451 resp.Close() 1452 } 1453 }() 1454 begin := time.Now() 1455 set := NewDedupResponseHeap(NewProxyResponseHeap(respSets...)) 1456 for set.Next() { 1457 at := set.At() 1458 warn := at.GetWarning() 1459 if warn != "" { 1460 // TODO(fpetkovski): Consider deprecating string based warnings in favor of a 1461 // separate protobuf message containing the grpc code and 1462 // a human readable error message. 
1463 err = status.Error(storepb.GRPCCodeFromWarn(warn), at.GetWarning()) 1464 return 1465 } 1466 1467 series := at.GetSeries() 1468 if series != nil { 1469 stats.mergedSeriesCount++ 1470 if !req.SkipChunks { 1471 stats.mergedChunksCount += len(series.Chunks) 1472 s.metrics.chunkSizeBytes.Observe(float64(chunksSize(series.Chunks))) 1473 } 1474 } 1475 if err = srv.Send(at); err != nil { 1476 err = status.Error(codes.Unknown, errors.Wrap(err, "send series response").Error()) 1477 return 1478 } 1479 } 1480 stats.MergeDuration = time.Since(begin) 1481 s.metrics.seriesMergeDuration.Observe(stats.MergeDuration.Seconds()) 1482 1483 err = nil 1484 }) 1485 if err != nil { 1486 return err 1487 } 1488 1489 if s.enableSeriesResponseHints { 1490 var anyHints *types.Any 1491 1492 if queryStatsEnabled { 1493 resHints.QueryStats = stats.toHints() 1494 } 1495 if anyHints, err = types.MarshalAny(resHints); err != nil { 1496 err = status.Error(codes.Unknown, errors.Wrap(err, "marshal series response hints").Error()) 1497 return 1498 } 1499 1500 if err = srv.Send(storepb.NewHintsSeriesResponse(anyHints)); err != nil { 1501 err = status.Error(codes.Unknown, errors.Wrap(err, "send series response hints").Error()) 1502 return 1503 } 1504 } 1505 1506 if err != nil { 1507 return err 1508 } 1509 return srv.Flush() 1510 } 1511 1512 func chunksSize(chks []storepb.AggrChunk) (size int) { 1513 for _, chk := range chks { 1514 size += chk.Size() // This gets the encoded proto size. 1515 } 1516 return size 1517 } 1518 1519 // LabelNames implements the storepb.StoreServer interface. 1520 func (s *BucketStore) LabelNames(ctx context.Context, req *storepb.LabelNamesRequest) (*storepb.LabelNamesResponse, error) { 1521 reqSeriesMatchers, err := storepb.MatchersToPromMatchers(req.Matchers...) 1522 if err != nil { 1523 return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request labels matchers").Error()) 1524 } 1525 1526 tenant, _ := tenancy.GetTenantFromGRPCMetadata(ctx) 1527 level.Debug(s.logger).Log("msg", "Tenant for LabelNames request", "tenant", tenant) 1528 1529 resHints := &hintspb.LabelNamesResponseHints{} 1530 1531 var reqBlockMatchers []*labels.Matcher 1532 if req.Hints != nil { 1533 reqHints := &hintspb.LabelNamesRequestHints{} 1534 err := types.UnmarshalAny(req.Hints, reqHints) 1535 if err != nil { 1536 return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "unmarshal label names request hints").Error()) 1537 } 1538 1539 reqBlockMatchers, err = storepb.MatchersToPromMatchers(reqHints.BlockMatchers...) 1540 if err != nil { 1541 return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request hints labels matchers").Error()) 1542 } 1543 } 1544 1545 g, gctx := errgroup.WithContext(ctx) 1546 1547 s.mtx.RLock() 1548 1549 var mtx sync.Mutex 1550 var sets [][]string 1551 var seriesLimiter = s.seriesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("series")) 1552 var bytesLimiter = s.bytesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("bytes")) 1553 1554 for _, b := range s.blocks { 1555 b := b 1556 gctx := gctx 1557 1558 if !b.overlapsClosedInterval(req.Start, req.End) { 1559 continue 1560 } 1561 if len(reqBlockMatchers) > 0 && !b.matchRelabelLabels(reqBlockMatchers) { 1562 continue 1563 } 1564 // Filter external labels from matchers. 
1565 reqSeriesMatchersNoExtLabels, ok := b.FilterExtLabelsMatchers(reqSeriesMatchers) 1566 if !ok { 1567 continue 1568 } 1569 1570 sortedReqSeriesMatchersNoExtLabels := newSortedMatchers(reqSeriesMatchersNoExtLabels) 1571 1572 resHints.AddQueriedBlock(b.meta.ULID) 1573 1574 indexr := b.indexReader() 1575 1576 g.Go(func() error { 1577 span, newCtx := tracing.StartSpan(gctx, "bucket_store_block_series", tracing.Tags{ 1578 "block.id": b.meta.ULID, 1579 "block.mint": b.meta.MinTime, 1580 "block.maxt": b.meta.MaxTime, 1581 "block.resolution": b.meta.Thanos.Downsample.Resolution, 1582 }) 1583 defer span.Finish() 1584 defer runutil.CloseWithLogOnErr(s.logger, indexr, "label names") 1585 1586 var result []string 1587 if len(reqSeriesMatchersNoExtLabels) == 0 { 1588 // Do it via index reader to have pending reader registered correctly. 1589 // LabelNames are already sorted. 1590 res, err := indexr.block.indexHeaderReader.LabelNames() 1591 if err != nil { 1592 return errors.Wrapf(err, "label names for block %s", b.meta.ULID) 1593 } 1594 1595 // Add a set for the external labels as well. 1596 // We're not adding them directly to refs because there could be duplicates. 1597 // b.extLset is already sorted by label name, no need to sort it again. 1598 extRes := make([]string, 0, len(b.extLset)) 1599 for _, l := range b.extLset { 1600 extRes = append(extRes, l.Name) 1601 } 1602 1603 result = strutil.MergeSlices(res, extRes) 1604 } else { 1605 seriesReq := &storepb.SeriesRequest{ 1606 MinTime: req.Start, 1607 MaxTime: req.End, 1608 SkipChunks: true, 1609 } 1610 blockClient := newBlockSeriesClient( 1611 newCtx, 1612 s.logger, 1613 b, 1614 seriesReq, 1615 nil, 1616 bytesLimiter, 1617 nil, 1618 true, 1619 SeriesBatchSize, 1620 s.metrics.chunkFetchDuration, 1621 nil, 1622 ) 1623 defer blockClient.Close() 1624 1625 if err := blockClient.ExpandPostings( 1626 sortedReqSeriesMatchersNoExtLabels, 1627 seriesLimiter, 1628 ); err != nil { 1629 return err 1630 } 1631 1632 // Extract label names from all series. Many label names will be the same, so we need to deduplicate them. 1633 // Note that label names will already include external labels (passed to blockSeries), so we don't need 1634 // to add them again. 
1635 labelNames := map[string]struct{}{} 1636 for { 1637 ls, err := blockClient.Recv() 1638 if err == io.EOF { 1639 break 1640 } 1641 if err != nil { 1642 return errors.Wrapf(err, "iterate series for block %s", b.meta.ULID) 1643 } 1644 1645 if ls.GetWarning() != "" { 1646 return errors.Wrapf(errors.New(ls.GetWarning()), "iterate series for block %s", b.meta.ULID) 1647 } 1648 if ls.GetSeries() == nil { 1649 continue 1650 } 1651 for _, l := range ls.GetSeries().Labels { 1652 labelNames[l.Name] = struct{}{} 1653 } 1654 } 1655 1656 result = make([]string, 0, len(labelNames)) 1657 for n := range labelNames { 1658 result = append(result, n) 1659 } 1660 sort.Strings(result) 1661 } 1662 1663 if len(result) > 0 { 1664 mtx.Lock() 1665 sets = append(sets, result) 1666 mtx.Unlock() 1667 } 1668 1669 return nil 1670 }) 1671 } 1672 1673 s.mtx.RUnlock() 1674 1675 if err := g.Wait(); err != nil { 1676 code := codes.Internal 1677 if s, ok := status.FromError(errors.Cause(err)); ok { 1678 code = s.Code() 1679 } 1680 return nil, status.Error(code, err.Error()) 1681 } 1682 1683 anyHints, err := types.MarshalAny(resHints) 1684 if err != nil { 1685 return nil, status.Error(codes.Unknown, errors.Wrap(err, "marshal label names response hints").Error()) 1686 } 1687 1688 return &storepb.LabelNamesResponse{ 1689 Names: strutil.MergeSlices(sets...), 1690 Hints: anyHints, 1691 }, nil 1692 } 1693 1694 func (b *bucketBlock) FilterExtLabelsMatchers(matchers []*labels.Matcher) ([]*labels.Matcher, bool) { 1695 // We filter external labels from matchers so we won't try to match series on them. 1696 var result []*labels.Matcher 1697 for _, m := range matchers { 1698 // Get value of external label from block. 1699 v := b.extLset.Get(m.Name) 1700 // If value is empty string the matcher is a valid one since it's not part of external labels. 1701 if v == "" { 1702 result = append(result, m) 1703 } else if v != "" && v != m.Value { 1704 // If matcher is external label but value is different we don't want to look in block anyway. 1705 return []*labels.Matcher{}, false 1706 } 1707 } 1708 1709 return result, true 1710 } 1711 1712 // LabelValues implements the storepb.StoreServer interface. 1713 func (s *BucketStore) LabelValues(ctx context.Context, req *storepb.LabelValuesRequest) (*storepb.LabelValuesResponse, error) { 1714 reqSeriesMatchers, err := storepb.MatchersToPromMatchers(req.Matchers...) 1715 if err != nil { 1716 return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request labels matchers").Error()) 1717 } 1718 1719 tenant, _ := tenancy.GetTenantFromGRPCMetadata(ctx) 1720 level.Debug(s.logger).Log("msg", "Tenant for LabelValues request", "tenant", tenant) 1721 1722 resHints := &hintspb.LabelValuesResponseHints{} 1723 1724 g, gctx := errgroup.WithContext(ctx) 1725 1726 var reqBlockMatchers []*labels.Matcher 1727 if req.Hints != nil { 1728 reqHints := &hintspb.LabelValuesRequestHints{} 1729 err := types.UnmarshalAny(req.Hints, reqHints) 1730 if err != nil { 1731 return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "unmarshal label values request hints").Error()) 1732 } 1733 1734 reqBlockMatchers, err = storepb.MatchersToPromMatchers(reqHints.BlockMatchers...) 
1735 if err != nil { 1736 return nil, status.Error(codes.InvalidArgument, errors.Wrap(err, "translate request hints labels matchers").Error()) 1737 } 1738 } 1739 1740 s.mtx.RLock() 1741 1742 var mtx sync.Mutex 1743 var sets [][]string 1744 var seriesLimiter = s.seriesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("series")) 1745 var bytesLimiter = s.bytesLimiterFactory(s.metrics.queriesDropped.WithLabelValues("bytes")) 1746 1747 for _, b := range s.blocks { 1748 b := b 1749 1750 if !b.overlapsClosedInterval(req.Start, req.End) { 1751 continue 1752 } 1753 if len(reqBlockMatchers) > 0 && !b.matchRelabelLabels(reqBlockMatchers) { 1754 continue 1755 } 1756 // Filter external labels from matchers. 1757 reqSeriesMatchersNoExtLabels, ok := b.FilterExtLabelsMatchers(reqSeriesMatchers) 1758 if !ok { 1759 continue 1760 } 1761 1762 // If we have series matchers, add <labelName> != "" matcher, to only select series that have given label name. 1763 if len(reqSeriesMatchersNoExtLabels) > 0 { 1764 m, err := labels.NewMatcher(labels.MatchNotEqual, req.Label, "") 1765 if err != nil { 1766 return nil, status.Error(codes.InvalidArgument, err.Error()) 1767 } 1768 1769 reqSeriesMatchersNoExtLabels = append(reqSeriesMatchersNoExtLabels, m) 1770 } 1771 1772 sortedReqSeriesMatchersNoExtLabels := newSortedMatchers(reqSeriesMatchersNoExtLabels) 1773 1774 resHints.AddQueriedBlock(b.meta.ULID) 1775 1776 indexr := b.indexReader() 1777 g.Go(func() error { 1778 span, newCtx := tracing.StartSpan(gctx, "bucket_store_block_series", tracing.Tags{ 1779 "block.id": b.meta.ULID, 1780 "block.mint": b.meta.MinTime, 1781 "block.maxt": b.meta.MaxTime, 1782 "block.resolution": b.meta.Thanos.Downsample.Resolution, 1783 }) 1784 defer span.Finish() 1785 defer runutil.CloseWithLogOnErr(s.logger, indexr, "label values") 1786 1787 var result []string 1788 if len(reqSeriesMatchersNoExtLabels) == 0 { 1789 // Do it via index reader to have pending reader registered correctly. 1790 res, err := indexr.block.indexHeaderReader.LabelValues(req.Label) 1791 if err != nil { 1792 return errors.Wrapf(err, "index header label values for block %s", b.meta.ULID) 1793 } 1794 1795 // Add the external label value as well. 1796 if extLabelValue := b.extLset.Get(req.Label); extLabelValue != "" { 1797 res = strutil.MergeSlices(res, []string{extLabelValue}) 1798 } 1799 result = res 1800 } else { 1801 seriesReq := &storepb.SeriesRequest{ 1802 MinTime: req.Start, 1803 MaxTime: req.End, 1804 SkipChunks: true, 1805 } 1806 blockClient := newBlockSeriesClient( 1807 newCtx, 1808 s.logger, 1809 b, 1810 seriesReq, 1811 nil, 1812 bytesLimiter, 1813 nil, 1814 true, 1815 SeriesBatchSize, 1816 s.metrics.chunkFetchDuration, 1817 nil, 1818 ) 1819 defer blockClient.Close() 1820 1821 if err := blockClient.ExpandPostings( 1822 sortedReqSeriesMatchersNoExtLabels, 1823 seriesLimiter, 1824 ); err != nil { 1825 return err 1826 } 1827 1828 // Extract given label's value from all series and deduplicate them. 1829 // We don't need to deal with external labels, since they are already added by blockSeries. 
1830 values := map[string]struct{}{} 1831 for { 1832 ls, err := blockClient.Recv() 1833 if err == io.EOF { 1834 break 1835 } 1836 if err != nil { 1837 return errors.Wrapf(err, "iterate series for block %s", b.meta.ULID) 1838 } 1839 1840 if ls.GetWarning() != "" { 1841 return errors.Wrapf(errors.New(ls.GetWarning()), "iterate series for block %s", b.meta.ULID) 1842 } 1843 if ls.GetSeries() == nil { 1844 continue 1845 } 1846 1847 val := labelpb.ZLabelsToPromLabels(ls.GetSeries().Labels).Get(req.Label) 1848 if val != "" { // Should never be empty since we added labelName!="" matcher to the list of matchers. 1849 values[val] = struct{}{} 1850 } 1851 } 1852 1853 result = make([]string, 0, len(values)) 1854 for n := range values { 1855 result = append(result, n) 1856 } 1857 sort.Strings(result) 1858 } 1859 1860 if len(result) > 0 { 1861 mtx.Lock() 1862 sets = append(sets, result) 1863 mtx.Unlock() 1864 } 1865 1866 return nil 1867 }) 1868 } 1869 1870 s.mtx.RUnlock() 1871 1872 if err := g.Wait(); err != nil { 1873 code := codes.Internal 1874 if s, ok := status.FromError(errors.Cause(err)); ok { 1875 code = s.Code() 1876 } 1877 return nil, status.Error(code, err.Error()) 1878 } 1879 1880 anyHints, err := types.MarshalAny(resHints) 1881 if err != nil { 1882 return nil, status.Error(codes.Unknown, errors.Wrap(err, "marshal label values response hints").Error()) 1883 } 1884 1885 return &storepb.LabelValuesResponse{ 1886 Values: strutil.MergeSlices(sets...), 1887 Hints: anyHints, 1888 }, nil 1889 } 1890 1891 // bucketBlockSet holds all blocks of an equal label set. It internally splits 1892 // them up by downsampling resolution and allows querying. 1893 type bucketBlockSet struct { 1894 labels labels.Labels 1895 mtx sync.RWMutex 1896 resolutions []int64 // Available resolution, high to low (in milliseconds). 1897 blocks [][]*bucketBlock // Ordered buckets for the existing resolutions. 1898 } 1899 1900 // newBucketBlockSet initializes a new set with the known downsampling windows hard-configured. 1901 // The set currently does not support arbitrary ranges. 1902 func newBucketBlockSet(lset labels.Labels) *bucketBlockSet { 1903 return &bucketBlockSet{ 1904 labels: lset, 1905 resolutions: []int64{downsample.ResLevel2, downsample.ResLevel1, downsample.ResLevel0}, 1906 blocks: make([][]*bucketBlock, 3), 1907 } 1908 } 1909 1910 func (s *bucketBlockSet) add(b *bucketBlock) error { 1911 if !labels.Equal(s.labels, labels.FromMap(b.meta.Thanos.Labels)) { 1912 return errors.New("block's label set does not match set") 1913 } 1914 s.mtx.Lock() 1915 defer s.mtx.Unlock() 1916 1917 i := int64index(s.resolutions, b.meta.Thanos.Downsample.Resolution) 1918 if i < 0 { 1919 return errors.Errorf("unsupported downsampling resolution %d", b.meta.Thanos.Downsample.Resolution) 1920 } 1921 bs := append(s.blocks[i], b) 1922 s.blocks[i] = bs 1923 1924 // Always sort blocks by min time, then max time. 1925 sort.Slice(bs, func(j, k int) bool { 1926 if bs[j].meta.MinTime == bs[k].meta.MinTime { 1927 return bs[j].meta.MaxTime < bs[k].meta.MaxTime 1928 } 1929 return bs[j].meta.MinTime < bs[k].meta.MinTime 1930 }) 1931 return nil 1932 } 1933 1934 func (s *bucketBlockSet) remove(id ulid.ULID) { 1935 s.mtx.Lock() 1936 defer s.mtx.Unlock() 1937 1938 for i, bs := range s.blocks { 1939 for j, b := range bs { 1940 if b.meta.ULID != id { 1941 continue 1942 } 1943 s.blocks[i] = append(bs[:j], bs[j+1:]...) 
1944 return 1945 } 1946 } 1947 } 1948 1949 func int64index(s []int64, x int64) int { 1950 for i, v := range s { 1951 if v == x { 1952 return i 1953 } 1954 } 1955 return -1 1956 } 1957 1958 // getFor returns a time-ordered list of blocks that cover date between mint and maxt. 1959 // Blocks with the biggest resolution possible but not bigger than the given max resolution are returned. 1960 // It supports overlapping blocks. 1961 // 1962 // NOTE: s.blocks are expected to be sorted in minTime order. 1963 func (s *bucketBlockSet) getFor(mint, maxt, maxResolutionMillis int64, blockMatchers []*labels.Matcher) (bs []*bucketBlock) { 1964 if mint > maxt { 1965 return nil 1966 } 1967 1968 s.mtx.RLock() 1969 defer s.mtx.RUnlock() 1970 1971 // Find first matching resolution. 1972 i := 0 1973 for ; i < len(s.resolutions) && s.resolutions[i] > maxResolutionMillis; i++ { 1974 } 1975 1976 // Fill the given interval with the blocks for the current resolution. 1977 // Our current resolution might not cover all data, so recursively fill the gaps with higher resolution blocks 1978 // if there is any. 1979 start := mint 1980 for _, b := range s.blocks[i] { 1981 if b.meta.MaxTime <= mint { 1982 continue 1983 } 1984 // NOTE: Block intervals are half-open: [b.MinTime, b.MaxTime). 1985 if b.meta.MinTime > maxt { 1986 break 1987 } 1988 1989 if i+1 < len(s.resolutions) { 1990 bs = append(bs, s.getFor(start, b.meta.MinTime-1, s.resolutions[i+1], blockMatchers)...) 1991 } 1992 1993 // Include the block in the list of matching ones only if there are no block-level matchers 1994 // or they actually match. 1995 if len(blockMatchers) == 0 || b.matchRelabelLabels(blockMatchers) { 1996 bs = append(bs, b) 1997 } 1998 1999 start = b.meta.MaxTime 2000 } 2001 2002 if i+1 < len(s.resolutions) { 2003 bs = append(bs, s.getFor(start, maxt, s.resolutions[i+1], blockMatchers)...) 2004 } 2005 return bs 2006 } 2007 2008 // labelMatchers verifies whether the block set matches the given matchers and returns a new 2009 // set of matchers that is equivalent when querying data within the block. 2010 func (s *bucketBlockSet) labelMatchers(matchers ...*labels.Matcher) ([]*labels.Matcher, bool) { 2011 res := make([]*labels.Matcher, 0, len(matchers)) 2012 2013 for _, m := range matchers { 2014 v := s.labels.Get(m.Name) 2015 if v == "" { 2016 res = append(res, m) 2017 continue 2018 } 2019 if !m.Matches(v) { 2020 return nil, false 2021 } 2022 } 2023 return res, true 2024 } 2025 2026 // bucketBlock represents a block that is located in a bucket. It holds intermediate 2027 // state for the block on local disk. 2028 type bucketBlock struct { 2029 logger log.Logger 2030 metrics *bucketStoreMetrics 2031 bkt objstore.BucketReader 2032 meta *metadata.Meta 2033 dir string 2034 indexCache storecache.IndexCache 2035 chunkPool pool.Bytes 2036 extLset labels.Labels 2037 2038 indexHeaderReader indexheader.Reader 2039 2040 chunkObjs []string 2041 2042 pendingReaders sync.WaitGroup 2043 2044 partitioner Partitioner 2045 2046 // Block's labels used by block-level matchers to filter blocks to query. These are used to select blocks using 2047 // request hints' BlockMatchers. 
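//
// A minimal sketch of how such a block-level matcher could be built on the caller side,
// assuming a hypothetical someBlockULID; the artificial block ID label
// (block.BlockIDLabel) is the one injected in newBucketBlock below:
//
//	hints := &hintspb.LabelValuesRequestHints{
//		BlockMatchers: []storepb.LabelMatcher{{
//			Type:  storepb.LabelMatcher_EQ,
//			Name:  block.BlockIDLabel,
//			Value: someBlockULID.String(),
//		}},
//	}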
2048 relabelLabels labels.Labels 2049 2050 estimatedMaxChunkSize int 2051 estimatedMaxSeriesSize int 2052 } 2053 2054 func newBucketBlock( 2055 ctx context.Context, 2056 logger log.Logger, 2057 metrics *bucketStoreMetrics, 2058 meta *metadata.Meta, 2059 bkt objstore.BucketReader, 2060 dir string, 2061 indexCache storecache.IndexCache, 2062 chunkPool pool.Bytes, 2063 indexHeadReader indexheader.Reader, 2064 p Partitioner, 2065 maxSeriesSizeFunc BlockEstimator, 2066 maxChunkSizeFunc BlockEstimator, 2067 ) (b *bucketBlock, err error) { 2068 maxSeriesSize := EstimatedMaxSeriesSize 2069 if maxSeriesSizeFunc != nil { 2070 maxSeriesSize = int(maxSeriesSizeFunc(*meta)) 2071 } 2072 maxChunkSize := EstimatedMaxChunkSize 2073 if maxChunkSizeFunc != nil { 2074 maxChunkSize = int(maxChunkSizeFunc(*meta)) 2075 } 2076 b = &bucketBlock{ 2077 logger: logger, 2078 metrics: metrics, 2079 bkt: bkt, 2080 indexCache: indexCache, 2081 chunkPool: chunkPool, 2082 dir: dir, 2083 partitioner: p, 2084 meta: meta, 2085 indexHeaderReader: indexHeadReader, 2086 extLset: labels.FromMap(meta.Thanos.Labels), 2087 // Translate the block's labels and inject the block ID as a label 2088 // to allow to match blocks also by ID. 2089 relabelLabels: append(labels.FromMap(meta.Thanos.Labels), labels.Label{ 2090 Name: block.BlockIDLabel, 2091 Value: meta.ULID.String(), 2092 }), 2093 estimatedMaxSeriesSize: maxSeriesSize, 2094 estimatedMaxChunkSize: maxChunkSize, 2095 } 2096 sort.Sort(b.extLset) 2097 sort.Sort(b.relabelLabels) 2098 2099 // Get object handles for all chunk files (segment files) from meta.json, if available. 2100 if len(meta.Thanos.SegmentFiles) > 0 { 2101 b.chunkObjs = make([]string, 0, len(meta.Thanos.SegmentFiles)) 2102 2103 for _, sf := range meta.Thanos.SegmentFiles { 2104 b.chunkObjs = append(b.chunkObjs, path.Join(meta.ULID.String(), block.ChunksDirname, sf)) 2105 } 2106 return b, nil 2107 } 2108 2109 // Get object handles for all chunk files from storage. 2110 if err = bkt.Iter(ctx, path.Join(meta.ULID.String(), block.ChunksDirname), func(n string) error { 2111 b.chunkObjs = append(b.chunkObjs, n) 2112 return nil 2113 }); err != nil { 2114 return nil, errors.Wrap(err, "list chunk files") 2115 } 2116 return b, nil 2117 } 2118 2119 func (b *bucketBlock) indexFilename() string { 2120 return path.Join(b.meta.ULID.String(), block.IndexFilename) 2121 } 2122 2123 func (b *bucketBlock) readIndexRange(ctx context.Context, off, length int64) ([]byte, error) { 2124 r, err := b.bkt.GetRange(ctx, b.indexFilename(), off, length) 2125 if err != nil { 2126 return nil, errors.Wrap(err, "get range reader") 2127 } 2128 defer runutil.CloseWithLogOnErr(b.logger, r, "readIndexRange close range reader") 2129 2130 // Preallocate the buffer with the exact size so we don't waste allocations 2131 // while progressively growing an initial small buffer. The buffer capacity 2132 // is increased by MinRead to avoid extra allocations due to how ReadFrom() 2133 // internally works. 2134 buf := bytes.NewBuffer(make([]byte, 0, length+bytes.MinRead)) 2135 if _, err := buf.ReadFrom(r); err != nil { 2136 return nil, errors.Wrap(err, "read range") 2137 } 2138 return buf.Bytes(), nil 2139 } 2140 2141 func (b *bucketBlock) readChunkRange(ctx context.Context, seq int, off, length int64, chunkRanges byteRanges) (*[]byte, error) { 2142 if seq < 0 || seq >= len(b.chunkObjs) { 2143 return nil, errors.Errorf("unknown segment file for index %d", seq) 2144 } 2145 2146 // Get a reader for the required range. 
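// Note (reading of the call sites): off and length describe the single object-storage
// GetRange request, while chunkRanges lists sub-ranges, relative to off, that are kept
// from the response; readByteRanges below discards the gaps so only the needed chunk
// bytes end up in the pooled buffer.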
2147 reader, err := b.bkt.GetRange(ctx, b.chunkObjs[seq], off, length) 2148 if err != nil { 2149 return nil, errors.Wrap(err, "get range reader") 2150 } 2151 defer runutil.CloseWithLogOnErr(b.logger, reader, "readChunkRange close range reader") 2152 2153 // Get a buffer from the pool. 2154 chunkBuffer, err := b.chunkPool.Get(chunkRanges.size()) 2155 if err != nil { 2156 return nil, errors.Wrap(err, "allocate chunk bytes") 2157 } 2158 2159 *chunkBuffer, err = readByteRanges(reader, *chunkBuffer, chunkRanges) 2160 if err != nil { 2161 return nil, err 2162 } 2163 2164 return chunkBuffer, nil 2165 } 2166 2167 func (b *bucketBlock) chunkRangeReader(ctx context.Context, seq int, off, length int64) (io.ReadCloser, error) { 2168 if seq < 0 || seq >= len(b.chunkObjs) { 2169 return nil, errors.Errorf("unknown segment file for index %d", seq) 2170 } 2171 2172 return b.bkt.GetRange(ctx, b.chunkObjs[seq], off, length) 2173 } 2174 2175 func (b *bucketBlock) indexReader() *bucketIndexReader { 2176 b.pendingReaders.Add(1) 2177 return newBucketIndexReader(b) 2178 } 2179 2180 func (b *bucketBlock) chunkReader() *bucketChunkReader { 2181 b.pendingReaders.Add(1) 2182 return newBucketChunkReader(b) 2183 } 2184 2185 // matchRelabelLabels verifies whether the block matches the given matchers. 2186 func (b *bucketBlock) matchRelabelLabels(matchers []*labels.Matcher) bool { 2187 for _, m := range matchers { 2188 if !m.Matches(b.relabelLabels.Get(m.Name)) { 2189 return false 2190 } 2191 } 2192 return true 2193 } 2194 2195 // overlapsClosedInterval returns true if the block overlaps [mint, maxt). 2196 func (b *bucketBlock) overlapsClosedInterval(mint, maxt int64) bool { 2197 // The block itself is a half-open interval 2198 // [b.meta.MinTime, b.meta.MaxTime). 2199 return b.meta.MinTime <= maxt && mint < b.meta.MaxTime 2200 } 2201 2202 // Close waits for all pending readers to finish and then closes all underlying resources. 2203 func (b *bucketBlock) Close() error { 2204 b.pendingReaders.Wait() 2205 return b.indexHeaderReader.Close() 2206 } 2207 2208 // bucketIndexReader is a custom index reader (not conforming index.Reader interface) that reads index that is stored in 2209 // object storage without having to fully download it. 2210 type bucketIndexReader struct { 2211 block *bucketBlock 2212 dec *index.Decoder 2213 stats *queryStats 2214 2215 mtx sync.Mutex 2216 loadedSeries map[storage.SeriesRef][]byte 2217 } 2218 2219 func newBucketIndexReader(block *bucketBlock) *bucketIndexReader { 2220 r := &bucketIndexReader{ 2221 block: block, 2222 dec: &index.Decoder{ 2223 LookupSymbol: block.indexHeaderReader.LookupSymbol, 2224 }, 2225 stats: &queryStats{}, 2226 loadedSeries: map[storage.SeriesRef][]byte{}, 2227 } 2228 return r 2229 } 2230 func (r *bucketIndexReader) reset() { 2231 r.loadedSeries = map[storage.SeriesRef][]byte{} 2232 } 2233 2234 // ExpandedPostings returns postings in expanded list instead of index.Postings. 2235 // This is because we need to have them buffered anyway to perform efficient lookup 2236 // on object storage. 2237 // Found posting IDs (ps) are not strictly required to point to a valid Series, e.g. during 2238 // background garbage collections. 2239 // 2240 // Reminder: A posting is a reference (represented as a uint64) to a series reference, which in turn points to the first 2241 // chunk where the series contains the matching label-value pair for a given block of data. Postings can be fetched by 2242 // single label name=value. 
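//
// A minimal usage sketch (ctx, logger, blk and bytesLimiter are assumed to exist):
//
//	indexr := blk.indexReader()
//	defer runutil.CloseWithLogOnErr(logger, indexr, "index reader")
//
//	m, _ := labels.NewMatcher(labels.MatchEqual, "job", "api") // hypothetical matcher
//	refs, err := indexr.ExpandedPostings(ctx, newSortedMatchers([]*labels.Matcher{m}), bytesLimiter)
//	// refs can then be passed to PreloadSeries and looked up via LoadSeriesForTime.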
2243 func (r *bucketIndexReader) ExpandedPostings(ctx context.Context, ms sortedMatchers, bytesLimiter BytesLimiter) ([]storage.SeriesRef, error) { 2244 // Shortcut the case of `len(postingGroups) == 0`. It will only happen when no 2245 // matchers specified, and we don't need to fetch expanded postings from cache. 2246 if len(ms) == 0 { 2247 return nil, nil 2248 } 2249 2250 hit, postings, err := r.fetchExpandedPostingsFromCache(ctx, ms, bytesLimiter) 2251 if err != nil { 2252 return nil, err 2253 } 2254 if hit { 2255 return postings, nil 2256 } 2257 var ( 2258 allRequested = false 2259 hasAdds = false 2260 keys []labels.Label 2261 ) 2262 2263 postingGroups, err := matchersToPostingGroups(ctx, r.block.indexHeaderReader.LabelValues, ms) 2264 if err != nil { 2265 return nil, errors.Wrap(err, "matchersToPostingGroups") 2266 } 2267 if postingGroups == nil { 2268 r.storeExpandedPostingsToCache(ms, index.EmptyPostings(), 0) 2269 return nil, nil 2270 } 2271 for _, pg := range postingGroups { 2272 allRequested = allRequested || pg.addAll 2273 hasAdds = hasAdds || len(pg.addKeys) > 0 2274 2275 // Postings returned by fetchPostings will be in the same order as keys 2276 // so it's important that we iterate them in the same order later. 2277 // We don't have any other way of pairing keys and fetched postings. 2278 for _, key := range pg.addKeys { 2279 keys = append(keys, labels.Label{Name: pg.name, Value: key}) 2280 } 2281 for _, key := range pg.removeKeys { 2282 keys = append(keys, labels.Label{Name: pg.name, Value: key}) 2283 } 2284 } 2285 2286 // We only need special All postings if there are no other adds. If there are, we can skip fetching 2287 // special All postings completely. 2288 if allRequested && !hasAdds { 2289 // add group with label to fetch "special All postings". 2290 name, value := index.AllPostingsKey() 2291 allPostingsLabel := labels.Label{Name: name, Value: value} 2292 2293 postingGroups = append(postingGroups, newPostingGroup(true, name, []string{value}, nil)) 2294 keys = append(keys, allPostingsLabel) 2295 } 2296 2297 fetchedPostings, closeFns, err := r.fetchPostings(ctx, keys, bytesLimiter) 2298 defer func() { 2299 for _, closeFn := range closeFns { 2300 closeFn() 2301 } 2302 }() 2303 if err != nil { 2304 return nil, errors.Wrap(err, "get postings") 2305 } 2306 2307 // Get "add" and "remove" postings from groups. We iterate over postingGroups and their keys 2308 // again, and this is exactly the same order as before (when building the groups), so we can simply 2309 // use one incrementing index to fetch postings from returned slice. 2310 postingIndex := 0 2311 2312 var groupAdds, groupRemovals []index.Postings 2313 for _, g := range postingGroups { 2314 // We cannot add empty set to groupAdds, since they are intersected. 
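// Worked example (sketch): for matchers {job="api", instance!="down"} the groups are
//
//	{name: "job",      addAll: false, addKeys:    ["api"]}
//	{name: "instance", addAll: true,  removeKeys: ["down"]}
//
// so only "job=api" contributes to groupAdds, "instance=down" goes to groupRemovals,
// and the final result below is Intersect(job="api") minus Merge(instance="down").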
2315 if len(g.addKeys) > 0 { 2316 toMerge := make([]index.Postings, 0, len(g.addKeys)) 2317 for _, l := range g.addKeys { 2318 toMerge = append(toMerge, checkNilPosting(g.name, l, fetchedPostings[postingIndex])) 2319 postingIndex++ 2320 } 2321 2322 groupAdds = append(groupAdds, index.Merge(toMerge...)) 2323 } 2324 2325 for _, l := range g.removeKeys { 2326 groupRemovals = append(groupRemovals, checkNilPosting(g.name, l, fetchedPostings[postingIndex])) 2327 postingIndex++ 2328 } 2329 } 2330 2331 result := index.Without(index.Intersect(groupAdds...), index.Merge(groupRemovals...)) 2332 ps, err := ExpandPostingsWithContext(ctx, result) 2333 if err != nil { 2334 return nil, errors.Wrap(err, "expand") 2335 } 2336 r.storeExpandedPostingsToCache(ms, index.NewListPostings(ps), len(ps)) 2337 2338 if len(ps) > 0 { 2339 // As of version two all series entries are 16 byte padded. All references 2340 // we get have to account for that to get the correct offset. 2341 version, err := r.block.indexHeaderReader.IndexVersion() 2342 if err != nil { 2343 return nil, errors.Wrap(err, "get index version") 2344 } 2345 if version >= 2 { 2346 for i, id := range ps { 2347 ps[i] = id * 16 2348 } 2349 } 2350 } 2351 return ps, nil 2352 } 2353 2354 // ExpandPostingsWithContext returns the postings expanded as a slice and considers context. 2355 func ExpandPostingsWithContext(ctx context.Context, p index.Postings) (res []storage.SeriesRef, err error) { 2356 for p.Next() { 2357 if ctx.Err() != nil { 2358 return nil, ctx.Err() 2359 } 2360 res = append(res, p.At()) 2361 } 2362 return res, p.Err() 2363 } 2364 2365 // postingGroup keeps posting keys for one or more matchers with the same label name. Logical result of the group is: 2366 // If addAll is set: special All postings minus postings for removeKeys labels. No need to merge postings for addKeys in this case. 2367 // If addAll is not set: Merge of postings for "addKeys" labels minus postings for removeKeys labels 2368 // This computation happens in ExpandedPostings. 2369 type postingGroup struct { 2370 addAll bool 2371 name string 2372 addKeys []string 2373 removeKeys []string 2374 } 2375 2376 func newPostingGroup(addAll bool, name string, addKeys, removeKeys []string) *postingGroup { 2377 return &postingGroup{ 2378 addAll: addAll, 2379 name: name, 2380 addKeys: addKeys, 2381 removeKeys: removeKeys, 2382 } 2383 } 2384 2385 func (pg postingGroup) merge(other *postingGroup) *postingGroup { 2386 if other == nil { 2387 return &pg 2388 } 2389 // This shouldn't happen, but add this as a safeguard. 2390 if pg.name != other.name { 2391 return nil 2392 } 2393 var i, j int 2394 // Both add all, merge remove keys. 2395 if pg.addAll && other.addAll { 2396 // Fast path to not allocate output slice if no remove keys are specified. 2397 // This is possible when matcher is `=~".*"`. 2398 if len(pg.removeKeys) == 0 { 2399 pg.removeKeys = other.removeKeys 2400 return &pg 2401 } else if len(other.removeKeys) == 0 { 2402 return &pg 2403 } 2404 output := make([]string, 0, len(pg.removeKeys)+len(other.removeKeys)) 2405 for i < len(pg.removeKeys) && j < len(other.removeKeys) { 2406 if pg.removeKeys[i] < other.removeKeys[j] { 2407 output = append(output, pg.removeKeys[i]) 2408 i++ 2409 } else if pg.removeKeys[i] > other.removeKeys[j] { 2410 output = append(output, other.removeKeys[j]) 2411 j++ 2412 } else { 2413 output = append(output, pg.removeKeys[i]) 2414 i++ 2415 j++ 2416 } 2417 } 2418 if i < len(pg.removeKeys) { 2419 output = append(output, pg.removeKeys[i:len(pg.removeKeys)]...) 
2420 } 2421 if j < len(other.removeKeys) { 2422 output = append(output, other.removeKeys[j:len(other.removeKeys)]...) 2423 } 2424 pg.removeKeys = output 2425 } else if pg.addAll || other.addAll { 2426 // Subtract the remove keys. 2427 toRemove := other 2428 toAdd := &pg 2429 if pg.addAll { 2430 toRemove = &pg 2431 toAdd = other 2432 } 2433 var k int 2434 for i < len(toAdd.addKeys) && j < len(toRemove.removeKeys) { 2435 if toAdd.addKeys[i] < toRemove.removeKeys[j] { 2436 toAdd.addKeys[k] = toAdd.addKeys[i] 2437 k++ 2438 i++ 2439 } else if toAdd.addKeys[i] > toRemove.removeKeys[j] { 2440 j++ 2441 } else { 2442 i++ 2443 j++ 2444 } 2445 } 2446 for i < len(toAdd.addKeys) { 2447 toAdd.addKeys[k] = toAdd.addKeys[i] 2448 i++ 2449 k++ 2450 } 2451 pg.addKeys = toAdd.addKeys[:k] 2452 pg.addAll = false 2453 pg.removeKeys = nil 2454 } else { 2455 addKeys := make([]string, 0, len(pg.addKeys)+len(other.addKeys)) 2456 for i < len(pg.addKeys) && j < len(other.addKeys) { 2457 if pg.addKeys[i] == other.addKeys[j] { 2458 addKeys = append(addKeys, pg.addKeys[i]) 2459 i++ 2460 j++ 2461 } else if pg.addKeys[i] < other.addKeys[j] { 2462 i++ 2463 } else { 2464 j++ 2465 } 2466 } 2467 pg.addKeys = addKeys 2468 } 2469 return &pg 2470 } 2471 2472 func checkNilPosting(name, value string, p index.Postings) index.Postings { 2473 if p == nil { 2474 // This should not happen. Debug for https://github.com/thanos-io/thanos/issues/874. 2475 return index.ErrPostings(errors.Errorf("postings is nil for {%s=%s}. It was never fetched.", name, value)) 2476 } 2477 return p 2478 } 2479 2480 func matchersToPostingGroups(ctx context.Context, lvalsFn func(name string) ([]string, error), ms []*labels.Matcher) ([]*postingGroup, error) { 2481 matchersMap := make(map[string][]*labels.Matcher) 2482 for _, m := range ms { 2483 matchersMap[m.Name] = append(matchersMap[m.Name], m) 2484 } 2485 2486 pgs := make([]*postingGroup, 0) 2487 // NOTE: Derived from tsdb.PostingsForMatchers. 2488 for _, values := range matchersMap { 2489 var ( 2490 mergedPG *postingGroup 2491 pg *postingGroup 2492 vals []string 2493 err error 2494 valuesCached bool 2495 ) 2496 lvalsFunc := lvalsFn 2497 // Merge PostingGroups with the same matcher into 1 to 2498 // avoid fetching duplicate postings. 2499 for _, val := range values { 2500 pg, vals, err = toPostingGroup(ctx, lvalsFunc, val) 2501 if err != nil { 2502 return nil, errors.Wrap(err, "toPostingGroup") 2503 } 2504 // Cache label values because label name is the same. 2505 if !valuesCached && vals != nil { 2506 lvalsFunc = func(_ string) ([]string, error) { 2507 return vals, nil 2508 } 2509 valuesCached = true 2510 } 2511 2512 // If this groups adds nothing, it's an empty group. We can shortcut this, since intersection with empty 2513 // postings would return no postings anyway. 2514 // E.g. label="non-existing-value" returns empty group. 2515 if !pg.addAll && len(pg.addKeys) == 0 { 2516 return nil, nil 2517 } 2518 if mergedPG == nil { 2519 mergedPG = pg 2520 } else { 2521 mergedPG = mergedPG.merge(pg) 2522 } 2523 2524 // If this groups adds nothing, it's an empty group. We can shortcut this, since intersection with empty 2525 // postings would return no postings anyway. 2526 // E.g. label="non-existing-value" returns empty group. 
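// The same shortcut also catches contradictory matchers on one label after merging,
// e.g. {job="api", job="web"}: the intersection of their addKeys is empty, so the
// whole request can return no postings without touching the bucket.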
2527 if !mergedPG.addAll && len(mergedPG.addKeys) == 0 { 2528 return nil, nil 2529 } 2530 } 2531 pgs = append(pgs, mergedPG) 2532 } 2533 slices.SortFunc(pgs, func(a, b *postingGroup) bool { 2534 return a.name < b.name 2535 }) 2536 return pgs, nil 2537 } 2538 2539 // NOTE: Derived from tsdb.postingsForMatcher. index.Merge is equivalent to map duplication. 2540 func toPostingGroup(ctx context.Context, lvalsFn func(name string) ([]string, error), m *labels.Matcher) (*postingGroup, []string, error) { 2541 // If the matcher selects an empty value, it selects all the series which don't 2542 // have the label name set too. See: https://github.com/prometheus/prometheus/issues/3575 2543 // and https://github.com/prometheus/prometheus/pull/3578#issuecomment-351653555. 2544 if m.Matches("") { 2545 var toRemove []string 2546 2547 // Fast-path for MatchNotRegexp matching. 2548 // Inverse of a MatchNotRegexp is MatchRegexp (double negation). 2549 // Fast-path for set matching. 2550 if m.Type == labels.MatchNotRegexp { 2551 if vals := findSetMatches(m.Value); len(vals) > 0 { 2552 sort.Strings(vals) 2553 return newPostingGroup(true, m.Name, nil, vals), nil, nil 2554 } 2555 } 2556 2557 // Fast-path for MatchNotEqual matching. 2558 // Inverse of a MatchNotEqual is MatchEqual (double negation). 2559 if m.Type == labels.MatchNotEqual { 2560 return newPostingGroup(true, m.Name, nil, []string{m.Value}), nil, nil 2561 } 2562 2563 vals, err := lvalsFn(m.Name) 2564 if err != nil { 2565 return nil, nil, err 2566 } 2567 2568 for _, val := range vals { 2569 if ctx.Err() != nil { 2570 return nil, nil, ctx.Err() 2571 } 2572 if !m.Matches(val) { 2573 toRemove = append(toRemove, val) 2574 } 2575 } 2576 2577 return newPostingGroup(true, m.Name, nil, toRemove), vals, nil 2578 } 2579 if m.Type == labels.MatchRegexp { 2580 if vals := findSetMatches(m.Value); len(vals) > 0 { 2581 sort.Strings(vals) 2582 return newPostingGroup(false, m.Name, vals, nil), nil, nil 2583 } 2584 } 2585 2586 // Fast-path for equal matching. 
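// For example (sketch), job="api" becomes postingGroup{addAll: false, addKeys: ["api"]},
// so only the postings for that single label-value pair are fetched; its negation
// (job!="api") was already handled above as addAll=true with removeKeys ["api"].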
2587 if m.Type == labels.MatchEqual { 2588 return newPostingGroup(false, m.Name, []string{m.Value}, nil), nil, nil 2589 } 2590 2591 vals, err := lvalsFn(m.Name) 2592 if err != nil { 2593 return nil, nil, err 2594 } 2595 2596 var toAdd []string 2597 for _, val := range vals { 2598 if ctx.Err() != nil { 2599 return nil, nil, ctx.Err() 2600 } 2601 if m.Matches(val) { 2602 toAdd = append(toAdd, val) 2603 } 2604 } 2605 2606 return newPostingGroup(false, m.Name, toAdd, nil), vals, nil 2607 } 2608 2609 type postingPtr struct { 2610 keyID int 2611 ptr index.Range 2612 } 2613 2614 func (r *bucketIndexReader) fetchExpandedPostingsFromCache(ctx context.Context, ms []*labels.Matcher, bytesLimiter BytesLimiter) (bool, []storage.SeriesRef, error) { 2615 dataFromCache, hit := r.block.indexCache.FetchExpandedPostings(ctx, r.block.meta.ULID, ms) 2616 if !hit { 2617 return false, nil, nil 2618 } 2619 if err := bytesLimiter.Reserve(uint64(len(dataFromCache))); err != nil { 2620 return false, nil, httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while loading expanded postings from index cache: %s", err) 2621 } 2622 r.stats.DataDownloadedSizeSum += units.Base2Bytes(len(dataFromCache)) 2623 r.stats.postingsTouched++ 2624 r.stats.PostingsTouchedSizeSum += units.Base2Bytes(len(dataFromCache)) 2625 p, closeFns, err := r.decodeCachedPostings(dataFromCache) 2626 defer func() { 2627 for _, closeFn := range closeFns { 2628 closeFn() 2629 } 2630 }() 2631 // If failed to decode or expand cached postings, return and expand postings again. 2632 if err != nil { 2633 level.Error(r.block.logger).Log("msg", "failed to decode cached expanded postings, refetch postings", "id", r.block.meta.ULID.String(), "err", err) 2634 return false, nil, nil 2635 } 2636 2637 ps, err := ExpandPostingsWithContext(ctx, p) 2638 if err != nil { 2639 level.Error(r.block.logger).Log("msg", "failed to expand cached expanded postings, refetch postings", "id", r.block.meta.ULID.String(), "err", err) 2640 return false, nil, nil 2641 } 2642 2643 if len(ps) > 0 { 2644 // As of version two all series entries are 16 byte padded. All references 2645 // we get have to account for that to get the correct offset. 2646 version, err := r.block.indexHeaderReader.IndexVersion() 2647 if err != nil { 2648 return false, nil, errors.Wrap(err, "get index version") 2649 } 2650 if version >= 2 { 2651 for i, id := range ps { 2652 ps[i] = id * 16 2653 } 2654 } 2655 } 2656 return true, ps, nil 2657 } 2658 2659 func (r *bucketIndexReader) storeExpandedPostingsToCache(ms []*labels.Matcher, ps index.Postings, length int) { 2660 // Encode postings to cache. We compress and cache postings before adding 2661 // 16 bytes padding in order to make compressed size smaller. 2662 dataToCache, compressionDuration, compressionErrors, compressedSize := r.encodePostingsToCache(ps, length) 2663 r.stats.cachedPostingsCompressions++ 2664 r.stats.cachedPostingsCompressionErrors += compressionErrors 2665 r.stats.CachedPostingsCompressionTimeSum += compressionDuration 2666 r.stats.CachedPostingsCompressedSizeSum += units.Base2Bytes(compressedSize) 2667 r.stats.CachedPostingsOriginalSizeSum += units.Base2Bytes(length * 4) // Estimate the posting list size. 2668 r.block.indexCache.StoreExpandedPostings(r.block.meta.ULID, ms, dataToCache) 2669 } 2670 2671 var bufioReaderPool = sync.Pool{ 2672 New: func() any { 2673 return bufio.NewReader(nil) 2674 }, 2675 } 2676 2677 // fetchPostings fill postings requested by posting groups. 
2678 // It returns one posting for each key, in the same order. 2679 // If postings for given key is not fetched, entry at given index will be nil. 2680 func (r *bucketIndexReader) fetchPostings(ctx context.Context, keys []labels.Label, bytesLimiter BytesLimiter) ([]index.Postings, []func(), error) { 2681 var closeFns []func() 2682 2683 timer := prometheus.NewTimer(r.block.metrics.postingsFetchDuration) 2684 defer timer.ObserveDuration() 2685 2686 var ptrs []postingPtr 2687 2688 output := make([]index.Postings, len(keys)) 2689 2690 // Fetch postings from the cache with a single call. 2691 fromCache, _ := r.block.indexCache.FetchMultiPostings(ctx, r.block.meta.ULID, keys) 2692 for _, dataFromCache := range fromCache { 2693 if err := bytesLimiter.Reserve(uint64(len(dataFromCache))); err != nil { 2694 return nil, closeFns, httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while loading postings from index cache: %s", err) 2695 } 2696 r.stats.DataDownloadedSizeSum += units.Base2Bytes(len(dataFromCache)) 2697 } 2698 2699 // Iterate over all groups and fetch posting from cache. 2700 // If we have a miss, mark key to be fetched in `ptrs` slice. 2701 // Overlaps are well handled by partitioner, so we don't need to deduplicate keys. 2702 for ix, key := range keys { 2703 if err := ctx.Err(); err != nil { 2704 return nil, closeFns, err 2705 } 2706 // Get postings for the given key from cache first. 2707 if b, ok := fromCache[key]; ok { 2708 r.stats.postingsTouched++ 2709 r.stats.PostingsTouchedSizeSum += units.Base2Bytes(len(b)) 2710 2711 l, closer, err := r.decodeCachedPostings(b) 2712 if err != nil { 2713 return nil, closeFns, errors.Wrap(err, "decode postings") 2714 } 2715 output[ix] = l 2716 closeFns = append(closeFns, closer...) 2717 continue 2718 } 2719 2720 // Cache miss; save pointer for actual posting in index stored in object store. 2721 ptr, err := r.block.indexHeaderReader.PostingsOffset(key.Name, key.Value) 2722 if err == indexheader.NotFoundRangeErr { 2723 // This block does not have any posting for given key. 2724 output[ix] = index.EmptyPostings() 2725 continue 2726 } 2727 2728 if err != nil { 2729 return nil, closeFns, errors.Wrap(err, "index header PostingsOffset") 2730 } 2731 2732 r.stats.postingsToFetch++ 2733 ptrs = append(ptrs, postingPtr{ptr: ptr, keyID: ix}) 2734 } 2735 2736 sort.Slice(ptrs, func(i, j int) bool { 2737 return ptrs[i].ptr.Start < ptrs[j].ptr.Start 2738 }) 2739 2740 // TODO(bwplotka): Asses how large in worst case scenario this can be. (e.g fetch for AllPostingsKeys) 2741 // Consider sub split if too big. 2742 parts := r.block.partitioner.Partition(len(ptrs), func(i int) (start, end uint64) { 2743 return uint64(ptrs[i].ptr.Start), uint64(ptrs[i].ptr.End) 2744 }) 2745 2746 for _, part := range parts { 2747 start := int64(part.Start) 2748 length := int64(part.End) - start 2749 2750 if err := bytesLimiter.Reserve(uint64(length)); err != nil { 2751 return nil, closeFns, httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching postings: %s", err) 2752 } 2753 r.stats.DataDownloadedSizeSum += units.Base2Bytes(length) 2754 } 2755 2756 g, ctx := errgroup.WithContext(ctx) 2757 for _, part := range parts { 2758 i, j := part.ElemRng[0], part.ElemRng[1] 2759 2760 start := int64(part.Start) 2761 // We assume index does not have any ptrs that has 0 length. 2762 length := int64(part.End) - start 2763 2764 // Fetch from object storage concurrently and update stats and posting list. 
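// Worked example (sketch), assuming the default gap-based partitioner with
// PartitionerMaxGapSize (512 KiB): posting ranges [0, 100), [200, 300) and
// [600000, 600100) collapse into two parts,
//
//	Part{Start: 0, End: 300, ElemRng: [2]int{0, 2}}
//	Part{Start: 600000, End: 600100, ElemRng: [2]int{2, 3}}
//
// so nearby postings are fetched with a single GetRange call per part below.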
2765 g.Go(func() error { 2766 begin := time.Now() 2767 2768 brdr := bufioReaderPool.Get().(*bufio.Reader) 2769 defer bufioReaderPool.Put(brdr) 2770 2771 partReader, err := r.block.bkt.GetRange(ctx, r.block.indexFilename(), start, length) 2772 if err != nil { 2773 return errors.Wrap(err, "read postings range") 2774 } 2775 defer runutil.CloseWithLogOnErr(r.block.logger, partReader, "readIndexRange close range reader") 2776 brdr.Reset(partReader) 2777 2778 rdr := newPostingsReaderBuilder(ctx, brdr, ptrs[i:j], start, length) 2779 2780 r.mtx.Lock() 2781 r.stats.postingsFetchCount++ 2782 r.stats.postingsFetched += j - i 2783 r.stats.PostingsFetchedSizeSum += units.Base2Bytes(int(length)) 2784 r.mtx.Unlock() 2785 2786 for rdr.Next() { 2787 diffVarintPostings, postingsCount, keyID := rdr.AtDiffVarint() 2788 2789 output[keyID] = newDiffVarintPostings(diffVarintPostings, nil) 2790 2791 startCompression := time.Now() 2792 dataToCache, err := snappyStreamedEncode(int(postingsCount), diffVarintPostings) 2793 if err != nil { 2794 r.mtx.Lock() 2795 r.stats.cachedPostingsCompressionErrors += 1 2796 r.mtx.Unlock() 2797 return errors.Wrap(err, "encoding with snappy") 2798 } 2799 2800 r.mtx.Lock() 2801 r.stats.postingsTouched++ 2802 r.stats.PostingsTouchedSizeSum += units.Base2Bytes(int(len(diffVarintPostings))) 2803 r.stats.cachedPostingsCompressions += 1 2804 r.stats.CachedPostingsOriginalSizeSum += units.Base2Bytes(len(diffVarintPostings)) 2805 r.stats.CachedPostingsCompressedSizeSum += units.Base2Bytes(len(dataToCache)) 2806 r.stats.CachedPostingsCompressionTimeSum += time.Since(startCompression) 2807 r.mtx.Unlock() 2808 2809 r.block.indexCache.StorePostings(r.block.meta.ULID, keys[keyID], dataToCache) 2810 } 2811 2812 r.mtx.Lock() 2813 r.stats.PostingsFetchDurationSum += time.Since(begin) 2814 r.mtx.Unlock() 2815 2816 if err := rdr.Error(); err != nil { 2817 return errors.Wrap(err, "reading postings") 2818 } 2819 return nil 2820 }) 2821 } 2822 2823 return output, closeFns, g.Wait() 2824 } 2825 2826 func (r *bucketIndexReader) decodeCachedPostings(b []byte) (index.Postings, []func(), error) { 2827 // Even if this instance is not using compression, there may be compressed 2828 // entries in the cache written by other stores. 2829 var ( 2830 l index.Postings 2831 err error 2832 closeFns []func() 2833 ) 2834 if isDiffVarintSnappyEncodedPostings(b) || isDiffVarintSnappyStreamedEncodedPostings(b) { 2835 s := time.Now() 2836 l, err = decodePostings(b) 2837 r.stats.cachedPostingsDecompressions += 1 2838 r.stats.CachedPostingsDecompressionTimeSum += time.Since(s) 2839 if err != nil { 2840 r.stats.cachedPostingsDecompressionErrors += 1 2841 } else { 2842 closeFns = append(closeFns, l.(closeablePostings).close) 2843 } 2844 } else { 2845 _, l, err = r.dec.Postings(b) 2846 } 2847 return l, closeFns, err 2848 } 2849 2850 func (r *bucketIndexReader) encodePostingsToCache(p index.Postings, length int) ([]byte, time.Duration, int, int) { 2851 var dataToCache []byte 2852 compressionTime := time.Duration(0) 2853 compressionErrors, compressedSize := 0, 0 2854 s := time.Now() 2855 data, err := diffVarintSnappyStreamedEncode(p, length) 2856 compressionTime = time.Since(s) 2857 if err == nil { 2858 dataToCache = data 2859 compressedSize = len(data) 2860 } else { 2861 compressionErrors = 1 2862 } 2863 return dataToCache, compressionTime, compressionErrors, compressedSize 2864 } 2865 2866 // bigEndianPostings implements the Postings interface over a byte stream of 2867 // big endian numbers. 
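// big endian numbers.
//
// A minimal sketch of the layout it iterates over: each posting is a fixed-width
// 4-byte big-endian series reference, e.g. for references 42 and 256:
//
//	p := newBigEndianPostings([]byte{0x00, 0x00, 0x00, 0x2a, 0x00, 0x00, 0x01, 0x00})
//	for p.Next() {
//		_ = p.At() // 42, then 256
//	}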
2868 type bigEndianPostings struct { 2869 list []byte 2870 cur uint32 2871 } 2872 2873 // TODO(bwplotka): Expose those inside Prometheus. 2874 func newBigEndianPostings(list []byte) *bigEndianPostings { 2875 return &bigEndianPostings{list: list} 2876 } 2877 2878 func (it *bigEndianPostings) At() storage.SeriesRef { 2879 return storage.SeriesRef(it.cur) 2880 } 2881 2882 func (it *bigEndianPostings) Next() bool { 2883 if len(it.list) >= 4 { 2884 it.cur = binary.BigEndian.Uint32(it.list) 2885 it.list = it.list[4:] 2886 return true 2887 } 2888 return false 2889 } 2890 2891 func (it *bigEndianPostings) Seek(x storage.SeriesRef) bool { 2892 if storage.SeriesRef(it.cur) >= x { 2893 return true 2894 } 2895 2896 num := len(it.list) / 4 2897 // Do binary search between current position and end. 2898 i := sort.Search(num, func(i int) bool { 2899 return binary.BigEndian.Uint32(it.list[i*4:]) >= uint32(x) 2900 }) 2901 if i < num { 2902 j := i * 4 2903 it.cur = binary.BigEndian.Uint32(it.list[j:]) 2904 it.list = it.list[j+4:] 2905 return true 2906 } 2907 it.list = nil 2908 return false 2909 } 2910 2911 func (it *bigEndianPostings) Err() error { 2912 return nil 2913 } 2914 2915 // Returns number of remaining postings values. 2916 func (it *bigEndianPostings) length() int { 2917 return len(it.list) / 4 2918 } 2919 2920 func (r *bucketIndexReader) PreloadSeries(ctx context.Context, ids []storage.SeriesRef, bytesLimiter BytesLimiter) error { 2921 timer := prometheus.NewTimer(r.block.metrics.seriesFetchDuration) 2922 defer timer.ObserveDuration() 2923 2924 // Load series from cache, overwriting the list of ids to preload 2925 // with the missing ones. 2926 fromCache, ids := r.block.indexCache.FetchMultiSeries(ctx, r.block.meta.ULID, ids) 2927 for id, b := range fromCache { 2928 r.loadedSeries[id] = b 2929 if err := bytesLimiter.Reserve(uint64(len(b))); err != nil { 2930 return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while loading series from index cache: %s", err) 2931 } 2932 r.stats.DataDownloadedSizeSum += units.Base2Bytes(len(b)) 2933 } 2934 2935 parts := r.block.partitioner.Partition(len(ids), func(i int) (start, end uint64) { 2936 return uint64(ids[i]), uint64(ids[i]) + uint64(r.block.estimatedMaxSeriesSize) 2937 }) 2938 2939 g, ctx := errgroup.WithContext(ctx) 2940 for _, p := range parts { 2941 s, e := p.Start, p.End 2942 i, j := p.ElemRng[0], p.ElemRng[1] 2943 2944 g.Go(func() error { 2945 return r.loadSeries(ctx, ids[i:j], false, s, e, bytesLimiter) 2946 }) 2947 } 2948 return g.Wait() 2949 } 2950 2951 func (r *bucketIndexReader) loadSeries(ctx context.Context, ids []storage.SeriesRef, refetch bool, start, end uint64, bytesLimiter BytesLimiter) error { 2952 begin := time.Now() 2953 2954 if bytesLimiter != nil { 2955 if err := bytesLimiter.Reserve(uint64(end - start)); err != nil { 2956 return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching series: %s", err) 2957 } 2958 r.mtx.Lock() 2959 r.stats.DataDownloadedSizeSum += units.Base2Bytes(end - start) 2960 r.mtx.Unlock() 2961 } 2962 2963 b, err := r.block.readIndexRange(ctx, int64(start), int64(end-start)) 2964 if err != nil { 2965 return errors.Wrap(err, "read series range") 2966 } 2967 2968 r.mtx.Lock() 2969 r.stats.seriesFetchCount++ 2970 r.stats.seriesFetched += len(ids) 2971 r.stats.SeriesFetchDurationSum += time.Since(begin) 2972 r.stats.SeriesFetchedSizeSum += units.Base2Bytes(int(end - start)) 2973 r.mtx.Unlock() 2974 2975 for i, id := range ids { 2976 c := b[uint64(id)-start:] 2977 
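// Note (sketch of the index format this parses): every series entry is laid out as
// <uvarint length> <length bytes of labels and chunk metas> <4-byte CRC32>, and id is
// the entry's byte offset into the index (already multiplied by 16 for index version
// >= 2), so c starts exactly at the uvarint length decoded below.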
2978 l, n := binary.Uvarint(c) 2979 if n < 1 { 2980 return errors.New("reading series length failed") 2981 } 2982 if len(c) < n+int(l) { 2983 if i == 0 && refetch { 2984 return errors.Errorf("invalid remaining size, even after refetch, remaining: %d, expected %d", len(c), n+int(l)) 2985 } 2986 2987 // Inefficient, but should be rare. 2988 r.block.metrics.seriesRefetches.Inc() 2989 level.Warn(r.block.logger).Log("msg", "series size exceeded expected size; refetching", "id", id, "series length", n+int(l), "maxSeriesSize", r.block.estimatedMaxSeriesSize) 2990 2991 // Fetch plus to get the size of next one if exists. 2992 return r.loadSeries(ctx, ids[i:], true, uint64(id), uint64(id)+uint64(n+int(l)+1), bytesLimiter) 2993 } 2994 c = c[n : n+int(l)] 2995 r.mtx.Lock() 2996 r.loadedSeries[id] = c 2997 r.block.indexCache.StoreSeries(r.block.meta.ULID, id, c) 2998 r.mtx.Unlock() 2999 } 3000 return nil 3001 } 3002 3003 type Part struct { 3004 Start uint64 3005 End uint64 3006 3007 ElemRng [2]int 3008 } 3009 3010 type Partitioner interface { 3011 // Partition partitions length entries into n <= length ranges that cover all 3012 // input ranges 3013 // It supports overlapping ranges. 3014 // NOTE: It expects range to be sorted by start time. 3015 Partition(length int, rng func(int) (uint64, uint64)) []Part 3016 } 3017 3018 type gapBasedPartitioner struct { 3019 maxGapSize uint64 3020 } 3021 3022 func NewGapBasedPartitioner(maxGapSize uint64) Partitioner { 3023 return gapBasedPartitioner{ 3024 maxGapSize: maxGapSize, 3025 } 3026 } 3027 3028 // Partition partitions length entries into n <= length ranges that cover all 3029 // input ranges by combining entries that are separated by reasonably small gaps. 3030 // It is used to combine multiple small ranges from object storage into bigger, more efficient/cheaper ones. 3031 func (g gapBasedPartitioner) Partition(length int, rng func(int) (uint64, uint64)) (parts []Part) { 3032 j := 0 3033 k := 0 3034 for k < length { 3035 j = k 3036 k++ 3037 3038 p := Part{} 3039 p.Start, p.End = rng(j) 3040 3041 // Keep growing the range until the end or we encounter a large gap. 3042 for ; k < length; k++ { 3043 s, e := rng(k) 3044 3045 if p.End+g.maxGapSize < s { 3046 break 3047 } 3048 3049 if p.End <= e { 3050 p.End = e 3051 } 3052 } 3053 p.ElemRng = [2]int{j, k} 3054 parts = append(parts, p) 3055 } 3056 return parts 3057 } 3058 3059 type symbolizedLabel struct { 3060 name, value uint32 3061 } 3062 3063 // LoadSeriesForTime populates the given symbolized labels for the series identified by the reference if at least one chunk is within 3064 // time selection. 3065 // LoadSeriesForTime also populates chunk metas slices if skipChunks if set to false. Chunks are also limited by the given time selection. 3066 // LoadSeriesForTime returns false, when there are no series data for given time range. 3067 // 3068 // Error is returned on decoding error or if the reference does not resolve to a known series. 3069 func (r *bucketIndexReader) LoadSeriesForTime(ref storage.SeriesRef, lset *[]symbolizedLabel, chks *[]chunks.Meta, skipChunks bool, mint, maxt int64) (ok bool, err error) { 3070 b, ok := r.loadedSeries[ref] 3071 if !ok { 3072 return false, errors.Errorf("series %d not found", ref) 3073 } 3074 3075 r.stats.seriesTouched++ 3076 r.stats.SeriesTouchedSizeSum += units.Base2Bytes(len(b)) 3077 return decodeSeriesForTime(b, lset, chks, skipChunks, mint, maxt) 3078 } 3079 3080 // Close released the underlying resources of the reader. 
3081 func (r *bucketIndexReader) Close() error { 3082 r.block.pendingReaders.Done() 3083 return nil 3084 } 3085 3086 // LookupLabelsSymbols allows populates label set strings from symbolized label set. 3087 func (r *bucketIndexReader) LookupLabelsSymbols(symbolized []symbolizedLabel, lbls *labels.Labels) error { 3088 *lbls = (*lbls)[:0] 3089 for _, s := range symbolized { 3090 ln, err := r.dec.LookupSymbol(s.name) 3091 if err != nil { 3092 return errors.Wrap(err, "lookup label name") 3093 } 3094 lv, err := r.dec.LookupSymbol(s.value) 3095 if err != nil { 3096 return errors.Wrap(err, "lookup label value") 3097 } 3098 *lbls = append(*lbls, labels.Label{Name: ln, Value: lv}) 3099 } 3100 return nil 3101 } 3102 3103 // decodeSeriesForTime decodes a series entry from the given byte slice decoding only chunk metas that are within given min and max time. 3104 // If skipChunks is specified decodeSeriesForTime does not return any chunks, but only labels and only if at least single chunk is within time range. 3105 // decodeSeriesForTime returns false, when there are no series data for given time range. 3106 func decodeSeriesForTime(b []byte, lset *[]symbolizedLabel, chks *[]chunks.Meta, skipChunks bool, selectMint, selectMaxt int64) (ok bool, err error) { 3107 *lset = (*lset)[:0] 3108 *chks = (*chks)[:0] 3109 3110 d := encoding.Decbuf{B: b} 3111 3112 // Read labels without looking up symbols. 3113 k := d.Uvarint() 3114 for i := 0; i < k; i++ { 3115 lno := uint32(d.Uvarint()) 3116 lvo := uint32(d.Uvarint()) 3117 *lset = append(*lset, symbolizedLabel{name: lno, value: lvo}) 3118 } 3119 // Read the chunks meta data. 3120 k = d.Uvarint() 3121 if k == 0 { 3122 return false, d.Err() 3123 } 3124 3125 // First t0 is absolute, rest is just diff so different type is used (Uvarint64). 3126 mint := d.Varint64() 3127 maxt := int64(d.Uvarint64()) + mint 3128 // Similar for first ref. 3129 ref := int64(d.Uvarint64()) 3130 3131 for i := 0; i < k; i++ { 3132 if i > 0 { 3133 mint += int64(d.Uvarint64()) 3134 maxt = int64(d.Uvarint64()) + mint 3135 ref += d.Varint64() 3136 } 3137 3138 if mint > selectMaxt { 3139 break 3140 } 3141 3142 if maxt >= selectMint { 3143 // Found a chunk. 3144 if skipChunks { 3145 // We are not interested in chunks and we know there is at least one, that's enough to return series. 3146 return true, nil 3147 } 3148 3149 *chks = append(*chks, chunks.Meta{ 3150 Ref: chunks.ChunkRef(ref), 3151 MinTime: mint, 3152 MaxTime: maxt, 3153 }) 3154 } 3155 3156 mint = maxt 3157 } 3158 return len(*chks) > 0, d.Err() 3159 } 3160 3161 type loadIdx struct { 3162 offset uint32 3163 // Indices, not actual entries and chunks. 3164 seriesEntry int 3165 chunk int 3166 } 3167 3168 type bucketChunkReader struct { 3169 block *bucketBlock 3170 3171 toLoad [][]loadIdx 3172 3173 // Mutex protects access to following fields, when updated from chunks-loading goroutines. 3174 // After chunks are loaded, mutex is no longer used. 3175 mtx sync.Mutex 3176 stats *queryStats 3177 chunkBytes []*[]byte // Byte slice to return to the chunk pool on close. 
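// Note: slabs in chunkBytes are filled by save() and handed back to block.chunkPool
// in Close(), so chunk payloads returned by save() stay valid only until the reader
// is closed.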
3178 3179 loadingChunksMtx sync.Mutex 3180 loadingChunks bool 3181 finishLoadingChks chan struct{} 3182 } 3183 3184 func newBucketChunkReader(block *bucketBlock) *bucketChunkReader { 3185 return &bucketChunkReader{ 3186 block: block, 3187 stats: &queryStats{}, 3188 toLoad: make([][]loadIdx, len(block.chunkObjs)), 3189 } 3190 } 3191 3192 func (r *bucketChunkReader) reset() { 3193 for i := range r.toLoad { 3194 r.toLoad[i] = r.toLoad[i][:0] 3195 } 3196 r.loadingChunksMtx.Lock() 3197 r.loadingChunks = false 3198 r.finishLoadingChks = make(chan struct{}) 3199 r.loadingChunksMtx.Unlock() 3200 } 3201 3202 func (r *bucketChunkReader) Close() error { 3203 // NOTE(GiedriusS): we need to wait until loading chunks because loading 3204 // chunks modifies r.block.chunkPool. 3205 r.loadingChunksMtx.Lock() 3206 loadingChks := r.loadingChunks 3207 r.loadingChunksMtx.Unlock() 3208 3209 if loadingChks { 3210 <-r.finishLoadingChks 3211 } 3212 r.block.pendingReaders.Done() 3213 3214 for _, b := range r.chunkBytes { 3215 r.block.chunkPool.Put(b) 3216 } 3217 return nil 3218 } 3219 3220 // addLoad adds the chunk with id to the data set to be fetched. 3221 // Chunk will be fetched and saved to refs[seriesEntry][chunk] upon r.load(refs, <...>) call. 3222 func (r *bucketChunkReader) addLoad(id chunks.ChunkRef, seriesEntry, chunk int) error { 3223 var ( 3224 seq = int(id >> 32) 3225 off = uint32(id) 3226 ) 3227 if seq >= len(r.toLoad) { 3228 return errors.Errorf("reference sequence %d out of range", seq) 3229 } 3230 r.toLoad[seq] = append(r.toLoad[seq], loadIdx{off, seriesEntry, chunk}) 3231 return nil 3232 } 3233 3234 // load loads all added chunks and saves resulting aggrs to refs. 3235 func (r *bucketChunkReader) load(ctx context.Context, res []seriesEntry, aggrs []storepb.Aggr, calculateChunkChecksum bool, bytesLimiter BytesLimiter) error { 3236 r.loadingChunksMtx.Lock() 3237 r.loadingChunks = true 3238 r.loadingChunksMtx.Unlock() 3239 3240 defer func() { 3241 r.loadingChunksMtx.Lock() 3242 r.loadingChunks = false 3243 r.loadingChunksMtx.Unlock() 3244 3245 close(r.finishLoadingChks) 3246 }() 3247 3248 g, ctx := errgroup.WithContext(ctx) 3249 3250 for seq, pIdxs := range r.toLoad { 3251 sort.Slice(pIdxs, func(i, j int) bool { 3252 return pIdxs[i].offset < pIdxs[j].offset 3253 }) 3254 parts := r.block.partitioner.Partition(len(pIdxs), func(i int) (start, end uint64) { 3255 return uint64(pIdxs[i].offset), uint64(pIdxs[i].offset) + uint64(r.block.estimatedMaxChunkSize) 3256 }) 3257 3258 for _, p := range parts { 3259 if err := bytesLimiter.Reserve(uint64(p.End - p.Start)); err != nil { 3260 return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching chunks: %s", err) 3261 } 3262 r.stats.DataDownloadedSizeSum += units.Base2Bytes(p.End - p.Start) 3263 } 3264 3265 for _, p := range parts { 3266 seq := seq 3267 p := p 3268 indices := pIdxs[p.ElemRng[0]:p.ElemRng[1]] 3269 g.Go(func() error { 3270 return r.loadChunks(ctx, res, aggrs, seq, p, indices, calculateChunkChecksum, bytesLimiter) 3271 }) 3272 } 3273 } 3274 return g.Wait() 3275 } 3276 3277 // loadChunks will read range [start, end] from the segment file with sequence number seq. 3278 // This data range covers chunks starting at supplied offsets. 
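//
// A sketch of the segment-file layout the parser below relies on: every chunk is
// encoded as
//
//	<uvarint data length> <1-byte encoding> <data> <4-byte CRC32>
//
// and only the first three fields are read; the trailing CRC32 is skipped.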
3279 func (r *bucketChunkReader) loadChunks(ctx context.Context, res []seriesEntry, aggrs []storepb.Aggr, seq int, part Part, pIdxs []loadIdx, calculateChunkChecksum bool, bytesLimiter BytesLimiter) error { 3280 var locked bool 3281 fetchBegin := time.Now() 3282 defer func() { 3283 if !locked { 3284 r.mtx.Lock() 3285 } 3286 r.stats.ChunksFetchDurationSum += time.Since(fetchBegin) 3287 r.mtx.Unlock() 3288 }() 3289 3290 // Get a reader for the required range. 3291 reader, err := r.block.chunkRangeReader(ctx, seq, int64(part.Start), int64(part.End-part.Start)) 3292 if err != nil { 3293 return errors.Wrap(err, "get range reader") 3294 } 3295 defer runutil.CloseWithLogOnErr(r.block.logger, reader, "readChunkRange close range reader") 3296 bufReader := bufio.NewReaderSize(reader, r.block.estimatedMaxChunkSize) 3297 3298 locked = true 3299 r.mtx.Lock() 3300 3301 r.stats.chunksFetchCount++ 3302 r.stats.chunksFetched += len(pIdxs) 3303 r.stats.ChunksFetchedSizeSum += units.Base2Bytes(int(part.End - part.Start)) 3304 3305 var ( 3306 buf []byte 3307 readOffset = int(pIdxs[0].offset) 3308 3309 // Save a few allocations. 3310 written int 3311 diff uint32 3312 chunkLen int 3313 n int 3314 ) 3315 3316 bufPooled, err := r.block.chunkPool.Get(r.block.estimatedMaxChunkSize) 3317 if err == nil { 3318 buf = *bufPooled 3319 } else { 3320 buf = make([]byte, r.block.estimatedMaxChunkSize) 3321 } 3322 defer r.block.chunkPool.Put(&buf) 3323 3324 for i, pIdx := range pIdxs { 3325 // Fast forward range reader to the next chunk start in case of sparse (for our purposes) byte range. 3326 for readOffset < int(pIdx.offset) { 3327 written, err = bufReader.Discard(int(pIdx.offset) - int(readOffset)) 3328 if err != nil { 3329 return errors.Wrap(err, "fast forward range reader") 3330 } 3331 readOffset += written 3332 } 3333 // Presume chunk length to be reasonably large for common use cases. 3334 // However, declaration for EstimatedMaxChunkSize warns us some chunks could be larger in some rare cases. 3335 // This is handled further down below. 3336 chunkLen = r.block.estimatedMaxChunkSize 3337 if i+1 < len(pIdxs) { 3338 if diff = pIdxs[i+1].offset - pIdx.offset; int(diff) < chunkLen { 3339 chunkLen = int(diff) 3340 } 3341 } 3342 cb := buf[:chunkLen] 3343 n, err = io.ReadFull(bufReader, cb) 3344 readOffset += n 3345 // Unexpected EOF for last chunk could be a valid case. Any other errors are definitely real. 3346 if err != nil && !(errors.Is(err, io.ErrUnexpectedEOF) && i == len(pIdxs)-1) { 3347 return errors.Wrapf(err, "read range for seq %d offset %x", seq, pIdx.offset) 3348 } 3349 3350 chunkDataLen, n := binary.Uvarint(cb) 3351 if n < 1 { 3352 return errors.New("reading chunk length failed") 3353 } 3354 3355 // Chunk length is n (number of bytes used to encode chunk data), 1 for chunk encoding and chunkDataLen for actual chunk data. 3356 // There is also crc32 after the chunk, but we ignore that. 3357 chunkLen = n + 1 + int(chunkDataLen) 3358 if chunkLen <= len(cb) { 3359 err = populateChunk(&(res[pIdx.seriesEntry].chks[pIdx.chunk]), rawChunk(cb[n:chunkLen]), aggrs, r.save, calculateChunkChecksum) 3360 if err != nil { 3361 return errors.Wrap(err, "populate chunk") 3362 } 3363 r.stats.chunksTouched++ 3364 r.stats.ChunksTouchedSizeSum += units.Base2Bytes(int(chunkDataLen)) 3365 continue 3366 } 3367 3368 r.block.metrics.chunkRefetches.Inc() 3369 // If we didn't fetch enough data for the chunk, fetch more. 3370 fetchBegin = time.Now() 3371 // Read entire chunk into new buffer. 
3372 // TODO: readChunkRange call could be avoided for any chunk but last in this particular part. 3373 if err := bytesLimiter.Reserve(uint64(chunkLen)); err != nil { 3374 return httpgrpc.Errorf(int(codes.ResourceExhausted), "exceeded bytes limit while fetching chunks: %s", err) 3375 } 3376 r.stats.DataDownloadedSizeSum += units.Base2Bytes(chunkLen) 3377 r.mtx.Unlock() 3378 locked = false 3379 3380 nb, err := r.block.readChunkRange(ctx, seq, int64(pIdx.offset), int64(chunkLen), []byteRange{{offset: 0, length: chunkLen}}) 3381 if err != nil { 3382 return errors.Wrapf(err, "preloaded chunk too small, expecting %d, and failed to fetch full chunk", chunkLen) 3383 } 3384 if len(*nb) != chunkLen { 3385 return errors.Errorf("preloaded chunk too small, expecting %d", chunkLen) 3386 } 3387 3388 r.mtx.Lock() 3389 locked = true 3390 3391 r.stats.chunksFetchCount++ 3392 r.stats.ChunksFetchedSizeSum += units.Base2Bytes(len(*nb)) 3393 err = populateChunk(&(res[pIdx.seriesEntry].chks[pIdx.chunk]), rawChunk((*nb)[n:]), aggrs, r.save, calculateChunkChecksum) 3394 if err != nil { 3395 r.block.chunkPool.Put(nb) 3396 return errors.Wrap(err, "populate chunk") 3397 } 3398 r.stats.chunksTouched++ 3399 r.stats.ChunksTouchedSizeSum += units.Base2Bytes(int(chunkDataLen)) 3400 3401 r.block.chunkPool.Put(nb) 3402 } 3403 return nil 3404 } 3405 3406 // save saves a copy of b's payload to a memory pool of its own and returns a new byte slice referencing said copy. 3407 // Returned slice becomes invalid once r.block.chunkPool.Put() is called. 3408 func (r *bucketChunkReader) save(b []byte) ([]byte, error) { 3409 // Ensure we never grow slab beyond original capacity. 3410 if len(r.chunkBytes) == 0 || 3411 cap(*r.chunkBytes[len(r.chunkBytes)-1])-len(*r.chunkBytes[len(r.chunkBytes)-1]) < len(b) { 3412 s, err := r.block.chunkPool.Get(len(b)) 3413 if err != nil { 3414 return nil, errors.Wrap(err, "allocate chunk bytes") 3415 } 3416 r.chunkBytes = append(r.chunkBytes, s) 3417 } 3418 slab := r.chunkBytes[len(r.chunkBytes)-1] 3419 *slab = append(*slab, b...) 3420 return (*slab)[len(*slab)-len(b):], nil 3421 } 3422 3423 // rawChunk is a helper type that wraps a chunk's raw bytes and implements the chunkenc.Chunk 3424 // interface over it. 3425 // It is used to Store API responses which don't need to introspect and validate the chunk's contents. 
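//
// A minimal sketch of how it is produced in loadChunks above (cb, n and chunkLen
// refer to the local variables there):
//
//	c := rawChunk(cb[n:chunkLen]) // cb[n] is the encoding byte, the rest is chunk data
//	_ = c.Encoding()              // first byte
//	_ = c.Bytes()                 // remaining bytes, forwarded as-is in the Series response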
// rawChunk is a helper type that wraps a chunk's raw bytes and implements the chunkenc.Chunk
// interface over it.
// It is used for Store API responses, which don't need to introspect and validate the chunk's contents.
type rawChunk []byte

func (b rawChunk) Encoding() chunkenc.Encoding {
	return chunkenc.Encoding(b[0])
}

func (b rawChunk) Bytes() []byte {
	return b[1:]
}

func (b rawChunk) Compact() {}

func (b rawChunk) Iterator(_ chunkenc.Iterator) chunkenc.Iterator {
	panic("invalid call")
}

func (b rawChunk) Appender() (chunkenc.Appender, error) {
	panic("invalid call")
}

func (b rawChunk) NumSamples() int {
	panic("invalid call")
}
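// Illustrative sketch, not part of the upstream file: rawChunk deliberately panics on Iterator,
// Appender and NumSamples because the store gateway only forwards chunk bytes and never decodes
// samples. A hypothetical consumer that does need the samples could rebuild a decodable chunk
// from the same bytes, assuming a chunkenc-supported encoding:
func decodeRawChunk(rc rawChunk) (chunkenc.Chunk, error) {
	// Encoding() is the first byte and Bytes() the remaining payload, matching the
	// layout that populateChunk receives from loadChunks above.
	return chunkenc.FromData(rc.Encoding(), rc.Bytes())
}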
type queryStats struct {
	blocksQueried int

	postingsTouched          int
	PostingsTouchedSizeSum   units.Base2Bytes
	postingsToFetch          int
	postingsFetched          int
	PostingsFetchedSizeSum   units.Base2Bytes
	postingsFetchCount       int
	PostingsFetchDurationSum time.Duration

	cachedPostingsCompressions         int
	cachedPostingsCompressionErrors    int
	CachedPostingsOriginalSizeSum      units.Base2Bytes
	CachedPostingsCompressedSizeSum    units.Base2Bytes
	CachedPostingsCompressionTimeSum   time.Duration
	cachedPostingsDecompressions       int
	cachedPostingsDecompressionErrors  int
	CachedPostingsDecompressionTimeSum time.Duration

	seriesTouched          int
	SeriesTouchedSizeSum   units.Base2Bytes
	seriesFetched          int
	SeriesFetchedSizeSum   units.Base2Bytes
	seriesFetchCount       int
	SeriesFetchDurationSum time.Duration

	chunksTouched          int
	ChunksTouchedSizeSum   units.Base2Bytes
	chunksFetched          int
	ChunksFetchedSizeSum   units.Base2Bytes
	chunksFetchCount       int
	ChunksFetchDurationSum time.Duration

	GetAllDuration    time.Duration
	mergedSeriesCount int
	mergedChunksCount int
	MergeDuration     time.Duration

	DataDownloadedSizeSum units.Base2Bytes
}

func (s queryStats) merge(o *queryStats) *queryStats {
	s.blocksQueried += o.blocksQueried

	s.postingsToFetch += o.postingsToFetch
	s.postingsTouched += o.postingsTouched
	s.PostingsTouchedSizeSum += o.PostingsTouchedSizeSum
	s.postingsFetched += o.postingsFetched
	s.PostingsFetchedSizeSum += o.PostingsFetchedSizeSum
	s.postingsFetchCount += o.postingsFetchCount
	s.PostingsFetchDurationSum += o.PostingsFetchDurationSum

	s.cachedPostingsCompressions += o.cachedPostingsCompressions
	s.cachedPostingsCompressionErrors += o.cachedPostingsCompressionErrors
	s.CachedPostingsOriginalSizeSum += o.CachedPostingsOriginalSizeSum
	s.CachedPostingsCompressedSizeSum += o.CachedPostingsCompressedSizeSum
	s.CachedPostingsCompressionTimeSum += o.CachedPostingsCompressionTimeSum
	s.cachedPostingsDecompressions += o.cachedPostingsDecompressions
	s.cachedPostingsDecompressionErrors += o.cachedPostingsDecompressionErrors
	s.CachedPostingsDecompressionTimeSum += o.CachedPostingsDecompressionTimeSum

	s.seriesTouched += o.seriesTouched
	s.SeriesTouchedSizeSum += o.SeriesTouchedSizeSum
	s.seriesFetched += o.seriesFetched
	s.SeriesFetchedSizeSum += o.SeriesFetchedSizeSum
	s.seriesFetchCount += o.seriesFetchCount
	s.SeriesFetchDurationSum += o.SeriesFetchDurationSum

	s.chunksTouched += o.chunksTouched
	s.ChunksTouchedSizeSum += o.ChunksTouchedSizeSum
	s.chunksFetched += o.chunksFetched
	s.ChunksFetchedSizeSum += o.ChunksFetchedSizeSum
	s.chunksFetchCount += o.chunksFetchCount
	s.ChunksFetchDurationSum += o.ChunksFetchDurationSum

	s.GetAllDuration += o.GetAllDuration
	s.mergedSeriesCount += o.mergedSeriesCount
	s.mergedChunksCount += o.mergedChunksCount
	s.MergeDuration += o.MergeDuration

	s.DataDownloadedSizeSum += o.DataDownloadedSizeSum

	return &s
}

func (s queryStats) toHints() *hintspb.QueryStats {
	return &hintspb.QueryStats{
		BlocksQueried:          int64(s.blocksQueried),
		PostingsTouched:        int64(s.postingsTouched),
		PostingsTouchedSizeSum: int64(s.PostingsTouchedSizeSum),
		PostingsToFetch:        int64(s.postingsToFetch),
		PostingsFetched:        int64(s.postingsFetched),
		PostingsFetchedSizeSum: int64(s.PostingsFetchedSizeSum),
		PostingsFetchCount:     int64(s.postingsFetchCount),
		SeriesTouched:          int64(s.seriesTouched),
		SeriesTouchedSizeSum:   int64(s.SeriesTouchedSizeSum),
		SeriesFetched:          int64(s.seriesFetched),
		SeriesFetchedSizeSum:   int64(s.SeriesFetchedSizeSum),
		SeriesFetchCount:       int64(s.seriesFetchCount),
		ChunksTouched:          int64(s.chunksTouched),
		ChunksTouchedSizeSum:   int64(s.ChunksTouchedSizeSum),
		ChunksFetched:          int64(s.chunksFetched),
		ChunksFetchedSizeSum:   int64(s.ChunksFetchedSizeSum),
		ChunksFetchCount:       int64(s.chunksFetchCount),
		MergedSeriesCount:      int64(s.mergedSeriesCount),
		MergedChunksCount:      int64(s.mergedChunksCount),
		DataDownloadedSizeSum:  int64(s.DataDownloadedSizeSum),
	}
}

// NewDefaultChunkBytesPool returns a chunk bytes pool with default settings.
func NewDefaultChunkBytesPool(maxChunkPoolBytes uint64) (pool.Bytes, error) {
	return pool.NewBucketedBytes(chunkBytesPoolMinSize, chunkBytesPoolMaxSize, 2, maxChunkPoolBytes)
}
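// Illustrative sketch, not part of the upstream file: with a growth factor of 2 the bucketed pool
// serves slabs sized 64 KiB, 128 KiB, ..., up to 64 MiB, with total usage capped at
// maxChunkPoolBytes. The hypothetical helper below only demonstrates the Get/Put contract used
// throughout this file; the 1 GiB cap is an arbitrary example value.
func exampleChunkBytesPoolUsage() error {
	chunkPool, err := NewDefaultChunkBytesPool(1 * 1024 * 1024 * 1024)
	if err != nil {
		return err
	}
	// Get returns a pointer to a slice whose capacity is at least EstimatedMaxChunkSize.
	buf, err := chunkPool.Get(EstimatedMaxChunkSize)
	if err != nil {
		return err
	}
	defer chunkPool.Put(buf)
	// Use *buf as a scratch buffer for reading chunk bytes...
	_ = (*buf)[:0]
	return nil
}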