github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/storage/tsdb/bucketindex/loader.go

github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/storage/tsdb/bucketindex/loader.go (about)

     1  package bucketindex
     2  
     3  import (
     4  	"context"
     5  	"sync"
     6  	"time"
     7  
     8  	"github.com/go-kit/log"
     9  	"github.com/go-kit/log/level"
    10  	"github.com/grafana/dskit/services"
    11  	"github.com/pkg/errors"
    12  	"github.com/prometheus/client_golang/prometheus"
    13  	"github.com/prometheus/client_golang/prometheus/promauto"
    14  	"github.com/thanos-io/thanos/pkg/objstore"
    15  	"go.uber.org/atomic"
    16  
    17  	"github.com/cortexproject/cortex/pkg/storage/bucket"
    18  	"github.com/cortexproject/cortex/pkg/util"
    19  )
    20  
    21  const (
    22  	// readIndexTimeout is the maximum allowed time when reading a single bucket index
    23  	// from the storage. It's hard-coded to a reasonably high value.
    24  	readIndexTimeout = 15 * time.Second
    25  )
    26  
    27  type LoaderConfig struct {
    28  	CheckInterval         time.Duration
    29  	UpdateOnStaleInterval time.Duration
    30  	UpdateOnErrorInterval time.Duration
    31  	IdleTimeout           time.Duration
    32  }
    33  
    34  // Loader is responsible to lazy load bucket indexes and, once loaded for the first time,
    35  // keep them updated in background. Loaded indexes are automatically offloaded once the
    36  // idle timeout expires.
    37  type Loader struct {
    38  	services.Service
    39  
    40  	bkt         objstore.Bucket
    41  	logger      log.Logger
    42  	cfg         LoaderConfig
    43  	cfgProvider bucket.TenantConfigProvider
    44  
    45  	indexesMx sync.RWMutex
    46  	indexes   map[string]*cachedIndex
    47  
    48  	// Metrics.
    49  	loadAttempts prometheus.Counter
    50  	loadFailures prometheus.Counter
    51  	loadDuration prometheus.Histogram
    52  	loaded       prometheus.GaugeFunc
    53  }
    54  
    55  // NewLoader makes a new Loader.
    56  func NewLoader(cfg LoaderConfig, bucketClient objstore.Bucket, cfgProvider bucket.TenantConfigProvider, logger log.Logger, reg prometheus.Registerer) *Loader {
    57  	l := &Loader{
    58  		bkt:         bucketClient,
    59  		logger:      logger,
    60  		cfg:         cfg,
    61  		cfgProvider: cfgProvider,
    62  		indexes:     map[string]*cachedIndex{},
    63  
    64  		loadAttempts: promauto.With(reg).NewCounter(prometheus.CounterOpts{
    65  			Name: "cortex_bucket_index_loads_total",
    66  			Help: "Total number of bucket index loading attempts.",
    67  		}),
    68  		loadFailures: promauto.With(reg).NewCounter(prometheus.CounterOpts{
    69  			Name: "cortex_bucket_index_load_failures_total",
    70  			Help: "Total number of bucket index loading failures.",
    71  		}),
    72  		loadDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
    73  			Name:    "cortex_bucket_index_load_duration_seconds",
    74  			Help:    "Duration of the a single bucket index loading operation in seconds.",
    75  			Buckets: []float64{0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 1, 10},
    76  		}),
    77  	}
    78  
    79  	l.loaded = promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
    80  		Name: "cortex_bucket_index_loaded",
    81  		Help: "Number of bucket indexes currently loaded in-memory.",
    82  	}, l.countLoadedIndexesMetric)
    83  
    84  	// Apply a jitter to the sync frequency in order to increase the probability
    85  	// of hitting the shared cache (if any).
    86  	checkInterval := util.DurationWithJitter(cfg.CheckInterval, 0.2)
    87  	l.Service = services.NewTimerService(checkInterval, nil, l.checkCachedIndexes, nil)
    88  
    89  	return l
    90  }
    91  
    92  // GetIndex returns the bucket index for the given user. It returns the in-memory cached
    93  // index if available, or load it from the bucket otherwise.
    94  func (l *Loader) GetIndex(ctx context.Context, userID string) (*Index, error) {
    95  	l.indexesMx.RLock()
    96  	if entry := l.indexes[userID]; entry != nil {
    97  		idx := entry.index
    98  		err := entry.err
    99  		l.indexesMx.RUnlock()
   100  
   101  		// We don't check if the index is stale because it's the responsibility
   102  		// of the background job to keep it updated.
   103  		entry.requestedAt.Store(time.Now().Unix())
   104  		return idx, err
   105  	}
   106  	l.indexesMx.RUnlock()
   107  
   108  	startTime := time.Now()
   109  	l.loadAttempts.Inc()
   110  	idx, err := ReadIndex(ctx, l.bkt, userID, l.cfgProvider, l.logger)
   111  	if err != nil {
   112  		// Cache the error, to avoid hammering the object store in case of persistent issues
   113  		// (eg. corrupted bucket index or not existing).
   114  		l.cacheIndex(userID, nil, err)
   115  
   116  		if errors.Is(err, ErrIndexNotFound) {
   117  			level.Warn(l.logger).Log("msg", "bucket index not found", "user", userID)
   118  		} else {
   119  			// We don't track ErrIndexNotFound as failure because it's a legit case (eg. a tenant just
   120  			// started to remote write and its blocks haven't uploaded to storage yet).
   121  			l.loadFailures.Inc()
   122  			level.Error(l.logger).Log("msg", "unable to load bucket index", "user", userID, "err", err)
   123  		}
   124  
   125  		return nil, err
   126  	}
   127  
   128  	// Cache the index.
   129  	l.cacheIndex(userID, idx, nil)
   130  
   131  	elapsedTime := time.Since(startTime)
   132  	l.loadDuration.Observe(elapsedTime.Seconds())
   133  	level.Info(l.logger).Log("msg", "loaded bucket index", "user", userID, "duration", elapsedTime)
   134  	return idx, nil
   135  }
   136  
   137  func (l *Loader) cacheIndex(userID string, idx *Index, err error) {
   138  	l.indexesMx.Lock()
   139  	defer l.indexesMx.Unlock()
   140  
   141  	// Not an issue if, due to concurrency, another index was already cached
   142  	// and we overwrite it: last will win.
   143  	l.indexes[userID] = newCachedIndex(idx, err)
   144  }
   145  
   146  // checkCachedIndexes checks all cached indexes and, for each of them, does two things:
   147  // 1. Offload indexes not requested since >= idle timeout
   148  // 2. Update indexes which have been updated last time since >= update timeout
   149  func (l *Loader) checkCachedIndexes(ctx context.Context) error {
   150  	// Build a list of users for which we should update or delete the index.
   151  	toUpdate, toDelete := l.checkCachedIndexesToUpdateAndDelete()
   152  
   153  	// Delete unused indexes.
   154  	for _, userID := range toDelete {
   155  		l.deleteCachedIndex(userID)
   156  	}
   157  
   158  	// Update actively used indexes.
   159  	for _, userID := range toUpdate {
   160  		l.updateCachedIndex(ctx, userID)
   161  	}
   162  
   163  	// Never return error, otherwise the service terminates.
   164  	return nil
   165  }
   166  
   167  func (l *Loader) checkCachedIndexesToUpdateAndDelete() (toUpdate, toDelete []string) {
   168  	now := time.Now()
   169  
   170  	l.indexesMx.RLock()
   171  	defer l.indexesMx.RUnlock()
   172  
   173  	for userID, entry := range l.indexes {
   174  		// Given ErrIndexNotFound is a legit case and assuming UpdateOnErrorInterval is lower than
   175  		// UpdateOnStaleInterval, we don't consider ErrIndexNotFound as an error with regards to the
   176  		// refresh interval and so it will updated once stale.
   177  		isError := entry.err != nil && !errors.Is(entry.err, ErrIndexNotFound)
   178  
   179  		switch {
   180  		case now.Sub(entry.getRequestedAt()) >= l.cfg.IdleTimeout:
   181  			toDelete = append(toDelete, userID)
   182  		case isError && now.Sub(entry.getUpdatedAt()) >= l.cfg.UpdateOnErrorInterval:
   183  			toUpdate = append(toUpdate, userID)
   184  		case !isError && now.Sub(entry.getUpdatedAt()) >= l.cfg.UpdateOnStaleInterval:
   185  			toUpdate = append(toUpdate, userID)
   186  		}
   187  	}
   188  
   189  	return
   190  }
   191  
   192  func (l *Loader) updateCachedIndex(ctx context.Context, userID string) {
   193  	readCtx, cancel := context.WithTimeout(ctx, readIndexTimeout)
   194  	defer cancel()
   195  
   196  	l.loadAttempts.Inc()
   197  	startTime := time.Now()
   198  	idx, err := ReadIndex(readCtx, l.bkt, userID, l.cfgProvider, l.logger)
   199  	if err != nil && !errors.Is(err, ErrIndexNotFound) {
   200  		l.loadFailures.Inc()
   201  		level.Warn(l.logger).Log("msg", "unable to update bucket index", "user", userID, "err", err)
   202  		return
   203  	}
   204  
   205  	l.loadDuration.Observe(time.Since(startTime).Seconds())
   206  
   207  	// We cache it either it was successfully refreshed or wasn't found. An use case for caching the ErrIndexNotFound
   208  	// is when a tenant has rules configured but hasn't started remote writing yet. Rules will be evaluated and
   209  	// bucket index loaded by the ruler.
   210  	l.indexesMx.Lock()
   211  	l.indexes[userID].index = idx
   212  	l.indexes[userID].err = err
   213  	l.indexes[userID].setUpdatedAt(startTime)
   214  	l.indexesMx.Unlock()
   215  }
   216  
   217  func (l *Loader) deleteCachedIndex(userID string) {
   218  	l.indexesMx.Lock()
   219  	delete(l.indexes, userID)
   220  	l.indexesMx.Unlock()
   221  
   222  	level.Info(l.logger).Log("msg", "unloaded bucket index", "user", userID, "reason", "idle")
   223  }
   224  
   225  func (l *Loader) countLoadedIndexesMetric() float64 {
   226  	l.indexesMx.RLock()
   227  	defer l.indexesMx.RUnlock()
   228  
   229  	count := 0
   230  	for _, idx := range l.indexes {
   231  		if idx.index != nil {
   232  			count++
   233  		}
   234  	}
   235  	return float64(count)
   236  }
   237  
   238  type cachedIndex struct {
   239  	// We cache either the index or the error occurred while fetching it. They're
   240  	// mutually exclusive.
   241  	index *Index
   242  	err   error
   243  
   244  	// Unix timestamp (seconds) of when the index has been updated from the storage the last time.
   245  	updatedAt atomic.Int64
   246  
   247  	// Unix timestamp (seconds) of when the index has been requested the last time.
   248  	requestedAt atomic.Int64
   249  }
   250  
   251  func newCachedIndex(idx *Index, err error) *cachedIndex {
   252  	entry := &cachedIndex{
   253  		index: idx,
   254  		err:   err,
   255  	}
   256  
   257  	now := time.Now()
   258  	entry.setUpdatedAt(now)
   259  	entry.setRequestedAt(now)
   260  
   261  	return entry
   262  }
   263  
   264  func (i *cachedIndex) setUpdatedAt(ts time.Time) {
   265  	i.updatedAt.Store(ts.Unix())
   266  }
   267  
   268  func (i *cachedIndex) getUpdatedAt() time.Time {
   269  	return time.Unix(i.updatedAt.Load(), 0)
   270  }
   271  
   272  func (i *cachedIndex) setRequestedAt(ts time.Time) {
   273  	i.requestedAt.Store(ts.Unix())
   274  }
   275  
   276  func (i *cachedIndex) getRequestedAt() time.Time {
   277  	return time.Unix(i.requestedAt.Load(), 0)
   278  }