github.com/grafana/pyroscope@v1.18.0/pkg/phlaredb/bucketindex/loader.go (about)

     1  // SPDX-License-Identifier: AGPL-3.0-only
     2  // Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/storage/tsdb/bucketindex/loader.go
     3  // Provenance-includes-license: Apache-2.0
     4  // Provenance-includes-copyright: The Cortex Authors.
     5  
     6  package bucketindex
     7  
     8  import (
     9  	"context"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/go-kit/log"
    14  	"github.com/go-kit/log/level"
    15  	"github.com/grafana/dskit/services"
    16  	"github.com/pkg/errors"
    17  	"github.com/prometheus/client_golang/prometheus"
    18  	"github.com/prometheus/client_golang/prometheus/promauto"
    19  	"go.uber.org/atomic"
    20  
    21  	"github.com/grafana/pyroscope/pkg/objstore"
    22  	"github.com/grafana/pyroscope/pkg/util"
    23  )
    24  
// LoaderConfig holds the configuration for the Loader background behaviour.
type LoaderConfig struct {
	// CheckInterval is how often the background job scans cached indexes
	// to update or offload them (a jitter is applied in NewLoader).
	CheckInterval time.Duration

	// UpdateOnStaleInterval is how long a successfully loaded index is kept
	// before being refreshed from the bucket.
	UpdateOnStaleInterval time.Duration

	// UpdateOnErrorInterval is how long a cached load error (other than
	// ErrIndexNotFound) is kept before retrying; expected to be lower than
	// UpdateOnStaleInterval.
	UpdateOnErrorInterval time.Duration

	// IdleTimeout is how long an index may go unrequested before it's
	// offloaded from memory.
	IdleTimeout time.Duration
}
    31  
    32  // Loader is responsible to lazy load bucket indexes and, once loaded for the first time,
    33  // keep them updated in background. Loaded indexes are automatically offloaded once the
    34  // idle timeout expires.
// Loader is responsible to lazy load bucket indexes and, once loaded for the first time,
// keep them updated in background. Loaded indexes are automatically offloaded once the
// idle timeout expires.
type Loader struct {
	services.Service

	bkt         objstore.Bucket
	logger      log.Logger
	cfg         LoaderConfig
	cfgProvider objstore.TenantConfigProvider

	// indexesMx guards indexes and the index/err fields of its entries;
	// the timestamp fields of cachedIndex are atomics and don't need it.
	indexesMx sync.RWMutex
	indexes   map[string]*cachedIndex

	// Metrics.
	loadAttempts prometheus.Counter
	loadFailures prometheus.Counter
	loadDuration prometheus.Histogram
	loaded       prometheus.GaugeFunc
}
    52  
    53  // NewLoader makes a new Loader.
    54  func NewLoader(cfg LoaderConfig, bucketClient objstore.Bucket, cfgProvider objstore.TenantConfigProvider, logger log.Logger, reg prometheus.Registerer) *Loader {
    55  	l := &Loader{
    56  		bkt:         bucketClient,
    57  		logger:      logger,
    58  		cfg:         cfg,
    59  		cfgProvider: cfgProvider,
    60  		indexes:     map[string]*cachedIndex{},
    61  
    62  		loadAttempts: promauto.With(reg).NewCounter(prometheus.CounterOpts{
    63  			Name: "pyroscope_bucket_index_loads_total",
    64  			Help: "Total number of bucket index loading attempts.",
    65  		}),
    66  		loadFailures: promauto.With(reg).NewCounter(prometheus.CounterOpts{
    67  			Name: "pyroscope_bucket_index_load_failures_total",
    68  			Help: "Total number of bucket index loading failures.",
    69  		}),
    70  		loadDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
    71  			Name:    "pyroscope_bucket_index_load_duration_seconds",
    72  			Help:    "Duration of the a single bucket index loading operation in seconds.",
    73  			Buckets: []float64{0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 1, 10},
    74  		}),
    75  	}
    76  
    77  	l.loaded = promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
    78  		Name: "pyroscope_bucket_index_loaded",
    79  		Help: "Number of bucket indexes currently loaded in-memory.",
    80  	}, l.countLoadedIndexesMetric)
    81  
    82  	// Apply a jitter to the sync frequency in order to increase the probability
    83  	// of hitting the shared cache (if any).
    84  	checkInterval := util.DurationWithJitter(cfg.CheckInterval, 0.2)
    85  	l.Service = services.NewTimerService(checkInterval, nil, l.checkCachedIndexes, nil)
    86  
    87  	return l
    88  }
    89  
    90  // GetIndex returns the bucket index for the given user. It returns the in-memory cached
    91  // index if available, or load it from the bucket otherwise.
    92  func (l *Loader) GetIndex(ctx context.Context, userID string) (*Index, error) {
    93  	l.indexesMx.RLock()
    94  	if entry := l.indexes[userID]; entry != nil {
    95  		idx := entry.index
    96  		err := entry.err
    97  		l.indexesMx.RUnlock()
    98  
    99  		// We don't check if the index is stale because it's the responsibility
   100  		// of the background job to keep it updated.
   101  		entry.requestedAt.Store(time.Now().Unix())
   102  		return idx, err
   103  	}
   104  	l.indexesMx.RUnlock()
   105  
   106  	startTime := time.Now()
   107  	l.loadAttempts.Inc()
   108  	idx, err := ReadIndex(ctx, l.bkt, userID, l.cfgProvider, l.logger)
   109  	if err != nil {
   110  		// Cache the error, to avoid hammering the object store in case of persistent issues
   111  		// (eg. corrupted bucket index or not existing).
   112  		l.cacheIndex(userID, nil, err)
   113  
   114  		if errors.Is(err, ErrIndexNotFound) {
   115  			level.Warn(l.logger).Log("msg", "bucket index not found", "tenant", userID)
   116  		} else {
   117  			// We don't track ErrIndexNotFound as failure because it's a legit case (eg. a tenant just
   118  			// started to remote write and its blocks haven't uploaded to storage yet).
   119  			l.loadFailures.Inc()
   120  			level.Error(l.logger).Log("msg", "unable to load bucket index", "tenant", userID, "err", err)
   121  		}
   122  
   123  		return nil, err
   124  	}
   125  
   126  	// Cache the index.
   127  	l.cacheIndex(userID, idx, nil)
   128  
   129  	elapsedTime := time.Since(startTime)
   130  	l.loadDuration.Observe(elapsedTime.Seconds())
   131  	level.Info(l.logger).Log("msg", "loaded bucket index", "tenant", userID, "duration", elapsedTime)
   132  	return idx, nil
   133  }
   134  
   135  func (l *Loader) cacheIndex(userID string, idx *Index, err error) {
   136  	l.indexesMx.Lock()
   137  	defer l.indexesMx.Unlock()
   138  
   139  	// Not an issue if, due to concurrency, another index was already cached
   140  	// and we overwrite it: last will win.
   141  	l.indexes[userID] = newCachedIndex(idx, err)
   142  }
   143  
   144  // checkCachedIndexes checks all cached indexes and, for each of them, does two things:
   145  // 1. Offload indexes not requested since >= idle timeout
   146  // 2. Update indexes which have been updated last time since >= update timeout
   147  func (l *Loader) checkCachedIndexes(ctx context.Context) error {
   148  	// Build a list of users for which we should update or delete the index.
   149  	toUpdate, toDelete := l.checkCachedIndexesToUpdateAndDelete()
   150  
   151  	// Delete unused indexes.
   152  	for _, userID := range toDelete {
   153  		l.deleteCachedIndex(userID)
   154  	}
   155  
   156  	// Update actively used indexes.
   157  	for _, userID := range toUpdate {
   158  		l.updateCachedIndex(ctx, userID)
   159  	}
   160  
   161  	// Never return error, otherwise the service terminates.
   162  	return nil
   163  }
   164  
   165  func (l *Loader) checkCachedIndexesToUpdateAndDelete() (toUpdate, toDelete []string) {
   166  	now := time.Now()
   167  
   168  	l.indexesMx.RLock()
   169  	defer l.indexesMx.RUnlock()
   170  
   171  	for userID, entry := range l.indexes {
   172  		// Given ErrIndexNotFound is a legit case and assuming UpdateOnErrorInterval is lower than
   173  		// UpdateOnStaleInterval, we don't consider ErrIndexNotFound as an error with regards to the
   174  		// refresh interval and so it will updated once stale.
   175  		isError := entry.err != nil && !errors.Is(entry.err, ErrIndexNotFound)
   176  
   177  		switch {
   178  		case now.Sub(entry.getRequestedAt()) >= l.cfg.IdleTimeout:
   179  			toDelete = append(toDelete, userID)
   180  		case isError && now.Sub(entry.getUpdatedAt()) >= l.cfg.UpdateOnErrorInterval:
   181  			toUpdate = append(toUpdate, userID)
   182  		case !isError && now.Sub(entry.getUpdatedAt()) >= l.cfg.UpdateOnStaleInterval:
   183  			toUpdate = append(toUpdate, userID)
   184  		}
   185  	}
   186  
   187  	return
   188  }
   189  
   190  func (l *Loader) updateCachedIndex(ctx context.Context, userID string) {
   191  	l.loadAttempts.Inc()
   192  	startTime := time.Now()
   193  	idx, err := ReadIndex(ctx, l.bkt, userID, l.cfgProvider, l.logger)
   194  	if err != nil && !errors.Is(err, ErrIndexNotFound) {
   195  		l.loadFailures.Inc()
   196  		level.Warn(l.logger).Log("msg", "unable to update bucket index", "tenant", userID, "err", err)
   197  		return
   198  	}
   199  
   200  	l.loadDuration.Observe(time.Since(startTime).Seconds())
   201  
   202  	// We cache it either it was successfully refreshed or wasn't found. An use case for caching the ErrIndexNotFound
   203  	// is when a tenant has rules configured but hasn't started remote writing yet. Rules will be evaluated and
   204  	// bucket index loaded by the ruler.
   205  	l.indexesMx.Lock()
   206  	l.indexes[userID].index = idx
   207  	l.indexes[userID].err = err
   208  	l.indexes[userID].setUpdatedAt(startTime)
   209  	l.indexesMx.Unlock()
   210  }
   211  
// deleteCachedIndex offloads the cached index for userID. The lock is released
// before logging, so the log call never runs inside the critical section.
func (l *Loader) deleteCachedIndex(userID string) {
	l.indexesMx.Lock()
	delete(l.indexes, userID)
	l.indexesMx.Unlock()

	level.Info(l.logger).Log("msg", "unloaded bucket index", "tenant", userID, "reason", "idle")
}
   219  
   220  func (l *Loader) countLoadedIndexesMetric() float64 {
   221  	l.indexesMx.RLock()
   222  	defer l.indexesMx.RUnlock()
   223  
   224  	count := 0
   225  	for _, idx := range l.indexes {
   226  		if idx.index != nil {
   227  			count++
   228  		}
   229  	}
   230  	return float64(count)
   231  }
   232  
// cachedIndex is a single in-memory entry of the Loader's cache.
type cachedIndex struct {
	// We cache either the index or the error occurred while fetching it. They're
	// mutually exclusive. Both fields are guarded by Loader.indexesMx.
	index *Index
	err   error

	// Unix timestamp (seconds) of when the index has been updated from the storage the last time.
	updatedAt atomic.Int64

	// Unix timestamp (seconds) of when the index has been requested the last time.
	requestedAt atomic.Int64
}
   245  
   246  func newCachedIndex(idx *Index, err error) *cachedIndex {
   247  	entry := &cachedIndex{
   248  		index: idx,
   249  		err:   err,
   250  	}
   251  
   252  	now := time.Now()
   253  	entry.setUpdatedAt(now)
   254  	entry.setRequestedAt(now)
   255  
   256  	return entry
   257  }
   258  
   259  func (i *cachedIndex) setUpdatedAt(ts time.Time) {
   260  	i.updatedAt.Store(ts.Unix())
   261  }
   262  
   263  func (i *cachedIndex) getUpdatedAt() time.Time {
   264  	return time.Unix(i.updatedAt.Load(), 0)
   265  }
   266  
   267  func (i *cachedIndex) setRequestedAt(ts time.Time) {
   268  	i.requestedAt.Store(ts.Unix())
   269  }
   270  
   271  func (i *cachedIndex) getRequestedAt() time.Time {
   272  	return time.Unix(i.requestedAt.Load(), 0)
   273  }