// SPDX-License-Identifier: AGPL-3.0-only
// Provenance-includes-location: https://github.com/cortexproject/cortex/blob/master/pkg/storage/tsdb/bucketindex/loader.go
// Provenance-includes-license: Apache-2.0
// Provenance-includes-copyright: The Cortex Authors.

package bucketindex

import (
	"context"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"go.uber.org/atomic"

	"github.com/grafana/pyroscope/pkg/objstore"
	"github.com/grafana/pyroscope/pkg/util"
)

// LoaderConfig holds the tunables driving the Loader's background
// check/update/offload cycle.
type LoaderConfig struct {
	// CheckInterval is how often the background job scans the cached
	// indexes (NewLoader applies a 20% jitter to it).
	CheckInterval time.Duration

	// UpdateOnStaleInterval is how old a successfully loaded index may
	// get before the background job refreshes it from the bucket.
	UpdateOnStaleInterval time.Duration

	// UpdateOnErrorInterval is how long to wait before retrying an index
	// whose last load failed. ErrIndexNotFound is treated as staleness,
	// not as an error, so it follows UpdateOnStaleInterval instead.
	UpdateOnErrorInterval time.Duration

	// IdleTimeout is how long an index may go un-requested before it is
	// offloaded from memory.
	IdleTimeout time.Duration
}

// Loader is responsible to lazy load bucket indexes and, once loaded for the first time,
// keep them updated in background. Loaded indexes are automatically offloaded once the
// idle timeout expires.
type Loader struct {
	services.Service

	bkt         objstore.Bucket
	logger      log.Logger
	cfg         LoaderConfig
	cfgProvider objstore.TenantConfigProvider

	// indexesMx guards the indexes map itself; per-entry timestamps are
	// atomics and may be updated without holding the lock.
	indexesMx sync.RWMutex
	indexes   map[string]*cachedIndex

	// Metrics.
	loadAttempts prometheus.Counter
	loadFailures prometheus.Counter
	loadDuration prometheus.Histogram
	loaded       prometheus.GaugeFunc
}

// NewLoader makes a new Loader.
54 func NewLoader(cfg LoaderConfig, bucketClient objstore.Bucket, cfgProvider objstore.TenantConfigProvider, logger log.Logger, reg prometheus.Registerer) *Loader { 55 l := &Loader{ 56 bkt: bucketClient, 57 logger: logger, 58 cfg: cfg, 59 cfgProvider: cfgProvider, 60 indexes: map[string]*cachedIndex{}, 61 62 loadAttempts: promauto.With(reg).NewCounter(prometheus.CounterOpts{ 63 Name: "pyroscope_bucket_index_loads_total", 64 Help: "Total number of bucket index loading attempts.", 65 }), 66 loadFailures: promauto.With(reg).NewCounter(prometheus.CounterOpts{ 67 Name: "pyroscope_bucket_index_load_failures_total", 68 Help: "Total number of bucket index loading failures.", 69 }), 70 loadDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 71 Name: "pyroscope_bucket_index_load_duration_seconds", 72 Help: "Duration of the a single bucket index loading operation in seconds.", 73 Buckets: []float64{0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 1, 10}, 74 }), 75 } 76 77 l.loaded = promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{ 78 Name: "pyroscope_bucket_index_loaded", 79 Help: "Number of bucket indexes currently loaded in-memory.", 80 }, l.countLoadedIndexesMetric) 81 82 // Apply a jitter to the sync frequency in order to increase the probability 83 // of hitting the shared cache (if any). 84 checkInterval := util.DurationWithJitter(cfg.CheckInterval, 0.2) 85 l.Service = services.NewTimerService(checkInterval, nil, l.checkCachedIndexes, nil) 86 87 return l 88 } 89 90 // GetIndex returns the bucket index for the given user. It returns the in-memory cached 91 // index if available, or load it from the bucket otherwise. 92 func (l *Loader) GetIndex(ctx context.Context, userID string) (*Index, error) { 93 l.indexesMx.RLock() 94 if entry := l.indexes[userID]; entry != nil { 95 idx := entry.index 96 err := entry.err 97 l.indexesMx.RUnlock() 98 99 // We don't check if the index is stale because it's the responsibility 100 // of the background job to keep it updated. 
101 entry.requestedAt.Store(time.Now().Unix()) 102 return idx, err 103 } 104 l.indexesMx.RUnlock() 105 106 startTime := time.Now() 107 l.loadAttempts.Inc() 108 idx, err := ReadIndex(ctx, l.bkt, userID, l.cfgProvider, l.logger) 109 if err != nil { 110 // Cache the error, to avoid hammering the object store in case of persistent issues 111 // (eg. corrupted bucket index or not existing). 112 l.cacheIndex(userID, nil, err) 113 114 if errors.Is(err, ErrIndexNotFound) { 115 level.Warn(l.logger).Log("msg", "bucket index not found", "tenant", userID) 116 } else { 117 // We don't track ErrIndexNotFound as failure because it's a legit case (eg. a tenant just 118 // started to remote write and its blocks haven't uploaded to storage yet). 119 l.loadFailures.Inc() 120 level.Error(l.logger).Log("msg", "unable to load bucket index", "tenant", userID, "err", err) 121 } 122 123 return nil, err 124 } 125 126 // Cache the index. 127 l.cacheIndex(userID, idx, nil) 128 129 elapsedTime := time.Since(startTime) 130 l.loadDuration.Observe(elapsedTime.Seconds()) 131 level.Info(l.logger).Log("msg", "loaded bucket index", "tenant", userID, "duration", elapsedTime) 132 return idx, nil 133 } 134 135 func (l *Loader) cacheIndex(userID string, idx *Index, err error) { 136 l.indexesMx.Lock() 137 defer l.indexesMx.Unlock() 138 139 // Not an issue if, due to concurrency, another index was already cached 140 // and we overwrite it: last will win. 141 l.indexes[userID] = newCachedIndex(idx, err) 142 } 143 144 // checkCachedIndexes checks all cached indexes and, for each of them, does two things: 145 // 1. Offload indexes not requested since >= idle timeout 146 // 2. Update indexes which have been updated last time since >= update timeout 147 func (l *Loader) checkCachedIndexes(ctx context.Context) error { 148 // Build a list of users for which we should update or delete the index. 149 toUpdate, toDelete := l.checkCachedIndexesToUpdateAndDelete() 150 151 // Delete unused indexes. 
152 for _, userID := range toDelete { 153 l.deleteCachedIndex(userID) 154 } 155 156 // Update actively used indexes. 157 for _, userID := range toUpdate { 158 l.updateCachedIndex(ctx, userID) 159 } 160 161 // Never return error, otherwise the service terminates. 162 return nil 163 } 164 165 func (l *Loader) checkCachedIndexesToUpdateAndDelete() (toUpdate, toDelete []string) { 166 now := time.Now() 167 168 l.indexesMx.RLock() 169 defer l.indexesMx.RUnlock() 170 171 for userID, entry := range l.indexes { 172 // Given ErrIndexNotFound is a legit case and assuming UpdateOnErrorInterval is lower than 173 // UpdateOnStaleInterval, we don't consider ErrIndexNotFound as an error with regards to the 174 // refresh interval and so it will updated once stale. 175 isError := entry.err != nil && !errors.Is(entry.err, ErrIndexNotFound) 176 177 switch { 178 case now.Sub(entry.getRequestedAt()) >= l.cfg.IdleTimeout: 179 toDelete = append(toDelete, userID) 180 case isError && now.Sub(entry.getUpdatedAt()) >= l.cfg.UpdateOnErrorInterval: 181 toUpdate = append(toUpdate, userID) 182 case !isError && now.Sub(entry.getUpdatedAt()) >= l.cfg.UpdateOnStaleInterval: 183 toUpdate = append(toUpdate, userID) 184 } 185 } 186 187 return 188 } 189 190 func (l *Loader) updateCachedIndex(ctx context.Context, userID string) { 191 l.loadAttempts.Inc() 192 startTime := time.Now() 193 idx, err := ReadIndex(ctx, l.bkt, userID, l.cfgProvider, l.logger) 194 if err != nil && !errors.Is(err, ErrIndexNotFound) { 195 l.loadFailures.Inc() 196 level.Warn(l.logger).Log("msg", "unable to update bucket index", "tenant", userID, "err", err) 197 return 198 } 199 200 l.loadDuration.Observe(time.Since(startTime).Seconds()) 201 202 // We cache it either it was successfully refreshed or wasn't found. An use case for caching the ErrIndexNotFound 203 // is when a tenant has rules configured but hasn't started remote writing yet. Rules will be evaluated and 204 // bucket index loaded by the ruler. 
205 l.indexesMx.Lock() 206 l.indexes[userID].index = idx 207 l.indexes[userID].err = err 208 l.indexes[userID].setUpdatedAt(startTime) 209 l.indexesMx.Unlock() 210 } 211 212 func (l *Loader) deleteCachedIndex(userID string) { 213 l.indexesMx.Lock() 214 delete(l.indexes, userID) 215 l.indexesMx.Unlock() 216 217 level.Info(l.logger).Log("msg", "unloaded bucket index", "tenant", userID, "reason", "idle") 218 } 219 220 func (l *Loader) countLoadedIndexesMetric() float64 { 221 l.indexesMx.RLock() 222 defer l.indexesMx.RUnlock() 223 224 count := 0 225 for _, idx := range l.indexes { 226 if idx.index != nil { 227 count++ 228 } 229 } 230 return float64(count) 231 } 232 233 type cachedIndex struct { 234 // We cache either the index or the error occurred while fetching it. They're 235 // mutually exclusive. 236 index *Index 237 err error 238 239 // Unix timestamp (seconds) of when the index has been updated from the storage the last time. 240 updatedAt atomic.Int64 241 242 // Unix timestamp (seconds) of when the index has been requested the last time. 243 requestedAt atomic.Int64 244 } 245 246 func newCachedIndex(idx *Index, err error) *cachedIndex { 247 entry := &cachedIndex{ 248 index: idx, 249 err: err, 250 } 251 252 now := time.Now() 253 entry.setUpdatedAt(now) 254 entry.setRequestedAt(now) 255 256 return entry 257 } 258 259 func (i *cachedIndex) setUpdatedAt(ts time.Time) { 260 i.updatedAt.Store(ts.Unix()) 261 } 262 263 func (i *cachedIndex) getUpdatedAt() time.Time { 264 return time.Unix(i.updatedAt.Load(), 0) 265 } 266 267 func (i *cachedIndex) setRequestedAt(ts time.Time) { 268 i.requestedAt.Store(ts.Unix()) 269 } 270 271 func (i *cachedIndex) getRequestedAt() time.Time { 272 return time.Unix(i.requestedAt.Load(), 0) 273 }