// github.com/muhammadn/cortex@v1.9.1-0.20220510110439-46bb7000d03d/pkg/storage/tsdb/bucketindex/loader.go

package bucketindex

import (
	"context"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/grafana/dskit/services"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	"github.com/thanos-io/thanos/pkg/objstore"
	"go.uber.org/atomic"

	"github.com/cortexproject/cortex/pkg/storage/bucket"
	"github.com/cortexproject/cortex/pkg/util"
)

const (
	// readIndexTimeout is the maximum allowed time when reading a single bucket index
	// from the storage. It's hard-coded to a reasonably high value.
	readIndexTimeout = 15 * time.Second
)

// LoaderConfig holds the intervals driving the Loader background job.
type LoaderConfig struct {
	// CheckInterval is how often the cached indexes are checked (a jitter is
	// applied to it when the Loader is created).
	CheckInterval time.Duration
	// UpdateOnStaleInterval is the minimum age, since the last update, after which
	// a cached index not in error state (a cached ErrIndexNotFound included) is
	// refreshed from the storage.
	UpdateOnStaleInterval time.Duration
	// UpdateOnErrorInterval is the minimum age, since the last update, after which
	// an entry caching an error other than ErrIndexNotFound is refreshed from the
	// storage.
	UpdateOnErrorInterval time.Duration
	// IdleTimeout is the time since the last request after which a cached index
	// is offloaded.
	IdleTimeout time.Duration
}

// Loader is responsible to lazy load bucket indexes and, once loaded for the first time,
// keep them updated in background. Loaded indexes are automatically offloaded once the
// idle timeout expires.
type Loader struct {
	services.Service

	bkt         objstore.Bucket
	logger      log.Logger
	cfg         LoaderConfig
	cfgProvider bucket.TenantConfigProvider

	// indexesMx guards indexes, which maps a user ID to its cached index (or to
	// the cached error occurred while loading it).
	indexesMx sync.RWMutex
	indexes   map[string]*cachedIndex

	// Metrics.
	loadAttempts prometheus.Counter
	loadFailures prometheus.Counter
	loadDuration prometheus.Histogram
	loaded       prometheus.GaugeFunc
}

// NewLoader makes a new Loader.
56 func NewLoader(cfg LoaderConfig, bucketClient objstore.Bucket, cfgProvider bucket.TenantConfigProvider, logger log.Logger, reg prometheus.Registerer) *Loader { 57 l := &Loader{ 58 bkt: bucketClient, 59 logger: logger, 60 cfg: cfg, 61 cfgProvider: cfgProvider, 62 indexes: map[string]*cachedIndex{}, 63 64 loadAttempts: promauto.With(reg).NewCounter(prometheus.CounterOpts{ 65 Name: "cortex_bucket_index_loads_total", 66 Help: "Total number of bucket index loading attempts.", 67 }), 68 loadFailures: promauto.With(reg).NewCounter(prometheus.CounterOpts{ 69 Name: "cortex_bucket_index_load_failures_total", 70 Help: "Total number of bucket index loading failures.", 71 }), 72 loadDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ 73 Name: "cortex_bucket_index_load_duration_seconds", 74 Help: "Duration of the a single bucket index loading operation in seconds.", 75 Buckets: []float64{0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 1, 10}, 76 }), 77 } 78 79 l.loaded = promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{ 80 Name: "cortex_bucket_index_loaded", 81 Help: "Number of bucket indexes currently loaded in-memory.", 82 }, l.countLoadedIndexesMetric) 83 84 // Apply a jitter to the sync frequency in order to increase the probability 85 // of hitting the shared cache (if any). 86 checkInterval := util.DurationWithJitter(cfg.CheckInterval, 0.2) 87 l.Service = services.NewTimerService(checkInterval, nil, l.checkCachedIndexes, nil) 88 89 return l 90 } 91 92 // GetIndex returns the bucket index for the given user. It returns the in-memory cached 93 // index if available, or load it from the bucket otherwise. 94 func (l *Loader) GetIndex(ctx context.Context, userID string) (*Index, error) { 95 l.indexesMx.RLock() 96 if entry := l.indexes[userID]; entry != nil { 97 idx := entry.index 98 err := entry.err 99 l.indexesMx.RUnlock() 100 101 // We don't check if the index is stale because it's the responsibility 102 // of the background job to keep it updated. 
103 entry.requestedAt.Store(time.Now().Unix()) 104 return idx, err 105 } 106 l.indexesMx.RUnlock() 107 108 startTime := time.Now() 109 l.loadAttempts.Inc() 110 idx, err := ReadIndex(ctx, l.bkt, userID, l.cfgProvider, l.logger) 111 if err != nil { 112 // Cache the error, to avoid hammering the object store in case of persistent issues 113 // (eg. corrupted bucket index or not existing). 114 l.cacheIndex(userID, nil, err) 115 116 if errors.Is(err, ErrIndexNotFound) { 117 level.Warn(l.logger).Log("msg", "bucket index not found", "user", userID) 118 } else { 119 // We don't track ErrIndexNotFound as failure because it's a legit case (eg. a tenant just 120 // started to remote write and its blocks haven't uploaded to storage yet). 121 l.loadFailures.Inc() 122 level.Error(l.logger).Log("msg", "unable to load bucket index", "user", userID, "err", err) 123 } 124 125 return nil, err 126 } 127 128 // Cache the index. 129 l.cacheIndex(userID, idx, nil) 130 131 elapsedTime := time.Since(startTime) 132 l.loadDuration.Observe(elapsedTime.Seconds()) 133 level.Info(l.logger).Log("msg", "loaded bucket index", "user", userID, "duration", elapsedTime) 134 return idx, nil 135 } 136 137 func (l *Loader) cacheIndex(userID string, idx *Index, err error) { 138 l.indexesMx.Lock() 139 defer l.indexesMx.Unlock() 140 141 // Not an issue if, due to concurrency, another index was already cached 142 // and we overwrite it: last will win. 143 l.indexes[userID] = newCachedIndex(idx, err) 144 } 145 146 // checkCachedIndexes checks all cached indexes and, for each of them, does two things: 147 // 1. Offload indexes not requested since >= idle timeout 148 // 2. Update indexes which have been updated last time since >= update timeout 149 func (l *Loader) checkCachedIndexes(ctx context.Context) error { 150 // Build a list of users for which we should update or delete the index. 151 toUpdate, toDelete := l.checkCachedIndexesToUpdateAndDelete() 152 153 // Delete unused indexes. 
154 for _, userID := range toDelete { 155 l.deleteCachedIndex(userID) 156 } 157 158 // Update actively used indexes. 159 for _, userID := range toUpdate { 160 l.updateCachedIndex(ctx, userID) 161 } 162 163 // Never return error, otherwise the service terminates. 164 return nil 165 } 166 167 func (l *Loader) checkCachedIndexesToUpdateAndDelete() (toUpdate, toDelete []string) { 168 now := time.Now() 169 170 l.indexesMx.RLock() 171 defer l.indexesMx.RUnlock() 172 173 for userID, entry := range l.indexes { 174 // Given ErrIndexNotFound is a legit case and assuming UpdateOnErrorInterval is lower than 175 // UpdateOnStaleInterval, we don't consider ErrIndexNotFound as an error with regards to the 176 // refresh interval and so it will updated once stale. 177 isError := entry.err != nil && !errors.Is(entry.err, ErrIndexNotFound) 178 179 switch { 180 case now.Sub(entry.getRequestedAt()) >= l.cfg.IdleTimeout: 181 toDelete = append(toDelete, userID) 182 case isError && now.Sub(entry.getUpdatedAt()) >= l.cfg.UpdateOnErrorInterval: 183 toUpdate = append(toUpdate, userID) 184 case !isError && now.Sub(entry.getUpdatedAt()) >= l.cfg.UpdateOnStaleInterval: 185 toUpdate = append(toUpdate, userID) 186 } 187 } 188 189 return 190 } 191 192 func (l *Loader) updateCachedIndex(ctx context.Context, userID string) { 193 readCtx, cancel := context.WithTimeout(ctx, readIndexTimeout) 194 defer cancel() 195 196 l.loadAttempts.Inc() 197 startTime := time.Now() 198 idx, err := ReadIndex(readCtx, l.bkt, userID, l.cfgProvider, l.logger) 199 if err != nil && !errors.Is(err, ErrIndexNotFound) { 200 l.loadFailures.Inc() 201 level.Warn(l.logger).Log("msg", "unable to update bucket index", "user", userID, "err", err) 202 return 203 } 204 205 l.loadDuration.Observe(time.Since(startTime).Seconds()) 206 207 // We cache it either it was successfully refreshed or wasn't found. An use case for caching the ErrIndexNotFound 208 // is when a tenant has rules configured but hasn't started remote writing yet. 
Rules will be evaluated and 209 // bucket index loaded by the ruler. 210 l.indexesMx.Lock() 211 l.indexes[userID].index = idx 212 l.indexes[userID].err = err 213 l.indexes[userID].setUpdatedAt(startTime) 214 l.indexesMx.Unlock() 215 } 216 217 func (l *Loader) deleteCachedIndex(userID string) { 218 l.indexesMx.Lock() 219 delete(l.indexes, userID) 220 l.indexesMx.Unlock() 221 222 level.Info(l.logger).Log("msg", "unloaded bucket index", "user", userID, "reason", "idle") 223 } 224 225 func (l *Loader) countLoadedIndexesMetric() float64 { 226 l.indexesMx.RLock() 227 defer l.indexesMx.RUnlock() 228 229 count := 0 230 for _, idx := range l.indexes { 231 if idx.index != nil { 232 count++ 233 } 234 } 235 return float64(count) 236 } 237 238 type cachedIndex struct { 239 // We cache either the index or the error occurred while fetching it. They're 240 // mutually exclusive. 241 index *Index 242 err error 243 244 // Unix timestamp (seconds) of when the index has been updated from the storage the last time. 245 updatedAt atomic.Int64 246 247 // Unix timestamp (seconds) of when the index has been requested the last time. 248 requestedAt atomic.Int64 249 } 250 251 func newCachedIndex(idx *Index, err error) *cachedIndex { 252 entry := &cachedIndex{ 253 index: idx, 254 err: err, 255 } 256 257 now := time.Now() 258 entry.setUpdatedAt(now) 259 entry.setRequestedAt(now) 260 261 return entry 262 } 263 264 func (i *cachedIndex) setUpdatedAt(ts time.Time) { 265 i.updatedAt.Store(ts.Unix()) 266 } 267 268 func (i *cachedIndex) getUpdatedAt() time.Time { 269 return time.Unix(i.updatedAt.Load(), 0) 270 } 271 272 func (i *cachedIndex) setRequestedAt(ts time.Time) { 273 i.requestedAt.Store(ts.Unix()) 274 } 275 276 func (i *cachedIndex) getRequestedAt() time.Time { 277 return time.Unix(i.requestedAt.Load(), 0) 278 }