github.com/pyroscope-io/pyroscope@v0.37.3-0.20230725203016-5f6947968bd0/pkg/storage/storage.go

package storage

// revive:disable:max-public-structs complex package

import (
	"context"
	"errors"
	"runtime"
	"sync"
	"time"

	"github.com/dgraph-io/badger/v2"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"

	"github.com/pyroscope-io/pyroscope/pkg/health"
	"github.com/pyroscope-io/pyroscope/pkg/model/appmetadata"
	"github.com/pyroscope-io/pyroscope/pkg/storage/cache"
	"github.com/pyroscope-io/pyroscope/pkg/storage/labels"
	"github.com/pyroscope-io/pyroscope/pkg/storage/segment"
	"github.com/pyroscope-io/pyroscope/pkg/util/bytesize"
)

var (
	errRetention  = errors.New("could not write because of retention settings")
	errOutOfSpace = errors.New("running out of space")
	errClosed     = errors.New("storage closed")
)

type Storage struct {
	config *Config
	*storageOptions

	logger *logrus.Logger
	*metrics

	segments   BadgerDBWithCache
	dimensions BadgerDBWithCache
	dicts      BadgerDBWithCache
	trees      BadgerDBWithCache
	main       BadgerDBWithCache
	labels     *labels.Labels
	exemplars  *exemplars

	appSvc ApplicationMetadataSaver
	hc     *health.Controller

	// Maintenance tasks are executed exclusively to avoid contention:
	// extensive writing during GC is harmful and deteriorates the
	// overall performance. The same applies to the write-back, eviction,
	// and retention tasks.
	tasksMutex sync.Mutex
	tasksWG    sync.WaitGroup
	stop       chan struct{}
	putMutex   sync.Mutex
}

type storageOptions struct {
	badgerGCTaskInterval      time.Duration
	metricsUpdateTaskInterval time.Duration
	writeBackTaskInterval     time.Duration
	evictionTaskInterval      time.Duration
	retentionTaskInterval     time.Duration
	cacheTTL                  time.Duration
	gcSizeDiff                bytesize.ByteSize
}

// MetricsExporter exports values of particular stack trace samples from
// profiling data as Prometheus metrics.
type MetricsExporter interface {
	// Evaluate evaluates metrics export rules against the input key and creates
	// Prometheus counters for new time series, if required. The returned observer
	// can be used to evaluate and observe particular samples.
	//
	// If there are no matching rules, the function returns false.
	Evaluate(*PutInput) (SampleObserver, bool)
}

type SampleObserver interface {
	// Observe adds v to the matched counters if k satisfies the node selector.
	// k is a sample stack trace where frames are delimited by semicolons.
	// v is the sample value.
	Observe(k []byte, v int)
}

// ApplicationMetadataSaver saves application metadata.
type ApplicationMetadataSaver interface {
	CreateOrUpdate(ctx context.Context, application appmetadata.ApplicationMetadata) error
}
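// Illustrative sketch, not part of the original file: a minimal SampleObserver
// that accumulates sample values per stack trace in memory rather than in
// Prometheus counters. The type name and fields are hypothetical; a real
// observer would evaluate the export rule's node selector and update the
// matched counters instead.
type inMemorySampleObserver struct {
	mu     sync.Mutex
	totals map[string]int // semicolon-delimited stack trace -> accumulated value
}

// Observe records v for the stack trace k.
func (o *inMemorySampleObserver) Observe(k []byte, v int) {
	o.mu.Lock()
	defer o.mu.Unlock()
	if o.totals == nil {
		o.totals = make(map[string]int)
	}
	o.totals[string(k)] += v
}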
func New(c *Config, logger *logrus.Logger, reg prometheus.Registerer, hc *health.Controller, appSvc ApplicationMetadataSaver) (*Storage, error) {
	s := &Storage{
		config: c,
		storageOptions: &storageOptions{
			// Interval at which GC is triggered if the db size has increased
			// by more than gcSizeDiff since the last probe.
			badgerGCTaskInterval: 5 * time.Minute,
			// DB size and cache size metrics are updated periodically.
			metricsUpdateTaskInterval: 10 * time.Second,
			writeBackTaskInterval:     time.Minute,
			evictionTaskInterval:      20 * time.Second,
			retentionTaskInterval:     10 * time.Minute,
			cacheTTL:                  2 * time.Minute,
			// gcSizeDiff specifies the minimal storage size difference that
			// causes garbage collection to trigger.
			gcSizeDiff: bytesize.GB,
		},

		hc:      hc,
		logger:  logger,
		metrics: newMetrics(reg),
		stop:    make(chan struct{}),
		appSvc:  appSvc,
	}

	if c.NewBadger == nil {
		c.NewBadger = s.newBadger
	}

	var err error
	if s.main, err = c.NewBadger("main", "", nil); err != nil {
		return nil, err
	}
	if s.dicts, err = c.NewBadger("dicts", dictionaryPrefix, dictionaryCodec{}); err != nil {
		return nil, err
	}
	if s.dimensions, err = c.NewBadger("dimensions", dimensionPrefix, dimensionCodec{}); err != nil {
		return nil, err
	}
	if s.segments, err = c.NewBadger("segments", segmentPrefix, segmentCodec{}); err != nil {
		return nil, err
	}
	if s.trees, err = c.NewBadger("trees", treePrefix, treeCodec{s}); err != nil {
		return nil, err
	}

	pdb, err := c.NewBadger("profiles", exemplarDataPrefix, nil)
	if err != nil {
		return nil, err
	}

	s.initExemplarsStorage(pdb)
	s.labels = labels.New(s.main.DBInstance())

	if err = s.migrate(); err != nil {
		return nil, err
	}

	s.periodicTask(s.writeBackTaskInterval, s.writeBackTask)

	if !s.config.inMemory {
		// TODO(kolesnikovae): Allow failure and skip evictionTask?
		memTotal, err := getMemTotal()
		if err != nil {
			return nil, err
		}

		s.periodicTask(s.evictionTaskInterval, s.evictionTask(memTotal))
		s.maintenanceTask(s.retentionTaskInterval, s.retentionTask)
		s.periodicTask(s.metricsUpdateTaskInterval, s.updateMetricsTask)
	}

	return s, nil
}
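// Illustrative usage sketch, not part of the original file: the intended
// lifecycle is New -> serve -> Close. The identifiers cfg, logger, hc, and
// appSvc below are placeholders for the caller's real dependencies.
//
//	s, err := storage.New(cfg, logger, prometheus.DefaultRegisterer, hc, appSvc)
//	if err != nil {
//		return err
//	}
//	defer s.Close()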
func (s *Storage) Close() error {
	// Stop all periodic and maintenance tasks.
	close(s.stop)
	s.logger.Debug("waiting for storage tasks to finish")
	s.tasksWG.Wait()
	s.logger.Debug("storage tasks finished")

	// Flush caches. The dictionaries DB has to close last because trees depend
	// on it. The exemplars DB does not have a cache but depends on the
	// dictionaries DB as well; there is no need to force synchronization, as
	// the exemplars storage listens to the s.stop channel and stops synchronously.
	caches := []BadgerDBWithCache{
		s.trees,
		s.segments,
		s.dimensions,
	}
	wg := new(sync.WaitGroup)
	wg.Add(len(caches))
	for _, d := range caches {
		go func(d BadgerDBWithCache) {
			d.CacheInstance().Flush()
			wg.Done()
		}(d)
	}
	wg.Wait()

	// Flush the dictionaries cache only when all the dependent caches are flushed.
	s.dicts.CacheInstance().Flush()

	// Close databases. Order does not matter.
	dbs := []BadgerDBWithCache{
		s.trees,
		s.segments,
		s.dimensions,
		s.exemplars.db,
		s.dicts,
		s.main, // Also stores labels.
	}
	wg = new(sync.WaitGroup)
	wg.Add(len(dbs))
	for _, d := range dbs {
		go func(d BadgerDBWithCache) {
			defer wg.Done()
			if err := d.DBInstance().Close(); err != nil {
				s.logger.WithField("name", d.Name()).WithError(err).Error("closing database")
			}
		}(d)
	}
	wg.Wait()
	return nil
}

func (s *Storage) DiskUsage() map[string]bytesize.ByteSize {
	m := make(map[string]bytesize.ByteSize)
	for _, d := range s.databases() {
		m[d.Name()] = d.Size()
	}
	return m
}

func (s *Storage) CacheStats() map[string]uint64 {
	m := make(map[string]uint64)
	for _, d := range s.databases() {
		if d.CacheInstance() != nil {
			m[d.Name()] = d.CacheSize()
		}
	}
	return m
}

func (s *Storage) withContext(fn func(context.Context)) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go func() {
		select {
		case <-ctx.Done():
		case <-s.stop:
			cancel()
		}
	}()
	fn(ctx)
}

// maintenanceTask periodically runs f exclusively.
func (s *Storage) maintenanceTask(interval time.Duration, f func()) {
	s.periodicTask(interval, func() {
		s.tasksMutex.Lock()
		defer s.tasksMutex.Unlock()
		f()
	})
}

func (s *Storage) periodicTask(interval time.Duration, f func()) {
	s.tasksWG.Add(1)
	go func() {
		timer := time.NewTimer(interval)
		defer func() {
			timer.Stop()
			s.tasksWG.Done()
		}()
		select {
		case <-s.stop:
			return
		default:
			f()
		}
		for {
			select {
			case <-s.stop:
				return
			case <-timer.C:
				f()
				timer.Reset(interval)
			}
		}
	}()
}

func (s *Storage) evictionTask(memTotal uint64) func() {
	var m runtime.MemStats
	return func() {
		timer := prometheus.NewTimer(prometheus.ObserverFunc(s.metrics.evictionTaskDuration.Observe))
		defer timer.ObserveDuration()
		runtime.ReadMemStats(&m)
		used := float64(m.Alloc) / float64(memTotal)
		percent := s.config.cacheEvictVolume
		if used < s.config.cacheEvictThreshold {
			return
		}
		// Dimensions, dictionaries, and segments should not be evicted,
		// as they are almost 100% in use and would be loaded back, causing
		// more allocations. Unused items are unloaded from the cache by
		// TTL expiration. These objects must still be written to disk,
		// though, and the order matters.
		//
		// Note that in case of a crash or kill, data may become inconsistent:
		// we should unite the databases and do this in a transaction.
		// The same applies to the writeBack task.
		s.trees.Evict(percent)
		s.dicts.WriteBack()
		// s.dimensions.WriteBack()
		// s.segments.WriteBack()
		// GC does not really release OS memory, so relying on MemStats.Alloc
		// causes the cache to evict the vast majority of items.
		// debug.FreeOSMemory() could be used instead, but it can be even
		// more expensive.
		runtime.GC()
	}
}
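// Illustrative note, not part of the original file: periodicTask above runs
// f once immediately (unless the storage is already stopped) and then on
// every interval tick until Close. A hypothetical extra task would be
// registered like this:
//
//	s.periodicTask(30*time.Second, func() {
//		s.logger.Debug("custom periodic task tick")
//	})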
func (s *Storage) writeBackTask() {
	timer := prometheus.NewTimer(prometheus.ObserverFunc(s.metrics.writeBackTaskDuration.Observe))
	defer timer.ObserveDuration()
	for _, d := range s.databases() {
		if d.CacheInstance() != nil {
			d.WriteBack()
		}
	}
}

func (s *Storage) updateMetricsTask() {
	for _, d := range s.databases() {
		s.metrics.dbSize.WithLabelValues(d.Name()).Set(float64(d.Size()))
		if d.CacheInstance() != nil {
			s.metrics.cacheSize.WithLabelValues(d.Name()).Set(float64(d.CacheSize()))
		}
	}
}

func (s *Storage) retentionTask() {
	rp := s.retentionPolicy()
	if !rp.LowerTimeBoundary().IsZero() {
		s.withContext(func(ctx context.Context) {
			s.enforceRetentionPolicy(ctx, rp)
		})
	}
}

func (s *Storage) exemplarsRetentionTask() {
	rp := s.retentionPolicy()
	if !rp.ExemplarsRetentionTime.IsZero() {
		s.withContext(func(ctx context.Context) {
			s.exemplars.enforceRetentionPolicy(ctx, rp)
		})
	}
}

func (s *Storage) retentionPolicy() *segment.RetentionPolicy {
	exemplarsRetention := s.config.retentionExemplars
	if exemplarsRetention == 0 {
		exemplarsRetention = s.config.retention
	}
	return segment.NewRetentionPolicy().
		SetAbsolutePeriod(s.config.retention).
		SetExemplarsRetentionPeriod(exemplarsRetention).
		SetLevels(
			s.config.retentionLevels.Zero,
			s.config.retentionLevels.One,
			s.config.retentionLevels.Two)
}

func (s *Storage) databases() []BadgerDBWithCache {
	return []BadgerDBWithCache{
		s.main,
		s.dimensions,
		s.segments,
		s.dicts,
		s.trees,
		s.exemplars.db,
	}
}

func (s *Storage) SegmentsInternals() (*badger.DB, *cache.Cache) {
	return s.segments.DBInstance(), s.segments.CacheInstance()
}

func (s *Storage) DimensionsInternals() (*badger.DB, *cache.Cache) {
	return s.dimensions.DBInstance(), s.dimensions.CacheInstance()
}

func (s *Storage) DictsInternals() (*badger.DB, *cache.Cache) {
	return s.dicts.DBInstance(), s.dicts.CacheInstance()
}

func (s *Storage) TreesInternals() (*badger.DB, *cache.Cache) {
	return s.trees.DBInstance(), s.trees.CacheInstance()
}

func (s *Storage) MainInternals() (*badger.DB, *cache.Cache) {
	return s.main.DBInstance(), s.main.CacheInstance()
}

func (s *Storage) ExemplarsInternals() (*badger.DB, func()) {
	return s.exemplars.db.DBInstance(), s.exemplars.Sync
}
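// Illustrative sketch, not part of the original file: reading the
// storage-level diagnostics exposed above. logStorageDiagnostics is a
// hypothetical helper, not part of the Pyroscope API.
func logStorageDiagnostics(s *Storage) {
	// DiskUsage reports the on-disk size of every database.
	for name, size := range s.DiskUsage() {
		s.logger.WithField("db", name).WithField("size", size).Debug("disk usage")
	}
	// CacheStats reports item counts only for databases that have a cache.
	for name, items := range s.CacheStats() {
		s.logger.WithField("db", name).WithField("items", items).Debug("cache size")
	}
}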