github.com/pyroscope-io/pyroscope@v0.37.3-0.20230725203016-5f6947968bd0/pkg/storage/storage.go

package storage

// revive:disable:max-public-structs complex package

import (
	"context"
	"errors"
	"runtime"
	"sync"
	"time"

	"github.com/dgraph-io/badger/v2"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"

	"github.com/pyroscope-io/pyroscope/pkg/health"
	"github.com/pyroscope-io/pyroscope/pkg/model/appmetadata"
	"github.com/pyroscope-io/pyroscope/pkg/storage/cache"
	"github.com/pyroscope-io/pyroscope/pkg/storage/labels"
	"github.com/pyroscope-io/pyroscope/pkg/storage/segment"
	"github.com/pyroscope-io/pyroscope/pkg/util/bytesize"
)

var (
	errRetention  = errors.New("could not write because of retention settings")
	errOutOfSpace = errors.New("running out of space")
	errClosed     = errors.New("storage closed")
)

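// Storage is the disk-backed profile store. It owns a set of Badger
// databases fronted by caches (main, segments, dimensions, dicts, trees),
// along with labels and exemplars storage, and runs periodic maintenance
// tasks such as write-back, eviction, retention enforcement, and Badger GC.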
type Storage struct {
	config *Config
	*storageOptions

	logger *logrus.Logger
	*metrics

	segments   BadgerDBWithCache
	dimensions BadgerDBWithCache
	dicts      BadgerDBWithCache
	trees      BadgerDBWithCache
	main       BadgerDBWithCache
	labels     *labels.Labels
	exemplars  *exemplars

	appSvc ApplicationMetadataSaver
	hc     *health.Controller

	// Maintenance tasks are executed exclusively to avoid competition:
	// extensive writing during GC is harmful and deteriorates overall
	// performance. The same applies to the write-back, eviction, and
	// retention tasks.
	tasksMutex sync.Mutex
	tasksWG    sync.WaitGroup
	stop       chan struct{}
	putMutex   sync.Mutex
}

type storageOptions struct {
	badgerGCTaskInterval      time.Duration
	metricsUpdateTaskInterval time.Duration
	writeBackTaskInterval     time.Duration
	evictionTaskInterval      time.Duration
	retentionTaskInterval     time.Duration
	cacheTTL                  time.Duration
	gcSizeDiff                bytesize.ByteSize
}

// MetricsExporter exports values of particular stack trace samples from
// profiling data as Prometheus metrics.
type MetricsExporter interface {
	// Evaluate evaluates metrics export rules against the input key and creates
	// Prometheus counters for new time series, if required. The returned observer
	// can be used to evaluate and observe particular samples.
	//
	// If there are no matching rules, the function returns false.
	Evaluate(*PutInput) (SampleObserver, bool)
}

type SampleObserver interface {
	// Observe adds v to the matched counters if k satisfies the node selector.
	// k is a sample stack trace where frames are delimited by semicolons.
	// v is the sample value.
	Observe(k []byte, v int)
}
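
// A MetricsExporter and SampleObserver pair can be satisfied by a trivial
// implementation. The sketch below is purely illustrative; noopExporter and
// noopObserver are hypothetical names and not part of this package:
//
//	type noopObserver struct{}
//
//	// Observe discards the sample.
//	func (noopObserver) Observe(k []byte, v int) {}
//
//	type noopExporter struct{}
//
//	// Evaluate reports that no export rules matched, so the caller
//	// is expected to skip observation.
//	func (noopExporter) Evaluate(*PutInput) (SampleObserver, bool) {
//		return noopObserver{}, false
//	}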

// ApplicationMetadataSaver saves application metadata.
type ApplicationMetadataSaver interface {
	CreateOrUpdate(ctx context.Context, application appmetadata.ApplicationMetadata) error
}

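// New creates a Storage from the given configuration: it opens (or creates)
// the underlying Badger databases, runs migrations, and starts the periodic
// write-back task. Unless the storage is in-memory, it also starts the
// eviction, retention, and metrics update tasks.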
func New(c *Config, logger *logrus.Logger, reg prometheus.Registerer, hc *health.Controller, appSvc ApplicationMetadataSaver) (*Storage, error) {
	s := &Storage{
		config: c,
		storageOptions: &storageOptions{
			// Interval at which GC is triggered if the DB size has increased
			// by more than gcSizeDiff since the last probe.
			badgerGCTaskInterval: 5 * time.Minute,
			// DB size and cache size metrics are updated periodically.
			metricsUpdateTaskInterval: 10 * time.Second,
			writeBackTaskInterval:     time.Minute,
			evictionTaskInterval:      20 * time.Second,
			retentionTaskInterval:     10 * time.Minute,
			cacheTTL:                  2 * time.Minute,
			// gcSizeDiff specifies the minimal storage size difference that
			// causes garbage collection to trigger.
			gcSizeDiff: bytesize.GB,
		},

		hc:      hc,
		logger:  logger,
		metrics: newMetrics(reg),
		stop:    make(chan struct{}),
		appSvc:  appSvc,
	}

	if c.NewBadger == nil {
		c.NewBadger = s.newBadger
	}

	var err error
	if s.main, err = c.NewBadger("main", "", nil); err != nil {
		return nil, err
	}
	if s.dicts, err = c.NewBadger("dicts", dictionaryPrefix, dictionaryCodec{}); err != nil {
		return nil, err
	}
	if s.dimensions, err = c.NewBadger("dimensions", dimensionPrefix, dimensionCodec{}); err != nil {
		return nil, err
	}
	if s.segments, err = c.NewBadger("segments", segmentPrefix, segmentCodec{}); err != nil {
		return nil, err
	}
	if s.trees, err = c.NewBadger("trees", treePrefix, treeCodec{s}); err != nil {
		return nil, err
	}

	pdb, err := c.NewBadger("profiles", exemplarDataPrefix, nil)
	if err != nil {
		return nil, err
	}

	s.initExemplarsStorage(pdb)
	s.labels = labels.New(s.main.DBInstance())

	if err = s.migrate(); err != nil {
		return nil, err
	}

	s.periodicTask(s.writeBackTaskInterval, s.writeBackTask)

	if !s.config.inMemory {
		// TODO(kolesnikovae): Allow failure and skip evictionTask?
		memTotal, err := getMemTotal()
		if err != nil {
			return nil, err
		}

		s.periodicTask(s.evictionTaskInterval, s.evictionTask(memTotal))
		s.maintenanceTask(s.retentionTaskInterval, s.retentionTask)
		s.periodicTask(s.metricsUpdateTaskInterval, s.updateMetricsTask)
	}

	return s, nil
}

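// Close stops all background tasks, flushes the caches in dependency order,
// and closes the underlying databases.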
func (s *Storage) Close() error {
	// Stop all periodic and maintenance tasks.
	close(s.stop)
	s.logger.Debug("waiting for storage tasks to finish")
	s.tasksWG.Wait()
	s.logger.Debug("storage tasks finished")

	// Flush caches. The dictionaries DB has to be flushed last because the
	// trees DB depends on it. The exemplars DB does not have a cache but
	// depends on the dictionaries DB as well: there is no need to force
	// synchronization, as the exemplars storage listens to the s.stop channel
	// and stops synchronously.
	caches := []BadgerDBWithCache{
		s.trees,
		s.segments,
		s.dimensions,
	}
	wg := new(sync.WaitGroup)
	wg.Add(len(caches))
	for _, d := range caches {
		go func(d BadgerDBWithCache) {
			d.CacheInstance().Flush()
			wg.Done()
		}(d)
	}
	wg.Wait()

	// Flush the dictionaries cache only when all the dependent caches have
	// been flushed.
	s.dicts.CacheInstance().Flush()

	// Close databases. Order does not matter.
	dbs := []BadgerDBWithCache{
		s.trees,
		s.segments,
		s.dimensions,
		s.exemplars.db,
		s.dicts,
		s.main, // Also stores labels.
	}
	wg = new(sync.WaitGroup)
	wg.Add(len(dbs))
	for _, d := range dbs {
		go func(d BadgerDBWithCache) {
			defer wg.Done()
			if err := d.DBInstance().Close(); err != nil {
				s.logger.WithField("name", d.Name()).WithError(err).Error("closing database")
			}
		}(d)
	}
	wg.Wait()
	return nil
}

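// DiskUsage reports the on-disk size of every database, keyed by name.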
func (s *Storage) DiskUsage() map[string]bytesize.ByteSize {
	m := make(map[string]bytesize.ByteSize)
	for _, d := range s.databases() {
		m[d.Name()] = d.Size()
	}
	return m
}

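// CacheStats reports the cache size of every database that has a cache,
// keyed by name.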
func (s *Storage) CacheStats() map[string]uint64 {
	m := make(map[string]uint64)
	for _, d := range s.databases() {
		if d.CacheInstance() != nil {
			m[d.Name()] = d.CacheSize()
		}
	}
	return m
}

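// withContext runs fn with a context that is cancelled when the storage is
// stopped.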
func (s *Storage) withContext(fn func(context.Context)) {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	go func() {
		select {
		case <-ctx.Done():
		case <-s.stop:
			cancel()
		}
	}()
	fn(ctx)
}

// maintenanceTask periodically runs f exclusively.
func (s *Storage) maintenanceTask(interval time.Duration, f func()) {
	s.periodicTask(interval, func() {
		s.tasksMutex.Lock()
		defer s.tasksMutex.Unlock()
		f()
	})
}

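// periodicTask runs f once immediately and then on every interval tick until
// the storage is stopped.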
func (s *Storage) periodicTask(interval time.Duration, f func()) {
	s.tasksWG.Add(1)
	go func() {
		timer := time.NewTimer(interval)
		defer func() {
			timer.Stop()
			s.tasksWG.Done()
		}()
		select {
		case <-s.stop:
			return
		default:
			f()
		}
		for {
			select {
			case <-s.stop:
				return
			case <-timer.C:
				f()
				timer.Reset(interval)
			}
		}
	}()
}

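// evictionTask returns a task that, when heap usage relative to memTotal
// exceeds the configured threshold, evicts a portion of the trees cache and
// writes dictionaries back to disk.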
func (s *Storage) evictionTask(memTotal uint64) func() {
	var m runtime.MemStats
	return func() {
		timer := prometheus.NewTimer(prometheus.ObserverFunc(s.metrics.evictionTaskDuration.Observe))
		defer timer.ObserveDuration()
		runtime.ReadMemStats(&m)
		used := float64(m.Alloc) / float64(memTotal)
		percent := s.config.cacheEvictVolume
		if used < s.config.cacheEvictThreshold {
			return
		}
		// Dimensions, dictionaries, and segments should not be evicted,
		// as they are almost 100% in use and would be loaded back, causing
		// more allocations. Unused items should be unloaded from the cache
		// by TTL expiration. Nevertheless, these objects must be written to
		// disk, and the order matters.
		//
		// It should be noted that in case of a crash or kill, data may become
		// inconsistent: we should unite the databases and do this in a
		// transaction. This also applies to the writeBack task.
		s.trees.Evict(percent)
		s.dicts.WriteBack()
		// s.dimensions.WriteBack()
		// s.segments.WriteBack()
		// GC does not really release OS memory, so relying on MemStats.Alloc
		// causes the cache to evict the vast majority of items.
		// debug.FreeOSMemory() could be used instead, but it can be even
		// more expensive.
		runtime.GC()
	}
}

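// writeBackTask writes the cached entries of every database back to disk.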
func (s *Storage) writeBackTask() {
	timer := prometheus.NewTimer(prometheus.ObserverFunc(s.metrics.writeBackTaskDuration.Observe))
	defer timer.ObserveDuration()
	for _, d := range s.databases() {
		if d.CacheInstance() != nil {
			d.WriteBack()
		}
	}
}

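// updateMetricsTask updates the DB size and cache size gauges for every
// database.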
func (s *Storage) updateMetricsTask() {
	for _, d := range s.databases() {
		s.metrics.dbSize.WithLabelValues(d.Name()).Set(float64(d.Size()))
		if d.CacheInstance() != nil {
			s.metrics.cacheSize.WithLabelValues(d.Name()).Set(float64(d.CacheSize()))
		}
	}
}

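// retentionTask enforces the configured retention policy, if one is set.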
func (s *Storage) retentionTask() {
	rp := s.retentionPolicy()
	if !rp.LowerTimeBoundary().IsZero() {
		s.withContext(func(ctx context.Context) {
			s.enforceRetentionPolicy(ctx, rp)
		})
	}
}

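// exemplarsRetentionTask enforces the exemplars retention policy, if one is
// set.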
func (s *Storage) exemplarsRetentionTask() {
	rp := s.retentionPolicy()
	if !rp.ExemplarsRetentionTime.IsZero() {
		s.withContext(func(ctx context.Context) {
			s.exemplars.enforceRetentionPolicy(ctx, rp)
		})
	}
}

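// retentionPolicy builds a segment.RetentionPolicy from the configuration;
// the exemplars retention period defaults to the general retention period
// when not set explicitly.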
func (s *Storage) retentionPolicy() *segment.RetentionPolicy {
	exemplarsRetention := s.config.retentionExemplars
	if exemplarsRetention == 0 {
		exemplarsRetention = s.config.retention
	}
	return segment.NewRetentionPolicy().
		SetAbsolutePeriod(s.config.retention).
		SetExemplarsRetentionPeriod(exemplarsRetention).
		SetLevels(
			s.config.retentionLevels.Zero,
			s.config.retentionLevels.One,
			s.config.retentionLevels.Two)
}

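// databases returns all databases managed by the storage.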
func (s *Storage) databases() []BadgerDBWithCache {
	return []BadgerDBWithCache{
		s.main,
		s.dimensions,
		s.segments,
		s.dicts,
		s.trees,
		s.exemplars.db,
	}
}

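// The *Internals accessors below expose the underlying Badger instances and
// caches of the individual databases.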
func (s *Storage) SegmentsInternals() (*badger.DB, *cache.Cache) {
	return s.segments.DBInstance(), s.segments.CacheInstance()
}
func (s *Storage) DimensionsInternals() (*badger.DB, *cache.Cache) {
	return s.dimensions.DBInstance(), s.dimensions.CacheInstance()
}
func (s *Storage) DictsInternals() (*badger.DB, *cache.Cache) {
	return s.dicts.DBInstance(), s.dicts.CacheInstance()
}
func (s *Storage) TreesInternals() (*badger.DB, *cache.Cache) {
	return s.trees.DBInstance(), s.trees.CacheInstance()
}
func (s *Storage) MainInternals() (*badger.DB, *cache.Cache) {
	return s.main.DBInstance(), s.main.CacheInstance()
}
func (s *Storage) ExemplarsInternals() (*badger.DB, func()) {
	return s.exemplars.db.DBInstance(), s.exemplars.Sync
}