github.com/pyroscope-io/pyroscope@v0.37.3-0.20230725203016-5f6947968bd0/pkg/storage/storage_exemplars.go (about)

     1  package storage
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"errors"
     7  	"fmt"
     8  	"strconv"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/dgraph-io/badger/v2"
    13  	"github.com/prometheus/client_golang/prometheus"
    14  	"github.com/sirupsen/logrus"
    15  
    16  	"github.com/pyroscope-io/pyroscope/pkg/storage/dict"
    17  	"github.com/pyroscope-io/pyroscope/pkg/storage/metadata"
    18  	"github.com/pyroscope-io/pyroscope/pkg/storage/segment"
    19  	"github.com/pyroscope-io/pyroscope/pkg/storage/tree"
    20  	"github.com/pyroscope-io/pyroscope/pkg/util/varint"
    21  )
    22  
    23  // TODO(kolesnikovae): decouple from Storage.
    24  
const (
	// exemplarDataPrefix is the prefix of exemplar data keys:
	// v:{app_name}:{profile_id}.
	exemplarDataPrefix      Prefix = "v:"
	// exemplarTimestampPrefix is the prefix of exemplar timestamp keys:
	// t:{timestamp}:{app_name}:{profile_id}.
	exemplarTimestampPrefix Prefix = "t:"
	// exemplarsCurrentFormat is the version byte written at the beginning
	// of every serialized exemplar.
	exemplarsCurrentFormat         = 2

	// Defaults used when the corresponding config option is zero.

	// defaultExemplarsBatchQueueSize is the capacity of the queue of
	// batches awaiting flush.
	defaultExemplarsBatchQueueSize = 5
	// defaultExemplarsBatchSize is the maximum number of entries in a batch.
	defaultExemplarsBatchSize      = 10 << 10 // 10K
	// defaultExemplarsBatchDuration is the interval at which the current
	// batch is moved to the flush queue.
	defaultExemplarsBatchDuration  = time.Second * 5
)
    34  
// exemplars accumulates incoming exemplars in batches, writes completed
// batches to the database, and provides read and truncation of stored
// exemplars.
type exemplars struct {
	logger  *logrus.Logger
	config  *Config
	metrics *metrics
	// db holds exemplar data keys and exemplar timestamp keys.
	db      BadgerDBWithCache
	// dicts is looked up by application name to obtain the dictionary
	// used to serialize and deserialize exemplar trees.
	dicts   BadgerDBWithCache

	// once makes flushBatchQueue close and drain the queue only once.
	once         sync.Once
	// mu is held by insert, Sync and the periodic flush while they
	// access currentBatch.
	mu           sync.Mutex
	// currentBatch is the batch new exemplars are inserted into.
	currentBatch *exemplarsBatch
	// batches is the queue of batches waiting to be written to db.
	batches      chan *exemplarsBatch
}
    47  
var (
	// errBatchIsFull is returned by exemplarsBatch.insert when the batch
	// already holds batchSize entries.
	errBatchIsFull       = errors.New("exemplars batch is full")
	// errProfileIDRequired is returned by exemplarsBatch.insert when the
	// input key has no profile ID.
	errProfileIDRequired = errors.New("profile id label required")
)
    52  
// exemplarsBatch is a bounded in-memory set of exemplars that is written
// to the database in one transaction.
type exemplarsBatch struct {
	// batchSize is the number of entries at which the batch is full.
	batchSize int
	// entries maps the exemplar data key (as a string) to its entry.
	entries   map[string]*exemplarEntry
	config    *Config
	metrics   *metrics
	// dicts provides the per-application dictionary used for serialization.
	dicts     BadgerDBWithCache
}
    60  
// exemplarEntry is a single exemplar: a profile tree with its time range
// and labels, identified by application name and profile ID.
type exemplarEntry struct {
	// DB exemplar key and its parts.
	Key       []byte
	AppName   string
	ProfileID string

	// Value.
	// StartTime and EndTime are Unix timestamps in nanoseconds when the
	// entry is created from a PutInput.
	StartTime int64
	EndTime   int64
	Labels    map[string]string
	Tree      *tree.Tree
}
    73  
    74  func (e *exemplars) exemplarsQueueSize() int {
    75  	if e.config.exemplarsBatchQueueSize != 0 {
    76  		return e.config.exemplarsBatchQueueSize
    77  	}
    78  	return defaultExemplarsBatchQueueSize
    79  }
    80  
    81  func (e *exemplars) exemplarsBatchSize() int {
    82  	if e.config.exemplarsBatchSize != 0 {
    83  		return e.config.exemplarsBatchSize
    84  	}
    85  	return defaultExemplarsBatchSize
    86  }
    87  
    88  func (e *exemplars) exemplarsBatchDuration() time.Duration {
    89  	if e.config.exemplarsBatchDuration != 0 {
    90  		return e.config.exemplarsBatchDuration
    91  	}
    92  	return defaultExemplarsBatchDuration
    93  }
    94  
    95  func (e *exemplars) newExemplarsBatch() *exemplarsBatch {
    96  	batchSize := e.exemplarsBatchSize()
    97  	return &exemplarsBatch{
    98  		batchSize: batchSize,
    99  		metrics:   e.metrics,
   100  		config:    e.config,
   101  		dicts:     e.dicts,
   102  		entries:   make(map[string]*exemplarEntry, batchSize),
   103  	}
   104  }
   105  
// initExemplarsStorage creates the exemplars storage backed by db, assigns
// it to s.exemplars and starts its background goroutine. The goroutine
// writes queued batches to the database, periodically moves the current
// batch to the queue, runs the retention task, and exits once s.stop is
// signalled, after flushing everything that is still pending. It is
// registered in s.tasksWG.
func (s *Storage) initExemplarsStorage(db BadgerDBWithCache) {
	e := exemplars{
		logger:  s.logger,
		config:  s.config,
		metrics: s.metrics,
		dicts:   s.dicts,
		db:      db,
	}

	e.batches = make(chan *exemplarsBatch, e.exemplarsQueueSize())
	e.currentBatch = e.newExemplarsBatch()

	s.exemplars = &e
	s.tasksWG.Add(1)

	go func() {
		retentionTicker := time.NewTicker(s.retentionTaskInterval)
		batchFlushTicker := time.NewTicker(e.exemplarsBatchDuration())
		defer func() {
			batchFlushTicker.Stop()
			retentionTicker.Stop()
			s.tasksWG.Done()
		}()
		for {
			// Non-blocking: write one queued batch, if there is any,
			// before waiting for the other events below.
			select {
			default:
			case batch, ok := <-e.batches:
				if ok {
					e.flush(batch)
				}
			}

			select {
			case <-s.stop:
				// Write the current batch and everything left in the
				// queue, then terminate the goroutine.
				e.logger.Debug("flushing batches queue")
				e.flushBatchQueue()
				return

			case <-batchFlushTicker.C:
				// Move the current batch to the queue, so that it is
				// written even if it never becomes full.
				e.logger.Debug("flushing current batch")
				e.mu.Lock()
				e.flushCurrentBatch()
				e.mu.Unlock()

			case batch, ok := <-e.batches:
				if ok {
					e.flush(batch)
				}

			case <-retentionTicker.C:
				s.exemplarsRetentionTask()
			}
		}
	}()
}
   161  
   162  func (e *exemplars) enforceRetentionPolicy(ctx context.Context, rp *segment.RetentionPolicy) {
   163  	observer := prometheus.ObserverFunc(e.metrics.exemplarsRetentionTaskDuration.Observe)
   164  	timer := prometheus.NewTimer(observer)
   165  	defer timer.ObserveDuration()
   166  
   167  	e.logger.Debug("enforcing exemplars retention policy")
   168  	err := e.truncateBefore(ctx, rp.ExemplarsRetentionTime)
   169  	switch {
   170  	case err == nil:
   171  	case errors.Is(ctx.Err(), context.Canceled):
   172  		e.logger.Warn("enforcing exemplars retention policy canceled")
   173  	default:
   174  		e.logger.WithError(err).Error("failed to enforce exemplars retention policy")
   175  	}
   176  }
   177  
   178  // exemplarKey creates a key in the v:{app_name}:{profile_id} format
   179  func exemplarKey(appName, profileID string) []byte {
   180  	return exemplarDataPrefix.key(appName + ":" + profileID)
   181  }
   182  
   183  // parseExemplarTimestamp returns timestamp and the profile
   184  // data key (in v:{app_name}:{profile_id} format), if the given timestamp key is valid.
   185  func parseExemplarTimestamp(k []byte) (int64, []byte, bool) {
   186  	v, ok := exemplarTimestampPrefix.trim(k)
   187  	if !ok {
   188  		return 0, nil, false
   189  	}
   190  	i := bytes.IndexByte(v, ':')
   191  	if i < 0 {
   192  		return 0, nil, false
   193  	}
   194  	t, err := strconv.ParseInt(string(v[:i]), 10, 64)
   195  	if err != nil {
   196  		return 0, nil, false
   197  	}
   198  	return t, append(exemplarDataPrefix.bytes(), v[i+1:]...), true
   199  }
   200  
   201  func exemplarKeyToTimestampKey(k []byte, t int64) ([]byte, bool) {
   202  	if v, ok := exemplarDataPrefix.trim(k); ok {
   203  		return append(exemplarTimestampPrefix.key(strconv.FormatInt(t, 10)+":"), v...), true
   204  	}
   205  	return nil, false
   206  }
   207  
   208  func (e *exemplars) flushCurrentBatch() {
   209  	entries := len(e.currentBatch.entries)
   210  	if entries == 0 {
   211  		return
   212  	}
   213  	b := e.currentBatch
   214  	e.currentBatch = e.newExemplarsBatch()
   215  	select {
   216  	case e.batches <- b:
   217  	default:
   218  		e.metrics.exemplarsDiscardedTotal.Add(float64(entries))
   219  	}
   220  }
   221  
   222  func (e *exemplars) Sync() {
   223  	e.mu.Lock()
   224  	defer e.mu.Unlock()
   225  	e.flush(e.currentBatch)
   226  	e.currentBatch = e.newExemplarsBatch()
   227  	n := len(e.batches)
   228  	var i int
   229  	for {
   230  		if i == n {
   231  			return
   232  		}
   233  		select {
   234  		default:
   235  			return
   236  		case b, ok := <-e.batches:
   237  			if !ok {
   238  				return
   239  			}
   240  			e.flush(b)
   241  			i++
   242  		}
   243  	}
   244  }
   245  
   246  func (e *exemplars) flushBatchQueue() {
   247  	e.once.Do(func() {
   248  		e.flush(e.currentBatch)
   249  		close(e.batches)
   250  		for batch := range e.batches {
   251  			e.flush(batch)
   252  		}
   253  	})
   254  }
   255  
   256  func (e *exemplars) flush(b *exemplarsBatch) {
   257  	if len(b.entries) == 0 {
   258  		return
   259  	}
   260  	e.logger.Debug("flushing completed batch")
   261  	err := e.db.Update(func(txn *badger.Txn) error {
   262  		for _, entry := range b.entries {
   263  			if err := b.writeExemplarToDB(txn, entry); err != nil {
   264  				return err
   265  			}
   266  		}
   267  		return nil
   268  	})
   269  
   270  	if err != nil {
   271  		e.logger.WithError(err).Error("failed to write exemplars batch")
   272  	}
   273  }
   274  
   275  func (e *exemplars) insert(ctx context.Context, input *PutInput) error {
   276  	if input.Val == nil || input.Val.Samples() == 0 {
   277  		return nil
   278  	}
   279  	e.mu.Lock()
   280  	defer e.mu.Unlock()
   281  	err := e.currentBatch.insert(ctx, input)
   282  	if err == errBatchIsFull {
   283  		e.flushCurrentBatch()
   284  		return e.currentBatch.insert(ctx, input)
   285  	}
   286  	return err
   287  }
   288  
   289  func (e *exemplars) fetch(ctx context.Context, appName string, profileIDs []string, fn func(exemplarEntry) error) error {
   290  	d, ok := e.dicts.Lookup(appName)
   291  	if !ok {
   292  		return nil
   293  	}
   294  	dx := d.(*dict.Dict)
   295  	return e.db.View(func(txn *badger.Txn) error {
   296  		for _, profileID := range profileIDs {
   297  			if err := ctx.Err(); err != nil {
   298  				return err
   299  			}
   300  			k := exemplarKey(appName, profileID)
   301  			item, err := txn.Get(k)
   302  			switch {
   303  			default:
   304  				return err
   305  			case errors.Is(err, badger.ErrKeyNotFound):
   306  			case err == nil:
   307  				// TODO(kolesnikovae): Optimize:
   308  				//   It makes sense to lookup the dictionary keys only after all
   309  				//   exemplars fetched and merged.
   310  				err = item.Value(func(val []byte) error {
   311  					e.metrics.exemplarsReadBytes.Observe(float64(len(val)))
   312  					var x exemplarEntry
   313  					if err = x.Deserialize(dx, val); err != nil {
   314  						return err
   315  					}
   316  					x.Key = k
   317  					x.AppName = appName
   318  					x.ProfileID = profileID
   319  					return fn(x)
   320  				})
   321  				if err != nil {
   322  					return err
   323  				}
   324  			}
   325  		}
   326  		return nil
   327  	})
   328  }
   329  
   330  func (e *exemplars) truncateBefore(ctx context.Context, before time.Time) (err error) {
   331  	for more := true; more; {
   332  		select {
   333  		case <-ctx.Done():
   334  			return ctx.Err()
   335  		case batch, ok := <-e.batches:
   336  			if ok {
   337  				e.flush(batch)
   338  			}
   339  		default:
   340  			if more, err = e.truncateN(before, defaultBatchSize); err != nil {
   341  				return err
   342  			}
   343  		}
   344  	}
   345  	return nil
   346  }
   347  
   348  func (e *exemplars) truncateN(before time.Time, count int) (bool, error) {
   349  	beforeTs := before.UnixNano()
   350  	keys := make([][]byte, 0, 2*count)
   351  	err := e.db.View(func(txn *badger.Txn) error {
   352  		it := txn.NewIterator(badger.IteratorOptions{
   353  			Prefix: exemplarTimestampPrefix.bytes(),
   354  		})
   355  		defer it.Close()
   356  		for it.Rewind(); it.Valid(); it.Next() {
   357  			if len(keys) == cap(keys) {
   358  				return nil
   359  			}
   360  			item := it.Item()
   361  			keyTs, exKey, ok := parseExemplarTimestamp(item.Key())
   362  			if !ok {
   363  				continue
   364  			}
   365  			if keyTs > beforeTs {
   366  				break
   367  			}
   368  			keys = append(keys, item.KeyCopy(nil))
   369  			keys = append(keys, exKey)
   370  		}
   371  		return nil
   372  	})
   373  
   374  	if err != nil {
   375  		return false, err
   376  	}
   377  	if len(keys) == 0 {
   378  		return false, nil
   379  	}
   380  
   381  	batch := e.db.NewWriteBatch()
   382  	defer batch.Cancel()
   383  	for i := range keys {
   384  		if err = batch.Delete(keys[i]); err != nil {
   385  			return false, err
   386  		}
   387  	}
   388  
   389  	if err = batch.Flush(); err == nil {
   390  		e.metrics.exemplarsRemovedTotal.Add(float64(len(keys) / 2))
   391  	}
   392  
   393  	return true, err
   394  }
   395  
   396  func (s *Storage) ensureAppSegmentExists(in *PutInput) error {
   397  	k := segment.AppSegmentKey(in.Key.AppName())
   398  	r, err := s.segments.GetOrCreate(k)
   399  	if err != nil {
   400  		return fmt.Errorf("segments cache for %v: %w", k, err)
   401  	}
   402  	st := r.(*segment.Segment)
   403  	st.SetMetadata(metadata.Metadata{
   404  		SpyName:         in.SpyName,
   405  		SampleRate:      in.SampleRate,
   406  		Units:           in.Units,
   407  		AggregationType: in.AggregationType,
   408  	})
   409  	s.segments.Put(k, st)
   410  	return err
   411  }
   412  
   413  func (b *exemplarsBatch) insert(_ context.Context, input *PutInput) error {
   414  	if len(b.entries) == b.batchSize {
   415  		return errBatchIsFull
   416  	}
   417  	profileID, ok := input.Key.ProfileID()
   418  	if !ok {
   419  		return errProfileIDRequired
   420  	}
   421  	appName := input.Key.AppName()
   422  	k := exemplarKey(appName, profileID)
   423  	key := string(k)
   424  	e, ok := b.entries[key]
   425  	if ok {
   426  		e.Tree.Merge(input.Val)
   427  		e.updateTime(input.StartTime.UnixNano(), input.EndTime.UnixNano())
   428  		return nil
   429  	}
   430  	b.entries[key] = &exemplarEntry{
   431  		Key:       k,
   432  		AppName:   appName,
   433  		ProfileID: profileID,
   434  
   435  		StartTime: input.StartTime.UnixNano(),
   436  		EndTime:   input.EndTime.UnixNano(),
   437  		Labels:    input.Key.Labels(),
   438  		Tree:      input.Val,
   439  	}
   440  	return nil
   441  }
   442  
   443  func (b *exemplarsBatch) writeExemplarToDB(txn *badger.Txn, e *exemplarEntry) error {
   444  	k, ok := exemplarKeyToTimestampKey(e.Key, e.EndTime)
   445  	if !ok {
   446  		return fmt.Errorf("invalid exemplar key")
   447  	}
   448  	if err := txn.Set(k, nil); err != nil {
   449  		return err
   450  	}
   451  	d, err := b.dicts.GetOrCreate(e.AppName)
   452  	if err != nil {
   453  		return err
   454  	}
   455  	dx := d.(*dict.Dict)
   456  
   457  	item, err := txn.Get(e.Key)
   458  	switch {
   459  	default:
   460  		return err
   461  	case errors.Is(err, badger.ErrKeyNotFound):
   462  		// Fast path: there is no exemplar with this key in the database.
   463  	case err == nil:
   464  		// Merge with the found exemplar using the buffer provided.
   465  		// Ideally, we should also drop existing timestamp key and create a new one,
   466  		// so that the exemplar wouldn't be deleted before its actual EndTime passes
   467  		// the retention policy threshold. The time difference is negligible, therefore
   468  		// it's not happening: only the first EndTime is honored.
   469  		err = item.Value(func(val []byte) error {
   470  			b.metrics.exemplarsReadBytes.Observe(float64(len(val)))
   471  			var x exemplarEntry
   472  			if err = x.Deserialize(dx, val); err == nil {
   473  				e = x.Merge(e)
   474  			}
   475  			return err
   476  		})
   477  		if err != nil {
   478  			return err
   479  		}
   480  	}
   481  
   482  	r, err := e.Serialize(dx, b.config.maxNodesSerialization)
   483  	if err != nil {
   484  		return err
   485  	}
   486  	if err = txn.Set(e.Key, r); err != nil {
   487  		return err
   488  	}
   489  	b.metrics.exemplarsWriteBytes.Observe(float64(len(r)))
   490  	return nil
   491  }
   492  
   493  func (e *exemplarEntry) Merge(src *exemplarEntry) *exemplarEntry {
   494  	e.updateTime(src.StartTime, src.EndTime)
   495  	e.Tree.Merge(src.Tree)
   496  	e.Key = src.Key
   497  	return e
   498  }
   499  
   500  func (e *exemplarEntry) updateTime(st, et int64) {
   501  	if st < e.StartTime {
   502  		e.StartTime = st
   503  	}
   504  	if et > e.EndTime {
   505  		e.EndTime = et
   506  	}
   507  }
   508  
   509  func (e *exemplarEntry) Serialize(d *dict.Dict, maxNodes int) ([]byte, error) {
   510  	b := bytes.NewBuffer(make([]byte, 0, 1<<10)) // 1 KB.
   511  	b.WriteByte(exemplarsCurrentFormat)          // Version.
   512  	if err := e.Tree.SerializeTruncate(d, maxNodes, b); err != nil {
   513  		return nil, err
   514  	}
   515  
   516  	vw := varint.NewWriter()
   517  	_, _ = vw.Write(b, uint64(e.StartTime))
   518  	_, _ = vw.Write(b, uint64(e.EndTime))
   519  
   520  	// Strip profile_id and __name__ labels.
   521  	labels := make([]string, 0, len(e.Labels)*2)
   522  	for k, v := range e.Labels {
   523  		if k == segment.ProfileIDLabelName || k == "__name__" {
   524  			continue
   525  		}
   526  		labels = append(labels, k, v)
   527  	}
   528  	// Write labels as an array of string pairs.
   529  	_, _ = vw.Write(b, uint64(len(labels)))
   530  	for _, v := range labels {
   531  		bs := []byte(v)
   532  		_, _ = vw.Write(b, uint64(len(bs)))
   533  		_, _ = b.Write(bs)
   534  	}
   535  
   536  	return b.Bytes(), nil
   537  }
   538  
   539  func (e *exemplarEntry) Deserialize(d *dict.Dict, b []byte) error {
   540  	buf := bytes.NewBuffer(b)
   541  	v, err := buf.ReadByte()
   542  	if err != nil {
   543  		return err
   544  	}
   545  	switch v {
   546  	case 1:
   547  		return e.deserializeV1(d, buf)
   548  	case 2:
   549  		return e.deserializeV2(d, buf)
   550  	default:
   551  		return fmt.Errorf("unknown exemplar format version %d", v)
   552  	}
   553  }
   554  
   555  func (e *exemplarEntry) deserializeV1(d *dict.Dict, src *bytes.Buffer) error {
   556  	t, err := tree.Deserialize(d, src)
   557  	if err != nil {
   558  		return err
   559  	}
   560  	e.Tree = t
   561  	return nil
   562  }
   563  
   564  func (e *exemplarEntry) deserializeV2(d *dict.Dict, src *bytes.Buffer) error {
   565  	t, err := tree.Deserialize(d, src)
   566  	if err != nil {
   567  		return err
   568  	}
   569  	e.Tree = t
   570  
   571  	st, err := varint.Read(src)
   572  	if err != nil {
   573  		return err
   574  	}
   575  	e.StartTime = int64(st)
   576  	et, err := varint.Read(src)
   577  	if err != nil {
   578  		return err
   579  	}
   580  	e.EndTime = int64(et)
   581  
   582  	n, err := varint.Read(src)
   583  	if err != nil {
   584  		return err
   585  	}
   586  	if e.Labels == nil {
   587  		e.Labels = make(map[string]string, n)
   588  	}
   589  	var k string
   590  	for i := uint64(0); i < n; i++ {
   591  		m, err := varint.Read(src)
   592  		if err != nil {
   593  			return err
   594  		}
   595  		v := string(src.Next(int(m)))
   596  		if i%2 != 0 {
   597  			e.Labels[k] = v
   598  		} else {
   599  			k = v
   600  		}
   601  	}
   602  
   603  	return nil
   604  }