github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/storage/stores/index/stats/stats.go

package stats

import (
	"encoding/binary"
	"sync"

	"github.com/prometheus/common/model"
	"github.com/willf/bloom"

	"github.com/grafana/loki/pkg/logproto"
	"github.com/grafana/loki/pkg/storage/stores/tsdb/index"
)

var BloomPool PoolBloom

type Stats = logproto.IndexStatsResponse

// MergeStats sums the counters of all non-nil inputs into a single Stats.
func MergeStats(xs ...*Stats) (s Stats) {
	for _, x := range xs {
		if x == nil {
			continue
		}
		s.Streams += x.Streams
		s.Chunks += x.Chunks
		s.Bytes += x.Bytes
		s.Entries += x.Entries
	}
	return s
}

type PoolBloom struct {
	pool sync.Pool
}

func (p *PoolBloom) Get() *Blooms {
	if x := p.pool.Get(); x != nil {
		return x.(*Blooms)
	}
	return newBlooms()
}

func (p *PoolBloom) Put(x *Blooms) {
	x.Streams.ClearAll()
	x.Chunks.ClearAll()
	x.stats = Stats{}
	p.pool.Put(x)
}

// These are very expensive in terms of memory usage:
// each Blooms instance requires ~12.5MB, so we rely heavily on pooling.
// See https://hur.st/bloomfilter to play around with the sizing parameters.
// We use one bloom filter per process per query to avoid double-counting
// duplicates when calculating statistics across multiple TSDB files. However,
// we cannot guarantee this when querying across period config boundaries,
// because the data is requested via separate calls to the underlying store,
// which may reside in a different process (index-gateway).
// This is an accepted fault: we may double-count some values which
// exist on both sides of a schema line,
// streams and chunks, and thus bytes/lines.
// Avoiding this would require significant refactoring
// to resolve statistics for all periods together,
// and that doesn't seem worth it: the code paths for iterating across
// different stores are separate.
// Another option would be to ship the bloom filter bitmaps sequentially to
// each store, but that is too inefficient (~12.5MB payloads).
// signed, @owen-d
func newBlooms() *Blooms {
	// 1 million streams @ 1% error =~ 1.14MB
	streams := bloom.NewWithEstimates(1e6, 0.01)
	// 10 million chunks @ 1% error =~ 11.43MB
	chunks := bloom.NewWithEstimates(10e6, 0.01)
	return &Blooms{
		Streams: streams,
		Chunks:  chunks,
	}
}
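// accumulateStats is a minimal usage sketch, not part of the original file:
// it shows the intended pool lifecycle of borrowing a *Blooms per query,
// feeding it fingerprints and chunk metas, reading the deduplicated Stats,
// and returning the filters to the pool via the deferred Put. The seriesSet
// parameter shape is a hypothetical stand-in for however a caller iterates
// its matched series.
func accumulateStats(seriesSet map[model.Fingerprint][]index.ChunkMeta) Stats {
	blooms := BloomPool.Get()
	defer BloomPool.Put(blooms)

	for fp, chks := range seriesSet {
		blooms.AddStream(fp)
		for _, chk := range chks {
			blooms.AddChunk(fp, chk)
		}
	}
	// The return value is a copy, evaluated before the deferred Put resets
	// the pooled stats.
	return blooms.Stats()
}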
// TODO(owen-d): shard this across a slice of smaller bloom filters to reduce
// lock contention.
//
// Blooms holds bloom filters for estimating duplicate statistics across both
// series and chunks within TSDB indices. These are used to calculate data
// topology statistics prior to running queries.
type Blooms struct {
	sync.RWMutex
	Streams, Chunks *bloom.BloomFilter
	stats           Stats
}

func (b *Blooms) Stats() Stats { return b.stats }

func (b *Blooms) AddStream(fp model.Fingerprint) {
	key := make([]byte, 8)
	binary.BigEndian.PutUint64(key, uint64(fp))
	b.add(b.Streams, key, func() {
		b.stats.Streams++
	})
}

func (b *Blooms) AddChunk(fp model.Fingerprint, chk index.ChunkMeta) {
	// Build a unique key for the chunk:
	// fingerprint (8B) + mintime (8B) + maxtime (8B) + checksum (4B).
	ln := 8 + 8 + 8 + 4
	key := make([]byte, ln)
	binary.BigEndian.PutUint64(key, uint64(fp))
	binary.BigEndian.PutUint64(key[8:], uint64(chk.MinTime))
	binary.BigEndian.PutUint64(key[16:], uint64(chk.MaxTime))
	binary.BigEndian.PutUint32(key[24:], chk.Checksum)
	b.add(b.Chunks, key, func() {
		b.stats.Chunks++
		// Widen before shifting: chk.KB is a uint32, so shifting it first
		// (uint64(chk.KB << 10)) would overflow for chunks of 4GiB or more.
		b.stats.Bytes += uint64(chk.KB) << 10
		b.stats.Entries += uint64(chk.Entries)
	})
}

// add tests the key under the read lock first; if the filter may already
// contain it, nothing is done. Otherwise it re-tests under the write lock
// (another goroutine may have added the key in between) and runs update only
// when the key was truly new.
func (b *Blooms) add(filter *bloom.BloomFilter, key []byte, update func()) {
	b.RLock()
	ok := filter.Test(key)
	b.RUnlock()

	if ok {
		return
	}

	b.Lock()
	defer b.Unlock()
	if ok = filter.TestAndAdd(key); !ok {
		update()
	}
}
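// exampleChunkDedupe is an illustrative sketch, not part of the original
// file: adding the same chunk twice only counts it once, because the bloom
// key (fingerprint+mintime+maxtime+checksum) dedupes repeats. Note the error
// is one-sided: the ~1% false-positive rate can under-count, never
// over-count. The field values below are arbitrary.
func exampleChunkDedupe() Stats {
	blooms := BloomPool.Get()
	defer BloomPool.Put(blooms)

	fp := model.Fingerprint(0xdeadbeef)
	chk := index.ChunkMeta{MinTime: 1000, MaxTime: 2000, Checksum: 42, KB: 64, Entries: 128}

	blooms.AddChunk(fp, chk)
	blooms.AddChunk(fp, chk) // duplicate: the read-lock Test hits, so update never runs

	// Expected: Chunks == 1, Bytes == 64 << 10, Entries == 128.
	return blooms.Stats()
}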