github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/storage/stores/index/stats/stats.go

package stats

import (
	"encoding/binary"
	"sync"

	"github.com/prometheus/common/model"
	"github.com/willf/bloom"

	"github.com/grafana/loki/pkg/logproto"
	"github.com/grafana/loki/pkg/storage/stores/tsdb/index"
)

var BloomPool PoolBloom

type Stats = logproto.IndexStatsResponse

// MergeStats sums the counters of all non-nil inputs into a single Stats.
func MergeStats(xs ...*Stats) (s Stats) {
	for _, x := range xs {
		if x == nil {
			continue
		}
		s.Streams += x.Streams
		s.Chunks += x.Chunks
		s.Bytes += x.Bytes
		s.Entries += x.Entries
	}
	return s
}

type PoolBloom struct {
	pool sync.Pool
}

func (p *PoolBloom) Get() *Blooms {
	if x := p.pool.Get(); x != nil {
		return x.(*Blooms)
	}
	return newBlooms()
}

func (p *PoolBloom) Put(x *Blooms) {
	x.Streams.ClearAll()
	x.Chunks.ClearAll()
	x.stats = Stats{}
	p.pool.Put(x)
}

// These are very expensive in terms of memory usage:
// each Blooms instance requires ~12.5MB, so we rely heavily on pooling.
// See https://hur.st/bloomfilter to play around with the sizing parameters.
// We use one bloom filter per process per query to avoid double-counting
// duplicates when calculating statistics across multiple TSDB files. However,
// we cannot guarantee this when querying across period config boundaries,
// because the data is requested via separate calls to the underlying store,
// which may reside in a different process (index-gateway).
// This is an accepted fault: we may double-count some values which
// exist on both sides of a schema line,
// streams and chunks, and thus bytes/lines.
// Avoiding this would require significant refactoring
// to resolve statistics for all periods together,
// and that doesn't seem worth it: the code paths for iterating across
// different stores are separate.
// Another option would be to ship the bloom filter bitmaps sequentially to
// each store, but that is too inefficient (~12.5MB payloads).
// signed, @owen-d
func newBlooms() *Blooms {
	// 1 million streams @ 1% error =~ 1.14MB
	streams := bloom.NewWithEstimates(1e6, 0.01)
	// 10 million chunks @ 1% error =~ 11.43MB
	chunks := bloom.NewWithEstimates(10e6, 0.01)
	return &Blooms{
		Streams: streams,
		Chunks:  chunks,
	}
}
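// accumulateStats is a minimal usage sketch, not part of the original file:
// it shows the intended pool lifecycle of borrowing a *Blooms per query,
// feeding it fingerprints and chunk metas, reading the deduplicated Stats,
// and returning the filters to the pool via the deferred Put. The seriesSet
// parameter shape is a hypothetical stand-in for however a caller iterates
// its matched series.
func accumulateStats(seriesSet map[model.Fingerprint][]index.ChunkMeta) Stats {
	blooms := BloomPool.Get()
	defer BloomPool.Put(blooms)

	for fp, chks := range seriesSet {
		blooms.AddStream(fp)
		for _, chk := range chks {
			blooms.AddChunk(fp, chk)
		}
	}
	// The return value is a copy, evaluated before the deferred Put resets
	// the pooled stats.
	return blooms.Stats()
}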
// TODO(owen-d): shard this across a slice of smaller bloom filters to reduce
// lock contention.
//
// Blooms holds bloom filters for estimating duplicate statistics across both
// series and chunks within TSDB indices. These are used to calculate data
// topology statistics prior to running queries.
type Blooms struct {
	sync.RWMutex
	Streams, Chunks *bloom.BloomFilter
	stats           Stats
}

func (b *Blooms) Stats() Stats { return b.stats }

func (b *Blooms) AddStream(fp model.Fingerprint) {
	key := make([]byte, 8)
	binary.BigEndian.PutUint64(key, uint64(fp))
	b.add(b.Streams, key, func() {
		b.stats.Streams++
	})
}

func (b *Blooms) AddChunk(fp model.Fingerprint, chk index.ChunkMeta) {
	// Build a unique key for the chunk:
	// fingerprint (8B) + mintime (8B) + maxtime (8B) + checksum (4B).
	ln := 8 + 8 + 8 + 4
	key := make([]byte, ln)
	binary.BigEndian.PutUint64(key, uint64(fp))
	binary.BigEndian.PutUint64(key[8:], uint64(chk.MinTime))
	binary.BigEndian.PutUint64(key[16:], uint64(chk.MaxTime))
	binary.BigEndian.PutUint32(key[24:], chk.Checksum)
	b.add(b.Chunks, key, func() {
		b.stats.Chunks++
		// Widen before shifting: chk.KB is a uint32, so shifting it first
		// (uint64(chk.KB << 10)) would overflow for chunks of 4GiB or more.
		b.stats.Bytes += uint64(chk.KB) << 10
		b.stats.Entries += uint64(chk.Entries)
	})
}

// add tests the key under the read lock first; if the filter may already
// contain it, nothing is done. Otherwise it re-tests under the write lock
// (another goroutine may have added the key in between) and runs update only
// when the key was truly new.
func (b *Blooms) add(filter *bloom.BloomFilter, key []byte, update func()) {
	b.RLock()
	ok := filter.Test(key)
	b.RUnlock()

	if ok {
		return
	}

	b.Lock()
	defer b.Unlock()
	if ok = filter.TestAndAdd(key); !ok {
		update()
	}
}
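// exampleChunkDedupe is an illustrative sketch, not part of the original
// file: adding the same chunk twice only counts it once, because the bloom
// key (fingerprint+mintime+maxtime+checksum) dedupes repeats. Note the error
// is one-sided: the ~1% false-positive rate can under-count, never
// over-count. The field values below are arbitrary.
func exampleChunkDedupe() Stats {
	blooms := BloomPool.Get()
	defer BloomPool.Put(blooms)

	fp := model.Fingerprint(0xdeadbeef)
	chk := index.ChunkMeta{MinTime: 1000, MaxTime: 2000, Checksum: 42, KB: 64, Entries: 128}

	blooms.AddChunk(fp, chk)
	blooms.AddChunk(fp, chk) // duplicate: the read-lock Test hits, so update never runs

	// Expected: Chunks == 1, Bytes == 64 << 10, Entries == 128.
	return blooms.Stats()
}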