github.com/grafana/pyroscope@v1.18.0/pkg/segmentwriter/memdb/index/index.go (about)

     1  // Copyright 2017 The Prometheus Authors
     2  // Licensed under the Apache License, Version 2.0 (the "License");
     3  // you may not use this file except in compliance with the License.
     4  // You may obtain a copy of the License at
     5  //
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  // A tsdb index writer that uses neither files nor mmap.
    15  // Intended for tiny segments in the v2 POC branch.
    16  // Inspired by loki https://raw.githubusercontent.com/grafana/loki/main/pkg/storage/wal/index/index.go
    17  // But actually copied from pyroscope and modified accordingly
    18  
    19  package index
    20  
    21  import (
    22  	"bytes"
    23  	"context"
    24  	"encoding/binary"
    25  	"fmt"
    26  	"hash"
    27  	"hash/crc32"
    28  	"io"
    29  	"math"
    30  	"os"
    31  	"sort"
    32  	"unsafe"
    33  
    34  	"github.com/grafana/pyroscope/pkg/phlaredb/tsdb/index"
    35  
    36  	"github.com/pkg/errors"
    37  	"github.com/prometheus/common/model"
    38  	"github.com/prometheus/prometheus/model/labels"
    39  	"github.com/prometheus/prometheus/storage"
    40  	tsdb_enc "github.com/prometheus/prometheus/tsdb/encoding"
    41  
    42  	typesv1 "github.com/grafana/pyroscope/api/gen/proto/go/types/v1"
    43  	phlaremodel "github.com/grafana/pyroscope/pkg/model"
    44  	"github.com/grafana/pyroscope/pkg/phlaredb/block"
    45  	"github.com/grafana/pyroscope/pkg/phlaredb/tsdb/encoding"
    46  )
    47  
const (
	// MagicIndex is the 4-byte magic number written at the head of an index.
	MagicIndex = 0xBAAAD700
	// HeaderLen is the number of bytes reserved at the start of the index for
	// the header. writeMeta emits exactly this many bytes: the 4-byte magic
	// number followed by a 1-byte format version.
	HeaderLen = 5

	// FormatV1 represents version 1 of the index format.
	FormatV1 = 1
	// FormatV2 represents version 2 of the index format. It is the version
	// written by writeMeta.
	FormatV2 = 2

	// IndexFilename is the conventional file name of an index.
	IndexFilename = "index"

	// fingerprintInterval controls the density of the fingerprint offsets
	// table: AddSeries records an entry for every series whose reference is a
	// multiple of 1024.
	fingerprintInterval = 1 << 10

	SegmentsIndexWriterBufSize = 2 * 0x1000 // small for segments
	BlocksIndexWriterBufSize   = 1 << 22    // large for blocks
)
    67  
    68  type indexWriterStage uint8
    69  
    70  const (
    71  	idxStageNone indexWriterStage = iota
    72  	idxStageSymbols
    73  	idxStageSeries
    74  	idxStageDone
    75  )
    76  
    77  func (s indexWriterStage) String() string {
    78  	switch s {
    79  	case idxStageNone:
    80  		return "none"
    81  	case idxStageSymbols:
    82  		return "symbols"
    83  	case idxStageSeries:
    84  		return "series"
    85  	case idxStageDone:
    86  		return "done"
    87  	}
    88  	return "<unknown>"
    89  }
    90  
    91  // The table gets initialized with sync.Once but may still cause a race
    92  // with any other use of the crc32 package anywhere. Thus we initialize it
    93  // before.
    94  var castagnoliTable *crc32.Table
    95  
    96  func init() {
    97  	castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
    98  }
    99  
   100  // newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the
   101  // polynomial may be easily changed in one location at a later time, if necessary.
   102  func newCRC32() hash.Hash32 {
   103  	return crc32.New(castagnoliTable)
   104  }
   105  
// symbolCacheEntry caches symbol-table lookups for one label name so that
// AddSeries can avoid repeated reverse lookups. Entries are keyed by label
// name in Writer.symbolCache.
type symbolCacheEntry struct {
	// index is the symbol-table index of the label name.
	index          uint32
	// lastValue is the most recent label value looked up for this name.
	lastValue      string
	// lastValueIndex is the symbol-table index of lastValue.
	lastValueIndex uint32
}
   111  
// Writer implements the IndexWriter interface for the standard
// serialization format. In this variant all output goes to BufferWriter
// instances obtained from a pool (see NewWriter) rather than to files.
type Writer struct {
	// ctx is checked for cancellation on stage transitions and while
	// postings are being generated.
	ctx context.Context

	// f receives the final index bytes.
	f *BufferWriter

	// Temporary buffer for postings; copied into f by writePostings.
	fP *BufferWriter
	// Temporary buffer for the posting offsets table; cntPO counts the
	// entries written to it.
	fPO   *BufferWriter
	cntPO uint64

	toc           TOC
	stage         indexWriterStage
	postingsStart uint64 // Due to padding, can differ from TOC entry.

	// Reusable memory.
	buf1 encoding.Encbuf
	buf2 encoding.Encbuf

	// Symbol bookkeeping: numSymbols and lastSymbol are maintained by
	// AddSymbol; symbols and symbolFile are set by finishSymbols;
	// symbolCache is maintained by AddSeries.
	numSymbols  int
	symbols     *Symbols
	symbolFile  io.Closer
	lastSymbol  string
	symbolCache map[string]symbolCacheEntry

	labelIndexes []labelIndexHashEntry // Label index offsets.
	labelNames   map[string]uint64     // Label names, and their usage.
	// Keeps track of the fingerprint/offset for every n series
	// (n is fingerprintInterval).
	fingerprintOffsets index.FingerprintOffsets

	// Hold last series to validate that clients insert new series in order.
	lastSeries     phlaremodel.Labels
	lastSeriesHash uint64
	lastRef        storage.SeriesRef

	// crc32 is the hash reused for section checksums.
	crc32 hash.Hash

	// Version is not assigned by any code visible in this part of the file.
	Version int
}
   153  
// TOC represents index Table Of Content that states where each section of index starts.
// Each uint64 field is a byte offset into the index, recorded by ensureStage
// just before the corresponding section is written. writeTOC serializes the
// fields in declaration order, followed by Metadata.From and Metadata.Through.
type TOC struct {
	Symbols            uint64
	Series             uint64
	LabelIndices       uint64
	LabelIndicesTable  uint64
	Postings           uint64
	PostingsTable      uint64
	FingerprintOffsets uint64
	Metadata           Metadata
}
   165  
   166  // Metadata is TSDB-level metadata
   167  type Metadata struct {
   168  	From, Through int64
   169  	Checksum      uint32
   170  }
   171  
   172  func (m *Metadata) EnsureBounds(from, through int64) {
   173  	if m.From == 0 || from < m.From {
   174  		m.From = from
   175  	}
   176  
   177  	if m.Through == 0 || through > m.Through {
   178  		m.Through = through
   179  	}
   180  }
   181  
   182  // NewTOCFromByteSlice return parsed TOC from given index byte slice.
   183  func NewTOCFromByteSlice(bs ByteSlice) (*TOC, error) {
   184  	if bs.Len() < indexTOCLen {
   185  		return nil, tsdb_enc.ErrInvalidSize
   186  	}
   187  	b := bs.Range(bs.Len()-indexTOCLen, bs.Len())
   188  
   189  	expCRC := binary.BigEndian.Uint32(b[len(b)-4:])
   190  	d := encoding.DecWrap(tsdb_enc.Decbuf{B: b[:len(b)-4]})
   191  	if d.Crc32(castagnoliTable) != expCRC {
   192  		return nil, errors.Wrap(tsdb_enc.ErrInvalidChecksum, "read TOC")
   193  	}
   194  
   195  	if err := d.Err(); err != nil {
   196  		return nil, err
   197  	}
   198  
   199  	return &TOC{
   200  		Symbols:            d.Be64(),
   201  		Series:             d.Be64(),
   202  		LabelIndices:       d.Be64(),
   203  		LabelIndicesTable:  d.Be64(),
   204  		Postings:           d.Be64(),
   205  		PostingsTable:      d.Be64(),
   206  		FingerprintOffsets: d.Be64(),
   207  		Metadata: Metadata{
   208  			From:     d.Be64int64(),
   209  			Through:  d.Be64int64(),
   210  			Checksum: expCRC,
   211  		},
   212  	}, nil
   213  }
   214  
   215  // NewWriter returns a new Writer to the given filename. It serializes data in format version 2.
   216  func NewWriter(ctx context.Context, bufferSize int) (*Writer, error) {
   217  	iw := &Writer{
   218  		ctx:   ctx,
   219  		f:     GetBufferWriterFromPool(),
   220  		fP:    GetBufferWriterFromPool(),
   221  		fPO:   GetBufferWriterFromPool(),
   222  		stage: idxStageNone,
   223  
   224  		// Reusable memory.
   225  		buf1: encoding.EncWrap(tsdb_enc.Encbuf{B: make([]byte, 0, bufferSize)}),
   226  		buf2: encoding.EncWrap(tsdb_enc.Encbuf{B: make([]byte, 0, bufferSize)}),
   227  
   228  		symbolCache: make(map[string]symbolCacheEntry, 1<<8),
   229  		labelNames:  make(map[string]uint64, 1<<8),
   230  		crc32:       newCRC32(),
   231  	}
   232  	if err := iw.writeMeta(); err != nil {
   233  		return nil, err
   234  	}
   235  	return iw, nil
   236  }
   237  
// write forwards the given byte slices to the main index buffer w.f.
func (w *Writer) write(bufs ...[]byte) error {
	return w.f.Write(bufs...)
}
   241  
// writeAt forwards buf to w.f.WriteAt at position pos. In this file it is used
// to back-fill length and checksum placeholders once their values are known.
func (w *Writer) writeAt(buf []byte, pos uint64) error {
	return w.f.WriteAt(buf, pos)
}
   245  
// addPadding forwards to w.f.AddPadding. Callers use it to align the current
// position of the main index buffer to a multiple of size bytes.
func (w *Writer) addPadding(size int) error {
	return w.f.AddPadding(size)
}
   249  
// ensureStage handles transitions between write stages and ensures that IndexWriter
// methods are called in an order valid for the implementation.
//
// It returns the context error if w.ctx has been cancelled, nil if the writer
// is already in stage s, and an error if the writer is already past s. Skipped
// intermediate stages are entered recursively. Entering a stage records the
// start offset of the corresponding section(s) in the TOC and performs the
// work that must happen at that boundary.
func (w *Writer) ensureStage(s indexWriterStage) error {
	// Non-blocking cancellation check.
	select {
	case <-w.ctx.Done():
		return w.ctx.Err()
	default:
	}

	if w.stage == s {
		return nil
	}
	if w.stage < s-1 {
		// A stage has been skipped.
		if err := w.ensureStage(s - 1); err != nil {
			return err
		}
	}
	if w.stage > s {
		return errors.Errorf("invalid stage %q, currently at %q", s, w.stage)
	}

	// Mark start of sections in table of contents.
	switch s {
	case idxStageSymbols:
		w.toc.Symbols = w.f.pos
		if err := w.startSymbols(); err != nil {
			return err
		}
	case idxStageSeries:
		// The symbol table must be complete before any series is written,
		// because AddSeries resolves label names and values through it.
		if err := w.finishSymbols(); err != nil {
			return err
		}
		w.toc.Series = w.f.pos

	case idxStageDone:
		// Finalization writes, in order: label indices, postings, the label
		// indices offset table, the postings offset table, the fingerprint
		// offsets table and finally the TOC.
		w.toc.LabelIndices = w.f.pos
		// LabelIndices generation depends on the posting offset
		// table produced at this stage.
		if err := w.writePostingsToTmpFiles(); err != nil {
			return err
		}
		if err := w.writeLabelIndices(); err != nil {
			return err
		}

		w.toc.Postings = w.f.pos
		if err := w.writePostings(); err != nil {
			return err
		}

		w.toc.LabelIndicesTable = w.f.pos
		if err := w.writeLabelIndexesOffsetTable(); err != nil {
			return err
		}

		w.toc.PostingsTable = w.f.pos
		if err := w.writePostingsOffsetTable(); err != nil {
			return err
		}

		w.toc.FingerprintOffsets = w.f.pos
		if err := w.writeFingerprintOffsetsTable(); err != nil {
			return err
		}

		if err := w.writeTOC(); err != nil {
			return err
		}
	}

	// The stage only advances once all work for it has succeeded.
	w.stage = s
	return nil
}
   324  
// writeMeta writes the index header: the 4-byte big-endian MagicIndex followed
// by a single byte holding the format version (FormatV2).
func (w *Writer) writeMeta() error {
	w.buf1.Reset()
	w.buf1.PutBE32(MagicIndex)
	w.buf1.PutByte(FormatV2)

	return w.write(w.buf1.Get())
}
   332  
   333  // AddSeries adds the series one at a time along with its chunks.
   334  // Requires a specific fingerprint to be passed in the case where the "desired"
   335  // fingerprint differs from what labels.Hash() produces. For example,
   336  // multitenant TSDBs embed a tenant label, but the actual series has no such
   337  // label and so the derived fingerprint differs.
   338  func (w *Writer) AddSeries(ref storage.SeriesRef, lset phlaremodel.Labels, fp model.Fingerprint, chunks ...index.ChunkMeta) error {
   339  	if err := w.ensureStage(idxStageSeries); err != nil {
   340  		return err
   341  	}
   342  
   343  	// Put the supplied fingerprint instead of the calculated hash.
   344  	// This allows us to have a synthetic label (__loki_tenant__) in
   345  	// the pre-compacted TSDBs which map to fingerprints (and chunks)
   346  	// without this label in storage.
   347  	labelHash := uint64(fp)
   348  
   349  	if ref < w.lastRef && len(w.lastSeries) != 0 {
   350  		return errors.Errorf("series with reference greater than %d already added", ref)
   351  	}
   352  	// We add padding to 16 bytes to increase the addressable space we get through 4 byte
   353  	// series references.
   354  	if err := w.addPadding(16); err != nil {
   355  		return errors.Errorf("failed to write padding bytes: %v", err)
   356  	}
   357  
   358  	if w.f.pos%16 != 0 {
   359  		return errors.Errorf("series write not 16-byte aligned at %d", w.f.pos)
   360  	}
   361  
   362  	w.buf2.Reset()
   363  	w.buf2.PutBE64(labelHash)
   364  	w.buf2.PutUvarint(len(lset))
   365  
   366  	for _, l := range lset {
   367  		var err error
   368  		cacheEntry, ok := w.symbolCache[l.Name]
   369  		nameIndex := cacheEntry.index
   370  		if !ok {
   371  			nameIndex, err = w.symbols.ReverseLookup(l.Name)
   372  			if err != nil {
   373  				return errors.Errorf("symbol entry for %q does not exist, %v", l.Name, err)
   374  			}
   375  		}
   376  		w.labelNames[l.Name]++
   377  		w.buf2.PutUvarint32(nameIndex)
   378  
   379  		valueIndex := cacheEntry.lastValueIndex
   380  		if !ok || cacheEntry.lastValue != l.Value {
   381  			valueIndex, err = w.symbols.ReverseLookup(l.Value)
   382  			if err != nil {
   383  				return errors.Errorf("symbol entry for %q does not exist, %v", l.Value, err)
   384  			}
   385  			w.symbolCache[l.Name] = symbolCacheEntry{
   386  				index:          nameIndex,
   387  				lastValue:      l.Value,
   388  				lastValueIndex: valueIndex,
   389  			}
   390  		}
   391  		w.buf2.PutUvarint32(valueIndex)
   392  	}
   393  
   394  	w.buf2.PutUvarint(len(chunks))
   395  
   396  	if len(chunks) > 0 {
   397  		c := chunks[0]
   398  		w.toc.Metadata.EnsureBounds(c.MinTime, c.MaxTime)
   399  
   400  		w.buf2.PutVarint64(c.MinTime)
   401  		w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime))
   402  		w.buf2.PutUvarint32(c.KB)
   403  		w.buf2.PutUvarint32(c.SeriesIndex)
   404  		w.buf2.PutBE32(c.Checksum)
   405  		t0 := c.MaxTime
   406  
   407  		for _, c := range chunks[1:] {
   408  			w.toc.Metadata.EnsureBounds(c.MinTime, c.MaxTime)
   409  			// Encode the diff against previous chunk as varint
   410  			// instead of uvarint because chunks may overlap
   411  			w.buf2.PutVarint64(c.MinTime - t0)
   412  			w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime))
   413  			w.buf2.PutUvarint32(c.KB)
   414  			w.buf2.PutUvarint32(c.SeriesIndex)
   415  			t0 = c.MaxTime
   416  
   417  			w.buf2.PutBE32(c.Checksum)
   418  		}
   419  	}
   420  
   421  	w.buf1.Reset()
   422  	w.buf1.PutUvarint(w.buf2.Len())
   423  
   424  	w.buf2.PutHash(w.crc32)
   425  
   426  	w.lastSeries = append(w.lastSeries[:0], lset...)
   427  	w.lastSeriesHash = labelHash
   428  	w.lastRef = ref
   429  
   430  	if ref%fingerprintInterval == 0 {
   431  		sRef := w.f.pos / 16
   432  		w.fingerprintOffsets = append(w.fingerprintOffsets, [2]uint64{sRef, labelHash})
   433  	}
   434  
   435  	if err := w.write(w.buf1.Get(), w.buf2.Get()); err != nil {
   436  		return errors.Wrap(err, "write series data")
   437  	}
   438  
   439  	return nil
   440  }
   441  
// startSymbols begins the symbol table by writing an 8-byte placeholder.
// finishSymbols later overwrites it with the real values.
func (w *Writer) startSymbols() error {
	// We are at w.toc.Symbols.
	// Leave 4 bytes of space for the length, and another 4 for the number of symbols
	// which will both be calculated later.
	return w.write([]byte("alenblen"))
}
   448  
   449  func (w *Writer) AddSymbol(sym string) error {
   450  	if err := w.ensureStage(idxStageSymbols); err != nil {
   451  		return err
   452  	}
   453  	if w.numSymbols != 0 && sym <= w.lastSymbol {
   454  		return errors.Errorf("symbol %q out-of-order", sym)
   455  	}
   456  	w.lastSymbol = sym
   457  	w.numSymbols++
   458  	w.buf1.Reset()
   459  	w.buf1.PutUvarintStr(sym)
   460  	return w.write(w.buf1.Get())
   461  }
   462  
   463  func (w *Writer) finishSymbols() error {
   464  	symbolTableSize := w.f.pos - w.toc.Symbols - 4
   465  	// The symbol table's <len> part is 4 bytes. So the total symbol table size must be less than or equal to 2^32-1
   466  	if symbolTableSize > math.MaxUint32 {
   467  		return errors.Errorf("symbol table size exceeds 4 bytes: %d", symbolTableSize)
   468  	}
   469  
   470  	// Write out the length and symbol count.
   471  	w.buf1.Reset()
   472  	w.buf1.PutBE32int(int(symbolTableSize))
   473  	w.buf1.PutBE32int(w.numSymbols)
   474  	if err := w.writeAt(w.buf1.Get(), w.toc.Symbols); err != nil {
   475  		return err
   476  	}
   477  
   478  	hashPos := w.f.pos
   479  	// Leave space for the hash. We can only calculate it
   480  	// now that the number of symbols is known, so mmap and do it from there.
   481  	if err := w.write([]byte("hash")); err != nil {
   482  		return err
   483  	}
   484  	if err := w.f.Flush(); err != nil {
   485  		return err
   486  	}
   487  
   488  	//sf, err := fileutil.OpenMmapFile(w.f.name)
   489  	buf, sf, err := w.f.Buffer()
   490  	if err != nil {
   491  		return err
   492  	}
   493  	w.symbolFile = sf
   494  	hash := crc32.Checksum(buf[w.toc.Symbols+4:hashPos], castagnoliTable)
   495  	w.buf1.Reset()
   496  	w.buf1.PutBE32(hash)
   497  	if err := w.writeAt(w.buf1.Get(), hashPos); err != nil {
   498  		return err
   499  	}
   500  
   501  	// Load in the symbol table efficiently for the rest of the index writing.
   502  	w.symbols, err = NewSymbols(RealByteSlice(buf), FormatV2, int(w.toc.Symbols))
   503  	if err != nil {
   504  		return errors.Wrap(err, "read symbols")
   505  	}
   506  	return nil
   507  }
   508  
   509  func (w *Writer) writeLabelIndices() error {
   510  	if err := w.fPO.Flush(); err != nil {
   511  		return err
   512  	}
   513  
   514  	// Find all the label values in the tmp posting offset table.
   515  	//f, err := fileutil.OpenMmapFile(w.fPO.name)
   516  	buf, closer, err := w.fPO.Buffer()
   517  	if err != nil {
   518  		return err
   519  	}
   520  	defer closer.Close()
   521  
   522  	d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(buf), int(w.fPO.pos)))
   523  	cnt := w.cntPO
   524  	current := []byte{}
   525  	values := []uint32{}
   526  	for d.Err() == nil && cnt > 0 {
   527  		cnt--
   528  		d.Uvarint()                           // Keycount.
   529  		name := d.UvarintBytes()              // Label name.
   530  		value := yoloString(d.UvarintBytes()) // Label value.
   531  		d.Uvarint64()                         // Offset.
   532  		if len(name) == 0 {
   533  			continue // All index is ignored.
   534  		}
   535  
   536  		if !bytes.Equal(name, current) && len(values) > 0 {
   537  			// We've reached a new label name.
   538  			if err := w.writeLabelIndex(string(current), values); err != nil {
   539  				return err
   540  			}
   541  			values = values[:0]
   542  		}
   543  		current = name
   544  		sid, err := w.symbols.ReverseLookup(value)
   545  		if err != nil {
   546  			return err
   547  		}
   548  		values = append(values, sid)
   549  	}
   550  	if d.Err() != nil {
   551  		return d.Err()
   552  	}
   553  
   554  	// Handle the last label.
   555  	if len(values) > 0 {
   556  		if err := w.writeLabelIndex(string(current), values); err != nil {
   557  			return err
   558  		}
   559  	}
   560  	return nil
   561  }
   562  
   563  func (w *Writer) writeLabelIndex(name string, values []uint32) error {
   564  	// Align beginning to 4 bytes for more efficient index list scans.
   565  	if err := w.addPadding(4); err != nil {
   566  		return err
   567  	}
   568  
   569  	w.labelIndexes = append(w.labelIndexes, labelIndexHashEntry{
   570  		keys:   []string{name},
   571  		offset: w.f.pos,
   572  	})
   573  
   574  	startPos := w.f.pos
   575  	// Leave 4 bytes of space for the length, which will be calculated later.
   576  	if err := w.write([]byte("alen")); err != nil {
   577  		return err
   578  	}
   579  	w.crc32.Reset()
   580  
   581  	w.buf1.Reset()
   582  	w.buf1.PutBE32int(1) // Number of names.
   583  	w.buf1.PutBE32int(len(values))
   584  	w.buf1.WriteToHash(w.crc32)
   585  	if err := w.write(w.buf1.Get()); err != nil {
   586  		return err
   587  	}
   588  
   589  	for _, v := range values {
   590  		w.buf1.Reset()
   591  		w.buf1.PutBE32(v)
   592  		w.buf1.WriteToHash(w.crc32)
   593  		if err := w.write(w.buf1.Get()); err != nil {
   594  			return err
   595  		}
   596  	}
   597  
   598  	// Write out the length.
   599  	w.buf1.Reset()
   600  	l := w.f.pos - startPos - 4
   601  	if l > math.MaxUint32 {
   602  		return errors.Errorf("label index size exceeds 4 bytes: %d", l)
   603  	}
   604  	w.buf1.PutBE32int(int(l))
   605  	if err := w.writeAt(w.buf1.Get(), startPos); err != nil {
   606  		return err
   607  	}
   608  
   609  	w.buf1.Reset()
   610  	w.buf1.PutHashSum(w.crc32)
   611  	return w.write(w.buf1.Get())
   612  }
   613  
   614  // writeLabelIndexesOffsetTable writes the label indices offset table.
   615  func (w *Writer) writeLabelIndexesOffsetTable() error {
   616  	startPos := w.f.pos
   617  	// Leave 4 bytes of space for the length, which will be calculated later.
   618  	if err := w.write([]byte("alen")); err != nil {
   619  		return err
   620  	}
   621  	w.crc32.Reset()
   622  
   623  	w.buf1.Reset()
   624  	w.buf1.PutBE32int(len(w.labelIndexes))
   625  	w.buf1.WriteToHash(w.crc32)
   626  	if err := w.write(w.buf1.Get()); err != nil {
   627  		return err
   628  	}
   629  
   630  	for _, e := range w.labelIndexes {
   631  		w.buf1.Reset()
   632  		w.buf1.PutUvarint(len(e.keys))
   633  		for _, k := range e.keys {
   634  			w.buf1.PutUvarintStr(k)
   635  		}
   636  		w.buf1.PutUvarint64(e.offset)
   637  		w.buf1.WriteToHash(w.crc32)
   638  		if err := w.write(w.buf1.Get()); err != nil {
   639  			return err
   640  		}
   641  	}
   642  	// Write out the length.
   643  	w.buf1.Reset()
   644  	l := w.f.pos - startPos - 4
   645  	if l > math.MaxUint32 {
   646  		return errors.Errorf("label indexes offset table size exceeds 4 bytes: %d", l)
   647  	}
   648  	w.buf1.PutBE32int(int(l))
   649  	if err := w.writeAt(w.buf1.Get(), startPos); err != nil {
   650  		return err
   651  	}
   652  
   653  	w.buf1.Reset()
   654  	w.buf1.PutHashSum(w.crc32)
   655  	return w.write(w.buf1.Get())
   656  }
   657  
// writePostingsOffsetTable writes the postings offset table. It copies the
// entries collected in the temporary posting offset buffer into the main
// index, shifting every offset by w.postingsStart so that it refers to the
// final position of the postings list. The table is framed by a 4-byte length
// and a trailing CRC32. The temporary buffer is closed and removed afterwards.
func (w *Writer) writePostingsOffsetTable() error {
	// Ensure everything is in the temporary buffer.
	if err := w.fPO.Flush(); err != nil {
		return err
	}

	startPos := w.f.pos
	// Leave 4 bytes of space for the length, which will be calculated later.
	if err := w.write([]byte("alen")); err != nil {
		return err
	}

	// Copy over the tmp posting offset table, however we need to
	// adjust the offsets: they were recorded relative to the postings
	// temporary buffer, not the final index.
	adjustment := w.postingsStart

	w.buf1.Reset()
	w.crc32.Reset()
	w.buf1.PutBE32int(int(w.cntPO)) // Count.
	w.buf1.WriteToHash(w.crc32)
	if err := w.write(w.buf1.Get()); err != nil {
		return err
	}

	// Read the temporary table back from memory.
	buf, closer, err := w.fPO.Buffer()
	if err != nil {
		return err
	}
	defer func() {
		if closer != nil {
			closer.Close()
		}
	}()
	d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(buf), int(w.fPO.pos)))
	cnt := w.cntPO
	for d.Err() == nil && cnt > 0 {
		w.buf1.Reset()
		w.buf1.PutUvarint(d.Uvarint())                     // Keycount.
		w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label name.
		w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label value.
		w.buf1.PutUvarint64(d.Uvarint64() + adjustment)    // Offset.
		w.buf1.WriteToHash(w.crc32)
		if err := w.write(w.buf1.Get()); err != nil {
			return err
		}
		cnt--
	}
	if d.Err() != nil {
		return d.Err()
	}

	// Cleanup temporary buffer. The field itself is left set.
	if err := w.fPO.Close(); err != nil {
		return err
	}
	if err := w.fPO.Remove(); err != nil {
		return err
	}

	// Write out the length; it excludes the 4-byte length field itself.
	w.buf1.Reset()
	l := w.f.pos - startPos - 4
	if l > math.MaxUint32 {
		return errors.Errorf("postings offset table size exceeds 4 bytes: %d", l)
	}
	w.buf1.PutBE32int(int(l))
	if err := w.writeAt(w.buf1.Get(), startPos); err != nil {
		return err
	}

	// Finally write the hash.
	w.buf1.Reset()
	w.buf1.PutHashSum(w.crc32)
	return w.write(w.buf1.Get())
}
   740  
   741  func (w *Writer) writeFingerprintOffsetsTable() error {
   742  	w.buf1.Reset()
   743  	w.buf2.Reset()
   744  
   745  	w.buf1.PutBE32int(len(w.fingerprintOffsets)) // Count.
   746  	// build offsets
   747  	for _, x := range w.fingerprintOffsets {
   748  		w.buf1.PutBE64(x[0]) // series offset
   749  		w.buf1.PutBE64(x[1]) // hash
   750  	}
   751  
   752  	// write length
   753  	ln := w.buf1.Len()
   754  	// TODO(owen-d): can remove the uint32 cast in the future
   755  	// Had to uint32 wrap these for arm32 builds, which we'll remove in the future.
   756  	if uint32(ln) > uint32(math.MaxUint32) {
   757  		return errors.Errorf("fingerprint offset size exceeds 4 bytes: %d", ln)
   758  	}
   759  
   760  	w.buf2.PutBE32int(ln)
   761  	if err := w.write(w.buf2.Get()); err != nil {
   762  		return err
   763  	}
   764  
   765  	// write offsets+checksum
   766  	w.buf1.PutHash(w.crc32)
   767  	if err := w.write(w.buf1.Get()); err != nil {
   768  		return errors.Wrap(err, "failure writing fingerprint offsets")
   769  	}
   770  	return nil
   771  }
   772  
   773  const indexTOCLen = 8*9 + crc32.Size
   774  
   775  func (w *Writer) writeTOC() error {
   776  	w.buf1.Reset()
   777  
   778  	w.buf1.PutBE64(w.toc.Symbols)
   779  	w.buf1.PutBE64(w.toc.Series)
   780  	w.buf1.PutBE64(w.toc.LabelIndices)
   781  	w.buf1.PutBE64(w.toc.LabelIndicesTable)
   782  	w.buf1.PutBE64(w.toc.Postings)
   783  	w.buf1.PutBE64(w.toc.PostingsTable)
   784  	w.buf1.PutBE64(w.toc.FingerprintOffsets)
   785  
   786  	// metadata
   787  	w.buf1.PutBE64int64(w.toc.Metadata.From)
   788  	w.buf1.PutBE64int64(w.toc.Metadata.Through)
   789  
   790  	w.buf1.PutHash(w.crc32)
   791  
   792  	return w.write(w.buf1.Get())
   793  }
   794  
// writePostingsToTmpFiles builds every postings list from the series section
// already written to the main index buffer and stores the lists, together with
// their offset table entries, in the temporary buffers via writePosting.
//
// It first scans all series once to produce the special "all" posting (empty
// name and value). It then processes the label names in sorted order, in
// batches, re-scanning the series section once per batch and emitting one
// postings list per (name, value) pair. The context is checked for
// cancellation after each batch.
func (w *Writer) writePostingsToTmpFiles() error {
	names := make([]string, 0, len(w.labelNames))
	for n := range w.labelNames {
		names = append(names, n)
	}
	sort.Strings(names)

	if err := w.f.Flush(); err != nil {
		return err
	}
	// Read the index written so far back from memory.
	buf, closer, err := w.f.Buffer()
	if err != nil {
		return err
	}
	defer closer.Close()

	// Write out the special all posting.
	offsets := []uint32{}
	// The decoder is limited to the bytes before w.toc.LabelIndices and
	// positioned at the start of the series section.
	d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(buf), int(w.toc.LabelIndices)))
	d.Skip(int(w.toc.Series))
	for d.Len() > 0 {
		d.ConsumePadding()
		// Absolute position of this series entry in the index.
		startPos := w.toc.LabelIndices - uint64(d.Len())
		if startPos%16 != 0 {
			return errors.Errorf("series not 16-byte aligned at %d", startPos)
		}
		// NOTE(review): the conversion to uint32 is unchecked; a series
		// reference above 2^32-1 would be truncated here — confirm whether
		// that can occur for the index sizes this writer handles.
		offsets = append(offsets, uint32(startPos/16))
		// Skip to next series.
		x := d.Uvarint()
		d.Skip(x + crc32.Size)
		if err := d.Err(); err != nil {
			return err
		}
	}
	if err := w.writePosting("", "", offsets); err != nil {
		return err
	}
	maxPostings := uint64(len(offsets)) // No label name can have more postings than this.

	for len(names) > 0 {
		batchNames := []string{}
		var c uint64
		// Try to bunch up label names into one loop, but avoid
		// using more memory than a single label name can.
		for len(names) > 0 {
			if w.labelNames[names[0]]+c > maxPostings {
				if c > 0 {
					break
				}
				// A single name alone exceeds the number of series.
				return fmt.Errorf("corruption detected when writing postings to index: label %q has %d uses, but maxPostings is %d", names[0], w.labelNames[names[0]], maxPostings)
			}
			batchNames = append(batchNames, names[0])
			c += w.labelNames[names[0]]
			names = names[1:]
		}

		// Symbol index -> label name, for the names in this batch.
		nameSymbols := map[uint32]string{}
		for _, name := range batchNames {
			sid, err := w.symbols.ReverseLookup(name)
			if err != nil {
				return err
			}
			nameSymbols[sid] = name
		}
		// Label name -> label value -> positions.
		postings := map[uint32]map[uint32][]uint32{}

		// Re-scan the whole series section for this batch.
		d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(buf), int(w.toc.LabelIndices)))
		d.Skip(int(w.toc.Series))
		for d.Len() > 0 {
			d.ConsumePadding()
			startPos := w.toc.LabelIndices - uint64(d.Len())
			l := d.Uvarint() // Length of this series in bytes.
			startLen := d.Len()

			_ = d.Be64() // skip fingerprint
			// See if label names we want are in the series.
			numLabels := d.Uvarint()
			for i := 0; i < numLabels; i++ {
				lno := uint32(d.Uvarint())
				lvo := uint32(d.Uvarint())

				if _, ok := nameSymbols[lno]; ok {
					if _, ok := postings[lno]; !ok {
						postings[lno] = map[uint32][]uint32{}
					}
					postings[lno][lvo] = append(postings[lno][lvo], uint32(startPos/16))
				}
			}
			// Skip to next series: the rest of the entry body (its length
			// minus what has been consumed) plus the trailing checksum.
			d.Skip(l - (startLen - d.Len()) + crc32.Size)
			if err := d.Err(); err != nil {
				return err
			}
		}

		for _, name := range batchNames {
			// Write out postings for this label name.
			sid, err := w.symbols.ReverseLookup(name)
			if err != nil {
				return err
			}
			values := make([]uint32, 0, len(postings[sid]))
			for v := range postings[sid] {
				values = append(values, v)
			}
			// Symbol numbers are in order, so the strings will also be in order.
			sort.Sort(uint32slice(values))
			for _, v := range values {
				value, err := w.symbols.Lookup(v)
				if err != nil {
					return err
				}
				if err := w.writePosting(name, value, postings[sid][v]); err != nil {
					return err
				}
			}
		}
		// Non-blocking cancellation check between batches.
		select {
		case <-w.ctx.Done():
			return w.ctx.Err()
		default:
		}
	}
	return nil
}
   922  
   923  func (w *Writer) writePosting(name, value string, offs []uint32) error {
   924  	// Align beginning to 4 bytes for more efficient postings list scans.
   925  	if err := w.fP.AddPadding(4); err != nil {
   926  		return err
   927  	}
   928  
   929  	// Write out postings offset table to temporary file as we go.
   930  	w.buf1.Reset()
   931  	w.buf1.PutUvarint(2)
   932  	w.buf1.PutUvarintStr(name)
   933  	w.buf1.PutUvarintStr(value)
   934  	w.buf1.PutUvarint64(w.fP.pos) // This is relative to the postings tmp file, not the final index file.
   935  	if err := w.fPO.Write(w.buf1.Get()); err != nil {
   936  		return err
   937  	}
   938  	w.cntPO++
   939  
   940  	w.buf1.Reset()
   941  	w.buf1.PutBE32int(len(offs))
   942  
   943  	for _, off := range offs {
   944  		if off > (1<<32)-1 {
   945  			return errors.Errorf("series offset %d exceeds 4 bytes", off)
   946  		}
   947  		w.buf1.PutBE32(off)
   948  	}
   949  
   950  	w.buf2.Reset()
   951  	l := w.buf1.Len()
   952  	// We convert to uint to make code compile on 32-bit systems, as math.MaxUint32 doesn't fit into int there.
   953  	if uint(l) > math.MaxUint32 {
   954  		return errors.Errorf("posting size exceeds 4 bytes: %d", l)
   955  	}
   956  	w.buf2.PutBE32int(l)
   957  	w.buf1.PutHash(w.crc32)
   958  	return w.fP.Write(w.buf2.Get(), w.buf1.Get())
   959  }
   960  
   961  func (w *Writer) writePostings() error {
   962  	// There's padding in the tmp file, make sure it actually works.
   963  	if err := w.f.AddPadding(4); err != nil {
   964  		return err
   965  	}
   966  	w.postingsStart = w.f.pos
   967  
   968  	// Copy temporary file into main index.
   969  	if err := w.fP.Flush(); err != nil {
   970  		return err
   971  	}
   972  	//if _, err := w.fP.f.Seek(0, 0); err != nil {
   973  	//	return err
   974  	//}
   975  	// Don't need to calculate a checksum, so can copy directly.
   976  	//n, err := io.CopyBuffer(w.f.fbuf, w.fP.f, make([]byte, 1<<20))
   977  	//buf := make([]byte, cap(w.buf1.B))
   978  	//buf := w.buf1.B[:cap(w.buf1.B)]
   979  	//n, err := io.CopyBuffer(w.f.fbuf, w.fP.f, buf)
   980  	//if err != nil {
   981  	//	return err
   982  	//}
   983  	n, err := w.f.ReadFrom(w.fP)
   984  	if err != nil {
   985  		return err
   986  	}
   987  	if uint64(n) != w.fP.pos {
   988  		return errors.Errorf("wrote %d bytes to posting temporary file, but only read back %d", w.fP.pos, n)
   989  	}
   990  	//w.f.pos += uint64(n)
   991  
   992  	if err := w.fP.Close(); err != nil {
   993  		return err
   994  	}
   995  	if err := w.fP.Remove(); err != nil {
   996  		return err
   997  	}
   998  	//w.fP = nil
   999  	return nil
  1000  }
  1001  
  1002  type uint32slice []uint32
  1003  
  1004  func (s uint32slice) Len() int           { return len(s) }
  1005  func (s uint32slice) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
  1006  func (s uint32slice) Less(i, j int) bool { return s[i] < s[j] }
  1007  
// labelIndexHashEntry associates a list of keys with an offset.
// NOTE(review): nothing in the visible part of this file uses it; presumably a
// leftover from the label index table code — confirm before relying on it.
type labelIndexHashEntry struct {
	keys   []string
	offset uint64
}
  1012  
  1013  func (w *Writer) Close() error {
  1014  	// Even if this fails, we need to close all the files.
  1015  	ensureErr := w.ensureStage(idxStageDone)
  1016  
  1017  	if w.symbolFile != nil {
  1018  		if err := w.symbolFile.Close(); err != nil {
  1019  			return err
  1020  		}
  1021  	}
  1022  	if w.fP != nil {
  1023  		if err := w.fP.Close(); err != nil {
  1024  			return err
  1025  		}
  1026  	}
  1027  	if w.fPO != nil {
  1028  		if err := w.fPO.Close(); err != nil {
  1029  			return err
  1030  		}
  1031  	}
  1032  	if err := w.f.Close(); err != nil {
  1033  		return err
  1034  	}
  1035  	// w.f is kept around a bit longer and returned to pool by users
  1036  	PutBufferWriterToPool(w.fP)
  1037  	PutBufferWriterToPool(w.fPO)
  1038  	w.fP = nil
  1039  	w.fPO = nil
  1040  
  1041  	return ensureErr
  1042  }
  1043  
// StringIter iterates over a sorted list of strings.
//
// Callers loop on Next, read the current value with At, and check Err once
// Next has returned false.
type StringIter interface {
	// Next advances the iterator and returns true if another value was found.
	Next() bool

	// At returns the value at the current iterator position.
	At() string

	// Err returns the last error of the iterator.
	Err() error
}
  1055  
// Reader provides read access to a serialized index held in a ByteSlice.
// It is created by NewReader or NewFileReader.
type Reader struct {
	// b holds the raw index bytes; toc is the table of contents parsed from b
	// and supplies the offsets of the symbol, postings offset and fingerprint
	// offset tables.
	b   ByteSlice
	toc *TOC

	// Close that releases the underlying resources of the byte slice.
	c io.Closer

	// Map of LabelName to a list of some LabelValues's position in the offset table.
	// The first and last values for each name are always present.
	postings map[string][]postingOffset
	// For the v1 format, labelname -> labelvalue -> offset.
	postingsV1 map[string]map[string]uint64

	symbols     *Symbols
	nameSymbols map[uint32]string // Cache of the label name symbol lookups,
	// as there are not many and they are half of all lookups.

	// fingerprintOffsets is loaded from the fingerprint offsets table and is
	// handed to the sharded postings when a shard is requested.
	fingerprintOffsets index.FingerprintOffsets

	// dec decodes series and postings entries; its symbol lookups go through
	// the reader's lookupSymbol.
	dec *Decoder

	// version is the format version read from the index header
	// (FormatV1 or FormatV2).
	version int
}
  1079  
// postingOffset is one sampled entry of the postings offset table for a
// label name.
type postingOffset struct {
	// value is the label value of the entry.
	value string
	// off is the position of the entry as reported by ReadOffsetTable, i.e.
	// relative to the start of the table's content.
	off int
}
  1084  
  1085  // ByteSlice abstracts a byte slice.
  1086  type ByteSlice interface {
  1087  	Len() int
  1088  	Range(start, end int) []byte
  1089  }
  1090  
  1091  type RealByteSlice []byte
  1092  
  1093  func (b RealByteSlice) Len() int {
  1094  	return len(b)
  1095  }
  1096  
  1097  func (b RealByteSlice) Range(start, end int) []byte {
  1098  	return b[start:end]
  1099  }
  1100  
  1101  func (b RealByteSlice) Sub(start, end int) ByteSlice {
  1102  	return b[start:end]
  1103  }
  1104  
  1105  // NewReader returns a new index reader on the given byte slice. It automatically
  1106  // handles different format versions.
  1107  func NewReader(b ByteSlice) (*Reader, error) {
  1108  	return newReader(b, io.NopCloser(nil))
  1109  }
  1110  
  1111  type nopCloser struct{}
  1112  
  1113  func (nopCloser) Close() error { return nil }
  1114  
  1115  // NewFileReader returns a new index reader against the given index file.
  1116  func NewFileReader(path string) (*Reader, error) {
  1117  	b, err := os.ReadFile(path)
  1118  	if err != nil {
  1119  		return nil, err
  1120  	}
  1121  	r, err := newReader(RealByteSlice(b), nopCloser{})
  1122  	if err != nil {
  1123  		return r, err
  1124  	}
  1125  
  1126  	return r, nil
  1127  }
  1128  
// newReader builds a Reader over b and eagerly loads everything lookups need:
// the header (magic number and version), the TOC, the symbol table, the
// postings offset table, the label name symbol cache and the fingerprint
// offsets table. c is stored and closed by Reader.Close.
//
// It returns an error if the header is too short, the magic number or version
// is not recognized, or any of the tables cannot be read.
func newReader(b ByteSlice, c io.Closer) (*Reader, error) {
	r := &Reader{
		b:        b,
		c:        c,
		postings: map[string][]postingOffset{},
	}

	// Verify header.
	if r.b.Len() < HeaderLen {
		return nil, errors.Wrap(tsdb_enc.ErrInvalidSize, "index header")
	}
	if m := binary.BigEndian.Uint32(r.b.Range(0, 4)); m != MagicIndex {
		return nil, errors.Errorf("invalid magic number %x", m)
	}
	// The format version is the single byte following the 4-byte magic number.
	r.version = int(r.b.Range(4, 5)[0])

	if r.version != FormatV1 && r.version != FormatV2 {
		return nil, errors.Errorf("unknown index file version %d", r.version)
	}

	var err error
	r.toc, err = NewTOCFromByteSlice(b)
	if err != nil {
		return nil, errors.Wrap(err, "read TOC")
	}

	r.symbols, err = NewSymbols(r.b, r.version, int(r.toc.Symbols))
	if err != nil {
		return nil, errors.Wrap(err, "read symbols")
	}

	if r.version == FormatV1 {
		// Earlier V1 formats don't have a sorted postings offset table, so
		// load the whole offset table into memory.
		r.postingsV1 = map[string]map[string]uint64{}
		if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, off uint64, _ int) error {
			if len(key) != 2 {
				return errors.Errorf("unexpected key length for posting table %d", len(key))
			}
			if _, ok := r.postingsV1[key[0]]; !ok {
				r.postingsV1[key[0]] = map[string]uint64{}
				r.postings[key[0]] = nil // Used to get a list of labelnames in places.
			}
			r.postingsV1[key[0]][key[1]] = off
			return nil
		}); err != nil {
			return nil, errors.Wrap(err, "read postings table")
		}
	} else {
		// lastKey/lastOff remember the most recent entry that was NOT sampled,
		// so it can be added as the final value once the label name changes
		// (or the table ends).
		var lastKey []string
		lastOff := 0
		valueCount := 0
		// For the postings offset table we keep every label name but only every nth
		// label value (plus the first and last one), to save memory.
		if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, _ uint64, off int) error {
			if len(key) != 2 {
				return errors.Errorf("unexpected key length for posting table %d", len(key))
			}
			if _, ok := r.postings[key[0]]; !ok {
				// Next label name.
				r.postings[key[0]] = []postingOffset{}
				if lastKey != nil {
					// Always include last value for each label name.
					r.postings[lastKey[0]] = append(r.postings[lastKey[0]], postingOffset{value: lastKey[1], off: lastOff})
				}
				lastKey = nil
				valueCount = 0
			}
			if valueCount%symbolFactor == 0 {
				// Sampled entry; it is already stored, so there is nothing
				// pending for this name.
				r.postings[key[0]] = append(r.postings[key[0]], postingOffset{value: key[1], off: off})
				lastKey = nil
			} else {
				lastKey = key
				lastOff = off
			}
			valueCount++
			return nil
		}); err != nil {
			return nil, errors.Wrap(err, "read postings table")
		}
		// Flush the pending last value of the final label name.
		if lastKey != nil {
			r.postings[lastKey[0]] = append(r.postings[lastKey[0]], postingOffset{value: lastKey[1], off: lastOff})
		}
		// Trim any extra space in the slices.
		for k, v := range r.postings {
			l := make([]postingOffset, len(v))
			copy(l, v)
			r.postings[k] = l
		}
	}

	// Cache symbol reference -> label name for every known label name.
	r.nameSymbols = make(map[uint32]string, len(r.postings))
	for k := range r.postings {
		// NOTE(review): the empty name is presumably the all-postings key,
		// which has no entry in the symbol table — confirm.
		if k == "" {
			continue
		}
		off, err := r.symbols.ReverseLookup(k)
		if err != nil {
			return nil, errors.Wrap(err, "reverse symbol lookup")
		}
		r.nameSymbols[off] = k
	}

	r.fingerprintOffsets, err = readFingerprintOffsetsTable(r.b, r.toc.FingerprintOffsets)
	if err != nil {
		return nil, errors.Wrap(err, "loading fingerprint offsets")
	}

	r.dec = &Decoder{LookupSymbol: r.lookupSymbol}

	return r, nil
}
  1241  
// Version returns the file format version of the underlying index, as read
// from the index header when the reader was created (FormatV1 or FormatV2).
func (r *Reader) Version() int {
	return r.version
}
  1246  
  1247  // FileInfo returns some general stats about the underlying file
  1248  func (r *Reader) FileInfo() block.File {
  1249  	k, v := index.AllPostingsKey()
  1250  	postings, err := r.Postings(k, nil, v)
  1251  	if err != nil {
  1252  		panic(err)
  1253  	}
  1254  	var numSeries uint64
  1255  	for postings.Next() {
  1256  		numSeries++
  1257  	}
  1258  	return block.File{
  1259  		RelPath:   block.IndexFilename,
  1260  		SizeBytes: uint64(r.Size()),
  1261  		TSDB: &block.TSDBFile{
  1262  			NumSeries: numSeries,
  1263  		},
  1264  	}
  1265  }
  1266  
// Range marks a byte range.
// As filled in by PostingsRanges, End is Start plus the length of the range,
// i.e. the first byte after it.
type Range struct {
	Start, End int64
}
  1271  
  1272  // PostingsRanges returns a new map of byte range in the underlying index file
  1273  // for all postings lists.
  1274  func (r *Reader) PostingsRanges() (map[labels.Label]Range, error) {
  1275  	m := map[labels.Label]Range{}
  1276  	if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, off uint64, _ int) error {
  1277  		if len(key) != 2 {
  1278  			return errors.Errorf("unexpected key length for posting table %d", len(key))
  1279  		}
  1280  		d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(off), castagnoliTable))
  1281  		if d.Err() != nil {
  1282  			return d.Err()
  1283  		}
  1284  		m[labels.Label{Name: key[0], Value: key[1]}] = Range{
  1285  			Start: int64(off) + 4,
  1286  			End:   int64(off) + 4 + int64(d.Len()),
  1287  		}
  1288  		return nil
  1289  	}); err != nil {
  1290  		return nil, errors.Wrap(err, "read postings table")
  1291  	}
  1292  	return m, nil
  1293  }
  1294  
// Symbols gives access to the symbol table stored in a ByteSlice. Only a
// sample of symbol positions is kept in memory; lookups decode forward from
// the nearest sampled position.
type Symbols struct {
	// bs is the data containing the symbol table, version the index format
	// version, and off the position of the symbol table within bs.
	bs      ByteSlice
	version int
	off     int

	// offsets holds the position within bs of every symbolFactor-th symbol;
	// seen is the number of symbols decoded by NewSymbols.
	offsets []int
	seen    int
}
  1303  
// symbolFactor is the sampling interval used to save memory: NewSymbols
// records the position of every symbolFactor-th symbol, and the reader keeps
// every symbolFactor-th label value of the postings offset table.
const symbolFactor = 32
  1305  
  1306  // NewSymbols returns a Symbols object for symbol lookups.
  1307  func NewSymbols(bs ByteSlice, version, off int) (*Symbols, error) {
  1308  	s := &Symbols{
  1309  		bs:      bs,
  1310  		version: version,
  1311  		off:     off,
  1312  	}
  1313  	d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, off, castagnoliTable))
  1314  	var (
  1315  		origLen = d.Len()
  1316  		cnt     = d.Be32int()
  1317  		basePos = off + 4
  1318  	)
  1319  	s.offsets = make([]int, 0, 1+cnt/symbolFactor)
  1320  	for d.Err() == nil && s.seen < cnt {
  1321  		if s.seen%symbolFactor == 0 {
  1322  			s.offsets = append(s.offsets, basePos+origLen-d.Len())
  1323  		}
  1324  		d.UvarintBytes() // The symbol.
  1325  		s.seen++
  1326  	}
  1327  	if d.Err() != nil {
  1328  		return nil, d.Err()
  1329  	}
  1330  	return s, nil
  1331  }
  1332  
  1333  func (s Symbols) Lookup(o uint32) (string, error) {
  1334  	d := encoding.DecWrap(tsdb_enc.Decbuf{
  1335  		B: s.bs.Range(0, s.bs.Len()),
  1336  	})
  1337  
  1338  	if s.version == FormatV2 {
  1339  		if int(o) >= s.seen {
  1340  			return "", errors.Errorf("unknown symbol offset %d", o)
  1341  		}
  1342  		d.Skip(s.offsets[int(o/symbolFactor)])
  1343  		// Walk until we find the one we want.
  1344  		for i := o - (o / symbolFactor * symbolFactor); i > 0; i-- {
  1345  			d.UvarintBytes()
  1346  		}
  1347  	} else {
  1348  		d.Skip(int(o))
  1349  	}
  1350  	sym := d.UvarintStr()
  1351  	if d.Err() != nil {
  1352  		return "", d.Err()
  1353  	}
  1354  	return sym, nil
  1355  }
  1356  
// ReverseLookup returns the reference of the symbol sym.
//
// It binary-searches the sampled offsets for the first sampled symbol greater
// than sym, steps back one sample, and then decodes forward until it reaches a
// symbol that is not smaller than sym. For FormatV2 the reference is the
// ordinal number of the symbol; otherwise it is the byte position of the
// symbol within the underlying data. An error is returned if the table is
// empty, decoding fails, or sym is not present.
func (s Symbols) ReverseLookup(sym string) (uint32, error) {
	if len(s.offsets) == 0 {
		return 0, errors.Errorf("unknown symbol %q - no symbols", sym)
	}
	i := sort.Search(len(s.offsets), func(i int) bool {
		// Any decoding errors here will be lost, however
		// we already read through all of this at startup.
		d := encoding.DecWrap(tsdb_enc.Decbuf{
			B: s.bs.Range(0, s.bs.Len()),
		})
		d.Skip(s.offsets[i])
		return yoloString(d.UvarintBytes()) > sym
	})
	d := encoding.DecWrap(tsdb_enc.Decbuf{
		B: s.bs.Range(0, s.bs.Len()),
	})
	// i is the first sample greater than sym, so the symbol, if present, lies
	// in the run that starts at the previous sample.
	if i > 0 {
		i--
	}
	d.Skip(s.offsets[i])
	// res counts symbols from the start of the table.
	res := i * symbolFactor
	var lastLen int
	var lastSymbol string
	for d.Err() == nil && res <= s.seen {
		// Remember how much was left before this symbol; the v1 reference is
		// derived from it below.
		lastLen = d.Len()
		lastSymbol = yoloString(d.UvarintBytes())
		if lastSymbol >= sym {
			break
		}
		res++
	}
	if d.Err() != nil {
		return 0, d.Err()
	}
	if lastSymbol != sym {
		return 0, errors.Errorf("unknown symbol %q", sym)
	}
	if s.version == FormatV2 {
		return uint32(res), nil
	}
	// d spans the whole data, so total length minus what was left before the
	// symbol is the symbol's byte position.
	return uint32(s.bs.Len() - lastLen), nil
}
  1399  
// Size returns the number of sampled symbol offsets multiplied by 8.
// NOTE(review): presumably the in-memory size of the offsets in bytes,
// assuming 8-byte ints — confirm.
func (s Symbols) Size() int {
	return len(s.offsets) * 8
}
  1403  
  1404  func (s Symbols) Iter() StringIter {
  1405  	d := encoding.DecWrap(tsdb_enc.NewDecbufAt(s.bs, s.off, castagnoliTable))
  1406  	cnt := d.Be32int()
  1407  	return &symbolsIter{
  1408  		d:   d,
  1409  		cnt: cnt,
  1410  	}
  1411  }
  1412  
  1413  // symbolsIter implements StringIter.
  1414  type symbolsIter struct {
  1415  	d   encoding.Decbuf
  1416  	cnt int
  1417  	cur string
  1418  	err error
  1419  }
  1420  
  1421  func (s *symbolsIter) Next() bool {
  1422  	if s.cnt == 0 || s.err != nil {
  1423  		return false
  1424  	}
  1425  	s.cur = yoloString(s.d.UvarintBytes())
  1426  	s.cnt--
  1427  	if s.d.Err() != nil {
  1428  		s.err = s.d.Err()
  1429  		return false
  1430  	}
  1431  	return true
  1432  }
  1433  
  1434  func (s symbolsIter) At() string { return s.cur }
  1435  func (s symbolsIter) Err() error { return s.err }
  1436  
  1437  // ReadOffsetTable reads an offset table and at the given position calls f for each
  1438  // found entry. If f returns an error it stops decoding and returns the received error.
  1439  func ReadOffsetTable(bs ByteSlice, off uint64, f func([]string, uint64, int) error) error {
  1440  	d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, int(off), castagnoliTable))
  1441  	startLen := d.Len()
  1442  	cnt := d.Be32()
  1443  
  1444  	for d.Err() == nil && d.Len() > 0 && cnt > 0 {
  1445  		offsetPos := startLen - d.Len()
  1446  		keyCount := d.Uvarint()
  1447  		// The Postings offset table takes only 2 keys per entry (name and value of label),
  1448  		// and the LabelIndices offset table takes only 1 key per entry (a label name).
  1449  		// Hence setting the size to max of both, i.e. 2.
  1450  		keys := make([]string, 0, 2)
  1451  
  1452  		for i := 0; i < keyCount; i++ {
  1453  			keys = append(keys, d.UvarintStr())
  1454  		}
  1455  		o := d.Uvarint64()
  1456  		if d.Err() != nil {
  1457  			break
  1458  		}
  1459  		if err := f(keys, o, offsetPos); err != nil {
  1460  			return err
  1461  		}
  1462  		cnt--
  1463  	}
  1464  	return d.Err()
  1465  }
  1466  
  1467  func readFingerprintOffsetsTable(bs ByteSlice, off uint64) (index.FingerprintOffsets, error) {
  1468  	d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, int(off), castagnoliTable))
  1469  	cnt := d.Be32()
  1470  	res := make(index.FingerprintOffsets, 0, int(cnt))
  1471  
  1472  	for d.Err() == nil && d.Len() > 0 && cnt > 0 {
  1473  		res = append(res, [2]uint64{d.Be64(), d.Be64()})
  1474  		cnt--
  1475  	}
  1476  
  1477  	return res, d.Err()
  1478  }
  1479  
// Close the reader and its underlying resources by closing the io.Closer the
// reader was created with. Readers built by NewReader or NewFileReader are
// given a closer that does nothing.
func (r *Reader) Close() error {
	return r.c.Close()
}
  1484  
  1485  func (r *Reader) lookupSymbol(o uint32) (string, error) {
  1486  	if s, ok := r.nameSymbols[o]; ok {
  1487  		return s, nil
  1488  	}
  1489  	return r.symbols.Lookup(o)
  1490  }
  1491  
// Bounds returns the From and Through values recorded in the TOC metadata.
// NOTE(review): presumably the time range covered by the index — confirm the
// unit and inclusiveness against the writer.
func (r *Reader) Bounds() (int64, int64) {
	return r.toc.Metadata.From, r.toc.Metadata.Through
}
  1495  
// Checksum returns the checksum recorded in the TOC metadata.
// NOTE(review): what exactly the checksum covers is not visible here — confirm
// against the writer.
func (r *Reader) Checksum() uint32 {
	return r.toc.Metadata.Checksum
}
  1499  
// Symbols returns an iterator over the symbols that exist within the index.
// Each call returns a fresh iterator that starts at the first symbol.
func (r *Reader) Symbols() StringIter {
	return r.symbols.Iter()
}
  1504  
// SymbolTableSize returns the symbol table size in bytes.
// The value is whatever Symbols.Size reports, which is derived from the number
// of sampled symbol offsets rather than from the serialized table.
func (r *Reader) SymbolTableSize() uint64 {
	return uint64(r.symbols.Size())
}
  1509  
  1510  // SortedLabelValues returns value tuples that exist for the given label name.
  1511  // It is not safe to use the return value beyond the lifetime of the byte slice
  1512  // passed into the Reader.
  1513  func (r *Reader) SortedLabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
  1514  	values, err := r.LabelValues(name, matchers...)
  1515  	if err == nil && r.version == FormatV1 {
  1516  		sort.Strings(values)
  1517  	}
  1518  	return values, err
  1519  }
  1520  
// LabelValues returns value tuples that exist for the given label name.
// It is not safe to use the return value beyond the lifetime of the byte slice
// passed into the Reader.
//
// Passing any matcher results in an error. An unknown label name yields a nil
// slice and no error. For the v1 format the values are returned in map
// iteration order; otherwise they are read from the postings offset table,
// from the first to the last entry recorded for the name.
// TODO(replay): Support filtering by matchers
func (r *Reader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
	if len(matchers) > 0 {
		return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers)
	}

	if r.version == FormatV1 {
		e, ok := r.postingsV1[name]
		if !ok {
			return nil, nil
		}
		values := make([]string, 0, len(e))
		for k := range e {
			values = append(values, k)
		}
		return values, nil

	}
	e, ok := r.postings[name]
	if !ok {
		return nil, nil
	}
	if len(e) == 0 {
		return nil, nil
	}
	// Only every symbolFactor-th value is kept in e, so this is an upper
	// estimate of the number of values.
	values := make([]string, 0, len(e)*symbolFactor)

	// No checksum table is passed here; the table is scanned starting at the
	// first entry recorded for name.
	d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil))
	d.Skip(e[0].off)
	lastVal := e[len(e)-1].value

	// skip is the byte length of the key count plus label name that prefixes
	// every entry; it is measured on the first entry and reused afterwards.
	skip := 0
	for d.Err() == nil {
		if skip == 0 {
			// These are always the same number of bytes,
			// and it's faster to skip than parse.
			skip = d.Len()
			d.Uvarint()      // Keycount.
			d.UvarintBytes() // Label name.
			skip -= d.Len()
		} else {
			d.Skip(skip)
		}
		s := yoloString(d.UvarintBytes()) // Label value.
		values = append(values, s)
		// The last recorded value marks the end of the entries for name.
		if s == lastVal {
			break
		}
		d.Uvarint64() // Offset.
	}
	if d.Err() != nil {
		return nil, errors.Wrap(d.Err(), "get postings offset entry")
	}
	return values, nil
}
  1579  
  1580  // LabelNamesFor returns all the label names for the series referred to by IDs.
  1581  // The names returned are sorted.
  1582  func (r *Reader) LabelNamesFor(ids ...storage.SeriesRef) ([]string, error) {
  1583  	// Gather offsetsMap the name offsetsMap in the symbol table first
  1584  	offsetsMap := make(map[uint32]struct{})
  1585  	for _, id := range ids {
  1586  		offset := id
  1587  		// In version 2 series IDs are no longer exact references but series are 16-byte padded
  1588  		// and the ID is the multiple of 16 of the actual position.
  1589  		if r.version == FormatV2 {
  1590  			offset = id * 16
  1591  		}
  1592  
  1593  		d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable))
  1594  		buf := d.Get()
  1595  		if d.Err() != nil {
  1596  			return nil, errors.Wrap(d.Err(), "get buffer for series")
  1597  		}
  1598  
  1599  		offsets, err := r.dec.LabelNamesOffsetsFor(buf)
  1600  		if err != nil {
  1601  			return nil, errors.Wrap(err, "get label name offsets")
  1602  		}
  1603  		for _, off := range offsets {
  1604  			offsetsMap[off] = struct{}{}
  1605  		}
  1606  	}
  1607  
  1608  	// Lookup the unique symbols.
  1609  	names := make([]string, 0, len(offsetsMap))
  1610  	for off := range offsetsMap {
  1611  		name, err := r.lookupSymbol(off)
  1612  		if err != nil {
  1613  			return nil, errors.Wrap(err, "lookup symbol in LabelNamesFor")
  1614  		}
  1615  		names = append(names, name)
  1616  	}
  1617  
  1618  	sort.Strings(names)
  1619  
  1620  	return names, nil
  1621  }
  1622  
  1623  // LabelValueFor returns label value for the given label name in the series referred to by ID.
  1624  func (r *Reader) LabelValueFor(id storage.SeriesRef, label string) (string, error) {
  1625  	offset := id
  1626  	// In version 2 series IDs are no longer exact references but series are 16-byte padded
  1627  	// and the ID is the multiple of 16 of the actual position.
  1628  	if r.version == FormatV2 {
  1629  		offset = id * 16
  1630  	}
  1631  	d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable))
  1632  	buf := d.Get()
  1633  	if d.Err() != nil {
  1634  		return "", errors.Wrap(d.Err(), "label values for")
  1635  	}
  1636  
  1637  	value, err := r.dec.LabelValueFor(buf, label)
  1638  	if err != nil {
  1639  		return "", storage.ErrNotFound
  1640  	}
  1641  
  1642  	if value == "" {
  1643  		return "", storage.ErrNotFound
  1644  	}
  1645  
  1646  	return value, nil
  1647  }
  1648  
  1649  // Series reads the series with the given ID and writes its labels and chunks into lbls and chks.
  1650  func (r *Reader) Series(id storage.SeriesRef, lbls *phlaremodel.Labels, chks *[]index.ChunkMeta) (uint64, error) {
  1651  	offset := id
  1652  	// In version 2 series IDs are no longer exact references but series are 16-byte padded
  1653  	// and the ID is the multiple of 16 of the actual position.
  1654  	if r.version == FormatV2 {
  1655  		offset = id * 16
  1656  	}
  1657  	d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable))
  1658  	if d.Err() != nil {
  1659  		return 0, d.Err()
  1660  	}
  1661  
  1662  	fprint, err := r.dec.Series(d.Get(), lbls, chks, false)
  1663  	if err != nil {
  1664  		return 0, errors.Wrap(err, "read series")
  1665  	}
  1666  	return fprint, nil
  1667  }
  1668  
  1669  // SeriesBy is like Series but allows to group labels by name. This avoid looking up all label symbols for requested series.
  1670  func (r *Reader) SeriesBy(id storage.SeriesRef, lbls *phlaremodel.Labels, chks *[]index.ChunkMeta, by ...string) (uint64, error) {
  1671  	offset := id
  1672  	// In version 2 series IDs are no longer exact references but series are 16-byte padded
  1673  	// and the ID is the multiple of 16 of the actual position.
  1674  	if r.version == FormatV2 {
  1675  		offset = id * 16
  1676  	}
  1677  	d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable))
  1678  	if d.Err() != nil {
  1679  		return 0, d.Err()
  1680  	}
  1681  
  1682  	fprint, err := r.dec.Series(d.Get(), lbls, chks, true, by...)
  1683  	if err != nil {
  1684  		return 0, errors.Wrap(err, "read series")
  1685  	}
  1686  	return fprint, nil
  1687  }
  1688  
// Postings returns the merged postings lists of the given values of label
// name. Values without a postings list are ignored, and an unknown name or an
// empty list of values yields empty postings.
//
// The loop below advances through values and the offset table together, so it
// relies on values being sorted. When shard is non-nil the merged postings are
// wrapped in sharded postings; the v1 code path returns before that step and
// does not apply the shard.
func (r *Reader) Postings(name string, shard *index.ShardAnnotation, values ...string) (index.Postings, error) {
	if r.version == FormatV1 {
		e, ok := r.postingsV1[name]
		if !ok {
			return index.EmptyPostings(), nil
		}
		res := make([]index.Postings, 0, len(values))
		for _, v := range values {
			postingsOff, ok := e[v]
			if !ok {
				continue
			}
			// Read from the postings table.
			d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(postingsOff), castagnoliTable))
			_, p, err := r.dec.Postings(d.Get())
			if err != nil {
				return nil, errors.Wrap(err, "decode postings")
			}
			res = append(res, p)
		}
		return index.Merge(res...), nil
	}

	e, ok := r.postings[name]
	if !ok {
		return index.EmptyPostings(), nil
	}

	if len(values) == 0 {
		return index.EmptyPostings(), nil
	}

	res := make([]index.Postings, 0, len(values))
	// skip is the byte length of the key count plus label name that prefixes
	// every offset table entry of this name; measured once, then reused.
	skip := 0
	valueIndex := 0
	for valueIndex < len(values) && values[valueIndex] < e[0].value {
		// Discard values before the start.
		valueIndex++
	}
	for valueIndex < len(values) {
		value := values[valueIndex]

		// Find the sampled entry from which to start scanning for value.
		i := sort.Search(len(e), func(i int) bool { return e[i].value >= value })
		if i == len(e) {
			// We're past the end.
			break
		}
		if i > 0 && e[i].value != value {
			// Need to look from previous entry.
			i--
		}
		// Don't Crc32 the entire postings offset table, this is very slow
		// so hope any issues were caught at startup.
		d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil))
		d.Skip(e[i].off)

		// Iterate on the offset table.
		var postingsOff uint64 // The offset into the postings table.
		for d.Err() == nil {
			if skip == 0 {
				// These are always the same number of bytes,
				// and it's faster to skip than parse.
				skip = d.Len()
				d.Uvarint()      // Keycount.
				d.UvarintBytes() // Label name.
				skip -= d.Len()
			} else {
				d.Skip(skip)
			}
			v := d.UvarintBytes()       // Label value.
			postingsOff = d.Uvarint64() // Offset.
			// Consume every requested value that is not greater than the
			// table entry just read; only an exact match has postings.
			for string(v) >= value {
				if string(v) == value {
					// Read from the postings table.
					d2 := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(postingsOff), castagnoliTable))
					_, p, err := r.dec.Postings(d2.Get())
					if err != nil {
						return nil, errors.Wrap(err, "decode postings")
					}
					res = append(res, p)
				}
				valueIndex++
				if valueIndex == len(values) {
					break
				}
				value = values[valueIndex]
			}
			if i+1 == len(e) || value >= e[i+1].value || valueIndex == len(values) {
				// Need to go to a later postings offset entry, if there is one.
				break
			}
		}
		if d.Err() != nil {
			return nil, errors.Wrap(d.Err(), "get postings offset entry")
		}
	}

	merged := index.Merge(res...)
	if shard != nil {
		return index.NewShardedPostings(merged, *shard, r.fingerprintOffsets), nil
	}

	return merged, nil
}
  1793  
// Size returns the size of an index file, i.e. the length in bytes of the
// byte slice the reader was created with.
func (r *Reader) Size() int64 {
	return int64(r.b.Len())
}
  1798  
  1799  // LabelNames returns all the unique label names present in the index.
  1800  // TODO(twilkie) implement support for matchers
  1801  func (r *Reader) LabelNames(matchers ...*labels.Matcher) ([]string, error) {
  1802  	if len(matchers) > 0 {
  1803  		return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers)
  1804  	}
  1805  
  1806  	labelNames := make([]string, 0, len(r.postings))
  1807  	allPostingsKeyName, _ := index.AllPostingsKey()
  1808  	for name := range r.postings {
  1809  		//if name == index.allPostingsKey.Name {
  1810  		if name == allPostingsKeyName {
  1811  			// This is not from any metric.
  1812  			continue
  1813  		}
  1814  		labelNames = append(labelNames, name)
  1815  	}
  1816  	sort.Strings(labelNames)
  1817  	return labelNames, nil
  1818  }
  1819  
// Decoder provides decoding methods for the v1 and v2 index file format.
//
// It currently does not contain decoding methods for all entry types but can be extended
// by them if there's demand.
type Decoder struct {
	// LookupSymbol resolves a symbol reference to its string. The methods
	// that turn references into label names or values call it, so it must be
	// set before they are used.
	LookupSymbol func(uint32) (string, error)
}
  1827  
  1828  // Postings returns a postings list for b and its number of elements.
  1829  func (dec *Decoder) Postings(b []byte) (int, index.Postings, error) {
  1830  	d := encoding.DecWrap(tsdb_enc.Decbuf{B: b})
  1831  	n := d.Be32int()
  1832  	l := d.Get()
  1833  	if d.Err() != nil {
  1834  		return 0, nil, d.Err()
  1835  	}
  1836  	if len(l) != 4*n {
  1837  		return 0, nil, fmt.Errorf("unexpected postings length, should be %d bytes for %d postings, got %d bytes", 4*n, n, len(l))
  1838  	}
  1839  	return n, index.NewBigEndianPostings(l), nil
  1840  }
  1841  
  1842  // LabelNamesOffsetsFor decodes the offsets of the name symbols for a given series.
  1843  // They are returned in the same order they're stored, which should be sorted lexicographically.
  1844  func (dec *Decoder) LabelNamesOffsetsFor(b []byte) ([]uint32, error) {
  1845  	d := encoding.DecWrap(tsdb_enc.Decbuf{B: b})
  1846  	_ = d.Be64() // skip fingerprint
  1847  	k := d.Uvarint()
  1848  
  1849  	offsets := make([]uint32, k)
  1850  	for i := 0; i < k; i++ {
  1851  		offsets[i] = uint32(d.Uvarint())
  1852  		_ = d.Uvarint() // skip the label value
  1853  
  1854  		if d.Err() != nil {
  1855  			return nil, errors.Wrap(d.Err(), "read series label offsets")
  1856  		}
  1857  	}
  1858  
  1859  	return offsets, d.Err()
  1860  }
  1861  
  1862  // LabelValueFor decodes a label for a given series.
  1863  func (dec *Decoder) LabelValueFor(b []byte, label string) (string, error) {
  1864  	d := encoding.DecWrap(tsdb_enc.Decbuf{B: b})
  1865  	_ = d.Be64() // skip fingerprint
  1866  	k := d.Uvarint()
  1867  
  1868  	for i := 0; i < k; i++ {
  1869  		lno := uint32(d.Uvarint())
  1870  		lvo := uint32(d.Uvarint())
  1871  
  1872  		if d.Err() != nil {
  1873  			return "", errors.Wrap(d.Err(), "read series label offsets")
  1874  		}
  1875  
  1876  		ln, err := dec.LookupSymbol(lno)
  1877  		if err != nil {
  1878  			return "", errors.Wrap(err, "lookup label name")
  1879  		}
  1880  
  1881  		if ln == label {
  1882  			lv, err := dec.LookupSymbol(lvo)
  1883  			if err != nil {
  1884  				return "", errors.Wrap(err, "lookup label value")
  1885  			}
  1886  
  1887  			return lv, nil
  1888  		}
  1889  	}
  1890  
  1891  	return "", d.Err()
  1892  }
  1893  
  1894  // Series decodes a series entry from the given byte slice into lset and chks.
  1895  func (dec *Decoder) Series(b []byte, lbls *phlaremodel.Labels, chks *[]index.ChunkMeta, group bool, by ...string) (uint64, error) {
  1896  	if lbls != nil {
  1897  		*lbls = (*lbls)[:0]
  1898  	}
  1899  	*chks = (*chks)[:0]
  1900  
  1901  	d := encoding.DecWrap(tsdb_enc.Decbuf{B: b})
  1902  
  1903  	fprint := d.Be64()
  1904  	k := d.Uvarint()
  1905  
  1906  	for i := 0; i < k; i++ {
  1907  		lno := uint32(d.Uvarint())
  1908  		lvo := uint32(d.Uvarint())
  1909  
  1910  		if d.Err() != nil {
  1911  			return 0, errors.Wrap(d.Err(), "read series label offsets")
  1912  		}
  1913  		if lbls == nil {
  1914  			continue
  1915  		}
  1916  		if group && len(by) == 0 {
  1917  			// If we're grouping by all labels, we don't need to decode them.
  1918  			continue
  1919  		}
  1920  		ln, err := dec.LookupSymbol(lno)
  1921  		if err != nil {
  1922  			return 0, errors.Wrap(err, "lookup label name")
  1923  		}
  1924  		if group {
  1925  			var found bool
  1926  			for _, b := range by {
  1927  				if b == ln {
  1928  					found = true
  1929  					break
  1930  				}
  1931  			}
  1932  			if !found {
  1933  				continue
  1934  			}
  1935  		}
  1936  		lv, err := dec.LookupSymbol(lvo)
  1937  		if err != nil {
  1938  			return 0, errors.Wrap(err, "lookup label value")
  1939  		}
  1940  
  1941  		*lbls = append(*lbls, &typesv1.LabelPair{Name: ln, Value: lv})
  1942  	}
  1943  
  1944  	// Read the chunks meta data.
  1945  	k = d.Uvarint()
  1946  
  1947  	if k == 0 {
  1948  		return 0, d.Err()
  1949  	}
  1950  
  1951  	t0 := d.Varint64()
  1952  	maxt := int64(d.Uvarint64()) + t0
  1953  	kb := uint32(d.Uvarint())
  1954  	entries := uint32(d.Uvarint64())
  1955  	checksum := d.Be32()
  1956  
  1957  	*chks = append(*chks, index.ChunkMeta{
  1958  		Checksum:    checksum,
  1959  		MinTime:     t0,
  1960  		MaxTime:     maxt,
  1961  		KB:          kb,
  1962  		SeriesIndex: entries,
  1963  	})
  1964  	t0 = maxt
  1965  
  1966  	for i := 1; i < k; i++ {
  1967  		// Decode the diff against previous chunk as varint
  1968  		// instead of uvarint because chunks may overlap
  1969  		mint := d.Varint64() + t0
  1970  		maxt := int64(d.Uvarint64()) + mint
  1971  		kb := uint32(d.Uvarint())
  1972  		entries := uint32(d.Uvarint64())
  1973  		checksum := d.Be32()
  1974  		t0 = maxt
  1975  
  1976  		if d.Err() != nil {
  1977  			return 0, errors.Wrapf(d.Err(), "read meta for chunk %d", i)
  1978  		}
  1979  
  1980  		*chks = append(*chks, index.ChunkMeta{
  1981  			Checksum:    checksum,
  1982  			MinTime:     mint,
  1983  			MaxTime:     maxt,
  1984  			KB:          kb,
  1985  			SeriesIndex: entries,
  1986  		})
  1987  	}
  1988  	return fprint, d.Err()
  1989  }
  1990  
  1991  func yoloString(b []byte) string {
  1992  	return *((*string)(unsafe.Pointer(&b)))
  1993  }
  1994  
  1995  // todo better name, nicer api
  1996  func (w *Writer) ReleaseIndexBuffer() *BufferWriter {
  1997  	res := w.f
  1998  	w.f = nil
  1999  	return res
  2000  }
  2001  
  2002  // todo better name, nicer api
  2003  func (w *Writer) ReleaseIndex() []byte {
  2004  	bw := w.ReleaseIndexBuffer()
  2005  	defer PutBufferWriterToPool(bw)
  2006  	buffer, _, _ := bw.Buffer()
  2007  	res := make([]byte, len(buffer))
  2008  	copy(res, buffer)
  2009  	return res
  2010  }