github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/pkg/storage/stores/tsdb/index/index.go (about)

     1  // Copyright 2017 The Prometheus Authors
     2  // Licensed under the Apache License, Version 2.0 (the "License");
     3  // you may not use this file except in compliance with the License.
     4  // You may obtain a copy of the License at
     5  //
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package index
    15  
    16  import (
    17  	"bufio"
    18  	"bytes"
    19  	"context"
    20  	"encoding/binary"
    21  	"fmt"
    22  	"hash"
    23  	"hash/crc32"
    24  	"io"
    25  	"io/ioutil"
    26  	"math"
    27  	"os"
    28  	"path/filepath"
    29  	"sort"
    30  	"unsafe"
    31  
    32  	"github.com/pkg/errors"
    33  	"github.com/prometheus/common/model"
    34  	"github.com/prometheus/prometheus/model/labels"
    35  	"github.com/prometheus/prometheus/storage"
    36  	tsdb_enc "github.com/prometheus/prometheus/tsdb/encoding"
    37  	"github.com/prometheus/prometheus/tsdb/fileutil"
    38  
    39  	"github.com/grafana/loki/pkg/util/encoding"
    40  )
    41  
const (
	// MagicIndex is the 4-byte magic number written at the head of an index file.
	MagicIndex = 0xBAAAD700
	// HeaderLen is the number of bytes reserved at the start of the index for
	// the header. writeMeta emits exactly that much: the 4-byte magic number
	// followed by a 1-byte format version.
	HeaderLen = 5

	// FormatV1 identifies version 1 of the index format.
	FormatV1 = 1
	// FormatV2 identifies version 2 of the index format. It is the version
	// written by Writer.
	FormatV2 = 2

	// IndexFilename is the name "index"; presumably the conventional file
	// name for an index on disk (not used in the visible part of this file —
	// confirm against callers).
	IndexFilename = "index"

	// fingerprintInterval is the sampling interval of the fingerprint offsets
	// table: AddSeries records an entry for every series whose reference is a
	// multiple of 1024.
	fingerprintInterval = 1 << 10
)
    58  
    59  type indexWriterStage uint8
    60  
    61  const (
    62  	idxStageNone indexWriterStage = iota
    63  	idxStageSymbols
    64  	idxStageSeries
    65  	idxStageDone
    66  )
    67  
    68  func (s indexWriterStage) String() string {
    69  	switch s {
    70  	case idxStageNone:
    71  		return "none"
    72  	case idxStageSymbols:
    73  		return "symbols"
    74  	case idxStageSeries:
    75  		return "series"
    76  	case idxStageDone:
    77  		return "done"
    78  	}
    79  	return "<unknown>"
    80  }
    81  
    82  // The table gets initialized with sync.Once but may still cause a race
    83  // with any other use of the crc32 package anywhere. Thus we initialize it
    84  // before.
    85  var castagnoliTable *crc32.Table
    86  
    87  func init() {
    88  	castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
    89  }
    90  
    91  // newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the
    92  // polynomial may be easily changed in one location at a later time, if necessary.
    93  func newCRC32() hash.Hash32 {
    94  	return crc32.New(castagnoliTable)
    95  }
    96  
// symbolCacheEntry is the per-label-name value of Writer.symbolCache. It lets
// AddSeries skip symbol-table lookups for a label name, and for its value when
// the value is the same as in the previous series that used the name.
type symbolCacheEntry struct {
	// index is the symbol reference of the label name.
	index uint32
	// lastValue is the most recently looked-up value for this label name.
	lastValue string
	// lastValueIndex is the symbol reference of lastValue.
	lastValueIndex uint32
}
   102  
// Writer implements the IndexWriter interface for the standard
// serialization format.
//
// A Writer moves through the stages none -> symbols -> series -> done (see
// ensureStage); symbols must be added before series.
type Writer struct {
	// ctx is checked for cancellation at every stage check.
	ctx context.Context

	// For the main index file.
	f *FileWriter

	// Temporary file for postings.
	fP *FileWriter
	// Temporary file for posting offsets table.
	fPO *FileWriter
	// Number of entries in the posting offsets table; read back when that
	// table is scanned and copied into the main index.
	cntPO uint64

	// toc collects the start offset of each section as it is written.
	toc TOC
	// stage is the stage the writer is currently in.
	stage         indexWriterStage
	postingsStart uint64 // Due to padding, can differ from TOC entry.

	// Reusable memory.
	buf1 encoding.Encbuf
	buf2 encoding.Encbuf

	// numSymbols counts the symbols added so far.
	numSymbols int
	// symbols is the symbol table loaded back from symbolFile once the
	// symbols section has been finished; it is nil before that.
	symbols *Symbols
	// symbolFile is the mmap of the main index file opened by finishSymbols.
	symbolFile *fileutil.MmapFile
	// lastSymbol is the previously added symbol, used to enforce ordering.
	lastSymbol string
	// symbolCache maps a label name to its cached symbol references.
	symbolCache map[string]symbolCacheEntry

	labelIndexes []labelIndexHashEntry // Label index offsets.
	labelNames   map[string]uint64     // Label names, and their usage.
	// Keeps track of the fingerprint/offset for every n series
	fingerprintOffsets FingerprintOffsets

	// Hold last series to validate that clients insert new series in order.
	lastSeries     labels.Labels
	lastSeriesHash uint64
	lastRef        storage.SeriesRef

	// crc32 is a reusable hash used for all section checksums.
	crc32 hash.Hash

	// Version is not assigned anywhere in the visible part of this file.
	// NOTE(review): confirm who sets and reads it.
	Version int
}
   145  
// TOC represents the index Table Of Contents, which states where each section
// of the index starts. Every uint64 field is a byte offset into the index
// file. The TOC is serialized at the very end of the file (see writeTOC and
// NewTOCFromByteSlice).
type TOC struct {
	// Symbols is the offset of the symbol table.
	Symbols uint64
	// Series is the offset of the series section.
	Series uint64
	// LabelIndices is the offset of the label index section.
	LabelIndices uint64
	// LabelIndicesTable is the offset of the label indices offset table.
	LabelIndicesTable uint64
	// Postings is the offset of the postings section.
	Postings uint64
	// PostingsTable is the offset of the postings offset table.
	PostingsTable uint64
	// FingerprintOffsets is the offset of the fingerprint offsets table.
	FingerprintOffsets uint64
	// Metadata holds the time bounds stored in the TOC and, when the TOC is
	// parsed from bytes, the TOC checksum.
	Metadata Metadata
}
   157  
   158  // Metadata is TSDB-level metadata
   159  type Metadata struct {
   160  	From, Through int64
   161  	Checksum      uint32
   162  }
   163  
   164  func (m *Metadata) EnsureBounds(from, through int64) {
   165  	if m.From == 0 || from < m.From {
   166  		m.From = from
   167  	}
   168  
   169  	if m.Through == 0 || through > m.Through {
   170  		m.Through = through
   171  	}
   172  
   173  }
   174  
   175  // NewTOCFromByteSlice return parsed TOC from given index byte slice.
   176  func NewTOCFromByteSlice(bs ByteSlice) (*TOC, error) {
   177  	if bs.Len() < indexTOCLen {
   178  		return nil, tsdb_enc.ErrInvalidSize
   179  	}
   180  	b := bs.Range(bs.Len()-indexTOCLen, bs.Len())
   181  
   182  	expCRC := binary.BigEndian.Uint32(b[len(b)-4:])
   183  	d := encoding.DecWrap(tsdb_enc.Decbuf{B: b[:len(b)-4]})
   184  	if d.Crc32(castagnoliTable) != expCRC {
   185  		return nil, errors.Wrap(tsdb_enc.ErrInvalidChecksum, "read TOC")
   186  	}
   187  
   188  	if err := d.Err(); err != nil {
   189  		return nil, err
   190  	}
   191  
   192  	return &TOC{
   193  		Symbols:            d.Be64(),
   194  		Series:             d.Be64(),
   195  		LabelIndices:       d.Be64(),
   196  		LabelIndicesTable:  d.Be64(),
   197  		Postings:           d.Be64(),
   198  		PostingsTable:      d.Be64(),
   199  		FingerprintOffsets: d.Be64(),
   200  		Metadata: Metadata{
   201  			From:     d.Be64int64(),
   202  			Through:  d.Be64int64(),
   203  			Checksum: expCRC,
   204  		},
   205  	}, nil
   206  }
   207  
   208  // NewWriter returns a new Writer to the given filename. It serializes data in format version 2.
   209  func NewWriter(ctx context.Context, fn string) (*Writer, error) {
   210  	dir := filepath.Dir(fn)
   211  
   212  	df, err := fileutil.OpenDir(dir)
   213  	if err != nil {
   214  		return nil, err
   215  	}
   216  	defer df.Close() // Close for platform windows.
   217  
   218  	if err := os.RemoveAll(fn); err != nil {
   219  		return nil, errors.Wrap(err, "remove any existing index at path")
   220  	}
   221  
   222  	// Main index file we are building.
   223  	f, err := NewFileWriter(fn)
   224  	if err != nil {
   225  		return nil, err
   226  	}
   227  	// Temporary file for postings.
   228  	fP, err := NewFileWriter(fn + "_tmp_p")
   229  	if err != nil {
   230  		return nil, err
   231  	}
   232  	// Temporary file for posting offset table.
   233  	fPO, err := NewFileWriter(fn + "_tmp_po")
   234  	if err != nil {
   235  		return nil, err
   236  	}
   237  	if err := df.Sync(); err != nil {
   238  		return nil, errors.Wrap(err, "sync dir")
   239  	}
   240  
   241  	iw := &Writer{
   242  		ctx:   ctx,
   243  		f:     f,
   244  		fP:    fP,
   245  		fPO:   fPO,
   246  		stage: idxStageNone,
   247  
   248  		// Reusable memory.
   249  		buf1: encoding.EncWrap(tsdb_enc.Encbuf{B: make([]byte, 0, 1<<22)}),
   250  		buf2: encoding.EncWrap(tsdb_enc.Encbuf{B: make([]byte, 0, 1<<22)}),
   251  
   252  		symbolCache: make(map[string]symbolCacheEntry, 1<<8),
   253  		labelNames:  make(map[string]uint64, 1<<8),
   254  		crc32:       newCRC32(),
   255  	}
   256  	if err := iw.writeMeta(); err != nil {
   257  		return nil, err
   258  	}
   259  	return iw, nil
   260  }
   261  
// write appends the given buffers, in order, to the main index file.
func (w *Writer) write(bufs ...[]byte) error {
	return w.f.Write(bufs...)
}
   265  
// writeAt writes buf at absolute offset pos of the main index file. It is used
// to back-fill placeholders (lengths, checksums) that were reserved earlier.
func (w *Writer) writeAt(buf []byte, pos uint64) error {
	return w.f.WriteAt(buf, pos)
}
   269  
// addPadding zero-pads the main index file until its write position is a
// multiple of size.
func (w *Writer) addPadding(size int) error {
	return w.f.AddPadding(size)
}
   273  
// FileWriter is a buffered, append-oriented writer over a file that tracks how
// many bytes have been appended through Write.
type FileWriter struct {
	// f is the underlying file, opened read-write.
	f *os.File
	// fbuf buffers appends to f; it must be flushed before f is accessed
	// directly.
	fbuf *bufio.Writer
	// pos is the number of bytes appended so far via Write. Writes made
	// through WriteAt do not change it.
	pos uint64
	// name is the path the file was opened with.
	name string
}
   280  
   281  func NewFileWriter(name string) (*FileWriter, error) {
   282  	f, err := os.OpenFile(name, os.O_CREATE|os.O_RDWR, 0o666)
   283  	if err != nil {
   284  		return nil, err
   285  	}
   286  	return &FileWriter{
   287  		f:    f,
   288  		fbuf: bufio.NewWriterSize(f, 1<<22),
   289  		pos:  0,
   290  		name: name,
   291  	}, nil
   292  }
   293  
// Pos returns the number of bytes appended through Write so far, i.e. the
// offset at which the next Write will land.
func (fw *FileWriter) Pos() uint64 {
	return fw.pos
}
   297  
   298  func (fw *FileWriter) Write(bufs ...[]byte) error {
   299  	for _, b := range bufs {
   300  		n, err := fw.fbuf.Write(b)
   301  		fw.pos += uint64(n)
   302  		if err != nil {
   303  			return err
   304  		}
   305  		// For now the index file must not grow beyond 64GiB. Some of the fixed-sized
   306  		// offset references in v1 are only 4 bytes large.
   307  		// Once we move to compressed/varint representations in those areas, this limitation
   308  		// can be lifted.
   309  		if fw.pos > 16*math.MaxUint32 {
   310  			return errors.Errorf("%q exceeding max size of 64GiB", fw.name)
   311  		}
   312  	}
   313  	return nil
   314  }
   315  
// Flush writes any buffered data through to the underlying file.
func (fw *FileWriter) Flush() error {
	return fw.fbuf.Flush()
}
   319  
   320  func (fw *FileWriter) WriteAt(buf []byte, pos uint64) error {
   321  	if err := fw.Flush(); err != nil {
   322  		return err
   323  	}
   324  	_, err := fw.f.WriteAt(buf, int64(pos))
   325  	return err
   326  }
   327  
   328  // AddPadding adds zero byte padding until the file size is a multiple size.
   329  func (fw *FileWriter) AddPadding(size int) error {
   330  	p := fw.pos % uint64(size)
   331  	if p == 0 {
   332  		return nil
   333  	}
   334  	p = uint64(size) - p
   335  
   336  	if err := fw.Write(make([]byte, p)); err != nil {
   337  		return errors.Wrap(err, "add padding")
   338  	}
   339  	return nil
   340  }
   341  
   342  func (fw *FileWriter) Close() error {
   343  	if err := fw.Flush(); err != nil {
   344  		return err
   345  	}
   346  	if err := fw.f.Sync(); err != nil {
   347  		return err
   348  	}
   349  	return fw.f.Close()
   350  }
   351  
// Remove deletes the file from disk by the name it was opened with. It does
// not close the file.
func (fw *FileWriter) Remove() error {
	return os.Remove(fw.name)
}
   355  
// ensureStage handles transitions between write stages and ensures that IndexWriter
// methods are called in an order valid for the implementation.
//
// It first fails with the context's error if the Writer's context is done.
// If the Writer is already in stage s nothing happens. If s is more than one
// stage ahead, the intermediate stages are entered first, recursively. If the
// Writer is already past s an error is returned. Otherwise the work attached
// to entering s is performed and the current stage becomes s.
//
// NOTE(review): s is unsigned, so s-1 wraps around for idxStageNone; no caller
// in the visible part of this file passes idxStageNone.
func (w *Writer) ensureStage(s indexWriterStage) error {
	// Non-blocking cancellation check.
	select {
	case <-w.ctx.Done():
		return w.ctx.Err()
	default:
	}

	if w.stage == s {
		return nil
	}
	if w.stage < s-1 {
		// A stage has been skipped.
		if err := w.ensureStage(s - 1); err != nil {
			return err
		}
	}
	if w.stage > s {
		return errors.Errorf("invalid stage %q, currently at %q", s, w.stage)
	}

	// Mark start of sections in table of contents.
	switch s {
	case idxStageSymbols:
		// Record where the symbol table starts and reserve its header.
		w.toc.Symbols = w.f.pos
		if err := w.startSymbols(); err != nil {
			return err
		}
	case idxStageSeries:
		// The symbol table must be complete (and loaded back) before any
		// series can reference symbols.
		if err := w.finishSymbols(); err != nil {
			return err
		}
		w.toc.Series = w.f.pos

	case idxStageDone:
		// Entering the final stage writes every remaining section, recording
		// each section's start offset in the TOC just before writing it, and
		// finishes with the TOC itself.
		w.toc.LabelIndices = w.f.pos
		// LabelIndices generation depends on the posting offset
		// table produced at this stage.
		if err := w.writePostingsToTmpFiles(); err != nil {
			return err
		}
		if err := w.writeLabelIndices(); err != nil {
			return err
		}

		w.toc.Postings = w.f.pos
		if err := w.writePostings(); err != nil {
			return err
		}

		w.toc.LabelIndicesTable = w.f.pos
		if err := w.writeLabelIndexesOffsetTable(); err != nil {
			return err
		}

		w.toc.PostingsTable = w.f.pos
		if err := w.writePostingsOffsetTable(); err != nil {
			return err
		}

		w.toc.FingerprintOffsets = w.f.pos
		if err := w.writeFingerprintOffsetsTable(); err != nil {
			return err
		}

		if err := w.writeTOC(); err != nil {
			return err
		}
	}

	w.stage = s
	return nil
}
   430  
// writeMeta writes the index header to the main index file: the 4-byte
// big-endian magic number followed by one byte holding the format version
// (FormatV2).
func (w *Writer) writeMeta() error {
	w.buf1.Reset()
	w.buf1.PutBE32(MagicIndex)
	w.buf1.PutByte(FormatV2)

	return w.write(w.buf1.Get())
}
   438  
// AddSeries adds the series one at a time along with its chunks.
// Requires a specific fingerprint to be passed in the case where the "desired"
// fingerprint differs from what labels.Hash() produces. For example,
// multitenant TSDBs embed a tenant label, but the actual series has no such
// label and so the derived fingerprint differs.
//
// Series must be added in non-decreasing order of fingerprint (ties broken by
// label set order) and of ref; otherwise an error is returned. Every label
// name and value must already be present in the symbol table.
//
// Each series entry starts on a 16-byte boundary and is encoded as:
// uvarint length, the 8-byte fingerprint, the label count, a name/value
// symbol reference pair per label, the chunk count, the chunk metas, and a
// trailing CRC32. The time bounds in the TOC metadata are widened to cover
// every chunk.
func (w *Writer) AddSeries(ref storage.SeriesRef, lset labels.Labels, fp model.Fingerprint, chunks ...ChunkMeta) error {
	if err := w.ensureStage(idxStageSeries); err != nil {
		return err
	}

	// Put the supplied fingerprint instead of the calculated hash.
	// This allows us to have a synthetic label (__loki_tenant__) in
	// the pre-compacted TSDBs which map to fingerprints (and chunks)
	// without this label in storage.
	labelHash := uint64(fp)

	lastHash := w.lastSeriesHash
	// Ensure series are sorted by the priorities: [`hash(labels)`, `labels`]
	// A non-empty lastSeries is used as the signal that a series was added before.
	if (labelHash < lastHash && len(w.lastSeries) > 0) || labelHash == lastHash && labels.Compare(lset, w.lastSeries) < 0 {
		return errors.Errorf("out-of-order series added with label set %q", lset)
	}

	if ref < w.lastRef && len(w.lastSeries) != 0 {
		return errors.Errorf("series with reference greater than %d already added", ref)
	}
	// We add padding to 16 bytes to increase the addressable space we get through 4 byte
	// series references.
	if err := w.addPadding(16); err != nil {
		return errors.Errorf("failed to write padding bytes: %v", err)
	}

	if w.f.pos%16 != 0 {
		return errors.Errorf("series write not 16-byte aligned at %d", w.f.pos)
	}

	// buf2 accumulates the series payload; buf1 later receives its length prefix.
	w.buf2.Reset()
	w.buf2.PutBE64(labelHash)
	w.buf2.PutUvarint(len(lset))

	for _, l := range lset {
		var err error
		// Resolve the name's symbol reference, from the cache when possible.
		cacheEntry, ok := w.symbolCache[l.Name]
		nameIndex := cacheEntry.index
		if !ok {
			nameIndex, err = w.symbols.ReverseLookup(l.Name)
			if err != nil {
				return errors.Errorf("symbol entry for %q does not exist, %v", l.Name, err)
			}
		}
		// Count usages per label name.
		w.labelNames[l.Name]++
		w.buf2.PutUvarint32(nameIndex)

		// The cached value reference is only valid if the value is unchanged
		// since the last series that used this label name.
		valueIndex := cacheEntry.lastValueIndex
		if !ok || cacheEntry.lastValue != l.Value {
			valueIndex, err = w.symbols.ReverseLookup(l.Value)
			if err != nil {
				return errors.Errorf("symbol entry for %q does not exist, %v", l.Value, err)
			}
			w.symbolCache[l.Name] = symbolCacheEntry{
				index:          nameIndex,
				lastValue:      l.Value,
				lastValueIndex: valueIndex,
			}
		}
		w.buf2.PutUvarint32(valueIndex)
	}

	w.buf2.PutUvarint(len(chunks))

	if len(chunks) > 0 {
		// The first chunk stores its absolute MinTime; later chunks store
		// MinTime relative to the previous chunk's MaxTime.
		c := chunks[0]
		w.toc.Metadata.EnsureBounds(c.MinTime, c.MaxTime)

		w.buf2.PutVarint64(c.MinTime)
		w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime))
		w.buf2.PutUvarint32(c.KB)
		w.buf2.PutUvarint32(c.Entries)
		w.buf2.PutBE32(c.Checksum)
		t0 := c.MaxTime

		for _, c := range chunks[1:] {
			w.toc.Metadata.EnsureBounds(c.MinTime, c.MaxTime)
			// Encode the diff against previous chunk as varint
			// instead of uvarint because chunks may overlap
			w.buf2.PutVarint64(c.MinTime - t0)
			w.buf2.PutUvarint64(uint64(c.MaxTime - c.MinTime))
			w.buf2.PutUvarint32(c.KB)
			w.buf2.PutUvarint32(c.Entries)
			t0 = c.MaxTime

			w.buf2.PutBE32(c.Checksum)
		}
	}

	// The length prefix covers the payload only; it is taken before the
	// checksum is appended to buf2.
	w.buf1.Reset()
	w.buf1.PutUvarint(w.buf2.Len())

	w.buf2.PutHash(w.crc32)

	// Remember this series for the ordering checks of the next call. The
	// labels are copied into the reused lastSeries slice.
	w.lastSeries = append(w.lastSeries[:0], lset...)
	w.lastSeriesHash = labelHash
	w.lastRef = ref

	if ref%fingerprintInterval == 0 {
		// series references are the 16-byte aligned offsets
		// Do NOT ask me how long I debugged this particular bit >:O
		// The position is sampled before the series is written, so it is the
		// offset at which this series starts.
		sRef := w.f.pos / 16
		w.fingerprintOffsets = append(w.fingerprintOffsets, [2]uint64{sRef, labelHash})
	}

	if err := w.write(w.buf1.Get(), w.buf2.Get()); err != nil {
		return errors.Wrap(err, "write series data")
	}

	return nil
}
   555  
// startSymbols reserves the 8-byte header of the symbol table by writing a
// placeholder; finishSymbols later overwrites it with the real values.
func (w *Writer) startSymbols() error {
	// We are at w.toc.Symbols.
	// Leave 4 bytes of space for the length, and another 4 for the number of symbols
	// which will both be calculated later.
	return w.write([]byte("alenblen"))
}
   562  
   563  func (w *Writer) AddSymbol(sym string) error {
   564  	if err := w.ensureStage(idxStageSymbols); err != nil {
   565  		return err
   566  	}
   567  	if w.numSymbols != 0 && sym <= w.lastSymbol {
   568  		return errors.Errorf("symbol %q out-of-order", sym)
   569  	}
   570  	w.lastSymbol = sym
   571  	w.numSymbols++
   572  	w.buf1.Reset()
   573  	w.buf1.PutUvarintStr(sym)
   574  	return w.write(w.buf1.Get())
   575  }
   576  
   577  func (w *Writer) finishSymbols() error {
   578  	symbolTableSize := w.f.pos - w.toc.Symbols - 4
   579  	// The symbol table's <len> part is 4 bytes. So the total symbol table size must be less than or equal to 2^32-1
   580  	if symbolTableSize > math.MaxUint32 {
   581  		return errors.Errorf("symbol table size exceeds 4 bytes: %d", symbolTableSize)
   582  	}
   583  
   584  	// Write out the length and symbol count.
   585  	w.buf1.Reset()
   586  	w.buf1.PutBE32int(int(symbolTableSize))
   587  	w.buf1.PutBE32int(w.numSymbols)
   588  	if err := w.writeAt(w.buf1.Get(), w.toc.Symbols); err != nil {
   589  		return err
   590  	}
   591  
   592  	hashPos := w.f.pos
   593  	// Leave space for the hash. We can only calculate it
   594  	// now that the number of symbols is known, so mmap and do it from there.
   595  	if err := w.write([]byte("hash")); err != nil {
   596  		return err
   597  	}
   598  	if err := w.f.Flush(); err != nil {
   599  		return err
   600  	}
   601  
   602  	sf, err := fileutil.OpenMmapFile(w.f.name)
   603  	if err != nil {
   604  		return err
   605  	}
   606  	w.symbolFile = sf
   607  	hash := crc32.Checksum(w.symbolFile.Bytes()[w.toc.Symbols+4:hashPos], castagnoliTable)
   608  	w.buf1.Reset()
   609  	w.buf1.PutBE32(hash)
   610  	if err := w.writeAt(w.buf1.Get(), hashPos); err != nil {
   611  		return err
   612  	}
   613  
   614  	// Load in the symbol table efficiently for the rest of the index writing.
   615  	w.symbols, err = NewSymbols(RealByteSlice(w.symbolFile.Bytes()), FormatV2, int(w.toc.Symbols))
   616  	if err != nil {
   617  		return errors.Wrap(err, "read symbols")
   618  	}
   619  	return nil
   620  }
   621  
   622  func (w *Writer) writeLabelIndices() error {
   623  	if err := w.fPO.Flush(); err != nil {
   624  		return err
   625  	}
   626  
   627  	// Find all the label values in the tmp posting offset table.
   628  	f, err := fileutil.OpenMmapFile(w.fPO.name)
   629  	if err != nil {
   630  		return err
   631  	}
   632  	defer f.Close()
   633  
   634  	d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.fPO.pos)))
   635  	cnt := w.cntPO
   636  	current := []byte{}
   637  	values := []uint32{}
   638  	for d.Err() == nil && cnt > 0 {
   639  		cnt--
   640  		d.Uvarint()                           // Keycount.
   641  		name := d.UvarintBytes()              // Label name.
   642  		value := yoloString(d.UvarintBytes()) // Label value.
   643  		d.Uvarint64()                         // Offset.
   644  		if len(name) == 0 {
   645  			continue // All index is ignored.
   646  		}
   647  
   648  		if !bytes.Equal(name, current) && len(values) > 0 {
   649  			// We've reached a new label name.
   650  			if err := w.writeLabelIndex(string(current), values); err != nil {
   651  				return err
   652  			}
   653  			values = values[:0]
   654  		}
   655  		current = name
   656  		sid, err := w.symbols.ReverseLookup(value)
   657  		if err != nil {
   658  			return err
   659  		}
   660  		values = append(values, sid)
   661  	}
   662  	if d.Err() != nil {
   663  		return d.Err()
   664  	}
   665  
   666  	// Handle the last label.
   667  	if len(values) > 0 {
   668  		if err := w.writeLabelIndex(string(current), values); err != nil {
   669  			return err
   670  		}
   671  	}
   672  	return nil
   673  }
   674  
// writeLabelIndex writes the label index for a single label name to the main
// index file and records its start offset in w.labelIndexes.
//
// values holds the symbol references of the label's values. The section is
// laid out as: 4-byte length, 4-byte number of names (always 1), 4-byte number
// of values, one 4-byte big-endian reference per value, and a CRC32 over
// everything after the length field.
func (w *Writer) writeLabelIndex(name string, values []uint32) error {
	// Align beginning to 4 bytes for more efficient index list scans.
	if err := w.addPadding(4); err != nil {
		return err
	}

	w.labelIndexes = append(w.labelIndexes, labelIndexHashEntry{
		keys:   []string{name},
		offset: w.f.pos,
	})

	startPos := w.f.pos
	// Leave 4 bytes of space for the length, which will be calculated later.
	if err := w.write([]byte("alen")); err != nil {
		return err
	}
	// The checksum starts after the length placeholder.
	w.crc32.Reset()

	w.buf1.Reset()
	w.buf1.PutBE32int(1) // Number of names.
	w.buf1.PutBE32int(len(values))
	w.buf1.WriteToHash(w.crc32)
	if err := w.write(w.buf1.Get()); err != nil {
		return err
	}

	// Each value is written and folded into the running checksum separately,
	// reusing buf1.
	for _, v := range values {
		w.buf1.Reset()
		w.buf1.PutBE32(v)
		w.buf1.WriteToHash(w.crc32)
		if err := w.write(w.buf1.Get()); err != nil {
			return err
		}
	}

	// Write out the length.
	w.buf1.Reset()
	// The length excludes the 4-byte length field itself and the checksum
	// that has not been written yet.
	l := w.f.pos - startPos - 4
	if l > math.MaxUint32 {
		return errors.Errorf("label index size exceeds 4 bytes: %d", l)
	}
	w.buf1.PutBE32int(int(l))
	if err := w.writeAt(w.buf1.Get(), startPos); err != nil {
		return err
	}

	// Finally append the checksum.
	w.buf1.Reset()
	w.buf1.PutHashSum(w.crc32)
	return w.write(w.buf1.Get())
}
   725  
// writeLabelIndexesOffsetTable writes the label indices offset table.
//
// The table is laid out as: 4-byte length, 4-byte entry count, then per entry
// the uvarint number of keys, each key as a uvarint-prefixed string and the
// uvarint offset of the label index, followed by a CRC32 over everything after
// the length field.
func (w *Writer) writeLabelIndexesOffsetTable() error {
	startPos := w.f.pos
	// Leave 4 bytes of space for the length, which will be calculated later.
	if err := w.write([]byte("alen")); err != nil {
		return err
	}
	// The checksum starts after the length placeholder.
	w.crc32.Reset()

	w.buf1.Reset()
	w.buf1.PutBE32int(len(w.labelIndexes))
	w.buf1.WriteToHash(w.crc32)
	if err := w.write(w.buf1.Get()); err != nil {
		return err
	}

	// Entries are emitted in the order the label indexes were written.
	for _, e := range w.labelIndexes {
		w.buf1.Reset()
		w.buf1.PutUvarint(len(e.keys))
		for _, k := range e.keys {
			w.buf1.PutUvarintStr(k)
		}
		w.buf1.PutUvarint64(e.offset)
		w.buf1.WriteToHash(w.crc32)
		if err := w.write(w.buf1.Get()); err != nil {
			return err
		}
	}
	// Write out the length.
	w.buf1.Reset()
	// The length excludes the 4-byte length field itself and the checksum
	// that has not been written yet.
	l := w.f.pos - startPos - 4
	if l > math.MaxUint32 {
		return errors.Errorf("label indexes offset table size exceeds 4 bytes: %d", l)
	}
	w.buf1.PutBE32int(int(l))
	if err := w.writeAt(w.buf1.Get(), startPos); err != nil {
		return err
	}

	// Finally append the checksum.
	w.buf1.Reset()
	w.buf1.PutHashSum(w.crc32)
	return w.write(w.buf1.Get())
}
   769  
// writePostingsOffsetTable writes the postings offset table.
//
// The entries are copied from the temporary posting offsets file, with every
// offset shifted by w.postingsStart so that it becomes an offset into the main
// index file. Once copied, the temporary file is closed and removed and w.fPO
// is set to nil. The table is laid out as: 4-byte length, 4-byte entry count,
// the entries, and a CRC32 over everything after the length field.
func (w *Writer) writePostingsOffsetTable() error {
	// Ensure everything is in the temporary file.
	if err := w.fPO.Flush(); err != nil {
		return err
	}

	startPos := w.f.pos
	// Leave 4 bytes of space for the length, which will be calculated later.
	if err := w.write([]byte("alen")); err != nil {
		return err
	}

	// Copy over the tmp posting offset table, however we need to
	// adjust the offsets.
	adjustment := w.postingsStart

	w.buf1.Reset()
	w.crc32.Reset()
	w.buf1.PutBE32int(int(w.cntPO)) // Count.
	w.buf1.WriteToHash(w.crc32)
	if err := w.write(w.buf1.Get()); err != nil {
		return err
	}

	f, err := fileutil.OpenMmapFile(w.fPO.name)
	if err != nil {
		return err
	}
	// Safety net for early returns; f is set to nil below once it has been
	// closed explicitly, so it is never closed twice.
	defer func() {
		if f != nil {
			f.Close()
		}
	}()
	// Only the first w.fPO.pos bytes of the mapping are decoded.
	d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.fPO.pos)))
	cnt := w.cntPO
	for d.Err() == nil && cnt > 0 {
		w.buf1.Reset()
		w.buf1.PutUvarint(d.Uvarint())                     // Keycount.
		w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label name.
		w.buf1.PutUvarintStr(yoloString(d.UvarintBytes())) // Label value.
		w.buf1.PutUvarint64(d.Uvarint64() + adjustment)    // Offset.
		w.buf1.WriteToHash(w.crc32)
		if err := w.write(w.buf1.Get()); err != nil {
			return err
		}
		cnt--
	}
	if d.Err() != nil {
		return d.Err()
	}

	// Cleanup temporary file.
	// The mapping is released before the file itself is closed and removed.
	if err := f.Close(); err != nil {
		return err
	}
	f = nil
	if err := w.fPO.Close(); err != nil {
		return err
	}
	if err := w.fPO.Remove(); err != nil {
		return err
	}
	w.fPO = nil

	// Write out the length.
	w.buf1.Reset()
	// The length excludes the 4-byte length field itself and the checksum
	// that has not been written yet.
	l := w.f.pos - startPos - 4
	if l > math.MaxUint32 {
		return errors.Errorf("postings offset table size exceeds 4 bytes: %d", l)
	}
	w.buf1.PutBE32int(int(l))
	if err := w.writeAt(w.buf1.Get(), startPos); err != nil {
		return err
	}

	// Finally write the hash.
	w.buf1.Reset()
	w.buf1.PutHashSum(w.crc32)
	return w.write(w.buf1.Get())
}
   851  
   852  func (w *Writer) writeFingerprintOffsetsTable() error {
   853  	w.buf1.Reset()
   854  	w.buf2.Reset()
   855  
   856  	w.buf1.PutBE32int(len(w.fingerprintOffsets)) // Count.
   857  	// build offsets
   858  	for _, x := range w.fingerprintOffsets {
   859  		w.buf1.PutBE64(x[0]) // series offset
   860  		w.buf1.PutBE64(x[1]) // hash
   861  	}
   862  
   863  	// write length
   864  	ln := w.buf1.Len()
   865  	// TODO(owen-d): can remove the uint32 cast in the future
   866  	// Had to uint32 wrap these for arm32 builds, which we'll remove in the future.
   867  	if uint32(ln) > uint32(math.MaxUint32) {
   868  		return errors.Errorf("fingerprint offset size exceeds 4 bytes: %d", ln)
   869  	}
   870  
   871  	w.buf2.PutBE32int(ln)
   872  	if err := w.write(w.buf2.Get()); err != nil {
   873  		return err
   874  	}
   875  
   876  	// write offsets+checksum
   877  	w.buf1.PutHash(w.crc32)
   878  	if err := w.write(w.buf1.Get()); err != nil {
   879  		return errors.Wrap(err, "failure writing fingerprint offsets")
   880  	}
   881  	return nil
   882  }
   883  
   884  const indexTOCLen = 8*9 + crc32.Size
   885  
   886  func (w *Writer) writeTOC() error {
   887  	w.buf1.Reset()
   888  
   889  	w.buf1.PutBE64(w.toc.Symbols)
   890  	w.buf1.PutBE64(w.toc.Series)
   891  	w.buf1.PutBE64(w.toc.LabelIndices)
   892  	w.buf1.PutBE64(w.toc.LabelIndicesTable)
   893  	w.buf1.PutBE64(w.toc.Postings)
   894  	w.buf1.PutBE64(w.toc.PostingsTable)
   895  	w.buf1.PutBE64(w.toc.FingerprintOffsets)
   896  
   897  	// metadata
   898  	w.buf1.PutBE64int64(w.toc.Metadata.From)
   899  	w.buf1.PutBE64int64(w.toc.Metadata.Through)
   900  
   901  	w.buf1.PutHash(w.crc32)
   902  
   903  	return w.write(w.buf1.Get())
   904  }
   905  
   906  func (w *Writer) writePostingsToTmpFiles() error {
   907  	names := make([]string, 0, len(w.labelNames))
   908  	for n := range w.labelNames {
   909  		names = append(names, n)
   910  	}
   911  	sort.Strings(names)
   912  
   913  	if err := w.f.Flush(); err != nil {
   914  		return err
   915  	}
   916  	f, err := fileutil.OpenMmapFile(w.f.name)
   917  	if err != nil {
   918  		return err
   919  	}
   920  	defer f.Close()
   921  
   922  	// Write out the special all posting.
   923  	offsets := []uint32{}
   924  	d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.toc.LabelIndices)))
   925  	d.Skip(int(w.toc.Series))
   926  	for d.Len() > 0 {
   927  		d.ConsumePadding()
   928  		startPos := w.toc.LabelIndices - uint64(d.Len())
   929  		if startPos%16 != 0 {
   930  			return errors.Errorf("series not 16-byte aligned at %d", startPos)
   931  		}
   932  		offsets = append(offsets, uint32(startPos/16))
   933  		// Skip to next series.
   934  		x := d.Uvarint()
   935  		d.Skip(x + crc32.Size)
   936  		if err := d.Err(); err != nil {
   937  			return err
   938  		}
   939  	}
   940  	if err := w.writePosting("", "", offsets); err != nil {
   941  		return err
   942  	}
   943  	maxPostings := uint64(len(offsets)) // No label name can have more postings than this.
   944  
   945  	for len(names) > 0 {
   946  		batchNames := []string{}
   947  		var c uint64
   948  		// Try to bunch up label names into one loop, but avoid
   949  		// using more memory than a single label name can.
   950  		for len(names) > 0 {
   951  			if w.labelNames[names[0]]+c > maxPostings {
   952  				break
   953  			}
   954  			batchNames = append(batchNames, names[0])
   955  			c += w.labelNames[names[0]]
   956  			names = names[1:]
   957  		}
   958  
   959  		nameSymbols := map[uint32]string{}
   960  		for _, name := range batchNames {
   961  			sid, err := w.symbols.ReverseLookup(name)
   962  			if err != nil {
   963  				return err
   964  			}
   965  			nameSymbols[sid] = name
   966  		}
   967  		// Label name -> label value -> positions.
   968  		postings := map[uint32]map[uint32][]uint32{}
   969  
   970  		d := encoding.DecWrap(tsdb_enc.NewDecbufRaw(RealByteSlice(f.Bytes()), int(w.toc.LabelIndices)))
   971  		d.Skip(int(w.toc.Series))
   972  		for d.Len() > 0 {
   973  			d.ConsumePadding()
   974  			startPos := w.toc.LabelIndices - uint64(d.Len())
   975  			l := d.Uvarint() // Length of this series in bytes.
   976  			startLen := d.Len()
   977  
   978  			_ = d.Be64() // skip fingerprint
   979  			// See if label names we want are in the series.
   980  			numLabels := d.Uvarint()
   981  			for i := 0; i < numLabels; i++ {
   982  				lno := uint32(d.Uvarint())
   983  				lvo := uint32(d.Uvarint())
   984  
   985  				if _, ok := nameSymbols[lno]; ok {
   986  					if _, ok := postings[lno]; !ok {
   987  						postings[lno] = map[uint32][]uint32{}
   988  					}
   989  					postings[lno][lvo] = append(postings[lno][lvo], uint32(startPos/16))
   990  				}
   991  			}
   992  			// Skip to next series.
   993  			d.Skip(l - (startLen - d.Len()) + crc32.Size)
   994  			if err := d.Err(); err != nil {
   995  				return err
   996  			}
   997  		}
   998  
   999  		for _, name := range batchNames {
  1000  			// Write out postings for this label name.
  1001  			sid, err := w.symbols.ReverseLookup(name)
  1002  			if err != nil {
  1003  				return err
  1004  			}
  1005  			values := make([]uint32, 0, len(postings[sid]))
  1006  			for v := range postings[sid] {
  1007  				values = append(values, v)
  1008  			}
  1009  			// Symbol numbers are in order, so the strings will also be in order.
  1010  			sort.Sort(uint32slice(values))
  1011  			for _, v := range values {
  1012  				value, err := w.symbols.Lookup(v)
  1013  				if err != nil {
  1014  					return err
  1015  				}
  1016  				if err := w.writePosting(name, value, postings[sid][v]); err != nil {
  1017  					return err
  1018  				}
  1019  			}
  1020  		}
  1021  		select {
  1022  		case <-w.ctx.Done():
  1023  			return w.ctx.Err()
  1024  		default:
  1025  		}
  1026  
  1027  	}
  1028  	return nil
  1029  }
  1030  
  1031  func (w *Writer) writePosting(name, value string, offs []uint32) error {
  1032  	// Align beginning to 4 bytes for more efficient postings list scans.
  1033  	if err := w.fP.AddPadding(4); err != nil {
  1034  		return err
  1035  	}
  1036  
  1037  	// Write out postings offset table to temporary file as we go.
  1038  	w.buf1.Reset()
  1039  	w.buf1.PutUvarint(2)
  1040  	w.buf1.PutUvarintStr(name)
  1041  	w.buf1.PutUvarintStr(value)
  1042  	w.buf1.PutUvarint64(w.fP.pos) // This is relative to the postings tmp file, not the final index file.
  1043  	if err := w.fPO.Write(w.buf1.Get()); err != nil {
  1044  		return err
  1045  	}
  1046  	w.cntPO++
  1047  
  1048  	w.buf1.Reset()
  1049  	w.buf1.PutBE32int(len(offs))
  1050  
  1051  	for _, off := range offs {
  1052  		if off > (1<<32)-1 {
  1053  			return errors.Errorf("series offset %d exceeds 4 bytes", off)
  1054  		}
  1055  		w.buf1.PutBE32(off)
  1056  	}
  1057  
  1058  	w.buf2.Reset()
  1059  	l := w.buf1.Len()
  1060  	// We convert to uint to make code compile on 32-bit systems, as math.MaxUint32 doesn't fit into int there.
  1061  	if uint(l) > math.MaxUint32 {
  1062  		return errors.Errorf("posting size exceeds 4 bytes: %d", l)
  1063  	}
  1064  	w.buf2.PutBE32int(l)
  1065  	w.buf1.PutHash(w.crc32)
  1066  	return w.fP.Write(w.buf2.Get(), w.buf1.Get())
  1067  }
  1068  
  1069  func (w *Writer) writePostings() error {
  1070  	// There's padding in the tmp file, make sure it actually works.
  1071  	if err := w.f.AddPadding(4); err != nil {
  1072  		return err
  1073  	}
  1074  	w.postingsStart = w.f.pos
  1075  
  1076  	// Copy temporary file into main index.
  1077  	if err := w.fP.Flush(); err != nil {
  1078  		return err
  1079  	}
  1080  	if _, err := w.fP.f.Seek(0, 0); err != nil {
  1081  		return err
  1082  	}
  1083  	// Don't need to calculate a checksum, so can copy directly.
  1084  	n, err := io.CopyBuffer(w.f.fbuf, w.fP.f, make([]byte, 1<<20))
  1085  	if err != nil {
  1086  		return err
  1087  	}
  1088  	if uint64(n) != w.fP.pos {
  1089  		return errors.Errorf("wrote %d bytes to posting temporary file, but only read back %d", w.fP.pos, n)
  1090  	}
  1091  	w.f.pos += uint64(n)
  1092  
  1093  	if err := w.fP.Close(); err != nil {
  1094  		return err
  1095  	}
  1096  	if err := w.fP.Remove(); err != nil {
  1097  		return err
  1098  	}
  1099  	w.fP = nil
  1100  	return nil
  1101  }
  1102  
  1103  type uint32slice []uint32
  1104  
  1105  func (s uint32slice) Len() int           { return len(s) }
  1106  func (s uint32slice) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
  1107  func (s uint32slice) Less(i, j int) bool { return s[i] < s[j] }
  1108  
// labelIndexHashEntry pairs a list of keys with an offset.
// NOTE(review): no use of this type is visible in this part of the file;
// presumably it is referenced elsewhere or kept from upstream — confirm.
type labelIndexHashEntry struct {
	keys   []string
	offset uint64
}
  1113  
  1114  func (w *Writer) Close() error {
  1115  	// Even if this fails, we need to close all the files.
  1116  	ensureErr := w.ensureStage(idxStageDone)
  1117  
  1118  	if w.symbolFile != nil {
  1119  		if err := w.symbolFile.Close(); err != nil {
  1120  			return err
  1121  		}
  1122  	}
  1123  	if w.fP != nil {
  1124  		if err := w.fP.Close(); err != nil {
  1125  			return err
  1126  		}
  1127  	}
  1128  	if w.fPO != nil {
  1129  		if err := w.fPO.Close(); err != nil {
  1130  			return err
  1131  		}
  1132  	}
  1133  	if err := w.f.Close(); err != nil {
  1134  		return err
  1135  	}
  1136  	return ensureErr
  1137  }
  1138  
// StringIter iterates over a sorted list of strings.
//
// Callers advance with Next, read the current value with At, and check Err
// to learn whether the iterator recorded an error.
type StringIter interface {
	// Next advances the iterator and returns true if another value was found.
	Next() bool

	// At returns the value at the current iterator position.
	At() string

	// Err returns the last error of the iterator.
	Err() error
}
  1150  
// Reader provides read access to an index held in a ByteSlice.
type Reader struct {
	// The raw index data and its parsed table of contents.
	b   ByteSlice
	toc *TOC

	// Close that releases the underlying resources of the byte slice.
	c io.Closer

	// Map of LabelName to a list of some LabelValues's position in the offset table.
	// The first and last values for each name are always present.
	postings map[string][]postingOffset
	// For the v1 format, labelname -> labelvalue -> offset.
	postingsV1 map[string]map[string]uint64

	symbols     *Symbols
	nameSymbols map[uint32]string // Cache of the label name symbol lookups,
	// as there are not many and they are half of all lookups.

	// Table loaded from the position given by the TOC's FingerprintOffsets
	// entry; it is handed to sharded postings.
	fingerprintOffsets FingerprintOffsets

	// Decoder whose symbol lookups go through this reader.
	dec *Decoder

	// Index format version read from the file header.
	version int
}
  1174  
// postingOffset remembers where the postings offset table entry for one
// label value is located.
type postingOffset struct {
	// value is the label value of the entry.
	value string
	// off is the entry's byte position inside the postings offset table, as
	// reported by ReadOffsetTable; readers skip this many bytes from the start
	// of the table to reach the entry.
	off int
}
  1179  
  1180  // ByteSlice abstracts a byte slice.
  1181  type ByteSlice interface {
  1182  	Len() int
  1183  	Range(start, end int) []byte
  1184  }
  1185  
  1186  type RealByteSlice []byte
  1187  
  1188  func (b RealByteSlice) Len() int {
  1189  	return len(b)
  1190  }
  1191  
  1192  func (b RealByteSlice) Range(start, end int) []byte {
  1193  	return b[start:end]
  1194  }
  1195  
  1196  func (b RealByteSlice) Sub(start, end int) ByteSlice {
  1197  	return b[start:end]
  1198  }
  1199  
  1200  // NewReader returns a new index reader on the given byte slice. It automatically
  1201  // handles different format versions.
  1202  func NewReader(b ByteSlice) (*Reader, error) {
  1203  	return newReader(b, ioutil.NopCloser(nil))
  1204  }
  1205  
  1206  type nopCloser struct{}
  1207  
  1208  func (nopCloser) Close() error { return nil }
  1209  
  1210  // NewFileReader returns a new index reader against the given index file.
  1211  func NewFileReader(path string) (*Reader, error) {
  1212  	b, err := ioutil.ReadFile(path)
  1213  	if err != nil {
  1214  		return nil, err
  1215  	}
  1216  	r, err := newReader(RealByteSlice(b), nopCloser{})
  1217  	if err != nil {
  1218  		return r, err
  1219  	}
  1220  
  1221  	return r, nil
  1222  }
  1223  
// newReader builds a Reader over b and remembers c as the closer to invoke
// from Reader.Close.
//
// It validates the header (length, magic number, version), then loads the
// TOC, the symbol table, the postings offset table and the fingerprint
// offsets table. Any failure is returned as an error together with a nil
// reader.
func newReader(b ByteSlice, c io.Closer) (*Reader, error) {
	r := &Reader{
		b:        b,
		c:        c,
		postings: map[string][]postingOffset{},
	}

	// Verify header.
	if r.b.Len() < HeaderLen {
		return nil, errors.Wrap(tsdb_enc.ErrInvalidSize, "index header")
	}
	if m := binary.BigEndian.Uint32(r.b.Range(0, 4)); m != MagicIndex {
		return nil, errors.Errorf("invalid magic number %x", m)
	}
	// The byte following the magic number carries the format version.
	r.version = int(r.b.Range(4, 5)[0])

	if r.version != FormatV1 && r.version != FormatV2 {
		return nil, errors.Errorf("unknown index file version %d", r.version)
	}

	var err error
	r.toc, err = NewTOCFromByteSlice(b)
	if err != nil {
		return nil, errors.Wrap(err, "read TOC")
	}

	r.symbols, err = NewSymbols(r.b, r.version, int(r.toc.Symbols))
	if err != nil {
		return nil, errors.Wrap(err, "read symbols")
	}

	if r.version == FormatV1 {
		// Earlier V1 formats don't have a sorted postings offset table, so
		// load the whole offset table into memory.
		r.postingsV1 = map[string]map[string]uint64{}
		if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, off uint64, _ int) error {
			if len(key) != 2 {
				return errors.Errorf("unexpected key length for posting table %d", len(key))
			}
			if _, ok := r.postingsV1[key[0]]; !ok {
				r.postingsV1[key[0]] = map[string]uint64{}
				r.postings[key[0]] = nil // Used to get a list of labelnames in places.
			}
			r.postingsV1[key[0]][key[1]] = off
			return nil
		}); err != nil {
			return nil, errors.Wrap(err, "read postings table")
		}
	} else {
		// lastKey/lastOff track the most recent entry that was NOT sampled, so
		// that it can still be recorded if it turns out to be the last value
		// of its label name.
		var lastKey []string
		lastOff := 0
		valueCount := 0
		// For the postings offset table we keep every label name but only every nth
		// label value (plus the first and last one), to save memory.
		if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, _ uint64, off int) error {
			if len(key) != 2 {
				return errors.Errorf("unexpected key length for posting table %d", len(key))
			}
			if _, ok := r.postings[key[0]]; !ok {
				// Next label name.
				r.postings[key[0]] = []postingOffset{}
				if lastKey != nil {
					// Always include last value for each label name.
					r.postings[lastKey[0]] = append(r.postings[lastKey[0]], postingOffset{value: lastKey[1], off: lastOff})
				}
				lastKey = nil
				valueCount = 0
			}
			if valueCount%symbolFactor == 0 {
				// Sampled entry: record it; nothing is pending afterwards.
				r.postings[key[0]] = append(r.postings[key[0]], postingOffset{value: key[1], off: off})
				lastKey = nil
			} else {
				lastKey = key
				lastOff = off
			}
			valueCount++
			return nil
		}); err != nil {
			return nil, errors.Wrap(err, "read postings table")
		}
		// Record the pending last value of the final label name, if any.
		if lastKey != nil {
			r.postings[lastKey[0]] = append(r.postings[lastKey[0]], postingOffset{value: lastKey[1], off: lastOff})
		}
		// Trim any extra space in the slices.
		for k, v := range r.postings {
			l := make([]postingOffset, len(v))
			copy(l, v)
			r.postings[k] = l
		}
	}

	// Cache the symbol reference of every label name; the empty name is
	// skipped.
	r.nameSymbols = make(map[uint32]string, len(r.postings))
	for k := range r.postings {
		if k == "" {
			continue
		}
		off, err := r.symbols.ReverseLookup(k)
		if err != nil {
			return nil, errors.Wrap(err, "reverse symbol lookup")
		}
		r.nameSymbols[off] = k
	}

	r.fingerprintOffsets, err = readFingerprintOffsetsTable(r.b, r.toc.FingerprintOffsets)
	if err != nil {
		return nil, errors.Wrap(err, "loading fingerprint offsets")
	}

	r.dec = &Decoder{LookupSymbol: r.lookupSymbol}

	return r, nil
}
  1336  
// Version returns the file format version of the underlying index, as read
// from the index header when the reader was created.
func (r *Reader) Version() int {
	return r.version
}
  1341  
// Range marks a byte range by its Start and End positions.
type Range struct {
	Start, End int64
}
  1346  
  1347  // PostingsRanges returns a new map of byte range in the underlying index file
  1348  // for all postings lists.
  1349  func (r *Reader) PostingsRanges() (map[labels.Label]Range, error) {
  1350  	m := map[labels.Label]Range{}
  1351  	if err := ReadOffsetTable(r.b, r.toc.PostingsTable, func(key []string, off uint64, _ int) error {
  1352  		if len(key) != 2 {
  1353  			return errors.Errorf("unexpected key length for posting table %d", len(key))
  1354  		}
  1355  		d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(off), castagnoliTable))
  1356  		if d.Err() != nil {
  1357  			return d.Err()
  1358  		}
  1359  		m[labels.Label{Name: key[0], Value: key[1]}] = Range{
  1360  			Start: int64(off) + 4,
  1361  			End:   int64(off) + 4 + int64(d.Len()),
  1362  		}
  1363  		return nil
  1364  	}); err != nil {
  1365  		return nil, errors.Wrap(err, "read postings table")
  1366  	}
  1367  	return m, nil
  1368  }
  1369  
// Symbols gives access to the symbol table stored in an index.
type Symbols struct {
	// bs is the data containing the symbol table, version the index format
	// version, and off the position of the symbol table inside bs.
	bs      ByteSlice
	version int
	off     int

	// offsets holds the byte position of every symbolFactor-th symbol,
	// starting with the first one.
	offsets []int
	// seen is the number of symbols decoded while building offsets.
	seen int
}
  1378  
// symbolFactor is the sampling interval for in-memory lookup tables: only
// every symbolFactor-th symbol position (and every symbolFactor-th label
// value of the postings offset table) is kept in memory.
const symbolFactor = 32
  1380  
  1381  // NewSymbols returns a Symbols object for symbol lookups.
  1382  func NewSymbols(bs ByteSlice, version, off int) (*Symbols, error) {
  1383  	s := &Symbols{
  1384  		bs:      bs,
  1385  		version: version,
  1386  		off:     off,
  1387  	}
  1388  	d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, off, castagnoliTable))
  1389  	var (
  1390  		origLen = d.Len()
  1391  		cnt     = d.Be32int()
  1392  		basePos = off + 4
  1393  	)
  1394  	s.offsets = make([]int, 0, 1+cnt/symbolFactor)
  1395  	for d.Err() == nil && s.seen < cnt {
  1396  		if s.seen%symbolFactor == 0 {
  1397  			s.offsets = append(s.offsets, basePos+origLen-d.Len())
  1398  		}
  1399  		d.UvarintBytes() // The symbol.
  1400  		s.seen++
  1401  	}
  1402  	if d.Err() != nil {
  1403  		return nil, d.Err()
  1404  	}
  1405  	return s, nil
  1406  }
  1407  
  1408  func (s Symbols) Lookup(o uint32) (string, error) {
  1409  	d := encoding.DecWrap(tsdb_enc.Decbuf{
  1410  		B: s.bs.Range(0, s.bs.Len()),
  1411  	})
  1412  
  1413  	if s.version == FormatV2 {
  1414  		if int(o) >= s.seen {
  1415  			return "", errors.Errorf("unknown symbol offset %d", o)
  1416  		}
  1417  		d.Skip(s.offsets[int(o/symbolFactor)])
  1418  		// Walk until we find the one we want.
  1419  		for i := o - (o / symbolFactor * symbolFactor); i > 0; i-- {
  1420  			d.UvarintBytes()
  1421  		}
  1422  	} else {
  1423  		d.Skip(int(o))
  1424  	}
  1425  	sym := d.UvarintStr()
  1426  	if d.Err() != nil {
  1427  		return "", d.Err()
  1428  	}
  1429  	return sym, nil
  1430  }
  1431  
  1432  func (s Symbols) ReverseLookup(sym string) (uint32, error) {
  1433  	if len(s.offsets) == 0 {
  1434  		return 0, errors.Errorf("unknown symbol %q - no symbols", sym)
  1435  	}
  1436  	i := sort.Search(len(s.offsets), func(i int) bool {
  1437  		// Any decoding errors here will be lost, however
  1438  		// we already read through all of this at startup.
  1439  		d := encoding.DecWrap(tsdb_enc.Decbuf{
  1440  			B: s.bs.Range(0, s.bs.Len()),
  1441  		})
  1442  		d.Skip(s.offsets[i])
  1443  		return yoloString(d.UvarintBytes()) > sym
  1444  	})
  1445  	d := encoding.DecWrap(tsdb_enc.Decbuf{
  1446  		B: s.bs.Range(0, s.bs.Len()),
  1447  	})
  1448  	if i > 0 {
  1449  		i--
  1450  	}
  1451  	d.Skip(s.offsets[i])
  1452  	res := i * symbolFactor
  1453  	var lastLen int
  1454  	var lastSymbol string
  1455  	for d.Err() == nil && res <= s.seen {
  1456  		lastLen = d.Len()
  1457  		lastSymbol = yoloString(d.UvarintBytes())
  1458  		if lastSymbol >= sym {
  1459  			break
  1460  		}
  1461  		res++
  1462  	}
  1463  	if d.Err() != nil {
  1464  		return 0, d.Err()
  1465  	}
  1466  	if lastSymbol != sym {
  1467  		return 0, errors.Errorf("unknown symbol %q", sym)
  1468  	}
  1469  	if s.version == FormatV2 {
  1470  		return uint32(res), nil
  1471  	}
  1472  	return uint32(s.bs.Len() - lastLen), nil
  1473  }
  1474  
// Size returns the number of sampled symbol offsets multiplied by 8.
// NOTE(review): presumably an estimate of the memory used by the offsets
// slice assuming 8-byte ints — confirm.
func (s Symbols) Size() int {
	return len(s.offsets) * 8
}
  1478  
  1479  func (s Symbols) Iter() StringIter {
  1480  	d := encoding.DecWrap(tsdb_enc.NewDecbufAt(s.bs, s.off, castagnoliTable))
  1481  	cnt := d.Be32int()
  1482  	return &symbolsIter{
  1483  		d:   d,
  1484  		cnt: cnt,
  1485  	}
  1486  }
  1487  
  1488  // symbolsIter implements StringIter.
  1489  type symbolsIter struct {
  1490  	d   encoding.Decbuf
  1491  	cnt int
  1492  	cur string
  1493  	err error
  1494  }
  1495  
  1496  func (s *symbolsIter) Next() bool {
  1497  	if s.cnt == 0 || s.err != nil {
  1498  		return false
  1499  	}
  1500  	s.cur = yoloString(s.d.UvarintBytes())
  1501  	s.cnt--
  1502  	if s.d.Err() != nil {
  1503  		s.err = s.d.Err()
  1504  		return false
  1505  	}
  1506  	return true
  1507  }
  1508  
  1509  func (s symbolsIter) At() string { return s.cur }
  1510  func (s symbolsIter) Err() error { return s.err }
  1511  
  1512  // ReadOffsetTable reads an offset table and at the given position calls f for each
  1513  // found entry. If f returns an error it stops decoding and returns the received error.
  1514  func ReadOffsetTable(bs ByteSlice, off uint64, f func([]string, uint64, int) error) error {
  1515  	d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, int(off), castagnoliTable))
  1516  	startLen := d.Len()
  1517  	cnt := d.Be32()
  1518  
  1519  	for d.Err() == nil && d.Len() > 0 && cnt > 0 {
  1520  		offsetPos := startLen - d.Len()
  1521  		keyCount := d.Uvarint()
  1522  		// The Postings offset table takes only 2 keys per entry (name and value of label),
  1523  		// and the LabelIndices offset table takes only 1 key per entry (a label name).
  1524  		// Hence setting the size to max of both, i.e. 2.
  1525  		keys := make([]string, 0, 2)
  1526  
  1527  		for i := 0; i < keyCount; i++ {
  1528  			keys = append(keys, d.UvarintStr())
  1529  		}
  1530  		o := d.Uvarint64()
  1531  		if d.Err() != nil {
  1532  			break
  1533  		}
  1534  		if err := f(keys, o, offsetPos); err != nil {
  1535  			return err
  1536  		}
  1537  		cnt--
  1538  	}
  1539  	return d.Err()
  1540  }
  1541  
  1542  func readFingerprintOffsetsTable(bs ByteSlice, off uint64) (FingerprintOffsets, error) {
  1543  	d := encoding.DecWrap(tsdb_enc.NewDecbufAt(bs, int(off), castagnoliTable))
  1544  	cnt := d.Be32()
  1545  	res := make(FingerprintOffsets, 0, int(cnt))
  1546  
  1547  	for d.Err() == nil && d.Len() > 0 && cnt > 0 {
  1548  		res = append(res, [2]uint64{d.Be64(), d.Be64()})
  1549  		cnt--
  1550  	}
  1551  
  1552  	return res, d.Err()
  1553  
  1554  }
  1555  
// Close the reader and its underlying resources by invoking the closer the
// reader was created with.
func (r *Reader) Close() error {
	return r.c.Close()
}
  1560  
  1561  func (r *Reader) lookupSymbol(o uint32) (string, error) {
  1562  	if s, ok := r.nameSymbols[o]; ok {
  1563  		return s, nil
  1564  	}
  1565  	return r.symbols.Lookup(o)
  1566  }
  1567  
// Bounds returns the From and Through values stored in the index's TOC
// metadata, in that order.
// NOTE(review): presumably the time range covered by the index — confirm
// units against the TOC metadata definition.
func (r *Reader) Bounds() (int64, int64) {
	return r.toc.Metadata.From, r.toc.Metadata.Through
}
  1571  
// Checksum returns the checksum stored in the index's TOC metadata.
func (r *Reader) Checksum() uint32 {
	return r.toc.Metadata.Checksum
}
  1575  
// Symbols returns an iterator over the symbols that exist within the index.
// Each call creates a new iterator from the reader's symbol table.
func (r *Reader) Symbols() StringIter {
	return r.symbols.Iter()
}
  1580  
// SymbolTableSize returns the symbol table size in bytes, as reported by the
// Size method of the reader's symbol table.
func (r *Reader) SymbolTableSize() uint64 {
	return uint64(r.symbols.Size())
}
  1585  
  1586  // SortedLabelValues returns value tuples that exist for the given label name.
  1587  // It is not safe to use the return value beyond the lifetime of the byte slice
  1588  // passed into the Reader.
  1589  func (r *Reader) SortedLabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
  1590  	values, err := r.LabelValues(name, matchers...)
  1591  	if err == nil && r.version == FormatV1 {
  1592  		sort.Strings(values)
  1593  	}
  1594  	return values, err
  1595  }
  1596  
  1597  // LabelValues returns value tuples that exist for the given label name.
  1598  // It is not safe to use the return value beyond the lifetime of the byte slice
  1599  // passed into the Reader.
  1600  // TODO(replay): Support filtering by matchers
  1601  func (r *Reader) LabelValues(name string, matchers ...*labels.Matcher) ([]string, error) {
  1602  	if len(matchers) > 0 {
  1603  		return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers)
  1604  	}
  1605  
  1606  	if r.version == FormatV1 {
  1607  		e, ok := r.postingsV1[name]
  1608  		if !ok {
  1609  			return nil, nil
  1610  		}
  1611  		values := make([]string, 0, len(e))
  1612  		for k := range e {
  1613  			values = append(values, k)
  1614  		}
  1615  		return values, nil
  1616  
  1617  	}
  1618  	e, ok := r.postings[name]
  1619  	if !ok {
  1620  		return nil, nil
  1621  	}
  1622  	if len(e) == 0 {
  1623  		return nil, nil
  1624  	}
  1625  	values := make([]string, 0, len(e)*symbolFactor)
  1626  
  1627  	d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil))
  1628  	d.Skip(e[0].off)
  1629  	lastVal := e[len(e)-1].value
  1630  
  1631  	skip := 0
  1632  	for d.Err() == nil {
  1633  		if skip == 0 {
  1634  			// These are always the same number of bytes,
  1635  			// and it's faster to skip than parse.
  1636  			skip = d.Len()
  1637  			d.Uvarint()      // Keycount.
  1638  			d.UvarintBytes() // Label name.
  1639  			skip -= d.Len()
  1640  		} else {
  1641  			d.Skip(skip)
  1642  		}
  1643  		s := yoloString(d.UvarintBytes()) // Label value.
  1644  		values = append(values, s)
  1645  		if s == lastVal {
  1646  			break
  1647  		}
  1648  		d.Uvarint64() // Offset.
  1649  	}
  1650  	if d.Err() != nil {
  1651  		return nil, errors.Wrap(d.Err(), "get postings offset entry")
  1652  	}
  1653  	return values, nil
  1654  }
  1655  
  1656  // LabelNamesFor returns all the label names for the series referred to by IDs.
  1657  // The names returned are sorted.
  1658  func (r *Reader) LabelNamesFor(ids ...storage.SeriesRef) ([]string, error) {
  1659  	// Gather offsetsMap the name offsetsMap in the symbol table first
  1660  	offsetsMap := make(map[uint32]struct{})
  1661  	for _, id := range ids {
  1662  		offset := id
  1663  		// In version 2 series IDs are no longer exact references but series are 16-byte padded
  1664  		// and the ID is the multiple of 16 of the actual position.
  1665  		if r.version == FormatV2 {
  1666  			offset = id * 16
  1667  		}
  1668  
  1669  		d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable))
  1670  		buf := d.Get()
  1671  		if d.Err() != nil {
  1672  			return nil, errors.Wrap(d.Err(), "get buffer for series")
  1673  		}
  1674  
  1675  		offsets, err := r.dec.LabelNamesOffsetsFor(buf)
  1676  		if err != nil {
  1677  			return nil, errors.Wrap(err, "get label name offsets")
  1678  		}
  1679  		for _, off := range offsets {
  1680  			offsetsMap[off] = struct{}{}
  1681  		}
  1682  	}
  1683  
  1684  	// Lookup the unique symbols.
  1685  	names := make([]string, 0, len(offsetsMap))
  1686  	for off := range offsetsMap {
  1687  		name, err := r.lookupSymbol(off)
  1688  		if err != nil {
  1689  			return nil, errors.Wrap(err, "lookup symbol in LabelNamesFor")
  1690  		}
  1691  		names = append(names, name)
  1692  	}
  1693  
  1694  	sort.Strings(names)
  1695  
  1696  	return names, nil
  1697  }
  1698  
  1699  // LabelValueFor returns label value for the given label name in the series referred to by ID.
  1700  func (r *Reader) LabelValueFor(id storage.SeriesRef, label string) (string, error) {
  1701  	offset := id
  1702  	// In version 2 series IDs are no longer exact references but series are 16-byte padded
  1703  	// and the ID is the multiple of 16 of the actual position.
  1704  	if r.version == FormatV2 {
  1705  		offset = id * 16
  1706  	}
  1707  	d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable))
  1708  	buf := d.Get()
  1709  	if d.Err() != nil {
  1710  		return "", errors.Wrap(d.Err(), "label values for")
  1711  	}
  1712  
  1713  	value, err := r.dec.LabelValueFor(buf, label)
  1714  	if err != nil {
  1715  		return "", storage.ErrNotFound
  1716  	}
  1717  
  1718  	if value == "" {
  1719  		return "", storage.ErrNotFound
  1720  	}
  1721  
  1722  	return value, nil
  1723  }
  1724  
  1725  // Series reads the series with the given ID and writes its labels and chunks into lbls and chks.
  1726  func (r *Reader) Series(id storage.SeriesRef, lbls *labels.Labels, chks *[]ChunkMeta) (uint64, error) {
  1727  	offset := id
  1728  	// In version 2 series IDs are no longer exact references but series are 16-byte padded
  1729  	// and the ID is the multiple of 16 of the actual position.
  1730  	if r.version == FormatV2 {
  1731  		offset = id * 16
  1732  	}
  1733  	d := encoding.DecWrap(tsdb_enc.NewDecbufUvarintAt(r.b, int(offset), castagnoliTable))
  1734  	if d.Err() != nil {
  1735  		return 0, d.Err()
  1736  	}
  1737  
  1738  	fprint, err := r.dec.Series(d.Get(), lbls, chks)
  1739  	if err != nil {
  1740  		return 0, errors.Wrap(err, "read series")
  1741  	}
  1742  	return fprint, nil
  1743  }
  1744  
// Postings returns the merged postings of the given values of label name.
//
// Values that are not present in the index contribute nothing; an unknown
// label name or an empty values list yields empty postings. When shard is
// non-nil the merged result is wrapped in sharded postings.
// NOTE(review): the V1 branch returns before shard is consulted, so shard
// has no effect for V1 indexes — confirm this is intended.
// NOTE(review): the V2 walk only ever moves forward through values, so it
// presumably expects values to be sorted — confirm with callers.
func (r *Reader) Postings(name string, shard *ShardAnnotation, values ...string) (Postings, error) {
	if r.version == FormatV1 {
		// V1 keeps the offset of every value in memory; look each one up.
		e, ok := r.postingsV1[name]
		if !ok {
			return EmptyPostings(), nil
		}
		res := make([]Postings, 0, len(values))
		for _, v := range values {
			postingsOff, ok := e[v]
			if !ok {
				continue
			}
			// Read from the postings table.
			d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(postingsOff), castagnoliTable))
			_, p, err := r.dec.Postings(d.Get())
			if err != nil {
				return nil, errors.Wrap(err, "decode postings")
			}
			res = append(res, p)
		}
		return Merge(res...), nil
	}

	e, ok := r.postings[name]
	if !ok {
		return EmptyPostings(), nil
	}

	if len(values) == 0 {
		return EmptyPostings(), nil
	}

	res := make([]Postings, 0, len(values))
	// skip caches the byte length of the "key count + label name" prefix that
	// starts every entry of this label name; 0 means not measured yet.
	skip := 0
	valueIndex := 0
	for valueIndex < len(values) && values[valueIndex] < e[0].value {
		// Discard values before the start.
		valueIndex++
	}
	for valueIndex < len(values) {
		value := values[valueIndex]

		// Find the sampled entry from which to start scanning for value.
		i := sort.Search(len(e), func(i int) bool { return e[i].value >= value })
		if i == len(e) {
			// We're past the end.
			break
		}
		if i > 0 && e[i].value != value {
			// Need to look from previous entry.
			i--
		}
		// Don't Crc32 the entire postings offset table, this is very slow
		// so hope any issues were caught at startup.
		d := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(r.toc.PostingsTable), nil))
		d.Skip(e[i].off)

		// Iterate on the offset table.
		var postingsOff uint64 // The offset into the postings table.
		for d.Err() == nil {
			if skip == 0 {
				// These are always the same number of bytes,
				// and it's faster to skip than parse.
				skip = d.Len()
				d.Uvarint()      // Keycount.
				d.UvarintBytes() // Label name.
				skip -= d.Len()
			} else {
				d.Skip(skip)
			}
			v := d.UvarintBytes()       // Label value.
			postingsOff = d.Uvarint64() // Offset.
			// Consume every requested value that is not greater than the
			// entry just read; only an exact match contributes postings.
			for string(v) >= value {
				if string(v) == value {
					// Read from the postings table.
					d2 := encoding.DecWrap(tsdb_enc.NewDecbufAt(r.b, int(postingsOff), castagnoliTable))
					_, p, err := r.dec.Postings(d2.Get())
					if err != nil {
						return nil, errors.Wrap(err, "decode postings")
					}
					res = append(res, p)
				}
				valueIndex++
				if valueIndex == len(values) {
					break
				}
				value = values[valueIndex]
			}
			if i+1 == len(e) || value >= e[i+1].value || valueIndex == len(values) {
				// Need to go to a later postings offset entry, if there is one.
				break
			}
		}
		if d.Err() != nil {
			return nil, errors.Wrap(d.Err(), "get postings offset entry")
		}
	}

	merged := Merge(res...)
	if shard != nil {
		return NewShardedPostings(merged, *shard, r.fingerprintOffsets), nil
	}

	return merged, nil
}
  1849  
// Size returns the size of an index file, i.e. the length in bytes of the
// data the reader operates on.
func (r *Reader) Size() int64 {
	return int64(r.b.Len())
}
  1854  
  1855  // LabelNames returns all the unique label names present in the index.
  1856  // TODO(twilkie) implement support for matchers
  1857  func (r *Reader) LabelNames(matchers ...*labels.Matcher) ([]string, error) {
  1858  	if len(matchers) > 0 {
  1859  		return nil, errors.Errorf("matchers parameter is not implemented: %+v", matchers)
  1860  	}
  1861  
  1862  	labelNames := make([]string, 0, len(r.postings))
  1863  	for name := range r.postings {
  1864  		if name == allPostingsKey.Name {
  1865  			// This is not from any metric.
  1866  			continue
  1867  		}
  1868  		labelNames = append(labelNames, name)
  1869  	}
  1870  	sort.Strings(labelNames)
  1871  	return labelNames, nil
  1872  }
  1873  
// NewStringListIter returns a StringIter for the given sorted list of strings.
// The iterator yields the elements of s in the order given.
func NewStringListIter(s []string) StringIter {
	return &stringListIter{l: s}
}
  1878  
  1879  // symbolsIter implements StringIter.
  1880  type stringListIter struct {
  1881  	l   []string
  1882  	cur string
  1883  }
  1884  
  1885  func (s *stringListIter) Next() bool {
  1886  	if len(s.l) == 0 {
  1887  		return false
  1888  	}
  1889  	s.cur = s.l[0]
  1890  	s.l = s.l[1:]
  1891  	return true
  1892  }
  1893  func (s stringListIter) At() string { return s.cur }
  1894  func (s stringListIter) Err() error { return nil }
  1895  
// Decoder provides decoding methods for the v1 and v2 index file format.
//
// It currently does not contain decoding methods for all entry types but can be extended
// by them if there's demand.
type Decoder struct {
	// LookupSymbol resolves a symbol reference found in encoded data to the
	// string it stands for.
	LookupSymbol func(uint32) (string, error)
}
  1903  
  1904  // Postings returns a postings list for b and its number of elements.
  1905  func (dec *Decoder) Postings(b []byte) (int, Postings, error) {
  1906  	d := encoding.DecWrap(tsdb_enc.Decbuf{B: b})
  1907  	n := d.Be32int()
  1908  	l := d.Get()
  1909  	if d.Err() != nil {
  1910  		return 0, nil, d.Err()
  1911  	}
  1912  	if len(l) != 4*n {
  1913  		return 0, nil, fmt.Errorf("unexpected postings length, should be %d bytes for %d postings, got %d bytes", 4*n, n, len(l))
  1914  	}
  1915  	return n, newBigEndianPostings(l), nil
  1916  }
  1917  
  1918  // LabelNamesOffsetsFor decodes the offsets of the name symbols for a given series.
  1919  // They are returned in the same order they're stored, which should be sorted lexicographically.
  1920  func (dec *Decoder) LabelNamesOffsetsFor(b []byte) ([]uint32, error) {
  1921  	d := encoding.DecWrap(tsdb_enc.Decbuf{B: b})
  1922  	_ = d.Be64() // skip fingerprint
  1923  	k := d.Uvarint()
  1924  
  1925  	offsets := make([]uint32, k)
  1926  	for i := 0; i < k; i++ {
  1927  		offsets[i] = uint32(d.Uvarint())
  1928  		_ = d.Uvarint() // skip the label value
  1929  
  1930  		if d.Err() != nil {
  1931  			return nil, errors.Wrap(d.Err(), "read series label offsets")
  1932  		}
  1933  	}
  1934  
  1935  	return offsets, d.Err()
  1936  }
  1937  
  1938  // LabelValueFor decodes a label for a given series.
  1939  func (dec *Decoder) LabelValueFor(b []byte, label string) (string, error) {
  1940  	d := encoding.DecWrap(tsdb_enc.Decbuf{B: b})
  1941  	_ = d.Be64() // skip fingerprint
  1942  	k := d.Uvarint()
  1943  
  1944  	for i := 0; i < k; i++ {
  1945  		lno := uint32(d.Uvarint())
  1946  		lvo := uint32(d.Uvarint())
  1947  
  1948  		if d.Err() != nil {
  1949  			return "", errors.Wrap(d.Err(), "read series label offsets")
  1950  		}
  1951  
  1952  		ln, err := dec.LookupSymbol(lno)
  1953  		if err != nil {
  1954  			return "", errors.Wrap(err, "lookup label name")
  1955  		}
  1956  
  1957  		if ln == label {
  1958  			lv, err := dec.LookupSymbol(lvo)
  1959  			if err != nil {
  1960  				return "", errors.Wrap(err, "lookup label value")
  1961  			}
  1962  
  1963  			return lv, nil
  1964  		}
  1965  	}
  1966  
  1967  	return "", d.Err()
  1968  }
  1969  
  1970  // Series decodes a series entry from the given byte slice into lset and chks.
  1971  func (dec *Decoder) Series(b []byte, lbls *labels.Labels, chks *[]ChunkMeta) (uint64, error) {
  1972  	*lbls = (*lbls)[:0]
  1973  	*chks = (*chks)[:0]
  1974  
  1975  	d := encoding.DecWrap(tsdb_enc.Decbuf{B: b})
  1976  
  1977  	fprint := d.Be64()
  1978  	k := d.Uvarint()
  1979  
  1980  	for i := 0; i < k; i++ {
  1981  		lno := uint32(d.Uvarint())
  1982  		lvo := uint32(d.Uvarint())
  1983  
  1984  		if d.Err() != nil {
  1985  			return 0, errors.Wrap(d.Err(), "read series label offsets")
  1986  		}
  1987  
  1988  		ln, err := dec.LookupSymbol(lno)
  1989  		if err != nil {
  1990  			return 0, errors.Wrap(err, "lookup label name")
  1991  		}
  1992  		lv, err := dec.LookupSymbol(lvo)
  1993  		if err != nil {
  1994  			return 0, errors.Wrap(err, "lookup label value")
  1995  		}
  1996  
  1997  		*lbls = append(*lbls, labels.Label{Name: ln, Value: lv})
  1998  	}
  1999  
  2000  	// Read the chunks meta data.
  2001  	k = d.Uvarint()
  2002  
  2003  	if k == 0 {
  2004  		return 0, d.Err()
  2005  	}
  2006  
  2007  	t0 := d.Varint64()
  2008  	maxt := int64(d.Uvarint64()) + t0
  2009  	kb := uint32(d.Uvarint())
  2010  	entries := uint32(d.Uvarint64())
  2011  	checksum := d.Be32()
  2012  
  2013  	*chks = append(*chks, ChunkMeta{
  2014  		Checksum: checksum,
  2015  		MinTime:  t0,
  2016  		MaxTime:  maxt,
  2017  		KB:       kb,
  2018  		Entries:  entries,
  2019  	})
  2020  	t0 = maxt
  2021  
  2022  	for i := 1; i < k; i++ {
  2023  		// Decode the diff against previous chunk as varint
  2024  		// instead of uvarint because chunks may overlap
  2025  		mint := d.Varint64() + t0
  2026  		maxt := int64(d.Uvarint64()) + mint
  2027  		kb := uint32(d.Uvarint())
  2028  		entries := uint32(d.Uvarint64())
  2029  		checksum := d.Be32()
  2030  		t0 = maxt
  2031  
  2032  		if d.Err() != nil {
  2033  			return 0, errors.Wrapf(d.Err(), "read meta for chunk %d", i)
  2034  		}
  2035  
  2036  		*chks = append(*chks, ChunkMeta{
  2037  			Checksum: checksum,
  2038  			MinTime:  mint,
  2039  			MaxTime:  maxt,
  2040  			KB:       kb,
  2041  			Entries:  entries,
  2042  		})
  2043  	}
  2044  	return fprint, d.Err()
  2045  }
  2046  
  2047  func yoloString(b []byte) string {
  2048  	return *((*string)(unsafe.Pointer(&b)))
  2049  }