github.com/thanos-io/thanos@v0.32.5/pkg/block/indexheader/binary_reader.go (about)

     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     3  
     4  package indexheader
     5  
     6  import (
     7  	"bufio"
     8  	"bytes"
     9  	"context"
    10  	"encoding/binary"
    11  	"hash"
    12  	"hash/crc32"
    13  	"io"
    14  	"math"
    15  	"os"
    16  	"path/filepath"
    17  	"sort"
    18  	"sync"
    19  	"time"
    20  	"unsafe"
    21  
    22  	"github.com/go-kit/log"
    23  	"github.com/go-kit/log/level"
    24  	"github.com/oklog/ulid"
    25  	"github.com/pkg/errors"
    26  	"github.com/prometheus/prometheus/tsdb/encoding"
    27  	"github.com/prometheus/prometheus/tsdb/fileutil"
    28  	"github.com/prometheus/prometheus/tsdb/index"
    29  	"github.com/thanos-io/objstore"
    30  
    31  	"github.com/thanos-io/thanos/pkg/block"
    32  	"github.com/thanos-io/thanos/pkg/runutil"
    33  )
    34  
const (
	// BinaryFormatV1 represents first version of index-header file.
	BinaryFormatV1 = 1

	// indexTOCLen is the on-disk size of the TSDB index TOC:
	// six 8-byte section offsets followed by a CRC32 checksum.
	indexTOCLen  = 6*8 + crc32.Size
	// binaryTOCLen is the on-disk size of the index-header TOC:
	// two 8-byte section offsets followed by a CRC32 checksum.
	binaryTOCLen = 2*8 + crc32.Size
	// headerLen represents number of bytes reserved of index header for header.
	// Layout: 4-byte magic, 1-byte index-header version, 1-byte index version,
	// 8-byte offset of the posting offset table in the original index.
	headerLen = 4 + 1 + 1 + 8

	// MagicIndex are 4 bytes at the head of an index-header file.
	MagicIndex = 0xBAAAD792

	// postingLengthFieldSize is the size of the length field that precedes
	// each postings list in the original index.
	postingLengthFieldSize = 4
)
    49  
// The table gets initialized with sync.Once but may still cause a race
// with any other use of the crc32 package anywhere. Thus we initialize it
// before.
var castagnoliTable *crc32.Table

// init eagerly builds the Castagnoli table so later newCRC32 calls never hit
// the crc32 package's lazy initialization path.
func init() {
	castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
}
    58  
// newCRC32 initializes a CRC32 hash with a preconfigured polynomial, so the
// polynomial may be easily changed in one location at a later time, if necessary.
func newCRC32() hash.Hash32 {
	return crc32.New(castagnoliTable)
}
    64  
// BinaryTOC is a table of content for index-header file.
// Both offsets are positions within the index-header file itself.
type BinaryTOC struct {
	// Symbols holds start to the same symbols section as index related to this index header.
	Symbols uint64
	// PostingsOffsetTable holds start to the same Postings Offset Table section as index related to this index header.
	PostingsOffsetTable uint64
}
    72  
// WriteBinary build index header from the pieces of index in object storage, and cached in file if necessary.
// When filename is non-empty the header is written to filename+".tmp" and then
// atomically renamed into place, and a nil byte slice is returned. When
// filename is empty the header is built purely in memory and returned.
func WriteBinary(ctx context.Context, bkt objstore.BucketReader, id ulid.ULID, filename string) ([]byte, error) {
	ir, indexVersion, err := newChunkedIndexReader(ctx, bkt, id)
	if err != nil {
		return nil, errors.Wrap(err, "new index reader")
	}
	tmpFilename := ""
	if filename != "" {
		tmpFilename = filename + ".tmp"
	}

	// Buffer for copying and encbuffers.
	// This also will control the size of file writer buffer.
	buf := make([]byte, 32*1024)
	bw, err := newBinaryWriter(id, tmpFilename, buf)
	if err != nil {
		return nil, errors.Wrap(err, "new binary index header writer")
	}
	defer runutil.CloseWithErrCapture(&err, bw, "close binary writer for %s", tmpFilename)

	// Write the fixed header fields (index version + posting offset table position).
	if err := bw.AddIndexMeta(indexVersion, ir.toc.PostingsTable); err != nil {
		return nil, errors.Wrap(err, "add index meta")
	}

	// Stream the symbols section straight from object storage.
	if err := ir.CopySymbols(bw.SymbolsWriter(), buf); err != nil {
		return nil, err
	}

	// Flush between sections so bw.writer.Pos() reflects what was written
	// before the next section's offset is recorded.
	if err := bw.writer.Flush(); err != nil {
		return nil, errors.Wrap(err, "flush")
	}

	// Stream the postings offset table section.
	if err := ir.CopyPostingsOffsets(bw.PostingOffsetsWriter(), buf); err != nil {
		return nil, err
	}

	if err := bw.writer.Flush(); err != nil {
		return nil, errors.Wrap(err, "flush")
	}

	// Append the index-header's own TOC (with checksum) at the end.
	if err := bw.WriteTOC(); err != nil {
		return nil, errors.Wrap(err, "write index header TOC")
	}

	if err := bw.writer.Flush(); err != nil {
		return nil, errors.Wrap(err, "flush")
	}

	if err := bw.writer.Sync(); err != nil {
		return nil, errors.Wrap(err, "sync")
	}

	if tmpFilename != "" {
		// Create index-header in atomic way, to avoid partial writes (e.g during restart or crash of store GW).
		return nil, os.Rename(tmpFilename, filename)
	}

	return bw.Buffer(), nil
}
   132  
// chunkedIndexReader reads selected sections of a TSDB index object from a
// bucket via ranged GETs, without downloading the whole index.
type chunkedIndexReader struct {
	ctx  context.Context
	path string // Object path of the index within the bucket.
	size uint64 // Total size of the index object in bytes.
	bkt  objstore.BucketReader
	toc  *index.TOC // Parsed TOC of the original index.
}
   140  
   141  func newChunkedIndexReader(ctx context.Context, bkt objstore.BucketReader, id ulid.ULID) (*chunkedIndexReader, int, error) {
   142  	indexFilepath := filepath.Join(id.String(), block.IndexFilename)
   143  	attrs, err := bkt.Attributes(ctx, indexFilepath)
   144  	if err != nil {
   145  		return nil, 0, errors.Wrapf(err, "get object attributes of %s", indexFilepath)
   146  	}
   147  
   148  	rc, err := bkt.GetRange(ctx, indexFilepath, 0, index.HeaderLen)
   149  	if err != nil {
   150  		return nil, 0, errors.Wrapf(err, "get TOC from object storage of %s", indexFilepath)
   151  	}
   152  
   153  	b, err := io.ReadAll(rc)
   154  	if err != nil {
   155  		runutil.CloseWithErrCapture(&err, rc, "close reader")
   156  		return nil, 0, errors.Wrapf(err, "get header from object storage of %s", indexFilepath)
   157  	}
   158  
   159  	if err := rc.Close(); err != nil {
   160  		return nil, 0, errors.Wrap(err, "close reader")
   161  	}
   162  
   163  	if m := binary.BigEndian.Uint32(b[0:4]); m != index.MagicIndex {
   164  		return nil, 0, errors.Errorf("invalid magic number %x for %s", m, indexFilepath)
   165  	}
   166  
   167  	version := int(b[4:5][0])
   168  
   169  	if version != index.FormatV1 && version != index.FormatV2 {
   170  		return nil, 0, errors.Errorf("not supported index file version %d of %s", version, indexFilepath)
   171  	}
   172  
   173  	ir := &chunkedIndexReader{
   174  		ctx:  ctx,
   175  		path: indexFilepath,
   176  		size: uint64(attrs.Size),
   177  		bkt:  bkt,
   178  	}
   179  
   180  	toc, err := ir.readTOC()
   181  	if err != nil {
   182  		return nil, 0, err
   183  	}
   184  	ir.toc = toc
   185  
   186  	return ir, version, nil
   187  }
   188  
// readTOC fetches and parses the TSDB index TOC from the tail of the index
// object.
//
// NOTE(review): indexTOCLen already includes crc32.Size, so this range starts
// crc32.Size bytes earlier than strictly necessary; index.NewTOCFromByteSlice
// parses from the end of the slice, so the extra leading bytes appear to be a
// harmless overfetch — confirm before changing the range.
func (r *chunkedIndexReader) readTOC() (*index.TOC, error) {
	rc, err := r.bkt.GetRange(r.ctx, r.path, int64(r.size-indexTOCLen-crc32.Size), indexTOCLen+crc32.Size)
	if err != nil {
		return nil, errors.Wrapf(err, "get TOC from object storage of %s", r.path)
	}

	tocBytes, err := io.ReadAll(rc)
	if err != nil {
		runutil.CloseWithErrCapture(&err, rc, "close toc reader")
		return nil, errors.Wrapf(err, "get TOC from object storage of %s", r.path)
	}

	if err := rc.Close(); err != nil {
		return nil, errors.Wrap(err, "close toc reader")
	}

	toc, err := index.NewTOCFromByteSlice(realByteSlice(tocBytes))
	if err != nil {
		return nil, errors.Wrap(err, "new TOC")
	}
	return toc, nil
}
   211  
   212  func (r *chunkedIndexReader) CopySymbols(w io.Writer, buf []byte) (err error) {
   213  	rc, err := r.bkt.GetRange(r.ctx, r.path, int64(r.toc.Symbols), int64(r.toc.Series-r.toc.Symbols))
   214  	if err != nil {
   215  		return errors.Wrapf(err, "get symbols from object storage of %s", r.path)
   216  	}
   217  	defer runutil.CloseWithErrCapture(&err, rc, "close symbol reader")
   218  
   219  	if _, err := io.CopyBuffer(w, rc, buf); err != nil {
   220  		return errors.Wrap(err, "copy symbols")
   221  	}
   222  
   223  	return nil
   224  }
   225  
   226  func (r *chunkedIndexReader) CopyPostingsOffsets(w io.Writer, buf []byte) (err error) {
   227  	rc, err := r.bkt.GetRange(r.ctx, r.path, int64(r.toc.PostingsTable), int64(r.size-r.toc.PostingsTable))
   228  	if err != nil {
   229  		return errors.Wrapf(err, "get posting offset table from object storage of %s", r.path)
   230  	}
   231  	defer runutil.CloseWithErrCapture(&err, rc, "close posting offsets reader")
   232  
   233  	if _, err := io.CopyBuffer(w, rc, buf); err != nil {
   234  		return errors.Wrap(err, "copy posting offsets")
   235  	}
   236  
   237  	return nil
   238  }
   239  
// TODO(bwplotka): Add padding for efficient read.
// binaryWriter assembles an index-header through an underlying PosWriter
// (file-backed or in-memory), tracking section offsets for the final TOC.
type binaryWriter struct {
	writer PosWriter

	// toc records where the symbols and postings offset table sections start.
	toc BinaryTOC

	// Reusable memory.
	buf encoding.Encbuf

	// crc32 is reused for checksumming the TOC.
	crc32 hash.Hash
}
   251  
// newBinaryWriter creates a binaryWriter backed by a file when cacheFilename
// is non-empty (creating the parent directory if needed and removing any
// stale file), or by an in-memory buffer otherwise. len(buf) controls the
// writer's buffer size. It immediately writes the magic number and
// index-header format version.
func newBinaryWriter(id ulid.ULID, cacheFilename string, buf []byte) (w *binaryWriter, err error) {
	var binWriter PosWriter
	if cacheFilename != "" {
		dir := filepath.Dir(cacheFilename)

		df, err := fileutil.OpenDir(dir)
		if os.IsNotExist(err) {
			// Parent directory missing: create it and retry the open.
			if err := os.MkdirAll(dir, os.ModePerm); err != nil {
				return nil, err
			}
			df, err = fileutil.OpenDir(dir)
		}
		if err != nil {
			return nil, err
		}

		defer runutil.CloseWithErrCapture(&err, df, "dir close")

		// Drop any stale/partial index-header left from a previous run.
		if err := os.RemoveAll(cacheFilename); err != nil {
			return nil, errors.Wrap(err, "remove any existing index at path")
		}

		// NOTE: fileWriter is declared separately so the assignment below uses
		// the named return err (captured by the deferred CloseWithErrCapture).
		var fileWriter *FileWriter
		fileWriter, err = NewFileWriter(cacheFilename, len(buf))
		if err != nil {
			return nil, err
		}
		// Sync the directory so the new file's directory entry is durable.
		if err := df.Sync(); err != nil {
			return nil, errors.Wrap(err, "sync dir")
		}
		binWriter = fileWriter
	} else {
		binWriter = NewMemoryWriter(id, len(buf))
	}

	w = &binaryWriter{
		writer: binWriter,

		// Reusable memory.
		buf:   encoding.Encbuf{B: buf},
		crc32: newCRC32(),
	}

	// Write the fixed file prelude: magic number + format version.
	w.buf.Reset()
	w.buf.PutBE32(MagicIndex)
	w.buf.PutByte(BinaryFormatV1)

	return w, w.writer.Write(w.buf.Get())
}
   301  
// PosWriterWithBuffer is a PosWriter that can also expose its accumulated
// bytes (implemented by MemoryWriter).
type PosWriterWithBuffer interface {
	PosWriter
	Buffer() []byte
}
   306  
// PosWriter is a destination for index-header bytes that tracks the current
// write position.
type PosWriter interface {
	// Pos returns the number of bytes written so far.
	Pos() uint64
	// Write appends all given buffers, updating the position.
	Write(bufs ...[]byte) error
	Flush() error
	Sync() error
	Close() error
}
   314  
// MemoryWriter is a PosWriter that accumulates everything in memory.
type MemoryWriter struct {
	id  ulid.ULID // Block ID, used only in error messages.
	buf *bytes.Buffer
	pos uint64 // Total bytes written.
}
   320  
   321  func NewMemoryWriter(id ulid.ULID, size int) *MemoryWriter {
   322  	return &MemoryWriter{
   323  		id:  id,
   324  		buf: bytes.NewBuffer(make([]byte, 0, size)),
   325  		pos: 0,
   326  	}
   327  }
   328  
// Pos returns the number of bytes written so far.
func (mw *MemoryWriter) Pos() uint64 {
	return mw.pos
}
   332  
   333  func (mw *MemoryWriter) Write(bufs ...[]byte) error {
   334  	for _, b := range bufs {
   335  		n, err := mw.buf.Write(b)
   336  		mw.pos += uint64(n)
   337  		if err != nil {
   338  			return err
   339  		}
   340  		// For now the index file must not grow beyond 64GiB. Some of the fixed-sized
   341  		// offset references in v1 are only 4 bytes large.
   342  		// Once we move to compressed/varint representations in those areas, this limitation
   343  		// can be lifted.
   344  		if mw.pos > 16*math.MaxUint32 {
   345  			return errors.Errorf("%q exceeding max size of 64GiB", mw.id)
   346  		}
   347  	}
   348  	return nil
   349  }
   350  
// Buffer returns the bytes accumulated so far (shared, not copied).
func (mw *MemoryWriter) Buffer() []byte {
	return mw.buf.Bytes()
}
   354  
// Flush is a no-op: there is no intermediate buffering for memory writes.
func (mw *MemoryWriter) Flush() error {
	return nil
}
   358  
// Sync is a no-op: there is no durable storage behind a MemoryWriter.
func (mw *MemoryWriter) Sync() error {
	return nil
}
   362  
// Close flushes (a no-op) and releases nothing; the buffer stays readable.
func (mw *MemoryWriter) Close() error {
	return mw.Flush()
}
   366  
// FileWriter is a PosWriter backed by a buffered OS file.
type FileWriter struct {
	f          *os.File
	fileWriter *bufio.Writer
	name       string // File path, kept for error messages and Remove.
	pos        uint64 // Total bytes written (including still-buffered bytes).
}
   373  
   374  // TODO(bwplotka): Added size to method, upstream this.
   375  func NewFileWriter(name string, size int) (*FileWriter, error) {
   376  	f, err := os.OpenFile(filepath.Clean(name), os.O_CREATE|os.O_RDWR, 0600)
   377  	if err != nil {
   378  		return nil, err
   379  	}
   380  	return &FileWriter{
   381  		f:          f,
   382  		fileWriter: bufio.NewWriterSize(f, size),
   383  		name:       name,
   384  		pos:        0,
   385  	}, nil
   386  }
   387  
// Pos returns the number of bytes written so far (including buffered bytes).
func (fw *FileWriter) Pos() uint64 {
	return fw.pos
}
   391  
   392  func (fw *FileWriter) Write(bufs ...[]byte) error {
   393  	for _, b := range bufs {
   394  		n, err := fw.fileWriter.Write(b)
   395  		fw.pos += uint64(n)
   396  		if err != nil {
   397  			return err
   398  		}
   399  		// For now the index file must not grow beyond 64GiB. Some of the fixed-sized
   400  		// offset references in v1 are only 4 bytes large.
   401  		// Once we move to compressed/varint representations in those areas, this limitation
   402  		// can be lifted.
   403  		if fw.pos > 16*math.MaxUint32 {
   404  			return errors.Errorf("%q exceeding max size of 64GiB", fw.name)
   405  		}
   406  	}
   407  	return nil
   408  }
   409  
// Flush writes any buffered bytes through to the underlying file.
func (fw *FileWriter) Flush() error {
	return fw.fileWriter.Flush()
}
   413  
// Close flushes buffered bytes, fsyncs the file for durability and closes it.
func (fw *FileWriter) Close() error {
	if err := fw.Flush(); err != nil {
		return err
	}
	if err := fw.f.Sync(); err != nil {
		return err
	}
	return fw.f.Close()
}
   423  
// Sync fsyncs the underlying file. Note it does not flush the bufio buffer;
// callers flush before syncing (see WriteBinary).
func (fw *FileWriter) Sync() error {
	return fw.f.Sync()
}
   427  
// Remove deletes the backing file from disk.
func (fw *FileWriter) Remove() error {
	return os.Remove(fw.name)
}
   431  
// AddIndexMeta writes the original index's format version and the offset of
// its posting offset table, completing the index-header's fixed header.
func (w *binaryWriter) AddIndexMeta(indexVersion int, indexPostingOffsetTable uint64) error {
	w.buf.Reset()
	w.buf.PutByte(byte(indexVersion))
	w.buf.PutBE64(indexPostingOffsetTable)
	return w.writer.Write(w.buf.Get())
}
   438  
// SymbolsWriter records the current position as the start of the symbols
// section and returns the writer to stream that section into.
func (w *binaryWriter) SymbolsWriter() io.Writer {
	w.toc.Symbols = w.writer.Pos()
	return w
}
   443  
// PostingOffsetsWriter records the current position as the start of the
// postings offset table section and returns the writer to stream it into.
func (w *binaryWriter) PostingOffsetsWriter() io.Writer {
	w.toc.PostingsOffsetTable = w.writer.Pos()
	return w
}
   448  
// WriteTOC appends the index-header TOC (section offsets plus CRC32 checksum)
// at the current position; it must be the last section written.
func (w *binaryWriter) WriteTOC() error {
	w.buf.Reset()

	w.buf.PutBE64(w.toc.Symbols)
	w.buf.PutBE64(w.toc.PostingsOffsetTable)

	// Checksum covers the two offsets encoded above.
	w.buf.PutHash(w.crc32)

	return w.writer.Write(w.buf.Get())
}
   459  
// Write implements io.Writer by delegating to the underlying PosWriter and
// deriving the byte count from the position delta.
func (w *binaryWriter) Write(p []byte) (int, error) {
	n := w.writer.Pos()
	err := w.writer.Write(p)
	return int(w.writer.Pos() - n), err
}
   465  
   466  func (w *binaryWriter) Buffer() []byte {
   467  	pwb, ok := w.writer.(PosWriterWithBuffer)
   468  	if ok {
   469  		return pwb.Buffer()
   470  	}
   471  	return nil
   472  }
   473  
// Close closes the underlying PosWriter (flushing and syncing file-backed ones).
func (w *binaryWriter) Close() error {
	return w.writer.Close()
}
   477  
// postingValueOffsets holds the sampled offset-table entries for one label
// name, plus where the last value's postings list ends in the original index.
type postingValueOffsets struct {
	offsets       []postingOffset
	lastValOffset int64
}
   482  
// postingOffset is one sampled entry of the postings offset table.
type postingOffset struct {
	// label value.
	value string
	// offset of this entry in posting offset table in index-header file.
	tableOff int
}
   489  
   490  const valueSymbolsCacheSize = 1024
   491  
// BinaryReader serves symbol and postings-offset lookups from a parsed
// index-header (mmapped file or in-memory byte slice).
type BinaryReader struct {
	b   index.ByteSlice
	toc *BinaryTOC

	// Close that releases the underlying resources of the byte slice.
	c io.Closer

	// Map of LabelName to a list of some LabelValues's position in the offset table.
	// The first and last values for each name are always present, we keep only 1/postingOffsetsInMemSampling of the rest.
	postings map[string]*postingValueOffsets
	// For the v1 format, labelname -> labelvalue -> offset.
	postingsV1 map[string]map[string]index.Range

	// Symbols struct that keeps only 1/postingOffsetsInMemSampling in the memory, then looks up the rest via mmap.
	symbols *index.Symbols
	// Cache of the label name symbol lookups,
	// as there are not many and they are half of all lookups.
	nameSymbols map[uint32]string
	// Direct cache of values. This is much faster than an LRU cache and still provides
	// a reasonable cache hit ratio.
	valueSymbolsMx sync.Mutex
	valueSymbols   [valueSymbolsCacheSize]struct {
		index  uint32
		symbol string
	}

	dec *index.Decoder

	version             int   // Index-header format version.
	indexVersion        int   // Original TSDB index format version.
	indexLastPostingEnd int64 // End offset of the last postings list in the original index.

	postingOffsetsInMemSampling int
}
   526  
   527  // NewBinaryReader loads or builds new index-header if not present on disk.
   528  func NewBinaryReader(ctx context.Context, logger log.Logger, bkt objstore.BucketReader, dir string, id ulid.ULID, postingOffsetsInMemSampling int) (*BinaryReader, error) {
   529  	if dir != "" {
   530  		binfn := filepath.Join(dir, id.String(), block.IndexHeaderFilename)
   531  		br, err := newFileBinaryReader(binfn, postingOffsetsInMemSampling)
   532  		if err == nil {
   533  			return br, nil
   534  		}
   535  
   536  		level.Debug(logger).Log("msg", "failed to read index-header from disk; recreating", "path", binfn, "err", err)
   537  
   538  		start := time.Now()
   539  		if _, err := WriteBinary(ctx, bkt, id, binfn); err != nil {
   540  			return nil, errors.Wrap(err, "write index header")
   541  		}
   542  
   543  		level.Debug(logger).Log("msg", "built index-header file", "path", binfn, "elapsed", time.Since(start))
   544  		return newFileBinaryReader(binfn, postingOffsetsInMemSampling)
   545  	} else {
   546  		buf, err := WriteBinary(ctx, bkt, id, "")
   547  		if err != nil {
   548  			return nil, errors.Wrap(err, "generate index header")
   549  		}
   550  
   551  		return newMemoryBinaryReader(buf, postingOffsetsInMemSampling)
   552  	}
   553  }
   554  
   555  func newMemoryBinaryReader(buf []byte, postingOffsetsInMemSampling int) (bw *BinaryReader, err error) {
   556  	r := &BinaryReader{
   557  		b:                           realByteSlice(buf),
   558  		c:                           nil,
   559  		postings:                    map[string]*postingValueOffsets{},
   560  		postingOffsetsInMemSampling: postingOffsetsInMemSampling,
   561  	}
   562  
   563  	if err := r.init(); err != nil {
   564  		return nil, err
   565  	}
   566  
   567  	return r, nil
   568  }
   569  
// newFileBinaryReader mmaps the index-header file at path and parses it into
// a ready BinaryReader. The mmap is closed again if parsing fails.
func newFileBinaryReader(path string, postingOffsetsInMemSampling int) (bw *BinaryReader, err error) {
	f, err := fileutil.OpenMmapFile(path)
	if err != nil {
		return nil, err
	}
	// Only close the mmap on error paths; on success the BinaryReader owns it
	// (via r.c) and releases it in Close.
	defer func() {
		if err != nil {
			runutil.CloseWithErrCapture(&err, f, "index header close")
		}
	}()

	r := &BinaryReader{
		b:                           realByteSlice(f.Bytes()),
		c:                           f,
		postings:                    map[string]*postingValueOffsets{},
		postingOffsetsInMemSampling: postingOffsetsInMemSampling,
	}

	if err := r.init(); err != nil {
		return nil, err
	}

	return r, nil
}
   594  
// newBinaryTOCFromByteSlice return parsed TOC from given index header byte slice.
// The TOC occupies the last binaryTOCLen bytes: two BE64 offsets followed by
// a CRC32 over those offsets, which is verified before decoding.
func newBinaryTOCFromByteSlice(bs index.ByteSlice) (*BinaryTOC, error) {
	if bs.Len() < binaryTOCLen {
		return nil, encoding.ErrInvalidSize
	}
	b := bs.Range(bs.Len()-binaryTOCLen, bs.Len())

	// Last 4 bytes are the expected checksum of the preceding TOC bytes.
	expCRC := binary.BigEndian.Uint32(b[len(b)-4:])
	d := encoding.Decbuf{B: b[:len(b)-4]}

	if d.Crc32(castagnoliTable) != expCRC {
		return nil, errors.Wrap(encoding.ErrInvalidChecksum, "read index header TOC")
	}

	if err := d.Err(); err != nil {
		return nil, err
	}

	return &BinaryTOC{
		Symbols:             d.Be64(),
		PostingsOffsetTable: d.Be64(),
	}, nil
}
   618  
// init parses the index-header in r.b: it verifies the fixed header, loads
// the TOC, builds the postings offset lookup structures (a full in-memory map
// for index v1; a sampled table for v2) and caches all label-name symbols.
// Must be called once before any lookup method.
func (r *BinaryReader) init() (err error) {
	// Verify header.
	if r.b.Len() < headerLen {
		return errors.Wrap(encoding.ErrInvalidSize, "index header's header")
	}
	if m := binary.BigEndian.Uint32(r.b.Range(0, 4)); m != MagicIndex {
		return errors.Errorf("invalid magic number %x", m)
	}
	r.version = int(r.b.Range(4, 5)[0])      // Index-header format version.
	r.indexVersion = int(r.b.Range(5, 6)[0]) // Original TSDB index format version.

	// Offset at which the last postings list of the original index ends
	// (written as the index's posting offset table position, see AddIndexMeta).
	r.indexLastPostingEnd = int64(binary.BigEndian.Uint64(r.b.Range(6, headerLen)))

	if r.version != BinaryFormatV1 {
		return errors.Errorf("unknown index header file version %d", r.version)
	}

	r.toc, err = newBinaryTOCFromByteSlice(r.b)
	if err != nil {
		return errors.Wrap(err, "read index header TOC")
	}

	// TODO(bwplotka): Consider contributing to Prometheus to allow specifying custom number for symbolsFactor.
	r.symbols, err = index.NewSymbols(r.b, r.indexVersion, int(r.toc.Symbols))
	if err != nil {
		return errors.Wrap(err, "read symbols")
	}

	var lastName, lastValue []byte
	if r.indexVersion == index.FormatV1 {
		// Earlier V1 formats don't have a sorted postings offset table, so
		// load the whole offset table into memory.
		r.postingsV1 = map[string]map[string]index.Range{}

		var prevRng index.Range
		if err := index.ReadPostingsOffsetTable(r.b, r.toc.PostingsOffsetTable, func(name, value []byte, postingsOffset uint64, _ int) error {
			// The previous entry's postings end where this entry's begin
			// (minus the trailing CRC of the previous list).
			if lastName != nil {
				prevRng.End = int64(postingsOffset - crc32.Size)
				r.postingsV1[string(lastName)][string(lastValue)] = prevRng
			}

			if _, ok := r.postingsV1[string(name)]; !ok {
				r.postingsV1[string(name)] = map[string]index.Range{}
				r.postings[string(name)] = nil // Used to get a list of labelnames in places.
			}

			lastName = name
			lastValue = value
			// Skip the length field so Start points at the postings data itself.
			prevRng = index.Range{Start: int64(postingsOffset + postingLengthFieldSize)}
			return nil
		}); err != nil {
			return errors.Wrap(err, "read postings table")
		}
		// Close out the final entry using the recorded end of all postings.
		if string(lastName) != "" {
			prevRng.End = r.indexLastPostingEnd - crc32.Size
			r.postingsV1[string(lastName)][string(lastValue)] = prevRng
		}
	} else {
		lastTableOff := 0
		valueCount := 0

		// For the postings offset table we keep every label name but only every nth
		// label value (plus the first and last one), to save memory.
		if err := index.ReadPostingsOffsetTable(r.b, r.toc.PostingsOffsetTable, func(name, value []byte, postingsOffset uint64, labelOffset int) error {
			if _, ok := r.postings[string(name)]; !ok {
				// Not seen before label name.
				r.postings[string(name)] = &postingValueOffsets{}
				if lastName != nil {
					// Always include last value for each label name, unless it was just added in previous iteration based
					// on valueCount.
					if (valueCount-1)%r.postingOffsetsInMemSampling != 0 {
						r.postings[string(lastName)].offsets = append(r.postings[string(lastName)].offsets, postingOffset{value: string(lastValue), tableOff: lastTableOff})
					}
					r.postings[string(lastName)].lastValOffset = int64(postingsOffset - crc32.Size)
					lastName = nil
					lastValue = nil
				}
				valueCount = 0
			}

			lastName = name
			lastValue = value
			lastTableOff = labelOffset
			valueCount++

			// Keep every r.postingOffsetsInMemSampling-th value (always the first).
			if (valueCount-1)%r.postingOffsetsInMemSampling == 0 {
				r.postings[string(name)].offsets = append(r.postings[string(name)].offsets, postingOffset{value: string(value), tableOff: labelOffset})
			}

			return nil
		}); err != nil {
			return errors.Wrap(err, "read postings table")
		}
		if lastName != nil {
			if (valueCount-1)%r.postingOffsetsInMemSampling != 0 {
				// Always include last value for each label name if not included already based on valueCount.
				r.postings[string(lastName)].offsets = append(r.postings[string(lastName)].offsets, postingOffset{value: string(lastValue), tableOff: lastTableOff})
			}
			// In any case lastValOffset is unknown as don't have next posting anymore. Guess from TOC table.
			// In worst case we will overfetch a few bytes.
			r.postings[string(lastName)].lastValOffset = r.indexLastPostingEnd - crc32.Size
		}
		// Trim any extra space in the slices.
		for k, v := range r.postings {
			l := make([]postingOffset, len(v.offsets))
			copy(l, v.offsets)
			r.postings[k].offsets = l
		}
	}

	// Pre-resolve every label name's symbol reference: names are few and
	// account for roughly half of all symbol lookups.
	r.nameSymbols = make(map[uint32]string, len(r.postings))
	for k := range r.postings {
		if k == "" {
			continue
		}
		off, err := r.symbols.ReverseLookup(k)
		if err != nil {
			return errors.Wrap(err, "reverse symbol lookup")
		}
		r.nameSymbols[off] = k
	}

	r.dec = &index.Decoder{LookupSymbol: r.LookupSymbol}

	return nil
}
   745  
// IndexVersion returns the format version of the original TSDB index.
// The error is always nil; the signature matches the Reader interface.
func (r *BinaryReader) IndexVersion() (int, error) {
	return r.indexVersion, nil
}
   749  
   750  // TODO(bwplotka): Get advantage of multi value offset fetch.
   751  func (r *BinaryReader) PostingsOffset(name, value string) (index.Range, error) {
   752  	rngs, err := r.postingsOffset(name, value)
   753  	if err != nil {
   754  		return index.Range{}, err
   755  	}
   756  	if len(rngs) != 1 {
   757  		return index.Range{}, NotFoundRangeErr
   758  	}
   759  	return rngs[0], nil
   760  }
   761  
   762  func skipNAndName(d *encoding.Decbuf, buf *int) {
   763  	if *buf == 0 {
   764  		// Keycount+LabelName are always the same number of bytes,
   765  		// and it's faster to skip than parse.
   766  		*buf = d.Len()
   767  		d.Uvarint()      // Keycount.
   768  		d.UvarintBytes() // Label name.
   769  		*buf -= d.Len()
   770  		return
   771  	}
   772  	d.Skip(*buf)
   773  }
// postingsOffset resolves the postings byte ranges for the given label name
// and one or more values. values MUST be sorted ascending (the scan advances
// through the sampled offset table and the wanted values in lockstep).
// Missing values are silently skipped; the result may be shorter than values.
func (r *BinaryReader) postingsOffset(name string, values ...string) ([]index.Range, error) {
	rngs := make([]index.Range, 0, len(values))
	if r.indexVersion == index.FormatV1 {
		// v1 keeps the complete table in memory: direct map lookups.
		e, ok := r.postingsV1[name]
		if !ok {
			return nil, nil
		}
		for _, v := range values {
			rng, ok := e[v]
			if !ok {
				continue
			}
			rngs = append(rngs, rng)
		}
		return rngs, nil
	}

	e, ok := r.postings[name]
	if !ok {
		return nil, nil
	}

	if len(values) == 0 {
		return nil, nil
	}

	buf := 0 // Cached entry-prefix width for skipNAndName.
	valueIndex := 0
	for valueIndex < len(values) && values[valueIndex] < e.offsets[0].value {
		// Discard values before the start.
		valueIndex++
	}

	var newSameRngs []index.Range // The start, end offsets in the postings table in the original index file.
	for valueIndex < len(values) {
		wantedValue := values[valueIndex]

		// Binary-search the sampled offsets for the first entry >= wantedValue.
		i := sort.Search(len(e.offsets), func(i int) bool { return e.offsets[i].value >= wantedValue })
		if i == len(e.offsets) {
			// We're past the end.
			break
		}
		if i > 0 && e.offsets[i].value != wantedValue {
			// Need to look from previous entry.
			i--
		}

		// Don't Crc32 the entire postings offset table, this is very slow
		// so hope any issues were caught at startup.
		d := encoding.NewDecbufAt(r.b, int(r.toc.PostingsOffsetTable), nil)
		d.Skip(e.offsets[i].tableOff)

		// Iterate on the offset table.
		newSameRngs = newSameRngs[:0]
		for d.Err() == nil {
			// Posting format entry is as follows:
			// │ ┌────────────────────────────────────────┐ │
			// │ │  n = 2 <1b>                            │ │
			// │ ├──────────────────────┬─────────────────┤ │
			// │ │ len(name) <uvarint>  │ name <bytes>    │ │
			// │ ├──────────────────────┼─────────────────┤ │
			// │ │ len(value) <uvarint> │ value <bytes>   │ │
			// │ ├──────────────────────┴─────────────────┤ │
			// │ │  offset <uvarint64>                    │ │
			// │ └────────────────────────────────────────┘ │
			// First, let's skip n and name.
			skipNAndName(&d, &buf)
			value := d.UvarintBytes() // Label value.
			postingOffset := int64(d.Uvarint64())

			if len(newSameRngs) > 0 {
				// We added some ranges in previous iteration. Use next posting offset as end of all our new ranges.
				for j := range newSameRngs {
					newSameRngs[j].End = postingOffset - crc32.Size
				}
				rngs = append(rngs, newSameRngs...)
				newSameRngs = newSameRngs[:0]
			}

			for string(value) >= wantedValue {
				// If wantedValue is equals of greater than current value, loop over all given wanted values in the values until
				// this is no longer true or there are no more values wanted.
				// This ensures we cover case when someone asks for postingsOffset(name, value1, value1, value1).

				// Record on the way if wanted value is equal to the current value.
				if string(value) == wantedValue {
					newSameRngs = append(newSameRngs, index.Range{Start: postingOffset + postingLengthFieldSize})
				}
				valueIndex++
				if valueIndex == len(values) {
					break
				}
				wantedValue = values[valueIndex]
			}

			if i+1 == len(e.offsets) {
				// No more offsets for this name.
				// Break this loop and record lastOffset on the way for ranges we just added if any.
				for j := range newSameRngs {
					newSameRngs[j].End = e.lastValOffset
				}
				rngs = append(rngs, newSameRngs...)
				break
			}

			if valueIndex != len(values) && wantedValue <= e.offsets[i+1].value {
				// wantedValue is smaller or same as the next offset we know about, let's iterate further to add those.
				continue
			}

			// Nothing wanted or wantedValue is larger than next offset we know about.
			// Let's exit and do binary search again / exit if nothing wanted.

			if len(newSameRngs) > 0 {
				// We added some ranges in this iteration. Use next posting offset as the end of our ranges.
				// We know it exists as we never go further in this loop than e.offsets[i, i+1].

				skipNAndName(&d, &buf)
				d.UvarintBytes() // Label value.
				postingOffset := int64(d.Uvarint64())

				for j := range newSameRngs {
					newSameRngs[j].End = postingOffset - crc32.Size
				}
				rngs = append(rngs, newSameRngs...)
			}
			break
		}
		if d.Err() != nil {
			return nil, errors.Wrap(d.Err(), "get postings offset entry")
		}
	}

	return rngs, nil
}
   909  
   910  func (r *BinaryReader) LookupSymbol(o uint32) (string, error) {
   911  	cacheIndex := o % valueSymbolsCacheSize
   912  	r.valueSymbolsMx.Lock()
   913  	if cached := r.valueSymbols[cacheIndex]; cached.index == o && cached.symbol != "" {
   914  		v := cached.symbol
   915  		r.valueSymbolsMx.Unlock()
   916  		return v, nil
   917  	}
   918  	r.valueSymbolsMx.Unlock()
   919  
   920  	if s, ok := r.nameSymbols[o]; ok {
   921  		return s, nil
   922  	}
   923  
   924  	if r.indexVersion == index.FormatV1 {
   925  		// For v1 little trick is needed. Refs are actual offset inside index, not index-header. This is different
   926  		// of the header length difference between two files.
   927  		o += headerLen - index.HeaderLen
   928  	}
   929  
   930  	s, err := r.symbols.Lookup(o)
   931  	if err != nil {
   932  		return s, err
   933  	}
   934  
   935  	r.valueSymbolsMx.Lock()
   936  	r.valueSymbols[cacheIndex].index = o
   937  	r.valueSymbols[cacheIndex].symbol = s
   938  	r.valueSymbolsMx.Unlock()
   939  
   940  	return s, nil
   941  }
   942  
   943  func (r *BinaryReader) LabelValues(name string) ([]string, error) {
   944  	if r.indexVersion == index.FormatV1 {
   945  		e, ok := r.postingsV1[name]
   946  		if !ok {
   947  			return nil, nil
   948  		}
   949  		values := make([]string, 0, len(e))
   950  		for k := range e {
   951  			values = append(values, k)
   952  		}
   953  		sort.Strings(values)
   954  		return values, nil
   955  
   956  	}
   957  	e, ok := r.postings[name]
   958  	if !ok {
   959  		return nil, nil
   960  	}
   961  	if len(e.offsets) == 0 {
   962  		return nil, nil
   963  	}
   964  	values := make([]string, 0, len(e.offsets)*r.postingOffsetsInMemSampling)
   965  
   966  	d := encoding.NewDecbufAt(r.b, int(r.toc.PostingsOffsetTable), nil)
   967  	d.Skip(e.offsets[0].tableOff)
   968  	lastVal := e.offsets[len(e.offsets)-1].value
   969  
   970  	skip := 0
   971  	for d.Err() == nil {
   972  		if skip == 0 {
   973  			// These are always the same number of bytes,
   974  			// and it's faster to skip than parse.
   975  			skip = d.Len()
   976  			d.Uvarint()      // Keycount.
   977  			d.UvarintBytes() // Label name.
   978  			skip -= d.Len()
   979  		} else {
   980  			d.Skip(skip)
   981  		}
   982  		s := yoloString(d.UvarintBytes()) // Label value.
   983  		values = append(values, s)
   984  		if s == lastVal {
   985  			break
   986  		}
   987  		d.Uvarint64() // Offset.
   988  	}
   989  	if d.Err() != nil {
   990  		return nil, errors.Wrap(d.Err(), "get postings offset entry")
   991  	}
   992  	return values, nil
   993  }
   994  
// yoloString converts a byte slice to a string without copying, by
// reinterpreting the slice header as a string header. The returned string
// aliases b's backing array, so callers must not mutate b while the string
// (or any value derived from it) is still in use.
func yoloString(b []byte) string {
	return *((*string)(unsafe.Pointer(&b)))
}
   998  
   999  func (r *BinaryReader) LabelNames() ([]string, error) {
  1000  	allPostingsKeyName, _ := index.AllPostingsKey()
  1001  	labelNames := make([]string, 0, len(r.postings))
  1002  	for name := range r.postings {
  1003  		if name == allPostingsKeyName {
  1004  			// This is not from any metric.
  1005  			continue
  1006  		}
  1007  		labelNames = append(labelNames, name)
  1008  	}
  1009  	sort.Strings(labelNames)
  1010  	return labelNames, nil
  1011  }
  1012  
  1013  func (r *BinaryReader) Close() error {
  1014  	if r.c == nil {
  1015  		return nil
  1016  	}
  1017  	return r.c.Close()
  1018  }
  1019  
// realByteSlice adapts a plain []byte to the index.ByteSlice interface.
type realByteSlice []byte

// Len returns the number of bytes in the slice.
func (b realByteSlice) Len() int {
	return len(b)
}

// Range returns the bytes in [start, end); the result shares b's backing array.
func (b realByteSlice) Range(start, end int) []byte {
	return b[start:end]
}

// Sub returns the [start, end) sub-slice as an index.ByteSlice; the result
// shares b's backing array.
func (b realByteSlice) Sub(start, end int) index.ByteSlice {
	return b[start:end]
}