
     1  // Copyright (c) The Thanos Authors.
     2  // Licensed under the Apache License 2.0.
     4  package store
     6  import (
     7  	"bytes"
     8  	"encoding/binary"
     9  	"fmt"
    10  	"hash/crc32"
    11  	"io"
    13  	""
    14  	""
    15  	""
    16  	""
    17  	""
    18  	""
    19  	extsnappy ""
    20  	""
    21  )
    23  // This file implements encoding and decoding of postings using diff (or delta) + varint
    24  // number encoding. On top of that, we apply Snappy compression.
    25  //
    26  // On its own, Snappy compressing raw postings doesn't really help, because there is no
    27  // repetition in raw data. Using diff (delta) between postings entries makes values small,
    28  // and Varint is very efficient at encoding small values (values < 128 are encoded as
    29  // single byte, values < 16384 are encoded as two bytes). Diff + varint reduces postings size
    30  // significantly (to about 20% of original), snappy then halves it to ~10% of the original.
// Headers that the encoders in this file put in front of their output.
// decodePostings inspects them to choose the matching decoder.
const (
	codecHeaderSnappy         = "dvs" // As in "diff+varint+snappy".
	codecHeaderStreamedSnappy = "dss" // As in "diffvarint+streamed snappy".
)
    37  func decodePostings(input []byte) (closeablePostings, error) {
    38  	var df func([]byte, bool) (closeablePostings, error)
    40  	switch {
    41  	case isDiffVarintSnappyEncodedPostings(input):
    42  		df = diffVarintSnappyDecode
    43  	case isDiffVarintSnappyStreamedEncodedPostings(input):
    44  		df = diffVarintSnappyStreamedDecode
    45  	default:
    46  		return nil, fmt.Errorf("unrecognize postings format")
    47  	}
    49  	return df(input, false)
    50  }
    52  // isDiffVarintSnappyEncodedPostings returns true, if input looks like it has been encoded by diff+varint+snappy codec.
    53  func isDiffVarintSnappyEncodedPostings(input []byte) bool {
    54  	return bytes.HasPrefix(input, []byte(codecHeaderSnappy))
    55  }
    57  // isDiffVarintSnappyStreamedEncodedPostings returns true, if input looks like it has been encoded by diff+varint+snappy streamed codec.
    58  func isDiffVarintSnappyStreamedEncodedPostings(input []byte) bool {
    59  	return bytes.HasPrefix(input, []byte(codecHeaderStreamedSnappy))
    60  }
    62  // estimateSnappyStreamSize estimates the number of bytes
    63  // needed for encoding length postings. Note that in reality
    64  // the number of bytes needed could be much bigger if postings
    65  // different by a lot. Practically, stddev=64 is used.
    66  func estimateSnappyStreamSize(length int) int {
    67  	// Snappy stream writes data in chunks up to 65536 in size.
    68  	// The stream begins with bytes 0xff 0x06 0x00 0x00 's' 'N' 'a' 'P' 'p' 'Y'.
    69  	// Our encoded data also needs a header.
    70  	// Each encoded (or uncompressed) chunk needs tag (chunk type 1B + chunk len 3B) + checksum 4B.
    72  	// Mark for encoded data.
    73  	ret := len(codecHeaderStreamedSnappy)
    74  	// Magic snappy stream start.
    75  	ret += 10
    77  	const maxBlockSize = 65536
    79  	length = 5 * length / 4 // estimate 1.25B per posting.
    81  	blocks := length / maxBlockSize
    83  	ret += blocks * snappy.MaxEncodedLen(maxBlockSize)
    84  	length -= blocks * maxBlockSize
    85  	if length > 0 {
    86  		ret += snappy.MaxEncodedLen(length)
    87  	}
    89  	return ret
    90  }
    92  func diffVarintSnappyStreamedEncode(p index.Postings, length int) ([]byte, error) {
    93  	compressedBuf := bytes.NewBuffer(make([]byte, 0, estimateSnappyStreamSize(length)))
    94  	if n, err := compressedBuf.WriteString(codecHeaderStreamedSnappy); err != nil {
    95  		return nil, fmt.Errorf("writing streamed snappy header")
    96  	} else if n != len(codecHeaderStreamedSnappy) {
    97  		return nil, fmt.Errorf("short-write streamed snappy header")
    98  	}
   100  	uvarintEncodeBuf := make([]byte, binary.MaxVarintLen64)
   102  	sw, err := extsnappy.Compressor.Compress(compressedBuf)
   103  	if err != nil {
   104  		return nil, fmt.Errorf("creating snappy compressor: %w", err)
   105  	}
   107  	prev := storage.SeriesRef(0)
   108  	for p.Next() {
   109  		v := p.At()
   110  		if v < prev {
   111  			return nil, errors.Errorf("postings entries must be in increasing order, current: %d, previous: %d", v, prev)
   112  		}
   114  		uvarintSize := binary.PutUvarint(uvarintEncodeBuf, uint64(v-prev))
   115  		if written, err := sw.Write(uvarintEncodeBuf[:uvarintSize]); err != nil {
   116  			return nil, errors.Wrap(err, "writing uvarint encoded byte")
   117  		} else if written != uvarintSize {
   118  			return nil, errors.Wrap(err, "short-write for uvarint encoded byte")
   119  		}
   121  		prev = v
   122  	}
   123  	if p.Err() != nil {
   124  		return nil, p.Err()
   125  	}
   126  	if err := sw.Close(); err != nil {
   127  		return nil, errors.Wrap(err, "closing snappy stream writer")
   128  	}
   130  	return compressedBuf.Bytes(), nil
   131  }
   133  func diffVarintSnappyStreamedDecode(input []byte, disablePooling bool) (closeablePostings, error) {
   134  	if !isDiffVarintSnappyStreamedEncodedPostings(input) {
   135  		return nil, errors.New("header not found")
   136  	}
   138  	return newStreamedDiffVarintPostings(input[len(codecHeaderStreamedSnappy):], disablePooling)
   139  }
// streamedDiffVarintPostings iterates over postings stored as diff+varint values inside
// a framed snappy stream, decoding one chunk at a time.
type streamedDiffVarintPostings struct {
	// curSeries is the value returned by At; Next adds each decoded delta to it.
	curSeries storage.SeriesRef

	// err is the error recorded by readNextChunk and reported by Err.
	err error
	// input is the not yet consumed part of the framed stream.
	// buf is the destination buffer handed to s2.Decode for compressed chunks.
	input, buf []byte
	// maximumDecodedLen is the size requested for buf; it is computed once by
	// maximumDecodedLenSnappyStreamed when the postings are created.
	maximumDecodedLen int

	// db holds the bytes of the current chunk from which varints are read.
	db *encoding.Decbuf

	// readSnappyIdentifier is set once the stream identifier chunk has been seen;
	// data chunks are rejected before that.
	readSnappyIdentifier bool
	// disablePooling makes buf a plain allocation that close does not put into decodedBufPool.
	disablePooling bool
}
// Chunk type tags and the checksum size of the framed snappy stream parsed by
// maximumDecodedLenSnappyStreamed and readNextChunk.
const (
	chunkTypeCompressedData   = 0x00 // Data chunk: checksum followed by compressed bytes.
	chunkTypeUncompressedData = 0x01 // Data chunk: checksum followed by raw bytes.
	chunkTypeStreamIdentifier = 0xff // Chunk whose body is the "sNaPpY" magic.
	chunkTypePadding          = 0xfe // NOTE(review): not referenced in the visible part of this file.
	checksumSize              = 4 // Bytes of checksum at the start of every data chunk body.
)
   162  func maximumDecodedLenSnappyStreamed(in []byte) (int, error) {
   163  	maxDecodedLen := -1
   165  	for len(in) > 0 {
   166  		// Chunk type.
   167  		chunkType := in[0]
   168  		in = in[1:]
   169  		chunkLen := int(in[0]) | int(in[1])<<8 | int(in[2])<<16
   170  		in = in[3:]
   172  		switch chunkType {
   173  		case chunkTypeCompressedData:
   174  			bl := in[:chunkLen]
   175  			// NOTE: checksum will be checked later on.
   176  			decodedLen, err := s2.DecodedLen(bl[checksumSize:])
   177  			if err != nil {
   178  				return 0, err
   179  			}
   180  			if decodedLen > maxDecodedLen {
   181  				maxDecodedLen = decodedLen
   182  			}
   183  		case chunkTypeUncompressedData:
   184  			// NOTE: checksum will be checked later on.
   185  			n := chunkLen - checksumSize
   186  			if n > maxDecodedLen {
   187  				maxDecodedLen = n
   188  			}
   189  		}
   190  		in = in[chunkLen:]
   191  	}
   192  	return maxDecodedLen, nil
   193  }
// decodedBufPool supplies the buffers that streamedDiffVarintPostings decodes compressed
// chunks into; close puts them back.
// NOTE(review): the arguments presumably mean bucket sizes from 1KiB to 64KiB growing by
// a factor of 2 with no total limit -- confirm against pool.MustNewBucketedBytes.
var decodedBufPool = pool.MustNewBucketedBytes(1024, 65536, 2, 0)
   197  func newStreamedDiffVarintPostings(input []byte, disablePooling bool) (closeablePostings, error) {
   198  	// We can't use the regular s2.Reader because it assumes a stream.
   199  	// We already everything in memory so let's avoid copying.
   200  	// Algorithm:
   201  	// 1. Step through all chunks all get maximum decoded len.
   202  	// 2. Read into decoded step by step. For decoding call s2.Decode(r.decoded, buf).
   203  	maximumDecodedLen, err := maximumDecodedLenSnappyStreamed(input)
   204  	if err != nil {
   205  		return nil, err
   206  	}
   208  	return &streamedDiffVarintPostings{
   209  		input:             input,
   210  		maximumDecodedLen: maximumDecodedLen,
   211  		db:                &encoding.Decbuf{},
   212  		disablePooling:    disablePooling,
   213  	}, nil
   214  }
   216  func (it *streamedDiffVarintPostings) close() {
   217  	if it.buf == nil {
   218  		return
   219  	}
   220  	if it.disablePooling {
   221  		return
   222  	}
   223  	decodedBufPool.Put(&it.buf)
   224  }
// At returns the current posting, i.e. the sum of all deltas decoded by Next so far.
func (it *streamedDiffVarintPostings) At() storage.SeriesRef {
	return it.curSeries
}
   230  func (it *streamedDiffVarintPostings) readNextChunk(remainder []byte) bool {
   231  	// Normal EOF.
   232  	if len(it.input) == 0 {
   233  		return false
   234  	}
   236  	// Read next chunk into it.db.B.
   237  	chunkType := it.input[0]
   238  	it.input = it.input[1:]
   240  	if len(it.input) < 3 {
   241  		it.err = io.ErrUnexpectedEOF
   242  		return false
   243  	}
   245  	chunkLen := int(it.input[0]) | int(it.input[1])<<8 | int(it.input[2])<<16
   246  	it.input = it.input[3:]
   248  	switch chunkType {
   249  	case chunkTypeStreamIdentifier:
   250  		const magicBody = "sNaPpY"
   251  		if chunkLen != len(magicBody) {
   252  			it.err = fmt.Errorf("corrupted identifier")
   253  			return false
   254  		}
   255  		if string(it.input[:len(magicBody)]) != magicBody {
   256  			it.err = fmt.Errorf("got bad identifier %s", string(it.input[:6]))
   257  			return false
   258  		}
   259  		it.input = it.input[6:]
   260  		it.readSnappyIdentifier = true
   261  		return it.readNextChunk(nil)
   262  	case chunkTypeCompressedData:
   263  		if !it.readSnappyIdentifier {
   264  			it.err = fmt.Errorf("missing magic snappy marker")
   265  			return false
   266  		}
   267  		if len(it.input) < 4 {
   268  			it.err = io.ErrUnexpectedEOF
   269  			return false
   270  		}
   271  		checksum := uint32(it.input[0]) | uint32(it.input[1])<<8 | uint32(it.input[2])<<16 | uint32(it.input[3])<<24
   272  		if len(it.input) < chunkLen {
   273  			it.err = io.ErrUnexpectedEOF
   274  			return false
   275  		}
   277  		if it.buf == nil {
   278  			if it.disablePooling {
   279  				it.buf = make([]byte, it.maximumDecodedLen)
   280  			} else {
   281  				b, err := decodedBufPool.Get(it.maximumDecodedLen)
   282  				if err != nil {
   283  					it.err = err
   284  					return false
   285  				}
   286  				it.buf = *b
   287  			}
   288  		}
   290  		encodedBuf := it.input[:chunkLen]
   292  		// NOTE(GiedriusS): we can probably optimize this better but this should be rare enough
   293  		// and not cause any problems.
   294  		if len(remainder) > 0 {
   295  			remainderCopy := make([]byte, 0, len(remainder))
   296  			remainderCopy = append(remainderCopy, remainder...)
   297  			remainder = remainderCopy
   298  		}
   299  		decoded, err := s2.Decode(it.buf, encodedBuf[checksumSize:])
   300  		if err != nil {
   301  			it.err = err
   302  			return false
   303  		}
   304  		if crc(decoded) != checksum {
   305  			it.err = fmt.Errorf("mismatched checksum (got %v, expected %v)", crc(decoded), checksum)
   306  			return false
   307  		}
   308  		if len(remainder) > 0 {
   309  			it.db.B = append(remainder, decoded...)
   310  		} else {
   311  			it.db.B = decoded
   312  		}
   313  	case chunkTypeUncompressedData:
   314  		if !it.readSnappyIdentifier {
   315  			it.err = fmt.Errorf("missing magic snappy marker")
   316  			return false
   317  		}
   318  		if len(it.input) < 4 {
   319  			it.err = io.ErrUnexpectedEOF
   320  			return false
   321  		}
   322  		checksum := uint32(it.input[0]) | uint32(it.input[1])<<8 | uint32(it.input[2])<<16 | uint32(it.input[3])<<24
   323  		if len(it.input) < chunkLen {
   324  			it.err = io.ErrUnexpectedEOF
   325  			return false
   326  		}
   327  		uncompressedData := it.input[checksumSize:chunkLen]
   328  		if crc(uncompressedData) != checksum {
   329  			it.err = fmt.Errorf("mismatched checksum (got %v, expected %v)", crc(uncompressedData), checksum)
   330  			return false
   331  		}
   333  		// NOTE(GiedriusS): we can probably optimize this better but this should be rare enough
   334  		// and not cause any problems.
   335  		if len(remainder) > 0 {
   336  			remainderCopy := make([]byte, 0, len(remainder))
   337  			remainderCopy = append(remainderCopy, remainder...)
   338  			remainder = remainderCopy
   339  		}
   341  		if len(remainder) > 0 {
   342  			it.db.B = append(remainder, uncompressedData...)
   343  		} else {
   344  			it.db.B = uncompressedData
   345  		}
   346  	default:
   347  		if chunkType <= 0x7f {
   348  			it.err = fmt.Errorf("unsupported chunk type %v", chunkType)
   349  			return false
   350  		}
   351  		if chunkType > 0xfd {
   352  			it.err = fmt.Errorf("invalid chunk type %v", chunkType)
   353  			return false
   354  		}
   355  	}
   356  	it.input = it.input[chunkLen:]
   358  	return true
   359  }
   361  func (it *streamedDiffVarintPostings) Next() bool {
   362  	// Continue reading next chunks until there is at least binary.MaxVarintLen64.
   363  	// If we cannot add any more chunks then return false.
   364  	for {
   365  		val := it.db.Uvarint64()
   366  		if it.db.Err() != nil {
   367  			if !it.readNextChunk(it.db.B) {
   368  				return false
   369  			}
   370  			it.db.E = nil
   371  			continue
   372  		}
   374  		it.curSeries = it.curSeries + storage.SeriesRef(val)
   375  		return true
   376  	}
   377  }
// Err returns the error recorded while reading chunks, or nil if there was none.
func (it *streamedDiffVarintPostings) Err() error {
	return it.err
}
   383  func (it *streamedDiffVarintPostings) Seek(x storage.SeriesRef) bool {
   384  	if it.curSeries >= x {
   385  		return true
   386  	}
   388  	// We cannot do any search due to how values are stored,
   389  	// so we simply advance until we find the right value.
   390  	for it.Next() {
   391  		if it.At() >= x {
   392  			return true
   393  		}
   394  	}
   396  	return false
   397  }
   399  // diffVarintSnappyEncode encodes postings into diff+varint representation,
   400  // and applies snappy compression on the result.
   401  // Returned byte slice starts with codecHeaderSnappy header.
   402  // Length argument is expected number of postings, used for preallocating buffer.
   403  // TODO(GiedriusS): remove for v1.0.
   404  func diffVarintSnappyEncode(p index.Postings, length int) ([]byte, error) {
   405  	buf, err := diffVarintEncodeNoHeader(p, length)
   406  	if err != nil {
   407  		return nil, err
   408  	}
   410  	// Make result buffer large enough to hold our header and compressed block.
   411  	result := make([]byte, len(codecHeaderSnappy)+snappy.MaxEncodedLen(len(buf)))
   412  	copy(result, codecHeaderSnappy)
   414  	compressed := snappy.Encode(result[len(codecHeaderSnappy):], buf)
   416  	// Slice result buffer based on compressed size.
   417  	result = result[:len(codecHeaderSnappy)+len(compressed)]
   418  	return result, nil
   419  }
   421  // diffVarintEncodeNoHeader encodes postings into diff+varint representation.
   422  // It doesn't add any header to the output bytes.
   423  // Length argument is expected number of postings, used for preallocating buffer.
   424  func diffVarintEncodeNoHeader(p index.Postings, length int) ([]byte, error) {
   425  	buf := encoding.Encbuf{}
   427  	// This encoding uses around ~1 bytes per posting, but let's use
   428  	// conservative 1.25 bytes per posting to avoid extra allocations.
   429  	if length > 0 {
   430  		buf.B = make([]byte, 0, 5*length/4)
   431  	}
   433  	prev := storage.SeriesRef(0)
   434  	for p.Next() {
   435  		v := p.At()
   436  		if v < prev {
   437  			return nil, errors.Errorf("postings entries must be in increasing order, current: %d, previous: %d", v, prev)
   438  		}
   440  		// This is the 'diff' part -- compute difference from previous value.
   441  		buf.PutUvarint64(uint64(v - prev))
   442  		prev = v
   443  	}
   444  	if p.Err() != nil {
   445  		return nil, p.Err()
   446  	}
   448  	return buf.B, nil
   449  }
// snappyDecodePool supplies destination buffers for diffVarintSnappyDecode;
// diffVarintPostings.close puts them back.
// Creating 15 buckets from 1k to 32mb.
var snappyDecodePool = pool.MustNewBucketedBytes(1024, 32*1024*1024, 2, 0)
// closeablePostings is an index.Postings that additionally has a close method.
// The implementations in this file use close to put their decode buffers back
// into a pool.
type closeablePostings interface {
	index.Postings
	close()
}
   459  // alias returns true if given slices have the same both backing array.
   460  // See:
   461  func alias(x, y []byte) bool {
   462  	return cap(x) > 0 && cap(y) > 0 && &x[0:cap(x)][cap(x)-1] == &y[0:cap(y)][cap(y)-1]
   463  }
   465  // TODO(GiedriusS): remove for v1.0.
   466  func diffVarintSnappyDecode(input []byte, disablePooling bool) (closeablePostings, error) {
   467  	if !isDiffVarintSnappyEncodedPostings(input) {
   468  		return nil, errors.New("header not found")
   469  	}
   471  	toFree := make([][]byte, 0, 2)
   473  	var dstBuf []byte
   474  	if !disablePooling {
   475  		if len, err := s2.DecodedLen(input[len(codecHeaderSnappy):]); err == nil {
   476  			if decodeBuf, err := snappyDecodePool.Get(len); err == nil && decodeBuf != nil {
   477  				dstBuf = *decodeBuf
   478  				toFree = append(toFree, dstBuf)
   479  			}
   480  		}
   481  	}
   483  	raw, err := s2.Decode(dstBuf, input[len(codecHeaderSnappy):])
   484  	if err != nil {
   485  		return nil, errors.Wrap(err, "snappy decode")
   486  	}
   488  	if !alias(raw, dstBuf) && !disablePooling {
   489  		toFree = append(toFree, raw)
   490  	}
   492  	return newDiffVarintPostings(raw, toFree), nil
   493  }
   495  func newDiffVarintPostings(input []byte, freeSlices [][]byte) *diffVarintPostings {
   496  	return &diffVarintPostings{freeSlices: freeSlices, buf: &encoding.Decbuf{B: input}}
   497  }
// diffVarintPostings is an implementation of index.Postings based on diff+varint encoded data.
type diffVarintPostings struct {
	// buf holds the not yet decoded diff+varint bytes.
	buf *encoding.Decbuf
	// cur is the value returned by At; Next adds each decoded delta to it.
	cur storage.SeriesRef
	// freeSlices are the buffers that close puts into snappyDecodePool.
	freeSlices [][]byte
}
   506  func (it *diffVarintPostings) close() {
   507  	for i := range it.freeSlices {
   508  		snappyDecodePool.Put(&it.freeSlices[i])
   509  	}
   510  }
// At returns the current posting, i.e. the sum of all deltas decoded by Next so far.
func (it *diffVarintPostings) At() storage.SeriesRef {
	return it.cur
}
   516  func (it *diffVarintPostings) Next() bool {
   517  	if it.buf.Err() != nil || it.buf.Len() == 0 {
   518  		return false
   519  	}
   521  	val := it.buf.Uvarint64()
   522  	if it.buf.Err() != nil {
   523  		return false
   524  	}
   526  	it.cur = it.cur + storage.SeriesRef(val)
   527  	return true
   528  }
   530  func (it *diffVarintPostings) Seek(x storage.SeriesRef) bool {
   531  	if it.cur >= x {
   532  		return true
   533  	}
   535  	// We cannot do any search due to how values are stored,
   536  	// so we simply advance until we find the right value.
   537  	for it.Next() {
   538  		if it.At() >= x {
   539  			return true
   540  		}
   541  	}
   543  	return false
   544  }
// Err returns the error state of the underlying decode buffer.
func (it *diffVarintPostings) Err() error {
	return it.buf.Err()
}
   550  func snappyStreamedEncode(postingsLength int, diffVarintPostings []byte) ([]byte, error) {
   551  	compressedBuf := bytes.NewBuffer(make([]byte, 0, estimateSnappyStreamSize(postingsLength)))
   552  	if n, err := compressedBuf.WriteString(codecHeaderStreamedSnappy); err != nil {
   553  		return nil, fmt.Errorf("writing streamed snappy header")
   554  	} else if n != len(codecHeaderStreamedSnappy) {
   555  		return nil, fmt.Errorf("short-write streamed snappy header")
   556  	}
   558  	sw, err := extsnappy.Compressor.Compress(compressedBuf)
   559  	if err != nil {
   560  		return nil, fmt.Errorf("creating snappy compressor: %w", err)
   561  	}
   562  	_, err = sw.Write(diffVarintPostings)
   563  	if err != nil {
   564  		return nil, err
   565  	}
   566  	if err := sw.Close(); err != nil {
   567  		return nil, errors.Wrap(err, "closing snappy stream writer")
   568  	}
   570  	return compressedBuf.Bytes(), nil
   571  }
   573  var crcTable = crc32.MakeTable(crc32.Castagnoli)
   575  // crc implements the checksum specified in section 3 of
   576  //
   577  func crc(b []byte) uint32 {
   578  	c := crc32.Update(0, crcTable, b)
   579  	return c>>15 | c<<17 + 0xa282ead8
   580  }