github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/table/sstable/builder.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package sstable
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"math"
    23  	"os"
    24  	"reflect"
    25  	"unsafe"
    26  
    27  	"github.com/coocood/bbloom"
    28  	"github.com/dgryski/go-farm"
    29  	"github.com/pingcap/badger/fileutil"
    30  	"github.com/pingcap/badger/options"
    31  	"github.com/pingcap/badger/surf"
    32  	"github.com/pingcap/badger/y"
    33  	"golang.org/x/time/rate"
    34  )
    35  
// entrySlice is a compact append-only container for variable-length byte
// entries. All entries share one backing buffer; endOffs[i] records the end
// offset of entry i within data, so entry i spans data[endOffs[i-1]:endOffs[i]]
// (with an implicit start of 0 for the first entry).
type entrySlice struct {
	data    []byte
	endOffs []uint32
}
    40  
    41  func (es *entrySlice) append(entry []byte) {
    42  	es.data = append(es.data, entry...)
    43  	es.endOffs = append(es.endOffs, uint32(len(es.data)))
    44  }
    45  
// appendVal encodes val (via y.ValueStruct.EncodeTo) directly onto the shared
// buffer and records its end offset, avoiding an intermediate allocation.
func (es *entrySlice) appendVal(val *y.ValueStruct) {
	es.data = val.EncodeTo(es.data)
	es.endOffs = append(es.endOffs, uint32(len(es.data)))
}
    50  
    51  func (es *entrySlice) getLast() []byte {
    52  	return es.getEntry(es.length() - 1)
    53  }
    54  
    55  func (es *entrySlice) getEntry(i int) []byte {
    56  	var startOff uint32
    57  	if i > 0 {
    58  		startOff = es.endOffs[i-1]
    59  	}
    60  	endOff := es.endOffs[i]
    61  	return es.data[startOff:endOff]
    62  }
    63  
// length returns the number of entries stored.
func (es *entrySlice) length() int {
	return len(es.endOffs)
}
    67  
// size returns the in-memory footprint in bytes: the entry data plus
// 4 bytes per recorded end offset.
func (es *entrySlice) size() int {
	return len(es.data) + 4*len(es.endOffs)
}
    71  
// reset empties the slice while keeping allocated capacity for reuse.
func (es *entrySlice) reset() {
	es.data = es.data[:0]
	es.endOffs = es.endOffs[:0]
}
    76  
    77  const headerSize = 4
    78  
// Builder is used in building a table.
type Builder struct {
	counter int // Number of keys written for the current block.

	file          *os.File
	w             tableWriter // destination: direct file writer or in-memory buffer
	buf           []byte      // scratch buffer holding the block currently being built
	writtenLen    int         // compressed bytes written so far
	rawWrittenLen int         // uncompressed bytes accumulated so far
	compression   options.CompressionType

	baseKeys entrySlice // first key of every finished block

	blockEndOffsets []uint32 // cumulative end offsets of every finished block

	// end offsets of every entry within the current block being built.
	// The offsets are relative to the start of the block.
	entryEndOffsets []uint32

	smallest y.Key
	biggest  y.Key

	hashEntries []hashEntry // per-key positions for the hash index (when not using SuRF)
	bloomFpr    float64     // target bloom-filter false-positive rate for this table
	useGlobalTS bool
	opt         options.TableBuilderOptions
	useSuRF     bool // build a SuRF index instead of hash index + bloom filter

	surfKeys [][]byte // keys collected for SuRF construction
	surfVals [][]byte // encoded entry positions matching surfKeys

	tmpKeys    entrySlice // user keys of the block being built
	tmpVals    entrySlice // encoded values of the block being built
	tmpOldOffs []uint32   // per-key start offset into oldBlock; 0 means no old versions

	// singleKeyOldVers buffers old versions of the key currently being added;
	// flushed into oldBlock when a different key arrives or the block finishes.
	singleKeyOldVers entrySlice
	oldBlock         []byte
}
   117  
// tableWriter abstracts the build destination so the builder can target
// either an on-disk file (fileutil.DirectWriter) or an in-memory buffer.
type tableWriter interface {
	// Reset discards state and retargets the writer at f (ignored by
	// in-memory implementations).
	Reset(f *os.File)
	Write(b []byte) (int, error)
	// Offset returns the number of bytes written since the last Reset.
	Offset() int64
	// Finish flushes any buffered data.
	Finish() error
}
   124  
   125  type inMemWriter struct {
   126  	*bytes.Buffer
   127  }
   128  
   129  func (w *inMemWriter) Reset(_ *os.File) {
   130  	w.Buffer.Reset()
   131  }
   132  
   133  func (w *inMemWriter) Offset() int64 {
   134  	return int64(w.Len())
   135  }
   136  
   137  func (w *inMemWriter) Finish() error {
   138  	return nil
   139  }
   140  
// NewTableBuilder makes a new TableBuilder.
// If the f is nil, the builder builds in-memory result.
// If the limiter is nil, the write speed during table build will not be limited.
func NewTableBuilder(f *os.File, limiter *rate.Limiter, level int, opt options.TableBuilderOptions) *Builder {
	t := float64(opt.LevelSizeMultiplier)
	// Split the logical (whole-LSM) bloom FPR into a per-level budget:
	// levelFactor shrinks geometrically with depth, so deeper levels get a
	// looser per-table FPR than shallow ones. The closed form presumably
	// follows from summing FPRs over a geometric level-size series — TODO
	// confirm the derivation.
	fprBase := math.Pow(t, 1/(t-1)) * opt.LogicalBloomFPR * (t - 1)
	levelFactor := math.Pow(t, float64(opt.MaxLevels-level))
	b := &Builder{
		file:        f,
		buf:         make([]byte, 0, 4*1024),
		hashEntries: make([]hashEntry, 0, 4*1024),
		bloomFpr:    fprBase / levelFactor,
		compression: opt.CompressionPerLevel[level],
		opt:         opt,
		useSuRF:     level >= opt.SuRFStartLevel,
		// add one byte so the offset would never be 0, so oldOffset is 0 means no old version.
		oldBlock: []byte{0},
	}
	if f != nil {
		b.w = fileutil.NewDirectWriter(f, opt.WriteBufferSize, limiter)
	} else {
		b.w = &inMemWriter{Buffer: bytes.NewBuffer(make([]byte, 0, opt.MaxTableSize))}
	}
	return b
}
   166  
// NewExternalTableBuilder returns a Builder for building an externally
// ingested, file-based table. Unlike NewTableBuilder it always writes to f,
// uses the logical bloom FPR directly, and marks the table as using a global
// timestamp (keys carry no per-key version until ingest).
// Note: oldBlock is left nil here, so external tables get no old-version pad byte.
func NewExternalTableBuilder(f *os.File, limiter *rate.Limiter, opt options.TableBuilderOptions, compression options.CompressionType) *Builder {
	return &Builder{
		file:        f,
		w:           fileutil.NewDirectWriter(f, opt.WriteBufferSize, limiter),
		buf:         make([]byte, 0, 4*1024),
		hashEntries: make([]hashEntry, 0, 4*1024),
		bloomFpr:    opt.LogicalBloomFPR,
		useGlobalTS: true,
		compression: compression,
		opt:         opt,
	}
}
   179  
// Reset this builder with new file. All per-table state is cleared and the
// underlying writer is retargeted at f.
func (b *Builder) Reset(f *os.File) {
	b.file = f
	b.resetBuffers()
	b.w.Reset(f)
}
   186  
// SetIsManaged should be called when ingesting a table into a managed DB.
// It disables the global-timestamp marker set by NewExternalTableBuilder.
func (b *Builder) SetIsManaged() {
	b.useGlobalTS = false
}
   191  
   192  func (b *Builder) resetBuffers() {
   193  	b.counter = 0
   194  	b.buf = b.buf[:0]
   195  	b.writtenLen = 0
   196  	b.rawWrittenLen = 0
   197  	b.baseKeys.reset()
   198  	b.blockEndOffsets = b.blockEndOffsets[:0]
   199  	b.entryEndOffsets = b.entryEndOffsets[:0]
   200  	b.hashEntries = b.hashEntries[:0]
   201  	b.surfKeys = nil
   202  	b.surfVals = nil
   203  	b.smallest.UserKey = b.smallest.UserKey[:0]
   204  	b.biggest.UserKey = b.biggest.UserKey[:0]
   205  	b.oldBlock = b.oldBlock[:0]
   206  }
   207  
// Close closes the TableBuilder. It currently releases nothing; the
// underlying file/writer lifecycle is managed by the caller.
func (b *Builder) Close() {}
   210  
// Empty returns whether it's empty: nothing has been written out and no
// entries are pending in the block under construction.
func (b *Builder) Empty() bool { return b.writtenLen+len(b.buf)+b.tmpKeys.length() == 0 }
   213  
   214  // keyDiff returns the first index at which the two keys are different.
   215  func keyDiffIdx(k1, k2 []byte) int {
   216  	var i int
   217  	for i = 0; i < len(k1) && i < len(k2); i++ {
   218  		if k1[i] != k2[i] {
   219  			break
   220  		}
   221  	}
   222  	return i
   223  }
   224  
// addIndex records key in the table's lookup index (SuRF or hash) and
// maintains the table's smallest/biggest key range. Repeated versions of the
// same user key are indexed only once.
func (b *Builder) addIndex(key y.Key) {
	// smallest is set once, on the first key; biggest follows every new
	// distinct user key.
	if b.smallest.IsEmpty() {
		b.smallest.Copy(key)
	}
	if b.biggest.SameUserKey(key) {
		return
	}
	b.biggest.Copy(key)

	keyHash := farm.Fingerprint64(key.UserKey)
	// It is impossible that a single table contains 16 million keys.
	y.Assert(b.baseKeys.length() < maxBlockCnt)

	// Record the key's location as (block index, entry index within block).
	pos := entryPosition{uint16(b.baseKeys.length()), uint8(b.counter)}
	if b.useSuRF {
		b.surfKeys = append(b.surfKeys, y.SafeCopy(nil, key.UserKey))
		b.surfVals = append(b.surfVals, pos.encode())
	} else {
		b.hashEntries = append(b.hashEntries, hashEntry{pos, keyHash})
	}
}
   246  
// addHelper appends a new (latest-version) entry to the block under
// construction and registers it in the index.
func (b *Builder) addHelper(key y.Key, v y.ValueStruct) {
	// Add key to bloom filter.
	if len(key.UserKey) > 0 {
		b.addIndex(key)
	}
	b.tmpKeys.append(key.UserKey)
	v.Version = key.Version
	b.tmpVals.appendVal(&v)
	// 0 marks "no old versions yet"; addOld overwrites it on demand.
	b.tmpOldOffs = append(b.tmpOldOffs, 0)
	b.counter++
}
   258  
// oldEntry format:
//   numEntries(4) | endOffsets(4 * numEntries) | entries
//
// entry format:
//   version(8) | value
//
// addOld buffers an older version of the key most recently passed to
// addHelper. On the key's first old version it records where that key's
// old-entry record will start in oldBlock; the buffered versions themselves
// are appended to oldBlock later by flushSingleKeyOldVers.
func (b *Builder) addOld(key y.Key, v y.ValueStruct) {
	v.Version = key.Version
	keyIdx := b.tmpKeys.length() - 1
	startOff := b.tmpOldOffs[keyIdx]
	if startOff == 0 {
		// First old version for this key: its record begins at the current
		// end of oldBlock (never 0 thanks to the 1-byte pad).
		startOff = uint32(len(b.oldBlock))
		b.tmpOldOffs[keyIdx] = startOff
	}
	b.singleKeyOldVers.appendVal(&v)
}
   274  
// entryFormat
// no old entry:
//  diffKeyLen(2) | diffKey | 0 | version(8) | value
// has old entry:
//  diffKeyLen(2) | diffKey | 1 | oldOffset(4) | version(8) | value
//
// finishBlock encodes the pending entries into b.buf using the format above,
// compresses and writes the block, records its base key and end offset, and
// resets the per-block state. It is a no-op when no entries are pending.
func (b *Builder) finishBlock() error {
	if b.tmpKeys.length() == 0 {
		return nil
	}
	// Flush old versions buffered for the block's last key.
	if b.singleKeyOldVers.length() > 0 {
		b.flushSingleKeyOldVers()
	}
	// All keys in a sorted block share the prefix common to the first and
	// last key; only the suffix after blockCommonLen is stored per entry.
	firstKey := b.tmpKeys.getEntry(0)
	lastKey := b.tmpKeys.getLast()
	blockCommonLen := keyDiffIdx(firstKey, lastKey)
	for i := 0; i < b.tmpKeys.length(); i++ {
		key := b.tmpKeys.getEntry(i)
		b.buf = appendU16(b.buf, uint16(len(key)-blockCommonLen))
		b.buf = append(b.buf, key[blockCommonLen:]...)
		if b.tmpOldOffs[i] == 0 {
			b.buf = append(b.buf, 0)
		} else {
			b.buf = append(b.buf, 1)
			b.buf = append(b.buf, u32ToBytes(b.tmpOldOffs[i])...)
		}
		b.buf = append(b.buf, b.tmpVals.getEntry(i)...)
		b.entryEndOffsets = append(b.entryEndOffsets, uint32(len(b.buf)))
	}
	// Block footer: entry end offsets | numEntries(4) | commonPrefixLen(2).
	b.buf = append(b.buf, u32SliceToBytes(b.entryEndOffsets)...)
	b.buf = append(b.buf, u32ToBytes(uint32(len(b.entryEndOffsets)))...)
	b.buf = appendU16(b.buf, uint16(blockCommonLen))

	// Add base key.
	b.baseKeys.append(firstKey)

	before := b.w.Offset()
	if err := b.compression.Compress(b.w, b.buf); err != nil {
		return err
	}
	size := b.w.Offset() - before
	// blockEndOffsets records the cumulative compressed end offset of each block.
	b.blockEndOffsets = append(b.blockEndOffsets, uint32(b.writtenLen+int(size)))
	b.writtenLen += int(size)
	b.rawWrittenLen += len(b.buf)

	// Reset the block for the next build.
	b.entryEndOffsets = b.entryEndOffsets[:0]
	b.counter = 0
	b.buf = b.buf[:0]
	b.tmpKeys.reset()
	b.tmpVals.reset()
	b.tmpOldOffs = b.tmpOldOffs[:0]
	return nil
}
   328  
// Add adds a key-value pair to the block.
// A call whose user key equals the previously added key is treated as an
// older version and routed to the old-versions block instead of the main
// block. Keys must therefore arrive sorted, newest version first.
func (b *Builder) Add(key y.Key, value y.ValueStruct) error {
	var lastUserKey []byte
	if b.tmpKeys.length() > 0 {
		lastUserKey = b.tmpKeys.getLast()
	}
	// Check old before check finish block, so two blocks never have the same key.
	if bytes.Equal(lastUserKey, key.UserKey) {
		b.addOld(key, value)
		return nil
	} else if b.singleKeyOldVers.length() > 0 {
		b.flushSingleKeyOldVers()
	}
	if b.shouldFinishBlock() {
		if err := b.finishBlock(); err != nil {
			return err
		}
	}
	b.addHelper(key, value)
	return nil // Currently, there is no meaningful error.
}
   351  
// flushSingleKeyOldVers appends the old versions buffered for a single key
// to oldBlock in the oldEntry format (numEntries | endOffsets | entries) and
// clears the buffer for the next key.
func (b *Builder) flushSingleKeyOldVers() {
	// numEntries
	b.oldBlock = append(b.oldBlock, u32ToBytes(uint32(b.singleKeyOldVers.length()))...)
	// endOffsets
	b.oldBlock = append(b.oldBlock, u32SliceToBytes(b.singleKeyOldVers.endOffs)...)
	// entries
	b.oldBlock = append(b.oldBlock, b.singleKeyOldVers.data...)
	b.singleKeyOldVers.reset()
}
   361  
   362  func (b *Builder) shouldFinishBlock() bool {
   363  	// If there is no entry till now, we will return false.
   364  	if b.tmpKeys.length() == 0 {
   365  		return false
   366  	}
   367  	return uint32(b.tmpKeys.size()+b.tmpVals.size()) > uint32(b.opt.BlockSize)
   368  }
   369  
   370  // ReachedCapacity returns true if we... roughly (?) reached capacity?
   371  func (b *Builder) ReachedCapacity(capacity int64) bool {
   372  	estimateSz := b.rawWrittenLen + len(b.buf) +
   373  		4*len(b.blockEndOffsets) + b.baseKeys.size() + len(b.oldBlock)
   374  	return int64(estimateSz) > capacity
   375  }
   376  
   377  // EstimateSize returns the size of the SST to build.
   378  func (b *Builder) EstimateSize() int {
   379  	size := b.rawWrittenLen + len(b.buf) + 4*len(b.blockEndOffsets) + b.baseKeys.size() + len(b.oldBlock)
   380  	if !b.useSuRF {
   381  		size += 3 * int(float32(len(b.hashEntries))/b.opt.HashUtilRatio)
   382  	}
   383  	return size
   384  }
   385  
// Section IDs of the table's meta/index data. Each section is written by
// metaEncoder as id(1) | len(4) | payload.
const (
	idSmallest        byte = iota // smallest user key in the table
	idBiggest                     // biggest user key in the table
	idBaseKeysEndOffs             // end offsets of the base-key entries
	idBaseKeys                    // first key of every block
	idBlockEndOffsets             // cumulative end offset of every data block
	idBloomFilter                 // bloom filter (empty when SuRF is used)
	idHashIndex                   // hash index (empty when SuRF is used)
	idSuRFIndex                   // SuRF index (empty when hash index is used)
	idOldBlockLen                 // length of the old-versions block (emitted only when non-empty)
)
   397  
// BuildResult contains the build result info, if it's file based compaction, fileName should be used to open Table.
// If it's in memory compaction, FileData and IndexData contains the data.
type BuildResult struct {
	FileName  string // path of the data file (file-based build only)
	FileData  []byte // table data (in-memory build only)
	IndexData []byte // index/meta data (in-memory build only)
}
   405  
// Finish finishes the table by appending the index.
// It flushes the final data block and the old-versions block to the data
// destination, then writes the meta sections (key range, base keys, block
// offsets, bloom/hash or SuRF index) either to a side-car index file
// (file-based build) or into IndexData (in-memory build).
func (b *Builder) Finish() (*BuildResult, error) {
	err := b.finishBlock() // This will never start a new block.
	if err != nil {
		return nil, err
	}
	// oldBlock's first byte is the pad; anything past it is real data.
	if len(b.oldBlock) > 1 {
		_, err = b.w.Write(b.oldBlock)
		if err != nil {
			return nil, err
		}
	}
	if err = b.w.Finish(); err != nil {
		return nil, err
	}
	result := new(BuildResult)
	if b.file != nil {
		// Retarget the writer at the side-car index file.
		idxFile, err := y.OpenTruncFile(IndexFilename(b.file.Name()), false)
		if err != nil {
			return nil, err
		}
		result.FileName = b.file.Name()
		b.w.Reset(idxFile)
	} else {
		result.FileData = y.Copy(b.w.(*inMemWriter).Bytes())
		b.w.Reset(nil)
	}

	// Don't compress the global ts, because it may be updated during ingest.
	ts := uint64(0)
	if b.useGlobalTS {
		// External builder doesn't append ts to the keys; the output sst should have a non-zero global ts.
		ts = 1
	}

	// b.buf is empty here (finishBlock resets it); reuse its capacity for
	// the meta sections.
	encoder := newMetaEncoder(b.buf, b.compression, ts)
	encoder.append(b.smallest.UserKey, idSmallest)
	encoder.append(b.biggest.UserKey, idBiggest)
	encoder.append(u32SliceToBytes(b.baseKeys.endOffs), idBaseKeysEndOffs)
	encoder.append(b.baseKeys.data, idBaseKeys)
	encoder.append(u32SliceToBytes(b.blockEndOffsets), idBlockEndOffsets)
	if len(b.oldBlock) > 1 {
		encoder.append(u32ToBytes(uint32(len(b.oldBlock))), idOldBlockLen)
	}

	// Bloom filter and hash index are built only when SuRF is not in use;
	// their sections are still emitted (possibly empty).
	var bloomFilter []byte
	if !b.useSuRF {
		bf := bbloom.New(float64(len(b.hashEntries)), b.bloomFpr)
		for _, he := range b.hashEntries {
			bf.Add(he.hash)
		}
		bloomFilter = bf.BinaryMarshal()
	}
	encoder.append(bloomFilter, idBloomFilter)

	var hashIndex []byte
	if !b.useSuRF {
		hashIndex = buildHashIndex(b.hashEntries, b.opt.HashUtilRatio)
	}
	encoder.append(hashIndex, idHashIndex)

	var surfIndex []byte
	if b.useSuRF && len(b.surfKeys) > 0 {
		hl := uint32(b.opt.SuRFOptions.HashSuffixLen)
		rl := uint32(b.opt.SuRFOptions.RealSuffixLen)
		sb := surf.NewBuilder(3, hl, rl)
		sf := sb.Build(b.surfKeys, b.surfVals, b.opt.SuRFOptions.BitsPerKeyHint)
		surfIndex = sf.Marshal()
	}
	encoder.append(surfIndex, idSuRFIndex)

	if err = encoder.finish(b.w); err != nil {
		return nil, err
	}

	if err = b.w.Finish(); err != nil {
		return nil, err
	}
	if b.file == nil {
		result.IndexData = y.Copy(b.w.(*inMemWriter).Bytes())
	}
	return result, nil
}
   489  
   490  func appendU16(buf []byte, v uint16) []byte {
   491  	return append(buf, byte(v), byte(v>>8))
   492  }
   493  
   494  func u32ToBytes(v uint32) []byte {
   495  	var uBuf [4]byte
   496  	binary.LittleEndian.PutUint32(uBuf[:], v)
   497  	return uBuf[:]
   498  }
   499  
   500  func u64ToBytes(v uint64) []byte {
   501  	var uBuf [8]byte
   502  	binary.LittleEndian.PutUint64(uBuf[:], v)
   503  	return uBuf[:]
   504  }
   505  
// u32SliceToBytes reinterprets u32s as a byte slice without copying.
// The result aliases u32s' memory: mutating one view mutates the other, and
// the caller must keep u32s reachable while the byte view is in use. The
// byte order is the host's native order; the rest of this file encodes
// little-endian, so presumably only little-endian hosts are supported —
// TODO confirm.
// NOTE(review): populating a reflect.SliceHeader by hand is flagged by
// go vet's unsafeptr check; unsafe.Slice is the modern replacement —
// confirm the module's minimum Go version before changing.
func u32SliceToBytes(u32s []uint32) []byte {
	if len(u32s) == 0 {
		return nil
	}
	var b []byte
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&b))
	hdr.Len = len(u32s) * 4
	hdr.Cap = hdr.Len
	hdr.Data = uintptr(unsafe.Pointer(&u32s[0]))
	return b
}
   517  
// bytesToU32Slice reinterprets b as a []uint32 without copying — the inverse
// of u32SliceToBytes. The result aliases b's memory, so b must stay
// reachable while the uint32 view is in use, and len(b) is assumed to be a
// multiple of 4 (a trailing remainder is silently dropped by the integer
// division). Byte order is the host's native order — see u32SliceToBytes.
// NOTE(review): hand-built reflect.SliceHeader is flagged by go vet's
// unsafeptr check; unsafe.Slice is the modern replacement — confirm the
// module's minimum Go version before changing.
func bytesToU32Slice(b []byte) []uint32 {
	if len(b) == 0 {
		return nil
	}
	var u32s []uint32
	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&u32s))
	hdr.Len = len(b) / 4
	hdr.Cap = hdr.Len
	hdr.Data = uintptr(unsafe.Pointer(&b[0]))
	return u32s
}
   529  
// bytesToU32 decodes the first 4 bytes of b as a little-endian uint32.
func bytesToU32(b []byte) uint32 {
	return binary.LittleEndian.Uint32(b)
}
   533  
// bytesToU64 decodes the first 8 bytes of b as a little-endian uint64.
func bytesToU64(b []byte) uint64 {
	return binary.LittleEndian.Uint64(b)
}
   537  
// metaEncoder serializes the table's meta/index sections into a single
// buffer: a 9-byte header (global ts + compression byte) followed by
// id/len/payload sections.
type metaEncoder struct {
	buf         []byte
	compression options.CompressionType
}
   542  
// newMetaEncoder starts a meta encoder on top of buf (whose capacity is
// reused). It writes the header: globalTS(8, little-endian) | compression(1).
// Sections are added with append; finish decides what gets compressed.
func newMetaEncoder(buf []byte, compression options.CompressionType, globalTS uint64) *metaEncoder {
	buf = append(buf, u64ToBytes(globalTS)...)
	buf = append(buf, byte(compression))
	return &metaEncoder{
		buf:         buf,
		compression: compression,
	}
}
   551  
   552  func (e *metaEncoder) append(d []byte, id byte) {
   553  	e.buf = append(e.buf, id)
   554  	e.buf = append(e.buf, u32ToBytes(uint32(len(d)))...)
   555  	e.buf = append(e.buf, d...)
   556  }
   557  
// finish writes the encoded meta data to w. The 9-byte header (global ts +
// compression byte, written by newMetaEncoder) is always stored uncompressed
// so readers can interpret it before decompressing; the section payload that
// follows is compressed when a compression type is configured.
func (e *metaEncoder) finish(w tableWriter) error {
	if e.compression == options.None {
		_, err := w.Write(e.buf)
		return err
	}

	// e.buf[:9] is the uncompressed header; the rest is the section payload.
	if _, err := w.Write(e.buf[:9]); err != nil {
		return err
	}
	return e.compression.Compress(w, e.buf[9:])
}
   569  
// metaDecoder iterates over the meta sections produced by metaEncoder.
// buf holds the (already decompressed) section payload; cursor points at the
// id byte of the current section.
type metaDecoder struct {
	buf         []byte
	globalTS    uint64
	compression options.CompressionType

	cursor int
}
   577  
// newMetaDecoder parses meta data written by metaEncoder: it reads the
// 9-byte header (globalTS + compression byte) and decompresses the section
// payload when a compression type is recorded. buf must be at least 9 bytes.
func newMetaDecoder(buf []byte) (*metaDecoder, error) {
	globalTS := bytesToU64(buf[:8])
	compression := options.CompressionType(buf[8])
	buf = buf[9:]
	if compression != options.None {
		buf1, err := compression.Decompress(buf)
		if err != nil {
			return nil, err
		}
		buf = buf1
	}
	return &metaDecoder{
		buf:         buf,
		globalTS:    globalTS,
		compression: compression,
	}, nil
}
   595  
// valid reports whether the cursor still points at a section.
func (e *metaDecoder) valid() bool {
	return e.cursor < len(e.buf)
}
   599  
// currentId returns the id byte of the section at the cursor.
func (e *metaDecoder) currentId() byte {
	return e.buf[e.cursor]
}
   603  
   604  func (e *metaDecoder) decode() []byte {
   605  	cursor := e.cursor + 1
   606  	l := int(bytesToU32(e.buf[cursor:]))
   607  	cursor += 4
   608  	d := e.buf[cursor : cursor+l]
   609  	return d
   610  }
   611  
// next advances the cursor past the current section:
// 1 id byte + 4 length bytes + payload.
func (e *metaDecoder) next() {
	l := int(bytesToU32(e.buf[e.cursor+1:]))
	e.cursor += 1 + 4 + l
}