github.com/coocood/badger@v1.5.1-0.20200528065104-c02ac3616d04/table/sstable/builder.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package sstable
    18  
import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math"
	"os"
	"reflect"
	"unsafe"

	"github.com/coocood/badger/fileutil"
	"github.com/coocood/badger/options"
	"github.com/coocood/badger/surf"
	"github.com/coocood/badger/y"
	"github.com/coocood/bbloom"
	"github.com/dgryski/go-farm"
	"golang.org/x/time/rate"
)
    35  
// entrySlice packs a sequence of variable-length byte entries into a single
// contiguous buffer, remembering where each entry ends.
type entrySlice struct {
	data    []byte   // bytes of all entries, concatenated in append order
	endOffs []uint32 // endOffs[i] is the offset in data just past entry i
}
    40  
    41  func (es *entrySlice) append(entry []byte) {
    42  	es.data = append(es.data, entry...)
    43  	es.endOffs = append(es.endOffs, uint32(len(es.data)))
    44  }
    45  
    46  func (es *entrySlice) appendVal(val *y.ValueStruct) {
    47  	es.data = val.EncodeTo(es.data)
    48  	es.endOffs = append(es.endOffs, uint32(len(es.data)))
    49  }
    50  
    51  func (es *entrySlice) getLast() []byte {
    52  	return es.getEntry(es.length() - 1)
    53  }
    54  
    55  func (es *entrySlice) getEntry(i int) []byte {
    56  	var startOff uint32
    57  	if i > 0 {
    58  		startOff = es.endOffs[i-1]
    59  	}
    60  	endOff := es.endOffs[i]
    61  	return es.data[startOff:endOff]
    62  }
    63  
    64  func (es *entrySlice) length() int {
    65  	return len(es.endOffs)
    66  }
    67  
    68  func (es *entrySlice) size() int {
    69  	return len(es.data) + 4*len(es.endOffs)
    70  }
    71  
    72  func (es *entrySlice) reset() {
    73  	es.data = es.data[:0]
    74  	es.endOffs = es.endOffs[:0]
    75  }
    76  
// headerSize is the size of a header, presumably in bytes.
// NOTE(review): nothing in this part of the file references it — confirm
// which header it describes and whether it is still used.
const headerSize = 4
    78  
// Builder is used in building a table.
//
// Entries of the block under construction are staged in tmpKeys, tmpVals and
// tmpOldOffs. finishBlock serializes the staged entries into buf and passes
// the block to w; Finish then writes the index into a separate file.
type Builder struct {
	counter int // Number of keys written for the current block.

	idxFileName   string                  // name of the index file that Finish creates
	w             *fileutil.DirectWriter  // output writer: the table file first, the index file during Finish
	buf           []byte                  // scratch buffer for the serialized block; Finish reuses it for the index
	writtenLen    int                     // bytes w advanced by for all finished blocks (after compression)
	rawWrittenLen int                     // bytes of all finished blocks before compression
	compression   options.CompressionType // compression used for blocks and for the index body

	baseKeys entrySlice // first key of every finished block

	blockEndOffsets []uint32 // End offset of every finished block, counted in bytes written to w.

	// end offsets of every entry within the current block being built.
	// The offsets are relative to the start of the block.
	entryEndOffsets []uint32

	smallest y.Key // first key passed to addIndex
	biggest  y.Key // latest key with a new user key passed to addIndex

	hashEntries []hashEntry                 // position and hash per distinct user key; filled only when SuRF is off
	bloomFpr    float64                     // false positive rate given to the bloom filter in Finish
	useGlobalTS bool                        // when set, Finish writes a global ts of 1 instead of 0
	opt         options.TableBuilderOptions // options supplied at construction
	useSuRF     bool                        // build a SuRF index instead of bloom filter + hash index

	surfKeys [][]byte // copies of the distinct user keys; filled only when SuRF is on
	surfVals [][]byte // encoded entry position for each element of surfKeys

	tmpKeys    entrySlice // user keys staged for the current block
	tmpVals    entrySlice // encoded values staged for the current block, parallel to tmpKeys
	tmpOldOffs []uint32   // per staged key: offset of its old versions in oldBlock, 0 if it has none

	singleKeyOldVers entrySlice // old versions collected for the last staged key, not yet flushed
	oldBlock         []byte     // serialized old versions of all keys; Finish appends it after the blocks
}
   117  
   118  // NewTableBuilder makes a new TableBuilder.
   119  // If the limiter is nil, the write speed during table build will not be limited.
   120  func NewTableBuilder(f *os.File, limiter *rate.Limiter, level int, opt options.TableBuilderOptions) *Builder {
   121  	t := float64(opt.LevelSizeMultiplier)
   122  	fprBase := math.Pow(t, 1/(t-1)) * opt.LogicalBloomFPR * (t - 1)
   123  	levelFactor := math.Pow(t, float64(opt.MaxLevels-level))
   124  
   125  	return &Builder{
   126  		idxFileName: f.Name() + idxFileSuffix,
   127  		w:           fileutil.NewDirectWriter(f, opt.WriteBufferSize, limiter),
   128  		buf:         make([]byte, 0, 4*1024),
   129  		hashEntries: make([]hashEntry, 0, 4*1024),
   130  		bloomFpr:    fprBase / levelFactor,
   131  		compression: opt.CompressionPerLevel[level],
   132  		opt:         opt,
   133  		useSuRF:     level >= opt.SuRFStartLevel,
   134  		// add one byte so the offset would never be 0, so oldOffset is 0 means no old version.
   135  		oldBlock: []byte{0},
   136  	}
   137  }
   138  
   139  func NewExternalTableBuilder(f *os.File, limiter *rate.Limiter, opt options.TableBuilderOptions, compression options.CompressionType) *Builder {
   140  	return &Builder{
   141  		idxFileName: f.Name() + idxFileSuffix,
   142  		w:           fileutil.NewDirectWriter(f, opt.WriteBufferSize, limiter),
   143  		buf:         make([]byte, 0, 4*1024),
   144  		hashEntries: make([]hashEntry, 0, 4*1024),
   145  		bloomFpr:    opt.LogicalBloomFPR,
   146  		useGlobalTS: true,
   147  		compression: compression,
   148  		opt:         opt,
   149  	}
   150  }
   151  
   152  // Reset this builder with new file.
   153  func (b *Builder) Reset(f *os.File) {
   154  	b.resetBuffers()
   155  	b.w.Reset(f)
   156  	b.idxFileName = f.Name() + idxFileSuffix
   157  }
   158  
   159  // SetIsManaged should be called when ingesting a table into a managed DB.
   160  func (b *Builder) SetIsManaged() {
   161  	b.useGlobalTS = false
   162  }
   163  
   164  func (b *Builder) resetBuffers() {
   165  	b.counter = 0
   166  	b.buf = b.buf[:0]
   167  	b.writtenLen = 0
   168  	b.rawWrittenLen = 0
   169  	b.baseKeys.reset()
   170  	b.blockEndOffsets = b.blockEndOffsets[:0]
   171  	b.entryEndOffsets = b.entryEndOffsets[:0]
   172  	b.hashEntries = b.hashEntries[:0]
   173  	b.surfKeys = nil
   174  	b.surfVals = nil
   175  	b.smallest.UserKey = b.smallest.UserKey[:0]
   176  	b.biggest.UserKey = b.biggest.UserKey[:0]
   177  	b.oldBlock = b.oldBlock[:0]
   178  }
   179  
   180  // Close closes the TableBuilder.
   181  func (b *Builder) Close() {}
   182  
   183  // Empty returns whether it's empty.
   184  func (b *Builder) Empty() bool { return b.writtenLen+len(b.buf)+b.tmpKeys.length() == 0 }
   185  
   186  // keyDiff returns the first index at which the two keys are different.
   187  func keyDiffIdx(k1, k2 []byte) int {
   188  	var i int
   189  	for i = 0; i < len(k1) && i < len(k2); i++ {
   190  		if k1[i] != k2[i] {
   191  			break
   192  		}
   193  	}
   194  	return i
   195  }
   196  
   197  func (b *Builder) addIndex(key y.Key) {
   198  	if b.smallest.IsEmpty() {
   199  		b.smallest.Copy(key)
   200  	}
   201  	if b.biggest.SameUserKey(key) {
   202  		return
   203  	}
   204  	b.biggest.Copy(key)
   205  
   206  	keyHash := farm.Fingerprint64(key.UserKey)
   207  	// It is impossible that a single table contains 16 million keys.
   208  	y.Assert(b.baseKeys.length() < maxBlockCnt)
   209  
   210  	pos := entryPosition{uint16(b.baseKeys.length()), uint8(b.counter)}
   211  	if b.useSuRF {
   212  		b.surfKeys = append(b.surfKeys, y.SafeCopy(nil, key.UserKey))
   213  		b.surfVals = append(b.surfVals, pos.encode())
   214  	} else {
   215  		b.hashEntries = append(b.hashEntries, hashEntry{pos, keyHash})
   216  	}
   217  }
   218  
   219  func (b *Builder) addHelper(key y.Key, v y.ValueStruct) {
   220  	// Add key to bloom filter.
   221  	if len(key.UserKey) > 0 {
   222  		b.addIndex(key)
   223  	}
   224  	b.tmpKeys.append(key.UserKey)
   225  	v.Version = key.Version
   226  	b.tmpVals.appendVal(&v)
   227  	b.tmpOldOffs = append(b.tmpOldOffs, 0)
   228  	b.counter++
   229  }
   230  
   231  // oldEntry format:
   232  //   numEntries(4) | endOffsets(4 * numEntries) | entries
   233  //
   234  // entry format:
   235  //   version(8) | value
   236  func (b *Builder) addOld(key y.Key, v y.ValueStruct) {
   237  	v.Version = key.Version
   238  	keyIdx := b.tmpKeys.length() - 1
   239  	startOff := b.tmpOldOffs[keyIdx]
   240  	if startOff == 0 {
   241  		startOff = uint32(len(b.oldBlock))
   242  		b.tmpOldOffs[keyIdx] = startOff
   243  	}
   244  	b.singleKeyOldVers.appendVal(&v)
   245  }
   246  
   247  // entryFormat
   248  // no old entry:
   249  //  diffKeyLen(2) | diffKey | 0 | version(8) | value
   250  // has old entry:
   251  //  diffKeyLen(2) | diffKey | 1 | oldOffset(4) | version(8) | value
   252  func (b *Builder) finishBlock() error {
   253  	if b.tmpKeys.length() == 0 {
   254  		return nil
   255  	}
   256  	if b.singleKeyOldVers.length() > 0 {
   257  		b.flushSingleKeyOldVers()
   258  	}
   259  	firstKey := b.tmpKeys.getEntry(0)
   260  	lastKey := b.tmpKeys.getLast()
   261  	blockCommonLen := keyDiffIdx(firstKey, lastKey)
   262  	for i := 0; i < b.tmpKeys.length(); i++ {
   263  		key := b.tmpKeys.getEntry(i)
   264  		b.buf = appendU16(b.buf, uint16(len(key)-blockCommonLen))
   265  		b.buf = append(b.buf, key[blockCommonLen:]...)
   266  		if b.tmpOldOffs[i] == 0 {
   267  			b.buf = append(b.buf, 0)
   268  		} else {
   269  			b.buf = append(b.buf, 1)
   270  			b.buf = append(b.buf, u32ToBytes(b.tmpOldOffs[i])...)
   271  		}
   272  		b.buf = append(b.buf, b.tmpVals.getEntry(i)...)
   273  		b.entryEndOffsets = append(b.entryEndOffsets, uint32(len(b.buf)))
   274  	}
   275  	b.buf = append(b.buf, u32SliceToBytes(b.entryEndOffsets)...)
   276  	b.buf = append(b.buf, u32ToBytes(uint32(len(b.entryEndOffsets)))...)
   277  	b.buf = appendU16(b.buf, uint16(blockCommonLen))
   278  
   279  	// Add base key.
   280  	b.baseKeys.append(firstKey)
   281  
   282  	before := b.w.Offset()
   283  	if err := b.compression.Compress(b.w, b.buf); err != nil {
   284  		return err
   285  	}
   286  	size := b.w.Offset() - before
   287  	b.blockEndOffsets = append(b.blockEndOffsets, uint32(b.writtenLen+int(size)))
   288  	b.writtenLen += int(size)
   289  	b.rawWrittenLen += len(b.buf)
   290  
   291  	// Reset the block for the next build.
   292  	b.entryEndOffsets = b.entryEndOffsets[:0]
   293  	b.counter = 0
   294  	b.buf = b.buf[:0]
   295  	b.tmpKeys.reset()
   296  	b.tmpVals.reset()
   297  	b.tmpOldOffs = b.tmpOldOffs[:0]
   298  	return nil
   299  }
   300  
   301  // Add adds a key-value pair to the block.
   302  // If doNotRestart is true, we will not restart even if b.counter >= restartInterval.
   303  func (b *Builder) Add(key y.Key, value y.ValueStruct) error {
   304  	var lastUserKey []byte
   305  	if b.tmpKeys.length() > 0 {
   306  		lastUserKey = b.tmpKeys.getLast()
   307  	}
   308  	// Check old before check finish block, so two blocks never have the same key.
   309  	if bytes.Equal(lastUserKey, key.UserKey) {
   310  		b.addOld(key, value)
   311  		return nil
   312  	} else if b.singleKeyOldVers.length() > 0 {
   313  		b.flushSingleKeyOldVers()
   314  	}
   315  	if b.shouldFinishBlock() {
   316  		if err := b.finishBlock(); err != nil {
   317  			return err
   318  		}
   319  	}
   320  	b.addHelper(key, value)
   321  	return nil // Currently, there is no meaningful error.
   322  }
   323  
   324  func (b *Builder) flushSingleKeyOldVers() {
   325  	// numEntries
   326  	b.oldBlock = append(b.oldBlock, u32ToBytes(uint32(b.singleKeyOldVers.length()))...)
   327  	// endOffsets
   328  	b.oldBlock = append(b.oldBlock, u32SliceToBytes(b.singleKeyOldVers.endOffs)...)
   329  	// entries
   330  	b.oldBlock = append(b.oldBlock, b.singleKeyOldVers.data...)
   331  	b.singleKeyOldVers.reset()
   332  }
   333  
   334  func (b *Builder) shouldFinishBlock() bool {
   335  	// If there is no entry till now, we will return false.
   336  	if b.tmpKeys.length() == 0 {
   337  		return false
   338  	}
   339  	return uint32(b.tmpKeys.size()+b.tmpVals.size()) > uint32(b.opt.BlockSize)
   340  }
   341  
   342  // ReachedCapacity returns true if we... roughly (?) reached capacity?
   343  func (b *Builder) ReachedCapacity(capacity int64) bool {
   344  	estimateSz := b.rawWrittenLen + len(b.buf) +
   345  		4*len(b.blockEndOffsets) + b.baseKeys.size() + len(b.oldBlock)
   346  	return int64(estimateSz) > capacity
   347  }
   348  
   349  // EstimateSize returns the size of the SST to build.
   350  func (b *Builder) EstimateSize() int {
   351  	size := b.rawWrittenLen + len(b.buf) + 4*len(b.blockEndOffsets) + b.baseKeys.size() + len(b.oldBlock)
   352  	if !b.useSuRF {
   353  		size += 3 * int(float32(len(b.hashEntries))/b.opt.HashUtilRatio)
   354  	}
   355  	return size
   356  }
   357  
// Section identifiers of the index file. Finish passes one of them to
// metaEncoder.append for every section, and append stores it as the first
// byte of that section. The values come from iota, so reordering or inserting
// entries changes the identifiers that get written.
const (
	idSmallest        byte = iota // smallest user key of the table
	idBiggest                     // biggest user key of the table
	idBaseKeysEndOffs             // end offsets of the base keys
	idBaseKeys                    // concatenated base keys, one per block
	idBlockEndOffsets             // end offset of every block
	idBloomFilter                 // marshaled bloom filter (empty when SuRF is used)
	idHashIndex                   // hash index (empty when SuRF is used)
	idSuRFIndex                   // marshaled SuRF index (empty when SuRF is not used)
	idOldBlockLen                 // length of the old version block; written only when old versions exist
)
   369  
   370  // Finish finishes the table by appending the index.
   371  func (b *Builder) Finish() error {
   372  	err := b.finishBlock() // This will never start a new block.
   373  	if err != nil {
   374  		return err
   375  	}
   376  	if len(b.oldBlock) > 1 {
   377  		err = b.w.Append(b.oldBlock)
   378  		if err != nil {
   379  			return err
   380  		}
   381  	}
   382  	if err = b.w.Finish(); err != nil {
   383  		return err
   384  	}
   385  	idxFile, err := y.OpenTruncFile(b.idxFileName, false)
   386  	if err != nil {
   387  		return err
   388  	}
   389  	b.w.Reset(idxFile)
   390  
   391  	// Don't compress the global ts, because it may be updated during ingest.
   392  	ts := uint64(0)
   393  	if b.useGlobalTS {
   394  		// External builder doesn't append ts to the keys, the output sst should has a non-zero global ts.
   395  		ts = 1
   396  	}
   397  
   398  	encoder := newMetaEncoder(b.buf, b.compression, ts)
   399  	encoder.append(b.smallest.UserKey, idSmallest)
   400  	encoder.append(b.biggest.UserKey, idBiggest)
   401  	encoder.append(u32SliceToBytes(b.baseKeys.endOffs), idBaseKeysEndOffs)
   402  	encoder.append(b.baseKeys.data, idBaseKeys)
   403  	encoder.append(u32SliceToBytes(b.blockEndOffsets), idBlockEndOffsets)
   404  	if len(b.oldBlock) > 1 {
   405  		encoder.append(u32ToBytes(uint32(len(b.oldBlock))), idOldBlockLen)
   406  	}
   407  
   408  	var bloomFilter []byte
   409  	if !b.useSuRF {
   410  		bf := bbloom.New(float64(len(b.hashEntries)), b.bloomFpr)
   411  		for _, he := range b.hashEntries {
   412  			bf.Add(he.hash)
   413  		}
   414  		bloomFilter = bf.BinaryMarshal()
   415  	}
   416  	encoder.append(bloomFilter, idBloomFilter)
   417  
   418  	var hashIndex []byte
   419  	if !b.useSuRF {
   420  		hashIndex = buildHashIndex(b.hashEntries, b.opt.HashUtilRatio)
   421  	}
   422  	encoder.append(hashIndex, idHashIndex)
   423  
   424  	var surfIndex []byte
   425  	if b.useSuRF && len(b.surfKeys) > 0 {
   426  		hl := uint32(b.opt.SuRFOptions.HashSuffixLen)
   427  		rl := uint32(b.opt.SuRFOptions.RealSuffixLen)
   428  		sb := surf.NewBuilder(3, hl, rl)
   429  		sf := sb.Build(b.surfKeys, b.surfVals, b.opt.SuRFOptions.BitsPerKeyHint)
   430  		surfIndex = sf.Marshal()
   431  	}
   432  	encoder.append(surfIndex, idSuRFIndex)
   433  
   434  	if err := encoder.finish(b.w); err != nil {
   435  		return err
   436  	}
   437  
   438  	return b.w.Finish()
   439  }
   440  
   441  func appendU16(buf []byte, v uint16) []byte {
   442  	return append(buf, byte(v), byte(v>>8))
   443  }
   444  
   445  func u32ToBytes(v uint32) []byte {
   446  	var uBuf [4]byte
   447  	binary.LittleEndian.PutUint32(uBuf[:], v)
   448  	return uBuf[:]
   449  }
   450  
   451  func u64ToBytes(v uint64) []byte {
   452  	var uBuf [8]byte
   453  	binary.LittleEndian.PutUint64(uBuf[:], v)
   454  	return uBuf[:]
   455  }
   456  
   457  func u32SliceToBytes(u32s []uint32) []byte {
   458  	if len(u32s) == 0 {
   459  		return nil
   460  	}
   461  	var b []byte
   462  	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&b))
   463  	hdr.Len = len(u32s) * 4
   464  	hdr.Cap = hdr.Len
   465  	hdr.Data = uintptr(unsafe.Pointer(&u32s[0]))
   466  	return b
   467  }
   468  
   469  func bytesToU32Slice(b []byte) []uint32 {
   470  	if len(b) == 0 {
   471  		return nil
   472  	}
   473  	var u32s []uint32
   474  	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&u32s))
   475  	hdr.Len = len(b) / 4
   476  	hdr.Cap = hdr.Len
   477  	hdr.Data = uintptr(unsafe.Pointer(&b[0]))
   478  	return u32s
   479  }
   480  
   481  func bytesToU32(b []byte) uint32 {
   482  	return binary.LittleEndian.Uint32(b)
   483  }
   484  
   485  func bytesToU64(b []byte) uint64 {
   486  	return binary.LittleEndian.Uint64(b)
   487  }
   488  
// metaEncoder assembles the contents of an index file in memory: a 9-byte
// header (global ts and compression type) followed by id-tagged sections.
type metaEncoder struct {
	buf         []byte                  // header followed by the sections appended so far
	compression options.CompressionType // compression applied to everything after the header in finish
}
   493  
   494  func newMetaEncoder(buf []byte, compression options.CompressionType, globalTS uint64) *metaEncoder {
   495  	buf = append(buf, u64ToBytes(globalTS)...)
   496  	buf = append(buf, byte(compression))
   497  	return &metaEncoder{
   498  		buf:         buf,
   499  		compression: compression,
   500  	}
   501  }
   502  
   503  func (e *metaEncoder) append(d []byte, id byte) {
   504  	e.buf = append(e.buf, id)
   505  	e.buf = append(e.buf, u32ToBytes(uint32(len(d)))...)
   506  	e.buf = append(e.buf, d...)
   507  }
   508  
   509  func (e *metaEncoder) finish(w *fileutil.DirectWriter) error {
   510  	if e.compression == options.None {
   511  		return w.Append(e.buf)
   512  	}
   513  
   514  	if err := w.Append(e.buf[:9]); err != nil {
   515  		return err
   516  	}
   517  	return e.compression.Compress(w, e.buf[9:])
   518  }
   519  
// metaDecoder walks the id-tagged sections of an index produced by
// metaEncoder.
type metaDecoder struct {
	buf         []byte                  // sections following the 9-byte header, decompressed if needed
	globalTS    uint64                  // value read from the first eight bytes of the index
	compression options.CompressionType // compression type read from the ninth byte of the index

	cursor int // offset in buf of the current section's id byte
}
   527  
   528  func newMetaDecoder(buf []byte) (*metaDecoder, error) {
   529  	globalTS := bytesToU64(buf[:8])
   530  	compression := options.CompressionType(buf[8])
   531  	buf = buf[9:]
   532  	if compression != options.None {
   533  		buf1, err := compression.Decompress(buf)
   534  		if err != nil {
   535  			return nil, err
   536  		}
   537  		buf = buf1
   538  	}
   539  	return &metaDecoder{
   540  		buf:         buf,
   541  		globalTS:    globalTS,
   542  		compression: compression,
   543  	}, nil
   544  }
   545  
   546  func (e *metaDecoder) valid() bool {
   547  	return e.cursor < len(e.buf)
   548  }
   549  
   550  func (e *metaDecoder) currentId() byte {
   551  	return e.buf[e.cursor]
   552  }
   553  
   554  func (e *metaDecoder) decode() []byte {
   555  	cursor := e.cursor + 1
   556  	l := int(bytesToU32(e.buf[cursor:]))
   557  	cursor += 4
   558  	d := e.buf[cursor : cursor+l]
   559  	return d
   560  }
   561  
   562  func (e *metaDecoder) next() {
   563  	l := int(bytesToU32(e.buf[e.cursor+1:]))
   564  	e.cursor += 1 + 4 + l
   565  }