github.com/pingcap/badger@v1.5.1-0.20230103063557-828f39b09b6d/table/sstable/table.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package sstable
    18  
    19  import (
    20  	"fmt"
    21  	"io"
    22  	"math"
    23  	"os"
    24  	"path"
    25  	"path/filepath"
    26  	"strconv"
    27  	"strings"
    28  	"sync"
    29  	"sync/atomic"
    30  	"unsafe"
    31  
    32  	"github.com/coocood/bbloom"
    33  	"github.com/pingcap/badger/buffer"
    34  	"github.com/pingcap/badger/cache"
    35  	"github.com/pingcap/badger/fileutil"
    36  	"github.com/pingcap/badger/options"
    37  	"github.com/pingcap/badger/surf"
    38  	"github.com/pingcap/badger/y"
    39  	"github.com/pingcap/errors"
    40  )
    41  
const (
	// fileSuffix is the extension of table data files; ParseFileID and
	// IDToFilename rely on it.
	fileSuffix    = ".sst"
	// idxFileSuffix is appended to a table file name to form the name of its
	// index file.
	idxFileSuffix = ".idx"

	// intSize is the size in bytes of an int on the current platform.
	intSize = int(unsafe.Sizeof(int(0)))
)
    48  
    49  func IndexFilename(tableFilename string) string { return tableFilename + idxFileSuffix }
    50  
// tableIndex holds the decoded index of a table, as built by readTableIndex.
type tableIndex struct {
	// blockEndOffsets[i] is the end offset of block i in the table data; the
	// start of block i is blockEndOffsets[i-1] (or 0 for the first block).
	blockEndOffsets []uint32
	// baseKeys stores one entry per block; loadBlock uses entry i as the base
	// key of block i.
	baseKeys        entrySlice
	// bf is an optional bloom filter queried with the key hash in pointGet.
	bf              *bbloom.Bloom
	// hIdx is an optional hash index mapping a key hash to a block index and
	// an offset inside that block.
	hIdx            *hashIndex
	// surf is an optional SuRF index, used for point lookups by user key and
	// for range overlap checks.
	surf            *surf.SuRF
}
    58  
// Table represents a loaded table file with the info we have about it.
// A table is either backed by files on disk (see OpenTable) or kept entirely
// in memory (see OpenInMemoryTable), in which case fd and indexFd are nil.
type Table struct {
	// Embedded mutex. NOTE(review): none of the methods in this part of the
	// file lock it; confirm where it is used.
	sync.Mutex

	fd      *os.File // Own fd of the table data file; nil for in-memory tables.
	indexFd *os.File // Fd of the index file; nil for in-memory tables.

	// globalTs is read from the index metadata and updated by SetGlobalTs.
	globalTs          uint64
	// tableSize is the end offset of the last block plus oldBlockLen.
	tableSize         int64
	// numBlocks is the number of block end offsets found in the index.
	numBlocks         int
	// smallest and biggest are the key bounds recorded in the index.
	smallest, biggest y.Key
	// id is the file ID parsed from the table file name.
	id                uint64

	// blockCache optionally caches loaded blocks, keyed by blockCacheKey.
	blockCache *cache.Cache
	// blocksData is the table data: an mmap of the data file, or the slice
	// handed to OpenInMemoryTable. It is empty when blocks are read from fd.
	blocksData []byte

	// indexCache optionally caches the decoded tableIndex, keyed by id.
	indexCache *cache.Cache
	// index is the decoded index used when indexCache is nil; it is loaded
	// once, guarded by indexOnce.
	index      *tableIndex
	indexOnce  sync.Once
	// indexData is the raw index: an mmap of the index file, or the slice
	// handed to OpenInMemoryTable.
	indexData  []byte

	// compacting is a 0/1 flag accessed through sync/atomic.
	compacting int32

	// compression is the block compression type read from the index metadata.
	compression options.CompressionType

	// oldBlockLen is read from the index metadata; oldBlock is the last
	// oldBlockLen bytes of the table data (see setOldBlock).
	// NOTE(review): the meaning of the "old block" is not visible here.
	oldBlockLen int64
	oldBlock    []byte
}
    87  
    88  // CompressionType returns the compression algorithm used for block compression.
    89  func (t *Table) CompressionType() options.CompressionType {
    90  	return t.compression
    91  }
    92  
    93  // Delete delete table's file from disk.
    94  func (t *Table) Delete() error {
    95  	if t.fd == nil {
    96  		t.blocksData = nil
    97  		t.indexData = nil
    98  		return nil
    99  	}
   100  	if t.blockCache != nil {
   101  		for blk := 0; blk < t.numBlocks; blk++ {
   102  			key := t.blockCacheKey(blk)
   103  			if v, ok := t.blockCache.Get(key); ok {
   104  				if b, ok := v.(*block); ok {
   105  					b.done()
   106  				}
   107  				t.blockCache.Del(key)
   108  			}
   109  		}
   110  	}
   111  	if t.indexCache != nil {
   112  		t.indexCache.Del(t.id)
   113  	}
   114  	if len(t.blocksData) != 0 {
   115  		y.Munmap(t.blocksData)
   116  	}
   117  	t.index = nil
   118  	if len(t.indexData) != 0 {
   119  		y.Munmap(t.indexData)
   120  	}
   121  	if err := t.fd.Truncate(0); err != nil {
   122  		// This is very important to let the FS know that the file is deleted.
   123  		return err
   124  	}
   125  	filename := t.fd.Name()
   126  	if err := t.fd.Close(); err != nil {
   127  		return err
   128  	}
   129  	if err := os.Remove(filename); err != nil {
   130  		return err
   131  	}
   132  	return os.Remove(filename + idxFileSuffix)
   133  }
   134  
   135  // OpenTable assumes file has only one table and opens it.  Takes ownership of fd upon function
   136  // entry.  Returns a table with one reference count on it (decrementing which may delete the file!
   137  // -- consider t.Close() instead).  The fd has to writeable because we call Truncate on it before
   138  // deleting.
   139  func OpenTable(filename string, blockCache *cache.Cache, indexCache *cache.Cache) (*Table, error) {
   140  	id, ok := ParseFileID(filename)
   141  	if !ok {
   142  		return nil, errors.Errorf("Invalid filename: %s", filename)
   143  	}
   144  
   145  	// TODO: after we support cache of L2 storage, we will open block data file in cache manager.
   146  	fd, err := y.OpenExistingFile(filename, 0)
   147  	if err != nil {
   148  		return nil, err
   149  	}
   150  
   151  	indexFd, err := y.OpenExistingFile(filename+idxFileSuffix, 0)
   152  	if err != nil {
   153  		return nil, err
   154  	}
   155  
   156  	t := &Table{
   157  		fd:         fd,
   158  		indexFd:    indexFd,
   159  		id:         id,
   160  		blockCache: blockCache,
   161  		indexCache: indexCache,
   162  	}
   163  
   164  	if err := t.initTableInfo(); err != nil {
   165  		t.Close()
   166  		return nil, err
   167  	}
   168  	if blockCache == nil || t.oldBlockLen > 0 {
   169  		t.blocksData, err = y.Mmap(fd, false, t.Size())
   170  		if err != nil {
   171  			t.Close()
   172  			return nil, y.Wrapf(err, "Unable to map file")
   173  		}
   174  		t.setOldBlock()
   175  	}
   176  	return t, nil
   177  }
   178  
   179  func (t *Table) setOldBlock() {
   180  	t.oldBlock = t.blocksData[t.tableSize-t.oldBlockLen : t.tableSize]
   181  }
   182  
   183  // OpenInMemoryTable opens a table that has data in memory.
   184  func OpenInMemoryTable(blockData, indexData []byte) (*Table, error) {
   185  	t := &Table{
   186  		blocksData: blockData,
   187  		indexData:  indexData,
   188  	}
   189  	if err := t.initTableInfo(); err != nil {
   190  		return nil, err
   191  	}
   192  	t.setOldBlock()
   193  	return t, nil
   194  }
   195  
   196  // Close closes the open table.  (Releases resources back to the OS.)
   197  func (t *Table) Close() error {
   198  	if t.fd != nil {
   199  		t.fd.Close()
   200  	}
   201  	if t.indexFd != nil {
   202  		if len(t.indexData) != 0 {
   203  			y.Munmap(t.indexData)
   204  		}
   205  		t.indexFd.Close()
   206  	}
   207  	return nil
   208  }
   209  
   210  func (t *Table) NewIterator(reversed bool) y.Iterator {
   211  	return t.newIterator(reversed)
   212  }
   213  
   214  func (t *Table) Get(key y.Key, keyHash uint64) (y.ValueStruct, error) {
   215  	resultKey, resultVs, ok, err := t.pointGet(key, keyHash)
   216  	if err != nil {
   217  		return y.ValueStruct{}, err
   218  	}
   219  	if !ok {
   220  		it := t.NewIterator(false)
   221  		defer it.Close()
   222  		it.Seek(key.UserKey)
   223  		if !it.Valid() {
   224  			return y.ValueStruct{}, nil
   225  		}
   226  		if !key.SameUserKey(it.Key()) {
   227  			return y.ValueStruct{}, nil
   228  		}
   229  		resultKey, resultVs = it.Key(), it.Value()
   230  	} else if resultKey.IsEmpty() {
   231  		return y.ValueStruct{}, nil
   232  	}
   233  	result := resultVs
   234  	result.Version = resultKey.Version
   235  	return result, nil
   236  }
   237  
   238  // pointGet try to lookup a key and its value by table's hash index.
   239  // If it find an hash collision the last return value will be false,
   240  // which means caller should fallback to seek search. Otherwise it value will be true.
   241  // If the hash index does not contain such an element the returned key will be nil.
   242  func (t *Table) pointGet(key y.Key, keyHash uint64) (y.Key, y.ValueStruct, bool, error) {
   243  	idx, err := t.getIndex()
   244  	if err != nil {
   245  		return y.Key{}, y.ValueStruct{}, false, err
   246  	}
   247  	if idx.bf != nil && !idx.bf.Has(keyHash) {
   248  		return y.Key{}, y.ValueStruct{}, true, err
   249  	}
   250  
   251  	blkIdx, offset := uint32(resultFallback), uint8(0)
   252  	if idx.hIdx != nil {
   253  		blkIdx, offset = idx.hIdx.lookup(keyHash)
   254  	} else if idx.surf != nil {
   255  		v, ok := idx.surf.Get(key.UserKey)
   256  		if !ok {
   257  			blkIdx = resultNoEntry
   258  		} else {
   259  			var pos entryPosition
   260  			pos.decode(v)
   261  			blkIdx, offset = uint32(pos.blockIdx), pos.offset
   262  		}
   263  	}
   264  	if blkIdx == resultFallback {
   265  		return y.Key{}, y.ValueStruct{}, false, nil
   266  	}
   267  	if blkIdx == resultNoEntry {
   268  		return y.Key{}, y.ValueStruct{}, true, nil
   269  	}
   270  
   271  	it := t.newIterator(false)
   272  	defer it.Close()
   273  	it.seekFromOffset(int(blkIdx), int(offset), key.UserKey)
   274  
   275  	if !it.Valid() || !key.SameUserKey(it.Key()) {
   276  		return y.Key{}, y.ValueStruct{}, true, it.Error()
   277  	}
   278  	if !y.SeekToVersion(it, key.Version) {
   279  		return y.Key{}, y.ValueStruct{}, true, it.Error()
   280  	}
   281  	return it.Key(), it.Value(), true, nil
   282  }
   283  
   284  func (t *Table) read(off int, sz int) ([]byte, error) {
   285  	if len(t.blocksData) > 0 {
   286  		if len(t.blocksData[off:]) < sz {
   287  			return nil, y.ErrEOF
   288  		}
   289  		return t.blocksData[off : off+sz], nil
   290  	}
   291  	res := buffer.GetBuffer(sz)
   292  	_, err := t.fd.ReadAt(res, int64(off))
   293  	return res, err
   294  }
   295  
   296  func (t *Table) initTableInfo() error {
   297  	d, err := t.loadIndexData(false)
   298  	if err != nil {
   299  		return err
   300  	}
   301  
   302  	t.compression = d.compression
   303  	t.globalTs = d.globalTS
   304  
   305  	for ; d.valid(); d.next() {
   306  		switch d.currentId() {
   307  		case idSmallest:
   308  			if k := d.decode(); len(k) != 0 {
   309  				t.smallest = y.KeyWithTs(y.Copy(k), math.MaxUint64)
   310  			}
   311  		case idBiggest:
   312  			if k := d.decode(); len(k) != 0 {
   313  				t.biggest = y.KeyWithTs(y.Copy(k), 0)
   314  			}
   315  		case idBlockEndOffsets:
   316  			offsets := bytesToU32Slice(d.decode())
   317  			t.tableSize = int64(offsets[len(offsets)-1])
   318  			t.numBlocks = len(offsets)
   319  		case idOldBlockLen:
   320  			t.oldBlockLen = int64(bytesToU32(d.decode()))
   321  			t.tableSize += t.oldBlockLen
   322  		}
   323  	}
   324  	return nil
   325  }
   326  
   327  func (t *Table) readTableIndex(d *metaDecoder) *tableIndex {
   328  	idx := new(tableIndex)
   329  	for ; d.valid(); d.next() {
   330  		switch d.currentId() {
   331  		case idBaseKeysEndOffs:
   332  			idx.baseKeys.endOffs = bytesToU32Slice(d.decode())
   333  		case idBaseKeys:
   334  			idx.baseKeys.data = d.decode()
   335  		case idBlockEndOffsets:
   336  			idx.blockEndOffsets = bytesToU32Slice(d.decode())
   337  		case idBloomFilter:
   338  			if d := d.decode(); len(d) != 0 {
   339  				idx.bf = new(bbloom.Bloom)
   340  				idx.bf.BinaryUnmarshal(d)
   341  			}
   342  		case idHashIndex:
   343  			if d := d.decode(); len(d) != 0 {
   344  				idx.hIdx = new(hashIndex)
   345  				idx.hIdx.readIndex(d)
   346  			}
   347  		case idSuRFIndex:
   348  			if d := d.decode(); len(d) != 0 {
   349  				idx.surf = new(surf.SuRF)
   350  				idx.surf.Unmarshal(d)
   351  			}
   352  		}
   353  	}
   354  	return idx
   355  }
   356  
   357  func (t *Table) getIndex() (*tableIndex, error) {
   358  	if t.indexCache == nil {
   359  		var err error
   360  		t.indexOnce.Do(func() {
   361  			var d *metaDecoder
   362  			d, err = t.loadIndexData(true)
   363  			if err != nil {
   364  				return
   365  			}
   366  			t.index = t.readTableIndex(d)
   367  		})
   368  		return t.index, nil
   369  	}
   370  
   371  	index, err := t.indexCache.GetOrCompute(t.id, func() (interface{}, int64, error) {
   372  		d, err := t.loadIndexData(false)
   373  		if err != nil {
   374  			return nil, 0, err
   375  		}
   376  		return t.readTableIndex(d), int64(len(d.buf)), nil
   377  	})
   378  	if err != nil {
   379  		return nil, err
   380  	}
   381  	return index.(*tableIndex), nil
   382  }
   383  
   384  func (t *Table) loadIndexData(useMmap bool) (*metaDecoder, error) {
   385  	if t.indexFd == nil {
   386  		return newMetaDecoder(t.indexData)
   387  	}
   388  	fstat, err := t.indexFd.Stat()
   389  	if err != nil {
   390  		return nil, err
   391  	}
   392  	var idxData []byte
   393  
   394  	if useMmap {
   395  		idxData, err = y.Mmap(t.indexFd, false, fstat.Size())
   396  		if err != nil {
   397  			return nil, err
   398  		}
   399  		t.indexData = idxData
   400  	} else {
   401  		idxData = buffer.GetBuffer(int(fstat.Size()))
   402  		if _, err = t.indexFd.ReadAt(idxData, 0); err != nil {
   403  			return nil, err
   404  		}
   405  	}
   406  
   407  	decoder, err := newMetaDecoder(idxData)
   408  	if err != nil {
   409  		return nil, err
   410  	}
   411  	if decoder.compression != options.None && useMmap {
   412  		y.Munmap(idxData)
   413  		t.indexData = nil
   414  	}
   415  	return decoder, nil
   416  }
   417  
// block is one data block of a table, as produced by loadBlock.
type block struct {
	// offset is the start offset of the block inside the table data.
	offset  int
	// data is the block content after it has been passed through the table's
	// decompression.
	data    []byte
	// baseKey is the block's entry in the index's base keys.
	baseKey []byte

	// reference is a reference count manipulated atomically by add and done;
	// it is set to 1 when the block is put into the block cache.
	reference int32
}
   425  
   426  func OnEvict(key uint64, value interface{}) {
   427  	if b, ok := value.(*block); ok {
   428  		b.done()
   429  	}
   430  }
   431  
   432  func (b *block) add() (ok bool) {
   433  	for {
   434  		old := atomic.LoadInt32(&b.reference)
   435  		if old == 0 {
   436  			return false
   437  		}
   438  		new := old + 1
   439  		if atomic.CompareAndSwapInt32(&b.reference, old, new) {
   440  			return true
   441  		}
   442  	}
   443  }
   444  
   445  func (b *block) done() {
   446  	if b != nil && atomic.AddInt32(&b.reference, -1) == 0 {
   447  		buffer.PutBuffer(b.data)
   448  		b.data = nil
   449  	}
   450  }
   451  
   452  func (b *block) size() int64 {
   453  	return int64(intSize + len(b.data))
   454  }
   455  
   456  func (t *Table) block(idx int, index *tableIndex) (*block, error) {
   457  	y.Assert(idx >= 0)
   458  
   459  	if idx >= len(index.blockEndOffsets) {
   460  		return &block{}, io.EOF
   461  	}
   462  
   463  	if t.blockCache == nil {
   464  		return t.loadBlock(idx, index)
   465  	}
   466  
   467  	key := t.blockCacheKey(idx)
   468  	blk, err := t.blockCache.GetOrCompute(key, func() (interface{}, int64, error) {
   469  		b, e := t.loadBlock(idx, index)
   470  		if e != nil {
   471  			return nil, 0, e
   472  		}
   473  		b.reference = 1
   474  		return b, int64(len(b.data)), nil
   475  	})
   476  	if err != nil {
   477  		return &block{}, err
   478  	}
   479  	b := blk.(*block)
   480  	if ok := b.add(); !ok {
   481  		return &block{}, errors.Errorf("block is evicted")
   482  	}
   483  	return b, nil
   484  }
   485  
   486  func (t *Table) loadBlock(idx int, index *tableIndex) (*block, error) {
   487  	var startOffset int
   488  	if idx > 0 {
   489  		startOffset = int(index.blockEndOffsets[idx-1])
   490  	}
   491  	blk := &block{
   492  		offset: startOffset,
   493  	}
   494  	endOffset := int(index.blockEndOffsets[idx])
   495  	dataLen := endOffset - startOffset
   496  	var err error
   497  	if blk.data, err = t.read(blk.offset, dataLen); err != nil {
   498  		return &block{}, errors.Wrapf(err,
   499  			"failed to read from file: %s at offset: %d, len: %d", t.fd.Name(), blk.offset, dataLen)
   500  	}
   501  
   502  	blk.data, err = t.compression.Decompress(blk.data)
   503  	if err != nil {
   504  		return &block{}, errors.Wrapf(err,
   505  			"failed to decode compressed data in file: %s at offset: %d, len: %d",
   506  			t.fd.Name(), blk.offset, dataLen)
   507  	}
   508  	blk.baseKey = index.baseKeys.getEntry(idx)
   509  	return blk, nil
   510  }
   511  
   512  // HasGlobalTs returns table does set global ts.
   513  func (t *Table) HasGlobalTs() bool {
   514  	return t.globalTs != 0
   515  }
   516  
   517  // SetGlobalTs update the global ts of external ingested tables.
   518  func (t *Table) SetGlobalTs(ts uint64) error {
   519  	if _, err := t.indexFd.WriteAt(u64ToBytes(ts), 0); err != nil {
   520  		return err
   521  	}
   522  	if err := fileutil.Fsync(t.indexFd); err != nil {
   523  		return err
   524  	}
   525  	t.globalTs = ts
   526  	return nil
   527  }
   528  
   529  func (t *Table) MarkCompacting(flag bool) {
   530  	if flag {
   531  		atomic.StoreInt32(&t.compacting, 1)
   532  	}
   533  	atomic.StoreInt32(&t.compacting, 0)
   534  }
   535  
   536  func (t *Table) IsCompacting() bool {
   537  	return atomic.LoadInt32(&t.compacting) == 1
   538  }
   539  
   540  func (t *Table) blockCacheKey(idx int) uint64 {
   541  	y.Assert(t.ID() < math.MaxUint32)
   542  	y.Assert(idx < math.MaxUint32)
   543  	return (t.ID() << 32) | uint64(idx)
   544  }
   545  
   546  // Size is its file size in bytes
   547  func (t *Table) Size() int64 { return t.tableSize }
   548  
   549  // Smallest is its smallest key, or nil if there are none
   550  func (t *Table) Smallest() y.Key { return t.smallest }
   551  
   552  // Biggest is its biggest key, or nil if there are none
   553  func (t *Table) Biggest() y.Key { return t.biggest }
   554  
   555  // Filename is NOT the file name.  Just kidding, it is.
   556  func (t *Table) Filename() string { return t.fd.Name() }
   557  
   558  // ID is the table's ID number (used to make the file name).
   559  func (t *Table) ID() uint64 { return t.id }
   560  
   561  func (t *Table) HasOverlap(start, end y.Key, includeEnd bool) bool {
   562  	if start.Compare(t.Biggest()) > 0 {
   563  		return false
   564  	}
   565  
   566  	if cmp := end.Compare(t.Smallest()); cmp < 0 {
   567  		return false
   568  	} else if cmp == 0 {
   569  		return includeEnd
   570  	}
   571  
   572  	idx, err := t.getIndex()
   573  	if err != nil {
   574  		return true
   575  	}
   576  
   577  	if idx.surf != nil {
   578  		return idx.surf.HasOverlap(start.UserKey, end.UserKey, includeEnd)
   579  	}
   580  
   581  	// If there are errors occurred during seeking,
   582  	// we assume the table has overlapped with the range to prevent data loss.
   583  	it := t.newIteratorWithIdx(false, idx)
   584  	defer it.Close()
   585  	it.Seek(start.UserKey)
   586  	if !it.Valid() {
   587  		return it.Error() != nil
   588  	}
   589  	if cmp := it.Key().Compare(end); cmp > 0 {
   590  		return false
   591  	} else if cmp == 0 {
   592  		return includeEnd
   593  	}
   594  	return true
   595  }
   596  
   597  // ParseFileID reads the file id out of a filename.
   598  func ParseFileID(name string) (uint64, bool) {
   599  	name = path.Base(name)
   600  	if !strings.HasSuffix(name, fileSuffix) {
   601  		return 0, false
   602  	}
   603  	//	suffix := name[len(fileSuffix):]
   604  	name = strings.TrimSuffix(name, fileSuffix)
   605  	id, err := strconv.ParseUint(name, 16, 64)
   606  	if err != nil {
   607  		return 0, false
   608  	}
   609  	return id, true
   610  }
   611  
   612  // IDToFilename does the inverse of ParseFileID
   613  func IDToFilename(id uint64) string {
   614  	return fmt.Sprintf("%08x", id) + fileSuffix
   615  }
   616  
   617  // NewFilename should be named TableFilepath -- it combines the dir with the ID to make a table
   618  // filepath.
   619  func NewFilename(id uint64, dir string) string {
   620  	return filepath.Join(dir, IDToFilename(id))
   621  }