github.com/coocood/badger@v1.5.1-0.20200528065104-c02ac3616d04/table/sstable/table.go (about)

     1  /*
     2   * Copyright 2017 Dgraph Labs, Inc. and Contributors
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package sstable
    18  
    19  import (
    20  	"fmt"
    21  	"io"
    22  	"math"
    23  	"os"
    24  	"path"
    25  	"path/filepath"
    26  	"strconv"
    27  	"strings"
    28  	"sync"
    29  	"sync/atomic"
    30  	"unsafe"
    31  
    32  	"github.com/coocood/badger/cache"
    33  	"github.com/coocood/badger/fileutil"
    34  	"github.com/coocood/badger/options"
    35  	"github.com/coocood/badger/surf"
    36  	"github.com/coocood/badger/y"
    37  	"github.com/coocood/bbloom"
    38  	"github.com/pingcap/errors"
    39  )
    40  
const (
	// fileSuffix is the extension of a table's data file; ParseFileID and
	// IDToFilename use it to convert between table IDs and file names.
	fileSuffix = ".sst"
	// idxFileSuffix is appended to the full data file name (not substituted
	// for fileSuffix) to obtain the name of the table's index file.
	idxFileSuffix = ".idx"

	// intSize is the size in bytes of an int on the target platform.
	intSize = int(unsafe.Sizeof(int(0)))
)
    47  
    48  func IndexFilename(tableFilename string) string { return tableFilename + idxFileSuffix }
    49  
// tableIndex is the decoded content of a table's index file, as assembled by
// readTableIndex. Any of bf, hIdx and surf may be nil when the corresponding
// section of the index file is absent or empty.
type tableIndex struct {
	// blockEndOffsets[i] is the end offset of block i in the data file; block i
	// starts where block i-1 ends (block 0 starts at offset 0).
	blockEndOffsets []uint32
	// baseKeys holds one entry per block, used as the block's base key.
	baseKeys entrySlice
	// bf is a bloom filter queried with key hashes to skip lookups early.
	bf *bbloom.Bloom
	// hIdx maps a key hash to a block index and an offset for point lookups.
	hIdx *hashIndex
	// surf is consulted for point lookups when hIdx is nil, and for range
	// overlap checks.
	surf *surf.SuRF
}
    57  
// Table represents a loaded table file with the info we have about it.
// It is created by OpenTable from a data file and its companion index file.
type Table struct {
	sync.Mutex

	fd      *os.File // Own fd: handle of the data (block) file.
	indexFd *os.File // Handle of the companion index file (data file name + idxFileSuffix).

	globalTs          uint64 // Read from the index metadata; updated by SetGlobalTs.
	tableSize         int64  // End offset of the last block plus oldBlockLen.
	numBlocks         int    // Number of block end offsets recorded in the index.
	smallest, biggest y.Key  // Key bounds read from the index; zero value if not recorded.
	id                uint64 // Table ID parsed from the data file name.

	blockCache *cache.Cache // Optional cache of loaded blocks; may be nil.
	blocksMmap []byte       // Mapping of the data file; set when blockCache is nil or oldBlockLen > 0.

	indexCache *cache.Cache // Optional cache of decoded indexes; may be nil.
	index      *tableIndex  // Decoded index, populated once via indexOnce when indexCache is nil.
	indexOnce  sync.Once
	indexMmap  []byte // Mapping of the index file while an uncompressed index is loaded through mmap.

	compacting int32 // Flag accessed atomically by MarkCompacting and IsCompacting.

	compression options.CompressionType // Block compression algorithm read from the index metadata.

	oldBlockLen int64  // Length of the trailing "old block" region of the data file.
	oldBlock    []byte // The last oldBlockLen bytes of blocksMmap.
}
    86  
    87  // CompressionType returns the compression algorithm used for block compression.
    88  func (t *Table) CompressionType() options.CompressionType {
    89  	return t.compression
    90  }
    91  
    92  // Delete delete table's file from disk.
    93  func (t *Table) Delete() error {
    94  	if t.blockCache != nil {
    95  		for blk := 0; blk < t.numBlocks; blk++ {
    96  			t.blockCache.Del(t.blockCacheKey(blk))
    97  		}
    98  	}
    99  	if t.indexCache != nil {
   100  		t.indexCache.Del(t.id)
   101  	}
   102  	if len(t.blocksMmap) != 0 {
   103  		y.Munmap(t.blocksMmap)
   104  	}
   105  	t.index = nil
   106  	if len(t.indexMmap) != 0 {
   107  		y.Munmap(t.indexMmap)
   108  	}
   109  	if err := t.fd.Truncate(0); err != nil {
   110  		// This is very important to let the FS know that the file is deleted.
   111  		return err
   112  	}
   113  	filename := t.fd.Name()
   114  	if err := t.fd.Close(); err != nil {
   115  		return err
   116  	}
   117  	if err := os.Remove(filename); err != nil {
   118  		return err
   119  	}
   120  	return os.Remove(filename + idxFileSuffix)
   121  }
   122  
   123  // OpenTable assumes file has only one table and opens it.  Takes ownership of fd upon function
   124  // entry.  Returns a table with one reference count on it (decrementing which may delete the file!
   125  // -- consider t.Close() instead).  The fd has to writeable because we call Truncate on it before
   126  // deleting.
   127  func OpenTable(filename string, blockCache *cache.Cache, indexCache *cache.Cache) (*Table, error) {
   128  	id, ok := ParseFileID(filename)
   129  	if !ok {
   130  		return nil, errors.Errorf("Invalid filename: %s", filename)
   131  	}
   132  
   133  	// TODO: after we support cache of L2 storage, we will open block data file in cache manager.
   134  	fd, err := y.OpenExistingFile(filename, 0)
   135  	if err != nil {
   136  		return nil, err
   137  	}
   138  
   139  	indexFd, err := y.OpenExistingFile(filename+idxFileSuffix, 0)
   140  	if err != nil {
   141  		return nil, err
   142  	}
   143  
   144  	t := &Table{
   145  		fd:         fd,
   146  		indexFd:    indexFd,
   147  		id:         id,
   148  		blockCache: blockCache,
   149  		indexCache: indexCache,
   150  	}
   151  
   152  	if err := t.initTableInfo(); err != nil {
   153  		t.Close()
   154  		return nil, err
   155  	}
   156  	if blockCache == nil || t.oldBlockLen > 0 {
   157  		t.blocksMmap, err = y.Mmap(fd, false, t.Size())
   158  		if err != nil {
   159  			t.Close()
   160  			return nil, y.Wrapf(err, "Unable to map file")
   161  		}
   162  		t.oldBlock = t.blocksMmap[t.tableSize-t.oldBlockLen : t.tableSize]
   163  	}
   164  	return t, nil
   165  }
   166  
   167  // Close closes the open table.  (Releases resources back to the OS.)
   168  func (t *Table) Close() error {
   169  	if t.fd != nil {
   170  		t.fd.Close()
   171  	}
   172  	if t.indexFd != nil {
   173  		if len(t.indexMmap) != 0 {
   174  			y.Munmap(t.indexMmap)
   175  		}
   176  		t.indexFd.Close()
   177  	}
   178  	return nil
   179  }
   180  
   181  func (t *Table) NewIterator(reversed bool) y.Iterator {
   182  	return t.newIterator(reversed)
   183  }
   184  
   185  func (t *Table) Get(key y.Key, keyHash uint64) (y.ValueStruct, error) {
   186  	resultKey, resultVs, ok, err := t.pointGet(key, keyHash)
   187  	if err != nil {
   188  		return y.ValueStruct{}, err
   189  	}
   190  	if !ok {
   191  		it := t.NewIterator(false)
   192  		it.Seek(key.UserKey)
   193  		if !it.Valid() {
   194  			return y.ValueStruct{}, nil
   195  		}
   196  		if !key.SameUserKey(it.Key()) {
   197  			return y.ValueStruct{}, nil
   198  		}
   199  		resultKey, resultVs = it.Key(), it.Value()
   200  	} else if resultKey.IsEmpty() {
   201  		return y.ValueStruct{}, nil
   202  	}
   203  	result := resultVs
   204  	result.Version = resultKey.Version
   205  	return result, nil
   206  }
   207  
   208  // pointGet try to lookup a key and its value by table's hash index.
   209  // If it find an hash collision the last return value will be false,
   210  // which means caller should fallback to seek search. Otherwise it value will be true.
   211  // If the hash index does not contain such an element the returned key will be nil.
   212  func (t *Table) pointGet(key y.Key, keyHash uint64) (y.Key, y.ValueStruct, bool, error) {
   213  	idx, err := t.getIndex()
   214  	if err != nil {
   215  		return y.Key{}, y.ValueStruct{}, false, err
   216  	}
   217  	if idx.bf != nil && !idx.bf.Has(keyHash) {
   218  		return y.Key{}, y.ValueStruct{}, true, err
   219  	}
   220  
   221  	blkIdx, offset := uint32(resultFallback), uint8(0)
   222  	if idx.hIdx != nil {
   223  		blkIdx, offset = idx.hIdx.lookup(keyHash)
   224  	} else if idx.surf != nil {
   225  		v, ok := idx.surf.Get(key.UserKey)
   226  		if !ok {
   227  			blkIdx = resultNoEntry
   228  		} else {
   229  			var pos entryPosition
   230  			pos.decode(v)
   231  			blkIdx, offset = uint32(pos.blockIdx), pos.offset
   232  		}
   233  	}
   234  	if blkIdx == resultFallback {
   235  		return y.Key{}, y.ValueStruct{}, false, nil
   236  	}
   237  	if blkIdx == resultNoEntry {
   238  		return y.Key{}, y.ValueStruct{}, true, nil
   239  	}
   240  
   241  	it := t.newIterator(false)
   242  	it.seekFromOffset(int(blkIdx), int(offset), key.UserKey)
   243  
   244  	if !it.Valid() || !key.SameUserKey(it.Key()) {
   245  		return y.Key{}, y.ValueStruct{}, true, it.Error()
   246  	}
   247  	if !y.SeekToVersion(it, key.Version) {
   248  		return y.Key{}, y.ValueStruct{}, true, it.Error()
   249  	}
   250  	return it.Key(), it.Value(), true, nil
   251  }
   252  
   253  func (t *Table) read(off int, sz int) ([]byte, error) {
   254  	if len(t.blocksMmap) > 0 {
   255  		if len(t.blocksMmap[off:]) < sz {
   256  			return nil, y.ErrEOF
   257  		}
   258  		return t.blocksMmap[off : off+sz], nil
   259  	}
   260  	res := make([]byte, sz)
   261  	_, err := t.fd.ReadAt(res, int64(off))
   262  	return res, err
   263  }
   264  
   265  func (t *Table) initTableInfo() error {
   266  	d, err := t.loadIndexData(false)
   267  	if err != nil {
   268  		return err
   269  	}
   270  
   271  	t.compression = d.compression
   272  	t.globalTs = d.globalTS
   273  
   274  	for ; d.valid(); d.next() {
   275  		switch d.currentId() {
   276  		case idSmallest:
   277  			if k := d.decode(); len(k) != 0 {
   278  				t.smallest = y.KeyWithTs(y.Copy(k), math.MaxUint64)
   279  			}
   280  		case idBiggest:
   281  			if k := d.decode(); len(k) != 0 {
   282  				t.biggest = y.KeyWithTs(y.Copy(k), 0)
   283  			}
   284  		case idBlockEndOffsets:
   285  			offsets := bytesToU32Slice(d.decode())
   286  			t.tableSize = int64(offsets[len(offsets)-1])
   287  			t.numBlocks = len(offsets)
   288  		case idOldBlockLen:
   289  			t.oldBlockLen = int64(bytesToU32(d.decode()))
   290  			t.tableSize += t.oldBlockLen
   291  		}
   292  	}
   293  	return nil
   294  }
   295  
   296  func (t *Table) readTableIndex(d *metaDecoder) *tableIndex {
   297  	idx := new(tableIndex)
   298  	for ; d.valid(); d.next() {
   299  		switch d.currentId() {
   300  		case idBaseKeysEndOffs:
   301  			idx.baseKeys.endOffs = bytesToU32Slice(d.decode())
   302  		case idBaseKeys:
   303  			idx.baseKeys.data = d.decode()
   304  		case idBlockEndOffsets:
   305  			idx.blockEndOffsets = bytesToU32Slice(d.decode())
   306  		case idBloomFilter:
   307  			if d := d.decode(); len(d) != 0 {
   308  				idx.bf = new(bbloom.Bloom)
   309  				idx.bf.BinaryUnmarshal(d)
   310  			}
   311  		case idHashIndex:
   312  			if d := d.decode(); len(d) != 0 {
   313  				idx.hIdx = new(hashIndex)
   314  				idx.hIdx.readIndex(d)
   315  			}
   316  		case idSuRFIndex:
   317  			if d := d.decode(); len(d) != 0 {
   318  				idx.surf = new(surf.SuRF)
   319  				idx.surf.Unmarshal(d)
   320  			}
   321  		}
   322  	}
   323  	return idx
   324  }
   325  
   326  func (t *Table) getIndex() (*tableIndex, error) {
   327  	if t.indexCache == nil {
   328  		var err error
   329  		t.indexOnce.Do(func() {
   330  			var d *metaDecoder
   331  			d, err = t.loadIndexData(true)
   332  			if err != nil {
   333  				return
   334  			}
   335  			t.index = t.readTableIndex(d)
   336  		})
   337  		return t.index, nil
   338  	}
   339  
   340  	index, err := t.indexCache.GetOrCompute(t.id, func() (interface{}, int64, error) {
   341  		d, err := t.loadIndexData(false)
   342  		if err != nil {
   343  			return nil, 0, err
   344  		}
   345  		return t.readTableIndex(d), int64(len(d.buf)), nil
   346  	})
   347  	if err != nil {
   348  		return nil, err
   349  	}
   350  	return index.(*tableIndex), nil
   351  }
   352  
   353  func (t *Table) loadIndexData(useMmap bool) (*metaDecoder, error) {
   354  	fstat, err := t.indexFd.Stat()
   355  	if err != nil {
   356  		return nil, err
   357  	}
   358  	var idxData []byte
   359  
   360  	if useMmap {
   361  		idxData, err = y.Mmap(t.indexFd, false, fstat.Size())
   362  		if err != nil {
   363  			return nil, err
   364  		}
   365  		t.indexMmap = idxData
   366  	} else {
   367  		idxData = make([]byte, fstat.Size())
   368  		if _, err = t.indexFd.ReadAt(idxData, 0); err != nil {
   369  			return nil, err
   370  		}
   371  	}
   372  
   373  	decoder, err := newMetaDecoder(idxData)
   374  	if err != nil {
   375  		return nil, err
   376  	}
   377  	if decoder.compression != options.None && useMmap {
   378  		y.Munmap(idxData)
   379  		t.indexMmap = nil
   380  	}
   381  	return decoder, nil
   382  }
   383  
// block is one data block of a table as produced by loadBlock.
type block struct {
	// offset is the start offset of the block within the data file.
	offset int
	// data is the block content after decompression.
	data []byte
	// baseKey is the block's entry in the index's base-key list.
	baseKey []byte
}
   389  
   390  func (b *block) size() int64 {
   391  	return int64(intSize + len(b.data))
   392  }
   393  
   394  func (t *Table) block(idx int, index *tableIndex) (block, error) {
   395  	y.Assert(idx >= 0)
   396  
   397  	if idx >= len(index.blockEndOffsets) {
   398  		return block{}, io.EOF
   399  	}
   400  
   401  	if t.blockCache == nil {
   402  		return t.loadBlock(idx, index)
   403  	}
   404  
   405  	key := t.blockCacheKey(idx)
   406  	blk, err := t.blockCache.GetOrCompute(key, func() (interface{}, int64, error) {
   407  		b, e := t.loadBlock(idx, index)
   408  		if e != nil {
   409  			return nil, 0, e
   410  		}
   411  		return b, int64(len(b.data)), nil
   412  	})
   413  	if err != nil {
   414  		return block{}, err
   415  	}
   416  	return blk.(block), nil
   417  }
   418  
   419  func (t *Table) loadBlock(idx int, index *tableIndex) (block, error) {
   420  	var startOffset int
   421  	if idx > 0 {
   422  		startOffset = int(index.blockEndOffsets[idx-1])
   423  	}
   424  	blk := block{
   425  		offset: startOffset,
   426  	}
   427  	endOffset := int(index.blockEndOffsets[idx])
   428  	dataLen := endOffset - startOffset
   429  	var err error
   430  	if blk.data, err = t.read(blk.offset, dataLen); err != nil {
   431  		return block{}, errors.Wrapf(err,
   432  			"failed to read from file: %s at offset: %d, len: %d", t.fd.Name(), blk.offset, dataLen)
   433  	}
   434  
   435  	blk.data, err = t.compression.Decompress(blk.data)
   436  	if err != nil {
   437  		return block{}, errors.Wrapf(err,
   438  			"failed to decode compressed data in file: %s at offset: %d, len: %d",
   439  			t.fd.Name(), blk.offset, dataLen)
   440  	}
   441  	blk.baseKey = index.baseKeys.getEntry(idx)
   442  	return blk, nil
   443  }
   444  
   445  // HasGlobalTs returns table does set global ts.
   446  func (t *Table) HasGlobalTs() bool {
   447  	return t.globalTs != 0
   448  }
   449  
   450  // SetGlobalTs update the global ts of external ingested tables.
   451  func (t *Table) SetGlobalTs(ts uint64) error {
   452  	if _, err := t.indexFd.WriteAt(u64ToBytes(ts), 0); err != nil {
   453  		return err
   454  	}
   455  	if err := fileutil.Fsync(t.indexFd); err != nil {
   456  		return err
   457  	}
   458  	t.globalTs = ts
   459  	return nil
   460  }
   461  
   462  func (t *Table) MarkCompacting(flag bool) {
   463  	if flag {
   464  		atomic.StoreInt32(&t.compacting, 1)
   465  	}
   466  	atomic.StoreInt32(&t.compacting, 0)
   467  }
   468  
   469  func (t *Table) IsCompacting() bool {
   470  	return atomic.LoadInt32(&t.compacting) == 1
   471  }
   472  
   473  func (t *Table) blockCacheKey(idx int) uint64 {
   474  	y.Assert(t.ID() < math.MaxUint32)
   475  	y.Assert(idx < math.MaxUint32)
   476  	return (t.ID() << 32) | uint64(idx)
   477  }
   478  
   479  // Size is its file size in bytes
   480  func (t *Table) Size() int64 { return t.tableSize }
   481  
   482  // Smallest is its smallest key, or nil if there are none
   483  func (t *Table) Smallest() y.Key { return t.smallest }
   484  
   485  // Biggest is its biggest key, or nil if there are none
   486  func (t *Table) Biggest() y.Key { return t.biggest }
   487  
   488  // Filename is NOT the file name.  Just kidding, it is.
   489  func (t *Table) Filename() string { return t.fd.Name() }
   490  
   491  // ID is the table's ID number (used to make the file name).
   492  func (t *Table) ID() uint64 { return t.id }
   493  
   494  func (t *Table) HasOverlap(start, end y.Key, includeEnd bool) bool {
   495  	if start.Compare(t.Biggest()) > 0 {
   496  		return false
   497  	}
   498  
   499  	if cmp := end.Compare(t.Smallest()); cmp < 0 {
   500  		return false
   501  	} else if cmp == 0 {
   502  		return includeEnd
   503  	}
   504  
   505  	idx, err := t.getIndex()
   506  	if err != nil {
   507  		return true
   508  	}
   509  
   510  	if idx.surf != nil {
   511  		return idx.surf.HasOverlap(start.UserKey, end.UserKey, includeEnd)
   512  	}
   513  
   514  	// If there are errors occurred during seeking,
   515  	// we assume the table has overlapped with the range to prevent data loss.
   516  	it := t.newIteratorWithIdx(false, idx)
   517  	it.Seek(start.UserKey)
   518  	if !it.Valid() {
   519  		return it.Error() != nil
   520  	}
   521  	if cmp := it.Key().Compare(end); cmp > 0 {
   522  		return false
   523  	} else if cmp == 0 {
   524  		return includeEnd
   525  	}
   526  	return true
   527  }
   528  
   529  // ParseFileID reads the file id out of a filename.
   530  func ParseFileID(name string) (uint64, bool) {
   531  	name = path.Base(name)
   532  	if !strings.HasSuffix(name, fileSuffix) {
   533  		return 0, false
   534  	}
   535  	//	suffix := name[len(fileSuffix):]
   536  	name = strings.TrimSuffix(name, fileSuffix)
   537  	id, err := strconv.ParseUint(name, 16, 64)
   538  	if err != nil {
   539  		return 0, false
   540  	}
   541  	return id, true
   542  }
   543  
   544  // IDToFilename does the inverse of ParseFileID
   545  func IDToFilename(id uint64) string {
   546  	return fmt.Sprintf("%08x", id) + fileSuffix
   547  }
   548  
   549  // NewFilename should be named TableFilepath -- it combines the dir with the ID to make a table
   550  // filepath.
   551  func NewFilename(id uint64, dir string) string {
   552  	return filepath.Join(dir, IDToFilename(id))
   553  }