github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/table.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
//     Copyright 2016 Attic Labs, Inc. All rights reserved.
//     Licensed under the Apache License, version 2.0:
//     http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"context"
	"crypto/sha512"
	"encoding/base32"
	"encoding/binary"
	"hash/crc32"
	"io"

	"golang.org/x/sync/errgroup"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
)

/*
An NBS Table stores N byte slices ("chunks"), each addressed by a 20-byte hash of its
contents. The footer encodes N as well as the total bytes consumed by all contained chunks.
An Index maps each address to the position of its corresponding chunk. Addresses are logically
sorted within the Index, but the corresponding chunks need not be.

Table:
+----------------+----------------+-----+----------------+-------+--------+
| Chunk Record 0 | Chunk Record 1 | ... | Chunk Record N | Index | Footer |
+----------------+----------------+-----+----------------+-------+--------+

Chunk Record:
+---------------------------+----------------+
| (Chunk Length) Chunk Data | (Uint32) CRC32 |
+---------------------------+----------------+

Index:
+------------+---------+----------+
| Prefix Map | Lengths | Suffixes |
+------------+---------+----------+

Prefix Map:
+--------------+--------------+-----+----------------+
| Prefix Tuple | Prefix Tuple | ... | Prefix Tuple N |
+--------------+--------------+-----+----------------+

- The Prefix Map contains N Prefix Tuples.
- Each Prefix Tuple corresponds to a unique Chunk Record in the Table.
- The Prefix Tuples are sorted in increasing lexicographic order within the Prefix Map.
- NB: THE SAME PREFIX MAY APPEAR MULTIPLE TIMES, as distinct Hashes (referring to distinct Chunks) may share the same Prefix.

Prefix Tuple:
+-----------------+------------------+
| (8) Hash Prefix | (Uint32) Ordinal |
+-----------------+------------------+

- Hash Prefix is the first 8 bytes of a Chunk's Hash.
- Ordinal is the 0-based position of the associated record within the sequence of chunk records, of the associated Length within Lengths, and of the associated Hash Suffix within Suffixes.

Lengths:
+-----------------+-----------------+-----+-------------------+
| (Uint32) Length | (Uint32) Length | ... | (Uint32) Length N |
+-----------------+-----------------+-----+-------------------+

- Each Length is the length of a Chunk Record in this Table.
- Length M must correspond to Chunk Record M for 0 <= M <= N.
Suffixes:
+------------------+------------------+-----+--------------------+
| (12) Hash Suffix | (12) Hash Suffix | ... | (12) Hash Suffix N |
+------------------+------------------+-----+--------------------+

- Each Hash Suffix is the last 12 bytes of the Hash of a Chunk in this Table.
- Hash Suffix M must correspond to Chunk Record M for 0 <= M <= N.

Footer:
+----------------------+----------------------------------------+------------------+
| (Uint32) Chunk Count | (Uint64) Total Uncompressed Chunk Data | (8) Magic Number |
+----------------------+----------------------------------------+------------------+

- Total Uncompressed Chunk Data is the sum of the uncompressed byte lengths of all contained chunk byte slices.
- Magic Number is the first 8 bytes of the SHA256 hash of "https://github.com/attic-labs/nbs".

NOTE: Unsigned integer quantities, hashes and hash suffixes are all encoded big-endian.


Looking up Chunks in an NBS Table
There are two phases to loading chunk data for a given Hash from an NBS Table: checking for the chunk's presence, and fetching the chunk's bytes. When performing a has-check, only the first phase is necessary.

Phase one: Chunk presence
- Slice off the first 8 bytes of your Hash to create a Prefix.
- Since the Prefix Tuples in the Prefix Map are in lexicographic order, binary search the Prefix Map for the desired Prefix.
- For all Prefix Tuples with a matching Prefix:
  - Load the Ordinal.
  - Use the Ordinal to index into Suffixes.
  - Check the Suffix of your Hash against the loaded Suffix.
  - If they match, your chunk is in this Table, in the Chunk Record indicated by Ordinal.
  - If they don't match, continue to the next matching Prefix Tuple.
- If no Prefix Tuple matches, your chunk is not in this Table.

Phase two: Loading Chunk data
- Take the Ordinal discovered in Phase one.
- Calculate the Offset of your desired Chunk Record: Sum(Lengths[0]...Lengths[Ordinal-1]).
- Load Lengths[Ordinal] bytes from Table[Offset].
- The last 4 bytes of the loaded data are the CRC32 of the Chunk Data that precedes them (see the Chunk Record layout above). Verify the CRC; the preceding bytes are your Chunk data.
*/
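// findChunkRecordSketch is a minimal, illustrative implementation of the two
// lookup phases described in the comment above, written against a fully
// materialized, in-memory copy of an Index. The parameters (prefixes, ordinals,
// lengths, suffixes) are hypothetical flattenings of the Prefix Map, Lengths and
// Suffixes sections; the real lookup path in this package operates on the encoded
// index via the table index and reader types, not on slices like these.
func findChunkRecordSketch(h addr, prefixes []uint64, ordinals, lengths []uint32, suffixes []byte) (offset uint64, length uint32, ok bool) {
	// Phase one: binary search the sorted Prefix Map for the first tuple whose
	// Prefix is >= the first 8 bytes of the desired Hash.
	p := h.Prefix()
	lo, hi := 0, len(prefixes)
	for lo < hi {
		mid := (lo + hi) / 2
		if prefixes[mid] < p {
			lo = mid + 1
		} else {
			hi = mid
		}
	}
	// The same Prefix may appear multiple times; check each candidate's Hash Suffix.
	for i := lo; i < len(prefixes) && prefixes[i] == p; i++ {
		ord := ordinals[i]
		suffix := suffixes[uint64(ord)*addrSuffixSize : uint64(ord+1)*addrSuffixSize]
		if !bytes.Equal(suffix, h[addrPrefixSize:]) {
			continue
		}
		// Phase two: the Chunk Record's offset is the sum of the lengths of all
		// records that precede it; its size is Lengths[Ordinal].
		for m := uint32(0); m < ord; m++ {
			offset += uint64(lengths[m])
		}
		return offset, lengths[ord], true
	}
	return 0, 0, false
}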
const (
	addrSize        = 20
	addrPrefixSize  = 8
	addrSuffixSize  = addrSize - addrPrefixSize
	uint64Size      = 8
	uint32Size      = 4
	ordinalSize     = uint32Size
	lengthSize      = uint32Size
	magicNumber     = "\xff\xb5\xd8\xc2\x24\x63\xee\x50"
	magicNumberSize = 8 // len(magicNumber)
	footerSize      = uint32Size + uint64Size + magicNumberSize
	prefixTupleSize = addrPrefixSize + ordinalSize
	checksumSize    = uint32Size
	maxChunkSize    = 0xffffffff // Snappy won't compress slices bigger than this
)

var crcTable = crc32.MakeTable(crc32.Castagnoli)

func crc(b []byte) uint32 {
	return crc32.Update(0, crcTable, b)
}

func computeAddrDefault(data []byte) addr {
	r := sha512.Sum512(data)
	h := addr{}
	copy(h[:], r[:addrSize])
	return h
}

var computeAddr = computeAddrDefault

type addr [addrSize]byte

var encoding = base32.NewEncoding("0123456789abcdefghijklmnopqrstuv")

func (a addr) String() string {
	return encoding.EncodeToString(a[:])
}

func (a addr) Prefix() uint64 {
	return binary.BigEndian.Uint64(a[:])
}

func (a addr) Checksum() uint32 {
	return binary.BigEndian.Uint32(a[addrSize-checksumSize:])
}

func parseAddr(str string) (addr, error) {
	var h addr
	_, err := encoding.Decode(h[:], []byte(str))
	return h, err
}

func ValidateAddr(s string) bool {
	_, err := encoding.DecodeString(s)
	return err == nil
}

type addrSlice []addr

func (hs addrSlice) Len() int           { return len(hs) }
func (hs addrSlice) Less(i, j int) bool { return bytes.Compare(hs[i][:], hs[j][:]) < 0 }
func (hs addrSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

type hasRecord struct {
	a      *addr
	prefix uint64
	order  int
	has    bool
}

type hasRecordByPrefix []hasRecord

func (hs hasRecordByPrefix) Len() int           { return len(hs) }
func (hs hasRecordByPrefix) Less(i, j int) bool { return hs[i].prefix < hs[j].prefix }
func (hs hasRecordByPrefix) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

type hasRecordByOrder []hasRecord

func (hs hasRecordByOrder) Len() int           { return len(hs) }
func (hs hasRecordByOrder) Less(i, j int) bool { return hs[i].order < hs[j].order }
func (hs hasRecordByOrder) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

type getRecord struct {
	a      *addr
	prefix uint64
	found  bool
}

type getRecordByPrefix []getRecord

func (hs getRecordByPrefix) Len() int           { return len(hs) }
func (hs getRecordByPrefix) Less(i, j int) bool { return hs[i].prefix < hs[j].prefix }
func (hs getRecordByPrefix) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

type extractRecord struct {
	a    addr
	data []byte
	err  error
}
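// parseFooterSketch is an illustrative decoding of the Footer layout documented
// at the top of this file; the name and signature are hypothetical and not part
// of this package's API. Given the trailing footerSize bytes of a table, it
// recovers the chunk count and total uncompressed data size, and derives the size
// of the Index, which stores one Prefix Tuple, one Length and one Hash Suffix per
// chunk.
func parseFooterSketch(footer []byte) (chunkCount uint32, totalUncompressed uint64, indexBytes uint64, ok bool) {
	if len(footer) != footerSize || string(footer[uint32Size+uint64Size:]) != magicNumber {
		return 0, 0, 0, false
	}
	chunkCount = binary.BigEndian.Uint32(footer[:uint32Size])
	totalUncompressed = binary.BigEndian.Uint64(footer[uint32Size : uint32Size+uint64Size])
	// Each chunk contributes one Prefix Tuple, one Length and one Hash Suffix to the Index.
	indexBytes = uint64(chunkCount) * (prefixTupleSize + lengthSize + addrSuffixSize)
	return chunkCount, totalUncompressed, indexBytes, true
}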
type chunkReader interface {
	has(h addr) (bool, error)
	hasMany(addrs []hasRecord) (bool, error)
	get(ctx context.Context, h addr, stats *Stats) ([]byte, error)
	getMany(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(*chunks.Chunk), stats *Stats) (bool, error)
	getManyCompressed(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(CompressedChunk), stats *Stats) (bool, error)
	extract(ctx context.Context, chunks chan<- extractRecord) error
	count() (uint32, error)
	uncompressedLen() (uint64, error)

	// Close releases resources retained by the |chunkReader|.
	Close() error
}

type chunkReadPlanner interface {
	findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool)
	getManyAtOffsets(
		ctx context.Context,
		eg *errgroup.Group,
		offsetRecords offsetRecSlice,
		found func(*chunks.Chunk),
		stats *Stats,
	) error
	getManyCompressedAtOffsets(
		ctx context.Context,
		eg *errgroup.Group,
		offsetRecords offsetRecSlice,
		found func(CompressedChunk),
		stats *Stats,
	) error
}

type chunkSource interface {
	chunkReader
	hash() (addr, error)
	calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool, err error)

	// reader opens a Reader positioned at the first byte of the chunkData segment of this table.
	reader(context.Context) (io.Reader, error)
	index() (tableIndex, error)

	// Clone returns a |chunkSource| with the same contents as the
	// original, but with independent |Close| behavior. A |chunkSource|
	// cannot be |Close|d more than once, so if a |chunkSource| is being
	// retained in two objects with independent life-cycles, it should be
	// |Clone|d first.
	Clone() chunkSource
}

type chunkSources []chunkSource

// TableFile is an interface for working with an existing table file.
type TableFile interface {
	// FileID gets the id of the file.
	FileID() string

	// NumChunks returns the number of chunks in a table file.
	NumChunks() int

	// Open returns an io.ReadCloser which can be used to read the bytes of a table file.
	Open(ctx context.Context) (io.ReadCloser, error)
}

// TableFileStoreOps describes what is possible to do with TableFiles in a TableFileStore.
type TableFileStoreOps struct {
	// True if the TableFileStore supports reading table files.
	CanRead bool
	// True if the TableFileStore supports writing table files.
	CanWrite bool
	// True if the TableFileStore supports pruning unused table files.
	CanPrune bool
	// True if the TableFileStore supports garbage collecting chunks.
	CanGC bool
}

// TableFileStore is an interface for interacting with table files directly.
type TableFileStore interface {
	// Sources retrieves the current root hash, a list of all the table files (which may include appendix table files),
	// and a second list containing only appendix table files.
	Sources(ctx context.Context) (hash.Hash, []TableFile, []TableFile, error)

	// Size returns the total size, in bytes, of the table files in this Store.
	Size(ctx context.Context) (uint64, error)

	// WriteTableFile reads a table file from the provided reader and writes it to the TableFileStore.
	WriteTableFile(ctx context.Context, fileId string, numChunks int, rd io.Reader, contentLength uint64, contentHash []byte) error

	// PruneTableFiles deletes old table files that are no longer referenced in the manifest.
	PruneTableFiles(ctx context.Context) error

	// SetRootChunk changes the root chunk hash from the previous value to the new root.
	SetRootChunk(ctx context.Context, root, previous hash.Hash) error

	// SupportedOperations returns a description of the supported TableFile operations. Some stores only support reading table files, not writing.
	SupportedOperations() TableFileStoreOps
}
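// countChunksSketch is a hypothetical illustration of how the TableFileStore and
// TableFile interfaces compose; it is not part of this package's API. It lists a
// store's current table files and totals their chunk counts. Sources' first list
// already includes any appendix table files, so only that list is traversed.
func countChunksSketch(ctx context.Context, tfs TableFileStore) (int, error) {
	_, tableFiles, _, err := tfs.Sources(ctx)
	if err != nil {
		return 0, err
	}
	total := 0
	for _, tf := range tableFiles {
		total += tf.NumChunks()
	}
	return total, nil
}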