github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/table_persister.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"context"
	"crypto/sha512"
	"encoding/binary"
	"errors"
	"fmt"
	"sort"
	"sync"

	"github.com/dolthub/dolt/go/store/util/sizecache"
)

// tablePersister allows interaction with persistent storage. It provides
// primitives for pushing the contents of a memTable to persistent storage,
// opening persistent tables for reading, and conjoining a number of existing
// chunkSources into one. A tablePersister implementation must be
// goroutine-safe.
type tablePersister interface {
	// Persist makes the contents of mt durable. Chunks already present in
	// |haver| may be dropped in the process.
	Persist(ctx context.Context, mt *memTable, haver chunkReader, stats *Stats) (chunkSource, error)

	// ConjoinAll conjoins all chunks in |sources| into a single, new
	// chunkSource.
	ConjoinAll(ctx context.Context, sources chunkSources, stats *Stats) (chunkSource, error)

	// Open a table named |name|, containing |chunkCount| chunks.
	Open(ctx context.Context, name addr, chunkCount uint32, stats *Stats) (chunkSource, error)

	// PruneTableFiles deletes old table files that are no longer referenced
	// in the manifest.
	PruneTableFiles(ctx context.Context, contents manifestContents) error
}
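
// persistAndReopen is a minimal sketch (not part of the original API) of the
// intended tablePersister flow: Persist a memTable, then Open the resulting
// table by the name and chunk count it reports. It assumes only chunkSource
// methods already used elsewhere in this file (hash and count).
func persistAndReopen(ctx context.Context, tp tablePersister, mt *memTable, haver chunkReader, stats *Stats) (chunkSource, error) {
	src, err := tp.Persist(ctx, mt, haver, stats)
	if err != nil {
		return nil, err
	}

	name, err := src.hash()
	if err != nil {
		return nil, err
	}

	cnt, err := src.count()
	if err != nil {
		return nil, err
	}

	// Opening by name must yield a readable view of the same chunks.
	return tp.Open(ctx, name, cnt, stats)
}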

// indexCache provides sized storage for table indices. While getting and/or
// setting the cache entry for a given table name, the caller MUST hold the
// lock for that entry.
type indexCache struct {
	cache  *sizecache.SizeCache
	cond   *sync.Cond
	locked map[addr]struct{}
}

// newIndexCache returns an indexCache which will burn roughly |size| bytes
// of memory.
func newIndexCache(size uint64) *indexCache {
	return &indexCache{sizecache.New(size), sync.NewCond(&sync.Mutex{}), map[addr]struct{}{}}
}

// lockEntry takes an exclusive lock on the cache entry for |name|. Callers
// must do this before calling get(name) or put(name, index).
func (sic *indexCache) lockEntry(name addr) {
	sic.cond.L.Lock()
	defer sic.cond.L.Unlock()

	for {
		if _, present := sic.locked[name]; !present {
			sic.locked[name] = struct{}{}
			break
		}
		sic.cond.Wait()
	}
}

// unlockEntry releases the exclusive lock on the cache entry for |name| and
// wakes any goroutines waiting to lock it. It returns an error if the entry
// was not locked.
func (sic *indexCache) unlockEntry(name addr) error {
	sic.cond.L.Lock()
	defer sic.cond.L.Unlock()

	_, ok := sic.locked[name]

	if !ok {
		return fmt.Errorf("failed to unlock %s", name.String())
	}

	delete(sic.locked, name)

	sic.cond.Broadcast()

	return nil
}

// get returns the cached index for |name|, if any. The caller must hold the
// entry lock for |name|.
func (sic *indexCache) get(name addr) (onHeapTableIndex, bool) {
	if idx, found := sic.cache.Get(name); found {
		return idx.(onHeapTableIndex), true
	}
	return onHeapTableIndex{}, false
}

// put caches |idx| under |name|, charging the cache an estimate of the
// index's in-memory footprint. The caller must hold the entry lock for
// |name|.
func (sic *indexCache) put(name addr, idx onHeapTableIndex) {
	indexSize := uint64(idx.chunkCount) * (addrSize + ordinalSize + lengthSize + uint64Size)
	sic.cache.Add(name, indexSize, idx)
}
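
// getOrLoadCachedIndex is a minimal sketch of the locking protocol above
// (illustrative only; |load| is a hypothetical callback standing in for
// however a caller materializes an onHeapTableIndex): lock the entry, check
// the cache, and populate it on a miss.
func getOrLoadCachedIndex(sic *indexCache, name addr, load func() (onHeapTableIndex, error)) (onHeapTableIndex, error) {
	sic.lockEntry(name)
	defer func() {
		// The entry was locked above, so unlockEntry cannot fail here.
		_ = sic.unlockEntry(name)
	}()

	if idx, found := sic.get(name); found {
		return idx, nil
	}

	idx, err := load()
	if err != nil {
		return onHeapTableIndex{}, err
	}

	sic.put(name, idx)
	return idx, nil
}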

// chunkSourcesByAscendingCount sorts chunkSources by their chunk counts,
// breaking ties by table name. Because sort.Interface methods cannot return
// errors, Less records the first failure in err; callers must check err
// after sorting. The methods use pointer receivers so the recorded error is
// visible to the caller rather than being written to a copy.
type chunkSourcesByAscendingCount struct {
	sources chunkSources
	err     error
}

func (csbc *chunkSourcesByAscendingCount) Len() int { return len(csbc.sources) }
func (csbc *chunkSourcesByAscendingCount) Less(i, j int) bool {
	srcI, srcJ := csbc.sources[i], csbc.sources[j]
	cntI, err := srcI.count()

	if err != nil {
		csbc.err = err
		return false
	}

	cntJ, err := srcJ.count()

	if err != nil {
		csbc.err = err
		return false
	}

	if cntI == cntJ {
		hi, err := srcI.hash()

		if err != nil {
			csbc.err = err
			return false
		}

		hj, err := srcJ.hash()

		if err != nil {
			csbc.err = err
			return false
		}

		return bytes.Compare(hi[:], hj[:]) < 0
	}

	return cntI < cntJ
}

func (csbc *chunkSourcesByAscendingCount) Swap(i, j int) {
	csbc.sources[i], csbc.sources[j] = csbc.sources[j], csbc.sources[i]
}
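
// sortSourcesByAscendingCount is a small sketch (not part of the original
// file) showing how these wrappers are meant to be driven: sort through a
// pointer so Less can record errors, then check err before using the result.
func sortSourcesByAscendingCount(sources chunkSources) (chunkSources, error) {
	csbc := chunkSourcesByAscendingCount{sources: sources}
	sort.Sort(&csbc)
	if csbc.err != nil {
		return nil, csbc.err
	}
	return csbc.sources, nil
}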

// chunkSourcesByDescendingDataSize sorts sourceWithSizes by the amount of
// chunk data they contain, largest first, breaking ties by table name. As
// with chunkSourcesByAscendingCount, Less records the first failure in err,
// so the methods use pointer receivers and callers must check err after
// sorting.
type chunkSourcesByDescendingDataSize struct {
	sws []sourceWithSize
	err error
}

func newChunkSourcesByDescendingDataSize(sws []sourceWithSize) chunkSourcesByDescendingDataSize {
	return chunkSourcesByDescendingDataSize{sws, nil}
}

func (csbds *chunkSourcesByDescendingDataSize) Len() int { return len(csbds.sws) }
func (csbds *chunkSourcesByDescendingDataSize) Less(i, j int) bool {
	swsI, swsJ := csbds.sws[i], csbds.sws[j]
	if swsI.dataLen == swsJ.dataLen {
		hi, err := swsI.source.hash()

		if err != nil {
			csbds.err = err
			return false
		}

		hj, err := swsJ.source.hash()

		if err != nil {
			csbds.err = err
			return false
		}

		return bytes.Compare(hi[:], hj[:]) < 0
	}
	return swsI.dataLen > swsJ.dataLen
}
func (csbds *chunkSourcesByDescendingDataSize) Swap(i, j int) {
	csbds.sws[i], csbds.sws[j] = csbds.sws[j], csbds.sws[i]
}

// sourceWithSize pairs a chunkSource with the number of bytes of chunk data
// it contains.
type sourceWithSize struct {
	source  chunkSource
	dataLen uint64
}

// compactionPlan describes how a set of chunkSources will be conjoined into
// a single new table: the order in which their chunk data will be
// concatenated, the merged index for the new table, and summary counts.
type compactionPlan struct {
	sources             chunkSourcesByDescendingDataSize
	mergedIndex         []byte
	chunkCount          uint32
	totalCompressedData uint64
}

// suffixes returns the suffixes block of the merged index, which follows the
// prefix tuples and lengths blocks.
func (cp compactionPlan) suffixes() []byte {
	suffixesStart := uint64(cp.chunkCount) * (prefixTupleSize + lengthSize)
	return cp.mergedIndex[suffixesStart : suffixesStart+uint64(cp.chunkCount)*addrSuffixSize]
}
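
// For orientation, mergedIndex as laid out by planConjoin below looks like
// this (a sketch inferred from suffixes() and the offset helpers used below;
// the size constants are the ones defined elsewhere in this package):
//
//	+----------------------------------------------+
//	| prefix tuples: chunkCount * prefixTupleSize  |
//	| lengths:       chunkCount * lengthSize       |
//	| suffixes:      chunkCount * addrSuffixSize   |
//	| footer:        footerSize                    |
//	+----------------------------------------------+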

func planConjoin(sources chunkSources, stats *Stats) (plan compactionPlan, err error) {
	var totalUncompressedData uint64
	for _, src := range sources {
		var uncmp uint64
		uncmp, err = src.uncompressedLen()

		if err != nil {
			return compactionPlan{}, err
		}

		totalUncompressedData += uncmp
		index, err := src.index()

		if err != nil {
			return compactionPlan{}, err
		}

		plan.chunkCount += index.ChunkCount()

		// Calculate the amount of chunk data in |src|.
		chunkDataLen := calcChunkDataLen(index)
		plan.sources.sws = append(plan.sources.sws, sourceWithSize{src, chunkDataLen})
		plan.totalCompressedData += chunkDataLen
	}
	sort.Sort(&plan.sources)

	if plan.sources.err != nil {
		return compactionPlan{}, plan.sources.err
	}

	lengthsPos := lengthsOffset(plan.chunkCount)
	suffixesPos := suffixesOffset(plan.chunkCount)
	plan.mergedIndex = make([]byte, indexSize(plan.chunkCount)+footerSize)

	prefixIndexRecs := make(prefixIndexSlice, 0, plan.chunkCount)
	var ordinalOffset uint32
	for _, sws := range plan.sources.sws {
		var index tableIndex
		index, err = sws.source.index()

		if err != nil {
			return compactionPlan{}, err
		}

		ordinals := index.Ordinals()
		prefixes := index.Prefixes()

		// Add all the prefix tuples from this index to prefixIndexRecs,
		// offsetting the ordinals so that all entries from the 1st item in
		// sources come after those in the 0th, and so on.
		for j, prefix := range prefixes {
			rec := prefixIndexRec{prefix: prefix, order: ordinalOffset + ordinals[j]}
			prefixIndexRecs = append(prefixIndexRecs, rec)
		}

		var cnt uint32
		cnt, err = sws.source.count()

		if err != nil {
			return compactionPlan{}, err
		}

		ordinalOffset += cnt

		if onHeap, ok := index.(onHeapTableIndex); ok {
			// TODO: copy the lengths and suffixes as a byte-copy from src BUG #3438
			// Bring over the lengths block, in order.
			for _, length := range onHeap.lengths {
				binary.BigEndian.PutUint32(plan.mergedIndex[lengthsPos:], length)
				lengthsPos += lengthSize
			}

			// Bring over the suffixes block, in order.
			n := copy(plan.mergedIndex[suffixesPos:], onHeap.suffixes)

			if n != len(onHeap.suffixes) {
				return compactionPlan{}, errors.New("failed to copy all data")
			}

			suffixesPos += uint64(n)
		} else {
			// Build up the index one entry at a time.
			var a addr
			for i := 0; i < len(ordinals); i++ {
				e := index.IndexEntry(uint32(i), &a)
				li := lengthsPos + lengthSize*uint64(ordinals[i])
				si := suffixesPos + addrSuffixSize*uint64(ordinals[i])
				binary.BigEndian.PutUint32(plan.mergedIndex[li:], e.Length())
				copy(plan.mergedIndex[si:], a[addrPrefixSize:])
			}
			lengthsPos += lengthSize * uint64(len(ordinals))
			suffixesPos += addrSuffixSize * uint64(len(ordinals))
		}
	}

	// Sort all prefix tuples by hash and then insert them starting at the
	// beginning of plan.mergedIndex.
	sort.Sort(prefixIndexRecs)
	var pfxPos uint64
	for _, pi := range prefixIndexRecs {
		binary.BigEndian.PutUint64(plan.mergedIndex[pfxPos:], pi.prefix)
		pfxPos += addrPrefixSize
		binary.BigEndian.PutUint32(plan.mergedIndex[pfxPos:], pi.order)
		pfxPos += ordinalSize
	}

	writeFooter(plan.mergedIndex[uint64(len(plan.mergedIndex))-footerSize:], plan.chunkCount, totalUncompressedData)

	stats.BytesPerConjoin.Sample(plan.totalCompressedData + uint64(len(plan.mergedIndex)))
	return plan, nil
}
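
// conjoinedName is a minimal sketch (not part of the original file) of how a
// tablePersister implementation might consume a compactionPlan: build the
// plan, then derive the new table's name from the sorted suffixes block via
// nameFromSuffixes. Writing the concatenated chunk data and mergedIndex to
// storage is elided.
func conjoinedName(sources chunkSources, stats *Stats) (addr, compactionPlan, error) {
	plan, err := planConjoin(sources, stats)
	if err != nil {
		return addr{}, compactionPlan{}, err
	}
	return nameFromSuffixes(plan.suffixes()), plan, nil
}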

// nameFromSuffixes returns a table name derived from the SHA-512 hash of
// |suffixes|, truncated to the length of an addr.
func nameFromSuffixes(suffixes []byte) (name addr) {
	sha := sha512.New()
	sha.Write(suffixes)

	var h []byte
	h = sha.Sum(h) // Appends the hash to h.
	copy(name[:], h)
	return
}

// calcChunkDataLen returns the number of bytes of chunk data in the table
// described by |index|: the total file size minus the index and footer.
func calcChunkDataLen(index tableIndex) uint64 {
	return index.TableFileSize() - indexSize(index.ChunkCount()) - footerSize
}