github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/table_persister.go (about)

     1  // Copyright 2019 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  //
    15  // This file incorporates work covered by the following copyright and
    16  // permission notice:
    17  //
    18  // Copyright 2016 Attic Labs, Inc. All rights reserved.
    19  // Licensed under the Apache License, version 2.0:
    20  // http://www.apache.org/licenses/LICENSE-2.0
    21  
    22  package nbs
    23  
    24  import (
    25  	"bytes"
    26  	"context"
    27  	"crypto/sha512"
    28  	"encoding/binary"
    29  	"errors"
    30  	"io"
    31  	"sort"
    32  	"time"
    33  
    34  	"github.com/dolthub/dolt/go/store/chunks"
    35  	"github.com/dolthub/dolt/go/store/hash"
    36  )
    37  
// errCacheMiss is a sentinel error carrying the message "index cache miss".
// NOTE(review): no use of it is visible in this part of the file; presumably
// it is returned when a table index lookup misses a cache — confirm against
// callers.
var errCacheMiss = errors.New("index cache miss")

// cleanupFunc is a no-argument callback. tablePersister.ConjoinAll returns one
// so that callers can release resources associated with the conjoined sources
// once those sources are no longer needed.
type cleanupFunc func()
    41  
// tablePersister allows interaction with persistent storage. It provides
// primitives for pushing the contents of a memTable to persistent storage,
// opening persistent tables for reading, and conjoining a number of existing
// chunkSources into one. A tablePersister implementation must be goroutine-
// safe.
type tablePersister interface {
	// Persist makes the contents of mt durable. Chunks already present in
	// |haver| may be dropped in the process.
	Persist(ctx context.Context, mt *memTable, haver chunkReader, stats *Stats) (chunkSource, error)

	// ConjoinAll conjoins all chunks in |sources| into a single, new
	// chunkSource. It returns a |cleanupFunc| which can be called to
	// potentially release resources associated with the |sources| once
	// they are no longer needed.
	ConjoinAll(ctx context.Context, sources chunkSources, stats *Stats) (chunkSource, cleanupFunc, error)

	// Open a table named |name|, containing |chunkCount| chunks.
	Open(ctx context.Context, name hash.Hash, chunkCount uint32, stats *Stats) (chunkSource, error)

	// Exists checks if a table named |name| exists.
	Exists(ctx context.Context, name hash.Hash, chunkCount uint32, stats *Stats) (bool, error)

	// PruneTableFiles deletes table files which the persister would normally be responsible for and
	// which are not in the included |keeper| set and have not been written or modified more recently
	// than the provided |mtime|.
	PruneTableFiles(ctx context.Context, keeper func() []hash.Hash, mtime time.Time) error

	// AccessMode reports the chunks.ExclusiveAccessMode of this persister.
	// NOTE(review): the meaning of the individual modes is defined in the
	// chunks package and is not visible here.
	AccessMode() chunks.ExclusiveAccessMode

	// Close is provided by the embedded io.Closer.
	io.Closer
}
    73  
// tableFilePersister is a tablePersister that can additionally copy whole
// table files into the store and report a file system path.
type tableFilePersister interface {
	tablePersister

	// CopyTableFile copies the table file with the given fileId from the reader to the TableFileStore.
	CopyTableFile(ctx context.Context, r io.Reader, fileId string, fileSz uint64, chunkCount uint32) error

	// Path returns the file system path. Use CopyTableFile instead of Path to
	// copy a file to the TableFileStore. Path cannot be removed because it's used
	// in remotesrv.
	Path() string
}
    85  
// movingTableFilePersister is implemented by persisters that offer
// TryMoveCmpChunkTableWriter.
// NOTE(review): no implementation is visible in this file; presumably this is
// an optional capability that lets an already-written table file be moved into
// place instead of copied — confirm against implementers.
type movingTableFilePersister interface {
	// TryMoveCmpChunkTableWriter takes a |filename| and the CmpChunkTableWriter
	// |w| and returns an error on failure.
	// NOTE(review): presumably moves the file produced by |w| into the store
	// under |filename| — confirm.
	TryMoveCmpChunkTableWriter(ctx context.Context, filename string, w *CmpChunkTableWriter) error
}
    89  
    90  type chunkSourcesByDescendingDataSize struct {
    91  	sws []sourceWithSize
    92  }
    93  
    94  func (csbds chunkSourcesByDescendingDataSize) Len() int { return len(csbds.sws) }
    95  func (csbds chunkSourcesByDescendingDataSize) Less(i, j int) bool {
    96  	swsI, swsJ := csbds.sws[i], csbds.sws[j]
    97  	if swsI.dataLen == swsJ.dataLen {
    98  		hi := swsI.source.hash()
    99  		hj := swsJ.source.hash()
   100  		return bytes.Compare(hi[:], hj[:]) < 0
   101  	}
   102  	return swsI.dataLen > swsJ.dataLen
   103  }
   104  func (csbds chunkSourcesByDescendingDataSize) Swap(i, j int) {
   105  	csbds.sws[i], csbds.sws[j] = csbds.sws[j], csbds.sws[i]
   106  }
   107  
// sourceWithSize pairs a chunkSource with the size of its chunk data, which
// is the key chunkSourcesByDescendingDataSize sorts on.
type sourceWithSize struct {
	// source is the chunkSource being sized.
	source chunkSource
	// dataLen is the amount of chunk data in source; planRangeCopyConjoin
	// fills it in from calcChunkRangeSize.
	dataLen uint64
}
   112  
   113  type compactionPlan struct {
   114  	sources             chunkSourcesByDescendingDataSize
   115  	mergedIndex         []byte
   116  	chunkCount          uint32
   117  	totalCompressedData uint64
   118  }
   119  
   120  func (cp compactionPlan) suffixes() []byte {
   121  	suffixesStart := uint64(cp.chunkCount) * (prefixTupleSize + lengthSize)
   122  	return cp.mergedIndex[suffixesStart : suffixesStart+uint64(cp.chunkCount)*hash.SuffixLen]
   123  }
   124  
   125  // planRangeCopyConjoin computes a conjoin plan for tablePersisters that can conjoin
   126  // chunkSources using range copies (copy only chunk records, not chunk indexes).
   127  func planRangeCopyConjoin(sources chunkSources, stats *Stats) (compactionPlan, error) {
   128  	var sized []sourceWithSize
   129  	for _, src := range sources {
   130  		index, err := src.index()
   131  		if err != nil {
   132  			return compactionPlan{}, err
   133  		}
   134  		// Calculate the amount of chunk data in |src|
   135  		sized = append(sized, sourceWithSize{src, calcChunkRangeSize(index)})
   136  	}
   137  	return planConjoin(sized, stats)
   138  }
   139  
   140  // calcChunkRangeSize computes the size of the chunk records for a table file.
   141  func calcChunkRangeSize(index tableIndex) uint64 {
   142  	return index.tableFileSize() - indexSize(index.chunkCount()) - footerSize
   143  }
   144  
   145  func planConjoin(sources []sourceWithSize, stats *Stats) (plan compactionPlan, err error) {
   146  	// place largest chunk sources at the beginning of the conjoin
   147  	plan.sources = chunkSourcesByDescendingDataSize{sws: sources}
   148  	sort.Sort(plan.sources)
   149  
   150  	var totalUncompressedData uint64
   151  	for _, s := range sources {
   152  		var uncmp uint64
   153  		if uncmp, err = s.source.uncompressedLen(); err != nil {
   154  			return compactionPlan{}, err
   155  		}
   156  		totalUncompressedData += uncmp
   157  
   158  		index, err := s.source.index()
   159  		if err != nil {
   160  			return compactionPlan{}, err
   161  		}
   162  		// Calculate the amount of chunk data in |src|
   163  		plan.totalCompressedData += s.dataLen
   164  		plan.chunkCount += index.chunkCount()
   165  	}
   166  
   167  	lengthsPos := lengthsOffset(plan.chunkCount)
   168  	suffixesPos := suffixesOffset(plan.chunkCount)
   169  	plan.mergedIndex = make([]byte, indexSize(plan.chunkCount)+footerSize)
   170  
   171  	prefixIndexRecs := make(prefixIndexSlice, 0, plan.chunkCount)
   172  	var ordinalOffset uint32
   173  	for _, sws := range plan.sources.sws {
   174  		var index tableIndex
   175  		index, err = sws.source.index()
   176  
   177  		if err != nil {
   178  			return compactionPlan{}, err
   179  		}
   180  
   181  		ordinals, err := index.ordinals()
   182  		if err != nil {
   183  			return compactionPlan{}, err
   184  		}
   185  		prefixes, err := index.prefixes()
   186  		if err != nil {
   187  			return compactionPlan{}, err
   188  		}
   189  
   190  		// Add all the prefix tuples from this index to the list of all prefixIndexRecs, modifying the ordinals such that all entries from the 1st item in sources come after those in the 0th and so on.
   191  		for j, prefix := range prefixes {
   192  			rec := prefixIndexRec{order: ordinalOffset + ordinals[j]}
   193  			binary.BigEndian.PutUint64(rec.addr[:], prefix)
   194  			prefixIndexRecs = append(prefixIndexRecs, rec)
   195  		}
   196  
   197  		var cnt uint32
   198  		cnt, err = sws.source.count()
   199  
   200  		if err != nil {
   201  			return compactionPlan{}, err
   202  		}
   203  
   204  		ordinalOffset += cnt
   205  
   206  		if onHeap, ok := index.(onHeapTableIndex); ok {
   207  			// TODO: copy the lengths and suffixes as a byte-copy from src BUG #3438
   208  			// Bring over the lengths block, in order
   209  			for ord := uint32(0); ord < onHeap.chunkCount(); ord++ {
   210  				e := onHeap.getIndexEntry(ord)
   211  				binary.BigEndian.PutUint32(plan.mergedIndex[lengthsPos:], e.Length())
   212  				lengthsPos += lengthSize
   213  			}
   214  
   215  			// Bring over the suffixes block, in order
   216  			n := copy(plan.mergedIndex[suffixesPos:], onHeap.suffixes)
   217  
   218  			if n != len(onHeap.suffixes) {
   219  				return compactionPlan{}, errors.New("failed to copy all data")
   220  			}
   221  
   222  			suffixesPos += uint64(n)
   223  		} else {
   224  			// Build up the index one entry at a time.
   225  			var h hash.Hash
   226  			for i := 0; i < len(ordinals); i++ {
   227  				e, err := index.indexEntry(uint32(i), &h)
   228  				if err != nil {
   229  					return compactionPlan{}, err
   230  				}
   231  				li := lengthsPos + lengthSize*uint64(ordinals[i])
   232  				si := suffixesPos + hash.SuffixLen*uint64(ordinals[i])
   233  				binary.BigEndian.PutUint32(plan.mergedIndex[li:], e.Length())
   234  				copy(plan.mergedIndex[si:], h[hash.PrefixLen:])
   235  			}
   236  			lengthsPos += lengthSize * uint64(len(ordinals))
   237  			suffixesPos += hash.SuffixLen * uint64(len(ordinals))
   238  		}
   239  	}
   240  
   241  	// Sort all prefixTuples by hash and then insert them starting at the beginning of plan.mergedIndex
   242  	sort.Sort(prefixIndexRecs)
   243  	var pfxPos uint64
   244  	for _, pi := range prefixIndexRecs {
   245  		binary.BigEndian.PutUint64(plan.mergedIndex[pfxPos:], pi.addr.Prefix())
   246  		pfxPos += hash.PrefixLen
   247  		binary.BigEndian.PutUint32(plan.mergedIndex[pfxPos:], pi.order)
   248  		pfxPos += ordinalSize
   249  	}
   250  
   251  	writeFooter(plan.mergedIndex[uint64(len(plan.mergedIndex))-footerSize:], plan.chunkCount, totalUncompressedData)
   252  
   253  	stats.BytesPerConjoin.Sample(uint64(plan.totalCompressedData) + uint64(len(plan.mergedIndex)))
   254  	return plan, nil
   255  }
   256  
   257  func nameFromSuffixes(suffixes []byte) (name hash.Hash) {
   258  	sha := sha512.New()
   259  	sha.Write(suffixes)
   260  
   261  	var h []byte
   262  	h = sha.Sum(h) // Appends hash to h
   263  	return hash.New(h[:hash.ByteLen])
   264  }