github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/table_persister.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
//     Copyright 2016 Attic Labs, Inc. All rights reserved.
//     Licensed under the Apache License, version 2.0:
//     http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"context"
	"crypto/sha512"
	"encoding/binary"
	"errors"
	"io"
	"sort"
	"time"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
)

var errCacheMiss = errors.New("index cache miss")

type cleanupFunc func()

// tablePersister allows interaction with persistent storage. It provides
// primitives for pushing the contents of a memTable to persistent storage,
// opening persistent tables for reading, and conjoining a number of existing
// chunkSources into one. A tablePersister implementation must be
// goroutine-safe.
type tablePersister interface {
	// Persist makes the contents of |mt| durable. Chunks already present in
	// |haver| may be dropped in the process.
	Persist(ctx context.Context, mt *memTable, haver chunkReader, stats *Stats) (chunkSource, error)

	// ConjoinAll conjoins all chunks in |sources| into a single, new
	// chunkSource. It returns a |cleanupFunc| which can be called to
	// potentially release resources associated with the |sources| once
	// they are no longer needed.
	ConjoinAll(ctx context.Context, sources chunkSources, stats *Stats) (chunkSource, cleanupFunc, error)

	// Open a table named |name|, containing |chunkCount| chunks.
	Open(ctx context.Context, name hash.Hash, chunkCount uint32, stats *Stats) (chunkSource, error)

	// Exists checks if a table named |name| exists.
	Exists(ctx context.Context, name hash.Hash, chunkCount uint32, stats *Stats) (bool, error)

	// PruneTableFiles deletes table files which the persister would normally
	// be responsible for, which are not in the |keeper| set, and which have
	// not been written or modified more recently than the provided |mtime|.
	PruneTableFiles(ctx context.Context, keeper func() []hash.Hash, mtime time.Time) error

	AccessMode() chunks.ExclusiveAccessMode

	io.Closer
}
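// persistAndConjoin is a minimal sketch of how a caller might drive a
// tablePersister: persist a memTable, then conjoin the resulting chunkSource
// with some existing sources. The nil |haver| and the error handling here are
// assumptions for illustration; the real call sites live elsewhere in this
// package.
func persistAndConjoin(ctx context.Context, tp tablePersister, mt *memTable, existing chunkSources, stats *Stats) (chunkSource, cleanupFunc, error) {
	// Persist every chunk in |mt|; a non-nil |haver| would let the persister
	// drop chunks that are already durable.
	src, err := tp.Persist(ctx, mt, nil, stats)
	if err != nil {
		return nil, nil, err
	}
	return tp.ConjoinAll(ctx, append(existing, src), stats)
}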
type tableFilePersister interface {
	tablePersister

	// CopyTableFile copies the table file with the given |fileId| from |r|
	// to the TableFileStore.
	CopyTableFile(ctx context.Context, r io.Reader, fileId string, fileSz uint64, chunkCount uint32) error

	// Path returns the file system path. Use CopyTableFile instead of Path
	// to copy a file to the TableFileStore. Path cannot be removed because
	// it's used in remotesrv.
	Path() string
}

// movingTableFilePersister is implemented by persisters that can adopt a
// fully written table file by moving it into place.
type movingTableFilePersister interface {
	TryMoveCmpChunkTableWriter(ctx context.Context, filename string, w *CmpChunkTableWriter) error
}

type chunkSourcesByDescendingDataSize struct {
	sws []sourceWithSize
}

func (csbds chunkSourcesByDescendingDataSize) Len() int { return len(csbds.sws) }
func (csbds chunkSourcesByDescendingDataSize) Less(i, j int) bool {
	swsI, swsJ := csbds.sws[i], csbds.sws[j]
	if swsI.dataLen == swsJ.dataLen {
		hi := swsI.source.hash()
		hj := swsJ.source.hash()
		return bytes.Compare(hi[:], hj[:]) < 0
	}
	return swsI.dataLen > swsJ.dataLen
}
func (csbds chunkSourcesByDescendingDataSize) Swap(i, j int) {
	csbds.sws[i], csbds.sws[j] = csbds.sws[j], csbds.sws[i]
}

type sourceWithSize struct {
	source  chunkSource
	dataLen uint64
}

// compactionPlan describes how a set of chunkSources will be conjoined into a
// single table: the sources in descending data-size order, the pre-built
// merged index, and summary counts.
type compactionPlan struct {
	sources             chunkSourcesByDescendingDataSize
	mergedIndex         []byte
	chunkCount          uint32
	totalCompressedData uint64
}

func (cp compactionPlan) suffixes() []byte {
	suffixesStart := uint64(cp.chunkCount) * (prefixTupleSize + lengthSize)
	return cp.mergedIndex[suffixesStart : suffixesStart+uint64(cp.chunkCount)*hash.SuffixLen]
}

// planRangeCopyConjoin computes a conjoin plan for tablePersisters that can
// conjoin chunkSources using range copies (copying only chunk records, not
// chunk indexes).
func planRangeCopyConjoin(sources chunkSources, stats *Stats) (compactionPlan, error) {
	var sized []sourceWithSize
	for _, src := range sources {
		index, err := src.index()
		if err != nil {
			return compactionPlan{}, err
		}
		// Calculate the amount of chunk data in |src|.
		sized = append(sized, sourceWithSize{src, calcChunkRangeSize(index)})
	}
	return planConjoin(sized, stats)
}
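// indexLayoutSketch is a minimal illustrative sketch of the index layout
// assumed throughout this file: |chunkCount| prefix tuples (an 8-byte hash
// prefix plus a 4-byte ordinal), then |chunkCount| 4-byte big-endian chunk
// lengths, then |chunkCount| hash suffixes of hash.SuffixLen bytes each,
// followed by the footer. It recomputes the same section offsets as
// lengthsOffset and suffixesOffset (defined elsewhere in this package) and
// exists purely as illustration.
func indexLayoutSketch(chunkCount uint32) (lengthsStart, suffixesStart, indexEnd uint64) {
	lengthsStart = uint64(chunkCount) * prefixTupleSize
	suffixesStart = lengthsStart + uint64(chunkCount)*lengthSize
	indexEnd = suffixesStart + uint64(chunkCount)*hash.SuffixLen
	return lengthsStart, suffixesStart, indexEnd
}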
// calcChunkRangeSize computes the size of the chunk records for a table file,
// i.e., the file size minus the index and footer.
func calcChunkRangeSize(index tableIndex) uint64 {
	return index.tableFileSize() - indexSize(index.chunkCount()) - footerSize
}

func planConjoin(sources []sourceWithSize, stats *Stats) (plan compactionPlan, err error) {
	// Place the largest chunk sources at the beginning of the conjoin.
	plan.sources = chunkSourcesByDescendingDataSize{sws: sources}
	sort.Sort(plan.sources)

	var totalUncompressedData uint64
	for _, s := range sources {
		var uncmp uint64
		if uncmp, err = s.source.uncompressedLen(); err != nil {
			return compactionPlan{}, err
		}
		totalUncompressedData += uncmp

		index, err := s.source.index()
		if err != nil {
			return compactionPlan{}, err
		}
		// Accumulate the chunk data size and chunk count contributed by |s|.
		plan.totalCompressedData += s.dataLen
		plan.chunkCount += index.chunkCount()
	}

	lengthsPos := lengthsOffset(plan.chunkCount)
	suffixesPos := suffixesOffset(plan.chunkCount)
	plan.mergedIndex = make([]byte, indexSize(plan.chunkCount)+footerSize)

	prefixIndexRecs := make(prefixIndexSlice, 0, plan.chunkCount)
	var ordinalOffset uint32
	for _, sws := range plan.sources.sws {
		var index tableIndex
		index, err = sws.source.index()
		if err != nil {
			return compactionPlan{}, err
		}

		ordinals, err := index.ordinals()
		if err != nil {
			return compactionPlan{}, err
		}
		prefixes, err := index.prefixes()
		if err != nil {
			return compactionPlan{}, err
		}

		// Add all the prefix tuples from this index to prefixIndexRecs,
		// offsetting the ordinals so that all entries from the 1st item in
		// sources come after those from the 0th, and so on.
		for j, prefix := range prefixes {
			rec := prefixIndexRec{order: ordinalOffset + ordinals[j]}
			binary.BigEndian.PutUint64(rec.addr[:], prefix)
			prefixIndexRecs = append(prefixIndexRecs, rec)
		}

		var cnt uint32
		cnt, err = sws.source.count()
		if err != nil {
			return compactionPlan{}, err
		}
		ordinalOffset += cnt

		if onHeap, ok := index.(onHeapTableIndex); ok {
			// TODO: copy the lengths and suffixes as a byte-copy from src BUG #3438
			// Bring over the lengths block, in order.
			for ord := uint32(0); ord < onHeap.chunkCount(); ord++ {
				e := onHeap.getIndexEntry(ord)
				binary.BigEndian.PutUint32(plan.mergedIndex[lengthsPos:], e.Length())
				lengthsPos += lengthSize
			}

			// Bring over the suffixes block, in order.
			n := copy(plan.mergedIndex[suffixesPos:], onHeap.suffixes)
			if n != len(onHeap.suffixes) {
				return compactionPlan{}, errors.New("failed to copy all data")
			}
			suffixesPos += uint64(n)
		} else {
			// Build up the index one entry at a time.
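			// Each indexEntry call fills |h| with the entry's full hash; the
			// entry's 4-byte length and the hash's suffix are then written at
			// the slots dictated by the entry's ordinal, keeping both blocks
			// in ordinal order to match the prefix tuples sorted below.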
			var h hash.Hash
			for i := 0; i < len(ordinals); i++ {
				e, err := index.indexEntry(uint32(i), &h)
				if err != nil {
					return compactionPlan{}, err
				}
				li := lengthsPos + lengthSize*uint64(ordinals[i])
				si := suffixesPos + hash.SuffixLen*uint64(ordinals[i])
				binary.BigEndian.PutUint32(plan.mergedIndex[li:], e.Length())
				copy(plan.mergedIndex[si:], h[hash.PrefixLen:])
			}
			lengthsPos += lengthSize * uint64(len(ordinals))
			suffixesPos += hash.SuffixLen * uint64(len(ordinals))
		}
	}

	// Sort all prefix tuples by hash, then write them starting at the
	// beginning of plan.mergedIndex.
	sort.Sort(prefixIndexRecs)
	var pfxPos uint64
	for _, pi := range prefixIndexRecs {
		binary.BigEndian.PutUint64(plan.mergedIndex[pfxPos:], pi.addr.Prefix())
		pfxPos += hash.PrefixLen
		binary.BigEndian.PutUint32(plan.mergedIndex[pfxPos:], pi.order)
		pfxPos += ordinalSize
	}

	writeFooter(plan.mergedIndex[uint64(len(plan.mergedIndex))-footerSize:], plan.chunkCount, totalUncompressedData)

	stats.BytesPerConjoin.Sample(plan.totalCompressedData + uint64(len(plan.mergedIndex)))
	return plan, nil
}

// nameFromSuffixes derives a table file's name from its suffixes block: the
// SHA-512 digest of the suffixes, truncated to hash.ByteLen bytes.
func nameFromSuffixes(suffixes []byte) (name hash.Hash) {
	sha := sha512.New()
	sha.Write(suffixes)

	h := sha.Sum(nil)
	return hash.New(h[:hash.ByteLen])
}
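// conjoinSketch is a minimal illustrative sketch of how the pieces above fit
// together, under the assumption that a range-copying caller plans the
// conjoin, names the new table from the merged suffixes, and then copies each
// source's chunk-record range followed by plan.mergedIndex into a file with
// that name. The real consumers are the persister implementations.
func conjoinSketch(sources chunkSources, stats *Stats) (hash.Hash, compactionPlan, error) {
	plan, err := planRangeCopyConjoin(sources, stats)
	if err != nil {
		return hash.Hash{}, compactionPlan{}, err
	}
	// The new table's name is a function of its suffixes block alone.
	return nameFromSuffixes(plan.suffixes()), plan, nil
}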