github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/table_persister.go (about) 1 // Copyright 2019 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // This file incorporates work covered by the following copyright and 16 // permission notice: 17 // 18 // Copyright 2016 Attic Labs, Inc. All rights reserved. 19 // Licensed under the Apache License, version 2.0: 20 // http://www.apache.org/licenses/LICENSE-2.0 21 22 package nbs 23 24 import ( 25 "bytes" 26 "context" 27 "crypto/sha512" 28 "encoding/binary" 29 "errors" 30 "fmt" 31 "sort" 32 "sync" 33 34 "github.com/dolthub/dolt/go/store/util/sizecache" 35 ) 36 37 // tablePersister allows interaction with persistent storage. It provides 38 // primitives for pushing the contents of a memTable to persistent storage, 39 // opening persistent tables for reading, and conjoining a number of existing 40 // chunkSources into one. A tablePersister implementation must be goroutine- 41 // safe. 42 type tablePersister interface { 43 // Persist makes the contents of mt durable. Chunks already present in 44 // |haver| may be dropped in the process. 45 Persist(ctx context.Context, mt *memTable, haver chunkReader, stats *Stats) (chunkSource, error) 46 47 // ConjoinAll conjoins all chunks in |sources| into a single, new 48 // chunkSource. 49 ConjoinAll(ctx context.Context, sources chunkSources, stats *Stats) (chunkSource, error) 50 51 // Open a table named |name|, containing |chunkCount| chunks. 52 Open(ctx context.Context, name addr, chunkCount uint32, stats *Stats) (chunkSource, error) 53 54 // PruneTableFiles deletes old table files that are no longer referenced in the manifest. 55 PruneTableFiles(ctx context.Context, contents manifestContents) error 56 } 57 58 // indexCache provides sized storage for table indices. While getting and/or 59 // setting the cache entry for a given table name, the caller MUST hold the 60 // lock that for that entry. 61 type indexCache struct { 62 cache *sizecache.SizeCache 63 cond *sync.Cond 64 locked map[addr]struct{} 65 } 66 67 // Returns an indexCache which will burn roughly |size| bytes of memory. 68 func newIndexCache(size uint64) *indexCache { 69 return &indexCache{sizecache.New(size), sync.NewCond(&sync.Mutex{}), map[addr]struct{}{}} 70 } 71 72 // Take an exclusive lock on the cache entry for |name|. Callers must do this 73 // before calling get(addr) or put(addr, index) 74 func (sic *indexCache) lockEntry(name addr) { 75 sic.cond.L.Lock() 76 defer sic.cond.L.Unlock() 77 78 for { 79 if _, present := sic.locked[name]; !present { 80 sic.locked[name] = struct{}{} 81 break 82 } 83 sic.cond.Wait() 84 } 85 } 86 87 func (sic *indexCache) unlockEntry(name addr) error { 88 sic.cond.L.Lock() 89 defer sic.cond.L.Unlock() 90 91 _, ok := sic.locked[name] 92 93 if !ok { 94 return fmt.Errorf("failed to unlock %s", name.String()) 95 } 96 97 delete(sic.locked, name) 98 99 sic.cond.Broadcast() 100 101 return nil 102 } 103 104 func (sic *indexCache) get(name addr) (onHeapTableIndex, bool) { 105 if idx, found := sic.cache.Get(name); found { 106 return idx.(onHeapTableIndex), true 107 } 108 return onHeapTableIndex{}, false 109 } 110 111 func (sic *indexCache) put(name addr, idx onHeapTableIndex) { 112 indexSize := uint64(idx.chunkCount) * (addrSize + ordinalSize + lengthSize + uint64Size) 113 sic.cache.Add(name, indexSize, idx) 114 } 115 116 type chunkSourcesByAscendingCount struct { 117 sources chunkSources 118 err error 119 } 120 121 func (csbc chunkSourcesByAscendingCount) Len() int { return len(csbc.sources) } 122 func (csbc chunkSourcesByAscendingCount) Less(i, j int) bool { 123 srcI, srcJ := csbc.sources[i], csbc.sources[j] 124 cntI, err := srcI.count() 125 126 if err != nil { 127 csbc.err = err 128 return false 129 } 130 131 cntJ, err := srcJ.count() 132 133 if err != nil { 134 csbc.err = err 135 return false 136 } 137 138 if cntI == cntJ { 139 hi, err := srcI.hash() 140 141 if err != nil { 142 csbc.err = err 143 return false 144 } 145 146 hj, err := srcJ.hash() 147 148 if err != nil { 149 csbc.err = err 150 return false 151 } 152 153 return bytes.Compare(hi[:], hj[:]) < 0 154 } 155 156 return cntI < cntJ 157 } 158 159 func (csbc chunkSourcesByAscendingCount) Swap(i, j int) { 160 csbc.sources[i], csbc.sources[j] = csbc.sources[j], csbc.sources[i] 161 } 162 163 type chunkSourcesByDescendingDataSize struct { 164 sws []sourceWithSize 165 err error 166 } 167 168 func newChunkSourcesByDescendingDataSize(sws []sourceWithSize) chunkSourcesByDescendingDataSize { 169 return chunkSourcesByDescendingDataSize{sws, nil} 170 } 171 172 func (csbds chunkSourcesByDescendingDataSize) Len() int { return len(csbds.sws) } 173 func (csbds chunkSourcesByDescendingDataSize) Less(i, j int) bool { 174 swsI, swsJ := csbds.sws[i], csbds.sws[j] 175 if swsI.dataLen == swsJ.dataLen { 176 hi, err := swsI.source.hash() 177 178 if err != nil { 179 csbds.err = err 180 return false 181 } 182 183 hj, err := swsJ.source.hash() 184 185 if err != nil { 186 csbds.err = err 187 return false 188 } 189 190 return bytes.Compare(hi[:], hj[:]) < 0 191 } 192 return swsI.dataLen > swsJ.dataLen 193 } 194 func (csbds chunkSourcesByDescendingDataSize) Swap(i, j int) { 195 csbds.sws[i], csbds.sws[j] = csbds.sws[j], csbds.sws[i] 196 } 197 198 type sourceWithSize struct { 199 source chunkSource 200 dataLen uint64 201 } 202 203 type compactionPlan struct { 204 sources chunkSourcesByDescendingDataSize 205 mergedIndex []byte 206 chunkCount uint32 207 totalCompressedData uint64 208 } 209 210 func (cp compactionPlan) suffixes() []byte { 211 suffixesStart := uint64(cp.chunkCount) * (prefixTupleSize + lengthSize) 212 return cp.mergedIndex[suffixesStart : suffixesStart+uint64(cp.chunkCount)*addrSuffixSize] 213 } 214 215 func planConjoin(sources chunkSources, stats *Stats) (plan compactionPlan, err error) { 216 var totalUncompressedData uint64 217 for _, src := range sources { 218 var uncmp uint64 219 uncmp, err = src.uncompressedLen() 220 221 if err != nil { 222 return compactionPlan{}, err 223 } 224 225 totalUncompressedData += uncmp 226 index, err := src.index() 227 228 if err != nil { 229 return compactionPlan{}, err 230 } 231 232 plan.chunkCount += index.ChunkCount() 233 234 // Calculate the amount of chunk data in |src| 235 chunkDataLen := calcChunkDataLen(index) 236 plan.sources.sws = append(plan.sources.sws, sourceWithSize{src, chunkDataLen}) 237 plan.totalCompressedData += chunkDataLen 238 } 239 sort.Sort(plan.sources) 240 241 if plan.sources.err != nil { 242 return compactionPlan{}, plan.sources.err 243 } 244 245 lengthsPos := lengthsOffset(plan.chunkCount) 246 suffixesPos := suffixesOffset(plan.chunkCount) 247 plan.mergedIndex = make([]byte, indexSize(plan.chunkCount)+footerSize) 248 249 prefixIndexRecs := make(prefixIndexSlice, 0, plan.chunkCount) 250 var ordinalOffset uint32 251 for _, sws := range plan.sources.sws { 252 var index tableIndex 253 index, err = sws.source.index() 254 255 if err != nil { 256 return compactionPlan{}, err 257 } 258 259 ordinals := index.Ordinals() 260 prefixes := index.Prefixes() 261 262 // Add all the prefix tuples from this index to the list of all prefixIndexRecs, modifying the ordinals such that all entries from the 1st item in sources come after those in the 0th and so on. 263 for j, prefix := range prefixes { 264 rec := prefixIndexRec{prefix: prefix, order: ordinalOffset + ordinals[j]} 265 prefixIndexRecs = append(prefixIndexRecs, rec) 266 } 267 268 var cnt uint32 269 cnt, err = sws.source.count() 270 271 if err != nil { 272 return compactionPlan{}, err 273 } 274 275 ordinalOffset += cnt 276 277 if onHeap, ok := index.(onHeapTableIndex); ok { 278 // TODO: copy the lengths and suffixes as a byte-copy from src BUG #3438 279 // Bring over the lengths block, in order 280 for _, length := range onHeap.lengths { 281 binary.BigEndian.PutUint32(plan.mergedIndex[lengthsPos:], length) 282 lengthsPos += lengthSize 283 } 284 285 // Bring over the suffixes block, in order 286 n := copy(plan.mergedIndex[suffixesPos:], onHeap.suffixes) 287 288 if n != len(onHeap.suffixes) { 289 return compactionPlan{}, errors.New("failed to copy all data") 290 } 291 292 suffixesPos += uint64(n) 293 } else { 294 // Build up the index one entry at a time. 295 var a addr 296 for i := 0; i < len(ordinals); i++ { 297 e := index.IndexEntry(uint32(i), &a) 298 li := lengthsPos + lengthSize*uint64(ordinals[i]) 299 si := suffixesPos + addrSuffixSize*uint64(ordinals[i]) 300 binary.BigEndian.PutUint32(plan.mergedIndex[li:], e.Length()) 301 copy(plan.mergedIndex[si:], a[addrPrefixSize:]) 302 } 303 lengthsPos += lengthSize * uint64(len(ordinals)) 304 suffixesPos += addrSuffixSize * uint64(len(ordinals)) 305 } 306 } 307 308 // Sort all prefixTuples by hash and then insert them starting at the beginning of plan.mergedIndex 309 sort.Sort(prefixIndexRecs) 310 var pfxPos uint64 311 for _, pi := range prefixIndexRecs { 312 binary.BigEndian.PutUint64(plan.mergedIndex[pfxPos:], pi.prefix) 313 pfxPos += addrPrefixSize 314 binary.BigEndian.PutUint32(plan.mergedIndex[pfxPos:], pi.order) 315 pfxPos += ordinalSize 316 } 317 318 writeFooter(plan.mergedIndex[uint64(len(plan.mergedIndex))-footerSize:], plan.chunkCount, totalUncompressedData) 319 320 stats.BytesPerConjoin.Sample(uint64(plan.totalCompressedData) + uint64(len(plan.mergedIndex))) 321 return plan, nil 322 } 323 324 func nameFromSuffixes(suffixes []byte) (name addr) { 325 sha := sha512.New() 326 sha.Write(suffixes) 327 328 var h []byte 329 h = sha.Sum(h) // Appends hash to h 330 copy(name[:], h) 331 return 332 } 333 334 func calcChunkDataLen(index tableIndex) uint64 { 335 return index.TableFileSize() - indexSize(index.ChunkCount()) - footerSize 336 }