github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/table_index.go

// Copyright 2022 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package nbs

import (
	"bytes"
	"context"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"runtime"
	"runtime/debug"
	"sync/atomic"

	"github.com/dolthub/dolt/go/libraries/utils/iohelp"
	"github.com/dolthub/dolt/go/store/hash"
)

var (
	ErrWrongBufferSize = errors.New("buffer length and/or capacity incorrect for chunkCount specified in footer")
	ErrWrongCopySize   = errors.New("could not copy enough bytes")
)

// By setting this to false, you can make table file index creation cheaper.
// In exchange, the panics raised for leaked table file indexes do not come
// with as much information.
var TableIndexGCFinalizerWithStackTrace = true

type tableIndex interface {
	// entrySuffixMatches returns true if the entry at index |idx| matches
	// the suffix of the address |h|. Used by |lookup| after finding
	// matching indexes based on |prefixes|.
	entrySuffixMatches(idx uint32, h *hash.Hash) (bool, error)

	// indexEntry returns the |indexEntry| at |idx|. Optionally puts the
	// full address of that entry in |a| if |a| is not |nil|.
	indexEntry(idx uint32, a *hash.Hash) (indexEntry, error)

	// lookup returns an |indexEntry| for the chunk corresponding to the
	// provided address |h|. The second return value is |true| if an entry
	// exists and |false| otherwise.
	lookup(h *hash.Hash) (indexEntry, bool, error)

	// ordinals returns a slice with one element per index entry: the |i|th
	// element is the ordinal (position within the indexed file) of the
	// chunk described by the |i|th (prefix-sorted) index entry.
	ordinals() ([]uint32, error)

	// prefixes returns the sorted slice of |uint64| |addr| prefixes; each
	// entry corresponds to an indexed chunk address.
	prefixes() ([]uint64, error)

	// chunkCount returns the total number of chunks in the indexed file.
	chunkCount() uint32

	// tableFileSize returns the total size of the indexed table file, in bytes.
	tableFileSize() uint64

	// totalUncompressedData returns the total uncompressed data size of
	// the table file. Used for informational statistics only.
	totalUncompressedData() uint64

	// Close releases any resources used by this tableIndex.
	Close() error

	// clone returns a |tableIndex| with the same contents which can be
	// |Close|d independently.
	clone() (tableIndex, error)
}
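// Footer layout, as parsed by ReadTableFooter below (a sketch derived from the
// parsing code, not a normative format description): the final bytes of a
// table file are
//
//	chunk count              uint32, big-endian
//	total uncompressed size  uint64, big-endian
//	magic number             magicNumberSize bytes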

func ReadTableFooter(rd io.ReadSeeker) (chunkCount uint32, totalUncompressedData uint64, err error) {
	footerSize := int64(magicNumberSize + uint64Size + uint32Size)
	_, err = rd.Seek(-footerSize, io.SeekEnd)

	if err != nil {
		return 0, 0, err
	}

	footer, err := iohelp.ReadNBytes(rd, int(footerSize))

	if err != nil {
		return 0, 0, err
	}

	if string(footer[uint32Size+uint64Size:]) != magicNumber {
		// Give a nice error message if this is a table file format which we will support in the future.
		possibleDarc := string(footer[len(footer)-doltMagicSize:])
		if possibleDarc == doltMagicNumber {
			return 0, 0, ErrUnsupportedTableFileFormat
		}

		return 0, 0, ErrInvalidTableFile
	}

	chunkCount = binary.BigEndian.Uint32(footer)
	totalUncompressedData = binary.BigEndian.Uint64(footer[uint32Size:])

	return
}

// parseTableIndex parses a valid nbs tableIndex from a byte stream. |buff| must
// end with an NBS index and footer, and its length must match the expected
// indexSize for the chunkCount specified in the footer. The buffer is retained
// and computed on in place; the only new allocation is for the offsets.
func parseTableIndex(ctx context.Context, buff []byte, q MemoryQuotaProvider) (onHeapTableIndex, error) {
	chunkCount, totalUncompressedData, err := ReadTableFooter(bytes.NewReader(buff))
	if err != nil {
		return onHeapTableIndex{}, err
	}

	chunks2 := chunkCount / 2
	chunks1 := chunkCount - chunks2
	offsetsBuff1, err := q.AcquireQuotaBytes(ctx, int(chunks1*offsetSize))
	if err != nil {
		return onHeapTableIndex{}, err
	}
	idx, err := newOnHeapTableIndex(buff, offsetsBuff1, chunkCount, totalUncompressedData, q)
	if err != nil {
		q.ReleaseQuotaBytes(len(offsetsBuff1))
	}
	return idx, err
}

// parseTableIndexWithOffsetBuff is similar to parseTableIndex except that it
// uses the given |offsetsBuff1| instead of allocating the additional space.
func parseTableIndexWithOffsetBuff(buff []byte, offsetsBuff1 []byte, q MemoryQuotaProvider) (onHeapTableIndex, error) {
	chunkCount, totalUncompressedData, err := ReadTableFooter(bytes.NewReader(buff))
	if err != nil {
		return onHeapTableIndex{}, err
	}

	return newOnHeapTableIndex(buff, offsetsBuff1, chunkCount, totalUncompressedData, q)
}

// parseTableIndexByCopy reads the footer, copies indexSize(chunkCount) bytes, and parses an on-heap table index.
// Useful to create an onHeapTableIndex without retaining the entire underlying array of data.
func parseTableIndexByCopy(ctx context.Context, buff []byte, q MemoryQuotaProvider) (onHeapTableIndex, error) {
	return readTableIndexByCopy(ctx, bytes.NewReader(buff), q)
}
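// parseExample is an illustrative sketch (a hypothetical helper, not part of
// the original file): it shows the intended call pattern for parseTableIndex.
// The caller provides the raw table file bytes, which must end with the index
// and footer, plus a MemoryQuotaProvider, and must Close the returned index
// when done with it.
func parseExample(ctx context.Context, tableFileBytes []byte, q MemoryQuotaProvider) (uint32, error) {
	idx, err := parseTableIndex(ctx, tableFileBytes, q)
	if err != nil {
		return 0, err
	}
	defer idx.Close()
	// For example, report how many chunks this table file indexes.
	return idx.chunkCount(), nil
}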

// readTableIndexByCopy loads an index into memory from an io.ReadSeeker.
// Caution: allocates new memory for the entire index.
func readTableIndexByCopy(ctx context.Context, rd io.ReadSeeker, q MemoryQuotaProvider) (onHeapTableIndex, error) {
	chunkCount, totalUncompressedData, err := ReadTableFooter(rd)
	if err != nil {
		return onHeapTableIndex{}, err
	}
	idxSz := int64(indexSize(chunkCount) + footerSize)
	_, err = rd.Seek(-idxSz, io.SeekEnd)
	if err != nil {
		return onHeapTableIndex{}, err
	}

	if int64(int(idxSz)) != idxSz {
		return onHeapTableIndex{}, fmt.Errorf("table file index is too large to read on this platform. index size %d > max int.", idxSz)
	}

	buff, err := q.AcquireQuotaBytes(ctx, int(idxSz))
	if err != nil {
		return onHeapTableIndex{}, err
	}

	_, err = io.ReadFull(rd, buff)
	if err != nil {
		q.ReleaseQuotaBytes(len(buff))
		return onHeapTableIndex{}, err
	}

	chunks1 := chunkCount - (chunkCount / 2)
	offsets1Buff, err := q.AcquireQuotaBytes(ctx, int(chunks1*offsetSize))
	if err != nil {
		q.ReleaseQuotaBytes(len(buff))
		return onHeapTableIndex{}, err
	}

	idx, err := newOnHeapTableIndex(buff, offsets1Buff, chunkCount, totalUncompressedData, q)
	if err != nil {
		q.ReleaseQuotaBytes(len(buff))
		q.ReleaseQuotaBytes(len(offsets1Buff))
	}
	return idx, err
}

type onHeapTableIndex struct {
	// prefixTuples is a packed array of 12 byte tuples:
	// (8 byte addr prefix, 4 byte uint32 ordinal).
	// It is sorted by addr prefix; the ordinal value
	// can be used to look up the offset and addr suffix.
	prefixTuples []byte

	// the offsets arrays contain packed uint64s
	offsets1 []byte
	offsets2 []byte

	// suffixes is an array of 12 byte addr suffixes
	suffixes []byte

	// footer contains the table file footer
	footer []byte

	q      MemoryQuotaProvider
	refCnt *int32

	count          uint32
	tableFileSz    uint64
	uncompressedSz uint64
}

var _ tableIndex = &onHeapTableIndex{}
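// A small worked example of the length-to-offset transform performed by
// newOnHeapTableIndex below (illustrative values only):
//
//	stored chunk lengths:  10, 25, 5
//	cumulative offsets:    10, 35, 40
//
// Once lengths are replaced by cumulative offsets, the record for ordinal |i|
// spans the byte range [offset(i-1), offset(i)) of the table file, with
// offset(-1) taken to be 0; getIndexEntry computes exactly this.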

// newOnHeapTableIndex converts a table file index with stored lengths on
// |indexBuff| into an index with stored offsets. Since offsets are twice the
// size of a length, we need to allocate additional space to store all the
// offsets. It stores the first n - n/2 offsets in |offsetsBuff1| (the
// additional space) and the rest into the region of |indexBuff| previously
// occupied by lengths. |onHeapTableIndex| computes directly on the given
// |indexBuff| and |offsetsBuff1| buffers.
func newOnHeapTableIndex(indexBuff []byte, offsetsBuff1 []byte, count uint32, totalUncompressedData uint64, q MemoryQuotaProvider) (onHeapTableIndex, error) {
	if len(indexBuff) != int(indexSize(count)+footerSize) {
		return onHeapTableIndex{}, ErrWrongBufferSize
	}

	tuples := indexBuff[:prefixTupleSize*count]
	lengths := indexBuff[prefixTupleSize*count : prefixTupleSize*count+lengthSize*count]
	suffixes := indexBuff[prefixTupleSize*count+lengthSize*count : indexSize(count)]
	footer := indexBuff[indexSize(count):]

	chunks2 := count / 2

	r := NewOffsetsReader(bytes.NewReader(lengths))
	_, err := io.ReadFull(r, offsetsBuff1)
	if err != nil {
		return onHeapTableIndex{}, err
	}

	// reuse |lengths| for offsets
	offsetsBuff2 := lengths
	if chunks2 > 0 {
		b := offsetsBuff2[:chunks2*offsetSize]
		if _, err = io.ReadFull(r, b); err != nil {
			return onHeapTableIndex{}, err
		}
	}

	refCnt := new(int32)
	*refCnt = 1

	if TableIndexGCFinalizerWithStackTrace {
		stack := string(debug.Stack())
		runtime.SetFinalizer(refCnt, func(i *int32) {
			panic(fmt.Sprintf("OnHeapTableIndex %x not closed:\n%s", refCnt, stack))
		})
	} else {
		runtime.SetFinalizer(refCnt, func(i *int32) {
			panic(fmt.Sprintf("OnHeapTableIndex %x was not closed", refCnt))
		})
	}

	return onHeapTableIndex{
		refCnt:         refCnt,
		q:              q,
		prefixTuples:   tuples,
		offsets1:       offsetsBuff1,
		offsets2:       offsetsBuff2,
		suffixes:       suffixes,
		footer:         footer,
		count:          count,
		uncompressedSz: totalUncompressedData,
	}, nil
}

func (ti onHeapTableIndex) entrySuffixMatches(idx uint32, h *hash.Hash) (bool, error) {
	ord := ti.ordinalAt(idx)
	o := ord * hash.SuffixLen
	b := ti.suffixes[o : o+hash.SuffixLen]
	return bytes.Equal(h[hash.PrefixLen:], b), nil
}

func (ti onHeapTableIndex) indexEntry(idx uint32, a *hash.Hash) (entry indexEntry, err error) {
	prefix, ord := ti.tupleAt(idx)

	if a != nil {
		binary.BigEndian.PutUint64(a[:], prefix)

		o := int64(hash.SuffixLen * ord)
		b := ti.suffixes[o : o+hash.SuffixLen]
		copy(a[hash.PrefixLen:], b)
	}

	return ti.getIndexEntry(ord), nil
}

func (ti onHeapTableIndex) getIndexEntry(ord uint32) indexEntry {
	var prevOff uint64
	if ord == 0 {
		prevOff = 0
	} else {
		prevOff = ti.offsetAt(ord - 1)
	}
	ordOff := ti.offsetAt(ord)
	length := uint32(ordOff - prevOff)
	return indexResult{
		offset: prevOff,
		length: length,
	}
}

func (ti onHeapTableIndex) lookup(h *hash.Hash) (indexEntry, bool, error) {
	ord, err := ti.lookupOrdinal(h)
	if err != nil {
		return indexResult{}, false, err
	}
	if ord == ti.count {
		return indexResult{}, false, nil
	}
	return ti.getIndexEntry(ord), true, nil
}

// lookupOrdinal returns the ordinal of |h| if present. Returns |ti.count|
// if absent.
func (ti onHeapTableIndex) lookupOrdinal(h *hash.Hash) (uint32, error) {
	prefix := h.Prefix()

	for idx := ti.findPrefix(prefix); idx < ti.count && ti.prefixAt(idx) == prefix; idx++ {
		m, err := ti.entrySuffixMatches(idx, h)
		if err != nil {
			return ti.count, err
		}
		if m {
			return ti.ordinalAt(idx), nil
		}
	}

	return ti.count, nil
}
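// lookupExample is an illustrative sketch (a hypothetical helper, not part of
// the original file): it shows how lookup's three results are meant to be
// consumed. The returned indexEntry carries the byte range of the chunk
// within the table file, populated by getIndexEntry above.
func lookupExample(ti tableIndex, addr hash.Hash) (indexEntry, error) {
	entry, found, err := ti.lookup(&addr)
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, fmt.Errorf("chunk %s is not present in this table file", addr.String())
	}
	return entry, nil
}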

// findPrefix returns the first position in the prefix tuples whose prefix
// value == |prefix|. If |prefix| is absent, it returns the position where it
// would be inserted (which may be |ti.count|).
func (ti onHeapTableIndex) findPrefix(prefix uint64) (idx uint32) {
	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
	// an extremely tight loop and inlining the code was a significant perf improvement.
	idx, j := 0, ti.count
	for idx < j {
		h := idx + (j-idx)/2 // avoid overflow when computing h
		// i ≤ h < j
		o := int64(prefixTupleSize * h)
		tmp := binary.BigEndian.Uint64(ti.prefixTuples[o : o+hash.PrefixLen])
		if tmp < prefix {
			idx = h + 1 // preserves f(i-1) == false
		} else {
			j = h // preserves f(j) == true
		}
	}
	return
}

func (ti onHeapTableIndex) tupleAt(idx uint32) (prefix uint64, ord uint32) {
	off := int64(prefixTupleSize * idx)
	b := ti.prefixTuples[off : off+prefixTupleSize]

	prefix = binary.BigEndian.Uint64(b[:])
	ord = binary.BigEndian.Uint32(b[hash.PrefixLen:])
	return prefix, ord
}

func (ti onHeapTableIndex) prefixAt(idx uint32) uint64 {
	off := int64(prefixTupleSize * idx)
	b := ti.prefixTuples[off : off+hash.PrefixLen]
	return binary.BigEndian.Uint64(b)
}

func (ti onHeapTableIndex) ordinalAt(idx uint32) uint32 {
	off := int64(prefixTupleSize*idx) + hash.PrefixLen
	b := ti.prefixTuples[off : off+ordinalSize]
	return binary.BigEndian.Uint32(b)
}

// the first n - n/2 offsets are stored in |offsets1| and the rest in |offsets2|
func (ti onHeapTableIndex) offsetAt(ord uint32) uint64 {
	chunks1 := ti.count - ti.count/2
	var b []byte
	if ord < chunks1 {
		off := int64(offsetSize * ord)
		b = ti.offsets1[off : off+offsetSize]
	} else {
		off := int64(offsetSize * (ord - chunks1))
		b = ti.offsets2[off : off+offsetSize]
	}
	return binary.BigEndian.Uint64(b)
}

func (ti onHeapTableIndex) ordinals() ([]uint32, error) {
	// todo: |o| is not accounted for in the memory quota
	o := make([]uint32, ti.count)
	for i, off := uint32(0), 0; i < ti.count; i, off = i+1, off+prefixTupleSize {
		b := ti.prefixTuples[off+hash.PrefixLen : off+prefixTupleSize]
		o[i] = binary.BigEndian.Uint32(b)
	}
	return o, nil
}

func (ti onHeapTableIndex) prefixes() ([]uint64, error) {
	// todo: |p| is not accounted for in the memory quota
	p := make([]uint64, ti.count)
	for i, off := uint32(0), 0; i < ti.count; i, off = i+1, off+prefixTupleSize {
		b := ti.prefixTuples[off : off+hash.PrefixLen]
		p[i] = binary.BigEndian.Uint64(b)
	}
	return p, nil
}

func (ti onHeapTableIndex) hashAt(idx uint32) hash.Hash {
	// Get tuple
	off := int64(prefixTupleSize * idx)
	tuple := ti.prefixTuples[off : off+prefixTupleSize]

	// Get prefix, ordinal, and suffix
	prefix := tuple[:hash.PrefixLen]
	ord := binary.BigEndian.Uint32(tuple[hash.PrefixLen:]) * hash.SuffixLen
	suffix := ti.suffixes[ord : ord+hash.SuffixLen] // suffix is 12 bytes

	// Combine prefix and suffix to get hash
	buf := [hash.ByteLen]byte{}
	copy(buf[:hash.PrefixLen], prefix)
	copy(buf[hash.PrefixLen:], suffix)

	return buf
}
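// Address layout assumed by hashAt above and by the prefix/suffix slicing
// throughout this file: a hash.ByteLen (20 byte) address splits into
//
//	bytes [0, hash.PrefixLen)             8-byte big-endian prefix, kept in the sorted prefix tuples
//	bytes [hash.PrefixLen, hash.ByteLen)  12-byte suffix, kept in the |suffixes| region
//
// hashAt recombines the two halves to reconstruct the full address.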

// prefixIdxLBound returns the first position in the prefix tuples whose prefix
// value is >= |prefix|. If |prefix| is not present, this is the position where
// it would be inserted.
func (ti onHeapTableIndex) prefixIdxLBound(prefix uint64) uint32 {
	l, r := uint32(0), ti.count
	for l < r {
		m := l + (r-l)/2 // find middle, rounding down
		if ti.prefixAt(m) < prefix {
			l = m + 1
		} else {
			r = m
		}
	}

	return l
}

// prefixIdxUBound returns the position of the last prefix tuple whose prefix
// value is <= |prefix| (or |ti.count| when every prefix qualifies). It is used
// by ResolveShortHash as the upper bound of the candidate range.
func (ti onHeapTableIndex) prefixIdxUBound(prefix uint64) (idx uint32) {
	l, r := uint32(0), ti.count
	for l < r {
		m := l + (r-l+1)/2 // find middle, rounding up
		if m >= ti.count { // prevent index out of bounds
			return r
		}
		pre := ti.prefixAt(m)
		if pre <= prefix {
			l = m
		} else {
			r = m - 1
		}
	}

	return l
}

// padStringAndDecode pads |s| out to a full hash string using the single
// character |p| and decodes the first 8 bytes of the result as a big-endian
// uint64 prefix. Padding with "0" pads on the right; any other character pads
// on the left.
func (ti onHeapTableIndex) padStringAndDecode(s string, p string) uint64 {
	if len(p) != 1 {
		panic("pad string must be of length 1") // This is a programmer error that should never get out of PR.
	}

	for len(s) < hash.StringLen {
		if p == "0" {
			s = s + p // Pad on the right side.
		} else {
			s = p + s // Pad on the left side.
		}
	}

	// Decode
	h := hash.Parse(s)
	return binary.BigEndian.Uint64(h[:])
}

func (ti onHeapTableIndex) chunkCount() uint32 {
	return ti.count
}

// tableFileSize returns the size of the table file that this index references.
// This assumes that the index follows immediately after the last chunk in the
// file and that the last chunk in the file is in the index.
func (ti onHeapTableIndex) tableFileSize() (sz uint64) {
	sz = footerSize
	if ti.count > 0 {
		last := ti.getIndexEntry(ti.count - 1)
		sz += last.Offset()
		sz += uint64(last.Length())
		sz += indexSize(ti.count)
	}
	return
}

func (ti onHeapTableIndex) totalUncompressedData() uint64 {
	return ti.uncompressedSz
}

func (ti onHeapTableIndex) Close() error {
	cnt := atomic.AddInt32(ti.refCnt, -1)
	if cnt < 0 {
		panic("Close() called and reduced ref count to < 0.")
	} else if cnt > 0 {
		return nil
	}

	runtime.SetFinalizer(ti.refCnt, nil)
	ti.q.ReleaseQuotaBytes(len(ti.prefixTuples) + len(ti.offsets1) + len(ti.offsets2) + len(ti.suffixes) + len(ti.footer))
	return nil
}
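// Lifetime note: the index is reference counted. clone() increments |refCnt|
// and Close() decrements it; the memory quota is released only when the count
// reaches zero, and the finalizer registered in newOnHeapTableIndex panics if
// an index is garbage collected while still open. Every clone therefore needs
// its own matching Close.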

func (ti onHeapTableIndex) clone() (tableIndex, error) {
	cnt := atomic.AddInt32(ti.refCnt, 1)
	if cnt == 1 {
		panic("Clone() called after last Close(). This index is no longer valid.")
	}
	return ti, nil
}

func (ti onHeapTableIndex) ResolveShortHash(short []byte) ([]string, error) {
	// Convert to string
	shortHash := string(short)

	// Calculate length
	sLen := len(shortHash)

	// Find lower and upper bounds of prefix indexes to check
	var pIdxL, pIdxU uint32
	if sLen >= 13 {
		// Convert short string to prefix
		sPrefix := ti.padStringAndDecode(shortHash, "0")

		// Binary search for prefix
		pIdxL = ti.findPrefix(sPrefix)

		// Prefix doesn't exist
		if pIdxL == ti.count {
			return []string{}, errors.New("can't find prefix")
		}

		// Find last equal prefix, guarding against running off the end of the
		// prefix tuples.
		pIdxU = pIdxL + 1
		for pIdxU < ti.count && sPrefix == ti.prefixAt(pIdxU) {
			pIdxU++
		}
	} else {
		// Convert short string to lower and upper bounds
		sPrefixL := ti.padStringAndDecode(shortHash, "0")
		sPrefixU := ti.padStringAndDecode(shortHash, "v")

		// Binary search for lower and upper bounds
		pIdxL = ti.prefixIdxLBound(sPrefixL)
		pIdxU = ti.prefixIdxUBound(sPrefixU)
	}

	// Go through all equal prefixes
	var res []string
	for i := pIdxL; i < pIdxU; i++ {
		// Get full hash at index
		h := ti.hashAt(i)

		// Convert to string representation
		hashStr := h.String()

		// If it matches append to result
		if hashStr[:sLen] == shortHash {
			res = append(res, hashStr)
		}
	}

	return res, nil
}
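// resolveAndLookupExample is an illustrative sketch (a hypothetical helper,
// not part of the original file): it expands an abbreviated address with
// ResolveShortHash and, when exactly one candidate matches, looks up that
// chunk's byte range with lookup.
func resolveAndLookupExample(ti onHeapTableIndex, short string) (indexEntry, bool, error) {
	candidates, err := ti.ResolveShortHash([]byte(short))
	if err != nil || len(candidates) != 1 {
		return nil, false, err
	}
	full := hash.Parse(candidates[0])
	return ti.lookup(&full)
}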