github.com/ledgerwatch/erigon-lib@v1.0.0/state/btree_index.go (about) 1 package state 2 3 import ( 4 "bufio" 5 "bytes" 6 "context" 7 "encoding/binary" 8 "errors" 9 "fmt" 10 "math" 11 "math/bits" 12 "os" 13 "path" 14 "path/filepath" 15 "time" 16 17 "github.com/c2h5oh/datasize" 18 "github.com/edsrzf/mmap-go" 19 "github.com/ledgerwatch/erigon-lib/common/dbg" 20 "github.com/ledgerwatch/log/v3" 21 22 "github.com/ledgerwatch/erigon-lib/common/background" 23 24 "github.com/ledgerwatch/erigon-lib/common" 25 "github.com/ledgerwatch/erigon-lib/common/length" 26 "github.com/ledgerwatch/erigon-lib/compress" 27 "github.com/ledgerwatch/erigon-lib/etl" 28 ) 29 30 func logBase(n, base uint64) uint64 { 31 return uint64(math.Ceil(math.Log(float64(n)) / math.Log(float64(base)))) 32 } 33 34 func min64(a, b uint64) uint64 { 35 if a < b { 36 return a 37 } 38 return b 39 } 40 41 type markupCursor struct { 42 l uint64 //l - level 43 p uint64 //p - pos inside level 44 di uint64 //di - data array index 45 si uint64 //si - current, actual son index 46 } 47 48 type node struct { 49 p uint64 // pos inside level 50 d uint64 51 s uint64 // sons pos inside level 52 fc uint64 53 key []byte 54 val []byte 55 } 56 57 type Cursor struct { 58 ctx context.Context 59 ix *btAlloc 60 61 key []byte 62 value []byte 63 d uint64 64 } 65 66 func (a *btAlloc) newCursor(ctx context.Context, k, v []byte, d uint64) *Cursor { 67 return &Cursor{ 68 ctx: ctx, 69 key: common.Copy(k), 70 value: common.Copy(v), 71 d: d, 72 ix: a, 73 } 74 } 75 76 func (c *Cursor) Key() []byte { 77 return c.key 78 } 79 80 func (c *Cursor) Ordinal() uint64 { 81 return c.d 82 } 83 84 func (c *Cursor) Value() []byte { 85 return c.value 86 } 87 88 func (c *Cursor) Next() bool { 89 if c.d > c.ix.K-1 { 90 return false 91 } 92 k, v, err := c.ix.dataLookup(c.d + 1) 93 if err != nil { 94 return false 95 } 96 c.key = common.Copy(k) 97 c.value = common.Copy(v) 98 c.d++ 99 return true 100 } 101 102 type btAlloc struct { 103 d uint64 // depth 104 M uint64 // child limit of any node 105 N uint64 106 K uint64 107 vx []uint64 // vertex count on level 108 sons [][]uint64 // i - level; 0 <= i < d; j_k - amount, j_k+1 - child count 109 cursors []markupCursor 110 nodes [][]node 111 naccess uint64 112 trace bool 113 114 dataLookup func(di uint64) ([]byte, []byte, error) 115 } 116 117 func newBtAlloc(k, M uint64, trace bool) *btAlloc { 118 if k == 0 { 119 return nil 120 } 121 122 d := logBase(k, M) 123 a := &btAlloc{ 124 vx: make([]uint64, d+1), 125 sons: make([][]uint64, d+1), 126 cursors: make([]markupCursor, d), 127 nodes: make([][]node, d), 128 M: M, 129 K: k, 130 d: d, 131 trace: trace, 132 } 133 if trace { 134 fmt.Printf("k=%d d=%d, M=%d\n", k, d, M) 135 } 136 a.vx[0], a.vx[d] = 1, k 137 138 if k < M/2 { 139 a.N = k 140 a.nodes = make([][]node, 1) 141 return a 142 } 143 144 //nnc := func(vx uint64) uint64 { 145 // return uint64(math.Ceil(float64(vx) / float64(M))) 146 //} 147 nvc := func(vx uint64) uint64 { 148 return uint64(math.Ceil(float64(vx) / float64(M>>1))) 149 } 150 151 for i := a.d - 1; i > 0; i-- { 152 nnc := uint64(math.Ceil(float64(a.vx[i+1]) / float64(M))) 153 //nvc := uint64(math.Floor(float64(a.vx[i+1]) / float64(m))-1) 154 //nnc := a.vx[i+1] / M 155 //nvc := a.vx[i+1] / m 156 //bvc := a.vx[i+1] / (m + (m >> 1)) 157 a.vx[i] = min64(uint64(math.Pow(float64(M), float64(i))), nnc) 158 } 159 160 ncount := uint64(0) 161 pnv := uint64(0) 162 for l := a.d - 1; l > 0; l-- { 163 //s := nnc(a.vx[l+1]) 164 sh := nvc(a.vx[l+1]) 165 166 if sh&1 == 1 { 167 a.sons[l] = append(a.sons[l], sh>>1, M, 1, M>>1) 168 } else { 169 a.sons[l] = append(a.sons[l], sh>>1, M) 170 } 171 172 for ik := 0; ik < len(a.sons[l]); ik += 2 { 173 ncount += a.sons[l][ik] * a.sons[l][ik+1] 174 if l == 1 { 175 pnv += a.sons[l][ik] 176 } 177 } 178 } 179 a.sons[0] = []uint64{1, pnv} 180 ncount += a.sons[0][0] * a.sons[0][1] // last one 181 a.N = ncount 182 183 if trace { 184 for i, v := range a.sons { 185 fmt.Printf("L%d=%v\n", i, v) 186 } 187 } 188 189 return a 190 } 191 192 // nolint 193 // another implementation of traverseDfs supposed to be a bit cleaner but buggy yet 194 func (a *btAlloc) traverseTrick() { 195 for l := 0; l < len(a.sons)-1; l++ { 196 if len(a.sons[l]) < 2 { 197 panic("invalid btree allocation markup") 198 } 199 a.cursors[l] = markupCursor{uint64(l), 1, 0, 0} 200 a.nodes[l] = make([]node, 0) 201 } 202 203 lf := a.cursors[len(a.cursors)-1] 204 c := a.cursors[(len(a.cursors) - 2)] 205 206 var d uint64 207 var fin bool 208 209 lf.di = d 210 lf.si++ 211 d++ 212 a.cursors[len(a.cursors)-1] = lf 213 214 moved := true 215 for int(c.p) <= len(a.sons[c.l]) { 216 if fin || d > a.K { 217 break 218 } 219 c, lf = a.cursors[c.l], a.cursors[lf.l] 220 221 c.di = d 222 c.si++ 223 224 sons := a.sons[lf.l][lf.p] 225 for i := uint64(1); i < sons; i++ { 226 lf.si++ 227 d++ 228 } 229 lf.di = d 230 d++ 231 232 a.nodes[lf.l] = append(a.nodes[lf.l], node{p: lf.p, s: lf.si, d: lf.di}) 233 a.nodes[c.l] = append(a.nodes[c.l], node{p: c.p, s: c.si, d: c.di}) 234 a.cursors[lf.l] = lf 235 a.cursors[c.l] = c 236 237 for l := lf.l; l >= 0; l-- { 238 sc := a.cursors[l] 239 sons, gsons := a.sons[sc.l][sc.p-1], a.sons[sc.l][sc.p] 240 if l < c.l && moved { 241 sc.di = d 242 a.nodes[sc.l] = append(a.nodes[sc.l], node{d: sc.di}) 243 sc.si++ 244 d++ 245 } 246 moved = (sc.si-1)/gsons != sc.si/gsons 247 if sc.si/gsons >= sons { 248 sz := uint64(len(a.sons[sc.l]) - 1) 249 if sc.p+2 > sz { 250 fin = l == lf.l 251 break 252 } else { 253 sc.p += 2 254 sc.si, sc.di = 0, 0 255 } 256 //moved = true 257 } 258 if l == lf.l { 259 sc.si++ 260 sc.di = d 261 d++ 262 } 263 a.cursors[l] = sc 264 if l == 0 { 265 break 266 } 267 } 268 moved = false 269 } 270 } 271 272 func (a *btAlloc) traverseDfs() { 273 for l := 0; l < len(a.sons)-1; l++ { 274 a.cursors[l] = markupCursor{uint64(l), 1, 0, 0} 275 a.nodes[l] = make([]node, 0) 276 } 277 278 if len(a.cursors) <= 1 { 279 if a.nodes[0] == nil { 280 a.nodes[0] = make([]node, 0) 281 } 282 a.nodes[0] = append(a.nodes[0], node{d: a.K}) 283 a.N = a.K 284 if a.trace { 285 fmt.Printf("ncount=%d ∂%.5f\n", a.N, float64(a.N-a.K)/float64(a.N)) 286 } 287 return 288 } 289 290 c := a.cursors[len(a.cursors)-1] 291 pc := a.cursors[(len(a.cursors) - 2)] 292 root := new(node) 293 trace := false 294 295 var di uint64 296 for stop := false; !stop; { 297 // fill leaves, mark parent if needed (until all grandparents not marked up until root) 298 // check if eldest parent has brothers 299 // -- has bros -> fill their leaves from the bottom 300 // -- no bros -> shift cursor (tricky) 301 if di > a.K { 302 a.N = di - 1 // actually filled node count 303 if a.trace { 304 fmt.Printf("ncount=%d ∂%.5f\n", a.N, float64(a.N-a.K)/float64(a.N)) 305 } 306 break 307 } 308 309 bros, parents := a.sons[c.l][c.p], a.sons[c.l][c.p-1] 310 for i := uint64(0); i < bros; i++ { 311 c.di = di 312 if trace { 313 fmt.Printf("L%d |%d| d %2d s %2d\n", c.l, c.p, c.di, c.si) 314 } 315 c.si++ 316 di++ 317 318 if i == 0 { 319 pc.di = di 320 if trace { 321 fmt.Printf("P%d |%d| d %2d s %2d\n", pc.l, pc.p, pc.di, pc.si) 322 } 323 pc.si++ 324 di++ 325 } 326 if di > a.K { 327 a.N = di - 1 // actually filled node count 328 stop = true 329 break 330 } 331 } 332 333 a.nodes[c.l] = append(a.nodes[c.l], node{p: c.p, d: c.di, s: c.si}) 334 a.nodes[pc.l] = append(a.nodes[pc.l], node{p: pc.p, d: pc.di, s: pc.si, fc: uint64(len(a.nodes[c.l]) - 1)}) 335 336 pid := c.si / bros 337 if pid >= parents { 338 if c.p+2 >= uint64(len(a.sons[c.l])) { 339 stop = true // end of row 340 if trace { 341 fmt.Printf("F%d |%d| d %2d\n", c.l, c.p, c.di) 342 } 343 } else { 344 c.p += 2 345 c.si = 0 346 c.di = 0 347 } 348 } 349 a.cursors[c.l] = c 350 a.cursors[pc.l] = pc 351 352 //nolint 353 for l := pc.l; l >= 0; l-- { 354 pc := a.cursors[l] 355 uncles := a.sons[pc.l][pc.p] 356 grands := a.sons[pc.l][pc.p-1] 357 358 pi1 := pc.si / uncles 359 pc.si++ 360 pc.di = 0 361 362 pi2 := pc.si / uncles 363 moved := pi2-pi1 != 0 364 365 switch { 366 case pc.l > 0: 367 gp := a.cursors[pc.l-1] 368 if gp.di == 0 { 369 gp.di = di 370 di++ 371 if trace { 372 fmt.Printf("P%d |%d| d %2d s %2d\n", gp.l, gp.p, gp.di, gp.si) 373 } 374 a.nodes[gp.l] = append(a.nodes[gp.l], node{p: gp.p, d: gp.di, s: gp.si, fc: uint64(len(a.nodes[l]) - 1)}) 375 a.cursors[gp.l] = gp 376 } 377 default: 378 if root.d == 0 { 379 root.d = di 380 //di++ 381 if trace { 382 fmt.Printf("ROOT | d %2d\n", root.d) 383 } 384 } 385 } 386 387 //fmt.Printf("P%d |%d| d %2d s %2d pid %d\n", pc.l, pc.p, pc.di, pc.si-1) 388 if pi2 >= grands { // skip one step of si due to different parental filling order 389 if pc.p+2 >= uint64(len(a.sons[pc.l])) { 390 if trace { 391 fmt.Printf("EoRow %d |%d|\n", pc.l, pc.p) 392 } 393 break // end of row 394 } 395 //fmt.Printf("N %d d%d s%d\n", pc.l, pc.di, pc.si) 396 //fmt.Printf("P%d |%d| d %2d s %2d pid %d\n", pc.l, pc.p, pc.di, pc.si, pid) 397 pc.p += 2 398 pc.si = 0 399 pc.di = 0 400 } 401 a.cursors[pc.l] = pc 402 403 if !moved { 404 break 405 } 406 } 407 } 408 409 if a.trace { 410 fmt.Printf("ncount=%d ∂%.5f\n", a.N, float64(a.N-a.K)/float64(a.N)) 411 } 412 } 413 414 func (a *btAlloc) bsKey(x []byte, l, r uint64) (*Cursor, error) { 415 for l <= r { 416 di := (l + r) >> 1 417 418 mk, value, err := a.dataLookup(di) 419 a.naccess++ 420 421 cmp := bytes.Compare(mk, x) 422 switch { 423 case err != nil: 424 if errors.Is(err, ErrBtIndexLookupBounds) { 425 return nil, nil 426 } 427 return nil, err 428 case cmp == 0: 429 return a.newCursor(context.TODO(), mk, value, di), nil 430 case cmp == -1: 431 l = di + 1 432 default: 433 r = di 434 } 435 if l == r { 436 break 437 } 438 } 439 k, v, err := a.dataLookup(l) 440 if err != nil { 441 if errors.Is(err, ErrBtIndexLookupBounds) { 442 return nil, nil 443 } 444 return nil, fmt.Errorf("key >= %x was not found. %w", x, err) 445 } 446 return a.newCursor(context.TODO(), k, v, l), nil 447 } 448 449 func (a *btAlloc) bsNode(i, l, r uint64, x []byte) (n node, lm int64, rm int64) { 450 lm, rm = -1, -1 451 var m uint64 452 453 for l < r { 454 m = (l + r) >> 1 455 456 a.naccess++ 457 cmp := bytes.Compare(a.nodes[i][m].key, x) 458 switch { 459 case cmp == 0: 460 return a.nodes[i][m], int64(m), int64(m) 461 case cmp > 0: 462 r = m 463 rm = int64(m) 464 case cmp < 0: 465 lm = int64(m) 466 l = m + 1 467 default: 468 panic(fmt.Errorf("compare error %d, %x ? %x", cmp, n.key, x)) 469 } 470 } 471 return a.nodes[i][m], lm, rm 472 } 473 474 // find position of key with node.di <= d at level lvl 475 func (a *btAlloc) seekLeast(lvl, d uint64) uint64 { 476 for i := range a.nodes[lvl] { 477 if a.nodes[lvl][i].d >= d { 478 return uint64(i) 479 } 480 } 481 return uint64(len(a.nodes[lvl])) 482 } 483 484 func (a *btAlloc) Seek(ik []byte) (*Cursor, error) { 485 if a.trace { 486 fmt.Printf("seek key %x\n", ik) 487 } 488 489 var ( 490 lm, rm int64 491 L, R = uint64(0), uint64(len(a.nodes[0]) - 1) 492 minD, maxD = uint64(0), a.K 493 ln node 494 ) 495 496 for l, level := range a.nodes { 497 if len(level) == 1 && l == 0 { 498 ln = a.nodes[0][0] 499 maxD = ln.d 500 break 501 } 502 ln, lm, rm = a.bsNode(uint64(l), L, R, ik) 503 if ln.key == nil { // should return node which is nearest to key from the left so never nil 504 if a.trace { 505 fmt.Printf("found nil key %x pos_range[%d-%d] naccess_ram=%d\n", l, lm, rm, a.naccess) 506 } 507 return nil, fmt.Errorf("bt index nil node at level %d", l) 508 } 509 510 switch bytes.Compare(ln.key, ik) { 511 case 1: // key > ik 512 maxD = ln.d 513 case -1: // key < ik 514 minD = ln.d 515 case 0: 516 if a.trace { 517 fmt.Printf("found key %x v=%x naccess_ram=%d\n", ik, ln.val /*level[m].d,*/, a.naccess) 518 } 519 return a.newCursor(context.TODO(), common.Copy(ln.key), common.Copy(ln.val), ln.d), nil 520 } 521 522 if rm-lm >= 1 { 523 break 524 } 525 if lm >= 0 { 526 minD = a.nodes[l][lm].d 527 L = level[lm].fc 528 } else if l+1 != len(a.nodes) { 529 L = a.seekLeast(uint64(l+1), minD) 530 if L == uint64(len(a.nodes[l+1])) { 531 L-- 532 } 533 } 534 if rm >= 0 { 535 maxD = a.nodes[l][rm].d 536 R = level[rm].fc 537 } else if l+1 != len(a.nodes) { 538 R = a.seekLeast(uint64(l+1), maxD) 539 if R == uint64(len(a.nodes[l+1])) { 540 R-- 541 } 542 } 543 544 if a.trace { 545 fmt.Printf("range={%x d=%d p=%d} (%d, %d) L=%d naccess_ram=%d\n", ln.key, ln.d, ln.p, minD, maxD, l, a.naccess) 546 } 547 } 548 549 a.naccess = 0 // reset count before actually go to disk 550 cursor, err := a.bsKey(ik, minD, maxD) 551 if err != nil { 552 if a.trace { 553 fmt.Printf("key %x not found\n", ik) 554 } 555 return nil, err 556 } 557 558 if a.trace { 559 fmt.Printf("finally found key %x v=%x naccess_disk=%d\n", cursor.key, cursor.value, a.naccess) 560 } 561 return cursor, nil 562 } 563 564 func (a *btAlloc) fillSearchMx() { 565 for i, n := range a.nodes { 566 if a.trace { 567 fmt.Printf("D%d |%d| ", i, len(n)) 568 } 569 for j, s := range n { 570 if a.trace { 571 fmt.Printf("%d ", s.d) 572 } 573 if s.d >= a.K { 574 break 575 } 576 577 kb, v, err := a.dataLookup(s.d) 578 if err != nil { 579 fmt.Printf("d %d not found %v\n", s.d, err) 580 } 581 a.nodes[i][j].key = common.Copy(kb) 582 a.nodes[i][j].val = common.Copy(v) 583 } 584 if a.trace { 585 fmt.Printf("\n") 586 } 587 } 588 } 589 590 // deprecated 591 type BtIndexReader struct { 592 index *BtIndex 593 } 594 595 func NewBtIndexReader(index *BtIndex) *BtIndexReader { 596 return &BtIndexReader{ 597 index: index, 598 } 599 } 600 601 // Lookup wraps index Lookup 602 func (r *BtIndexReader) Lookup(key []byte) uint64 { 603 if r.index != nil { 604 return r.index.Lookup(key) 605 } 606 return 0 607 } 608 609 func (r *BtIndexReader) Lookup2(key1, key2 []byte) uint64 { 610 fk := make([]byte, 52) 611 copy(fk[:length.Addr], key1) 612 copy(fk[length.Addr:], key2) 613 614 if r.index != nil { 615 return r.index.Lookup(fk) 616 } 617 return 0 618 } 619 620 func (r *BtIndexReader) Seek(x []byte) (*Cursor, error) { 621 if r.index != nil { 622 cursor, err := r.index.alloc.Seek(x) 623 if err != nil { 624 return nil, fmt.Errorf("seek key %x: %w", x, err) 625 } 626 return cursor, nil 627 } 628 return nil, fmt.Errorf("seek has been failed") 629 } 630 631 func (r *BtIndexReader) Empty() bool { 632 return r.index.Empty() 633 } 634 635 type BtIndexWriter struct { 636 built bool 637 lvl log.Lvl 638 maxOffset uint64 639 prevOffset uint64 640 minDelta uint64 641 indexW *bufio.Writer 642 indexF *os.File 643 bucketCollector *etl.Collector // Collector that sorts by buckets 644 645 indexFileName string 646 indexFile, tmpFilePath string 647 648 tmpDir string 649 numBuf [8]byte 650 keyCount uint64 651 etlBufLimit datasize.ByteSize 652 bytesPerRec int 653 logger log.Logger 654 noFsync bool // fsync is enabled by default, but tests can manually disable 655 } 656 657 type BtIndexWriterArgs struct { 658 IndexFile string // File name where the index and the minimal perfect hash function will be written to 659 TmpDir string 660 KeyCount int 661 EtlBufLimit datasize.ByteSize 662 } 663 664 const BtreeLogPrefix = "btree" 665 666 // NewBtIndexWriter creates a new BtIndexWriter instance with given number of keys 667 // Typical bucket size is 100 - 2048, larger bucket sizes result in smaller representations of hash functions, at a cost of slower access 668 // salt parameters is used to randomise the hash function construction, to ensure that different Erigon instances (nodes) 669 // are likely to use different hash function, to collision attacks are unlikely to slow down any meaningful number of nodes at the same time 670 func NewBtIndexWriter(args BtIndexWriterArgs, logger log.Logger) (*BtIndexWriter, error) { 671 btw := &BtIndexWriter{lvl: log.LvlDebug, logger: logger} 672 btw.tmpDir = args.TmpDir 673 btw.indexFile = args.IndexFile 674 btw.tmpFilePath = args.IndexFile + ".tmp" 675 676 _, fname := filepath.Split(btw.indexFile) 677 btw.indexFileName = fname 678 btw.etlBufLimit = args.EtlBufLimit 679 if btw.etlBufLimit == 0 { 680 btw.etlBufLimit = etl.BufferOptimalSize 681 } 682 683 btw.bucketCollector = etl.NewCollector(BtreeLogPrefix+" "+fname, btw.tmpDir, etl.NewSortableBuffer(btw.etlBufLimit), logger) 684 btw.bucketCollector.LogLvl(log.LvlDebug) 685 686 btw.maxOffset = 0 687 return btw, nil 688 } 689 690 // loadFuncBucket is required to satisfy the type etl.LoadFunc type, to use with collector.Load 691 func (btw *BtIndexWriter) loadFuncBucket(k, v []byte, _ etl.CurrentTableReader, _ etl.LoadNextFunc) error { 692 // k is the BigEndian encoding of the bucket number, and the v is the key that is assigned into that bucket 693 //if uint64(len(btw.vals)) >= btw.batchSizeLimit { 694 // if err := btw.drainBatch(); err != nil { 695 // return err 696 // } 697 //} 698 699 // if _, err := btw.indexW.Write(k); err != nil { 700 // return err 701 // } 702 if _, err := btw.indexW.Write(v[8-btw.bytesPerRec:]); err != nil { 703 return err 704 } 705 706 //btw.keys = append(btw.keys, binary.BigEndian.Uint64(k), binary.BigEndian.Uint64(k[8:])) 707 //btw.vals = append(btw.vals, binary.BigEndian.Uint64(v)) 708 return nil 709 } 710 711 // Build has to be called after all the keys have been added, and it initiates the process 712 // of building the perfect hash function and writing index into a file 713 func (btw *BtIndexWriter) Build() error { 714 if btw.built { 715 return fmt.Errorf("already built") 716 } 717 //if btw.keysAdded != btw.keyCount { 718 // return fmt.Errorf("expected keys %d, got %d", btw.keyCount, btw.keysAdded) 719 //} 720 var err error 721 if btw.indexF, err = os.Create(btw.tmpFilePath); err != nil { 722 return fmt.Errorf("create index file %s: %w", btw.indexFile, err) 723 } 724 defer btw.indexF.Close() 725 btw.indexW = bufio.NewWriterSize(btw.indexF, etl.BufIOSize) 726 727 // Write number of keys 728 binary.BigEndian.PutUint64(btw.numBuf[:], btw.keyCount) 729 if _, err = btw.indexW.Write(btw.numBuf[:]); err != nil { 730 return fmt.Errorf("write number of keys: %w", err) 731 } 732 // Write number of bytes per index record 733 btw.bytesPerRec = common.BitLenToByteLen(bits.Len64(btw.maxOffset)) 734 if err = btw.indexW.WriteByte(byte(btw.bytesPerRec)); err != nil { 735 return fmt.Errorf("write bytes per record: %w", err) 736 } 737 738 defer btw.bucketCollector.Close() 739 log.Log(btw.lvl, "[index] calculating", "file", btw.indexFileName) 740 if err := btw.bucketCollector.Load(nil, "", btw.loadFuncBucket, etl.TransformArgs{}); err != nil { 741 return err 742 } 743 744 btw.logger.Log(btw.lvl, "[index] write", "file", btw.indexFileName) 745 btw.built = true 746 747 if err = btw.indexW.Flush(); err != nil { 748 return err 749 } 750 if err = btw.fsync(); err != nil { 751 return err 752 } 753 if err = btw.indexF.Close(); err != nil { 754 return err 755 } 756 if err = os.Rename(btw.tmpFilePath, btw.indexFile); err != nil { 757 return err 758 } 759 return nil 760 } 761 762 func (btw *BtIndexWriter) DisableFsync() { btw.noFsync = true } 763 764 // fsync - other processes/goroutines must see only "fully-complete" (valid) files. No partial-writes. 765 // To achieve it: write to .tmp file then `rename` when file is ready. 766 // Machine may power-off right after `rename` - it means `fsync` must be before `rename` 767 func (btw *BtIndexWriter) fsync() error { 768 if btw.noFsync { 769 return nil 770 } 771 if err := btw.indexF.Sync(); err != nil { 772 btw.logger.Warn("couldn't fsync", "err", err, "file", btw.tmpFilePath) 773 return err 774 } 775 return nil 776 } 777 778 func (btw *BtIndexWriter) Close() { 779 if btw.indexF != nil { 780 btw.indexF.Close() 781 } 782 if btw.bucketCollector != nil { 783 btw.bucketCollector.Close() 784 } 785 //if btw.offsetCollector != nil { 786 // btw.offsetCollector.Close() 787 //} 788 } 789 790 func (btw *BtIndexWriter) AddKey(key []byte, offset uint64) error { 791 if btw.built { 792 return fmt.Errorf("cannot add keys after perfect hash function had been built") 793 } 794 795 binary.BigEndian.PutUint64(btw.numBuf[:], offset) 796 if offset > btw.maxOffset { 797 btw.maxOffset = offset 798 } 799 if btw.keyCount > 0 { 800 delta := offset - btw.prevOffset 801 if btw.keyCount == 1 || delta < btw.minDelta { 802 btw.minDelta = delta 803 } 804 } 805 806 if err := btw.bucketCollector.Collect(key, btw.numBuf[:]); err != nil { 807 return err 808 } 809 btw.keyCount++ 810 btw.prevOffset = offset 811 return nil 812 } 813 814 type BtIndex struct { 815 alloc *btAlloc 816 m mmap.MMap 817 data []byte 818 file *os.File 819 size int64 820 modTime time.Time 821 filePath string 822 keyCount uint64 823 bytesPerRec int 824 dataoffset uint64 825 auxBuf []byte 826 decompressor *compress.Decompressor 827 getter *compress.Getter 828 } 829 830 func CreateBtreeIndex(indexPath, dataPath string, M uint64, logger log.Logger) (*BtIndex, error) { 831 err := BuildBtreeIndex(dataPath, indexPath, logger) 832 if err != nil { 833 return nil, err 834 } 835 return OpenBtreeIndex(indexPath, dataPath, M) 836 } 837 838 var DefaultBtreeM = uint64(2048) 839 840 func CreateBtreeIndexWithDecompressor(indexPath string, M uint64, decompressor *compress.Decompressor, p *background.Progress, tmpdir string, logger log.Logger) (*BtIndex, error) { 841 err := BuildBtreeIndexWithDecompressor(indexPath, decompressor, p, tmpdir, logger) 842 if err != nil { 843 return nil, err 844 } 845 return OpenBtreeIndexWithDecompressor(indexPath, M, decompressor) 846 } 847 848 func BuildBtreeIndexWithDecompressor(indexPath string, kv *compress.Decompressor, p *background.Progress, tmpdir string, logger log.Logger) error { 849 defer kv.EnableReadAhead().DisableReadAhead() 850 851 args := BtIndexWriterArgs{ 852 IndexFile: indexPath, 853 TmpDir: tmpdir, 854 } 855 856 iw, err := NewBtIndexWriter(args, logger) 857 if err != nil { 858 return err 859 } 860 861 getter := kv.MakeGetter() 862 getter.Reset(0) 863 864 key := make([]byte, 0, 64) 865 ks := make(map[int]int) 866 867 var pos, kp uint64 868 emptys := 0 869 for getter.HasNext() { 870 p.Processed.Add(1) 871 key, kp = getter.Next(key[:0]) 872 err = iw.AddKey(key, pos) 873 if err != nil { 874 return err 875 } 876 877 pos, _ = getter.Skip() 878 if pos-kp == 1 { 879 ks[len(key)]++ 880 emptys++ 881 } 882 } 883 //fmt.Printf("emptys %d %#+v\n", emptys, ks) 884 885 if err := iw.Build(); err != nil { 886 return err 887 } 888 iw.Close() 889 return nil 890 } 891 892 // Opens .kv at dataPath and generates index over it to file 'indexPath' 893 func BuildBtreeIndex(dataPath, indexPath string, logger log.Logger) error { 894 decomp, err := compress.NewDecompressor(dataPath) 895 if err != nil { 896 return err 897 } 898 defer decomp.Close() 899 900 defer decomp.EnableReadAhead().DisableReadAhead() 901 902 args := BtIndexWriterArgs{ 903 IndexFile: indexPath, 904 TmpDir: filepath.Dir(indexPath), 905 } 906 907 iw, err := NewBtIndexWriter(args, logger) 908 if err != nil { 909 return err 910 } 911 defer iw.Close() 912 913 getter := decomp.MakeGetter() 914 getter.Reset(0) 915 916 key := make([]byte, 0, 64) 917 918 var pos uint64 919 for getter.HasNext() { 920 key, _ = getter.Next(key[:0]) 921 err = iw.AddKey(key, pos) 922 if err != nil { 923 return err 924 } 925 926 pos, _ = getter.Skip() 927 } 928 decomp.Close() 929 930 if err := iw.Build(); err != nil { 931 return err 932 } 933 iw.Close() 934 return nil 935 } 936 937 func OpenBtreeIndexWithDecompressor(indexPath string, M uint64, kv *compress.Decompressor) (*BtIndex, error) { 938 s, err := os.Stat(indexPath) 939 if err != nil { 940 return nil, err 941 } 942 943 idx := &BtIndex{ 944 filePath: indexPath, 945 size: s.Size(), 946 modTime: s.ModTime(), 947 auxBuf: make([]byte, 64), 948 } 949 950 idx.file, err = os.Open(indexPath) 951 if err != nil { 952 return nil, err 953 } 954 955 idx.m, err = mmap.MapRegion(idx.file, int(idx.size), mmap.RDONLY, 0, 0) 956 if err != nil { 957 return nil, err 958 } 959 idx.data = idx.m[:idx.size] 960 961 // Read number of keys and bytes per record 962 pos := 8 963 idx.keyCount = binary.BigEndian.Uint64(idx.data[:pos]) 964 if idx.keyCount == 0 { 965 return idx, nil 966 } 967 idx.bytesPerRec = int(idx.data[pos]) 968 pos += 1 969 970 //p := (*[]byte)(unsafe.Pointer(&idx.data[pos])) 971 //l := int(idx.keyCount)*idx.bytesPerRec + (16 * int(idx.keyCount)) 972 973 idx.getter = kv.MakeGetter() 974 975 idx.dataoffset = uint64(pos) 976 idx.alloc = newBtAlloc(idx.keyCount, M, false) 977 if idx.alloc != nil { 978 idx.alloc.dataLookup = idx.dataLookup 979 idx.alloc.traverseDfs() 980 defer idx.decompressor.EnableReadAhead().DisableReadAhead() 981 idx.alloc.fillSearchMx() 982 } 983 return idx, nil 984 } 985 986 func OpenBtreeIndex(indexPath, dataPath string, M uint64) (*BtIndex, error) { 987 s, err := os.Stat(indexPath) 988 if err != nil { 989 return nil, err 990 } 991 992 idx := &BtIndex{ 993 filePath: indexPath, 994 size: s.Size(), 995 modTime: s.ModTime(), 996 auxBuf: make([]byte, 64), 997 } 998 999 idx.file, err = os.Open(indexPath) 1000 if err != nil { 1001 return nil, err 1002 } 1003 1004 idx.m, err = mmap.MapRegion(idx.file, int(idx.size), mmap.RDONLY, 0, 0) 1005 if err != nil { 1006 return nil, err 1007 } 1008 idx.data = idx.m[:idx.size] 1009 1010 // Read number of keys and bytes per record 1011 pos := 8 1012 idx.keyCount = binary.BigEndian.Uint64(idx.data[:pos]) 1013 idx.bytesPerRec = int(idx.data[pos]) 1014 pos += 1 1015 1016 // offset := int(idx.keyCount) * idx.bytesPerRec //+ (idx.keySize * int(idx.keyCount)) 1017 // if offset < 0 { 1018 // return nil, fmt.Errorf("offset is: %d which is below zero, the file: %s is broken", offset, indexPath) 1019 // } 1020 1021 //p := (*[]byte)(unsafe.Pointer(&idx.data[pos])) 1022 //l := int(idx.keyCount)*idx.bytesPerRec + (16 * int(idx.keyCount)) 1023 1024 idx.decompressor, err = compress.NewDecompressor(dataPath) 1025 if err != nil { 1026 idx.Close() 1027 return nil, err 1028 } 1029 idx.getter = idx.decompressor.MakeGetter() 1030 1031 idx.dataoffset = uint64(pos) 1032 idx.alloc = newBtAlloc(idx.keyCount, M, false) 1033 if idx.alloc != nil { 1034 idx.alloc.dataLookup = idx.dataLookup 1035 idx.alloc.traverseDfs() 1036 defer idx.decompressor.EnableReadAhead().DisableReadAhead() 1037 idx.alloc.fillSearchMx() 1038 } 1039 return idx, nil 1040 } 1041 1042 var ErrBtIndexLookupBounds = errors.New("BtIndex: lookup di bounds error") 1043 1044 // dataLookup fetches key and value from data file by di (data index) 1045 // di starts from 0 so di is never >= keyCount 1046 func (b *BtIndex) dataLookup(di uint64) ([]byte, []byte, error) { 1047 if di >= b.keyCount { 1048 return nil, nil, fmt.Errorf("%w: keyCount=%d, item %d requested. file: %s", ErrBtIndexLookupBounds, b.keyCount, di+1, b.FileName()) 1049 } 1050 p := int(b.dataoffset) + int(di)*b.bytesPerRec 1051 if len(b.data) < p+b.bytesPerRec { 1052 return nil, nil, fmt.Errorf("data lookup gone too far (%d after %d). keyCount=%d, requesed item %d. file: %s", p+b.bytesPerRec-len(b.data), len(b.data), b.keyCount, di, b.FileName()) 1053 } 1054 1055 var aux [8]byte 1056 dst := aux[8-b.bytesPerRec:] 1057 copy(dst, b.data[p:p+b.bytesPerRec]) 1058 1059 offset := binary.BigEndian.Uint64(aux[:]) 1060 b.getter.Reset(offset) 1061 if !b.getter.HasNext() { 1062 return nil, nil, fmt.Errorf("pair %d not found. keyCount=%d. file: %s", di, b.keyCount, b.FileName()) 1063 } 1064 1065 key, kp := b.getter.Next(nil) 1066 1067 if !b.getter.HasNext() { 1068 return nil, nil, fmt.Errorf("pair %d not found. keyCount=%d. file: %s", di, b.keyCount, b.FileName()) 1069 } 1070 val, vp := b.getter.Next(nil) 1071 _, _ = kp, vp 1072 return key, val, nil 1073 } 1074 1075 func (b *BtIndex) Size() int64 { return b.size } 1076 1077 func (b *BtIndex) ModTime() time.Time { return b.modTime } 1078 1079 func (b *BtIndex) FilePath() string { return b.filePath } 1080 1081 func (b *BtIndex) FileName() string { return path.Base(b.filePath) } 1082 1083 func (b *BtIndex) Empty() bool { return b == nil || b.keyCount == 0 } 1084 1085 func (b *BtIndex) KeyCount() uint64 { return b.keyCount } 1086 1087 func (b *BtIndex) Close() { 1088 if b == nil { 1089 return 1090 } 1091 if b.file != nil { 1092 if err := b.m.Unmap(); err != nil { 1093 log.Log(dbg.FileCloseLogLevel, "unmap", "err", err, "file", b.FileName(), "stack", dbg.Stack()) 1094 } 1095 b.m = nil 1096 if err := b.file.Close(); err != nil { 1097 log.Log(dbg.FileCloseLogLevel, "close", "err", err, "file", b.FileName(), "stack", dbg.Stack()) 1098 } 1099 b.file = nil 1100 } 1101 if b.decompressor != nil { 1102 b.decompressor.Close() 1103 b.decompressor = nil 1104 } 1105 } 1106 1107 func (b *BtIndex) Seek(x []byte) (*Cursor, error) { 1108 if b.alloc == nil { 1109 return nil, nil 1110 } 1111 cursor, err := b.alloc.Seek(x) 1112 if err != nil { 1113 return nil, fmt.Errorf("seek key %x: %w", x, err) 1114 } 1115 // cursor could be nil along with err if nothing found 1116 return cursor, nil 1117 } 1118 1119 // deprecated 1120 func (b *BtIndex) Lookup(key []byte) uint64 { 1121 if b.alloc == nil { 1122 return 0 1123 } 1124 cursor, err := b.alloc.Seek(key) 1125 if err != nil { 1126 panic(err) 1127 } 1128 return binary.BigEndian.Uint64(cursor.value) 1129 } 1130 1131 func (b *BtIndex) OrdinalLookup(i uint64) *Cursor { 1132 if b.alloc == nil { 1133 return nil 1134 } 1135 if i > b.alloc.K { 1136 return nil 1137 } 1138 k, v, err := b.dataLookup(i) 1139 if err != nil { 1140 return nil 1141 } 1142 1143 return &Cursor{ 1144 key: k, value: v, d: i, ix: b.alloc, 1145 } 1146 }