github.com/ledgerwatch/erigon-lib@v1.0.0/state/btree_index.go (about)

     1  package state
     2  
     3  import (
     4  	"bufio"
     5  	"bytes"
     6  	"context"
     7  	"encoding/binary"
     8  	"errors"
     9  	"fmt"
    10  	"math"
    11  	"math/bits"
    12  	"os"
    13  	"path"
    14  	"path/filepath"
    15  	"time"
    16  
    17  	"github.com/c2h5oh/datasize"
    18  	"github.com/edsrzf/mmap-go"
    19  	"github.com/ledgerwatch/erigon-lib/common/dbg"
    20  	"github.com/ledgerwatch/log/v3"
    21  
    22  	"github.com/ledgerwatch/erigon-lib/common/background"
    23  
    24  	"github.com/ledgerwatch/erigon-lib/common"
    25  	"github.com/ledgerwatch/erigon-lib/common/length"
    26  	"github.com/ledgerwatch/erigon-lib/compress"
    27  	"github.com/ledgerwatch/erigon-lib/etl"
    28  )
    29  
    30  func logBase(n, base uint64) uint64 {
    31  	return uint64(math.Ceil(math.Log(float64(n)) / math.Log(float64(base))))
    32  }
    33  
    34  func min64(a, b uint64) uint64 {
    35  	if a < b {
    36  		return a
    37  	}
    38  	return b
    39  }
    40  
// markupCursor tracks one position while laying out a single level of
// the b-tree markup during traversal (see traverseDfs/traverseTrick).
type markupCursor struct {
	l  uint64 //l - level
	p  uint64 //p - pos inside level
	di uint64 //di - data array index
	si uint64 //si - current, actual son index
}
    47  
// node is one in-memory vertex of the markup tree. key/val are cached
// copies of the data pair at index d, filled later by fillSearchMx.
type node struct {
	p   uint64 // pos inside level
	d   uint64 // data array index this vertex refers to
	s   uint64 // sons pos inside level
	fc  uint64 // index of the first child within the next level's node slice
	key []byte // cached key at d (nil until fillSearchMx runs)
	val []byte // cached value at d (nil until fillSearchMx runs)
}
    56  
// Cursor points at one key/value pair of the index and can iterate
// forward over the underlying data through the owning btAlloc.
type Cursor struct {
	ctx context.Context
	ix  *btAlloc // owning allocator; provides dataLookup for Next

	key   []byte
	value []byte
	d     uint64 // ordinal (data index) of the current pair
}
    65  
    66  func (a *btAlloc) newCursor(ctx context.Context, k, v []byte, d uint64) *Cursor {
    67  	return &Cursor{
    68  		ctx:   ctx,
    69  		key:   common.Copy(k),
    70  		value: common.Copy(v),
    71  		d:     d,
    72  		ix:    a,
    73  	}
    74  }
    75  
// Key returns the key the cursor currently points at (not a copy).
func (c *Cursor) Key() []byte {
	return c.key
}
    79  
// Ordinal returns the data index of the current key/value pair.
func (c *Cursor) Ordinal() uint64 {
	return c.d
}
    83  
// Value returns the value the cursor currently points at (not a copy).
func (c *Cursor) Value() []byte {
	return c.value
}
    87  
    88  func (c *Cursor) Next() bool {
    89  	if c.d > c.ix.K-1 {
    90  		return false
    91  	}
    92  	k, v, err := c.ix.dataLookup(c.d + 1)
    93  	if err != nil {
    94  		return false
    95  	}
    96  	c.key = common.Copy(k)
    97  	c.value = common.Copy(v)
    98  	c.d++
    99  	return true
   100  }
   101  
// btAlloc holds the precomputed b-tree markup and the in-RAM search
// matrix used to narrow disk lookups during Seek.
type btAlloc struct {
	d       uint64 // depth
	M       uint64 // child limit of any node
	N       uint64 // total markup node count
	K       uint64 // number of key/value pairs in the data file
	vx      []uint64   // vertex count on level
	sons    [][]uint64 // i - level; 0 <= i < d; j_k - amount, j_k+1 - child count
	cursors []markupCursor // one traversal cursor per level
	nodes   [][]node       // search matrix: markup nodes per level
	naccess uint64         // probe counter, reported when trace is on
	trace   bool

	dataLookup func(di uint64) ([]byte, []byte, error) // reads pair di from the data file
}
   116  
// newBtAlloc precomputes the markup (vertex counts per level and
// son-group descriptors) for a b-tree over k keys with fan-out M.
// Returns nil when k == 0.
func newBtAlloc(k, M uint64, trace bool) *btAlloc {
	if k == 0 {
		return nil
	}

	// Depth needed for k keys at fan-out M.
	d := logBase(k, M)
	a := &btAlloc{
		vx:      make([]uint64, d+1),
		sons:    make([][]uint64, d+1),
		cursors: make([]markupCursor, d),
		nodes:   make([][]node, d),
		M:       M,
		K:       k,
		d:       d,
		trace:   trace,
	}
	if trace {
		fmt.Printf("k=%d d=%d, M=%d\n", k, d, M)
	}
	// Root level has one vertex; the deepest level holds all k keys.
	a.vx[0], a.vx[d] = 1, k

	// Degenerate case: fewer keys than half a node — single-level tree.
	if k < M/2 {
		a.N = k
		a.nodes = make([][]node, 1)
		return a
	}

	//nnc := func(vx uint64) uint64 {
	//	return uint64(math.Ceil(float64(vx) / float64(M)))
	//}
	// nvc: vertices needed on a level to cover vx children when each
	// vertex carries at least M/2 of them.
	nvc := func(vx uint64) uint64 {
		return uint64(math.Ceil(float64(vx) / float64(M>>1)))
	}

	// Vertex counts per level, capped at M^i (max vertices at depth i).
	for i := a.d - 1; i > 0; i-- {
		nnc := uint64(math.Ceil(float64(a.vx[i+1]) / float64(M)))
		//nvc := uint64(math.Floor(float64(a.vx[i+1]) / float64(m))-1)
		//nnc := a.vx[i+1] / M
		//nvc := a.vx[i+1] / m
		//bvc := a.vx[i+1] / (m + (m >> 1))
		a.vx[i] = min64(uint64(math.Pow(float64(M), float64(i))), nnc)
	}

	// Son-group descriptors: (group count, children per vertex) pairs.
	// An odd remainder gets its own half-full (1, M/2) group.
	ncount := uint64(0)
	pnv := uint64(0)
	for l := a.d - 1; l > 0; l-- {
		//s := nnc(a.vx[l+1])
		sh := nvc(a.vx[l+1])

		if sh&1 == 1 {
			a.sons[l] = append(a.sons[l], sh>>1, M, 1, M>>1)
		} else {
			a.sons[l] = append(a.sons[l], sh>>1, M)
		}

		// ncount totals markup vertices; pnv counts level-1 groups and
		// becomes the root's child count below.
		for ik := 0; ik < len(a.sons[l]); ik += 2 {
			ncount += a.sons[l][ik] * a.sons[l][ik+1]
			if l == 1 {
				pnv += a.sons[l][ik]
			}
		}
	}
	a.sons[0] = []uint64{1, pnv}
	ncount += a.sons[0][0] * a.sons[0][1] // last one
	a.N = ncount

	if trace {
		for i, v := range a.sons {
			fmt.Printf("L%d=%v\n", i, v)
		}
	}

	return a
}
   191  
// nolint
// traverseTrick is another implementation of traverseDfs, supposed to be
// a bit cleaner but buggy yet — kept for reference only; nothing calls it.
func (a *btAlloc) traverseTrick() {
	for l := 0; l < len(a.sons)-1; l++ {
		if len(a.sons[l]) < 2 {
			panic("invalid btree allocation markup")
		}
		a.cursors[l] = markupCursor{uint64(l), 1, 0, 0}
		a.nodes[l] = make([]node, 0)
	}

	// lf walks the leaf level, c its parent level.
	lf := a.cursors[len(a.cursors)-1]
	c := a.cursors[(len(a.cursors) - 2)]

	var d uint64 // running data index
	var fin bool

	lf.di = d
	lf.si++
	d++
	a.cursors[len(a.cursors)-1] = lf

	moved := true
	for int(c.p) <= len(a.sons[c.l]) {
		if fin || d > a.K {
			break
		}
		c, lf = a.cursors[c.l], a.cursors[lf.l]

		c.di = d
		c.si++

		// Assign data indices to the remaining sons of the current group.
		sons := a.sons[lf.l][lf.p]
		for i := uint64(1); i < sons; i++ {
			lf.si++
			d++
		}
		lf.di = d
		d++

		a.nodes[lf.l] = append(a.nodes[lf.l], node{p: lf.p, s: lf.si, d: lf.di})
		a.nodes[c.l] = append(a.nodes[c.l], node{p: c.p, s: c.si, d: c.di})
		a.cursors[lf.l] = lf
		a.cursors[c.l] = c

		// Propagate the fill upwards towards the root.
		// NOTE(review): l is uint64, so `l >= 0` is always true — the
		// loop relies on the explicit `if l == 0 { break }` at the bottom.
		for l := lf.l; l >= 0; l-- {
			sc := a.cursors[l]
			sons, gsons := a.sons[sc.l][sc.p-1], a.sons[sc.l][sc.p]
			if l < c.l && moved {
				sc.di = d
				a.nodes[sc.l] = append(a.nodes[sc.l], node{d: sc.di})
				sc.si++
				d++
			}
			moved = (sc.si-1)/gsons != sc.si/gsons
			if sc.si/gsons >= sons {
				sz := uint64(len(a.sons[sc.l]) - 1)
				if sc.p+2 > sz {
					fin = l == lf.l
					break
				} else {
					sc.p += 2
					sc.si, sc.di = 0, 0
				}
				//moved = true
			}
			if l == lf.l {
				sc.si++
				sc.di = d
				d++
			}
			a.cursors[l] = sc
			if l == 0 {
				break
			}
		}
		moved = false
	}
}
   271  
// traverseDfs walks the precomputed markup depth-first, assigning a data
// index (di) to every markup vertex and appending the resulting nodes to
// a.nodes level by level. After it runs, a.N is the count of actually
// filled vertices and a.nodes is ready for fillSearchMx.
func (a *btAlloc) traverseDfs() {
	for l := 0; l < len(a.sons)-1; l++ {
		a.cursors[l] = markupCursor{uint64(l), 1, 0, 0}
		a.nodes[l] = make([]node, 0)
	}

	// Single-level tree: one node covering all K keys.
	if len(a.cursors) <= 1 {
		if a.nodes[0] == nil {
			a.nodes[0] = make([]node, 0)
		}
		a.nodes[0] = append(a.nodes[0], node{d: a.K})
		a.N = a.K
		if a.trace {
			fmt.Printf("ncount=%d ∂%.5f\n", a.N, float64(a.N-a.K)/float64(a.N))
		}
		return
	}

	// c walks the deepest (leaf) level, pc its parent level.
	c := a.cursors[len(a.cursors)-1]
	pc := a.cursors[(len(a.cursors) - 2)]
	root := new(node)
	trace := false

	var di uint64
	for stop := false; !stop; {
		// fill leaves, mark parent if needed (until all grandparents not marked up until root)
		// check if eldest parent has brothers
		//     -- has bros -> fill their leaves from the bottom
		//     -- no bros  -> shift cursor (tricky)
		if di > a.K {
			a.N = di - 1 // actually filled node count
			if a.trace {
				fmt.Printf("ncount=%d ∂%.5f\n", a.N, float64(a.N-a.K)/float64(a.N))
			}
			break
		}

		bros, parents := a.sons[c.l][c.p], a.sons[c.l][c.p-1]
		for i := uint64(0); i < bros; i++ {
			c.di = di
			if trace {
				fmt.Printf("L%d |%d| d %2d s %2d\n", c.l, c.p, c.di, c.si)
			}
			c.si++
			di++

			// First son of a group also stamps its parent.
			if i == 0 {
				pc.di = di
				if trace {
					fmt.Printf("P%d |%d| d %2d s %2d\n", pc.l, pc.p, pc.di, pc.si)
				}
				pc.si++
				di++
			}
			if di > a.K {
				a.N = di - 1 // actually filled node count
				stop = true
				break
			}
		}

		a.nodes[c.l] = append(a.nodes[c.l], node{p: c.p, d: c.di, s: c.si})
		a.nodes[pc.l] = append(a.nodes[pc.l], node{p: pc.p, d: pc.di, s: pc.si, fc: uint64(len(a.nodes[c.l]) - 1)})

		// Advance to the next son group once this one's parents are used up.
		pid := c.si / bros
		if pid >= parents {
			if c.p+2 >= uint64(len(a.sons[c.l])) {
				stop = true // end of row
				if trace {
					fmt.Printf("F%d |%d| d %2d\n", c.l, c.p, c.di)
				}
			} else {
				c.p += 2
				c.si = 0
				c.di = 0
			}
		}
		a.cursors[c.l] = c
		a.cursors[pc.l] = pc

		// Propagate the fill up towards the root.
		//nolint
		for l := pc.l; l >= 0; l-- {
			pc := a.cursors[l]
			uncles := a.sons[pc.l][pc.p]
			grands := a.sons[pc.l][pc.p-1]

			pi1 := pc.si / uncles
			pc.si++
			pc.di = 0

			pi2 := pc.si / uncles
			moved := pi2-pi1 != 0 // crossed into the next parent group

			switch {
			case pc.l > 0:
				gp := a.cursors[pc.l-1]
				if gp.di == 0 {
					gp.di = di
					di++
					if trace {
						fmt.Printf("P%d |%d| d %2d s %2d\n", gp.l, gp.p, gp.di, gp.si)
					}
					a.nodes[gp.l] = append(a.nodes[gp.l], node{p: gp.p, d: gp.di, s: gp.si, fc: uint64(len(a.nodes[l]) - 1)})
					a.cursors[gp.l] = gp
				}
			default:
				if root.d == 0 {
					root.d = di
					//di++
					if trace {
						fmt.Printf("ROOT | d %2d\n", root.d)
					}
				}
			}

			//fmt.Printf("P%d |%d| d %2d s %2d pid %d\n", pc.l, pc.p, pc.di, pc.si-1)
			if pi2 >= grands { // skip one step of si due to different parental filling order
				if pc.p+2 >= uint64(len(a.sons[pc.l])) {
					if trace {
						fmt.Printf("EoRow %d |%d|\n", pc.l, pc.p)
					}
					break // end of row
				}
				//fmt.Printf("N %d d%d s%d\n", pc.l, pc.di, pc.si)
				//fmt.Printf("P%d |%d| d %2d s %2d pid %d\n", pc.l, pc.p, pc.di, pc.si, pid)
				pc.p += 2
				pc.si = 0
				pc.di = 0
			}
			a.cursors[pc.l] = pc

			if !moved {
				break
			}
		}
	}

	if a.trace {
		fmt.Printf("ncount=%d ∂%.5f\n", a.N, float64(a.N-a.K)/float64(a.N))
	}
}
   413  
// bsKey binary-searches the data file for key x within data indices
// [l, r]. It returns a cursor at the exact match, or at index l (the
// narrowed left bound) when no exact match exists. Returns (nil, nil)
// when the lookup goes out of bounds (treated as "not found").
func (a *btAlloc) bsKey(x []byte, l, r uint64) (*Cursor, error) {
	for l <= r {
		di := (l + r) >> 1

		mk, value, err := a.dataLookup(di)
		a.naccess++

		// NOTE(review): cmp is computed before err is checked; this is
		// safe only because bytes.Compare tolerates the nil mk returned
		// alongside an error.
		cmp := bytes.Compare(mk, x)
		switch {
		case err != nil:
			if errors.Is(err, ErrBtIndexLookupBounds) {
				return nil, nil
			}
			return nil, err
		case cmp == 0:
			return a.newCursor(context.TODO(), mk, value, di), nil
		case cmp == -1:
			l = di + 1
		default:
			r = di
		}
		if l == r {
			break
		}
	}
	// No exact match — fall back to the left bound as nearest key >= x.
	k, v, err := a.dataLookup(l)
	if err != nil {
		if errors.Is(err, ErrBtIndexLookupBounds) {
			return nil, nil
		}
		return nil, fmt.Errorf("key >= %x was not found. %w", x, err)
	}
	return a.newCursor(context.TODO(), k, v, l), nil
}
   448  
// bsNode binary-searches the cached keys of level i within positions
// [l, r). It returns the node at the final probe position plus the
// tightest left (lm) and right (rm) probe indices seen; -1 means that
// side was never bounded.
func (a *btAlloc) bsNode(i, l, r uint64, x []byte) (n node, lm int64, rm int64) {
	lm, rm = -1, -1
	var m uint64

	for l < r {
		m = (l + r) >> 1

		a.naccess++
		cmp := bytes.Compare(a.nodes[i][m].key, x)
		switch {
		case cmp == 0:
			return a.nodes[i][m], int64(m), int64(m)
		case cmp > 0:
			r = m
			rm = int64(m)
		case cmp < 0:
			lm = int64(m)
			l = m + 1
		default:
			// unreachable: the three cases above cover every comparison result
			panic(fmt.Errorf("compare error %d, %x ? %x", cmp, n.key, x))
		}
	}
	return a.nodes[i][m], lm, rm
}
   473  
   474  // find position of key with node.di <= d at level lvl
   475  func (a *btAlloc) seekLeast(lvl, d uint64) uint64 {
   476  	for i := range a.nodes[lvl] {
   477  		if a.nodes[lvl][i].d >= d {
   478  			return uint64(i)
   479  		}
   480  	}
   481  	return uint64(len(a.nodes[lvl]))
   482  }
   483  
// Seek descends the in-RAM search matrix level by level, narrowing the
// data-index window [minD, maxD] that can contain ik, then finishes with
// an on-disk binary search (bsKey) inside that window. Returns a cursor
// at the first key >= ik, or (nil, nil) when nothing qualifies.
func (a *btAlloc) Seek(ik []byte) (*Cursor, error) {
	if a.trace {
		fmt.Printf("seek key %x\n", ik)
	}

	var (
		lm, rm     int64
		L, R       = uint64(0), uint64(len(a.nodes[0]) - 1) // node-position window on the current level
		minD, maxD = uint64(0), a.K                          // data-index window
		ln         node
	)

	for l, level := range a.nodes {
		// Single-node root: its d bounds the whole search.
		if len(level) == 1 && l == 0 {
			ln = a.nodes[0][0]
			maxD = ln.d
			break
		}
		ln, lm, rm = a.bsNode(uint64(l), L, R, ik)
		if ln.key == nil { // should return node which is nearest to key from the left so never nil
			if a.trace {
				fmt.Printf("found nil key %x pos_range[%d-%d] naccess_ram=%d\n", l, lm, rm, a.naccess)
			}
			return nil, fmt.Errorf("bt index nil node at level %d", l)
		}

		switch bytes.Compare(ln.key, ik) {
		case 1: // key > ik
			maxD = ln.d
		case -1: // key < ik
			minD = ln.d
		case 0:
			if a.trace {
				fmt.Printf("found key %x v=%x naccess_ram=%d\n", ik, ln.val /*level[m].d,*/, a.naccess)
			}
			return a.newCursor(context.TODO(), common.Copy(ln.key), common.Copy(ln.val), ln.d), nil
		}

		// Window already tight enough — go to disk.
		if rm-lm >= 1 {
			break
		}
		// Translate the probe bounds into the next level's windows, using
		// first-child links where available, seekLeast otherwise.
		if lm >= 0 {
			minD = a.nodes[l][lm].d
			L = level[lm].fc
		} else if l+1 != len(a.nodes) {
			L = a.seekLeast(uint64(l+1), minD)
			if L == uint64(len(a.nodes[l+1])) {
				L--
			}
		}
		if rm >= 0 {
			maxD = a.nodes[l][rm].d
			R = level[rm].fc
		} else if l+1 != len(a.nodes) {
			R = a.seekLeast(uint64(l+1), maxD)
			if R == uint64(len(a.nodes[l+1])) {
				R--
			}
		}

		if a.trace {
			fmt.Printf("range={%x d=%d p=%d} (%d, %d) L=%d naccess_ram=%d\n", ln.key, ln.d, ln.p, minD, maxD, l, a.naccess)
		}
	}

	a.naccess = 0 // reset count before actually go to disk
	cursor, err := a.bsKey(ik, minD, maxD)
	if err != nil {
		if a.trace {
			fmt.Printf("key %x not found\n", ik)
		}
		return nil, err
	}

	if a.trace {
		fmt.Printf("finally found key %x v=%x naccess_disk=%d\n", cursor.key, cursor.value, a.naccess)
	}
	return cursor, nil
}
   563  
// fillSearchMx materializes the in-RAM search matrix: for every markup
// node it loads the key/value pair at the node's data index from disk
// and caches copies on the node. Lookup failures are printed and left as
// nil entries (best-effort).
func (a *btAlloc) fillSearchMx() {
	for i, n := range a.nodes {
		if a.trace {
			fmt.Printf("D%d |%d| ", i, len(n))
		}
		for j, s := range n {
			if a.trace {
				fmt.Printf("%d ", s.d)
			}
			// Nodes with d >= K point past the data; the rest of this
			// level cannot be filled.
			if s.d >= a.K {
				break
			}

			kb, v, err := a.dataLookup(s.d)
			if err != nil {
				fmt.Printf("d %d not found %v\n", s.d, err)
			}
			a.nodes[i][j].key = common.Copy(kb)
			a.nodes[i][j].val = common.Copy(v)
		}
		if a.trace {
			fmt.Printf("\n")
		}
	}
}
   589  
// BtIndexReader wraps a BtIndex for read operations.
//
// Deprecated: use BtIndex directly.
type BtIndexReader struct {
	index *BtIndex
}
   594  
   595  func NewBtIndexReader(index *BtIndex) *BtIndexReader {
   596  	return &BtIndexReader{
   597  		index: index,
   598  	}
   599  }
   600  
   601  // Lookup wraps index Lookup
   602  func (r *BtIndexReader) Lookup(key []byte) uint64 {
   603  	if r.index != nil {
   604  		return r.index.Lookup(key)
   605  	}
   606  	return 0
   607  }
   608  
   609  func (r *BtIndexReader) Lookup2(key1, key2 []byte) uint64 {
   610  	fk := make([]byte, 52)
   611  	copy(fk[:length.Addr], key1)
   612  	copy(fk[length.Addr:], key2)
   613  
   614  	if r.index != nil {
   615  		return r.index.Lookup(fk)
   616  	}
   617  	return 0
   618  }
   619  
   620  func (r *BtIndexReader) Seek(x []byte) (*Cursor, error) {
   621  	if r.index != nil {
   622  		cursor, err := r.index.alloc.Seek(x)
   623  		if err != nil {
   624  			return nil, fmt.Errorf("seek key %x: %w", x, err)
   625  		}
   626  		return cursor, nil
   627  	}
   628  	return nil, fmt.Errorf("seek has been failed")
   629  }
   630  
// Empty reports whether the underlying index holds no keys.
func (r *BtIndexReader) Empty() bool {
	return r.index.Empty()
}
   634  
// BtIndexWriter gathers (key, offset) pairs, sorted by key through an
// ETL collector, and serializes them into a btree index file on Build.
type BtIndexWriter struct {
	built           bool
	lvl             log.Lvl
	maxOffset       uint64 // largest offset seen; determines bytesPerRec
	prevOffset      uint64
	minDelta        uint64
	indexW          *bufio.Writer
	indexF          *os.File
	bucketCollector *etl.Collector // Collector that sorts by buckets

	indexFileName          string
	indexFile, tmpFilePath string // final path and the .tmp path written first

	tmpDir      string
	numBuf      [8]byte // scratch buffer for big-endian encoding
	keyCount    uint64
	etlBufLimit datasize.ByteSize
	bytesPerRec int // bytes stored per offset record
	logger      log.Logger
	noFsync     bool // fsync is enabled by default, but tests can manually disable
}
   656  
// BtIndexWriterArgs configures NewBtIndexWriter.
type BtIndexWriterArgs struct {
	IndexFile   string // File name where the index and the minimal perfect hash function will be written to
	TmpDir      string // scratch directory for the ETL collector
	KeyCount    int    // NOTE(review): not read by NewBtIndexWriter in this version
	EtlBufLimit datasize.ByteSize
}
   663  
   664  const BtreeLogPrefix = "btree"
   665  
// NewBtIndexWriter creates a new BtIndexWriter that writes the index to
// args.IndexFile (through a .tmp file), buffering offsets sorted by key
// in an ETL collector rooted at args.TmpDir until Build streams them out.
//
// NOTE(review): the previous comment described bucket sizes and a salt
// parameter; neither exists in this signature — it appeared copied from
// the recsplit writer.
func NewBtIndexWriter(args BtIndexWriterArgs, logger log.Logger) (*BtIndexWriter, error) {
	btw := &BtIndexWriter{lvl: log.LvlDebug, logger: logger}
	btw.tmpDir = args.TmpDir
	btw.indexFile = args.IndexFile
	btw.tmpFilePath = args.IndexFile + ".tmp"

	_, fname := filepath.Split(btw.indexFile)
	btw.indexFileName = fname
	btw.etlBufLimit = args.EtlBufLimit
	if btw.etlBufLimit == 0 {
		btw.etlBufLimit = etl.BufferOptimalSize
	}

	btw.bucketCollector = etl.NewCollector(BtreeLogPrefix+" "+fname, btw.tmpDir, etl.NewSortableBuffer(btw.etlBufLimit), logger)
	btw.bucketCollector.LogLvl(log.LvlDebug)

	btw.maxOffset = 0
	return btw, nil
}
   689  
// loadFuncBucket is required to satisfy the type etl.LoadFunc type, to use with collector.Load.
// It writes only the low bytesPerRec bytes of each 8-byte big-endian
// offset value into the index stream; keys arrive in sorted order from
// the collector and are not themselves stored.
func (btw *BtIndexWriter) loadFuncBucket(k, v []byte, _ etl.CurrentTableReader, _ etl.LoadNextFunc) error {
	if _, err := btw.indexW.Write(v[8-btw.bytesPerRec:]); err != nil {
		return err
	}

	return nil
}
   710  
   711  // Build has to be called after all the keys have been added, and it initiates the process
   712  // of building the perfect hash function and writing index into a file
   713  func (btw *BtIndexWriter) Build() error {
   714  	if btw.built {
   715  		return fmt.Errorf("already built")
   716  	}
   717  	//if btw.keysAdded != btw.keyCount {
   718  	//	return fmt.Errorf("expected keys %d, got %d", btw.keyCount, btw.keysAdded)
   719  	//}
   720  	var err error
   721  	if btw.indexF, err = os.Create(btw.tmpFilePath); err != nil {
   722  		return fmt.Errorf("create index file %s: %w", btw.indexFile, err)
   723  	}
   724  	defer btw.indexF.Close()
   725  	btw.indexW = bufio.NewWriterSize(btw.indexF, etl.BufIOSize)
   726  
   727  	// Write number of keys
   728  	binary.BigEndian.PutUint64(btw.numBuf[:], btw.keyCount)
   729  	if _, err = btw.indexW.Write(btw.numBuf[:]); err != nil {
   730  		return fmt.Errorf("write number of keys: %w", err)
   731  	}
   732  	// Write number of bytes per index record
   733  	btw.bytesPerRec = common.BitLenToByteLen(bits.Len64(btw.maxOffset))
   734  	if err = btw.indexW.WriteByte(byte(btw.bytesPerRec)); err != nil {
   735  		return fmt.Errorf("write bytes per record: %w", err)
   736  	}
   737  
   738  	defer btw.bucketCollector.Close()
   739  	log.Log(btw.lvl, "[index] calculating", "file", btw.indexFileName)
   740  	if err := btw.bucketCollector.Load(nil, "", btw.loadFuncBucket, etl.TransformArgs{}); err != nil {
   741  		return err
   742  	}
   743  
   744  	btw.logger.Log(btw.lvl, "[index] write", "file", btw.indexFileName)
   745  	btw.built = true
   746  
   747  	if err = btw.indexW.Flush(); err != nil {
   748  		return err
   749  	}
   750  	if err = btw.fsync(); err != nil {
   751  		return err
   752  	}
   753  	if err = btw.indexF.Close(); err != nil {
   754  		return err
   755  	}
   756  	if err = os.Rename(btw.tmpFilePath, btw.indexFile); err != nil {
   757  		return err
   758  	}
   759  	return nil
   760  }
   761  
   762  func (btw *BtIndexWriter) DisableFsync() { btw.noFsync = true }
   763  
// fsync - other processes/goroutines must see only "fully-complete" (valid) files. No partial-writes.
// To achieve it: write to .tmp file then `rename` when file is ready.
// Machine may power-off right after `rename` - it means `fsync` must be before `rename`.
// A failed sync is logged as a warning and returned to the caller.
func (btw *BtIndexWriter) fsync() error {
	if btw.noFsync {
		return nil
	}
	if err := btw.indexF.Sync(); err != nil {
		btw.logger.Warn("couldn't fsync", "err", err, "file", btw.tmpFilePath)
		return err
	}
	return nil
}
   777  
   778  func (btw *BtIndexWriter) Close() {
   779  	if btw.indexF != nil {
   780  		btw.indexF.Close()
   781  	}
   782  	if btw.bucketCollector != nil {
   783  		btw.bucketCollector.Close()
   784  	}
   785  	//if btw.offsetCollector != nil {
   786  	//	btw.offsetCollector.Close()
   787  	//}
   788  }
   789  
   790  func (btw *BtIndexWriter) AddKey(key []byte, offset uint64) error {
   791  	if btw.built {
   792  		return fmt.Errorf("cannot add keys after perfect hash function had been built")
   793  	}
   794  
   795  	binary.BigEndian.PutUint64(btw.numBuf[:], offset)
   796  	if offset > btw.maxOffset {
   797  		btw.maxOffset = offset
   798  	}
   799  	if btw.keyCount > 0 {
   800  		delta := offset - btw.prevOffset
   801  		if btw.keyCount == 1 || delta < btw.minDelta {
   802  			btw.minDelta = delta
   803  		}
   804  	}
   805  
   806  	if err := btw.bucketCollector.Collect(key, btw.numBuf[:]); err != nil {
   807  		return err
   808  	}
   809  	btw.keyCount++
   810  	btw.prevOffset = offset
   811  	return nil
   812  }
   813  
// BtIndex is a memory-mapped btree index over a compressed .kv file,
// with an in-RAM search matrix (alloc) to keep disk probes low.
type BtIndex struct {
	alloc        *btAlloc
	m            mmap.MMap
	data         []byte // mmapped index file contents
	file         *os.File
	size         int64
	modTime      time.Time
	filePath     string
	keyCount     uint64
	bytesPerRec  int    // width of each offset record in the index file
	dataoffset   uint64 // byte position where offset records start
	auxBuf       []byte
	decompressor *compress.Decompressor // nil when opened via OpenBtreeIndexWithDecompressor (caller owns it)
	getter       *compress.Getter
}
   829  
   830  func CreateBtreeIndex(indexPath, dataPath string, M uint64, logger log.Logger) (*BtIndex, error) {
   831  	err := BuildBtreeIndex(dataPath, indexPath, logger)
   832  	if err != nil {
   833  		return nil, err
   834  	}
   835  	return OpenBtreeIndex(indexPath, dataPath, M)
   836  }
   837  
   838  var DefaultBtreeM = uint64(2048)
   839  
   840  func CreateBtreeIndexWithDecompressor(indexPath string, M uint64, decompressor *compress.Decompressor, p *background.Progress, tmpdir string, logger log.Logger) (*BtIndex, error) {
   841  	err := BuildBtreeIndexWithDecompressor(indexPath, decompressor, p, tmpdir, logger)
   842  	if err != nil {
   843  		return nil, err
   844  	}
   845  	return OpenBtreeIndexWithDecompressor(indexPath, M, decompressor)
   846  }
   847  
   848  func BuildBtreeIndexWithDecompressor(indexPath string, kv *compress.Decompressor, p *background.Progress, tmpdir string, logger log.Logger) error {
   849  	defer kv.EnableReadAhead().DisableReadAhead()
   850  
   851  	args := BtIndexWriterArgs{
   852  		IndexFile: indexPath,
   853  		TmpDir:    tmpdir,
   854  	}
   855  
   856  	iw, err := NewBtIndexWriter(args, logger)
   857  	if err != nil {
   858  		return err
   859  	}
   860  
   861  	getter := kv.MakeGetter()
   862  	getter.Reset(0)
   863  
   864  	key := make([]byte, 0, 64)
   865  	ks := make(map[int]int)
   866  
   867  	var pos, kp uint64
   868  	emptys := 0
   869  	for getter.HasNext() {
   870  		p.Processed.Add(1)
   871  		key, kp = getter.Next(key[:0])
   872  		err = iw.AddKey(key, pos)
   873  		if err != nil {
   874  			return err
   875  		}
   876  
   877  		pos, _ = getter.Skip()
   878  		if pos-kp == 1 {
   879  			ks[len(key)]++
   880  			emptys++
   881  		}
   882  	}
   883  	//fmt.Printf("emptys %d %#+v\n", emptys, ks)
   884  
   885  	if err := iw.Build(); err != nil {
   886  		return err
   887  	}
   888  	iw.Close()
   889  	return nil
   890  }
   891  
   892  // Opens .kv at dataPath and generates index over it to file 'indexPath'
   893  func BuildBtreeIndex(dataPath, indexPath string, logger log.Logger) error {
   894  	decomp, err := compress.NewDecompressor(dataPath)
   895  	if err != nil {
   896  		return err
   897  	}
   898  	defer decomp.Close()
   899  
   900  	defer decomp.EnableReadAhead().DisableReadAhead()
   901  
   902  	args := BtIndexWriterArgs{
   903  		IndexFile: indexPath,
   904  		TmpDir:    filepath.Dir(indexPath),
   905  	}
   906  
   907  	iw, err := NewBtIndexWriter(args, logger)
   908  	if err != nil {
   909  		return err
   910  	}
   911  	defer iw.Close()
   912  
   913  	getter := decomp.MakeGetter()
   914  	getter.Reset(0)
   915  
   916  	key := make([]byte, 0, 64)
   917  
   918  	var pos uint64
   919  	for getter.HasNext() {
   920  		key, _ = getter.Next(key[:0])
   921  		err = iw.AddKey(key, pos)
   922  		if err != nil {
   923  			return err
   924  		}
   925  
   926  		pos, _ = getter.Skip()
   927  	}
   928  	decomp.Close()
   929  
   930  	if err := iw.Build(); err != nil {
   931  		return err
   932  	}
   933  	iw.Close()
   934  	return nil
   935  }
   936  
   937  func OpenBtreeIndexWithDecompressor(indexPath string, M uint64, kv *compress.Decompressor) (*BtIndex, error) {
   938  	s, err := os.Stat(indexPath)
   939  	if err != nil {
   940  		return nil, err
   941  	}
   942  
   943  	idx := &BtIndex{
   944  		filePath: indexPath,
   945  		size:     s.Size(),
   946  		modTime:  s.ModTime(),
   947  		auxBuf:   make([]byte, 64),
   948  	}
   949  
   950  	idx.file, err = os.Open(indexPath)
   951  	if err != nil {
   952  		return nil, err
   953  	}
   954  
   955  	idx.m, err = mmap.MapRegion(idx.file, int(idx.size), mmap.RDONLY, 0, 0)
   956  	if err != nil {
   957  		return nil, err
   958  	}
   959  	idx.data = idx.m[:idx.size]
   960  
   961  	// Read number of keys and bytes per record
   962  	pos := 8
   963  	idx.keyCount = binary.BigEndian.Uint64(idx.data[:pos])
   964  	if idx.keyCount == 0 {
   965  		return idx, nil
   966  	}
   967  	idx.bytesPerRec = int(idx.data[pos])
   968  	pos += 1
   969  
   970  	//p := (*[]byte)(unsafe.Pointer(&idx.data[pos]))
   971  	//l := int(idx.keyCount)*idx.bytesPerRec + (16 * int(idx.keyCount))
   972  
   973  	idx.getter = kv.MakeGetter()
   974  
   975  	idx.dataoffset = uint64(pos)
   976  	idx.alloc = newBtAlloc(idx.keyCount, M, false)
   977  	if idx.alloc != nil {
   978  		idx.alloc.dataLookup = idx.dataLookup
   979  		idx.alloc.traverseDfs()
   980  		defer idx.decompressor.EnableReadAhead().DisableReadAhead()
   981  		idx.alloc.fillSearchMx()
   982  	}
   983  	return idx, nil
   984  }
   985  
// OpenBtreeIndex mmaps the index file at indexPath, opens its own
// decompressor over dataPath (closed again by BtIndex.Close), and builds
// the in-memory search structure with fan-out M.
func OpenBtreeIndex(indexPath, dataPath string, M uint64) (*BtIndex, error) {
	s, err := os.Stat(indexPath)
	if err != nil {
		return nil, err
	}

	idx := &BtIndex{
		filePath: indexPath,
		size:     s.Size(),
		modTime:  s.ModTime(),
		auxBuf:   make([]byte, 64),
	}

	idx.file, err = os.Open(indexPath)
	if err != nil {
		return nil, err
	}

	idx.m, err = mmap.MapRegion(idx.file, int(idx.size), mmap.RDONLY, 0, 0)
	if err != nil {
		return nil, err
	}
	idx.data = idx.m[:idx.size]

	// Header: 8-byte big-endian key count, then 1 byte record width.
	// NOTE(review): unlike OpenBtreeIndexWithDecompressor, there is no
	// early return for keyCount == 0 here — newBtAlloc(0, ...) returning
	// nil covers it further down.
	pos := 8
	idx.keyCount = binary.BigEndian.Uint64(idx.data[:pos])
	idx.bytesPerRec = int(idx.data[pos])
	pos += 1

	// offset := int(idx.keyCount) * idx.bytesPerRec //+ (idx.keySize * int(idx.keyCount))
	// if offset < 0 {
	// 	return nil, fmt.Errorf("offset is: %d which is below zero, the file: %s is broken", offset, indexPath)
	// }

	//p := (*[]byte)(unsafe.Pointer(&idx.data[pos]))
	//l := int(idx.keyCount)*idx.bytesPerRec + (16 * int(idx.keyCount))

	idx.decompressor, err = compress.NewDecompressor(dataPath)
	if err != nil {
		idx.Close()
		return nil, err
	}
	idx.getter = idx.decompressor.MakeGetter()

	idx.dataoffset = uint64(pos)
	idx.alloc = newBtAlloc(idx.keyCount, M, false)
	if idx.alloc != nil {
		idx.alloc.dataLookup = idx.dataLookup
		idx.alloc.traverseDfs()
		// Keep read-ahead on while the whole search matrix is filled.
		defer idx.decompressor.EnableReadAhead().DisableReadAhead()
		idx.alloc.fillSearchMx()
	}
	return idx, nil
}
  1041  
  1042  var ErrBtIndexLookupBounds = errors.New("BtIndex: lookup di bounds error")
  1043  
  1044  // dataLookup fetches key and value from data file by di (data index)
  1045  // di starts from 0 so di is never >= keyCount
  1046  func (b *BtIndex) dataLookup(di uint64) ([]byte, []byte, error) {
  1047  	if di >= b.keyCount {
  1048  		return nil, nil, fmt.Errorf("%w: keyCount=%d, item %d requested. file: %s", ErrBtIndexLookupBounds, b.keyCount, di+1, b.FileName())
  1049  	}
  1050  	p := int(b.dataoffset) + int(di)*b.bytesPerRec
  1051  	if len(b.data) < p+b.bytesPerRec {
  1052  		return nil, nil, fmt.Errorf("data lookup gone too far (%d after %d). keyCount=%d, requesed item %d. file: %s", p+b.bytesPerRec-len(b.data), len(b.data), b.keyCount, di, b.FileName())
  1053  	}
  1054  
  1055  	var aux [8]byte
  1056  	dst := aux[8-b.bytesPerRec:]
  1057  	copy(dst, b.data[p:p+b.bytesPerRec])
  1058  
  1059  	offset := binary.BigEndian.Uint64(aux[:])
  1060  	b.getter.Reset(offset)
  1061  	if !b.getter.HasNext() {
  1062  		return nil, nil, fmt.Errorf("pair %d not found. keyCount=%d. file: %s", di, b.keyCount, b.FileName())
  1063  	}
  1064  
  1065  	key, kp := b.getter.Next(nil)
  1066  
  1067  	if !b.getter.HasNext() {
  1068  		return nil, nil, fmt.Errorf("pair %d not found. keyCount=%d. file: %s", di, b.keyCount, b.FileName())
  1069  	}
  1070  	val, vp := b.getter.Next(nil)
  1071  	_, _ = kp, vp
  1072  	return key, val, nil
  1073  }
  1074  
  1075  func (b *BtIndex) Size() int64 { return b.size }
  1076  
  1077  func (b *BtIndex) ModTime() time.Time { return b.modTime }
  1078  
  1079  func (b *BtIndex) FilePath() string { return b.filePath }
  1080  
  1081  func (b *BtIndex) FileName() string { return path.Base(b.filePath) }
  1082  
  1083  func (b *BtIndex) Empty() bool { return b == nil || b.keyCount == 0 }
  1084  
  1085  func (b *BtIndex) KeyCount() uint64 { return b.keyCount }
  1086  
// Close unmaps the index file, closes the file handle and (when owned)
// the decompressor. Safe on a nil receiver; fields are nilled so a
// second Close is a no-op. Errors are logged, not returned.
func (b *BtIndex) Close() {
	if b == nil {
		return
	}
	if b.file != nil {
		if err := b.m.Unmap(); err != nil {
			log.Log(dbg.FileCloseLogLevel, "unmap", "err", err, "file", b.FileName(), "stack", dbg.Stack())
		}
		b.m = nil
		if err := b.file.Close(); err != nil {
			log.Log(dbg.FileCloseLogLevel, "close", "err", err, "file", b.FileName(), "stack", dbg.Stack())
		}
		b.file = nil
	}
	// Only set when OpenBtreeIndex created its own decompressor.
	if b.decompressor != nil {
		b.decompressor.Close()
		b.decompressor = nil
	}
}
  1106  
  1107  func (b *BtIndex) Seek(x []byte) (*Cursor, error) {
  1108  	if b.alloc == nil {
  1109  		return nil, nil
  1110  	}
  1111  	cursor, err := b.alloc.Seek(x)
  1112  	if err != nil {
  1113  		return nil, fmt.Errorf("seek key %x: %w", x, err)
  1114  	}
  1115  	// cursor could be nil along with err if nothing found
  1116  	return cursor, nil
  1117  }
  1118  
  1119  // deprecated
  1120  func (b *BtIndex) Lookup(key []byte) uint64 {
  1121  	if b.alloc == nil {
  1122  		return 0
  1123  	}
  1124  	cursor, err := b.alloc.Seek(key)
  1125  	if err != nil {
  1126  		panic(err)
  1127  	}
  1128  	return binary.BigEndian.Uint64(cursor.value)
  1129  }
  1130  
  1131  func (b *BtIndex) OrdinalLookup(i uint64) *Cursor {
  1132  	if b.alloc == nil {
  1133  		return nil
  1134  	}
  1135  	if i > b.alloc.K {
  1136  		return nil
  1137  	}
  1138  	k, v, err := b.dataLookup(i)
  1139  	if err != nil {
  1140  		return nil
  1141  	}
  1142  
  1143  	return &Cursor{
  1144  		key: k, value: v, d: i, ix: b.alloc,
  1145  	}
  1146  }