github.com/ledgerwatch/erigon-lib@v1.0.0/recsplit/recsplit.go

     1  /*
     2     Copyright 2021 The Erigon contributors
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package recsplit
    18  
    19  import (
    20  	"bufio"
    21  	"context"
    22  	"crypto/rand"
    23  	"encoding/binary"
    24  	"fmt"
    25  	"io"
    26  	"math"
    27  	"math/bits"
    28  	"os"
    29  	"path/filepath"
    30  
    31  	"github.com/c2h5oh/datasize"
    32  	"github.com/ledgerwatch/log/v3"
    33  	"github.com/spaolacci/murmur3"
    34  
    35  	"github.com/ledgerwatch/erigon-lib/common"
    36  	"github.com/ledgerwatch/erigon-lib/common/assert"
    37  	"github.com/ledgerwatch/erigon-lib/etl"
    38  	"github.com/ledgerwatch/erigon-lib/recsplit/eliasfano16"
    39  	"github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32"
    40  )
    41  
    42  var ErrCollision = fmt.Errorf("duplicate key")
    43  
    44  const RecSplitLogPrefix = "recsplit"
    45  
    46  const MaxLeafSize = 24
    47  
    48  /** David Stafford's (http://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html)
    49   * 13th variant of the 64-bit finalizer function in Austin Appleby's
    50   * MurmurHash3 (https://github.com/aappleby/smhasher).
    51   *
    52   * @param z a 64-bit integer.
    53   * @return a 64-bit integer obtained by mixing the bits of `z`.
    54   */
    55  
    56  func remix(z uint64) uint64 {
    57  	z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9
    58  	z = (z ^ (z >> 27)) * 0x94d049bb133111eb
    59  	return z ^ (z >> 31)
    60  }
    61  
    62  // RecSplit is an implementation of the Recursive Split (RecSplit) algorithm for constructing a minimal perfect hash mapping, described in
    63  // https://arxiv.org/pdf/1910.06416.pdf: Emmanuel Esposito, Thomas Mueller Graf, and Sebastiano Vigna.
    64  // RecSplit: Minimal perfect hashing via recursive splitting. In 2020 Proceedings of the Symposium on Algorithm Engineering and Experiments (ALENEX),
    65  // pages 175-185. SIAM, 2020.
    66  type RecSplit struct {
    67  	hasher          murmur3.Hash128 // Salted hash function to use for splitting into initial buckets and mapping to 64-bit fingerprints
    68  	offsetCollector *etl.Collector  // Collector that sorts by offsets
    69  	indexW          *bufio.Writer
    70  	indexF          *os.File
    71  	offsetEf        *eliasfano32.EliasFano // Elias Fano instance for encoding the offsets
    72  	bucketCollector *etl.Collector         // Collector that sorts by buckets
    73  
    74  	indexFileName          string
    75  	indexFile, tmpFilePath string
    76  
    77  	tmpDir            string
    78  	gr                GolombRice // Helper object to encode the tree of hash function salts using Golomb-Rice code.
    79  	bucketPosAcc      []uint64   // Accumulator for position of every bucket in the encoding of the hash function
    80  	startSeed         []uint64
    81  	count             []uint16
    82  	currentBucket     []uint64 // 64-bit fingerprints of keys in the current bucket accumulated before the recsplit is performed for that bucket
    83  	currentBucketOffs []uint64 // Index offsets for the current bucket
    84  	offsetBuffer      []uint64
    85  	buffer            []uint64
    86  	golombRice        []uint32
    87  	bucketSizeAcc     []uint64 // Bucket size accumulator
    88  	// Helper object to encode the sequence of cumulative number of keys in the buckets
    89  	// and the sequence of cumulative bit offsets of buckets in the Golomb-Rice code.
    90  	ef                 eliasfano16.DoubleEliasFano
    91  	lvl                log.Lvl
    92  	bytesPerRec        int
    93  	minDelta           uint64 // minDelta for Elias Fano encoding of "enum -> offset" index
    94  	prevOffset         uint64 // Previously added offset (for calculating minDelta for Elias Fano encoding of "enum -> offset" index)
    95  	bucketSize         int
    96  	keyExpectedCount   uint64 // Number of keys in the hash table
    97  	keysAdded          uint64 // Number of keys actually added to the recSplit (to check the match with keyExpectedCount)
    98  	maxOffset          uint64 // Maximum value of index offset to later decide how many bytes to use for the encoding
    99  	currentBucketIdx   uint64 // Current bucket being accumulated
   100  	baseDataID         uint64 // Minimal app-specific ID of entries in this index - helps the application understand what data is stored in a given shard - persistent field
   101  	bucketCount        uint64 // Number of buckets
   102  	etlBufLimit        datasize.ByteSize
   103  	salt               uint32 // Murmur3 hash seed (salt) used for converting keys to 64-bit fingerprints and assigning them to buckets
   104  	leafSize           uint16 // Leaf size for recursive split algorithm
   105  	secondaryAggrBound uint16 // The lower bound for secondary key aggregation (computed from leafSize)
   106  	primaryAggrBound   uint16 // The lower bound for primary key aggregation (computed from leafSize)
   107  	bucketKeyBuf       [16]byte
   108  	numBuf             [8]byte
   109  	collision          bool
   110  	enums              bool // Whether to build a two-level index, with the perfect hash table pointing to an enumeration and the enumeration pointing to offsets
   111  	built              bool // Flag indicating that the hash function has been built and no more keys can be added
   112  	trace              bool
   113  	logger             log.Logger
   114  
   115  	noFsync bool // fsync is enabled by default, but tests can manually disable
   116  }
   117  
   118  type RecSplitArgs struct {
   119  	// Whether a two-level index needs to be built, where the perfect hash map points to an enumeration, and the enumeration points to offsets
   120  	// if Enums=false: can have unsorted and duplicated values
   121  	// if Enums=true:  must have sorted values (can have duplicates) - monotonically growing sequence
   122  	Enums bool
   123  
   124  	IndexFile   string // File name where the index and the minimal perfect hash function will be written to
   125  	TmpDir      string
   126  	StartSeed   []uint64 // For each level of recursive split, the hash seed (salt) used for that level - needs to be generated randomly and be large enough to accommodate all the levels
   127  	KeyCount    int
   128  	BucketSize  int
   129  	BaseDataID  uint64
   130  	EtlBufLimit datasize.ByteSize
   131  	Salt        uint32 // Hash seed (salt) for the hash function used for allocating the initial buckets - need to be generated randomly
   132  	LeafSize    uint16
   133  }
   134  
   135  // NewRecSplit creates a new RecSplit instance with the given number of keys and the given bucket size.
   136  // Typical bucket sizes are 100 - 2000; larger bucket sizes result in smaller representations of the hash function, at the cost of slower access.
   137  // The salt parameter is used to randomise the hash function construction, to ensure that different Erigon instances (nodes)
   138  // are likely to use different hash functions, so that collision attacks are unlikely to slow down any meaningful number of nodes at the same time.
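        //
        // A minimal usage sketch (illustrative only: the file paths and key/offset pairs below are
        // hypothetical, and error handling is omitted for brevity):
        //
        //	rs, _ := NewRecSplit(RecSplitArgs{
        //		KeyCount:   2,
        //		BucketSize: 100,
        //		LeafSize:   8,
        //		Enums:      true,
        //		TmpDir:     "/tmp",
        //		IndexFile:  "/tmp/example.idx",
        //	}, log.New())
        //	_ = rs.AddKey([]byte("first_key"), 0)
        //	_ = rs.AddKey([]byte("second_key"), 17)
        //	_ = rs.Build(context.Background())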
   139  func NewRecSplit(args RecSplitArgs, logger log.Logger) (*RecSplit, error) {
   140  	bucketCount := (args.KeyCount + args.BucketSize - 1) / args.BucketSize
   141  	rs := &RecSplit{bucketSize: args.BucketSize, keyExpectedCount: uint64(args.KeyCount), bucketCount: uint64(bucketCount), lvl: log.LvlDebug, logger: logger}
   142  	if len(args.StartSeed) == 0 {
   143  		args.StartSeed = []uint64{0x106393c187cae21a, 0x6453cec3f7376937, 0x643e521ddbd2be98, 0x3740c6412f6572cb, 0x717d47562f1ce470, 0x4cd6eb4c63befb7c, 0x9bfd8c5e18c8da73,
   144  			0x082f20e10092a9a3, 0x2ada2ce68d21defc, 0xe33cb4f3e7c6466b, 0x3980be458c509c59, 0xc466fd9584828e8c, 0x45f0aabe1a61ede6, 0xf6e7b8b33ad9b98d,
   145  			0x4ef95e25f4b4983d, 0x81175195173b92d3, 0x4e50927d8dd15978, 0x1ea2099d1fafae7f, 0x425c8a06fbaaa815, 0xcd4216006c74052a}
   146  	}
   147  	rs.salt = args.Salt
   148  	if rs.salt == 0 {
   149  		seedBytes := make([]byte, 4)
   150  		if _, err := rand.Read(seedBytes); err != nil {
   151  			return nil, err
   152  		}
   153  		rs.salt = binary.BigEndian.Uint32(seedBytes)
   154  	}
   155  	rs.hasher = murmur3.New128WithSeed(rs.salt)
   156  	rs.tmpDir = args.TmpDir
   157  	rs.indexFile = args.IndexFile
   158  	rs.tmpFilePath = args.IndexFile + ".tmp"
   159  	_, fname := filepath.Split(rs.indexFile)
   160  	rs.indexFileName = fname
   161  	rs.baseDataID = args.BaseDataID
   162  	rs.etlBufLimit = args.EtlBufLimit
   163  	if rs.etlBufLimit == 0 {
   164  		rs.etlBufLimit = etl.BufferOptimalSize
   165  	}
   166  	rs.bucketCollector = etl.NewCollector(RecSplitLogPrefix+" "+fname, rs.tmpDir, etl.NewSortableBuffer(rs.etlBufLimit), logger)
   167  	rs.bucketCollector.LogLvl(log.LvlDebug)
   168  	rs.enums = args.Enums
   169  	if args.Enums {
   170  		rs.offsetCollector = etl.NewCollector(RecSplitLogPrefix+" "+fname, rs.tmpDir, etl.NewSortableBuffer(rs.etlBufLimit), logger)
   171  		rs.offsetCollector.LogLvl(log.LvlDebug)
   172  	}
   173  	rs.currentBucket = make([]uint64, 0, args.BucketSize)
   174  	rs.currentBucketOffs = make([]uint64, 0, args.BucketSize)
   175  	rs.maxOffset = 0
   176  	rs.bucketSizeAcc = make([]uint64, 1, bucketCount+1)
   177  	rs.bucketPosAcc = make([]uint64, 1, bucketCount+1)
   178  	if args.LeafSize > MaxLeafSize {
   179  		return nil, fmt.Errorf("exceeded max leaf size %d: %d", MaxLeafSize, args.LeafSize)
   180  	}
   181  	rs.leafSize = args.LeafSize
   182  	rs.primaryAggrBound = rs.leafSize * uint16(math.Max(2, math.Ceil(0.35*float64(rs.leafSize)+1./2.)))
   183  	if rs.leafSize < 7 {
   184  		rs.secondaryAggrBound = rs.primaryAggrBound * 2
   185  	} else {
   186  		rs.secondaryAggrBound = rs.primaryAggrBound * uint16(math.Ceil(0.21*float64(rs.leafSize)+9./10.))
   187  	}
   188  	rs.startSeed = args.StartSeed
   189  	rs.count = make([]uint16, rs.secondaryAggrBound)
   190  	return rs, nil
   191  }
   192  
   193  func (rs *RecSplit) Close() {
   194  	if rs.indexF != nil {
   195  		rs.indexF.Close()
   196  	}
   197  	if rs.bucketCollector != nil {
   198  		rs.bucketCollector.Close()
   199  	}
   200  	if rs.offsetCollector != nil {
   201  		rs.offsetCollector.Close()
   202  	}
   203  }
   204  
   205  func (rs *RecSplit) LogLvl(lvl log.Lvl) { rs.lvl = lvl }
   206  
   207  func (rs *RecSplit) SetTrace(trace bool) {
   208  	rs.trace = trace
   209  }
   210  
   211  // remap converts the number x, which is assumed to be uniformly distributed over the range [0..2^64), to a number that is uniformly
   212  // distributed over the range [0..n)
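        // For example (a worked instance of the multiply-high trick, not an exhaustive specification):
        // remap(1<<63, 10) == 5, because the high 64 bits of the product 2^63 * 10 are exactly 5.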
   213  func remap(x uint64, n uint64) uint64 {
   214  	hi, _ := bits.Mul64(x, n)
   215  	return hi
   216  }
   217  
   218  const mask48 uint64 = (1 << 48) - 1
   219  
   220  // remap16 converts the number x, which is assumed to be uniformly distributed over the range [0..2^64), to a number that is uniformly
   221  // distributed over the range [0..n), under the assumption that n is less than 2^16
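        // For example (a worked instance under the same assumption): remap16(1<<47, 10) == 5,
        // because ((1<<47) * 10) >> 48 == 5.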
   222  func remap16(x uint64, n uint16) uint16 {
   223  	return uint16(((x & mask48) * uint64(n)) >> 48)
   224  }
   225  
   226  // ResetNextSalt resets the RecSplit and uses the next salt value to try to avoid collisions
   227  // when mapping keys to 64-bit values
   228  func (rs *RecSplit) ResetNextSalt() {
   229  	rs.built = false
   230  	rs.collision = false
   231  	rs.keysAdded = 0
   232  	rs.salt++
   233  	rs.hasher = murmur3.New128WithSeed(rs.salt)
   234  	if rs.bucketCollector != nil {
   235  		rs.bucketCollector.Close()
   236  	}
   237  	rs.bucketCollector = etl.NewCollector(RecSplitLogPrefix+" "+rs.indexFileName, rs.tmpDir, etl.NewSortableBuffer(rs.etlBufLimit), rs.logger)
   238  	if rs.offsetCollector != nil {
   239  		rs.offsetCollector.Close()
   240  		rs.offsetCollector = etl.NewCollector(RecSplitLogPrefix+" "+rs.indexFileName, rs.tmpDir, etl.NewSortableBuffer(rs.etlBufLimit), rs.logger)
   241  	}
   242  	rs.currentBucket = rs.currentBucket[:0]
   243  	rs.currentBucketOffs = rs.currentBucketOffs[:0]
   244  	rs.maxOffset = 0
   245  	rs.bucketSizeAcc = rs.bucketSizeAcc[:1] // First entry is always zero
   246  	rs.bucketPosAcc = rs.bucketPosAcc[:1]   // First entry is always zero
   247  }
   248  
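        // splitParams returns the fanout and unit size used to split a set of m keys at the current level
        // of the recursive split, based on the leaf size and the two aggregation bounds.
        // A worked example (assuming LeafSize=8, which yields primaryAggrBound=32 and secondaryAggrBound=96):
        // splitParams(200, 8, 32, 96) returns fanout=2, unit=192; splitParams(60, 8, 32, 96) returns fanout=2, unit=32;
        // splitParams(20, 8, 32, 96) returns fanout=3, unit=8.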
   249  func splitParams(m, leafSize, primaryAggrBound, secondaryAggrBound uint16) (fanout, unit uint16) {
   250  	if m > secondaryAggrBound { // High-level aggregation (fanout 2)
   251  		unit = secondaryAggrBound * (((m+1)/2 + secondaryAggrBound - 1) / secondaryAggrBound)
   252  		fanout = 2
   253  	} else if m > primaryAggrBound { // Second-level aggregation
   254  		unit = primaryAggrBound
   255  		fanout = (m + primaryAggrBound - 1) / primaryAggrBound
   256  	} else { // First-level aggregation
   257  		unit = leafSize
   258  		fanout = (m + leafSize - 1) / leafSize
   259  	}
   260  	return
   261  }
   262  
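        // computeGolombRice fills table[m] for a subtree that splits m keys: bits 27-31 hold the optimal
        // Golomb-Rice parameter (log2 of the Golomb modulus) for the split at the root, bits 16-26 hold the
        // number of nodes in the subtree, and the lower 16 bits hold the total Golomb-Rice code length of the subtree.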
   263  func computeGolombRice(m uint16, table []uint32, leafSize, primaryAggrBound, secondaryAggrBound uint16) {
   264  	fanout, unit := splitParams(m, leafSize, primaryAggrBound, secondaryAggrBound)
   265  	k := make([]uint16, fanout)
   266  	k[fanout-1] = m
   267  	for i := uint16(0); i < fanout-1; i++ {
   268  		k[i] = unit
   269  		k[fanout-1] -= k[i]
   270  	}
   271  	sqrtProd := float64(1)
   272  	for i := uint16(0); i < fanout; i++ {
   273  		sqrtProd *= math.Sqrt(float64(k[i]))
   274  	}
   275  	p := math.Sqrt(float64(m)) / (math.Pow(2*math.Pi, (float64(fanout)-1.)/2.0) * sqrtProd)
   276  	golombRiceLength := uint32(math.Ceil(math.Log2(-math.Log((math.Sqrt(5)+1.0)/2.0) / math.Log1p(-p)))) // log2 Golomb modulus
   277  	if golombRiceLength > 0x1F {
   278  		panic("golombRiceLength > 0x1F")
   279  	}
   280  	table[m] = golombRiceLength << 27
   281  	for i := uint16(0); i < fanout; i++ {
   282  		golombRiceLength += table[k[i]] & 0xFFFF
   283  	}
   284  	if golombRiceLength > 0xFFFF {
   285  		panic("golombRiceLength > 0xFFFF")
   286  	}
   287  	table[m] |= golombRiceLength // Sum of Golomb-Rice code lengths in the subtree, stored in the lower 16 bits
   288  	nodes := uint32(1)
   289  	for i := uint16(0); i < fanout; i++ {
   290  		nodes += (table[k[i]] >> 16) & 0x7FF
   291  	}
   292  	if leafSize >= 3 && nodes > 0x7FF {
   293  		panic("rs.leafSize >= 3 && nodes > 0x7FF")
   294  	}
   295  	table[m] |= nodes << 16
   296  }
   297  
   298  // golombParam returns the optimal Golomb parameter to use for encoding
   299  // salt for the part of the hash function separating m elements. It is based on
   300  // calculations under the assumption that hash functions are drawn at random
   301  func (rs *RecSplit) golombParam(m uint16) int {
   302  	s := uint16(len(rs.golombRice))
   303  	for m >= s {
   304  		rs.golombRice = append(rs.golombRice, 0)
   305  		// For the case where bucket is larger than planned
   306  		if s == 0 {
   307  			rs.golombRice[0] = (bijMemo[0] << 27) | bijMemo[0]
   308  		} else if s <= rs.leafSize {
   309  			rs.golombRice[s] = (bijMemo[s] << 27) | (uint32(1) << 16) | bijMemo[s]
   310  		} else {
   311  			computeGolombRice(s, rs.golombRice, rs.leafSize, rs.primaryAggrBound, rs.secondaryAggrBound)
   312  		}
   313  		s++
   314  	}
   315  	return int(rs.golombRice[m] >> 27)
   316  }
   317  
   318  // AddKey adds a key to the RecSplit. There can be many more keys than what fits in RAM, and RecSplit
   319  // spills data onto disk to accommodate that. The key gets copied by the collector, therefore
   320  // the slice underlying the key is not accessed by RecSplit after this invocation.
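        //
        // The record handed to the collector is the 8-byte big-endian bucket number followed by the 8-byte
        // low half of the Murmur3 fingerprint; the value is the 8-byte offset (or, when enums are enabled,
        // the enumeration index), as can be seen from the writes below.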
   321  func (rs *RecSplit) AddKey(key []byte, offset uint64) error {
   322  	if rs.built {
   323  		return fmt.Errorf("cannot add keys after perfect hash function has been built")
   324  	}
   325  	rs.hasher.Reset()
   326  	rs.hasher.Write(key) //nolint:errcheck
   327  	hi, lo := rs.hasher.Sum128()
   328  	binary.BigEndian.PutUint64(rs.bucketKeyBuf[:], remap(hi, rs.bucketCount))
   329  	binary.BigEndian.PutUint64(rs.bucketKeyBuf[8:], lo)
   330  	binary.BigEndian.PutUint64(rs.numBuf[:], offset)
   331  	if offset > rs.maxOffset {
   332  		rs.maxOffset = offset
   333  	}
   334  	if rs.keysAdded > 0 {
   335  		delta := offset - rs.prevOffset
   336  		if rs.keysAdded == 1 || delta < rs.minDelta {
   337  			rs.minDelta = delta
   338  		}
   339  	}
   340  
   341  	if rs.enums {
   342  		if err := rs.offsetCollector.Collect(rs.numBuf[:], nil); err != nil {
   343  			return err
   344  		}
   345  		binary.BigEndian.PutUint64(rs.numBuf[:], rs.keysAdded)
   346  		if err := rs.bucketCollector.Collect(rs.bucketKeyBuf[:], rs.numBuf[:]); err != nil {
   347  			return err
   348  		}
   349  	} else {
   350  		if err := rs.bucketCollector.Collect(rs.bucketKeyBuf[:], rs.numBuf[:]); err != nil {
   351  			return err
   352  		}
   353  	}
   354  	rs.keysAdded++
   355  	rs.prevOffset = offset
   356  	return nil
   357  }
   358  
   359  func (rs *RecSplit) AddOffset(offset uint64) error {
   360  	if rs.enums {
   361  		binary.BigEndian.PutUint64(rs.numBuf[:], offset)
   362  		if err := rs.offsetCollector.Collect(rs.numBuf[:], nil); err != nil {
   363  			return err
   364  		}
   365  	}
   366  	return nil
   367  }
   368  
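        // recsplitCurrentBucket applies the recursive split to the fingerprints accumulated for the current
        // bucket, appends the resulting unary codes to the Golomb-Rice encoder, updates the bucket size and
        // bit position accumulators, and resets the bucket buffers for the next bucket.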
   369  func (rs *RecSplit) recsplitCurrentBucket() error {
   370  	// Extend rs.bucketSizeAcc to accommodate current bucket index + 1
   371  	for len(rs.bucketSizeAcc) <= int(rs.currentBucketIdx)+1 {
   372  		rs.bucketSizeAcc = append(rs.bucketSizeAcc, rs.bucketSizeAcc[len(rs.bucketSizeAcc)-1])
   373  	}
   374  	rs.bucketSizeAcc[int(rs.currentBucketIdx)+1] += uint64(len(rs.currentBucket))
   375  	// Sets of size 0 and 1 are not further processed, just write them to index
   376  	if len(rs.currentBucket) > 1 {
   377  		for i, key := range rs.currentBucket[1:] {
   378  			if key == rs.currentBucket[i] {
   379  				rs.collision = true
   380  				return fmt.Errorf("%w: %x", ErrCollision, key)
   381  			}
   382  		}
   383  		bitPos := rs.gr.bitCount
   384  		if rs.buffer == nil {
   385  			rs.buffer = make([]uint64, len(rs.currentBucket))
   386  			rs.offsetBuffer = make([]uint64, len(rs.currentBucketOffs))
   387  		} else {
   388  			for len(rs.buffer) < len(rs.currentBucket) {
   389  				rs.buffer = append(rs.buffer, 0)
   390  				rs.offsetBuffer = append(rs.offsetBuffer, 0)
   391  			}
   392  		}
   393  		unary, err := rs.recsplit(0 /* level */, rs.currentBucket, rs.currentBucketOffs, nil /* unary */)
   394  		if err != nil {
   395  			return err
   396  		}
   397  		rs.gr.appendUnaryAll(unary)
   398  		if rs.trace {
   399  			fmt.Printf("recsplitBucket(%d, %d, bitsize = %d)\n", rs.currentBucketIdx, len(rs.currentBucket), rs.gr.bitCount-bitPos)
   400  		}
   401  	} else {
   402  		for _, offset := range rs.currentBucketOffs {
   403  			binary.BigEndian.PutUint64(rs.numBuf[:], offset)
   404  			if _, err := rs.indexW.Write(rs.numBuf[8-rs.bytesPerRec:]); err != nil {
   405  				return err
   406  			}
   407  		}
   408  	}
   409  	// Extend rs.bucketPosAcc to accommodate current bucket index + 1
   410  	for len(rs.bucketPosAcc) <= int(rs.currentBucketIdx)+1 {
   411  		rs.bucketPosAcc = append(rs.bucketPosAcc, rs.bucketPosAcc[len(rs.bucketPosAcc)-1])
   412  	}
   413  	rs.bucketPosAcc[int(rs.currentBucketIdx)+1] = uint64(rs.gr.Bits())
   414  	// clear for the next bucket
   415  	rs.currentBucket = rs.currentBucket[:0]
   416  	rs.currentBucketOffs = rs.currentBucketOffs[:0]
   417  	return nil
   418  }
   419  
   420  // recsplit applies the recSplit algorithm to the given bucket
   421  func (rs *RecSplit) recsplit(level int, bucket []uint64, offsets []uint64, unary []uint64) ([]uint64, error) {
   422  	if rs.trace {
   423  		fmt.Printf("recsplit(%d, %d, %x)\n", level, len(bucket), bucket)
   424  	}
   425  	// Pick initial salt for this level of recursive split
   426  	salt := rs.startSeed[level]
   427  	m := uint16(len(bucket))
   428  	if m <= rs.leafSize {
   429  		// No need to build aggregation levels - just find the bijection
   430  		var mask uint32
   431  		for {
   432  			mask = 0
   433  			var fail bool
   434  			for i := uint16(0); !fail && i < m; i++ {
   435  				bit := uint32(1) << remap16(remix(bucket[i]+salt), m)
   436  				if mask&bit != 0 {
   437  					fail = true
   438  				} else {
   439  					mask |= bit
   440  				}
   441  			}
   442  			if !fail {
   443  				break
   444  			}
   445  			salt++
   446  		}
   447  		for i := uint16(0); i < m; i++ {
   448  			j := remap16(remix(bucket[i]+salt), m)
   449  			rs.offsetBuffer[j] = offsets[i]
   450  		}
   451  		for _, offset := range rs.offsetBuffer[:m] {
   452  			binary.BigEndian.PutUint64(rs.numBuf[:], offset)
   453  			if _, err := rs.indexW.Write(rs.numBuf[8-rs.bytesPerRec:]); err != nil {
   454  				return nil, err
   455  			}
   456  		}
   457  		salt -= rs.startSeed[level]
   458  		log2golomb := rs.golombParam(m)
   459  		if rs.trace {
   460  			fmt.Printf("encode bij %d with log2golomb %d at p = %d\n", salt, log2golomb, rs.gr.bitCount)
   461  		}
   462  		rs.gr.appendFixed(salt, log2golomb)
   463  		unary = append(unary, salt>>log2golomb)
   464  	} else {
   465  		fanout, unit := splitParams(m, rs.leafSize, rs.primaryAggrBound, rs.secondaryAggrBound)
   466  		count := rs.count
   467  		for {
   468  			for i := uint16(0); i < fanout-1; i++ {
   469  				count[i] = 0
   470  			}
   471  			var fail bool
   472  			for i := uint16(0); i < m; i++ {
   473  				count[remap16(remix(bucket[i]+salt), m)/unit]++
   474  			}
   475  			for i := uint16(0); i < fanout-1; i++ {
   476  				fail = fail || (count[i] != unit)
   477  			}
   478  			if !fail {
   479  				break
   480  			}
   481  			salt++
   482  		}
   483  		for i, c := uint16(0), uint16(0); i < fanout; i++ {
   484  			count[i] = c
   485  			c += unit
   486  		}
   487  		for i := uint16(0); i < m; i++ {
   488  			j := remap16(remix(bucket[i]+salt), m) / unit
   489  			rs.buffer[count[j]] = bucket[i]
   490  			rs.offsetBuffer[count[j]] = offsets[i]
   491  			count[j]++
   492  		}
   493  		copy(bucket, rs.buffer)
   494  		copy(offsets, rs.offsetBuffer)
   495  		salt -= rs.startSeed[level]
   496  		log2golomb := rs.golombParam(m)
   497  		if rs.trace {
   498  			fmt.Printf("encode fanout %d: %d with log2golomb %d at p = %d\n", fanout, salt, log2golomb, rs.gr.bitCount)
   499  		}
   500  		rs.gr.appendFixed(salt, log2golomb)
   501  		unary = append(unary, salt>>log2golomb)
   502  		var err error
   503  		var i uint16
   504  		for i = 0; i < m-unit; i += unit {
   505  			if unary, err = rs.recsplit(level+1, bucket[i:i+unit], offsets[i:i+unit], unary); err != nil {
   506  				return nil, err
   507  			}
   508  		}
   509  		if m-i > 1 {
   510  			if unary, err = rs.recsplit(level+1, bucket[i:], offsets[i:], unary); err != nil {
   511  				return nil, err
   512  			}
   513  		} else if m-i == 1 {
   514  			binary.BigEndian.PutUint64(rs.numBuf[:], offsets[i])
   515  			if _, err := rs.indexW.Write(rs.numBuf[8-rs.bytesPerRec:]); err != nil {
   516  				return nil, err
   517  			}
   518  		}
   519  	}
   520  	return unary, nil
   521  }
   522  
   523  // loadFuncBucket is required to satisfy the etl.LoadFunc type, to use with collector.Load
   524  func (rs *RecSplit) loadFuncBucket(k, v []byte, _ etl.CurrentTableReader, _ etl.LoadNextFunc) error {
   525  	// k is the BigEndian encoding of the bucket number followed by the 64-bit fingerprint of the key, and v is the offset (or enumeration index) assigned to that key
   526  	bucketIdx := binary.BigEndian.Uint64(k)
   527  	if rs.currentBucketIdx != bucketIdx {
   528  		if rs.currentBucketIdx != math.MaxUint64 {
   529  			if err := rs.recsplitCurrentBucket(); err != nil {
   530  				return err
   531  			}
   532  		}
   533  		rs.currentBucketIdx = bucketIdx
   534  	}
   535  	rs.currentBucket = append(rs.currentBucket, binary.BigEndian.Uint64(k[8:]))
   536  	rs.currentBucketOffs = append(rs.currentBucketOffs, binary.BigEndian.Uint64(v))
   537  	return nil
   538  }
   539  
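        // loadFuncOffset is the etl.LoadFunc used to feed every collected offset into the Elias-Fano encoder
        // when the two-level (enums) index is being built.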
   540  func (rs *RecSplit) loadFuncOffset(k, _ []byte, _ etl.CurrentTableReader, _ etl.LoadNextFunc) error {
   541  	offset := binary.BigEndian.Uint64(k)
   542  	rs.offsetEf.AddOffset(offset)
   543  	return nil
   544  }
   545  
   546  // Build has to be called after all the keys have been added, and it initiates the process
   547  // of building the perfect hash function and writing the index into a file.
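        //
        // The index file is written in the following order (a summary of the writes below, not a formal
        // specification): 8-byte baseDataID, 8-byte key count, 1 byte giving the number of bytes per offset
        // record, the offset records themselves, 8-byte bucket count, 2-byte bucket size, 2-byte leaf size,
        // 4-byte salt, 1 byte giving the number of start seeds followed by the 8-byte seeds, 1 byte for the
        // enums flag, the Elias-Fano encoding of offsets (only when enums is set), 4 bytes holding the number
        // of Golomb-Rice parameters, the Golomb-Rice encoded tree of salts, and finally the double Elias-Fano
        // index over the bucket size and bit position accumulators.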
   548  func (rs *RecSplit) Build(ctx context.Context) error {
   549  	if rs.built {
   550  		return fmt.Errorf("already built")
   551  	}
   552  	if rs.keysAdded != rs.keyExpectedCount {
   553  		return fmt.Errorf("expected keys %d, got %d", rs.keyExpectedCount, rs.keysAdded)
   554  	}
   555  	var err error
   556  	if rs.indexF, err = os.Create(rs.tmpFilePath); err != nil {
   557  		return fmt.Errorf("create index file %s: %w", rs.indexFile, err)
   558  	}
   559  	defer rs.indexF.Close()
   560  	rs.indexW = bufio.NewWriterSize(rs.indexF, etl.BufIOSize)
   561  	// Write minimal app-specific dataID in this index file
   562  	binary.BigEndian.PutUint64(rs.numBuf[:], rs.baseDataID)
   563  	if _, err = rs.indexW.Write(rs.numBuf[:]); err != nil {
   564  		return fmt.Errorf("write baseDataID: %w", err)
   565  	}
   566  
   567  	// Write number of keys
   568  	binary.BigEndian.PutUint64(rs.numBuf[:], rs.keysAdded)
   569  	if _, err = rs.indexW.Write(rs.numBuf[:]); err != nil {
   570  		return fmt.Errorf("write number of keys: %w", err)
   571  	}
   572  	// Write number of bytes per index record
   573  	rs.bytesPerRec = common.BitLenToByteLen(bits.Len64(rs.maxOffset))
   574  	if err = rs.indexW.WriteByte(byte(rs.bytesPerRec)); err != nil {
   575  		return fmt.Errorf("write bytes per record: %w", err)
   576  	}
   577  
   578  	rs.currentBucketIdx = math.MaxUint64 // To make sure 0 bucket is detected
   579  	defer rs.bucketCollector.Close()
   580  	if rs.lvl < log.LvlTrace {
   581  		log.Log(rs.lvl, "[index] calculating", "file", rs.indexFileName)
   582  	}
   583  	if err := rs.bucketCollector.Load(nil, "", rs.loadFuncBucket, etl.TransformArgs{Quit: ctx.Done()}); err != nil {
   584  		return err
   585  	}
   586  	if len(rs.currentBucket) > 0 {
   587  		if err := rs.recsplitCurrentBucket(); err != nil {
   588  			return err
   589  		}
   590  	}
   591  
   592  	if assert.Enable {
   593  		rs.indexW.Flush()
   594  		rs.indexF.Seek(0, 0)
   595  		b, _ := io.ReadAll(rs.indexF)
   596  		if len(b) != 9+int(rs.keysAdded)*rs.bytesPerRec {
   597  			panic(fmt.Errorf("expected: %d, got: %d; rs.keysAdded=%d, rs.bytesPerRec=%d, %s", 9+int(rs.keysAdded)*rs.bytesPerRec, len(b), rs.keysAdded, rs.bytesPerRec, rs.indexFile))
   598  		}
   599  	}
   600  	if rs.lvl < log.LvlTrace {
   601  		log.Log(rs.lvl, "[index] write", "file", rs.indexFileName)
   602  	}
   603  	if rs.enums {
   604  		rs.offsetEf = eliasfano32.NewEliasFano(rs.keysAdded, rs.maxOffset)
   605  		defer rs.offsetCollector.Close()
   606  		if err := rs.offsetCollector.Load(nil, "", rs.loadFuncOffset, etl.TransformArgs{}); err != nil {
   607  			return err
   608  		}
   609  		rs.offsetEf.Build()
   610  	}
   611  	rs.gr.appendFixed(1, 1) // Sentinel (avoids checking for parts of size 1)
   612  	// Construct Elias Fano index
   613  	rs.ef.Build(rs.bucketSizeAcc, rs.bucketPosAcc)
   614  	rs.built = true
   615  
   616  	// Write out bucket count, bucketSize, leafSize
   617  	binary.BigEndian.PutUint64(rs.numBuf[:], rs.bucketCount)
   618  	if _, err := rs.indexW.Write(rs.numBuf[:8]); err != nil {
   619  		return fmt.Errorf("writing bucketCount: %w", err)
   620  	}
   621  	binary.BigEndian.PutUint16(rs.numBuf[:], uint16(rs.bucketSize))
   622  	if _, err := rs.indexW.Write(rs.numBuf[:2]); err != nil {
   623  		return fmt.Errorf("writing bucketSize: %w", err)
   624  	}
   625  	binary.BigEndian.PutUint16(rs.numBuf[:], rs.leafSize)
   626  	if _, err := rs.indexW.Write(rs.numBuf[:2]); err != nil {
   627  		return fmt.Errorf("writing leafSize: %w", err)
   628  	}
   629  	// Write out salt
   630  	binary.BigEndian.PutUint32(rs.numBuf[:], rs.salt)
   631  	if _, err := rs.indexW.Write(rs.numBuf[:4]); err != nil {
   632  		return fmt.Errorf("writing salt: %w", err)
   633  	}
   634  	// Write out start seeds
   635  	if err := rs.indexW.WriteByte(byte(len(rs.startSeed))); err != nil {
   636  		return fmt.Errorf("writing len of start seeds: %w", err)
   637  	}
   638  	for _, s := range rs.startSeed {
   639  		binary.BigEndian.PutUint64(rs.numBuf[:], s)
   640  		if _, err := rs.indexW.Write(rs.numBuf[:8]); err != nil {
   641  			return fmt.Errorf("writing start seed: %w", err)
   642  		}
   643  	}
   644  
   645  	if rs.enums {
   646  		if err := rs.indexW.WriteByte(1); err != nil {
   647  			return fmt.Errorf("writing enums = true: %w", err)
   648  		}
   649  	} else {
   650  		if err := rs.indexW.WriteByte(0); err != nil {
   651  			return fmt.Errorf("writing enums = false: %w", err)
   652  		}
   653  	}
   654  	if rs.enums {
   655  		// Write out elias fano for offsets
   656  		if err := rs.offsetEf.Write(rs.indexW); err != nil {
   657  			return fmt.Errorf("writing elias fano for offsets: %w", err)
   658  		}
   659  	}
   660  	// Write out the size of golomb rice params
   661  	binary.BigEndian.PutUint16(rs.numBuf[:], uint16(len(rs.golombRice)))
   662  	if _, err := rs.indexW.Write(rs.numBuf[:4]); err != nil {
   663  		return fmt.Errorf("writing golomb rice param size: %w", err)
   664  	}
   665  	// Write out golomb rice
   666  	if err := rs.gr.Write(rs.indexW); err != nil {
   667  		return fmt.Errorf("writing golomb rice: %w", err)
   668  	}
   669  	// Write out elias fano
   670  	if err := rs.ef.Write(rs.indexW); err != nil {
   671  		return fmt.Errorf("writing elias fano: %w", err)
   672  	}
   673  
   674  	if err = rs.indexW.Flush(); err != nil {
   675  		return err
   676  	}
   677  	if err = rs.fsync(); err != nil {
   678  		return err
   679  	}
   680  	if err = rs.indexF.Close(); err != nil {
   681  		return err
   682  	}
   683  	if err = os.Rename(rs.tmpFilePath, rs.indexFile); err != nil {
   684  		return err
   685  	}
   686  	return nil
   687  }
   688  
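        // DisableFsync disables the final fsync of the index file; fsync is enabled by default, and tests can disable it.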
   689  func (rs *RecSplit) DisableFsync() { rs.noFsync = true }
   690  
   691  // fsync - other processes/goroutines must see only "fully-complete" (valid) files. No partial writes.
   692  // To achieve this: write to a .tmp file, then `rename` it when the file is ready.
   693  // The machine may power off right after `rename`, which means `fsync` must happen before `rename`.
   694  func (rs *RecSplit) fsync() error {
   695  	if rs.noFsync {
   696  		return nil
   697  	}
   698  	if err := rs.indexF.Sync(); err != nil {
   699  		rs.logger.Warn("couldn't fsync", "err", err, "file", rs.tmpFilePath)
   700  		return err
   701  	}
   702  	return nil
   703  }
   704  
   705  // Stats returns the sizes of the Golomb-Rice encoding and the Elias-Fano encoding
   706  func (rs *RecSplit) Stats() (int, int) {
   707  	return len(rs.gr.Data()), len(rs.ef.Data())
   708  }
   709  
   710  // Collision returns true if a collision was detected during the mapping of keys
   711  // into 64-bit values. If so, the RecSplit needs to be reset, re-populated with keys,
   712  // and rebuilt.
   713  func (rs *RecSplit) Collision() bool {
   714  	return rs.collision
   715  }