github.com/ledgerwatch/erigon-lib@v1.0.0/recsplit/recsplit.go

/*
   Copyright 2021 The Erigon contributors

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/

package recsplit

import (
	"bufio"
	"context"
	"crypto/rand"
	"encoding/binary"
	"fmt"
	"io"
	"math"
	"math/bits"
	"os"
	"path/filepath"

	"github.com/c2h5oh/datasize"
	"github.com/ledgerwatch/log/v3"
	"github.com/spaolacci/murmur3"

	"github.com/ledgerwatch/erigon-lib/common"
	"github.com/ledgerwatch/erigon-lib/common/assert"
	"github.com/ledgerwatch/erigon-lib/etl"
	"github.com/ledgerwatch/erigon-lib/recsplit/eliasfano16"
	"github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32"
)

var ErrCollision = fmt.Errorf("duplicate key")

const RecSplitLogPrefix = "recsplit"

const MaxLeafSize = 24

/** David Stafford's (http://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html)
 * 13th variant of the 64-bit finalizer function in Austin Appleby's
 * MurmurHash3 (https://github.com/aappleby/smhasher).
 *
 * @param z a 64-bit integer.
 * @return a 64-bit integer obtained by mixing the bits of `z`.
 */

func remix(z uint64) uint64 {
	z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9
	z = (z ^ (z >> 27)) * 0x94d049bb133111eb
	return z ^ (z >> 31)
}

// RecSplit is the implementation of the Recursive Split algorithm for constructing a perfect hash mapping, described in
// https://arxiv.org/pdf/1910.06416.pdf Emmanuel Esposito, Thomas Mueller Graf, and Sebastiano Vigna.
// Recsplit: Minimal perfect hashing via recursive splitting. In 2020 Proceedings of the Symposium on Algorithm Engineering and Experiments (ALENEX),
// pages 175−185. SIAM, 2020.
type RecSplit struct {
	hasher          murmur3.Hash128 // Salted hash function to use for splitting into initial buckets and mapping to 64-bit fingerprints
	offsetCollector *etl.Collector  // Collector that sorts by offsets
	indexW          *bufio.Writer
	indexF          *os.File
	offsetEf        *eliasfano32.EliasFano // Elias Fano instance for encoding the offsets
	bucketCollector *etl.Collector         // Collector that sorts by buckets

	indexFileName          string
	indexFile, tmpFilePath string

	tmpDir            string
	gr                GolombRice // Helper object to encode the tree of hash function salts using Golomb-Rice code.
	bucketPosAcc      []uint64   // Accumulator for position of every bucket in the encoding of the hash function
	startSeed         []uint64
	count             []uint16
	currentBucket     []uint64 // 64-bit fingerprints of keys in the current bucket accumulated before the recsplit is performed for that bucket
	currentBucketOffs []uint64 // Index offsets for the current bucket
	offsetBuffer      []uint64
	buffer            []uint64
	golombRice        []uint32
	bucketSizeAcc     []uint64 // Bucket size accumulator
	// Helper object to encode the sequence of cumulative number of keys in the buckets
	// and the sequence of cumulative bit offsets of buckets in the Golomb-Rice code.
	ef                 eliasfano16.DoubleEliasFano
	lvl                log.Lvl
	bytesPerRec        int
	minDelta           uint64 // minDelta for Elias Fano encoding of "enum -> offset" index
	prevOffset         uint64 // Previously added offset (for calculating minDelta for Elias Fano encoding of "enum -> offset" index)
	bucketSize         int
	keyExpectedCount   uint64 // Number of keys in the hash table
	keysAdded          uint64 // Number of keys actually added to the recSplit (to check the match with keyExpectedCount)
	maxOffset          uint64 // Maximum value of index offset to later decide how many bytes to use for the encoding
	currentBucketIdx   uint64 // Current bucket being accumulated
	baseDataID         uint64 // Minimal app-specific ID of entries of this index - helps the app understand what data is stored in a given shard - persistent field
	bucketCount        uint64 // Number of buckets
	etlBufLimit        datasize.ByteSize
	salt               uint32 // Murmur3 hash used for converting keys to 64-bit values and assigning to buckets
	leafSize           uint16 // Leaf size for recursive split algorithm
	secondaryAggrBound uint16 // The lower bound for secondary key aggregation (computed from leafSize)
	primaryAggrBound   uint16 // The lower bound for primary key aggregation (computed from leafSize)
	bucketKeyBuf       [16]byte
	numBuf             [8]byte
	collision          bool
	enums              bool // Whether to build two level index with perfect hash table pointing to enumeration and enumeration pointing to offsets
	built              bool // Flag indicating that the hash function has been built and no more keys can be added
	trace              bool
	logger             log.Logger

	noFsync bool // fsync is enabled by default, but tests can manually disable
}

type RecSplitArgs struct {
	// Whether two level index needs to be built, where perfect hash map points to an enumeration, and enumeration points to offsets
	// if Enum=false: can have unsorted and duplicated values
	// if Enum=true: must have sorted values (can have duplicates) - monotonically growing sequence
	Enums bool

	IndexFile   string // File name where the index and the minimal perfect hash function will be written to
	TmpDir      string
	StartSeed   []uint64 // For each level of recursive split, the hash seed (salt) used for that level - needs to be generated randomly and be large enough to accommodate all the levels
	KeyCount    int
	BucketSize  int
	BaseDataID  uint64
	EtlBufLimit datasize.ByteSize
	Salt        uint32 // Hash seed (salt) for the hash function used for allocating the initial buckets - needs to be generated randomly
	LeafSize    uint16
}

// NewRecSplit creates a new RecSplit instance with the given number of keys and given bucket size.
// Typical bucket size is 100 - 2000; larger bucket sizes result in smaller representations of the hash function, at a cost of slower access.
// The salt parameter is used to randomise the hash function construction, so that different Erigon instances (nodes)
// are likely to use different hash functions, and collision attacks are unlikely to slow down any meaningful number of nodes at the same time
func NewRecSplit(args RecSplitArgs, logger log.Logger) (*RecSplit, error) {
	bucketCount := (args.KeyCount + args.BucketSize - 1) / args.BucketSize
	rs := &RecSplit{bucketSize: args.BucketSize, keyExpectedCount: uint64(args.KeyCount), bucketCount: uint64(bucketCount), lvl: log.LvlDebug, logger: logger}
	if len(args.StartSeed) == 0 {
		args.StartSeed = []uint64{0x106393c187cae21a, 0x6453cec3f7376937, 0x643e521ddbd2be98, 0x3740c6412f6572cb, 0x717d47562f1ce470, 0x4cd6eb4c63befb7c, 0x9bfd8c5e18c8da73,
			0x082f20e10092a9a3, 0x2ada2ce68d21defc, 0xe33cb4f3e7c6466b, 0x3980be458c509c59, 0xc466fd9584828e8c, 0x45f0aabe1a61ede6, 0xf6e7b8b33ad9b98d,
			0x4ef95e25f4b4983d, 0x81175195173b92d3, 0x4e50927d8dd15978, 0x1ea2099d1fafae7f, 0x425c8a06fbaaa815, 0xcd4216006c74052a}
	}
	rs.salt = args.Salt
	if rs.salt == 0 {
		seedBytes := make([]byte, 4)
		if _, err := rand.Read(seedBytes); err != nil {
			return nil, err
		}
		rs.salt = binary.BigEndian.Uint32(seedBytes)
	}
	rs.hasher = murmur3.New128WithSeed(rs.salt)
	rs.tmpDir = args.TmpDir
	rs.indexFile = args.IndexFile
	rs.tmpFilePath = args.IndexFile + ".tmp"
	_, fname := filepath.Split(rs.indexFile)
	rs.indexFileName = fname
	rs.baseDataID = args.BaseDataID
	rs.etlBufLimit = args.EtlBufLimit
	if rs.etlBufLimit == 0 {
		rs.etlBufLimit = etl.BufferOptimalSize
	}
	rs.bucketCollector = etl.NewCollector(RecSplitLogPrefix+" "+fname, rs.tmpDir, etl.NewSortableBuffer(rs.etlBufLimit), logger)
	rs.bucketCollector.LogLvl(log.LvlDebug)
	rs.enums = args.Enums
	if args.Enums {
		rs.offsetCollector = etl.NewCollector(RecSplitLogPrefix+" "+fname, rs.tmpDir, etl.NewSortableBuffer(rs.etlBufLimit), logger)
		rs.offsetCollector.LogLvl(log.LvlDebug)
	}
	rs.currentBucket = make([]uint64, 0, args.BucketSize)
	rs.currentBucketOffs = make([]uint64, 0, args.BucketSize)
	rs.maxOffset = 0
	rs.bucketSizeAcc = make([]uint64, 1, bucketCount+1)
	rs.bucketPosAcc = make([]uint64, 1, bucketCount+1)
	if args.LeafSize > MaxLeafSize {
		return nil, fmt.Errorf("exceeded max leaf size %d: %d", MaxLeafSize, args.LeafSize)
	}
	rs.leafSize = args.LeafSize
	rs.primaryAggrBound = rs.leafSize * uint16(math.Max(2, math.Ceil(0.35*float64(rs.leafSize)+1./2.)))
	if rs.leafSize < 7 {
		rs.secondaryAggrBound = rs.primaryAggrBound * 2
	} else {
		rs.secondaryAggrBound = rs.primaryAggrBound * uint16(math.Ceil(0.21*float64(rs.leafSize)+9./10.))
	}
	rs.startSeed = args.StartSeed
	rs.count = make([]uint16, rs.secondaryAggrBound)
	return rs, nil
}
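
// Illustrative sketch (not part of the original file): typical use of the API above -
// construct a RecSplit with NewRecSplit, feed it keys via AddKey, then call Build.
// The exampleBuildIndex name and the key/offset values are hypothetical; offsets are kept
// monotonically growing because Enums is set to true (see the RecSplitArgs comment above).
func exampleBuildIndex(tmpDir string, logger log.Logger) error {
	keys := [][]byte{[]byte("first_key"), []byte("second_key"), []byte("third_key")}
	rs, err := NewRecSplit(RecSplitArgs{
		KeyCount:   len(keys),
		Enums:      true, // two-level index: hash -> enumeration -> offset
		BucketSize: 2000, // within the 100 - 2000 range suggested in the NewRecSplit comment
		LeafSize:   8,
		TmpDir:     tmpDir,
		IndexFile:  filepath.Join(tmpDir, "example.idx"),
		Salt:       0, // 0 makes NewRecSplit draw a random salt
	}, logger)
	if err != nil {
		return err
	}
	defer rs.Close()
	for i, key := range keys {
		if err := rs.AddKey(key, uint64(i)*16); err != nil {
			return err
		}
	}
	// Build may fail with ErrCollision; see the Collision/ResetNextSalt comments below.
	return rs.Build(context.Background())
}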

func (rs *RecSplit) Close() {
	if rs.indexF != nil {
		rs.indexF.Close()
	}
	if rs.bucketCollector != nil {
		rs.bucketCollector.Close()
	}
	if rs.offsetCollector != nil {
		rs.offsetCollector.Close()
	}
}

func (rs *RecSplit) LogLvl(lvl log.Lvl) { rs.lvl = lvl }

func (rs *RecSplit) SetTrace(trace bool) {
	rs.trace = trace
}

// remap converts the number x which is assumed to be uniformly distributed over the range [0..2^64) to the number that is uniformly
// distributed over the range [0..n)
func remap(x uint64, n uint64) uint64 {
	hi, _ := bits.Mul64(x, n)
	return hi
}

const mask48 uint64 = (1 << 48) - 1

// remap16 converts the number x which is assumed to be uniformly distributed over the range [0..2^64) to the number that is uniformly
// distributed over the range [0..n), under the assumption that n is less than 2^16
func remap16(x uint64, n uint16) uint16 {
	return uint16(((x & mask48) * uint64(n)) >> 48)
}

// ResetNextSalt resets the RecSplit and uses the next salt value to try to avoid collisions
// when mapping keys to 64-bit values
func (rs *RecSplit) ResetNextSalt() {
	rs.built = false
	rs.collision = false
	rs.keysAdded = 0
	rs.salt++
	rs.hasher = murmur3.New128WithSeed(rs.salt)
	if rs.bucketCollector != nil {
		rs.bucketCollector.Close()
	}
	rs.bucketCollector = etl.NewCollector(RecSplitLogPrefix+" "+rs.indexFileName, rs.tmpDir, etl.NewSortableBuffer(rs.etlBufLimit), rs.logger)
	if rs.offsetCollector != nil {
		rs.offsetCollector.Close()
		rs.offsetCollector = etl.NewCollector(RecSplitLogPrefix+" "+rs.indexFileName, rs.tmpDir, etl.NewSortableBuffer(rs.etlBufLimit), rs.logger)
	}
	rs.currentBucket = rs.currentBucket[:0]
	rs.currentBucketOffs = rs.currentBucketOffs[:0]
	rs.maxOffset = 0
	rs.bucketSizeAcc = rs.bucketSizeAcc[:1] // First entry is always zero
	rs.bucketPosAcc = rs.bucketPosAcc[:1]   // First entry is always zero
}

func splitParams(m, leafSize, primaryAggrBound, secondaryAggrBound uint16) (fanout, unit uint16) {
	if m > secondaryAggrBound { // High-level aggregation (fanout 2)
		unit = secondaryAggrBound * (((m+1)/2 + secondaryAggrBound - 1) / secondaryAggrBound)
		fanout = 2
	} else if m > primaryAggrBound { // Second-level aggregation
		unit = primaryAggrBound
		fanout = (m + primaryAggrBound - 1) / primaryAggrBound
	} else { // First-level aggregation
		unit = leafSize
		fanout = (m + leafSize - 1) / leafSize
	}
	return
}

func computeGolombRice(m uint16, table []uint32, leafSize, primaryAggrBound, secondaryAggrBound uint16) {
	fanout, unit := splitParams(m, leafSize, primaryAggrBound, secondaryAggrBound)
	k := make([]uint16, fanout)
	k[fanout-1] = m
	for i := uint16(0); i < fanout-1; i++ {
		k[i] = unit
		k[fanout-1] -= k[i]
	}
	sqrtProd := float64(1)
	for i := uint16(0); i < fanout; i++ {
		sqrtProd *= math.Sqrt(float64(k[i]))
	}
	p := math.Sqrt(float64(m)) / (math.Pow(2*math.Pi, (float64(fanout)-1.)/2.0) * sqrtProd)
	golombRiceLength := uint32(math.Ceil(math.Log2(-math.Log((math.Sqrt(5)+1.0)/2.0) / math.Log1p(-p)))) // log2 Golomb modulus
	if golombRiceLength > 0x1F {
		panic("golombRiceLength > 0x1F")
	}
	table[m] = golombRiceLength << 27
	for i := uint16(0); i < fanout; i++ {
		golombRiceLength += table[k[i]] & 0xFFFF
	}
	if golombRiceLength > 0xFFFF {
		panic("golombRiceLength > 0xFFFF")
	}
	table[m] |= golombRiceLength // Sum of Golomb-Rice code lengths in the subtree, stored in the lower 16 bits
	nodes := uint32(1)
	for i := uint16(0); i < fanout; i++ {
		nodes += (table[k[i]] >> 16) & 0x7FF
	}
	if leafSize >= 3 && nodes > 0x7FF {
		panic("rs.leafSize >= 3 && nodes > 0x7FF")
	}
	table[m] |= nodes << 16
}

// golombParam returns the optimal Golomb parameter to use for encoding
// the salt for the part of the hash function separating m elements. It is based on
// calculations under the assumption that we draw hash functions at random
func (rs *RecSplit) golombParam(m uint16) int {
	s := uint16(len(rs.golombRice))
	for m >= s {
		rs.golombRice = append(rs.golombRice, 0)
		// For the case where the bucket is larger than planned
		if s == 0 {
			rs.golombRice[0] = (bijMemo[0] << 27) | bijMemo[0]
		} else if s <= rs.leafSize {
			rs.golombRice[s] = (bijMemo[s] << 27) | (uint32(1) << 16) | bijMemo[s]
		} else {
			computeGolombRice(s, rs.golombRice, rs.leafSize, rs.primaryAggrBound, rs.secondaryAggrBound)
		}
		s++
	}
	return int(rs.golombRice[m] >> 27)
}

// AddKey adds a key to the RecSplit. There can be many more keys than fit in RAM, and RecSplit
// spills data onto disk to accommodate that. The key gets copied by the collector, therefore
// the slice underlying key is not accessed by RecSplit after this invocation.
func (rs *RecSplit) AddKey(key []byte, offset uint64) error {
	if rs.built {
		return fmt.Errorf("cannot add keys after perfect hash function had been built")
	}
	rs.hasher.Reset()
	rs.hasher.Write(key) //nolint:errcheck
	hi, lo := rs.hasher.Sum128()
	binary.BigEndian.PutUint64(rs.bucketKeyBuf[:], remap(hi, rs.bucketCount))
	binary.BigEndian.PutUint64(rs.bucketKeyBuf[8:], lo)
	binary.BigEndian.PutUint64(rs.numBuf[:], offset)
	if offset > rs.maxOffset {
		rs.maxOffset = offset
	}
	if rs.keysAdded > 0 {
		delta := offset - rs.prevOffset
		if rs.keysAdded == 1 || delta < rs.minDelta {
			rs.minDelta = delta
		}
	}

	if rs.enums {
		if err := rs.offsetCollector.Collect(rs.numBuf[:], nil); err != nil {
			return err
		}
		binary.BigEndian.PutUint64(rs.numBuf[:], rs.keysAdded)
		if err := rs.bucketCollector.Collect(rs.bucketKeyBuf[:], rs.numBuf[:]); err != nil {
			return err
		}
	} else {
		if err := rs.bucketCollector.Collect(rs.bucketKeyBuf[:], rs.numBuf[:]); err != nil {
			return err
		}
	}
	rs.keysAdded++
	rs.prevOffset = offset
	return nil
}

func (rs *RecSplit) AddOffset(offset uint64) error {
	if rs.enums {
		binary.BigEndian.PutUint64(rs.numBuf[:], offset)
		if err := rs.offsetCollector.Collect(rs.numBuf[:], nil); err != nil {
			return err
		}
	}
	return nil
}

func (rs *RecSplit) recsplitCurrentBucket() error {
	// Extend rs.bucketSizeAcc to accommodate current bucket index + 1
	for len(rs.bucketSizeAcc) <= int(rs.currentBucketIdx)+1 {
		rs.bucketSizeAcc = append(rs.bucketSizeAcc, rs.bucketSizeAcc[len(rs.bucketSizeAcc)-1])
	}
	rs.bucketSizeAcc[int(rs.currentBucketIdx)+1] += uint64(len(rs.currentBucket))
	// Sets of size 0 and 1 are not further processed, just write them to index
	if len(rs.currentBucket) > 1 {
		for i, key := range rs.currentBucket[1:] {
			if key == rs.currentBucket[i] {
				rs.collision = true
				return fmt.Errorf("%w: %x", ErrCollision, key)
			}
		}
		bitPos := rs.gr.bitCount
		if rs.buffer == nil {
			rs.buffer = make([]uint64, len(rs.currentBucket))
			rs.offsetBuffer = make([]uint64, len(rs.currentBucketOffs))
		} else {
			for len(rs.buffer) < len(rs.currentBucket) {
				rs.buffer = append(rs.buffer, 0)
				rs.offsetBuffer = append(rs.offsetBuffer, 0)
			}
		}
		unary, err := rs.recsplit(0 /* level */, rs.currentBucket, rs.currentBucketOffs, nil /* unary */)
		if err != nil {
			return err
		}
		rs.gr.appendUnaryAll(unary)
		if rs.trace {
			fmt.Printf("recsplitBucket(%d, %d, bitsize = %d)\n", rs.currentBucketIdx, len(rs.currentBucket), rs.gr.bitCount-bitPos)
		}
	} else {
		for _, offset := range rs.currentBucketOffs {
			binary.BigEndian.PutUint64(rs.numBuf[:], offset)
			if _, err := rs.indexW.Write(rs.numBuf[8-rs.bytesPerRec:]); err != nil {
				return err
			}
		}
	}
	// Extend rs.bucketPosAcc to accommodate current bucket index + 1
	for len(rs.bucketPosAcc) <= int(rs.currentBucketIdx)+1 {
		rs.bucketPosAcc = append(rs.bucketPosAcc, rs.bucketPosAcc[len(rs.bucketPosAcc)-1])
	}
	rs.bucketPosAcc[int(rs.currentBucketIdx)+1] = uint64(rs.gr.Bits())
	// Clear for the next bucket
	rs.currentBucket = rs.currentBucket[:0]
	rs.currentBucketOffs = rs.currentBucketOffs[:0]
	return nil
}

// recsplit applies the recSplit algorithm to the given bucket
func (rs *RecSplit) recsplit(level int, bucket []uint64, offsets []uint64, unary []uint64) ([]uint64, error) {
	if rs.trace {
		fmt.Printf("recsplit(%d, %d, %x)\n", level, len(bucket), bucket)
	}
	// Pick initial salt for this level of recursive split
	salt := rs.startSeed[level]
	m := uint16(len(bucket))
	if m <= rs.leafSize {
		// No need to build aggregation levels - just find a bijection
		var mask uint32
		for {
			mask = 0
			var fail bool
			for i := uint16(0); !fail && i < m; i++ {
				bit := uint32(1) << remap16(remix(bucket[i]+salt), m)
				if mask&bit != 0 {
					fail = true
				} else {
					mask |= bit
				}
			}
			if !fail {
				break
			}
			salt++
		}
		for i := uint16(0); i < m; i++ {
			j := remap16(remix(bucket[i]+salt), m)
			rs.offsetBuffer[j] = offsets[i]
		}
		for _, offset := range rs.offsetBuffer[:m] {
			binary.BigEndian.PutUint64(rs.numBuf[:], offset)
			if _, err := rs.indexW.Write(rs.numBuf[8-rs.bytesPerRec:]); err != nil {
				return nil, err
			}
		}
		salt -= rs.startSeed[level]
		log2golomb := rs.golombParam(m)
		if rs.trace {
			fmt.Printf("encode bij %d with log2golomb %d at p = %d\n", salt, log2golomb, rs.gr.bitCount)
		}
		rs.gr.appendFixed(salt, log2golomb)
		unary = append(unary, salt>>log2golomb)
	} else {
		fanout, unit := splitParams(m, rs.leafSize, rs.primaryAggrBound, rs.secondaryAggrBound)
		count := rs.count
		for {
			for i := uint16(0); i < fanout-1; i++ {
				count[i] = 0
			}
			var fail bool
			for i := uint16(0); i < m; i++ {
				count[remap16(remix(bucket[i]+salt), m)/unit]++
			}
			for i := uint16(0); i < fanout-1; i++ {
				fail = fail || (count[i] != unit)
			}
			if !fail {
				break
			}
			salt++
		}
		for i, c := uint16(0), uint16(0); i < fanout; i++ {
			count[i] = c
			c += unit
		}
		for i := uint16(0); i < m; i++ {
			j := remap16(remix(bucket[i]+salt), m) / unit
			rs.buffer[count[j]] = bucket[i]
			rs.offsetBuffer[count[j]] = offsets[i]
			count[j]++
		}
		copy(bucket, rs.buffer)
		copy(offsets, rs.offsetBuffer)
		salt -= rs.startSeed[level]
		log2golomb := rs.golombParam(m)
		if rs.trace {
			fmt.Printf("encode fanout %d: %d with log2golomb %d at p = %d\n", fanout, salt, log2golomb, rs.gr.bitCount)
		}
		rs.gr.appendFixed(salt, log2golomb)
		unary = append(unary, salt>>log2golomb)
		var err error
		var i uint16
		for i = 0; i < m-unit; i += unit {
			if unary, err = rs.recsplit(level+1, bucket[i:i+unit], offsets[i:i+unit], unary); err != nil {
				return nil, err
			}
		}
		if m-i > 1 {
			if unary, err = rs.recsplit(level+1, bucket[i:], offsets[i:], unary); err != nil {
				return nil, err
			}
		} else if m-i == 1 {
			binary.BigEndian.PutUint64(rs.numBuf[:], offsets[i])
			if _, err := rs.indexW.Write(rs.numBuf[8-rs.bytesPerRec:]); err != nil {
				return nil, err
			}
		}
	}
	return unary, nil
}

// loadFuncBucket is required to satisfy the etl.LoadFunc type, to use with collector.Load
func (rs *RecSplit) loadFuncBucket(k, v []byte, _ etl.CurrentTableReader, _ etl.LoadNextFunc) error {
	// k is the BigEndian encoding of the bucket number followed by the key fingerprint,
	// and v is the offset (or enumeration index) assigned to that key
	bucketIdx := binary.BigEndian.Uint64(k)
	if rs.currentBucketIdx != bucketIdx {
		if rs.currentBucketIdx != math.MaxUint64 {
			if err := rs.recsplitCurrentBucket(); err != nil {
				return err
			}
		}
		rs.currentBucketIdx = bucketIdx
	}
	rs.currentBucket = append(rs.currentBucket, binary.BigEndian.Uint64(k[8:]))
	rs.currentBucketOffs = append(rs.currentBucketOffs, binary.BigEndian.Uint64(v))
	return nil
}

func (rs *RecSplit) loadFuncOffset(k, _ []byte, _ etl.CurrentTableReader, _ etl.LoadNextFunc) error {
	offset := binary.BigEndian.Uint64(k)
	rs.offsetEf.AddOffset(offset)
	return nil
}
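
// For readability, a summary of the index file layout that Build below produces, derived from
// the sequence of writes in this file (widths in bytes; this summary is not part of the original source):
//
//	8                      baseDataID
//	8                      number of keys (keysAdded)
//	1                      bytesPerRec - bytes per offset record
//	keysAdded*bytesPerRec  offset records, emitted bucket by bucket
//	8                      bucketCount
//	2                      bucketSize
//	2                      leafSize
//	4                      salt
//	1 + 8*len(startSeed)   length of startSeed, then the seeds themselves
//	1                      enums flag
//	variable               Elias-Fano encoded offsets (only when enums is set)
//	4                      count of Golomb-Rice parameters (uint16 stored in the first two bytes)
//	variable               Golomb-Rice encoded tree of hash-function salts
//	variable               double Elias-Fano for the bucket size/position accumulators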

// Build has to be called after all the keys have been added, and it initiates the process
// of building the perfect hash function and writing the index into a file
func (rs *RecSplit) Build(ctx context.Context) error {
	if rs.built {
		return fmt.Errorf("already built")
	}
	if rs.keysAdded != rs.keyExpectedCount {
		return fmt.Errorf("expected keys %d, got %d", rs.keyExpectedCount, rs.keysAdded)
	}
	var err error
	if rs.indexF, err = os.Create(rs.tmpFilePath); err != nil {
		return fmt.Errorf("create index file %s: %w", rs.indexFile, err)
	}
	defer rs.indexF.Close()
	rs.indexW = bufio.NewWriterSize(rs.indexF, etl.BufIOSize)
	// Write minimal app-specific dataID in this index file
	binary.BigEndian.PutUint64(rs.numBuf[:], rs.baseDataID)
	if _, err = rs.indexW.Write(rs.numBuf[:]); err != nil {
		return fmt.Errorf("write baseDataID: %w", err)
	}

	// Write number of keys
	binary.BigEndian.PutUint64(rs.numBuf[:], rs.keysAdded)
	if _, err = rs.indexW.Write(rs.numBuf[:]); err != nil {
		return fmt.Errorf("write number of keys: %w", err)
	}
	// Write number of bytes per index record
	rs.bytesPerRec = common.BitLenToByteLen(bits.Len64(rs.maxOffset))
	if err = rs.indexW.WriteByte(byte(rs.bytesPerRec)); err != nil {
		return fmt.Errorf("write bytes per record: %w", err)
	}

	rs.currentBucketIdx = math.MaxUint64 // To make sure 0 bucket is detected
	defer rs.bucketCollector.Close()
	if rs.lvl < log.LvlTrace {
		log.Log(rs.lvl, "[index] calculating", "file", rs.indexFileName)
	}
	if err := rs.bucketCollector.Load(nil, "", rs.loadFuncBucket, etl.TransformArgs{Quit: ctx.Done()}); err != nil {
		return err
	}
	if len(rs.currentBucket) > 0 {
		if err := rs.recsplitCurrentBucket(); err != nil {
			return err
		}
	}

	if assert.Enable {
		rs.indexW.Flush()
		rs.indexF.Seek(0, 0)
		b, _ := io.ReadAll(rs.indexF)
		if len(b) != 9+int(rs.keysAdded)*rs.bytesPerRec {
			panic(fmt.Errorf("expected: %d, got: %d; rs.keysAdded=%d, rs.bytesPerRec=%d, %s",
				9+int(rs.keysAdded)*rs.bytesPerRec, len(b), rs.keysAdded, rs.bytesPerRec, rs.indexFile))
		}
	}
	if rs.lvl < log.LvlTrace {
		log.Log(rs.lvl, "[index] write", "file", rs.indexFileName)
	}
	if rs.enums {
		rs.offsetEf = eliasfano32.NewEliasFano(rs.keysAdded, rs.maxOffset)
		defer rs.offsetCollector.Close()
		if err := rs.offsetCollector.Load(nil, "", rs.loadFuncOffset, etl.TransformArgs{}); err != nil {
			return err
		}
		rs.offsetEf.Build()
	}
	rs.gr.appendFixed(1, 1) // Sentinel (avoids checking for parts of size 1)
	// Construct Elias Fano index
	rs.ef.Build(rs.bucketSizeAcc, rs.bucketPosAcc)
	rs.built = true

	// Write out bucket count, bucketSize, leafSize
	binary.BigEndian.PutUint64(rs.numBuf[:], rs.bucketCount)
	if _, err := rs.indexW.Write(rs.numBuf[:8]); err != nil {
		return fmt.Errorf("writing bucketCount: %w", err)
	}
	binary.BigEndian.PutUint16(rs.numBuf[:], uint16(rs.bucketSize))
	if _, err := rs.indexW.Write(rs.numBuf[:2]); err != nil {
		return fmt.Errorf("writing bucketSize: %w", err)
	}
	binary.BigEndian.PutUint16(rs.numBuf[:], rs.leafSize)
	if _, err := rs.indexW.Write(rs.numBuf[:2]); err != nil {
		return fmt.Errorf("writing leafSize: %w", err)
	}
	// Write out salt
	binary.BigEndian.PutUint32(rs.numBuf[:], rs.salt)
	if _, err := rs.indexW.Write(rs.numBuf[:4]); err != nil {
		return fmt.Errorf("writing salt: %w", err)
	}
	// Write out start seeds
	if err := rs.indexW.WriteByte(byte(len(rs.startSeed))); err != nil {
		return fmt.Errorf("writing len of start seeds: %w", err)
	}
	for _, s := range rs.startSeed {
		binary.BigEndian.PutUint64(rs.numBuf[:], s)
		if _, err := rs.indexW.Write(rs.numBuf[:8]); err != nil {
			return fmt.Errorf("writing start seed: %w", err)
		}
	}

	if rs.enums {
		if err := rs.indexW.WriteByte(1); err != nil {
			return fmt.Errorf("writing enums = true: %w", err)
		}
	} else {
		if err := rs.indexW.WriteByte(0); err != nil {
			return fmt.Errorf("writing enums = false: %w", err)
		}
	}
	if rs.enums {
		// Write out elias fano for offsets
		if err := rs.offsetEf.Write(rs.indexW); err != nil {
			return fmt.Errorf("writing elias fano for offsets: %w", err)
		}
	}
	// Write out the size of golomb rice params
	binary.BigEndian.PutUint16(rs.numBuf[:], uint16(len(rs.golombRice)))
	if _, err := rs.indexW.Write(rs.numBuf[:4]); err != nil {
		return fmt.Errorf("writing golomb rice param size: %w", err)
	}
	// Write out golomb rice
	if err := rs.gr.Write(rs.indexW); err != nil {
		return fmt.Errorf("writing golomb rice: %w", err)
	}
	// Write out elias fano
	if err := rs.ef.Write(rs.indexW); err != nil {
		return fmt.Errorf("writing elias fano: %w", err)
	}

	if err = rs.indexW.Flush(); err != nil {
		return err
	}
	if err = rs.fsync(); err != nil {
		return err
	}
	if err = rs.indexF.Close(); err != nil {
		return err
	}
	if err = os.Rename(rs.tmpFilePath, rs.indexFile); err != nil {
		return err
	}
	return nil
}

func (rs *RecSplit) DisableFsync() { rs.noFsync = true }

// fsync - other processes/goroutines must see only "fully-complete" (valid) files. No partial writes.
// To achieve it: write to a .tmp file, then `rename` when the file is ready.
// The machine may power off right after `rename`, which means `fsync` must happen before `rename`
func (rs *RecSplit) fsync() error {
	if rs.noFsync {
		return nil
	}
	if err := rs.indexF.Sync(); err != nil {
		rs.logger.Warn("couldn't fsync", "err", err, "file", rs.tmpFilePath)
		return err
	}
	return nil
}

// Stats returns the size of the golomb rice encoding and the elias fano encoding
func (rs *RecSplit) Stats() (int, int) {
	return len(rs.gr.Data()), len(rs.ef.Data())
}

// Collision returns true if there was a collision detected during mapping of keys
// into 64-bit values
// RecSplit needs to be reset, re-populated with keys, and rebuilt
func (rs *RecSplit) Collision() bool {
	return rs.collision
}
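
// Illustrative sketch (not part of the original file): the retry loop implied by the
// Collision and ResetNextSalt comments above. buildWithRetry and addAll are hypothetical
// names; addAll is expected to re-add every key via AddKey on each attempt.
func buildWithRetry(ctx context.Context, rs *RecSplit, addAll func(*RecSplit) error) error {
	for {
		if err := addAll(rs); err != nil {
			return err
		}
		err := rs.Build(ctx)
		if err == nil {
			return nil
		}
		if rs.Collision() {
			// Bump the salt and clear accumulated state, then re-populate and rebuild
			rs.ResetNextSalt()
			continue
		}
		return err
	}
}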