github.com/ledgerwatch/erigon-lib@v1.0.0/recsplit/eliasfano32/elias_fano.go (about) 1 /* 2 Copyright 2022 Erigon contributors 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package eliasfano32 18 19 import ( 20 "encoding/binary" 21 "fmt" 22 "io" 23 "math" 24 "math/bits" 25 "sort" 26 "unsafe" 27 28 "github.com/ledgerwatch/erigon-lib/common/bitutil" 29 "github.com/ledgerwatch/erigon-lib/kv/iter" 30 ) 31 32 // EliasFano algo overview https://www.antoniomallia.it/sorted-integers-compression-with-elias-fano-encoding.html 33 // P. Elias. Efficient storage and retrieval by content and address of static files. J. ACM, 21(2):246–260, 1974. 34 // Partitioned Elias-Fano Indexes http://groups.di.unipi.it/~ottavian/files/elias_fano_sigir14.pdf 35 36 const ( 37 log2q uint64 = 8 38 q uint64 = 1 << log2q 39 qMask uint64 = q - 1 40 superQ uint64 = 1 << 14 41 superQMask uint64 = superQ - 1 42 qPerSuperQ uint64 = superQ / q // 64 43 superQSize uint64 = 1 + qPerSuperQ/2 // 1 + 64/2 = 33 44 ) 45 46 // EliasFano can be used to encode one monotone sequence 47 type EliasFano struct { 48 data []uint64 49 lowerBits []uint64 50 upperBits []uint64 51 jump []uint64 52 lowerBitsMask uint64 53 count uint64 54 u uint64 55 l uint64 56 maxOffset uint64 57 i uint64 58 wordsUpperBits int 59 } 60 61 func NewEliasFano(count uint64, maxOffset uint64) *EliasFano { 62 if count == 0 { 63 panic(fmt.Sprintf("too small count: %d", count)) 64 } 65 //fmt.Printf("count=%d,maxOffset=%d,minDelta=%d\n", count, maxOffset, minDelta) 66 ef := &EliasFano{ 67 count: count - 1, 68 maxOffset: maxOffset, 69 } 70 ef.u = maxOffset + 1 71 ef.wordsUpperBits = ef.deriveFields() 72 return ef 73 } 74 75 func (ef *EliasFano) AddOffset(offset uint64) { 76 //fmt.Printf("0x%x,\n", offset) 77 if ef.l != 0 { 78 setBits(ef.lowerBits, ef.i*ef.l, int(ef.l), offset&ef.lowerBitsMask) 79 } 80 //pos := ((offset - ef.delta) >> ef.l) + ef.i 81 set(ef.upperBits, (offset>>ef.l)+ef.i) 82 //fmt.Printf("add:%x, pos=%x, set=%x, res=%x\n", offset, pos, pos/64, uint64(1)<<(pos%64)) 83 ef.i++ 84 } 85 86 func (ef *EliasFano) jumpSizeWords() int { 87 size := ((ef.count + 1) / superQ) * superQSize // Whole blocks 88 if (ef.count+1)%superQ != 0 { 89 size += 1 + (((ef.count+1)%superQ+q-1)/q+3)/2 // Partial block 90 } 91 return int(size) 92 } 93 94 func (ef *EliasFano) deriveFields() int { 95 if ef.u/(ef.count+1) == 0 { 96 ef.l = 0 97 } else { 98 ef.l = 63 ^ uint64(bits.LeadingZeros64(ef.u/(ef.count+1))) // pos of first non-zero bit 99 } 100 ef.lowerBitsMask = (uint64(1) << ef.l) - 1 101 wordsLowerBits := int(((ef.count+1)*ef.l+63)/64 + 1) 102 wordsUpperBits := int((ef.count + 1 + (ef.u >> ef.l) + 63) / 64) 103 jumpWords := ef.jumpSizeWords() 104 totalWords := wordsLowerBits + wordsUpperBits + jumpWords 105 //fmt.Printf("EF: %d, %d,%d,%d\n", totalWords, wordsLowerBits, wordsUpperBits, jumpWords) 106 if ef.data == nil { 107 ef.data = make([]uint64, totalWords) 108 } else { 109 ef.data = ef.data[:totalWords] 110 } 111 112 ef.lowerBits = ef.data[:wordsLowerBits] 113 ef.upperBits = ef.data[wordsLowerBits : wordsLowerBits+wordsUpperBits] 114 ef.jump = ef.data[wordsLowerBits+wordsUpperBits:] 115 return wordsUpperBits 116 } 117 118 // Build construct Elias Fano index for a given sequences 119 func (ef *EliasFano) Build() { 120 for i, c, lastSuperQ := uint64(0), uint64(0), uint64(0); i < uint64(ef.wordsUpperBits); i++ { 121 for b := uint64(0); b < 64; b++ { 122 if ef.upperBits[i]&(uint64(1)<<b) != 0 { 123 if (c & superQMask) == 0 { 124 // When c is multiple of 2^14 (4096) 125 lastSuperQ = i*64 + b 126 ef.jump[(c/superQ)*superQSize] = lastSuperQ 127 } 128 if (c & qMask) != 0 { 129 c++ 130 continue 131 } 132 // When c is multiple of 2^8 (256) 133 var offset = i*64 + b - lastSuperQ // offset can be either 0, 256, 512, 768, ..., up to 4096-256 134 // offset needs to be encoded as 16-bit integer, therefore the following check 135 if offset >= (1 << 32) { 136 fmt.Printf("ef.l=%x,ef.u=%x\n", ef.l, ef.u) 137 fmt.Printf("offset=%x,lastSuperQ=%x,i=%x,b=%x,c=%x\n", offset, lastSuperQ, i, b, c) 138 panic("") 139 } 140 // c % superQ is the bit index inside the group of 4096 bits 141 jumpSuperQ := (c / superQ) * superQSize 142 jumpInsideSuperQ := (c % superQ) / q 143 idx64, shift := jumpSuperQ+1+(jumpInsideSuperQ>>1), 32*(jumpInsideSuperQ%2) 144 mask := uint64(0xffffffff) << shift 145 ef.jump[idx64] = (ef.jump[idx64] &^ mask) | (offset << shift) 146 c++ 147 } 148 } 149 } 150 } 151 152 func (ef *EliasFano) get(i uint64) (val uint64, window uint64, sel int, currWord uint64, lower uint64) { 153 lower = i * ef.l 154 idx64, shift := lower/64, lower%64 155 lower = ef.lowerBits[idx64] >> shift 156 if shift > 0 { 157 lower |= ef.lowerBits[idx64+1] << (64 - shift) 158 } 159 160 jumpSuperQ := (i / superQ) * superQSize 161 jumpInsideSuperQ := (i % superQ) / q 162 idx64, shift = jumpSuperQ+1+(jumpInsideSuperQ>>1), 32*(jumpInsideSuperQ%2) 163 mask := uint64(0xffffffff) << shift 164 jump := ef.jump[jumpSuperQ] + (ef.jump[idx64]&mask)>>shift 165 166 currWord = jump / 64 167 window = ef.upperBits[currWord] & (uint64(0xffffffffffffffff) << (jump % 64)) 168 d := int(i & qMask) 169 170 for bitCount := bits.OnesCount64(window); bitCount <= d; bitCount = bits.OnesCount64(window) { 171 currWord++ 172 window = ef.upperBits[currWord] 173 d -= bitCount 174 } 175 176 sel = bitutil.Select64(window, d) 177 val = (currWord*64+uint64(sel)-i)<<ef.l | (lower & ef.lowerBitsMask) 178 179 return 180 } 181 182 func (ef *EliasFano) Get(i uint64) uint64 { 183 val, _, _, _, _ := ef.get(i) 184 return val 185 } 186 187 func (ef *EliasFano) Get2(i uint64) (val uint64, valNext uint64) { 188 var window uint64 189 var sel int 190 var currWord uint64 191 var lower uint64 192 val, window, sel, currWord, lower = ef.get(i) 193 window &= (uint64(0xffffffffffffffff) << sel) << 1 194 for window == 0 { 195 currWord++ 196 window = ef.upperBits[currWord] 197 } 198 199 lower >>= ef.l 200 valNext = (currWord*64+uint64(bits.TrailingZeros64(window))-i-1)<<ef.l | (lower & ef.lowerBitsMask) 201 return 202 } 203 204 func (ef *EliasFano) upper(i uint64) uint64 { 205 jumpSuperQ := (i / superQ) * superQSize 206 jumpInsideSuperQ := (i % superQ) / q 207 idx64, shift := jumpSuperQ+1+(jumpInsideSuperQ>>1), 32*(jumpInsideSuperQ%2) 208 mask := uint64(0xffffffff) << shift 209 jump := ef.jump[jumpSuperQ] + (ef.jump[idx64]&mask)>>shift 210 currWord := jump / 64 211 window := ef.upperBits[currWord] & (uint64(0xffffffffffffffff) << (jump % 64)) 212 d := int(i & qMask) 213 214 for bitCount := bits.OnesCount64(window); bitCount <= d; bitCount = bits.OnesCount64(window) { 215 currWord++ 216 window = ef.upperBits[currWord] 217 d -= bitCount 218 } 219 220 sel := bitutil.Select64(window, d) 221 return currWord*64 + uint64(sel) - i 222 } 223 224 // Search returns the value in the sequence, equal or greater than given value 225 func (ef *EliasFano) search(v uint64) (nextV uint64, nextI uint64, ok bool) { 226 if v == 0 { 227 return ef.Min(), 0, true 228 } 229 if v == ef.Max() { 230 return ef.Max(), ef.count, true 231 } 232 if v > ef.Max() { 233 return 0, 0, false 234 } 235 236 hi := v >> ef.l 237 i := sort.Search(int(ef.count+1), func(i int) bool { 238 return ef.upper(uint64(i)) >= hi 239 }) 240 for j := uint64(i); j <= ef.count; j++ { 241 val, _, _, _, _ := ef.get(j) 242 if val >= v { 243 return val, j, true 244 } 245 } 246 return 0, 0, false 247 } 248 249 func (ef *EliasFano) Search(v uint64) (uint64, bool) { 250 n, _, ok := ef.search(v) 251 return n, ok 252 } 253 254 func (ef *EliasFano) Max() uint64 { 255 return ef.maxOffset 256 } 257 258 func (ef *EliasFano) Min() uint64 { 259 return ef.Get(0) 260 } 261 262 func (ef *EliasFano) Count() uint64 { 263 return ef.count + 1 264 } 265 266 func (ef *EliasFano) Iterator() *EliasFanoIter { 267 return &EliasFanoIter{ef: ef, upperMask: 1, upperStep: uint64(1) << ef.l, lowerBits: ef.lowerBits, upperBits: ef.upperBits, count: ef.count, l: ef.l, lowerBitsMask: ef.lowerBitsMask} 268 } 269 func (ef *EliasFano) ReverseIterator() *iter.ArrStream[uint64] { 270 //TODO: this is very un-optimal, need implement proper reverse-iterator 271 it := ef.Iterator() 272 var values []uint64 273 for it.HasNext() { 274 v, err := it.Next() 275 if err != nil { 276 panic(err) 277 } 278 values = append(values, v) 279 } 280 return iter.ReverseArray[uint64](values) 281 } 282 283 type EliasFanoIter struct { 284 ef *EliasFano 285 lowerBits []uint64 286 upperBits []uint64 287 288 //constants 289 count uint64 290 lowerBitsMask uint64 291 l uint64 292 upperStep uint64 293 294 //fields of current value 295 upper uint64 296 upperIdx uint64 297 298 //fields of next value 299 idx uint64 300 lowerIdx uint64 301 upperMask uint64 302 } 303 304 func (efi *EliasFanoIter) HasNext() bool { 305 return efi.idx <= efi.count 306 } 307 308 func (efi *EliasFanoIter) Reset() { 309 efi.upperMask = 1 310 efi.upperStep = uint64(1) << efi.l 311 efi.upperIdx = 0 312 313 efi.upper = 0 314 efi.lowerIdx = 0 315 efi.idx = 0 316 } 317 318 func (efi *EliasFanoIter) SeekDeprecated(n uint64) { 319 efi.Reset() 320 _, i, ok := efi.ef.search(n) 321 if !ok { 322 efi.idx = efi.count + 1 323 return 324 } 325 for j := uint64(0); j < i; j++ { 326 efi.increment() 327 } 328 //fmt.Printf("seek: efi.upperMask(%d)=%d, upperIdx=%d, lowerIdx=%d, idx=%d\n", n, bits.TrailingZeros64(efi.upperMask), efi.upperIdx, efi.lowerIdx, efi.idx) 329 //fmt.Printf("seek: efi.upper=%d\n", efi.upper) 330 } 331 332 func (efi *EliasFanoIter) Seek(n uint64) { 333 //fmt.Printf("b seek2: efi.upperMask(%d)=%d, upperIdx=%d, lowerIdx=%d, idx=%d\n", n, bits.TrailingZeros64(efi.upperMask), efi.upperIdx, efi.lowerIdx, efi.idx) 334 //fmt.Printf("b seek2: efi.upper=%d\n", efi.upper) 335 efi.Reset() 336 nn, nextI, ok := efi.ef.search(n) 337 _ = nn 338 if !ok { 339 efi.idx = efi.count + 1 340 return 341 } 342 if nextI == 0 { 343 return 344 } 345 346 // fields of current value 347 v, _, sel, currWords, lower := efi.ef.get(nextI - 1) //TODO: search can return same info 348 efi.upper = v &^ (lower & efi.ef.lowerBitsMask) 349 efi.upperIdx = currWords 350 351 // fields of next value 352 efi.lowerIdx = nextI * efi.l 353 efi.idx = nextI 354 efi.upperMask = 1 << (sel + 1) 355 356 //fmt.Printf("seek2: efi.upperMask(%d)=%d, upperIdx=%d, lowerIdx=%d, idx=%d\n", n, bits.TrailingZeros64(efi.upperMask), efi.upperIdx, efi.lowerIdx, efi.idx) 357 //fmt.Printf("seek2: efi.upper=%d\n", efi.upper) 358 } 359 360 func (efi *EliasFanoIter) increment() { 361 if efi.upperMask == 0 { 362 efi.upperIdx++ 363 efi.upperMask = 1 364 } 365 for efi.upperBits[efi.upperIdx]&efi.upperMask == 0 { 366 efi.upper += efi.upperStep 367 efi.upperMask <<= 1 368 if efi.upperMask == 0 { 369 efi.upperIdx++ 370 efi.upperMask = 1 371 } 372 } 373 efi.upperMask <<= 1 374 efi.lowerIdx += efi.l 375 efi.idx++ 376 } 377 378 func (efi *EliasFanoIter) Next() (uint64, error) { 379 idx64, shift := efi.lowerIdx/64, efi.lowerIdx%64 380 lower := efi.lowerBits[idx64] >> shift 381 if shift > 0 { 382 lower |= efi.lowerBits[idx64+1] << (64 - shift) 383 } 384 efi.increment() 385 return efi.upper | (lower & efi.lowerBitsMask), nil 386 } 387 388 // Write outputs the state of golomb rice encoding into a writer, which can be recovered later by Read 389 func (ef *EliasFano) Write(w io.Writer) error { 390 var numBuf [8]byte 391 binary.BigEndian.PutUint64(numBuf[:], ef.count) 392 if _, e := w.Write(numBuf[:]); e != nil { 393 return e 394 } 395 binary.BigEndian.PutUint64(numBuf[:], ef.u) 396 if _, e := w.Write(numBuf[:]); e != nil { 397 return e 398 } 399 b := unsafe.Slice((*byte)(unsafe.Pointer(&ef.data[0])), len(ef.data)*uint64Size) 400 if _, e := w.Write(b); e != nil { 401 return e 402 } 403 return nil 404 } 405 406 // Write outputs the state of golomb rice encoding into a writer, which can be recovered later by Read 407 func (ef *EliasFano) AppendBytes(buf []byte) []byte { 408 var numBuf [8]byte 409 binary.BigEndian.PutUint64(numBuf[:], ef.count) 410 buf = append(buf, numBuf[:]...) 411 binary.BigEndian.PutUint64(numBuf[:], ef.u) 412 buf = append(buf, numBuf[:]...) 413 b := unsafe.Slice((*byte)(unsafe.Pointer(&ef.data[0])), len(ef.data)*uint64Size) 414 buf = append(buf, b...) 415 return buf 416 } 417 418 // Read inputs the state of golomb rice encoding from a reader s 419 func ReadEliasFano(r []byte) (*EliasFano, int) { 420 ef := &EliasFano{ 421 count: binary.BigEndian.Uint64(r[:8]), 422 u: binary.BigEndian.Uint64(r[8:16]), 423 data: unsafe.Slice((*uint64)(unsafe.Pointer(&r[16])), (len(r)-16)/uint64Size), 424 } 425 ef.maxOffset = ef.u - 1 426 ef.deriveFields() 427 return ef, 16 + 8*len(ef.data) 428 } 429 430 // Reset - like ReadEliasFano, but for existing object 431 func (ef *EliasFano) Reset(r []byte) { 432 ef.count = binary.BigEndian.Uint64(r[:8]) 433 ef.u = binary.BigEndian.Uint64(r[8:16]) 434 ef.data = unsafe.Slice((*uint64)(unsafe.Pointer(&r[16])), (len(r)-16)/uint64Size) 435 ef.maxOffset = ef.u - 1 436 ef.deriveFields() 437 } 438 439 func Max(r []byte) uint64 { return binary.BigEndian.Uint64(r[8:16]) - 1 } 440 func Count(r []byte) uint64 { return binary.BigEndian.Uint64(r[:8]) + 1 } 441 442 const uint64Size = 8 443 444 func Min(r []byte) uint64 { 445 count := binary.BigEndian.Uint64(r[:8]) 446 u := binary.BigEndian.Uint64(r[8:16]) 447 p := unsafe.Slice((*uint64)(unsafe.Pointer(&r[16])), (len(r)-16)/uint64Size) 448 var l uint64 449 if u/(count+1) == 0 { 450 l = 0 451 } else { 452 l = 63 ^ uint64(bits.LeadingZeros64(u/(count+1))) // pos of first non-zero bit 453 } 454 wordsLowerBits := int(((count+1)*l+63)/64 + 1) 455 wordsUpperBits := int((count + 1 + (u >> l) + 63) / 64) 456 lowerBits := p[:wordsLowerBits] 457 upperBits := p[wordsLowerBits : wordsLowerBits+wordsUpperBits] 458 jump := p[wordsLowerBits+wordsUpperBits:] 459 lower := lowerBits[0] 460 461 mask := uint64(0xffffffff) 462 j := jump[0] + jump[1]&mask 463 currWord := j / 64 464 window := upperBits[currWord] & (uint64(0xffffffffffffffff) << (j % 64)) 465 466 if bitCount := bits.OnesCount64(window); bitCount <= 0 { 467 currWord++ 468 window = upperBits[currWord] 469 } 470 sel := bitutil.Select64(window, 0) 471 lowerBitsMask := (uint64(1) << l) - 1 472 val := (currWord*64+uint64(sel))<<l | (lower & lowerBitsMask) 473 return val 474 } 475 476 // DoubleEliasFano can be used to encode two monotone sequences 477 // it is called "double" because the lower bits array contains two sequences interleaved 478 type DoubleEliasFano struct { 479 data []uint64 480 lowerBits []uint64 481 upperBitsPosition []uint64 482 upperBitsCumKeys []uint64 483 jump []uint64 484 lowerBitsMaskCumKeys uint64 485 lowerBitsMaskPosition uint64 486 numBuckets uint64 487 uCumKeys uint64 488 uPosition uint64 489 lPosition uint64 490 lCumKeys uint64 491 cumKeysMinDelta uint64 492 posMinDelta uint64 493 } 494 495 func (ef *DoubleEliasFano) deriveFields() (int, int) { 496 if ef.uPosition/(ef.numBuckets+1) == 0 { 497 ef.lPosition = 0 498 } else { 499 ef.lPosition = 63 ^ uint64(bits.LeadingZeros64(ef.uPosition/(ef.numBuckets+1))) 500 } 501 if ef.uCumKeys/(ef.numBuckets+1) == 0 { 502 ef.lCumKeys = 0 503 } else { 504 ef.lCumKeys = 63 ^ uint64(bits.LeadingZeros64(ef.uCumKeys/(ef.numBuckets+1))) 505 } 506 //fmt.Printf("uPosition = %d, lPosition = %d, uCumKeys = %d, lCumKeys = %d\n", ef.uPosition, ef.lPosition, ef.uCumKeys, ef.lCumKeys) 507 if ef.lCumKeys*2+ef.lPosition > 56 { 508 panic(fmt.Sprintf("ef.lCumKeys (%d) * 2 + ef.lPosition (%d) > 56", ef.lCumKeys, ef.lPosition)) 509 } 510 ef.lowerBitsMaskCumKeys = (uint64(1) << ef.lCumKeys) - 1 511 ef.lowerBitsMaskPosition = (uint64(1) << ef.lPosition) - 1 512 wordsLowerBits := int(((ef.numBuckets+1)*(ef.lCumKeys+ef.lPosition)+63)/64 + 1) 513 wordsCumKeys := int((ef.numBuckets + 1 + (ef.uCumKeys >> ef.lCumKeys) + 63) / 64) 514 wordsPosition := int((ef.numBuckets + 1 + (ef.uPosition >> ef.lPosition) + 63) / 64) 515 jumpWords := ef.jumpSizeWords() 516 totalWords := wordsLowerBits + wordsCumKeys + wordsPosition + jumpWords 517 if ef.data == nil { 518 ef.data = make([]uint64, totalWords) 519 } else { 520 ef.data = ef.data[:totalWords] 521 } 522 ef.lowerBits = ef.data[:wordsLowerBits] 523 ef.upperBitsCumKeys = ef.data[wordsLowerBits : wordsLowerBits+wordsCumKeys] 524 ef.upperBitsPosition = ef.data[wordsLowerBits+wordsCumKeys : wordsLowerBits+wordsCumKeys+wordsPosition] 525 ef.jump = ef.data[wordsLowerBits+wordsCumKeys+wordsPosition:] 526 return wordsCumKeys, wordsPosition 527 } 528 529 // Build construct double Elias Fano index for two given sequences 530 func (ef *DoubleEliasFano) Build(cumKeys []uint64, position []uint64) { 531 //fmt.Printf("cumKeys = %d\nposition = %d\n", cumKeys, position) 532 if len(cumKeys) != len(position) { 533 panic("len(cumKeys) != len(position)") 534 } 535 ef.numBuckets = uint64(len(cumKeys) - 1) 536 ef.posMinDelta = math.MaxUint64 537 ef.cumKeysMinDelta = math.MaxUint64 538 for i := uint64(1); i <= ef.numBuckets; i++ { 539 if cumKeys[i] < cumKeys[i-1] { 540 panic("cumKeys[i] <= cumKeys[i-1]") 541 } 542 nkeysDelta := cumKeys[i] - cumKeys[i-1] 543 if nkeysDelta < ef.cumKeysMinDelta { 544 ef.cumKeysMinDelta = nkeysDelta 545 } 546 if position[i] < position[i-1] { 547 panic("position[i] < position[i-1]") 548 } 549 bucketBits := position[i] - position[i-1] 550 if bucketBits < ef.posMinDelta { 551 ef.posMinDelta = bucketBits 552 } 553 } 554 //fmt.Printf("cumKeysMinDelta = %d, posMinDelta = %d\n", ef.cumKeysMinDelta, ef.posMinDelta) 555 ef.uPosition = position[ef.numBuckets] - ef.numBuckets*ef.posMinDelta + 1 556 ef.uCumKeys = cumKeys[ef.numBuckets] - ef.numBuckets*ef.cumKeysMinDelta + 1 // Largest possible encoding of the cumKeys 557 wordsCumKeys, wordsPosition := ef.deriveFields() 558 559 for i, cumDelta, bitDelta := uint64(0), uint64(0), uint64(0); i <= ef.numBuckets; i, cumDelta, bitDelta = i+1, cumDelta+ef.cumKeysMinDelta, bitDelta+ef.posMinDelta { 560 if ef.lCumKeys != 0 { 561 //fmt.Printf("i=%d, set_bits cum for %d = %b\n", i, cumKeys[i]-cumDelta, (cumKeys[i]-cumDelta)&ef.lowerBitsMaskCumKeys) 562 setBits(ef.lowerBits, i*(ef.lCumKeys+ef.lPosition), int(ef.lCumKeys), (cumKeys[i]-cumDelta)&ef.lowerBitsMaskCumKeys) 563 //fmt.Printf("loweBits %b\n", ef.lowerBits) 564 } 565 set(ef.upperBitsCumKeys, ((cumKeys[i]-cumDelta)>>ef.lCumKeys)+i) 566 //fmt.Printf("i=%d, set cum for %d = %d\n", i, cumKeys[i]-cumDelta, (cumKeys[i]-cumDelta)>>ef.lCumKeys+i) 567 568 if ef.lPosition != 0 { 569 //fmt.Printf("i=%d, set_bits pos for %d = %b\n", i, position[i]-bitDelta, (position[i]-bitDelta)&ef.lowerBitsMaskPosition) 570 setBits(ef.lowerBits, i*(ef.lCumKeys+ef.lPosition)+ef.lCumKeys, int(ef.lPosition), (position[i]-bitDelta)&ef.lowerBitsMaskPosition) 571 //fmt.Printf("lowerBits %b\n", ef.lowerBits) 572 } 573 set(ef.upperBitsPosition, ((position[i]-bitDelta)>>ef.lPosition)+i) 574 //fmt.Printf("i=%d, set pos for %d = %d\n", i, position[i]-bitDelta, (position[i]-bitDelta)>>ef.lPosition+i) 575 } 576 //fmt.Printf("loweBits %b\n", ef.lowerBits) 577 //fmt.Printf("upperBitsCumKeys %b\n", ef.upperBitsCumKeys) 578 //fmt.Printf("upperBitsPosition %b\n", ef.upperBitsPosition) 579 // i iterates over the 64-bit words in the wordCumKeys vector 580 // c iterates over bits in the wordCumKeys 581 // lastSuperQ is the largest multiple of 2^14 (4096) which is no larger than c 582 // c/superQ is the index of the current 4096 block of bits 583 // superQSize is how many words is required to encode one block of 4096 bits. It is 17 words which is 1088 bits 584 for i, c, lastSuperQ := uint64(0), uint64(0), uint64(0); i < uint64(wordsCumKeys); i++ { 585 for b := uint64(0); b < 64; b++ { 586 if ef.upperBitsCumKeys[i]&(uint64(1)<<b) != 0 { 587 if (c & superQMask) == 0 { 588 // When c is multiple of 2^14 (4096) 589 lastSuperQ = i*64 + b 590 ef.jump[(c/superQ)*(superQSize*2)] = lastSuperQ 591 } 592 if (c & qMask) == 0 { 593 // When c is multiple of 2^8 (256) 594 var offset = i*64 + b - lastSuperQ // offset can be either 0, 256, 512, 768, ..., up to 4096-256 595 // offset needs to be encoded as 16-bit integer, therefore the following check 596 if offset >= (1 << 32) { 597 panic("") 598 } 599 // c % superQ is the bit index inside the group of 4096 bits 600 jumpSuperQ := (c / superQ) * (superQSize * 2) 601 jumpInsideSuperQ := 2 * (c % superQ) / q 602 idx64 := jumpSuperQ + 2 + (jumpInsideSuperQ >> 1) 603 shift := 32 * (jumpInsideSuperQ % 2) 604 mask := uint64(0xffffffff) << shift 605 ef.jump[idx64] = (ef.jump[idx64] &^ mask) | (offset << shift) 606 } 607 c++ 608 } 609 } 610 } 611 612 for i, c, lastSuperQ := uint64(0), uint64(0), uint64(0); i < uint64(wordsPosition); i++ { 613 for b := uint64(0); b < 64; b++ { 614 if ef.upperBitsPosition[i]&(uint64(1)<<b) != 0 { 615 if (c & superQMask) == 0 { 616 lastSuperQ = i*64 + b 617 ef.jump[(c/superQ)*(superQSize*2)+1] = lastSuperQ 618 } 619 if (c & qMask) == 0 { 620 var offset = i*64 + b - lastSuperQ 621 if offset >= (1 << 32) { 622 panic("") 623 } 624 jumpSuperQ := (c / superQ) * (superQSize * 2) 625 jumpInsideSuperQ := 2*((c%superQ)/q) + 1 626 idx64 := jumpSuperQ + 2 + (jumpInsideSuperQ >> 1) 627 shift := 32 * (jumpInsideSuperQ % 2) 628 mask := uint64(0xffffffff) << shift 629 ef.jump[idx64] = (ef.jump[idx64] &^ mask) | (offset << shift) 630 } 631 c++ 632 } 633 } 634 } 635 //fmt.Printf("jump: %x\n", ef.jump) 636 } 637 638 // setBits assumes that bits are set in monotonic order, so that 639 // we can skip the masking for the second word 640 func setBits(bits []uint64, start uint64, width int, value uint64) { 641 idx64, shift := start>>6, int(start&63) 642 mask := (uint64(1)<<width - 1) << shift 643 //fmt.Printf("mask = %b, idx64 = %d\n", mask, idx64) 644 bits[idx64] = (bits[idx64] &^ mask) | (value << shift) 645 //fmt.Printf("start = %d, width = %d, shift + width = %d\n", start, width, shift+width) 646 if shift+width > 64 { 647 // changes two 64-bit words 648 bits[idx64+1] = value >> (64 - shift) 649 } 650 } 651 652 func set(bits []uint64, pos uint64) { 653 //bits[pos>>6] |= uint64(1) << (pos & 63) 654 bits[pos/64] |= uint64(1) << (pos % 64) 655 } 656 657 func (ef *DoubleEliasFano) jumpSizeWords() int { 658 size := ((ef.numBuckets + 1) / superQ) * superQSize * 2 // Whole blocks 659 if (ef.numBuckets+1)%superQ != 0 { 660 size += (1 + (((ef.numBuckets+1)%superQ+q-1)/q+3)/2) * 2 // Partial block 661 } 662 return int(size) 663 } 664 665 // Data returns binary representation of double Ellias-Fano index that has been built 666 func (ef *DoubleEliasFano) Data() []uint64 { 667 return ef.data 668 } 669 670 func (ef *DoubleEliasFano) get2(i uint64) (cumKeys uint64, position uint64, 671 windowCumKeys uint64, selectCumKeys int, currWordCumKeys uint64, lower uint64, cumDelta uint64) { 672 posLower := i * (ef.lCumKeys + ef.lPosition) 673 idx64, shift := posLower/64, posLower%64 674 lower = ef.lowerBits[idx64] >> shift 675 if shift > 0 { 676 lower |= ef.lowerBits[idx64+1] << (64 - shift) 677 } 678 //fmt.Printf("i = %d, posLower = %d, lower = %b\n", i, posLower, lower) 679 680 jumpSuperQ := (i / superQ) * superQSize * 2 681 jumpInsideSuperQ := (i % superQ) / q 682 idx16 := 2*(jumpSuperQ+2) + 2*jumpInsideSuperQ 683 idx64, shift = idx16/2, 32*(idx16%2) 684 mask := uint64(0xffffffff) << shift 685 jumpCumKeys := ef.jump[jumpSuperQ] + (ef.jump[idx64]&mask)>>shift 686 idx16++ 687 idx64, shift = idx16/2, 32*(idx16%2) 688 mask = uint64(0xffffffff) << shift 689 jumpPosition := ef.jump[jumpSuperQ+1] + (ef.jump[idx64]&mask)>>shift 690 //fmt.Printf("i = %d, jumpCumKeys = %d, jumpPosition = %d\n", i, jumpCumKeys, jumpPosition) 691 692 currWordCumKeys = jumpCumKeys / 64 693 currWordPosition := jumpPosition / 64 694 windowCumKeys = ef.upperBitsCumKeys[currWordCumKeys] & (uint64(0xffffffffffffffff) << (jumpCumKeys % 64)) 695 windowPosition := ef.upperBitsPosition[currWordPosition] & (uint64(0xffffffffffffffff) << (jumpPosition % 64)) 696 deltaCumKeys := int(i & qMask) 697 deltaPosition := int(i & qMask) 698 699 for bitCount := bits.OnesCount64(windowCumKeys); bitCount <= deltaCumKeys; bitCount = bits.OnesCount64(windowCumKeys) { 700 //fmt.Printf("i = %d, bitCount cum = %d\n", i, bitCount) 701 currWordCumKeys++ 702 windowCumKeys = ef.upperBitsCumKeys[currWordCumKeys] 703 deltaCumKeys -= bitCount 704 } 705 for bitCount := bits.OnesCount64(windowPosition); bitCount <= deltaPosition; bitCount = bits.OnesCount64(windowPosition) { 706 //fmt.Printf("i = %d, bitCount pos = %d\n", i, bitCount) 707 currWordPosition++ 708 windowPosition = ef.upperBitsPosition[currWordPosition] 709 deltaPosition -= bitCount 710 } 711 712 selectCumKeys = bitutil.Select64(windowCumKeys, deltaCumKeys) 713 //fmt.Printf("i = %d, select cum in %b for %d = %d\n", i, windowCumKeys, deltaCumKeys, selectCumKeys) 714 cumDelta = i * ef.cumKeysMinDelta 715 cumKeys = ((currWordCumKeys*64+uint64(selectCumKeys)-i)<<ef.lCumKeys | (lower & ef.lowerBitsMaskCumKeys)) + cumDelta 716 717 lower >>= ef.lCumKeys 718 //fmt.Printf("i = %d, lower = %b\n", i, lower) 719 selectPosition := bitutil.Select64(windowPosition, deltaPosition) 720 //fmt.Printf("i = %d, select pos in %b for %d = %d\n", i, windowPosition, deltaPosition, selectPosition) 721 bitDelta := i * ef.posMinDelta 722 position = ((currWordPosition*64+uint64(selectPosition)-i)<<ef.lPosition | (lower & ef.lowerBitsMaskPosition)) + bitDelta 723 return 724 } 725 726 func (ef *DoubleEliasFano) Get2(i uint64) (cumKeys, position uint64) { 727 cumKeys, position, _, _, _, _, _ = ef.get2(i) 728 return 729 } 730 731 func (ef *DoubleEliasFano) Get3(i uint64) (cumKeys, cumKeysNext, position uint64) { 732 var windowCumKeys uint64 733 var selectCumKeys int 734 var currWordCumKeys uint64 735 var lower uint64 736 var cumDelta uint64 737 cumKeys, position, windowCumKeys, selectCumKeys, currWordCumKeys, lower, cumDelta = ef.get2(i) 738 windowCumKeys &= (uint64(0xffffffffffffffff) << selectCumKeys) << 1 739 for windowCumKeys == 0 { 740 currWordCumKeys++ 741 windowCumKeys = ef.upperBitsCumKeys[currWordCumKeys] 742 } 743 744 lower >>= ef.lPosition 745 cumKeysNext = ((currWordCumKeys*64+uint64(bits.TrailingZeros64(windowCumKeys))-i-1)<<ef.lCumKeys | (lower & ef.lowerBitsMaskCumKeys)) + cumDelta + ef.cumKeysMinDelta 746 return 747 } 748 749 // Write outputs the state of golomb rice encoding into a writer, which can be recovered later by Read 750 func (ef *DoubleEliasFano) Write(w io.Writer) error { 751 var numBuf [8]byte 752 binary.BigEndian.PutUint64(numBuf[:], ef.numBuckets) 753 if _, e := w.Write(numBuf[:]); e != nil { 754 return e 755 } 756 binary.BigEndian.PutUint64(numBuf[:], ef.uCumKeys) 757 if _, e := w.Write(numBuf[:]); e != nil { 758 return e 759 } 760 binary.BigEndian.PutUint64(numBuf[:], ef.uPosition) 761 if _, e := w.Write(numBuf[:]); e != nil { 762 return e 763 } 764 binary.BigEndian.PutUint64(numBuf[:], ef.cumKeysMinDelta) 765 if _, e := w.Write(numBuf[:]); e != nil { 766 return e 767 } 768 binary.BigEndian.PutUint64(numBuf[:], ef.posMinDelta) 769 if _, e := w.Write(numBuf[:]); e != nil { 770 return e 771 } 772 b := unsafe.Slice((*byte)(unsafe.Pointer(&ef.data[0])), len(ef.data)*uint64Size) 773 if _, e := w.Write(b); e != nil { 774 return e 775 } 776 return nil 777 } 778 779 // Read inputs the state of golomb rice encoding from a reader s 780 func (ef *DoubleEliasFano) Read(r []byte) int { 781 ef.numBuckets = binary.BigEndian.Uint64(r[:8]) 782 ef.uCumKeys = binary.BigEndian.Uint64(r[8:16]) 783 ef.uPosition = binary.BigEndian.Uint64(r[16:24]) 784 ef.cumKeysMinDelta = binary.BigEndian.Uint64(r[24:32]) 785 ef.posMinDelta = binary.BigEndian.Uint64(r[32:40]) 786 ef.data = unsafe.Slice((*uint64)(unsafe.Pointer(&r[40])), (len(r)-40)/uint64Size) 787 ef.deriveFields() 788 return 40 + 8*len(ef.data) 789 }