github.com/ledgerwatch/erigon-lib@v1.0.0/compress/decompress.go (about) 1 /* 2 Copyright 2022 Erigon contributors 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package compress 18 19 import ( 20 "bytes" 21 "encoding/binary" 22 "fmt" 23 "os" 24 "path/filepath" 25 "strconv" 26 "time" 27 28 "github.com/ledgerwatch/erigon-lib/common/dbg" 29 "github.com/ledgerwatch/erigon-lib/mmap" 30 "github.com/ledgerwatch/log/v3" 31 ) 32 33 type word []byte // plain text word associated with code from dictionary 34 35 type codeword struct { 36 pattern *word // Pattern corresponding to entries 37 ptr *patternTable // pointer to deeper level tables 38 code uint16 // code associated with that word 39 len byte // Number of bits in the codes 40 } 41 42 type patternTable struct { 43 patterns []*codeword 44 bitLen int // Number of bits to lookup in the table 45 } 46 47 func newPatternTable(bitLen int) *patternTable { 48 pt := &patternTable{ 49 bitLen: bitLen, 50 } 51 if bitLen <= condensePatternTableBitThreshold { 52 pt.patterns = make([]*codeword, 1<<pt.bitLen) 53 } 54 return pt 55 } 56 57 func (pt *patternTable) insertWord(cw *codeword) { 58 if pt.bitLen <= condensePatternTableBitThreshold { 59 codeStep := uint16(1) << uint16(cw.len) 60 codeFrom, codeTo := cw.code, cw.code+codeStep 61 if pt.bitLen != int(cw.len) && cw.len > 0 { 62 codeTo = codeFrom | (uint16(1) << pt.bitLen) 63 } 64 65 for c := codeFrom; c < codeTo; c += codeStep { 66 pt.patterns[c] = cw 67 } 68 return 69 } 70 71 pt.patterns = append(pt.patterns, cw) 72 } 73 74 func (pt *patternTable) condensedTableSearch(code uint16) *codeword { 75 if pt.bitLen <= condensePatternTableBitThreshold { 76 return pt.patterns[code] 77 } 78 for _, cur := range pt.patterns { 79 if cur.code == code { 80 return cur 81 } 82 d := code - cur.code 83 if d&1 != 0 { 84 continue 85 } 86 if checkDistance(int(cur.len), int(d)) { 87 return cur 88 } 89 } 90 return nil 91 } 92 93 type posTable struct { 94 pos []uint64 95 lens []byte 96 ptrs []*posTable 97 bitLen int 98 } 99 100 // Decompressor provides access to the superstrings in a file produced by a compressor 101 type Decompressor struct { 102 f *os.File 103 mmapHandle2 *[mmap.MaxMapSize]byte // mmap handle for windows (this is used to close mmap) 104 dict *patternTable 105 posDict *posTable 106 mmapHandle1 []byte // mmap handle for unix (this is used to close mmap) 107 data []byte // slice of correct size for the decompressor to work with 108 wordsStart uint64 // Offset of whether the superstrings actually start 109 size int64 110 modTime time.Time 111 wordsCount uint64 112 emptyWordsCount uint64 113 114 filePath, fileName string 115 } 116 117 // Tables with bitlen greater than threshold will be condensed. 118 // Condensing reduces size of decompression table but leads to slower reads. 119 // To disable condesning at all set to 9 (we dont use tables larger than 2^9) 120 // To enable condensing for tables of size larger 64 = 6 121 // for all tables = 0 122 // There is no sense to condense tables of size [1 - 64] in terms of performance 123 // 124 // Should be set before calling NewDecompression. 125 var condensePatternTableBitThreshold = 9 126 127 func init() { 128 v, _ := os.LookupEnv("DECOMPRESS_CONDENSITY") 129 if v != "" { 130 i, err := strconv.Atoi(v) 131 if err != nil { 132 panic(err) 133 } 134 if i < 3 || i > 9 { 135 panic("DECOMPRESS_CONDENSITY: only numbers in range 3-9 are acceptable ") 136 } 137 condensePatternTableBitThreshold = i 138 fmt.Printf("set DECOMPRESS_CONDENSITY to %d\n", i) 139 } 140 } 141 142 func SetDecompressionTableCondensity(fromBitSize int) { 143 condensePatternTableBitThreshold = fromBitSize 144 } 145 146 func NewDecompressor(compressedFilePath string) (d *Decompressor, err error) { 147 _, fName := filepath.Split(compressedFilePath) 148 d = &Decompressor{ 149 filePath: compressedFilePath, 150 fileName: fName, 151 } 152 defer func() { 153 154 if rec := recover(); rec != nil { 155 err = fmt.Errorf("decompressing file: %s, %+v, trace: %s", compressedFilePath, rec, dbg.Stack()) 156 } 157 }() 158 159 d.f, err = os.Open(compressedFilePath) 160 if err != nil { 161 return nil, err 162 } 163 var stat os.FileInfo 164 if stat, err = d.f.Stat(); err != nil { 165 return nil, err 166 } 167 d.size = stat.Size() 168 if d.size < 32 { 169 return nil, fmt.Errorf("compressed file is too short: %d", d.size) 170 } 171 d.modTime = stat.ModTime() 172 if d.mmapHandle1, d.mmapHandle2, err = mmap.Mmap(d.f, int(d.size)); err != nil { 173 return nil, err 174 } 175 // read patterns from file 176 d.data = d.mmapHandle1[:d.size] 177 defer d.EnableReadAhead().DisableReadAhead() //speedup opening on slow drives 178 179 d.wordsCount = binary.BigEndian.Uint64(d.data[:8]) 180 d.emptyWordsCount = binary.BigEndian.Uint64(d.data[8:16]) 181 dictSize := binary.BigEndian.Uint64(d.data[16:24]) 182 data := d.data[24 : 24+dictSize] 183 184 var depths []uint64 185 var patterns [][]byte 186 var i uint64 187 var patternMaxDepth uint64 188 189 for i < dictSize { 190 d, ns := binary.Uvarint(data[i:]) 191 if d > 64 { // mainnet has maxDepth 31 192 return nil, fmt.Errorf("dictionary is invalid: patternMaxDepth=%d", d) 193 } 194 depths = append(depths, d) 195 if d > patternMaxDepth { 196 patternMaxDepth = d 197 } 198 i += uint64(ns) 199 l, n := binary.Uvarint(data[i:]) 200 i += uint64(n) 201 patterns = append(patterns, data[i:i+l]) 202 //fmt.Printf("depth = %d, pattern = [%x]\n", d, data[i:i+l]) 203 i += l 204 } 205 206 if dictSize > 0 { 207 var bitLen int 208 if patternMaxDepth > 9 { 209 bitLen = 9 210 } else { 211 bitLen = int(patternMaxDepth) 212 } 213 // fmt.Printf("pattern maxDepth=%d\n", tree.maxDepth) 214 d.dict = newPatternTable(bitLen) 215 buildCondensedPatternTable(d.dict, depths, patterns, 0, 0, 0, patternMaxDepth) 216 } 217 218 // read positions 219 pos := 24 + dictSize 220 dictSize = binary.BigEndian.Uint64(d.data[pos : pos+8]) 221 data = d.data[pos+8 : pos+8+dictSize] 222 223 var posDepths []uint64 224 var poss []uint64 225 var posMaxDepth uint64 226 227 i = 0 228 for i < dictSize { 229 d, ns := binary.Uvarint(data[i:]) 230 if d > 2048 { 231 return nil, fmt.Errorf("dictionary is invalid: posMaxDepth=%d", d) 232 } 233 posDepths = append(posDepths, d) 234 if d > posMaxDepth { 235 posMaxDepth = d 236 } 237 i += uint64(ns) 238 pos, n := binary.Uvarint(data[i:]) 239 i += uint64(n) 240 poss = append(poss, pos) 241 } 242 243 if dictSize > 0 { 244 var bitLen int 245 if posMaxDepth > 9 { 246 bitLen = 9 247 } else { 248 bitLen = int(posMaxDepth) 249 } 250 //fmt.Printf("pos maxDepth=%d\n", tree.maxDepth) 251 tableSize := 1 << bitLen 252 d.posDict = &posTable{ 253 bitLen: bitLen, 254 pos: make([]uint64, tableSize), 255 lens: make([]byte, tableSize), 256 ptrs: make([]*posTable, tableSize), 257 } 258 buildPosTable(posDepths, poss, d.posDict, 0, 0, 0, posMaxDepth) 259 } 260 d.wordsStart = pos + 8 + dictSize 261 return d, nil 262 } 263 264 func buildCondensedPatternTable(table *patternTable, depths []uint64, patterns [][]byte, code uint16, bits int, depth uint64, maxDepth uint64) int { 265 if len(depths) == 0 { 266 return 0 267 } 268 if depth == depths[0] { 269 pattern := word(patterns[0]) 270 //fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pattern=[%x]\n", depth, maxDepth, code, bits, pattern) 271 cw := &codeword{code: code, pattern: &pattern, len: byte(bits), ptr: nil} 272 table.insertWord(cw) 273 return 1 274 } 275 if bits == 9 { 276 var bitLen int 277 if maxDepth > 9 { 278 bitLen = 9 279 } else { 280 bitLen = int(maxDepth) 281 } 282 cw := &codeword{code: code, pattern: nil, len: byte(0), ptr: newPatternTable(bitLen)} 283 table.insertWord(cw) 284 return buildCondensedPatternTable(cw.ptr, depths, patterns, 0, 0, depth, maxDepth) 285 } 286 b0 := buildCondensedPatternTable(table, depths, patterns, code, bits+1, depth+1, maxDepth-1) 287 return b0 + buildCondensedPatternTable(table, depths[b0:], patterns[b0:], (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1) 288 } 289 290 func buildPosTable(depths []uint64, poss []uint64, table *posTable, code uint16, bits int, depth uint64, maxDepth uint64) int { 291 if len(depths) == 0 { 292 return 0 293 } 294 if depth == depths[0] { 295 p := poss[0] 296 //fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pos=%d\n", depth, maxDepth, code, bits, p) 297 if table.bitLen == bits { 298 table.pos[code] = p 299 table.lens[code] = byte(bits) 300 table.ptrs[code] = nil 301 } else { 302 codeStep := uint16(1) << bits 303 codeFrom := code 304 codeTo := code | (uint16(1) << table.bitLen) 305 for c := codeFrom; c < codeTo; c += codeStep { 306 table.pos[c] = p 307 table.lens[c] = byte(bits) 308 table.ptrs[c] = nil 309 } 310 } 311 return 1 312 } 313 if bits == 9 { 314 var bitLen int 315 if maxDepth > 9 { 316 bitLen = 9 317 } else { 318 bitLen = int(maxDepth) 319 } 320 tableSize := 1 << bitLen 321 newTable := &posTable{ 322 bitLen: bitLen, 323 pos: make([]uint64, tableSize), 324 lens: make([]byte, tableSize), 325 ptrs: make([]*posTable, tableSize), 326 } 327 table.pos[code] = 0 328 table.lens[code] = byte(0) 329 table.ptrs[code] = newTable 330 return buildPosTable(depths, poss, newTable, 0, 0, depth, maxDepth) 331 } 332 b0 := buildPosTable(depths, poss, table, code, bits+1, depth+1, maxDepth-1) 333 return b0 + buildPosTable(depths[b0:], poss[b0:], table, (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1) 334 } 335 336 func (d *Decompressor) Size() int64 { 337 return d.size 338 } 339 340 func (d *Decompressor) ModTime() time.Time { 341 return d.modTime 342 } 343 344 func (d *Decompressor) Close() { 345 if d.f != nil { 346 if err := mmap.Munmap(d.mmapHandle1, d.mmapHandle2); err != nil { 347 log.Log(dbg.FileCloseLogLevel, "unmap", "err", err, "file", d.FileName(), "stack", dbg.Stack()) 348 } 349 if err := d.f.Close(); err != nil { 350 log.Log(dbg.FileCloseLogLevel, "close", "err", err, "file", d.FileName(), "stack", dbg.Stack()) 351 } 352 d.f = nil 353 } 354 } 355 356 func (d *Decompressor) FilePath() string { return d.filePath } 357 func (d *Decompressor) FileName() string { return d.fileName } 358 359 // WithReadAhead - Expect read in sequential order. (Hence, pages in the given range can be aggressively read ahead, and may be freed soon after they are accessed.) 360 func (d *Decompressor) WithReadAhead(f func() error) error { 361 if d == nil || d.mmapHandle1 == nil { 362 return nil 363 } 364 _ = mmap.MadviseSequential(d.mmapHandle1) 365 //_ = mmap.MadviseWillNeed(d.mmapHandle1) 366 defer mmap.MadviseRandom(d.mmapHandle1) 367 return f() 368 } 369 370 // DisableReadAhead - usage: `defer d.EnableReadAhead().DisableReadAhead()`. Please don't use this funcs without `defer` to avoid leak. 371 func (d *Decompressor) DisableReadAhead() { 372 if d == nil || d.mmapHandle1 == nil { 373 return 374 } 375 _ = mmap.MadviseRandom(d.mmapHandle1) 376 } 377 func (d *Decompressor) EnableReadAhead() *Decompressor { 378 if d == nil || d.mmapHandle1 == nil { 379 return d 380 } 381 _ = mmap.MadviseSequential(d.mmapHandle1) 382 return d 383 } 384 func (d *Decompressor) EnableMadvNormal() *Decompressor { 385 if d == nil || d.mmapHandle1 == nil { 386 return d 387 } 388 _ = mmap.MadviseNormal(d.mmapHandle1) 389 return d 390 } 391 func (d *Decompressor) EnableWillNeed() *Decompressor { 392 if d == nil || d.mmapHandle1 == nil { 393 return d 394 } 395 _ = mmap.MadviseWillNeed(d.mmapHandle1) 396 return d 397 } 398 399 // Getter represent "reader" or "interator" that can move accross the data of the decompressor 400 // The full state of the getter can be captured by saving dataP, and dataBit 401 type Getter struct { 402 patternDict *patternTable 403 posDict *posTable 404 fName string 405 data []byte 406 dataP uint64 407 dataBit int // Value 0..7 - position of the bit 408 trace bool 409 } 410 411 func (g *Getter) Trace(t bool) { g.trace = t } 412 func (g *Getter) FileName() string { return g.fName } 413 414 func (g *Getter) nextPos(clean bool) (pos uint64) { 415 if clean && g.dataBit > 0 { 416 g.dataP++ 417 g.dataBit = 0 418 } 419 table := g.posDict 420 if table.bitLen == 0 { 421 return table.pos[0] 422 } 423 for l := byte(0); l == 0; { 424 code := uint16(g.data[g.dataP]) >> g.dataBit 425 if 8-g.dataBit < table.bitLen && int(g.dataP)+1 < len(g.data) { 426 code |= uint16(g.data[g.dataP+1]) << (8 - g.dataBit) 427 } 428 code &= (uint16(1) << table.bitLen) - 1 429 l = table.lens[code] 430 if l == 0 { 431 table = table.ptrs[code] 432 g.dataBit += 9 433 } else { 434 g.dataBit += int(l) 435 pos = table.pos[code] 436 } 437 g.dataP += uint64(g.dataBit / 8) 438 g.dataBit %= 8 439 } 440 return pos 441 } 442 443 func (g *Getter) nextPattern() []byte { 444 table := g.patternDict 445 446 if table.bitLen == 0 { 447 return *table.patterns[0].pattern 448 } 449 450 var l byte 451 var pattern []byte 452 for l == 0 { 453 code := uint16(g.data[g.dataP]) >> g.dataBit 454 if 8-g.dataBit < table.bitLen && int(g.dataP)+1 < len(g.data) { 455 code |= uint16(g.data[g.dataP+1]) << (8 - g.dataBit) 456 } 457 code &= (uint16(1) << table.bitLen) - 1 458 459 cw := table.condensedTableSearch(code) 460 l = cw.len 461 if l == 0 { 462 table = cw.ptr 463 g.dataBit += 9 464 } else { 465 g.dataBit += int(l) 466 pattern = *cw.pattern 467 } 468 g.dataP += uint64(g.dataBit / 8) 469 g.dataBit %= 8 470 } 471 return pattern 472 } 473 474 var condensedWordDistances = buildCondensedWordDistances() 475 476 func checkDistance(power int, d int) bool { 477 for _, dist := range condensedWordDistances[power] { 478 if dist == d { 479 return true 480 } 481 } 482 return false 483 } 484 485 func buildCondensedWordDistances() [][]int { 486 dist2 := make([][]int, 10) 487 for i := 1; i <= 9; i++ { 488 dl := make([]int, 0) 489 for j := 1 << i; j < 512; j += 1 << i { 490 dl = append(dl, j) 491 } 492 dist2[i] = dl 493 } 494 return dist2 495 } 496 497 func (g *Getter) Size() int { 498 return len(g.data) 499 } 500 501 func (d *Decompressor) Count() int { return int(d.wordsCount) } 502 func (d *Decompressor) EmptyWordsCount() int { return int(d.emptyWordsCount) } 503 504 // MakeGetter creates an object that can be used to access superstrings in the decompressor's file 505 // Getter is not thread-safe, but there can be multiple getters used simultaneously and concurrently 506 // for the same decompressor 507 func (d *Decompressor) MakeGetter() *Getter { 508 return &Getter{ 509 posDict: d.posDict, 510 data: d.data[d.wordsStart:], 511 patternDict: d.dict, 512 fName: d.fileName, 513 } 514 } 515 516 func (g *Getter) Reset(offset uint64) { 517 g.dataP = offset 518 g.dataBit = 0 519 } 520 521 func (g *Getter) HasNext() bool { 522 return g.dataP < uint64(len(g.data)) 523 } 524 525 // Next extracts a compressed word from current offset in the file 526 // and appends it to the given buf, returning the result of appending 527 // After extracting next word, it moves to the beginning of the next one 528 func (g *Getter) Next(buf []byte) ([]byte, uint64) { 529 savePos := g.dataP 530 wordLen := g.nextPos(true) 531 wordLen-- // because when create huffman tree we do ++ , because 0 is terminator 532 if wordLen == 0 { 533 if g.dataBit > 0 { 534 g.dataP++ 535 g.dataBit = 0 536 } 537 if buf == nil { // wordLen == 0, means we have valid record of 0 size. nil - is the marker of "something not found" 538 buf = []byte{} 539 } 540 return buf, g.dataP 541 } 542 bufPos := len(buf) // Tracking position in buf where to insert part of the word 543 lastUncovered := len(buf) 544 if len(buf)+int(wordLen) > cap(buf) { 545 newBuf := make([]byte, len(buf)+int(wordLen)) 546 copy(newBuf, buf) 547 buf = newBuf 548 } else { 549 // Expand buffer 550 buf = buf[:len(buf)+int(wordLen)] 551 } 552 // Loop below fills in the patterns 553 for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { 554 bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another 555 pt := g.nextPattern() 556 copy(buf[bufPos:], pt) 557 } 558 if g.dataBit > 0 { 559 g.dataP++ 560 g.dataBit = 0 561 } 562 postLoopPos := g.dataP 563 g.dataP = savePos 564 g.dataBit = 0 565 g.nextPos(true /* clean */) // Reset the state of huffman reader 566 bufPos = lastUncovered // Restore to the beginning of buf 567 // Loop below fills the data which is not in the patterns 568 for pos := g.nextPos(false); pos != 0; pos = g.nextPos(false) { 569 bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another 570 if bufPos > lastUncovered { 571 dif := uint64(bufPos - lastUncovered) 572 copy(buf[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif]) 573 postLoopPos += dif 574 } 575 lastUncovered = bufPos + len(g.nextPattern()) 576 } 577 if int(wordLen) > lastUncovered { 578 dif := wordLen - uint64(lastUncovered) 579 copy(buf[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif]) 580 postLoopPos += dif 581 } 582 g.dataP = postLoopPos 583 g.dataBit = 0 584 return buf, postLoopPos 585 } 586 587 func (g *Getter) NextUncompressed() ([]byte, uint64) { 588 wordLen := g.nextPos(true) 589 wordLen-- // because when create huffman tree we do ++ , because 0 is terminator 590 if wordLen == 0 { 591 if g.dataBit > 0 { 592 g.dataP++ 593 g.dataBit = 0 594 } 595 return g.data[g.dataP:g.dataP], g.dataP 596 } 597 g.nextPos(false) 598 if g.dataBit > 0 { 599 g.dataP++ 600 g.dataBit = 0 601 } 602 pos := g.dataP 603 g.dataP += wordLen 604 return g.data[pos:g.dataP], g.dataP 605 } 606 607 // Skip moves offset to the next word and returns the new offset and the length of the word. 608 func (g *Getter) Skip() (uint64, int) { 609 l := g.nextPos(true) 610 l-- // because when create huffman tree we do ++ , because 0 is terminator 611 if l == 0 { 612 if g.dataBit > 0 { 613 g.dataP++ 614 g.dataBit = 0 615 } 616 return g.dataP, 0 617 } 618 wordLen := int(l) 619 620 var add uint64 621 var bufPos int 622 var lastUncovered int 623 for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { 624 bufPos += int(pos) - 1 625 if wordLen < bufPos { 626 panic(fmt.Sprintf("likely .idx is invalid: %s", g.fName)) 627 } 628 if bufPos > lastUncovered { 629 add += uint64(bufPos - lastUncovered) 630 } 631 lastUncovered = bufPos + len(g.nextPattern()) 632 } 633 if g.dataBit > 0 { 634 g.dataP++ 635 g.dataBit = 0 636 } 637 if int(l) > lastUncovered { 638 add += l - uint64(lastUncovered) 639 } 640 // Uncovered characters 641 g.dataP += add 642 return g.dataP, wordLen 643 } 644 645 func (g *Getter) SkipUncompressed() (uint64, int) { 646 wordLen := g.nextPos(true) 647 wordLen-- // because when create huffman tree we do ++ , because 0 is terminator 648 if wordLen == 0 { 649 if g.dataBit > 0 { 650 g.dataP++ 651 g.dataBit = 0 652 } 653 return g.dataP, 0 654 } 655 g.nextPos(false) 656 if g.dataBit > 0 { 657 g.dataP++ 658 g.dataBit = 0 659 } 660 g.dataP += wordLen 661 return g.dataP, int(wordLen) 662 } 663 664 // Match returns true and next offset if the word at current offset fully matches the buf 665 // returns false and current offset otherwise. 666 func (g *Getter) Match(buf []byte) (bool, uint64) { 667 savePos := g.dataP 668 wordLen := g.nextPos(true) 669 wordLen-- // because when create huffman tree we do ++ , because 0 is terminator 670 lenBuf := len(buf) 671 if wordLen == 0 || int(wordLen) != lenBuf { 672 if g.dataBit > 0 { 673 g.dataP++ 674 g.dataBit = 0 675 } 676 if lenBuf != 0 { 677 g.dataP, g.dataBit = savePos, 0 678 } 679 return lenBuf == int(wordLen), g.dataP 680 } 681 682 var bufPos int 683 // In the first pass, we only check patterns 684 for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { 685 bufPos += int(pos) - 1 686 pattern := g.nextPattern() 687 if lenBuf < bufPos+len(pattern) || !bytes.Equal(buf[bufPos:bufPos+len(pattern)], pattern) { 688 g.dataP, g.dataBit = savePos, 0 689 return false, savePos 690 } 691 } 692 if g.dataBit > 0 { 693 g.dataP++ 694 g.dataBit = 0 695 } 696 postLoopPos := g.dataP 697 g.dataP, g.dataBit = savePos, 0 698 g.nextPos(true /* clean */) // Reset the state of huffman decoder 699 // Second pass - we check spaces not covered by the patterns 700 var lastUncovered int 701 bufPos = 0 702 for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { 703 bufPos += int(pos) - 1 704 if bufPos > lastUncovered { 705 dif := uint64(bufPos - lastUncovered) 706 if lenBuf < bufPos || !bytes.Equal(buf[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif]) { 707 g.dataP, g.dataBit = savePos, 0 708 return false, savePos 709 } 710 postLoopPos += dif 711 } 712 lastUncovered = bufPos + len(g.nextPattern()) 713 } 714 if int(wordLen) > lastUncovered { 715 dif := wordLen - uint64(lastUncovered) 716 if lenBuf < int(wordLen) || !bytes.Equal(buf[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif]) { 717 g.dataP, g.dataBit = savePos, 0 718 return false, savePos 719 } 720 postLoopPos += dif 721 } 722 if lenBuf != int(wordLen) { 723 g.dataP, g.dataBit = savePos, 0 724 return false, savePos 725 } 726 g.dataP, g.dataBit = postLoopPos, 0 727 return true, postLoopPos 728 } 729 730 // MatchPrefix only checks if the word at the current offset has a buf prefix. Does not move offset to the next word. 731 func (g *Getter) MatchPrefix(prefix []byte) bool { 732 savePos := g.dataP 733 defer func() { 734 g.dataP, g.dataBit = savePos, 0 735 }() 736 737 wordLen := g.nextPos(true /* clean */) 738 wordLen-- // because when create huffman tree we do ++ , because 0 is terminator 739 prefixLen := len(prefix) 740 if wordLen == 0 || int(wordLen) < prefixLen { 741 if g.dataBit > 0 { 742 g.dataP++ 743 g.dataBit = 0 744 } 745 if prefixLen != 0 { 746 g.dataP, g.dataBit = savePos, 0 747 } 748 return prefixLen == int(wordLen) 749 } 750 751 var bufPos int 752 // In the first pass, we only check patterns 753 // Only run this loop as far as the prefix goes, there is no need to check further 754 for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { 755 bufPos += int(pos) - 1 756 pattern := g.nextPattern() 757 var comparisonLen int 758 if prefixLen < bufPos+len(pattern) { 759 comparisonLen = prefixLen - bufPos 760 } else { 761 comparisonLen = len(pattern) 762 } 763 if bufPos < prefixLen { 764 if !bytes.Equal(prefix[bufPos:bufPos+comparisonLen], pattern[:comparisonLen]) { 765 return false 766 } 767 } 768 } 769 770 if g.dataBit > 0 { 771 g.dataP++ 772 g.dataBit = 0 773 } 774 postLoopPos := g.dataP 775 g.dataP, g.dataBit = savePos, 0 776 g.nextPos(true /* clean */) // Reset the state of huffman decoder 777 // Second pass - we check spaces not covered by the patterns 778 var lastUncovered int 779 bufPos = 0 780 for pos := g.nextPos(false /* clean */); pos != 0 && lastUncovered < prefixLen; pos = g.nextPos(false) { 781 bufPos += int(pos) - 1 782 if bufPos > lastUncovered { 783 dif := uint64(bufPos - lastUncovered) 784 var comparisonLen int 785 if prefixLen < lastUncovered+int(dif) { 786 comparisonLen = prefixLen - lastUncovered 787 } else { 788 comparisonLen = int(dif) 789 } 790 if !bytes.Equal(prefix[lastUncovered:lastUncovered+comparisonLen], g.data[postLoopPos:postLoopPos+uint64(comparisonLen)]) { 791 return false 792 } 793 postLoopPos += dif 794 } 795 lastUncovered = bufPos + len(g.nextPattern()) 796 } 797 if prefixLen > lastUncovered && int(wordLen) > lastUncovered { 798 dif := wordLen - uint64(lastUncovered) 799 var comparisonLen int 800 if prefixLen < int(wordLen) { 801 comparisonLen = prefixLen - lastUncovered 802 } else { 803 comparisonLen = int(dif) 804 } 805 if !bytes.Equal(prefix[lastUncovered:lastUncovered+comparisonLen], g.data[postLoopPos:postLoopPos+uint64(comparisonLen)]) { 806 return false 807 } 808 } 809 return true 810 } 811 812 // MatchCmp lexicographically compares given buf with the word at the current offset in the file. 813 // returns 0 if buf == word, -1 if buf < word, 1 if buf > word 814 func (g *Getter) MatchCmp(buf []byte) int { 815 savePos := g.dataP 816 wordLen := g.nextPos(true) 817 wordLen-- // because when create huffman tree we do ++ , because 0 is terminator 818 lenBuf := len(buf) 819 if wordLen == 0 && lenBuf != 0 { 820 g.dataP, g.dataBit = savePos, 0 821 return 1 822 } 823 if wordLen == 0 && lenBuf == 0 { 824 if g.dataBit > 0 { 825 g.dataP++ 826 g.dataBit = 0 827 } 828 return 0 829 } 830 831 decoded := make([]byte, wordLen) 832 var bufPos int 833 // In the first pass, we only check patterns 834 for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { 835 bufPos += int(pos) - 1 836 pattern := g.nextPattern() 837 copy(decoded[bufPos:], pattern) 838 } 839 if g.dataBit > 0 { 840 g.dataP++ 841 g.dataBit = 0 842 } 843 postLoopPos := g.dataP 844 g.dataP, g.dataBit = savePos, 0 845 g.nextPos(true /* clean */) // Reset the state of huffman decoder 846 // Second pass - we check spaces not covered by the patterns 847 var lastUncovered int 848 bufPos = 0 849 for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { 850 bufPos += int(pos) - 1 851 // fmt.Printf("BUF POS: %d, POS: %d, lastUncovered: %d\n", bufPos, pos, lastUncovered) 852 if bufPos > lastUncovered { 853 dif := uint64(bufPos - lastUncovered) 854 copy(decoded[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif]) 855 postLoopPos += dif 856 } 857 lastUncovered = bufPos + len(g.nextPattern()) 858 } 859 860 if int(wordLen) > lastUncovered { 861 dif := wordLen - uint64(lastUncovered) 862 copy(decoded[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif]) 863 postLoopPos += dif 864 } 865 cmp := bytes.Compare(buf, decoded) 866 if cmp == 0 { 867 g.dataP, g.dataBit = postLoopPos, 0 868 } else { 869 g.dataP, g.dataBit = savePos, 0 870 } 871 return cmp 872 } 873 874 // MatchPrefixCmp lexicographically compares given prefix with the word at the current offset in the file. 875 // returns 0 if buf == word, -1 if buf < word, 1 if buf > word 876 func (g *Getter) MatchPrefixCmp(prefix []byte) int { 877 savePos := g.dataP 878 defer func() { 879 g.dataP, g.dataBit = savePos, 0 880 }() 881 882 wordLen := g.nextPos(true /* clean */) 883 wordLen-- // because when create huffman tree we do ++ , because 0 is terminator 884 prefixLen := len(prefix) 885 if wordLen == 0 && prefixLen != 0 { 886 return 1 887 } 888 if prefixLen == 0 { 889 return 0 890 } 891 892 decoded := make([]byte, wordLen) 893 var bufPos int 894 // In the first pass, we only check patterns 895 // Only run this loop as far as the prefix goes, there is no need to check further 896 for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { 897 bufPos += int(pos) - 1 898 if bufPos > prefixLen { 899 break 900 } 901 pattern := g.nextPattern() 902 copy(decoded[bufPos:], pattern) 903 } 904 905 if g.dataBit > 0 { 906 g.dataP++ 907 g.dataBit = 0 908 } 909 postLoopPos := g.dataP 910 g.dataP, g.dataBit = savePos, 0 911 g.nextPos(true /* clean */) // Reset the state of huffman decoder 912 // Second pass - we check spaces not covered by the patterns 913 var lastUncovered int 914 bufPos = 0 915 for pos := g.nextPos(false /* clean */); pos != 0 && lastUncovered < prefixLen; pos = g.nextPos(false) { 916 bufPos += int(pos) - 1 917 if bufPos > lastUncovered { 918 dif := uint64(bufPos - lastUncovered) 919 copy(decoded[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif]) 920 postLoopPos += dif 921 } 922 lastUncovered = bufPos + len(g.nextPattern()) 923 } 924 if prefixLen > lastUncovered && int(wordLen) > lastUncovered { 925 dif := wordLen - uint64(lastUncovered) 926 copy(decoded[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif]) 927 // postLoopPos += dif 928 } 929 var cmp int 930 if prefixLen > int(wordLen) { 931 // TODO(racytech): handle this case 932 // e.g: prefix = 'aaacb' 933 // word = 'aaa' 934 cmp = bytes.Compare(prefix, decoded) 935 } else { 936 cmp = bytes.Compare(prefix, decoded[:prefixLen]) 937 } 938 939 return cmp 940 } 941 942 func (g *Getter) MatchPrefixUncompressed(prefix []byte) int { 943 savePos := g.dataP 944 defer func() { 945 g.dataP, g.dataBit = savePos, 0 946 }() 947 948 wordLen := g.nextPos(true /* clean */) 949 wordLen-- // because when create huffman tree we do ++ , because 0 is terminator 950 prefixLen := len(prefix) 951 if wordLen == 0 && prefixLen != 0 { 952 return 1 953 } 954 if prefixLen == 0 { 955 return 0 956 } 957 958 g.nextPos(true) 959 960 // if prefixLen > int(wordLen) { 961 // // TODO(racytech): handle this case 962 // // e.g: prefix = 'aaacb' 963 // // word = 'aaa' 964 // } 965 966 return bytes.Compare(prefix, g.data[g.dataP:g.dataP+wordLen]) 967 } 968 969 // FastNext extracts a compressed word from current offset in the file 970 // into the given buf, returning a new byte slice which contains extracted word. 971 // It is important to allocate enough buf size. Could throw an error if word in file is larger then the buf size. 972 // After extracting next word, it moves to the beginning of the next one 973 func (g *Getter) FastNext(buf []byte) ([]byte, uint64) { 974 defer func() { 975 if rec := recover(); rec != nil { 976 panic(fmt.Sprintf("file: %s, %s, %s", g.fName, rec, dbg.Stack())) 977 } 978 }() 979 980 savePos := g.dataP 981 wordLen := g.nextPos(true) 982 wordLen-- // because when create huffman tree we do ++ , because 0 is terminator 983 // decoded := make([]byte, wordLen) 984 if wordLen == 0 { 985 if g.dataBit > 0 { 986 g.dataP++ 987 g.dataBit = 0 988 } 989 return buf[:wordLen], g.dataP 990 } 991 bufPos := 0 // Tracking position in buf where to insert part of the word 992 lastUncovered := 0 993 994 // if int(wordLen) > cap(buf) { 995 // newBuf := make([]byte, int(wordLen)) 996 // buf = newBuf 997 // } 998 // Loop below fills in the patterns 999 for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { 1000 bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another 1001 pt := g.nextPattern() 1002 copy(buf[bufPos:], pt) 1003 } 1004 if g.dataBit > 0 { 1005 g.dataP++ 1006 g.dataBit = 0 1007 } 1008 postLoopPos := g.dataP 1009 g.dataP = savePos 1010 g.dataBit = 0 1011 g.nextPos(true /* clean */) // Reset the state of huffman reader 1012 bufPos = lastUncovered // Restore to the beginning of buf 1013 // Loop below fills the data which is not in the patterns 1014 for pos := g.nextPos(false); pos != 0; pos = g.nextPos(false) { 1015 bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another 1016 if bufPos > lastUncovered { 1017 dif := uint64(bufPos - lastUncovered) 1018 copy(buf[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif]) 1019 postLoopPos += dif 1020 } 1021 lastUncovered = bufPos + len(g.nextPattern()) 1022 } 1023 if int(wordLen) > lastUncovered { 1024 dif := wordLen - uint64(lastUncovered) 1025 copy(buf[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif]) 1026 postLoopPos += dif 1027 } 1028 g.dataP = postLoopPos 1029 g.dataBit = 0 1030 return buf[:wordLen], postLoopPos 1031 }