github.com/ledgerwatch/erigon-lib@v1.0.0/compress/decompress.go

github.com/ledgerwatch/erigon-lib@v1.0.0/compress/decompress.go (about)

     1  /*
     2     Copyright 2022 Erigon contributors
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package compress
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/binary"
    22  	"fmt"
    23  	"os"
    24  	"path/filepath"
    25  	"strconv"
    26  	"time"
    27  
    28  	"github.com/ledgerwatch/erigon-lib/common/dbg"
    29  	"github.com/ledgerwatch/erigon-lib/mmap"
    30  	"github.com/ledgerwatch/log/v3"
    31  )
    32  
// word is the plain-text byte sequence associated with a code from the dictionary.
type word []byte
    34  
// codeword is a single entry of a patternTable. The table builder creates two
// flavours: leaf entries (pattern set, ptr nil, len > 0) and link entries that
// lead to a deeper table (pattern nil, ptr set, len == 0).
type codeword struct {
	pattern *word         // Pattern corresponding to entries (nil for link entries)
	ptr     *patternTable // pointer to deeper level tables (nil for leaf entries)
	code    uint16        // code associated with that word
	len     byte          // Number of bits in the codes; 0 marks a link entry
}
    41  
// patternTable resolves codes read from the bit stream into codewords.
// When bitLen <= condensePatternTableBitThreshold, patterns is a direct lookup
// array with 1<<bitLen slots indexed by code. Otherwise the table is
// "condensed": patterns holds each codeword once and is searched linearly.
type patternTable struct {
	patterns []*codeword
	bitLen   int // Number of bits to lookup in the table
}
    46  
    47  func newPatternTable(bitLen int) *patternTable {
    48  	pt := &patternTable{
    49  		bitLen: bitLen,
    50  	}
    51  	if bitLen <= condensePatternTableBitThreshold {
    52  		pt.patterns = make([]*codeword, 1<<pt.bitLen)
    53  	}
    54  	return pt
    55  }
    56  
    57  func (pt *patternTable) insertWord(cw *codeword) {
    58  	if pt.bitLen <= condensePatternTableBitThreshold {
    59  		codeStep := uint16(1) << uint16(cw.len)
    60  		codeFrom, codeTo := cw.code, cw.code+codeStep
    61  		if pt.bitLen != int(cw.len) && cw.len > 0 {
    62  			codeTo = codeFrom | (uint16(1) << pt.bitLen)
    63  		}
    64  
    65  		for c := codeFrom; c < codeTo; c += codeStep {
    66  			pt.patterns[c] = cw
    67  		}
    68  		return
    69  	}
    70  
    71  	pt.patterns = append(pt.patterns, cw)
    72  }
    73  
    74  func (pt *patternTable) condensedTableSearch(code uint16) *codeword {
    75  	if pt.bitLen <= condensePatternTableBitThreshold {
    76  		return pt.patterns[code]
    77  	}
    78  	for _, cur := range pt.patterns {
    79  		if cur.code == code {
    80  			return cur
    81  		}
    82  		d := code - cur.code
    83  		if d&1 != 0 {
    84  			continue
    85  		}
    86  		if checkDistance(int(cur.len), int(d)) {
    87  			return cur
    88  		}
    89  	}
    90  	return nil
    91  }
    92  
// posTable is the lookup table used to decode position values from the bit
// stream. The three slices are parallel and indexed by code.
type posTable struct {
	pos    []uint64    // decoded position value for each code
	lens   []byte      // number of bits consumed by each code; 0 means "follow ptrs"
	ptrs   []*posTable // deeper-level table for codes whose lens entry is 0
	bitLen int         // number of bits looked up in this table
}
    99  
// Decompressor provides access to the superstrings in a file produced by a compressor
type Decompressor struct {
	f               *os.File
	mmapHandle2     *[mmap.MaxMapSize]byte // mmap handle for windows (this is used to close mmap)
	dict            *patternTable          // pattern lookup table; nil when the file has an empty pattern dictionary
	posDict         *posTable              // position lookup table; nil when the file has an empty position dictionary
	mmapHandle1     []byte                 // mmap handle for unix (this is used to close mmap)
	data            []byte                 // slice of correct size for the decompressor to work with
	wordsStart      uint64                 // Offset where the superstrings actually start
	size            int64                  // file size in bytes, as reported by Stat
	modTime         time.Time              // file modification time, as reported by Stat
	wordsCount      uint64                 // word count read from the file header
	emptyWordsCount uint64                 // empty-word count read from the file header

	filePath, fileName string
}
   116  
// Tables with bitLen greater than this threshold will be condensed.
// Condensing reduces the size of the decompression table but leads to slower reads.
// To disable condensing entirely, set it to 9 (we don't use tables larger than 2^9).
// To enable condensing for tables larger than 64 entries, set it to 6.
// To condense all tables, set it to 0.
// There is no point in condensing tables of size [1 - 64] in terms of performance.
//
// Should be set before calling NewDecompressor.
var condensePatternTableBitThreshold = 9
   126  
   127  func init() {
   128  	v, _ := os.LookupEnv("DECOMPRESS_CONDENSITY")
   129  	if v != "" {
   130  		i, err := strconv.Atoi(v)
   131  		if err != nil {
   132  			panic(err)
   133  		}
   134  		if i < 3 || i > 9 {
   135  			panic("DECOMPRESS_CONDENSITY: only numbers in range 3-9 are acceptable ")
   136  		}
   137  		condensePatternTableBitThreshold = i
   138  		fmt.Printf("set DECOMPRESS_CONDENSITY to %d\n", i)
   139  	}
   140  }
   141  
// SetDecompressionTableCondensity sets the condensing threshold: pattern
// tables whose bit length is greater than fromBitSize are built in condensed
// form. The value is stored as-is (no range validation is performed here) and
// only affects tables built afterwards.
func SetDecompressionTableCondensity(fromBitSize int) {
	condensePatternTableBitThreshold = fromBitSize
}
   145  
   146  func NewDecompressor(compressedFilePath string) (d *Decompressor, err error) {
   147  	_, fName := filepath.Split(compressedFilePath)
   148  	d = &Decompressor{
   149  		filePath: compressedFilePath,
   150  		fileName: fName,
   151  	}
   152  	defer func() {
   153  
   154  		if rec := recover(); rec != nil {
   155  			err = fmt.Errorf("decompressing file: %s, %+v, trace: %s", compressedFilePath, rec, dbg.Stack())
   156  		}
   157  	}()
   158  
   159  	d.f, err = os.Open(compressedFilePath)
   160  	if err != nil {
   161  		return nil, err
   162  	}
   163  	var stat os.FileInfo
   164  	if stat, err = d.f.Stat(); err != nil {
   165  		return nil, err
   166  	}
   167  	d.size = stat.Size()
   168  	if d.size < 32 {
   169  		return nil, fmt.Errorf("compressed file is too short: %d", d.size)
   170  	}
   171  	d.modTime = stat.ModTime()
   172  	if d.mmapHandle1, d.mmapHandle2, err = mmap.Mmap(d.f, int(d.size)); err != nil {
   173  		return nil, err
   174  	}
   175  	// read patterns from file
   176  	d.data = d.mmapHandle1[:d.size]
   177  	defer d.EnableReadAhead().DisableReadAhead() //speedup opening on slow drives
   178  
   179  	d.wordsCount = binary.BigEndian.Uint64(d.data[:8])
   180  	d.emptyWordsCount = binary.BigEndian.Uint64(d.data[8:16])
   181  	dictSize := binary.BigEndian.Uint64(d.data[16:24])
   182  	data := d.data[24 : 24+dictSize]
   183  
   184  	var depths []uint64
   185  	var patterns [][]byte
   186  	var i uint64
   187  	var patternMaxDepth uint64
   188  
   189  	for i < dictSize {
   190  		d, ns := binary.Uvarint(data[i:])
   191  		if d > 64 { // mainnet has maxDepth 31
   192  			return nil, fmt.Errorf("dictionary is invalid: patternMaxDepth=%d", d)
   193  		}
   194  		depths = append(depths, d)
   195  		if d > patternMaxDepth {
   196  			patternMaxDepth = d
   197  		}
   198  		i += uint64(ns)
   199  		l, n := binary.Uvarint(data[i:])
   200  		i += uint64(n)
   201  		patterns = append(patterns, data[i:i+l])
   202  		//fmt.Printf("depth = %d, pattern = [%x]\n", d, data[i:i+l])
   203  		i += l
   204  	}
   205  
   206  	if dictSize > 0 {
   207  		var bitLen int
   208  		if patternMaxDepth > 9 {
   209  			bitLen = 9
   210  		} else {
   211  			bitLen = int(patternMaxDepth)
   212  		}
   213  		// fmt.Printf("pattern maxDepth=%d\n", tree.maxDepth)
   214  		d.dict = newPatternTable(bitLen)
   215  		buildCondensedPatternTable(d.dict, depths, patterns, 0, 0, 0, patternMaxDepth)
   216  	}
   217  
   218  	// read positions
   219  	pos := 24 + dictSize
   220  	dictSize = binary.BigEndian.Uint64(d.data[pos : pos+8])
   221  	data = d.data[pos+8 : pos+8+dictSize]
   222  
   223  	var posDepths []uint64
   224  	var poss []uint64
   225  	var posMaxDepth uint64
   226  
   227  	i = 0
   228  	for i < dictSize {
   229  		d, ns := binary.Uvarint(data[i:])
   230  		if d > 2048 {
   231  			return nil, fmt.Errorf("dictionary is invalid: posMaxDepth=%d", d)
   232  		}
   233  		posDepths = append(posDepths, d)
   234  		if d > posMaxDepth {
   235  			posMaxDepth = d
   236  		}
   237  		i += uint64(ns)
   238  		pos, n := binary.Uvarint(data[i:])
   239  		i += uint64(n)
   240  		poss = append(poss, pos)
   241  	}
   242  
   243  	if dictSize > 0 {
   244  		var bitLen int
   245  		if posMaxDepth > 9 {
   246  			bitLen = 9
   247  		} else {
   248  			bitLen = int(posMaxDepth)
   249  		}
   250  		//fmt.Printf("pos maxDepth=%d\n", tree.maxDepth)
   251  		tableSize := 1 << bitLen
   252  		d.posDict = &posTable{
   253  			bitLen: bitLen,
   254  			pos:    make([]uint64, tableSize),
   255  			lens:   make([]byte, tableSize),
   256  			ptrs:   make([]*posTable, tableSize),
   257  		}
   258  		buildPosTable(posDepths, poss, d.posDict, 0, 0, 0, posMaxDepth)
   259  	}
   260  	d.wordsStart = pos + 8 + dictSize
   261  	return d, nil
   262  }
   263  
   264  func buildCondensedPatternTable(table *patternTable, depths []uint64, patterns [][]byte, code uint16, bits int, depth uint64, maxDepth uint64) int {
   265  	if len(depths) == 0 {
   266  		return 0
   267  	}
   268  	if depth == depths[0] {
   269  		pattern := word(patterns[0])
   270  		//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pattern=[%x]\n", depth, maxDepth, code, bits, pattern)
   271  		cw := &codeword{code: code, pattern: &pattern, len: byte(bits), ptr: nil}
   272  		table.insertWord(cw)
   273  		return 1
   274  	}
   275  	if bits == 9 {
   276  		var bitLen int
   277  		if maxDepth > 9 {
   278  			bitLen = 9
   279  		} else {
   280  			bitLen = int(maxDepth)
   281  		}
   282  		cw := &codeword{code: code, pattern: nil, len: byte(0), ptr: newPatternTable(bitLen)}
   283  		table.insertWord(cw)
   284  		return buildCondensedPatternTable(cw.ptr, depths, patterns, 0, 0, depth, maxDepth)
   285  	}
   286  	b0 := buildCondensedPatternTable(table, depths, patterns, code, bits+1, depth+1, maxDepth-1)
   287  	return b0 + buildCondensedPatternTable(table, depths[b0:], patterns[b0:], (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1)
   288  }
   289  
   290  func buildPosTable(depths []uint64, poss []uint64, table *posTable, code uint16, bits int, depth uint64, maxDepth uint64) int {
   291  	if len(depths) == 0 {
   292  		return 0
   293  	}
   294  	if depth == depths[0] {
   295  		p := poss[0]
   296  		//fmt.Printf("depth=%d, maxDepth=%d, code=[%b], codeLen=%d, pos=%d\n", depth, maxDepth, code, bits, p)
   297  		if table.bitLen == bits {
   298  			table.pos[code] = p
   299  			table.lens[code] = byte(bits)
   300  			table.ptrs[code] = nil
   301  		} else {
   302  			codeStep := uint16(1) << bits
   303  			codeFrom := code
   304  			codeTo := code | (uint16(1) << table.bitLen)
   305  			for c := codeFrom; c < codeTo; c += codeStep {
   306  				table.pos[c] = p
   307  				table.lens[c] = byte(bits)
   308  				table.ptrs[c] = nil
   309  			}
   310  		}
   311  		return 1
   312  	}
   313  	if bits == 9 {
   314  		var bitLen int
   315  		if maxDepth > 9 {
   316  			bitLen = 9
   317  		} else {
   318  			bitLen = int(maxDepth)
   319  		}
   320  		tableSize := 1 << bitLen
   321  		newTable := &posTable{
   322  			bitLen: bitLen,
   323  			pos:    make([]uint64, tableSize),
   324  			lens:   make([]byte, tableSize),
   325  			ptrs:   make([]*posTable, tableSize),
   326  		}
   327  		table.pos[code] = 0
   328  		table.lens[code] = byte(0)
   329  		table.ptrs[code] = newTable
   330  		return buildPosTable(depths, poss, newTable, 0, 0, depth, maxDepth)
   331  	}
   332  	b0 := buildPosTable(depths, poss, table, code, bits+1, depth+1, maxDepth-1)
   333  	return b0 + buildPosTable(depths[b0:], poss[b0:], table, (uint16(1)<<bits)|code, bits+1, depth+1, maxDepth-1)
   334  }
   335  
// Size returns the size of the compressed file in bytes, as recorded when the
// Decompressor was opened.
func (d *Decompressor) Size() int64 {
	return d.size
}
   339  
// ModTime returns the modification time of the compressed file, as recorded
// when the Decompressor was opened.
func (d *Decompressor) ModTime() time.Time {
	return d.modTime
}
   343  
   344  func (d *Decompressor) Close() {
   345  	if d.f != nil {
   346  		if err := mmap.Munmap(d.mmapHandle1, d.mmapHandle2); err != nil {
   347  			log.Log(dbg.FileCloseLogLevel, "unmap", "err", err, "file", d.FileName(), "stack", dbg.Stack())
   348  		}
   349  		if err := d.f.Close(); err != nil {
   350  			log.Log(dbg.FileCloseLogLevel, "close", "err", err, "file", d.FileName(), "stack", dbg.Stack())
   351  		}
   352  		d.f = nil
   353  	}
   354  }
   355  
// FilePath returns the path that was passed to NewDecompressor.
func (d *Decompressor) FilePath() string { return d.filePath }

// FileName returns the final path element of FilePath.
func (d *Decompressor) FileName() string { return d.fileName }
   358  
   359  // WithReadAhead - Expect read in sequential order. (Hence, pages in the given range can be aggressively read ahead, and may be freed soon after they are accessed.)
   360  func (d *Decompressor) WithReadAhead(f func() error) error {
   361  	if d == nil || d.mmapHandle1 == nil {
   362  		return nil
   363  	}
   364  	_ = mmap.MadviseSequential(d.mmapHandle1)
   365  	//_ = mmap.MadviseWillNeed(d.mmapHandle1)
   366  	defer mmap.MadviseRandom(d.mmapHandle1)
   367  	return f()
   368  }
   369  
   370  // DisableReadAhead - usage: `defer d.EnableReadAhead().DisableReadAhead()`. Please don't use this funcs without `defer` to avoid leak.
   371  func (d *Decompressor) DisableReadAhead() {
   372  	if d == nil || d.mmapHandle1 == nil {
   373  		return
   374  	}
   375  	_ = mmap.MadviseRandom(d.mmapHandle1)
   376  }
   377  func (d *Decompressor) EnableReadAhead() *Decompressor {
   378  	if d == nil || d.mmapHandle1 == nil {
   379  		return d
   380  	}
   381  	_ = mmap.MadviseSequential(d.mmapHandle1)
   382  	return d
   383  }
   384  func (d *Decompressor) EnableMadvNormal() *Decompressor {
   385  	if d == nil || d.mmapHandle1 == nil {
   386  		return d
   387  	}
   388  	_ = mmap.MadviseNormal(d.mmapHandle1)
   389  	return d
   390  }
   391  func (d *Decompressor) EnableWillNeed() *Decompressor {
   392  	if d == nil || d.mmapHandle1 == nil {
   393  		return d
   394  	}
   395  	_ = mmap.MadviseWillNeed(d.mmapHandle1)
   396  	return d
   397  }
   398  
// Getter represents a "reader" or "iterator" that can move across the data of the decompressor.
// The full state of the getter can be captured by saving dataP and dataBit.
type Getter struct {
	patternDict *patternTable // pattern lookup table shared with the Decompressor
	posDict     *posTable     // position lookup table shared with the Decompressor
	fName       string        // file name, used in diagnostics
	data        []byte        // compressed words (file contents from wordsStart onwards)
	dataP       uint64        // current byte offset into data
	dataBit     int           // Value 0..7 - position of the bit
	trace       bool
}
   410  
// Trace sets the getter's trace flag.
func (g *Getter) Trace(t bool) { g.trace = t }

// FileName returns the name of the file this getter reads from.
func (g *Getter) FileName() string { return g.fName }
   413  
   414  func (g *Getter) nextPos(clean bool) (pos uint64) {
   415  	if clean && g.dataBit > 0 {
   416  		g.dataP++
   417  		g.dataBit = 0
   418  	}
   419  	table := g.posDict
   420  	if table.bitLen == 0 {
   421  		return table.pos[0]
   422  	}
   423  	for l := byte(0); l == 0; {
   424  		code := uint16(g.data[g.dataP]) >> g.dataBit
   425  		if 8-g.dataBit < table.bitLen && int(g.dataP)+1 < len(g.data) {
   426  			code |= uint16(g.data[g.dataP+1]) << (8 - g.dataBit)
   427  		}
   428  		code &= (uint16(1) << table.bitLen) - 1
   429  		l = table.lens[code]
   430  		if l == 0 {
   431  			table = table.ptrs[code]
   432  			g.dataBit += 9
   433  		} else {
   434  			g.dataBit += int(l)
   435  			pos = table.pos[code]
   436  		}
   437  		g.dataP += uint64(g.dataBit / 8)
   438  		g.dataBit %= 8
   439  	}
   440  	return pos
   441  }
   442  
   443  func (g *Getter) nextPattern() []byte {
   444  	table := g.patternDict
   445  
   446  	if table.bitLen == 0 {
   447  		return *table.patterns[0].pattern
   448  	}
   449  
   450  	var l byte
   451  	var pattern []byte
   452  	for l == 0 {
   453  		code := uint16(g.data[g.dataP]) >> g.dataBit
   454  		if 8-g.dataBit < table.bitLen && int(g.dataP)+1 < len(g.data) {
   455  			code |= uint16(g.data[g.dataP+1]) << (8 - g.dataBit)
   456  		}
   457  		code &= (uint16(1) << table.bitLen) - 1
   458  
   459  		cw := table.condensedTableSearch(code)
   460  		l = cw.len
   461  		if l == 0 {
   462  			table = cw.ptr
   463  			g.dataBit += 9
   464  		} else {
   465  			g.dataBit += int(l)
   466  			pattern = *cw.pattern
   467  		}
   468  		g.dataP += uint64(g.dataBit / 8)
   469  		g.dataBit %= 8
   470  	}
   471  	return pattern
   472  }
   473  
   474  var condensedWordDistances = buildCondensedWordDistances()
   475  
   476  func checkDistance(power int, d int) bool {
   477  	for _, dist := range condensedWordDistances[power] {
   478  		if dist == d {
   479  			return true
   480  		}
   481  	}
   482  	return false
   483  }
   484  
   485  func buildCondensedWordDistances() [][]int {
   486  	dist2 := make([][]int, 10)
   487  	for i := 1; i <= 9; i++ {
   488  		dl := make([]int, 0)
   489  		for j := 1 << i; j < 512; j += 1 << i {
   490  			dl = append(dl, j)
   491  		}
   492  		dist2[i] = dl
   493  	}
   494  	return dist2
   495  }
   496  
// Size returns the length in bytes of the data this getter iterates over.
func (g *Getter) Size() int {
	return len(g.data)
}
   500  
// Count returns the number of words recorded in the file header.
func (d *Decompressor) Count() int { return int(d.wordsCount) }

// EmptyWordsCount returns the number of empty words recorded in the file header.
func (d *Decompressor) EmptyWordsCount() int { return int(d.emptyWordsCount) }
   503  
   504  // MakeGetter creates an object that can be used to access superstrings in the decompressor's file
   505  // Getter is not thread-safe, but there can be multiple getters used simultaneously and concurrently
   506  // for the same decompressor
   507  func (d *Decompressor) MakeGetter() *Getter {
   508  	return &Getter{
   509  		posDict:     d.posDict,
   510  		data:        d.data[d.wordsStart:],
   511  		patternDict: d.dict,
   512  		fName:       d.fileName,
   513  	}
   514  }
   515  
   516  func (g *Getter) Reset(offset uint64) {
   517  	g.dataP = offset
   518  	g.dataBit = 0
   519  }
   520  
   521  func (g *Getter) HasNext() bool {
   522  	return g.dataP < uint64(len(g.data))
   523  }
   524  
   525  // Next extracts a compressed word from current offset in the file
   526  // and appends it to the given buf, returning the result of appending
   527  // After extracting next word, it moves to the beginning of the next one
   528  func (g *Getter) Next(buf []byte) ([]byte, uint64) {
   529  	savePos := g.dataP
   530  	wordLen := g.nextPos(true)
   531  	wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
   532  	if wordLen == 0 {
   533  		if g.dataBit > 0 {
   534  			g.dataP++
   535  			g.dataBit = 0
   536  		}
   537  		if buf == nil { // wordLen == 0, means we have valid record of 0 size. nil - is the marker of "something not found"
   538  			buf = []byte{}
   539  		}
   540  		return buf, g.dataP
   541  	}
   542  	bufPos := len(buf) // Tracking position in buf where to insert part of the word
   543  	lastUncovered := len(buf)
   544  	if len(buf)+int(wordLen) > cap(buf) {
   545  		newBuf := make([]byte, len(buf)+int(wordLen))
   546  		copy(newBuf, buf)
   547  		buf = newBuf
   548  	} else {
   549  		// Expand buffer
   550  		buf = buf[:len(buf)+int(wordLen)]
   551  	}
   552  	// Loop below fills in the patterns
   553  	for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
   554  		bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another
   555  		pt := g.nextPattern()
   556  		copy(buf[bufPos:], pt)
   557  	}
   558  	if g.dataBit > 0 {
   559  		g.dataP++
   560  		g.dataBit = 0
   561  	}
   562  	postLoopPos := g.dataP
   563  	g.dataP = savePos
   564  	g.dataBit = 0
   565  	g.nextPos(true /* clean */) // Reset the state of huffman reader
   566  	bufPos = lastUncovered      // Restore to the beginning of buf
   567  	// Loop below fills the data which is not in the patterns
   568  	for pos := g.nextPos(false); pos != 0; pos = g.nextPos(false) {
   569  		bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another
   570  		if bufPos > lastUncovered {
   571  			dif := uint64(bufPos - lastUncovered)
   572  			copy(buf[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif])
   573  			postLoopPos += dif
   574  		}
   575  		lastUncovered = bufPos + len(g.nextPattern())
   576  	}
   577  	if int(wordLen) > lastUncovered {
   578  		dif := wordLen - uint64(lastUncovered)
   579  		copy(buf[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif])
   580  		postLoopPos += dif
   581  	}
   582  	g.dataP = postLoopPos
   583  	g.dataBit = 0
   584  	return buf, postLoopPos
   585  }
   586  
   587  func (g *Getter) NextUncompressed() ([]byte, uint64) {
   588  	wordLen := g.nextPos(true)
   589  	wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
   590  	if wordLen == 0 {
   591  		if g.dataBit > 0 {
   592  			g.dataP++
   593  			g.dataBit = 0
   594  		}
   595  		return g.data[g.dataP:g.dataP], g.dataP
   596  	}
   597  	g.nextPos(false)
   598  	if g.dataBit > 0 {
   599  		g.dataP++
   600  		g.dataBit = 0
   601  	}
   602  	pos := g.dataP
   603  	g.dataP += wordLen
   604  	return g.data[pos:g.dataP], g.dataP
   605  }
   606  
   607  // Skip moves offset to the next word and returns the new offset and the length of the word.
   608  func (g *Getter) Skip() (uint64, int) {
   609  	l := g.nextPos(true)
   610  	l-- // because when create huffman tree we do ++ , because 0 is terminator
   611  	if l == 0 {
   612  		if g.dataBit > 0 {
   613  			g.dataP++
   614  			g.dataBit = 0
   615  		}
   616  		return g.dataP, 0
   617  	}
   618  	wordLen := int(l)
   619  
   620  	var add uint64
   621  	var bufPos int
   622  	var lastUncovered int
   623  	for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
   624  		bufPos += int(pos) - 1
   625  		if wordLen < bufPos {
   626  			panic(fmt.Sprintf("likely .idx is invalid: %s", g.fName))
   627  		}
   628  		if bufPos > lastUncovered {
   629  			add += uint64(bufPos - lastUncovered)
   630  		}
   631  		lastUncovered = bufPos + len(g.nextPattern())
   632  	}
   633  	if g.dataBit > 0 {
   634  		g.dataP++
   635  		g.dataBit = 0
   636  	}
   637  	if int(l) > lastUncovered {
   638  		add += l - uint64(lastUncovered)
   639  	}
   640  	// Uncovered characters
   641  	g.dataP += add
   642  	return g.dataP, wordLen
   643  }
   644  
   645  func (g *Getter) SkipUncompressed() (uint64, int) {
   646  	wordLen := g.nextPos(true)
   647  	wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
   648  	if wordLen == 0 {
   649  		if g.dataBit > 0 {
   650  			g.dataP++
   651  			g.dataBit = 0
   652  		}
   653  		return g.dataP, 0
   654  	}
   655  	g.nextPos(false)
   656  	if g.dataBit > 0 {
   657  		g.dataP++
   658  		g.dataBit = 0
   659  	}
   660  	g.dataP += wordLen
   661  	return g.dataP, int(wordLen)
   662  }
   663  
   664  // Match returns true and next offset if the word at current offset fully matches the buf
   665  // returns false and current offset otherwise.
   666  func (g *Getter) Match(buf []byte) (bool, uint64) {
   667  	savePos := g.dataP
   668  	wordLen := g.nextPos(true)
   669  	wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
   670  	lenBuf := len(buf)
   671  	if wordLen == 0 || int(wordLen) != lenBuf {
   672  		if g.dataBit > 0 {
   673  			g.dataP++
   674  			g.dataBit = 0
   675  		}
   676  		if lenBuf != 0 {
   677  			g.dataP, g.dataBit = savePos, 0
   678  		}
   679  		return lenBuf == int(wordLen), g.dataP
   680  	}
   681  
   682  	var bufPos int
   683  	// In the first pass, we only check patterns
   684  	for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
   685  		bufPos += int(pos) - 1
   686  		pattern := g.nextPattern()
   687  		if lenBuf < bufPos+len(pattern) || !bytes.Equal(buf[bufPos:bufPos+len(pattern)], pattern) {
   688  			g.dataP, g.dataBit = savePos, 0
   689  			return false, savePos
   690  		}
   691  	}
   692  	if g.dataBit > 0 {
   693  		g.dataP++
   694  		g.dataBit = 0
   695  	}
   696  	postLoopPos := g.dataP
   697  	g.dataP, g.dataBit = savePos, 0
   698  	g.nextPos(true /* clean */) // Reset the state of huffman decoder
   699  	// Second pass - we check spaces not covered by the patterns
   700  	var lastUncovered int
   701  	bufPos = 0
   702  	for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
   703  		bufPos += int(pos) - 1
   704  		if bufPos > lastUncovered {
   705  			dif := uint64(bufPos - lastUncovered)
   706  			if lenBuf < bufPos || !bytes.Equal(buf[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif]) {
   707  				g.dataP, g.dataBit = savePos, 0
   708  				return false, savePos
   709  			}
   710  			postLoopPos += dif
   711  		}
   712  		lastUncovered = bufPos + len(g.nextPattern())
   713  	}
   714  	if int(wordLen) > lastUncovered {
   715  		dif := wordLen - uint64(lastUncovered)
   716  		if lenBuf < int(wordLen) || !bytes.Equal(buf[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif]) {
   717  			g.dataP, g.dataBit = savePos, 0
   718  			return false, savePos
   719  		}
   720  		postLoopPos += dif
   721  	}
   722  	if lenBuf != int(wordLen) {
   723  		g.dataP, g.dataBit = savePos, 0
   724  		return false, savePos
   725  	}
   726  	g.dataP, g.dataBit = postLoopPos, 0
   727  	return true, postLoopPos
   728  }
   729  
// MatchPrefix only checks if the word at the current offset has a buf prefix. Does not move offset to the next word.
// The getter's offset is restored by the deferred function on every return path.
func (g *Getter) MatchPrefix(prefix []byte) bool {
	savePos := g.dataP
	defer func() {
		g.dataP, g.dataBit = savePos, 0
	}()

	wordLen := g.nextPos(true /* clean */)
	wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
	prefixLen := len(prefix)
	// An empty word, or a word shorter than the prefix, is decided by length alone:
	// the result is true only when both are empty.
	if wordLen == 0 || int(wordLen) < prefixLen {
		if g.dataBit > 0 {
			g.dataP++
			g.dataBit = 0
		}
		if prefixLen != 0 {
			g.dataP, g.dataBit = savePos, 0
		}
		return prefixLen == int(wordLen)
	}

	var bufPos int
	// In the first pass, we only check patterns
	// Every (position, pattern) pair is consumed so that g.dataP ends up at the
	// start of the raw bytes; only patterns that begin inside the prefix are compared.
	for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
		bufPos += int(pos) - 1
		pattern := g.nextPattern()
		// Clip the comparison to the part of the pattern that overlaps the prefix.
		var comparisonLen int
		if prefixLen < bufPos+len(pattern) {
			comparisonLen = prefixLen - bufPos
		} else {
			comparisonLen = len(pattern)
		}
		if bufPos < prefixLen {
			if !bytes.Equal(prefix[bufPos:bufPos+comparisonLen], pattern[:comparisonLen]) {
				return false
			}
		}
	}

	if g.dataBit > 0 {
		g.dataP++
		g.dataBit = 0
	}
	// The raw (uncovered) bytes start right after the code stream.
	postLoopPos := g.dataP
	g.dataP, g.dataBit = savePos, 0
	g.nextPos(true /* clean */) // Reset the state of huffman decoder
	// Second pass - we check spaces not covered by the patterns
	// This pass stops as soon as the covered region reaches the end of the prefix.
	var lastUncovered int
	bufPos = 0
	for pos := g.nextPos(false /* clean */); pos != 0 && lastUncovered < prefixLen; pos = g.nextPos(false) {
		bufPos += int(pos) - 1
		if bufPos > lastUncovered {
			dif := uint64(bufPos - lastUncovered)
			// Clip the gap to the part that overlaps the prefix.
			var comparisonLen int
			if prefixLen < lastUncovered+int(dif) {
				comparisonLen = prefixLen - lastUncovered
			} else {
				comparisonLen = int(dif)
			}
			if !bytes.Equal(prefix[lastUncovered:lastUncovered+comparisonLen], g.data[postLoopPos:postLoopPos+uint64(comparisonLen)]) {
				return false
			}
			postLoopPos += dif
		}
		lastUncovered = bufPos + len(g.nextPattern())
	}
	// Trailing raw bytes after the last pattern, if the prefix extends into them.
	if prefixLen > lastUncovered && int(wordLen) > lastUncovered {
		dif := wordLen - uint64(lastUncovered)
		var comparisonLen int
		if prefixLen < int(wordLen) {
			comparisonLen = prefixLen - lastUncovered
		} else {
			comparisonLen = int(dif)
		}
		if !bytes.Equal(prefix[lastUncovered:lastUncovered+comparisonLen], g.data[postLoopPos:postLoopPos+uint64(comparisonLen)]) {
			return false
		}
	}
	return true
}
   811  
   812  // MatchCmp lexicographically compares given buf with the word at the current offset in the file.
   813  // returns 0 if buf == word, -1 if buf < word, 1 if buf > word
   814  func (g *Getter) MatchCmp(buf []byte) int {
   815  	savePos := g.dataP
   816  	wordLen := g.nextPos(true)
   817  	wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
   818  	lenBuf := len(buf)
   819  	if wordLen == 0 && lenBuf != 0 {
   820  		g.dataP, g.dataBit = savePos, 0
   821  		return 1
   822  	}
   823  	if wordLen == 0 && lenBuf == 0 {
   824  		if g.dataBit > 0 {
   825  			g.dataP++
   826  			g.dataBit = 0
   827  		}
   828  		return 0
   829  	}
   830  
   831  	decoded := make([]byte, wordLen)
   832  	var bufPos int
   833  	// In the first pass, we only check patterns
   834  	for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
   835  		bufPos += int(pos) - 1
   836  		pattern := g.nextPattern()
   837  		copy(decoded[bufPos:], pattern)
   838  	}
   839  	if g.dataBit > 0 {
   840  		g.dataP++
   841  		g.dataBit = 0
   842  	}
   843  	postLoopPos := g.dataP
   844  	g.dataP, g.dataBit = savePos, 0
   845  	g.nextPos(true /* clean */) // Reset the state of huffman decoder
   846  	// Second pass - we check spaces not covered by the patterns
   847  	var lastUncovered int
   848  	bufPos = 0
   849  	for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
   850  		bufPos += int(pos) - 1
   851  		// fmt.Printf("BUF POS: %d, POS: %d, lastUncovered: %d\n", bufPos, pos, lastUncovered)
   852  		if bufPos > lastUncovered {
   853  			dif := uint64(bufPos - lastUncovered)
   854  			copy(decoded[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif])
   855  			postLoopPos += dif
   856  		}
   857  		lastUncovered = bufPos + len(g.nextPattern())
   858  	}
   859  
   860  	if int(wordLen) > lastUncovered {
   861  		dif := wordLen - uint64(lastUncovered)
   862  		copy(decoded[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif])
   863  		postLoopPos += dif
   864  	}
   865  	cmp := bytes.Compare(buf, decoded)
   866  	if cmp == 0 {
   867  		g.dataP, g.dataBit = postLoopPos, 0
   868  	} else {
   869  		g.dataP, g.dataBit = savePos, 0
   870  	}
   871  	return cmp
   872  }
   873  
   874  // MatchPrefixCmp lexicographically compares given prefix with the word at the current offset in the file.
   875  // returns 0 if buf == word, -1 if buf < word, 1 if buf > word
   876  func (g *Getter) MatchPrefixCmp(prefix []byte) int {
   877  	savePos := g.dataP
   878  	defer func() {
   879  		g.dataP, g.dataBit = savePos, 0
   880  	}()
   881  
   882  	wordLen := g.nextPos(true /* clean */)
   883  	wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
   884  	prefixLen := len(prefix)
   885  	if wordLen == 0 && prefixLen != 0 {
   886  		return 1
   887  	}
   888  	if prefixLen == 0 {
   889  		return 0
   890  	}
   891  
   892  	decoded := make([]byte, wordLen)
   893  	var bufPos int
   894  	// In the first pass, we only check patterns
   895  	// Only run this loop as far as the prefix goes, there is no need to check further
   896  	for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
   897  		bufPos += int(pos) - 1
   898  		if bufPos > prefixLen {
   899  			break
   900  		}
   901  		pattern := g.nextPattern()
   902  		copy(decoded[bufPos:], pattern)
   903  	}
   904  
   905  	if g.dataBit > 0 {
   906  		g.dataP++
   907  		g.dataBit = 0
   908  	}
   909  	postLoopPos := g.dataP
   910  	g.dataP, g.dataBit = savePos, 0
   911  	g.nextPos(true /* clean */) // Reset the state of huffman decoder
   912  	// Second pass - we check spaces not covered by the patterns
   913  	var lastUncovered int
   914  	bufPos = 0
   915  	for pos := g.nextPos(false /* clean */); pos != 0 && lastUncovered < prefixLen; pos = g.nextPos(false) {
   916  		bufPos += int(pos) - 1
   917  		if bufPos > lastUncovered {
   918  			dif := uint64(bufPos - lastUncovered)
   919  			copy(decoded[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif])
   920  			postLoopPos += dif
   921  		}
   922  		lastUncovered = bufPos + len(g.nextPattern())
   923  	}
   924  	if prefixLen > lastUncovered && int(wordLen) > lastUncovered {
   925  		dif := wordLen - uint64(lastUncovered)
   926  		copy(decoded[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif])
   927  		// postLoopPos += dif
   928  	}
   929  	var cmp int
   930  	if prefixLen > int(wordLen) {
   931  		// TODO(racytech): handle this case
   932  		// e.g: prefix = 'aaacb'
   933  		// 		word = 'aaa'
   934  		cmp = bytes.Compare(prefix, decoded)
   935  	} else {
   936  		cmp = bytes.Compare(prefix, decoded[:prefixLen])
   937  	}
   938  
   939  	return cmp
   940  }
   941  
   942  func (g *Getter) MatchPrefixUncompressed(prefix []byte) int {
   943  	savePos := g.dataP
   944  	defer func() {
   945  		g.dataP, g.dataBit = savePos, 0
   946  	}()
   947  
   948  	wordLen := g.nextPos(true /* clean */)
   949  	wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
   950  	prefixLen := len(prefix)
   951  	if wordLen == 0 && prefixLen != 0 {
   952  		return 1
   953  	}
   954  	if prefixLen == 0 {
   955  		return 0
   956  	}
   957  
   958  	g.nextPos(true)
   959  
   960  	// if prefixLen > int(wordLen) {
   961  	// 	// TODO(racytech): handle this case
   962  	// 	// e.g: prefix = 'aaacb'
   963  	// 	// 		word = 'aaa'
   964  	// }
   965  
   966  	return bytes.Compare(prefix, g.data[g.dataP:g.dataP+wordLen])
   967  }
   968  
   969  // FastNext extracts a compressed word from current offset in the file
   970  // into the given buf, returning a new byte slice which contains extracted word.
   971  // It is important to allocate enough buf size. Could throw an error if word in file is larger then the buf size.
   972  // After extracting next word, it moves to the beginning of the next one
   973  func (g *Getter) FastNext(buf []byte) ([]byte, uint64) {
   974  	defer func() {
   975  		if rec := recover(); rec != nil {
   976  			panic(fmt.Sprintf("file: %s, %s, %s", g.fName, rec, dbg.Stack()))
   977  		}
   978  	}()
   979  
   980  	savePos := g.dataP
   981  	wordLen := g.nextPos(true)
   982  	wordLen-- // because when create huffman tree we do ++ , because 0 is terminator
   983  	// decoded := make([]byte, wordLen)
   984  	if wordLen == 0 {
   985  		if g.dataBit > 0 {
   986  			g.dataP++
   987  			g.dataBit = 0
   988  		}
   989  		return buf[:wordLen], g.dataP
   990  	}
   991  	bufPos := 0 // Tracking position in buf where to insert part of the word
   992  	lastUncovered := 0
   993  
   994  	// if int(wordLen) > cap(buf) {
   995  	// 	newBuf := make([]byte, int(wordLen))
   996  	// 	buf = newBuf
   997  	// }
   998  	// Loop below fills in the patterns
   999  	for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) {
  1000  		bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another
  1001  		pt := g.nextPattern()
  1002  		copy(buf[bufPos:], pt)
  1003  	}
  1004  	if g.dataBit > 0 {
  1005  		g.dataP++
  1006  		g.dataBit = 0
  1007  	}
  1008  	postLoopPos := g.dataP
  1009  	g.dataP = savePos
  1010  	g.dataBit = 0
  1011  	g.nextPos(true /* clean */) // Reset the state of huffman reader
  1012  	bufPos = lastUncovered      // Restore to the beginning of buf
  1013  	// Loop below fills the data which is not in the patterns
  1014  	for pos := g.nextPos(false); pos != 0; pos = g.nextPos(false) {
  1015  		bufPos += int(pos) - 1 // Positions where to insert patterns are encoded relative to one another
  1016  		if bufPos > lastUncovered {
  1017  			dif := uint64(bufPos - lastUncovered)
  1018  			copy(buf[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif])
  1019  			postLoopPos += dif
  1020  		}
  1021  		lastUncovered = bufPos + len(g.nextPattern())
  1022  	}
  1023  	if int(wordLen) > lastUncovered {
  1024  		dif := wordLen - uint64(lastUncovered)
  1025  		copy(buf[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif])
  1026  		postLoopPos += dif
  1027  	}
  1028  	g.dataP = postLoopPos
  1029  	g.dataBit = 0
  1030  	return buf[:wordLen], postLoopPos
  1031  }