github.com/ledgerwatch/erigon-lib@v1.0.0/recsplit/golomb_rice.go (about)

     1  /*
     2     Copyright 2021 Erigon contributors
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package recsplit
    18  
    19  import (
    20  	"encoding/binary"
    21  	"io"
    22  	"math/bits"
    23  	"unsafe"
    24  
    25  	"github.com/ledgerwatch/erigon-lib/common/bitutil"
    26  )
    27  
// bijMemo holds the optimal Golomb-Rice parameters for leaves.
// NOTE(review): presumably indexed by leaf size (number of keys in the leaf) — confirm against the code that reads this table.
var bijMemo = []uint32{0, 0, 0, 1, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16, 18, 19, 21, 22, 23, 25, 26, 28, 29, 30}
    30  
    31  // GolombRice can build up the golomb-rice encoding of the sequeuce of numbers, as well as read the numbers back from it.
    32  type GolombRice struct {
    33  	data     []uint64 // Present in the builder and in the reader
    34  	bitCount int      // Speficic to the builder - number of bits added to the encoding so far
    35  }
    36  
    37  // appendUnaryAll adds the unary encoding of specified sequence of numbers to the end of the
    38  // current encoding
    39  func (g *GolombRice) appendUnaryAll(unary []uint64) {
    40  	bitInc := 0
    41  	for _, u := range unary {
    42  		// Each number u uses u+1 bits for its unary representation
    43  		bitInc += int(u) + 1
    44  	}
    45  	targetSize := (g.bitCount + bitInc + 63) / 64
    46  	for len(g.data) < targetSize {
    47  		g.data = append(g.data, 0)
    48  	}
    49  
    50  	for _, u := range unary {
    51  		g.bitCount += int(u)
    52  		appendPtr := g.bitCount / 64
    53  		g.data[appendPtr] |= uint64(1) << (g.bitCount & 63)
    54  		g.bitCount++
    55  	}
    56  }
    57  
    58  // appendFixed encodes the next value using specified Golomb parameter. Since we are using Golomb-Rice encoding,
    59  // all Golomb parameters are powers of two. Therefore we input log2 of golomb parameter, rather than golomn paramter itself,
    60  // for convinience
    61  func (g *GolombRice) appendFixed(v uint64, log2golomb int) {
    62  	if log2golomb == 0 {
    63  		return
    64  	}
    65  	lowerBits := v & ((uint64(1) << log2golomb) - 1) // Extract the part of the number that will be encoded using truncated binary encoding
    66  	usedBits := g.bitCount & 63                      // How many bits of the last element of b.data is used by previous value
    67  	targetSize := (g.bitCount + log2golomb + 63) / 64
    68  	//fmt.Printf("g.bitCount = %d, log2golomb = %d, targetSize = %d\n", g.bitCount, log2golomb, targetSize)
    69  	for len(g.data) < targetSize {
    70  		g.data = append(g.data, 0)
    71  	}
    72  	appendPtr := g.bitCount / 64 // The index in b.data corresponding to the last element used by previous value, or if previous values fits perfectly, the index of the next free element
    73  	curWord := g.data[appendPtr]
    74  	curWord |= lowerBits << usedBits // curWord now contains the new value potentially combined with the part of the previous value
    75  	if usedBits+log2golomb > 64 {
    76  		// New value overflows to the next element
    77  		g.data[appendPtr] = curWord
    78  		appendPtr++
    79  		curWord = lowerBits >> (64 - usedBits) // curWord now contains the part of the new value that overflows
    80  	}
    81  	g.data[appendPtr] = curWord
    82  	g.bitCount += log2golomb
    83  }
    84  
    85  // Bits returns currrent number of bits in the compact encoding of the hash function representation
    86  func (g *GolombRice) Bits() int {
    87  	return g.bitCount
    88  }
    89  
    90  func (g *GolombRiceReader) ReadReset(bitPos, unaryOffset int) {
    91  	g.currFixedOffset = bitPos
    92  	unaryPos := bitPos + unaryOffset
    93  	g.currPtrUnary = unaryPos / 64
    94  	g.currWindowUnary = g.data[g.currPtrUnary] >> (unaryPos & 63)
    95  	g.currPtrUnary++
    96  	g.validLowerBitsUnary = 64 - (unaryPos & 63)
    97  }
    98  
    99  func (g *GolombRiceReader) SkipSubtree(nodes, fixedLen int) {
   100  	if nodes <= 0 {
   101  		panic("nodes <= 0")
   102  	}
   103  	missing := nodes
   104  	var cnt int
   105  	for cnt = bits.OnesCount64(g.currWindowUnary); cnt < missing; cnt = bits.OnesCount64(g.currWindowUnary) {
   106  		g.currWindowUnary = g.data[g.currPtrUnary]
   107  		g.currPtrUnary++
   108  		missing -= cnt
   109  		g.validLowerBitsUnary = 64
   110  	}
   111  	cnt = bitutil.Select64(g.currWindowUnary, missing-1)
   112  	g.currWindowUnary >>= cnt
   113  	g.currWindowUnary >>= 1
   114  	g.validLowerBitsUnary -= cnt + 1
   115  
   116  	g.currFixedOffset += fixedLen
   117  }
   118  
   119  func (g *GolombRiceReader) ReadNext(log2golomb int) uint64 {
   120  	var result uint64
   121  
   122  	if g.currWindowUnary == 0 {
   123  		result += uint64(g.validLowerBitsUnary)
   124  		g.currWindowUnary = g.data[g.currPtrUnary]
   125  		g.currPtrUnary++
   126  		g.validLowerBitsUnary = 64
   127  		for g.currWindowUnary == 0 {
   128  			result += 64
   129  			g.currWindowUnary = g.data[g.currPtrUnary]
   130  			g.currPtrUnary++
   131  		}
   132  	}
   133  
   134  	pos := bits.TrailingZeros64(g.currWindowUnary)
   135  
   136  	g.currWindowUnary >>= pos
   137  	g.currWindowUnary >>= 1
   138  	g.validLowerBitsUnary -= pos + 1
   139  
   140  	result += uint64(pos)
   141  	result <<= log2golomb
   142  
   143  	idx64 := g.currFixedOffset >> 6
   144  	var fixed uint64
   145  	shift := g.currFixedOffset & 63
   146  	fixed = g.data[idx64] >> shift
   147  	if shift+log2golomb > 64 {
   148  		fixed |= g.data[idx64+1] << (64 - shift)
   149  	}
   150  	result |= fixed & ((uint64(1) << log2golomb) - 1)
   151  	g.currFixedOffset += log2golomb
   152  	return result
   153  }
   154  
// Data returns the binary representation of the Golomb-Rice code that has been built.
// The returned slice is g.data itself rather than a copy, so it shares its backing
// storage with the builder.
func (g *GolombRice) Data() []uint64 {
	return g.data
}
   159  
// maxDataSize is the length of the byte-array type through which Write views the
// backing storage of the encoding; it acts as the upper bound for that cast.
const maxDataSize = 0xFFFFFFFFFFFF
   161  
   162  // Write outputs the state of golomb rice encoding into a writer, which can be recovered later by Read
   163  func (g *GolombRice) Write(w io.Writer) error {
   164  	var numBuf [8]byte
   165  	binary.BigEndian.PutUint64(numBuf[:], uint64(len(g.data)))
   166  	if _, e := w.Write(numBuf[:]); e != nil {
   167  		return e
   168  	}
   169  	p := (*[maxDataSize]byte)(unsafe.Pointer(&g.data[0]))
   170  	b := (*p)[:]
   171  	if _, e := w.Write(b[:len(g.data)*8]); e != nil {
   172  		return e
   173  	}
   174  	return nil
   175  }
   176  
// GolombRiceReader reads numbers back from a Golomb-Rice encoding. It keeps two
// separate cursors into data: one for the fixed-width parts and one for the
// unary parts of the encoded numbers.
type GolombRiceReader struct {
	data                []uint64 // Words of the encoding being read
	currFixedOffset     int      // Bit offset in data of the next fixed-width part to read
	currWindowUnary     uint64   // Unconsumed unary bits of the current word, shifted so that the next bit is bit 0
	currPtrUnary        int      // Index in data of the next word to load into currWindowUnary
	validLowerBitsUnary int      // Number of low bits of currWindowUnary that have not been consumed yet
}