github.com/ledgerwatch/erigon-lib@v1.0.0/recsplit/index.go

     1  /*
     2     Copyright 2022 The Erigon contributors
     3  
     4     Licensed under the Apache License, Version 2.0 (the "License");
     5     you may not use this file except in compliance with the License.
     6     You may obtain a copy of the License at
     7  
     8         http://www.apache.org/licenses/LICENSE-2.0
     9  
    10     Unless required by applicable law or agreed to in writing, software
    11     distributed under the License is distributed on an "AS IS" BASIS,
    12     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13     See the License for the specific language governing permissions and
    14     limitations under the License.
    15  */
    16  
    17  package recsplit
    18  
    19  import (
    20  	"bufio"
    21  	"encoding/binary"
    22  	"fmt"
    23  	"math"
    24  	"math/bits"
    25  	"os"
    26  	"path/filepath"
    27  	"sync"
    28  	"time"
    29  	"unsafe"
    30  
    31  	"github.com/ledgerwatch/erigon-lib/common/dbg"
    32  	"github.com/ledgerwatch/log/v3"
    33  
    34  	"github.com/ledgerwatch/erigon-lib/common"
    35  	"github.com/ledgerwatch/erigon-lib/mmap"
    36  	"github.com/ledgerwatch/erigon-lib/recsplit/eliasfano16"
    37  	"github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32"
    38  )
    39  
    40  // Index implements index lookup from a file created by RecSplit
    41  type Index struct {
    42  	offsetEf           *eliasfano32.EliasFano
    43  	f                  *os.File
    44  	mmapHandle2        *[mmap.MaxMapSize]byte // mmap handle for windows (this is used to close mmap)
    45  	filePath, fileName string
    46  
    47  	grData             []uint64
    48  	data               []byte // slice of correct size for the index to work with
    49  	startSeed          []uint64
    50  	golombRice         []uint32
    51  	mmapHandle1        []byte // mmap handle for unix (this is used to close mmap)
    52  	ef                 eliasfano16.DoubleEliasFano
    53  	bucketSize         int
    54  	size               int64
    55  	modTime            time.Time
    56  	baseDataID         uint64 // Index is internally organized as a [0,N) array. Use this field to map EntityID=[M;M+N) to [0,N)
    57  	bucketCount        uint64 // Number of buckets
    58  	keyCount           uint64
    59  	recMask            uint64
    60  	bytesPerRec        int
    61  	salt               uint32
    62  	leafSize           uint16 // Leaf size for recursive split algorithms
    63  	secondaryAggrBound uint16 // The lower bound for secondary key aggregation (computed from leafSize)
    64  	primaryAggrBound   uint16 // The lower bound for primary key aggregation (computed from leafSize)
    65  	enums              bool
    66  
    67  	readers *sync.Pool
    68  }
    69  
    70  func MustOpen(indexFile string) *Index {
    71  	idx, err := OpenIndex(indexFile)
    72  	if err != nil {
    73  		panic(err)
    74  	}
    75  	return idx
    76  }
    77  
    78  func OpenIndex(indexFilePath string) (*Index, error) {
    79  	_, fName := filepath.Split(indexFilePath)
    80  	idx := &Index{
    81  		filePath: indexFilePath,
    82  		fileName: fName,
    83  	}
    84  	var err error
    85  	idx.f, err = os.Open(indexFilePath)
    86  	if err != nil {
    87  		return nil, err
    88  	}
    89  	var stat os.FileInfo
    90  	if stat, err = idx.f.Stat(); err != nil {
    91  		return nil, err
    92  	}
    93  	idx.size = stat.Size()
    94  	idx.modTime = stat.ModTime()
    95  	if idx.mmapHandle1, idx.mmapHandle2, err = mmap.Mmap(idx.f, int(idx.size)); err != nil {
    96  		return nil, err
    97  	}
    98  	idx.data = idx.mmapHandle1[:idx.size]
    99  	defer idx.EnableReadAhead().DisableReadAhead()
   100  
   101  	// Read baseDataID, the number of keys, and bytes per record
   102  	idx.baseDataID = binary.BigEndian.Uint64(idx.data[:8])
   103  	idx.keyCount = binary.BigEndian.Uint64(idx.data[8:16])
   104  	idx.bytesPerRec = int(idx.data[16])
   105  	idx.recMask = (uint64(1) << (8 * idx.bytesPerRec)) - 1
   106  	offset := 16 + 1 + int(idx.keyCount)*idx.bytesPerRec
   107  
   108  	if offset < 0 {
   109  		return nil, fmt.Errorf("offset is: %d which is below zero, the file: %s is broken", offset, indexFilePath)
   110  	}
   111  
   112  	// Bucket count, bucketSize, leafSize
   113  	idx.bucketCount = binary.BigEndian.Uint64(idx.data[offset:])
   114  	offset += 8
   115  	idx.bucketSize = int(binary.BigEndian.Uint16(idx.data[offset:]))
   116  	offset += 2
   117  	idx.leafSize = binary.BigEndian.Uint16(idx.data[offset:])
   118  	offset += 2
   119  	idx.primaryAggrBound = idx.leafSize * uint16(math.Max(2, math.Ceil(0.35*float64(idx.leafSize)+1./2.)))
   120  	if idx.leafSize < 7 {
   121  		idx.secondaryAggrBound = idx.primaryAggrBound * 2
   122  	} else {
   123  		idx.secondaryAggrBound = idx.primaryAggrBound * uint16(math.Ceil(0.21*float64(idx.leafSize)+9./10.))
   124  	}
   125  	// Salt
   126  	idx.salt = binary.BigEndian.Uint32(idx.data[offset:])
   127  	offset += 4
   128  	// Start seed
   129  	startSeedLen := int(idx.data[offset])
   130  	offset++
   131  	idx.startSeed = make([]uint64, startSeedLen)
   132  	for i := 0; i < startSeedLen; i++ {
   133  		idx.startSeed[i] = binary.BigEndian.Uint64(idx.data[offset:])
   134  		offset += 8
   135  	}
   136  	idx.enums = idx.data[offset] != 0
   137  	offset++
   138  	if idx.enums {
   139  		var size int
   140  		idx.offsetEf, size = eliasfano32.ReadEliasFano(idx.data[offset:])
   141  		offset += size
   142  	}
   143  	// Size of golomb rice params
   144  	golombParamSize := binary.BigEndian.Uint16(idx.data[offset:])
   145  	offset += 4
   146  	idx.golombRice = make([]uint32, golombParamSize)
   147  	for i := uint16(0); i < golombParamSize; i++ {
   148  		if i == 0 {
   149  			idx.golombRice[i] = (bijMemo[i] << 27) | bijMemo[i]
   150  		} else if i <= idx.leafSize {
   151  			idx.golombRice[i] = (bijMemo[i] << 27) | (uint32(1) << 16) | bijMemo[i]
   152  		} else {
   153  			computeGolombRice(i, idx.golombRice, idx.leafSize, idx.primaryAggrBound, idx.secondaryAggrBound)
   154  		}
   155  	}
   156  
   157  	l := binary.BigEndian.Uint64(idx.data[offset:])
   158  	offset += 8
   159  	p := (*[maxDataSize / 8]uint64)(unsafe.Pointer(&idx.data[offset]))
   160  	idx.grData = p[:l]
   161  	offset += 8 * int(l)
   162  	idx.ef.Read(idx.data[offset:])
   163  
   164  	idx.readers = &sync.Pool{
   165  		New: func() interface{} {
   166  			return NewIndexReader(idx)
   167  		},
   168  	}
   169  	return idx, nil
   170  }
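
// exampleOpenIndex is an illustrative sketch of the typical open/inspect/close cycle for
// an index file produced by RecSplit; the file name "accounts.0-1.kvi" is a placeholder.
func exampleOpenIndex() error {
	idx, err := OpenIndex("accounts.0-1.kvi")
	if err != nil {
		return err
	}
	defer idx.Close()
	// Basic metadata exposed by the accessors below
	fmt.Printf("%s: %d keys, %d bytes, baseDataID=%d\n",
		idx.FileName(), idx.KeyCount(), idx.Size(), idx.BaseDataID())
	return nil
}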
   171  
   172  func (idx *Index) Size() int64        { return idx.size }
   173  func (idx *Index) ModTime() time.Time { return idx.modTime }
   174  func (idx *Index) BaseDataID() uint64 { return idx.baseDataID }
   175  func (idx *Index) FilePath() string   { return idx.filePath }
   176  func (idx *Index) FileName() string   { return idx.fileName }
   177  
   178  func (idx *Index) Close() {
   179  	if idx == nil {
   180  		return
   181  	}
   182  	if idx.f != nil {
   183  		if err := mmap.Munmap(idx.mmapHandle1, idx.mmapHandle2); err != nil {
   184  			log.Log(dbg.FileCloseLogLevel, "unmap", "err", err, "file", idx.FileName(), "stack", dbg.Stack())
   185  		}
   186  		if err := idx.f.Close(); err != nil {
   187  			log.Log(dbg.FileCloseLogLevel, "close", "err", err, "file", idx.FileName(), "stack", dbg.Stack())
   188  		}
   189  		idx.f = nil
   190  	}
   191  }
   192  
   193  func (idx *Index) skipBits(m uint16) int {
   194  	return int(idx.golombRice[m] & 0xffff)
   195  }
   196  
   197  func (idx *Index) skipNodes(m uint16) int {
   198  	return int(idx.golombRice[m]>>16) & 0x7FF
   199  }
   200  
   201  // golombParam returns the optimal Golomb parameter to use for encoding
   202  // the salt for the part of the hash function separating m elements. It is
   203  // based on calculations that assume hash functions are drawn at random.
   204  func (idx *Index) golombParam(m uint16) int {
   205  	return int(idx.golombRice[m] >> 27)
   206  }
   207  
   208  func (idx *Index) Empty() bool {
   209  	return idx.keyCount == 0
   210  }
   211  
   212  func (idx *Index) KeyCount() uint64 {
   213  	return idx.keyCount
   214  }
   215  
   216  // Lookup is not thread-safe because it uses id.hasher
   217  func (idx *Index) Lookup(bucketHash, fingerprint uint64) uint64 {
   218  	if idx.keyCount == 0 {
   219  		_, fName := filepath.Split(idx.filePath)
   220  		panic("no Lookup should be done when keyCount==0, please use Empty function to guard " + fName)
   221  	}
   222  	if idx.keyCount == 1 {
   223  		return 0
   224  	}
   225  	var gr GolombRiceReader
   226  	gr.data = idx.grData
   227  
   228  	bucket := remap(bucketHash, idx.bucketCount)
   229  	cumKeys, cumKeysNext, bitPos := idx.ef.Get3(bucket)
   230  	m := uint16(cumKeysNext - cumKeys) // Number of keys in this bucket
   231  	gr.ReadReset(int(bitPos), idx.skipBits(m))
   232  	var level int
   233  	for m > idx.secondaryAggrBound { // fanout = 2
   234  		d := gr.ReadNext(idx.golombParam(m))
   235  		hmod := remap16(remix(fingerprint+idx.startSeed[level]+d), m)
   236  		split := (((m+1)/2 + idx.secondaryAggrBound - 1) / idx.secondaryAggrBound) * idx.secondaryAggrBound
   237  		if hmod < split {
   238  			m = split
   239  		} else {
   240  			gr.SkipSubtree(idx.skipNodes(split), idx.skipBits(split))
   241  			m -= split
   242  			cumKeys += uint64(split)
   243  		}
   244  		level++
   245  	}
   246  	if m > idx.primaryAggrBound {
   247  		d := gr.ReadNext(idx.golombParam(m))
   248  		hmod := remap16(remix(fingerprint+idx.startSeed[level]+d), m)
   249  		part := hmod / idx.primaryAggrBound
   250  		if idx.primaryAggrBound < m-part*idx.primaryAggrBound {
   251  			m = idx.primaryAggrBound
   252  		} else {
   253  			m = m - part*idx.primaryAggrBound
   254  		}
   255  		cumKeys += uint64(idx.primaryAggrBound * part)
   256  		if part != 0 {
   257  			gr.SkipSubtree(idx.skipNodes(idx.primaryAggrBound)*int(part), idx.skipBits(idx.primaryAggrBound)*int(part))
   258  		}
   259  		level++
   260  	}
   261  	if m > idx.leafSize {
   262  		d := gr.ReadNext(idx.golombParam(m))
   263  		hmod := remap16(remix(fingerprint+idx.startSeed[level]+d), m)
   264  		part := hmod / idx.leafSize
   265  		if idx.leafSize < m-part*idx.leafSize {
   266  			m = idx.leafSize
   267  		} else {
   268  			m = m - part*idx.leafSize
   269  		}
   270  		cumKeys += uint64(idx.leafSize * part)
   271  		if part != 0 {
   272  			gr.SkipSubtree(int(part), idx.skipBits(idx.leafSize)*int(part))
   273  		}
   274  		level++
   275  	}
   276  	b := gr.ReadNext(idx.golombParam(m))
   277  	rec := int(cumKeys) + int(remap16(remix(fingerprint+idx.startSeed[level]+b), m))
   278  	pos := 1 + 8 + idx.bytesPerRec*(rec+1)
   279  
   280  	return binary.BigEndian.Uint64(idx.data[pos:]) & idx.recMask
   281  }
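
// exampleLookup is an illustrative sketch of resolving a key that has already been hashed
// to (bucketHash, fingerprint). It assumes the same 128-bit hash, seeded with the index
// salt, that was used at build time; when enums are enabled the looked-up value is treated
// as an ordinal that OrdinalLookup maps to a data-file offset.
func exampleLookup(idx *Index, bucketHash, fingerprint uint64) uint64 {
	if idx.Empty() {
		return 0 // guard: Lookup panics when keyCount == 0
	}
	v := idx.Lookup(bucketHash, fingerprint)
	if idx.enums {
		return idx.OrdinalLookup(v) // v is an ordinal, map it to an offset
	}
	return v // v is already an offset
}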
   282  
   283  // OrdinalLookup returns the offset of the i-th element in the index.
   284  // No perfect hash table lookup is performed, only an access to the
   285  // Elias-Fano structure containing all offsets.
   286  func (idx *Index) OrdinalLookup(i uint64) uint64 {
   287  	return idx.offsetEf.Get(i)
   288  }
   289  
   290  func (idx *Index) ExtractOffsets() map[uint64]uint64 {
   291  	m := map[uint64]uint64{}
   292  	pos := 1 + 8 + idx.bytesPerRec
   293  	for rec := uint64(0); rec < idx.keyCount; rec++ {
   294  		offset := binary.BigEndian.Uint64(idx.data[pos:]) & idx.recMask
   295  		m[offset] = 0
   296  		pos += idx.bytesPerRec
   297  	}
   298  	return m
   299  }
   300  
   301  func (idx *Index) RewriteWithOffsets(w *bufio.Writer, m map[uint64]uint64) error {
   302  	// New max offset
   303  	var maxOffset uint64
   304  	for _, offset := range m {
   305  		if offset > maxOffset {
   306  			maxOffset = offset
   307  		}
   308  	}
   309  	bytesPerRec := common.BitLenToByteLen(bits.Len64(maxOffset))
   310  	var numBuf [8]byte
   311  	// Write baseDataID
   312  	binary.BigEndian.PutUint64(numBuf[:], idx.baseDataID)
   313  	if _, err := w.Write(numBuf[:]); err != nil {
   314  		return fmt.Errorf("write baseDataID: %w", err)
   315  	}
   316  
   317  	// Write number of keys
   318  	binary.BigEndian.PutUint64(numBuf[:], idx.keyCount)
   319  	if _, err := w.Write(numBuf[:]); err != nil {
   320  		return fmt.Errorf("write number of keys: %w", err)
   321  	}
   322  	// Write number of bytes per index record
   323  	if err := w.WriteByte(byte(bytesPerRec)); err != nil {
   324  		return fmt.Errorf("write bytes per record: %w", err)
   325  	}
   326  	pos := 1 + 8 + idx.bytesPerRec
   327  	for rec := uint64(0); rec < idx.keyCount; rec++ {
   328  		offset := binary.BigEndian.Uint64(idx.data[pos:]) & idx.recMask
   329  		pos += idx.bytesPerRec
   330  		binary.BigEndian.PutUint64(numBuf[:], m[offset])
   331  		if _, err := w.Write(numBuf[8-bytesPerRec:]); err != nil {
   332  			return err
   333  		}
   334  	}
   335  	// Write the rest as it is (TODO - wrong for indices with enums)
   336  	if _, err := w.Write(idx.data[16+1+int(idx.keyCount)*idx.bytesPerRec:]); err != nil {
   337  		return err
   338  	}
   339  	return nil
   340  }
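
// exampleRewrite is an illustrative sketch of the intended ExtractOffsets/RewriteWithOffsets
// flow: collect the old offsets, fill in the new offset for each, and write the remapped
// index to a new file. The identity remapping and the output path are placeholders.
func exampleRewrite(idx *Index, outPath string) error {
	offsets := idx.ExtractOffsets() // every distinct offset referenced by the index, mapped to 0
	for oldOffset := range offsets {
		offsets[oldOffset] = oldOffset // caller decides the new offset; identity used here
	}
	out, err := os.Create(outPath)
	if err != nil {
		return err
	}
	defer out.Close()
	w := bufio.NewWriter(out)
	if err := idx.RewriteWithOffsets(w, offsets); err != nil {
		return err
	}
	return w.Flush()
}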
   341  
   342  // DisableReadAhead - usage: `defer d.EnableReadAhead().DisableReadAhead()`. Please don't use these funcs without `defer` to avoid a leak.
   343  func (idx *Index) DisableReadAhead() {
   344  	if idx == nil || idx.mmapHandle1 == nil {
   345  		return
   346  	}
   347  	_ = mmap.MadviseRandom(idx.mmapHandle1)
   348  }
   349  func (idx *Index) EnableReadAhead() *Index {
   350  	_ = mmap.MadviseSequential(idx.mmapHandle1)
   351  	return idx
   352  }
   353  func (idx *Index) EnableMadvNormal() *Index {
   354  	_ = mmap.MadviseNormal(idx.mmapHandle1)
   355  	return idx
   356  }
   357  func (idx *Index) EnableWillNeed() *Index {
   358  	_ = mmap.MadviseWillNeed(idx.mmapHandle1)
   359  	return idx
   360  }
   361  
   362  func (idx *Index) GetReaderFromPool() *IndexReader {
   363  	return idx.readers.Get().(*IndexReader)
   364  }
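
// exampleConcurrentReaders is an illustrative sketch of sharing one Index between goroutines:
// key hashing needs per-caller state, so each goroutine takes its own IndexReader (defined
// elsewhere in this package) from the pool rather than sharing a single hasher.
func exampleConcurrentReaders(idx *Index) {
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			r := idx.GetReaderFromPool()
			_ = r // hash keys and perform lookups with r here
		}()
	}
	wg.Wait()
}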