github.com/ledgerwatch/erigon-lib@v1.0.0/recsplit/index.go (about) 1 /* 2 Copyright 2022 The Erigon contributors 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package recsplit 18 19 import ( 20 "bufio" 21 "encoding/binary" 22 "fmt" 23 "math" 24 "math/bits" 25 "os" 26 "path/filepath" 27 "sync" 28 "time" 29 "unsafe" 30 31 "github.com/ledgerwatch/erigon-lib/common/dbg" 32 "github.com/ledgerwatch/log/v3" 33 34 "github.com/ledgerwatch/erigon-lib/common" 35 "github.com/ledgerwatch/erigon-lib/mmap" 36 "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano16" 37 "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" 38 ) 39 40 // Index implements index lookup from the file created by the RecSplit 41 type Index struct { 42 offsetEf *eliasfano32.EliasFano 43 f *os.File 44 mmapHandle2 *[mmap.MaxMapSize]byte // mmap handle for windows (this is used to close mmap) 45 filePath, fileName string 46 47 grData []uint64 48 data []byte // slice of correct size for the index to work with 49 startSeed []uint64 50 golombRice []uint32 51 mmapHandle1 []byte // mmap handle for unix (this is used to close mmap) 52 ef eliasfano16.DoubleEliasFano 53 bucketSize int 54 size int64 55 modTime time.Time 56 baseDataID uint64 // Index internaly organized as [0,N) array. Use this field to map EntityID=[M;M+N) to [0,N) 57 bucketCount uint64 // Number of buckets 58 keyCount uint64 59 recMask uint64 60 bytesPerRec int 61 salt uint32 62 leafSize uint16 // Leaf size for recursive split algorithms 63 secondaryAggrBound uint16 // The lower bound for secondary key aggregation (computed from leadSize) 64 primaryAggrBound uint16 // The lower bound for primary key aggregation (computed from leafSize) 65 enums bool 66 67 readers *sync.Pool 68 } 69 70 func MustOpen(indexFile string) *Index { 71 idx, err := OpenIndex(indexFile) 72 if err != nil { 73 panic(err) 74 } 75 return idx 76 } 77 78 func OpenIndex(indexFilePath string) (*Index, error) { 79 _, fName := filepath.Split(indexFilePath) 80 idx := &Index{ 81 filePath: indexFilePath, 82 fileName: fName, 83 } 84 var err error 85 idx.f, err = os.Open(indexFilePath) 86 if err != nil { 87 return nil, err 88 } 89 var stat os.FileInfo 90 if stat, err = idx.f.Stat(); err != nil { 91 return nil, err 92 } 93 idx.size = stat.Size() 94 idx.modTime = stat.ModTime() 95 if idx.mmapHandle1, idx.mmapHandle2, err = mmap.Mmap(idx.f, int(idx.size)); err != nil { 96 return nil, err 97 } 98 idx.data = idx.mmapHandle1[:idx.size] 99 defer idx.EnableReadAhead().DisableReadAhead() 100 101 // Read number of keys and bytes per record 102 idx.baseDataID = binary.BigEndian.Uint64(idx.data[:8]) 103 idx.keyCount = binary.BigEndian.Uint64(idx.data[8:16]) 104 idx.bytesPerRec = int(idx.data[16]) 105 idx.recMask = (uint64(1) << (8 * idx.bytesPerRec)) - 1 106 offset := 16 + 1 + int(idx.keyCount)*idx.bytesPerRec 107 108 if offset < 0 { 109 return nil, fmt.Errorf("offset is: %d which is below zero, the file: %s is broken", offset, indexFilePath) 110 } 111 112 // Bucket count, bucketSize, leafSize 113 idx.bucketCount = binary.BigEndian.Uint64(idx.data[offset:]) 114 offset += 8 115 idx.bucketSize = int(binary.BigEndian.Uint16(idx.data[offset:])) 116 offset += 2 117 idx.leafSize = binary.BigEndian.Uint16(idx.data[offset:]) 118 offset += 2 119 idx.primaryAggrBound = idx.leafSize * uint16(math.Max(2, math.Ceil(0.35*float64(idx.leafSize)+1./2.))) 120 if idx.leafSize < 7 { 121 idx.secondaryAggrBound = idx.primaryAggrBound * 2 122 } else { 123 idx.secondaryAggrBound = idx.primaryAggrBound * uint16(math.Ceil(0.21*float64(idx.leafSize)+9./10.)) 124 } 125 // Salt 126 idx.salt = binary.BigEndian.Uint32(idx.data[offset:]) 127 offset += 4 128 // Start seed 129 startSeedLen := int(idx.data[offset]) 130 offset++ 131 idx.startSeed = make([]uint64, startSeedLen) 132 for i := 0; i < startSeedLen; i++ { 133 idx.startSeed[i] = binary.BigEndian.Uint64(idx.data[offset:]) 134 offset += 8 135 } 136 idx.enums = idx.data[offset] != 0 137 offset++ 138 if idx.enums { 139 var size int 140 idx.offsetEf, size = eliasfano32.ReadEliasFano(idx.data[offset:]) 141 offset += size 142 } 143 // Size of golomb rice params 144 golombParamSize := binary.BigEndian.Uint16(idx.data[offset:]) 145 offset += 4 146 idx.golombRice = make([]uint32, golombParamSize) 147 for i := uint16(0); i < golombParamSize; i++ { 148 if i == 0 { 149 idx.golombRice[i] = (bijMemo[i] << 27) | bijMemo[i] 150 } else if i <= idx.leafSize { 151 idx.golombRice[i] = (bijMemo[i] << 27) | (uint32(1) << 16) | bijMemo[i] 152 } else { 153 computeGolombRice(i, idx.golombRice, idx.leafSize, idx.primaryAggrBound, idx.secondaryAggrBound) 154 } 155 } 156 157 l := binary.BigEndian.Uint64(idx.data[offset:]) 158 offset += 8 159 p := (*[maxDataSize / 8]uint64)(unsafe.Pointer(&idx.data[offset])) 160 idx.grData = p[:l] 161 offset += 8 * int(l) 162 idx.ef.Read(idx.data[offset:]) 163 164 idx.readers = &sync.Pool{ 165 New: func() interface{} { 166 return NewIndexReader(idx) 167 }, 168 } 169 return idx, nil 170 } 171 172 func (idx *Index) Size() int64 { return idx.size } 173 func (idx *Index) ModTime() time.Time { return idx.modTime } 174 func (idx *Index) BaseDataID() uint64 { return idx.baseDataID } 175 func (idx *Index) FilePath() string { return idx.filePath } 176 func (idx *Index) FileName() string { return idx.fileName } 177 178 func (idx *Index) Close() { 179 if idx == nil { 180 return 181 } 182 if idx.f != nil { 183 if err := mmap.Munmap(idx.mmapHandle1, idx.mmapHandle2); err != nil { 184 log.Log(dbg.FileCloseLogLevel, "unmap", "err", err, "file", idx.FileName(), "stack", dbg.Stack()) 185 } 186 if err := idx.f.Close(); err != nil { 187 log.Log(dbg.FileCloseLogLevel, "close", "err", err, "file", idx.FileName(), "stack", dbg.Stack()) 188 } 189 idx.f = nil 190 } 191 } 192 193 func (idx *Index) skipBits(m uint16) int { 194 return int(idx.golombRice[m] & 0xffff) 195 } 196 197 func (idx *Index) skipNodes(m uint16) int { 198 return int(idx.golombRice[m]>>16) & 0x7FF 199 } 200 201 // golombParam returns the optimal Golomb parameter to use for encoding 202 // salt for the part of the hash function separating m elements. It is based on 203 // calculations with assumptions that we draw hash functions at random 204 func (idx *Index) golombParam(m uint16) int { 205 return int(idx.golombRice[m] >> 27) 206 } 207 208 func (idx *Index) Empty() bool { 209 return idx.keyCount == 0 210 } 211 212 func (idx *Index) KeyCount() uint64 { 213 return idx.keyCount 214 } 215 216 // Lookup is not thread-safe because it used id.hasher 217 func (idx *Index) Lookup(bucketHash, fingerprint uint64) uint64 { 218 if idx.keyCount == 0 { 219 _, fName := filepath.Split(idx.filePath) 220 panic("no Lookup should be done when keyCount==0, please use Empty function to guard " + fName) 221 } 222 if idx.keyCount == 1 { 223 return 0 224 } 225 var gr GolombRiceReader 226 gr.data = idx.grData 227 228 bucket := remap(bucketHash, idx.bucketCount) 229 cumKeys, cumKeysNext, bitPos := idx.ef.Get3(bucket) 230 m := uint16(cumKeysNext - cumKeys) // Number of keys in this bucket 231 gr.ReadReset(int(bitPos), idx.skipBits(m)) 232 var level int 233 for m > idx.secondaryAggrBound { // fanout = 2 234 d := gr.ReadNext(idx.golombParam(m)) 235 hmod := remap16(remix(fingerprint+idx.startSeed[level]+d), m) 236 split := (((m+1)/2 + idx.secondaryAggrBound - 1) / idx.secondaryAggrBound) * idx.secondaryAggrBound 237 if hmod < split { 238 m = split 239 } else { 240 gr.SkipSubtree(idx.skipNodes(split), idx.skipBits(split)) 241 m -= split 242 cumKeys += uint64(split) 243 } 244 level++ 245 } 246 if m > idx.primaryAggrBound { 247 d := gr.ReadNext(idx.golombParam(m)) 248 hmod := remap16(remix(fingerprint+idx.startSeed[level]+d), m) 249 part := hmod / idx.primaryAggrBound 250 if idx.primaryAggrBound < m-part*idx.primaryAggrBound { 251 m = idx.primaryAggrBound 252 } else { 253 m = m - part*idx.primaryAggrBound 254 } 255 cumKeys += uint64(idx.primaryAggrBound * part) 256 if part != 0 { 257 gr.SkipSubtree(idx.skipNodes(idx.primaryAggrBound)*int(part), idx.skipBits(idx.primaryAggrBound)*int(part)) 258 } 259 level++ 260 } 261 if m > idx.leafSize { 262 d := gr.ReadNext(idx.golombParam(m)) 263 hmod := remap16(remix(fingerprint+idx.startSeed[level]+d), m) 264 part := hmod / idx.leafSize 265 if idx.leafSize < m-part*idx.leafSize { 266 m = idx.leafSize 267 } else { 268 m = m - part*idx.leafSize 269 } 270 cumKeys += uint64(idx.leafSize * part) 271 if part != 0 { 272 gr.SkipSubtree(int(part), idx.skipBits(idx.leafSize)*int(part)) 273 } 274 level++ 275 } 276 b := gr.ReadNext(idx.golombParam(m)) 277 rec := int(cumKeys) + int(remap16(remix(fingerprint+idx.startSeed[level]+b), m)) 278 pos := 1 + 8 + idx.bytesPerRec*(rec+1) 279 280 return binary.BigEndian.Uint64(idx.data[pos:]) & idx.recMask 281 } 282 283 // OrdinalLookup returns the offset of i-th element in the index 284 // Perfect hash table lookup is not performed, only access to the 285 // Elias-Fano structure containing all offsets. 286 func (idx *Index) OrdinalLookup(i uint64) uint64 { 287 return idx.offsetEf.Get(i) 288 } 289 290 func (idx *Index) ExtractOffsets() map[uint64]uint64 { 291 m := map[uint64]uint64{} 292 pos := 1 + 8 + idx.bytesPerRec 293 for rec := uint64(0); rec < idx.keyCount; rec++ { 294 offset := binary.BigEndian.Uint64(idx.data[pos:]) & idx.recMask 295 m[offset] = 0 296 pos += idx.bytesPerRec 297 } 298 return m 299 } 300 301 func (idx *Index) RewriteWithOffsets(w *bufio.Writer, m map[uint64]uint64) error { 302 // New max offset 303 var maxOffset uint64 304 for _, offset := range m { 305 if offset > maxOffset { 306 maxOffset = offset 307 } 308 } 309 bytesPerRec := common.BitLenToByteLen(bits.Len64(maxOffset)) 310 var numBuf [8]byte 311 // Write baseDataID 312 binary.BigEndian.PutUint64(numBuf[:], idx.baseDataID) 313 if _, err := w.Write(numBuf[:]); err != nil { 314 return fmt.Errorf("write number of keys: %w", err) 315 } 316 317 // Write number of keys 318 binary.BigEndian.PutUint64(numBuf[:], idx.keyCount) 319 if _, err := w.Write(numBuf[:]); err != nil { 320 return fmt.Errorf("write number of keys: %w", err) 321 } 322 // Write number of bytes per index record 323 if err := w.WriteByte(byte(bytesPerRec)); err != nil { 324 return fmt.Errorf("write bytes per record: %w", err) 325 } 326 pos := 1 + 8 + idx.bytesPerRec 327 for rec := uint64(0); rec < idx.keyCount; rec++ { 328 offset := binary.BigEndian.Uint64(idx.data[pos:]) & idx.recMask 329 pos += idx.bytesPerRec 330 binary.BigEndian.PutUint64(numBuf[:], m[offset]) 331 if _, err := w.Write(numBuf[8-bytesPerRec:]); err != nil { 332 return err 333 } 334 } 335 // Write the rest as it is (TODO - wrong for indices with enums) 336 if _, err := w.Write(idx.data[16+1+int(idx.keyCount)*idx.bytesPerRec:]); err != nil { 337 return err 338 } 339 return nil 340 } 341 342 // DisableReadAhead - usage: `defer d.EnableReadAhead().DisableReadAhead()`. Please don't use this funcs without `defer` to avoid leak. 343 func (idx *Index) DisableReadAhead() { 344 if idx == nil || idx.mmapHandle1 == nil { 345 return 346 } 347 _ = mmap.MadviseRandom(idx.mmapHandle1) 348 } 349 func (idx *Index) EnableReadAhead() *Index { 350 _ = mmap.MadviseSequential(idx.mmapHandle1) 351 return idx 352 } 353 func (idx *Index) EnableMadvNormal() *Index { 354 _ = mmap.MadviseNormal(idx.mmapHandle1) 355 return idx 356 } 357 func (idx *Index) EnableWillNeed() *Index { 358 _ = mmap.MadviseWillNeed(idx.mmapHandle1) 359 return idx 360 } 361 362 func (idx *Index) GetReaderFromPool() *IndexReader { 363 return idx.readers.Get().(*IndexReader) 364 }