// github.com/TrueBlocks/trueblocks-core/src/apps/chifra@v0.0.0-20241022031540-b362680128f7/pkg/index/bloom.go

package index

// Copyright 2021 The TrueBlocks Authors. All rights reserved.
// Use of this source code is governed by a license that can
// be found in the LICENSE file.

import (
	"encoding/binary"
	"errors"
	"io"
	"os"

	"github.com/TrueBlocks/trueblocks-core/src/apps/chifra/pkg/base"
	"github.com/TrueBlocks/trueblocks-core/src/apps/chifra/pkg/file"
	"github.com/TrueBlocks/trueblocks-core/src/apps/chifra/pkg/logger"
)

const (
	// The number of bytes in a single bloomBytes structure (1048576 / 8 = 131072)
	BLOOM_WIDTH_IN_BYTES = (BLOOM_WIDTH_IN_BITS / 8)
	// The number of bits in a single bloomBytes structure (2^20)
	BLOOM_WIDTH_IN_BITS = (1048576)
	// The maximum number of addresses to add to a bloomBytes before creating a new one
	MAX_ADDRS_IN_BLOOM = 50000
)

// bloomBytes stores the actual bits of the bloom filter. There is at least one but likely more bloomBytes contained
// in each Bloom. The NInserted value, which is for statistical purposes only, records the number of addresses
// inserted in the Bytes.
type bloomBytes struct {
	NInserted uint32 // Do not change the size of this field, it's stored on disc
	Bytes     []byte // The filter's bits; allocated BLOOM_WIDTH_IN_BYTES long by InsertAddress
}

// bloomHeader is the type of Bloom.Header: a 16-bit magic value and a hash.
// NOTE(review): presumably filled in by readHeader from the start of the bloom file — confirm against readHeader.
type bloomHeader struct {
	Magic uint16    `json:"magic"`
	Hash  base.Hash `json:"hash"`
}

// Bloom structures contain an array of bloomBytes each BLOOM_WIDTH_IN_BYTES wide. A new bloomBytes is added to
// the Bloom when around MAX_ADDRS_IN_BLOOM addresses have been added. These Adaptive Bloom Filters allow us to
// maintain a near-constant false-positive rate at the expense of slightly larger bloom filters than might be expected.
43 type Bloom struct { 44 File *os.File 45 SizeOnDisc int64 46 Range base.FileRange 47 HeaderSize int64 48 Header bloomHeader 49 Count uint32 // Do not change the size of this field, it's stored on disc 50 Blooms []bloomBytes 51 } 52 53 // OpenBloom returns a newly initialized bloom filter. The bloom filter's file pointer is open (if there 54 // have been no errors) and its header data has been read into memory. The array has been created with 55 // enough space for Count blooms but has not been read from disc. The file remains open for reading (if 56 // there is no error) and is positioned at the start of the file. 57 func OpenBloom(path string, check bool) (Bloom, error) { 58 var err error 59 var bl Bloom 60 61 if !file.FileExists(path) { 62 return bl, errors.New("required bloom file (" + path + ") missing") 63 } 64 65 bl.SizeOnDisc = file.FileSize(path) 66 if bl.Range, err = base.RangeFromFilenameE(path); err != nil { 67 return bl, err 68 } 69 70 if bl.File, err = os.OpenFile(path, os.O_RDONLY, 0644); err != nil { 71 return bl, err 72 } 73 74 _, _ = bl.File.Seek(0, io.SeekStart) // already true, but can't hurt 75 if err = bl.readHeader(check); err != nil { // Note that it may not find a header, but it leaves the file pointer pointing to the count 76 return bl, err 77 } 78 79 if err = binary.Read(bl.File, binary.LittleEndian, &bl.Count); err != nil { 80 return bl, err 81 } 82 83 bl.Blooms = make([]bloomBytes, 0, bl.Count) 84 _, _ = bl.File.Seek(int64(bl.HeaderSize), io.SeekStart) // Point to the start of Count 85 return bl, nil 86 } 87 88 // Close closes the file if it's opened 89 func (bl *Bloom) Close() { 90 if bl.File != nil { 91 bl.File.Close() 92 bl.File = nil 93 } 94 } 95 96 // InsertAddress adds an address to the bloom filter. 97 func (bl *Bloom) InsertAddress(addr base.Address) { 98 99 // Check and initialize if empty. 
100 if len(bl.Blooms) == 0 { 101 bl.Blooms = append(bl.Blooms, bloomBytes{}) 102 bl.Blooms[bl.Count].Bytes = make([]byte, BLOOM_WIDTH_IN_BYTES) 103 bl.Count++ 104 } 105 106 // Get location and convert address to bits. 107 loc := len(bl.Blooms) - 1 108 bits := bl.addressToBits(addr) 109 110 // Set address bits in bloom. 111 for _, bit := range bits { 112 which := (bit / 8) 113 whence := (bit % 8) 114 index := BLOOM_WIDTH_IN_BYTES - which - 1 115 mask := uint8(1 << whence) 116 bl.Blooms[loc].Bytes[index] |= mask 117 } 118 119 // Update insert count and check for overflow. 120 bl.Blooms[loc].NInserted++ 121 if bl.Blooms[loc].NInserted > MAX_ADDRS_IN_BLOOM { 122 bl.Blooms = append(bl.Blooms, bloomBytes{}) 123 bl.Blooms[bl.Count].Bytes = make([]byte, BLOOM_WIDTH_IN_BYTES) 124 bl.Count++ 125 } 126 } 127 128 // addressToBits extracts five bits from a 20-byte address to determine its presence in the bloom filter. 129 // It divides the address into five 4-byte segments, converts each to a 32-bit integer, and then takes the modulo 130 // with the bloom array item width. 131 func (bl *Bloom) addressToBits(addr base.Address) (bits [5]uint32) { 132 133 // Convert address to byte slice. 134 slice := addr.Bytes() 135 136 // Validate address length. 137 if len(slice) != 20 { 138 logger.Fatal("should not happen ==> invalid address length.") 139 } 140 141 // Split address into five segments and calculate corresponding bits. 142 for i, cnt := 0, 0; i < len(slice); i += 4 { 143 bytes := slice[i : i+4] 144 bits[cnt] = (binary.BigEndian.Uint32(bytes) % uint32(BLOOM_WIDTH_IN_BITS)) 145 cnt++ 146 } 147 148 return 149 }