github.com/TrueBlocks/trueblocks-core/src/apps/chifra@v0.0.0-20241022031540-b362680128f7/pkg/index/bloom.go (about)

     1  package index
     2  
     3  // Copyright 2021 The TrueBlocks Authors. All rights reserved.
     4  // Use of this source code is governed by a license that can
     5  // be found in the LICENSE file.
     6  
     7  import (
     8  	"encoding/binary"
     9  	"errors"
    10  	"io"
    11  	"os"
    12  
    13  	"github.com/TrueBlocks/trueblocks-core/src/apps/chifra/pkg/base"
    14  	"github.com/TrueBlocks/trueblocks-core/src/apps/chifra/pkg/file"
    15  	"github.com/TrueBlocks/trueblocks-core/src/apps/chifra/pkg/logger"
    16  )
    17  
    18  const (
    19  	// The number of bytes in a single BloomByte structure
    20  	BLOOM_WIDTH_IN_BYTES = (BLOOM_WIDTH_IN_BITS / 8)
    21  	// The number of bits in a single BloomByte structure
    22  	BLOOM_WIDTH_IN_BITS = (1048576)
    23  	// The maximum number of addresses to add to a bloomBytes before creating a new one
    24  	MAX_ADDRS_IN_BLOOM = 50000
    25  )
    26  
// bloomBytes store the actual bits of the bloom filter. There is at least one but likely more bloomBytes contained in
// each Bloom. The NInserted value, which is for statistical purposes only, records the number of addresses
// inserted in the Bytes.
type bloomBytes struct {
	// NInserted is incremented by InsertAddress once per address added to this bloomBytes.
	NInserted uint32 // Do not change the size of this field, it's stored on disc
	// Bytes holds the filter bits. InsertAddress allocates it BLOOM_WIDTH_IN_BYTES long and
	// stores bit 0 in the last byte, so higher bit numbers sit at lower byte indexes.
	Bytes []byte
}
    34  
// bloomHeader is the optional header of a bloom file. It is stored in Bloom.Header.
// NOTE(review): presumably filled in by readHeader, which is not visible here — confirm.
type bloomHeader struct {
	// Magic is a 16-bit marker; NOTE(review): presumably identifies the file format — confirm.
	Magic uint16 `json:"magic"`
	// Hash is a base.Hash carried in the header; its meaning is not established in this file.
	Hash base.Hash `json:"hash"`
}
    39  
// Bloom structures contain an array of bloomBytes each BLOOM_WIDTH_IN_BYTES wide. A new bloomBytes is added to
// the Bloom when around MAX_ADDRS_IN_BLOOM addresses have been added. These Adaptive Bloom Filters allow us to
// maintain a near-constant false-positive rate at the expense of slightly larger bloom filters than might be expected.
type Bloom struct {
	// File is the underlying file, opened read-only by OpenBloom; Close closes it and resets it to nil.
	File *os.File
	// SizeOnDisc is the value file.FileSize reported for the path when OpenBloom was called.
	SizeOnDisc int64
	// Range is parsed from the file's name by base.RangeFromFilenameE in OpenBloom.
	Range base.FileRange
	// HeaderSize is the file offset OpenBloom seeks to in order to reach the Count field.
	// NOTE(review): presumably set by readHeader (zero when there is no header) — confirm.
	HeaderSize int64
	// Header is the file header. NOTE(review): presumably populated by readHeader — confirm.
	Header bloomHeader
	// Count is the number of bloomBytes. OpenBloom reads it from the file as a little-endian
	// uint32; InsertAddress increments it each time it appends a bloomBytes.
	Count uint32 // Do not change the size of this field, it's stored on disc
	// Blooms holds the individual filters. OpenBloom only reserves capacity for Count entries
	// and leaves the slice empty; InsertAddress appends to it.
	Blooms []bloomBytes
}
    52  
    53  // OpenBloom returns a newly initialized bloom filter. The bloom filter's file pointer is open (if there
    54  // have been no errors) and its header data has been read into memory. The array has been created with
    55  // enough space for Count blooms but has not been read from disc. The file remains open for reading (if
    56  // there is no error) and is positioned at the start of the file.
    57  func OpenBloom(path string, check bool) (Bloom, error) {
    58  	var err error
    59  	var bl Bloom
    60  
    61  	if !file.FileExists(path) {
    62  		return bl, errors.New("required bloom file (" + path + ") missing")
    63  	}
    64  
    65  	bl.SizeOnDisc = file.FileSize(path)
    66  	if bl.Range, err = base.RangeFromFilenameE(path); err != nil {
    67  		return bl, err
    68  	}
    69  
    70  	if bl.File, err = os.OpenFile(path, os.O_RDONLY, 0644); err != nil {
    71  		return bl, err
    72  	}
    73  
    74  	_, _ = bl.File.Seek(0, io.SeekStart)        // already true, but can't hurt
    75  	if err = bl.readHeader(check); err != nil { // Note that it may not find a header, but it leaves the file pointer pointing to the count
    76  		return bl, err
    77  	}
    78  
    79  	if err = binary.Read(bl.File, binary.LittleEndian, &bl.Count); err != nil {
    80  		return bl, err
    81  	}
    82  
    83  	bl.Blooms = make([]bloomBytes, 0, bl.Count)
    84  	_, _ = bl.File.Seek(int64(bl.HeaderSize), io.SeekStart) // Point to the start of Count
    85  	return bl, nil
    86  }
    87  
    88  // Close closes the file if it's opened
    89  func (bl *Bloom) Close() {
    90  	if bl.File != nil {
    91  		bl.File.Close()
    92  		bl.File = nil
    93  	}
    94  }
    95  
    96  // InsertAddress adds an address to the bloom filter.
    97  func (bl *Bloom) InsertAddress(addr base.Address) {
    98  
    99  	// Check and initialize if empty.
   100  	if len(bl.Blooms) == 0 {
   101  		bl.Blooms = append(bl.Blooms, bloomBytes{})
   102  		bl.Blooms[bl.Count].Bytes = make([]byte, BLOOM_WIDTH_IN_BYTES)
   103  		bl.Count++
   104  	}
   105  
   106  	// Get location and convert address to bits.
   107  	loc := len(bl.Blooms) - 1
   108  	bits := bl.addressToBits(addr)
   109  
   110  	// Set address bits in bloom.
   111  	for _, bit := range bits {
   112  		which := (bit / 8)
   113  		whence := (bit % 8)
   114  		index := BLOOM_WIDTH_IN_BYTES - which - 1
   115  		mask := uint8(1 << whence)
   116  		bl.Blooms[loc].Bytes[index] |= mask
   117  	}
   118  
   119  	// Update insert count and check for overflow.
   120  	bl.Blooms[loc].NInserted++
   121  	if bl.Blooms[loc].NInserted > MAX_ADDRS_IN_BLOOM {
   122  		bl.Blooms = append(bl.Blooms, bloomBytes{})
   123  		bl.Blooms[bl.Count].Bytes = make([]byte, BLOOM_WIDTH_IN_BYTES)
   124  		bl.Count++
   125  	}
   126  }
   127  
   128  // addressToBits extracts five bits from a 20-byte address to determine its presence in the bloom filter.
   129  // It divides the address into five 4-byte segments, converts each to a 32-bit integer, and then takes the modulo
   130  // with the bloom array item width.
   131  func (bl *Bloom) addressToBits(addr base.Address) (bits [5]uint32) {
   132  
   133  	// Convert address to byte slice.
   134  	slice := addr.Bytes()
   135  
   136  	// Validate address length.
   137  	if len(slice) != 20 {
   138  		logger.Fatal("should not happen ==> invalid address length.")
   139  	}
   140  
   141  	// Split address into five segments and calculate corresponding bits.
   142  	for i, cnt := 0, 0; i < len(slice); i += 4 {
   143  		bytes := slice[i : i+4]
   144  		bits[cnt] = (binary.BigEndian.Uint32(bytes) % uint32(BLOOM_WIDTH_IN_BITS))
   145  		cnt++
   146  	}
   147  
   148  	return
   149  }