github.com/panmari/cuckoofilter@v1.0.7-0.20231223155748-763d1d471ee8/cuckoofilter.go (about)

     1  package cuckoo
     2  
     3  import (
     4  	"bytes"
     5  	"encoding/binary"
     6  	"fmt"
     7  )
     8  
// maxCuckooKickouts bounds the number of evict-and-relocate rounds that
// reinsert performs before it gives up and reports failure.
const maxCuckooKickouts = 500
    12  
// Filter is a cuckoo filter: a probabilistic set-membership structure that
// supports Insert, Lookup and Delete on byte-slice items.
type Filter struct {
	// buckets holds the stored fingerprints.
	buckets []bucket
	// count is the number of fingerprints currently stored; it is incremented
	// on successful inserts and decremented on successful deletes.
	count   uint
	// Bit mask set to len(buckets) - 1. As len(buckets) is always a power of 2,
	// applying this mask mimics the operation x % len(buckets).
	bucketIndexMask uint
}
    21  
    22  // NewFilter returns a new cuckoofilter suitable for the given number of elements.
    23  // When inserting more elements, insertion speed will drop significantly and insertions might fail altogether.
    24  // A capacity of 1000000 is a normal default, which allocates
    25  // about ~2MB on 64-bit machines.
    26  func NewFilter(numElements uint) *Filter {
    27  	numBuckets := getNextPow2(uint64(numElements / bucketSize))
    28  	if float64(numElements)/float64(numBuckets*bucketSize) > 0.96 {
    29  		numBuckets <<= 1
    30  	}
    31  	if numBuckets == 0 {
    32  		numBuckets = 1
    33  	}
    34  	buckets := make([]bucket, numBuckets)
    35  	return &Filter{
    36  		buckets:         buckets,
    37  		count:           0,
    38  		bucketIndexMask: uint(len(buckets) - 1),
    39  	}
    40  }
    41  
    42  // Lookup returns true if data is in the filter.
    43  func (cf *Filter) Lookup(data []byte) bool {
    44  	i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
    45  	if b := cf.buckets[i1]; b.contains(fp) {
    46  		return true
    47  	}
    48  	i2 := getAltIndex(fp, i1, cf.bucketIndexMask)
    49  	b := cf.buckets[i2]
    50  	return b.contains(fp)
    51  }
    52  
    53  // Reset removes all items from the filter, setting count to 0.
    54  func (cf *Filter) Reset() {
    55  	for i := range cf.buckets {
    56  		cf.buckets[i].reset()
    57  	}
    58  	cf.count = 0
    59  }
    60  
    61  // Insert data into the filter. Returns false if insertion failed. In the resulting state, the filter
    62  // * Might return false negatives
    63  // * Deletes are not guaranteed to work
    64  // To increase success rate of inserts, create a larger filter.
    65  func (cf *Filter) Insert(data []byte) bool {
    66  	i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
    67  	if cf.insert(fp, i1) {
    68  		return true
    69  	}
    70  	i2 := getAltIndex(fp, i1, cf.bucketIndexMask)
    71  	if cf.insert(fp, i2) {
    72  		return true
    73  	}
    74  	return cf.reinsert(fp, randi(i1, i2))
    75  }
    76  
    77  func (cf *Filter) insert(fp fingerprint, i uint) bool {
    78  	if cf.buckets[i].insert(fp) {
    79  		cf.count++
    80  		return true
    81  	}
    82  	return false
    83  }
    84  
    85  func (cf *Filter) reinsert(fp fingerprint, i uint) bool {
    86  	for k := 0; k < maxCuckooKickouts; k++ {
    87  		j := fastrandn(bucketSize)
    88  		// Swap fingerprint with bucket entry.
    89  		cf.buckets[i][j], fp = fp, cf.buckets[i][j]
    90  
    91  		// Move kicked out fingerprint to alternate location.
    92  		i = getAltIndex(fp, i, cf.bucketIndexMask)
    93  		if cf.insert(fp, i) {
    94  			return true
    95  		}
    96  	}
    97  	return false
    98  }
    99  
   100  // Delete data from the filter. Returns true if the data was found and deleted.
   101  func (cf *Filter) Delete(data []byte) bool {
   102  	i1, fp := getIndexAndFingerprint(data, cf.bucketIndexMask)
   103  	i2 := getAltIndex(fp, i1, cf.bucketIndexMask)
   104  	return cf.delete(fp, i1) || cf.delete(fp, i2)
   105  }
   106  
   107  func (cf *Filter) delete(fp fingerprint, i uint) bool {
   108  	if cf.buckets[i].delete(fp) {
   109  		cf.count--
   110  		return true
   111  	}
   112  	return false
   113  }
   114  
   115  // Count returns the number of items in the filter.
   116  func (cf *Filter) Count() uint {
   117  	return cf.count
   118  }
   119  
   120  // LoadFactor returns the fraction slots that are occupied.
   121  func (cf *Filter) LoadFactor() float64 {
   122  	return float64(cf.count) / float64(len(cf.buckets)*bucketSize)
   123  }
   124  
// bytesPerBucket is the size in bytes of one bucket: bucketSize fingerprints
// of fingerprintSizeBits bits each. Encode and Decode use it to size and
// validate the serialized form.
const bytesPerBucket = bucketSize * fingerprintSizeBits / 8
   126  
   127  // Encode returns a byte slice representing a Cuckoofilter.
   128  func (cf *Filter) Encode() []byte {
   129  	res := new(bytes.Buffer)
   130  	res.Grow(len(cf.buckets) * bytesPerBucket)
   131  
   132  	for _, b := range cf.buckets {
   133  		for _, fp := range b {
   134  			binary.Write(res, binary.LittleEndian, fp)
   135  		}
   136  	}
   137  	return res.Bytes()
   138  }
   139  
   140  // Decode returns a Cuckoofilter from a byte slice created using Encode.
   141  func Decode(data []byte) (*Filter, error) {
   142  	if len(data)%bucketSize != 0 {
   143  		return nil, fmt.Errorf("bytes must to be multiple of %d, got %d", bucketSize, len(data))
   144  	}
   145  	numBuckets := len(data) / bytesPerBucket
   146  	if numBuckets < 1 {
   147  		return nil, fmt.Errorf("bytes can not be smaller than %d, size in bytes is %d", bytesPerBucket, len(data))
   148  	}
   149  	if getNextPow2(uint64(numBuckets)) != uint(numBuckets) {
   150  		return nil, fmt.Errorf("numBuckets must to be a power of 2, got %d", numBuckets)
   151  	}
   152  	var count uint
   153  	buckets := make([]bucket, numBuckets)
   154  	reader := bytes.NewReader(data)
   155  
   156  	for i, b := range buckets {
   157  		for j := range b {
   158  			binary.Read(reader, binary.LittleEndian, &buckets[i][j])
   159  			if buckets[i][j] != nullFp {
   160  				count++
   161  			}
   162  		}
   163  	}
   164  	return &Filter{
   165  		buckets:         buckets,
   166  		count:           count,
   167  		bucketIndexMask: uint(len(buckets) - 1),
   168  	}, nil
   169  }