github.com/etecs-ru/ristretto@v0.9.1/z/bbloom.go (about)

     1  // The MIT License (MIT)
     2  // Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt
     3  
     4  // Permission is hereby granted, free of charge, to any person obtaining a copy of
     5  // this software and associated documentation files (the "Software"), to deal in
     6  // the Software without restriction, including without limitation the rights to
     7  // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
     8  // the Software, and to permit persons to whom the Software is furnished to do so,
     9  // subject to the following conditions:
    10  
    11  // The above copyright notice and this permission notice shall be included in all
    12  // copies or substantial portions of the Software.
    13  
    14  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    15  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
    16  // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
    17  // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
    18  // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
    19  // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    20  
    21  package z
    22  
    23  import (
    24  	"bytes"
    25  	"encoding/json"
    26  	"log"
    27  	"math"
    28  	"unsafe"
    29  )
    30  
    31  // helper
    32  var mask = []uint8{1, 2, 4, 8, 16, 32, 64, 128}
    33  
    34  func getSize(ui64 uint64) (size, exponent uint64) {
    35  	if ui64 < uint64(512) {
    36  		ui64 = uint64(512)
    37  	}
    38  	size = uint64(1)
    39  	for size < ui64 {
    40  		size <<= 1
    41  		exponent++
    42  	}
    43  	return size, exponent
    44  }
    45  
    46  func calcSizeByWrongPositives(numEntries, wrongs float64) (uint64, uint64) {
    47  	size := -1 * numEntries * math.Log(wrongs) / math.Pow(float64(0.69314718056), 2)
    48  	locs := math.Ceil(float64(0.69314718056) * size / numEntries)
    49  	return uint64(size), uint64(locs)
    50  }
    51  
    52  // NewBloomFilter returns a new bloomfilter.
    53  func NewBloomFilter(params ...float64) (bloomfilter *Bloom) {
    54  	var entries, locs uint64
    55  	if len(params) == 2 {
    56  		if params[1] < 1 {
    57  			entries, locs = calcSizeByWrongPositives(params[0], params[1])
    58  		} else {
    59  			entries, locs = uint64(params[0]), uint64(params[1])
    60  		}
    61  	} else {
    62  		log.Fatal("usage: New(float64(number_of_entries), float64(number_of_hashlocations))" +
    63  			" i.e. New(float64(1000), float64(3)) or New(float64(number_of_entries)," +
    64  			" float64(number_of_hashlocations)) i.e. New(float64(1000), float64(0.03))")
    65  	}
    66  	size, exponent := getSize(entries)
    67  	bloomfilter = &Bloom{
    68  		sizeExp: exponent,
    69  		size:    size - 1,
    70  		setLocs: locs,
    71  		shift:   64 - exponent,
    72  	}
    73  	bloomfilter.Size(size)
    74  	return bloomfilter
    75  }
    76  
    77  // Bloom filter
    78  type Bloom struct {
    79  	bitset  []uint64
    80  	ElemNum uint64
    81  	sizeExp uint64
    82  	size    uint64
    83  	setLocs uint64
    84  	shift   uint64
    85  }
    86  
    87  // <--- http://www.cse.yorku.ca/~oz/hash.html
    88  // modified Berkeley DB Hash (32bit)
    89  // hash is casted to l, h = 16bit fragments
    90  // func (bl Bloom) absdbm(b *[]byte) (l, h uint64) {
    91  // 	hash := uint64(len(*b))
    92  // 	for _, c := range *b {
    93  // 		hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash
    94  // 	}
    95  // 	h = hash >> bl.shift
    96  // 	l = hash << bl.shift >> bl.shift
    97  // 	return l, h
    98  // }
    99  
   100  // Add adds hash of a key to the bloomfilter.
   101  func (bl *Bloom) Add(hash uint64) {
   102  	h := hash >> bl.shift
   103  	l := hash << bl.shift >> bl.shift
   104  	for i := uint64(0); i < bl.setLocs; i++ {
   105  		bl.Set((h + i*l) & bl.size)
   106  		bl.ElemNum++
   107  	}
   108  }
   109  
   110  // Has checks if bit(s) for entry hash is/are set,
   111  // returns true if the hash was added to the Bloom Filter.
   112  func (bl Bloom) Has(hash uint64) bool {
   113  	h := hash >> bl.shift
   114  	l := hash << bl.shift >> bl.shift
   115  	for i := uint64(0); i < bl.setLocs; i++ {
   116  		if !bl.IsSet((h + i*l) & bl.size) {
   117  			return false
   118  		}
   119  	}
   120  	return true
   121  }
   122  
   123  // AddIfNotHas only Adds hash, if it's not present in the bloomfilter.
   124  // Returns true if hash was added.
   125  // Returns false if hash was already registered in the bloomfilter.
   126  func (bl *Bloom) AddIfNotHas(hash uint64) bool {
   127  	if bl.Has(hash) {
   128  		return false
   129  	}
   130  	bl.Add(hash)
   131  	return true
   132  }
   133  
   134  // TotalSize returns the total size of the bloom filter.
   135  func (bl *Bloom) TotalSize() int {
   136  	// The bl struct has 5 members and each one is 8 byte. The bitset is a
   137  	// uint64 byte slice.
   138  	return len(bl.bitset)*8 + 5*8
   139  }
   140  
   141  // Size makes Bloom filter with as bitset of size sz.
   142  func (bl *Bloom) Size(sz uint64) {
   143  	bl.bitset = make([]uint64, sz>>6)
   144  }
   145  
   146  // Clear resets the Bloom filter.
   147  func (bl *Bloom) Clear() {
   148  	for i := range bl.bitset {
   149  		bl.bitset[i] = 0
   150  	}
   151  }
   152  
   153  // Set sets the bit[idx] of bitset.
   154  func (bl *Bloom) Set(idx uint64) {
   155  	ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))
   156  	*(*uint8)(ptr) |= mask[idx%8]
   157  }
   158  
   159  // IsSet checks if bit[idx] of bitset is set, returns true/false.
   160  func (bl *Bloom) IsSet(idx uint64) bool {
   161  	ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))
   162  	r := ((*(*uint8)(ptr)) >> (idx % 8)) & 1
   163  	return r == 1
   164  }
   165  
   166  // bloomJSONImExport
   167  // Im/Export structure used by JSONMarshal / JSONUnmarshal
   168  type bloomJSONImExport struct {
   169  	FilterSet []byte
   170  	SetLocs   uint64
   171  }
   172  
   173  // NewWithBoolset takes a []byte slice and number of locs per entry,
   174  // returns the bloomfilter with a bitset populated according to the input []byte.
   175  func newWithBoolset(bs *[]byte, locs uint64) *Bloom {
   176  	bloomfilter := NewBloomFilter(float64(len(*bs)<<3), float64(locs))
   177  	for i, b := range *bs {
   178  		*(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bloomfilter.bitset[0])) + uintptr(i))) = b
   179  	}
   180  	return bloomfilter
   181  }
   182  
   183  // JSONUnmarshal takes JSON-Object (type bloomJSONImExport) as []bytes
   184  // returns bloom32 / bloom64 object.
   185  func JSONUnmarshal(dbData []byte) (*Bloom, error) {
   186  	bloomImEx := bloomJSONImExport{}
   187  	if err := json.Unmarshal(dbData, &bloomImEx); err != nil {
   188  		return nil, err
   189  	}
   190  	buf := bytes.NewBuffer(bloomImEx.FilterSet)
   191  	bs := buf.Bytes()
   192  	bf := newWithBoolset(&bs, bloomImEx.SetLocs)
   193  	return bf, nil
   194  }
   195  
   196  // JSONMarshal returns JSON-object (type bloomJSONImExport) as []byte.
   197  func (bl Bloom) JSONMarshal() []byte {
   198  	bloomImEx := bloomJSONImExport{}
   199  	bloomImEx.SetLocs = bl.setLocs
   200  	bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3)
   201  	for i := range bloomImEx.FilterSet {
   202  		bloomImEx.FilterSet[i] = *(*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[0])) +
   203  			uintptr(i)))
   204  	}
   205  	data, err := json.Marshal(bloomImEx)
   206  	if err != nil {
   207  		log.Fatal("json.Marshal failed: ", err)
   208  	}
   209  	return data
   210  }