github.com/matrixorigin/matrixone@v1.2.0/pkg/common/bloomfilter/bloomfilter.go (about)

     1  // Copyright 2021 - 2023 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package bloomfilter
    16  
    17  import (
    18  	"github.com/matrixorigin/matrixone/pkg/common/hashmap"
    19  	"github.com/matrixorigin/matrixone/pkg/container/hashtable"
    20  	"github.com/matrixorigin/matrixone/pkg/container/vector"
    21  )
    22  
    23  func (bf *BloomFilter) Clean() {
    24  	bf.bitmap.Reset()
    25  	bf.bitmap = nil
    26  	bf.hashSeed = nil
    27  	bf.keys = nil
    28  	bf.states = nil
    29  	bf.vals = nil
    30  	bf.addVals = nil
    31  }
    32  
    33  func (bf *BloomFilter) Add(v *vector.Vector) {
    34  	length := v.Length()
    35  	bitSize := uint64(bf.bitmap.Len())
    36  	step := hashmap.UnitLimit
    37  	lastSeed := len(bf.hashSeed) - 1
    38  
    39  	var i, j, n, k, idx int
    40  	getIdxVal := func(v uint64) uint64 {
    41  		if v >= bitSize {
    42  			return v % bitSize
    43  		}
    44  		return v
    45  	}
    46  
    47  	// There is no question of correctness if no distinction is made.
    48  	// However, there is an unacceptable slowdown in calling the Add method.
    49  	for i = 0; i < length; i += step {
    50  		n = length - i
    51  		if n > step {
    52  			n = step
    53  		}
    54  		encodeHashKeys(bf.keys, v, i, n)
    55  
    56  		idx = 0
    57  		for k = 0; k < lastSeed; k++ {
    58  			hashtable.BytesBatchGenHashStatesWithSeed(&bf.keys[0], &bf.states[0], n, bf.hashSeed[k])
    59  			for j = 0; j < n; j++ {
    60  				bf.addVals[idx] = getIdxVal(bf.states[j][0])
    61  				idx++
    62  				bf.addVals[idx] = getIdxVal(bf.states[j][1])
    63  				idx++
    64  				bf.addVals[idx] = getIdxVal(bf.states[j][2])
    65  				idx++
    66  			}
    67  		}
    68  		hashtable.BytesBatchGenHashStatesWithSeed(&bf.keys[0], &bf.states[0], n, bf.hashSeed[lastSeed])
    69  		for j = 0; j < n; j++ {
    70  			bf.addVals[idx] = getIdxVal(bf.states[j][0])
    71  			idx++
    72  			bf.addVals[idx] = getIdxVal(bf.states[j][1])
    73  			idx++
    74  			bf.addVals[idx] = getIdxVal(bf.states[j][2])
    75  			idx++
    76  			bf.keys[j] = bf.keys[j][:0]
    77  		}
    78  		bf.bitmap.AddMany(bf.addVals[:idx])
    79  	}
    80  }
    81  
    82  func (bf *BloomFilter) Test(v *vector.Vector, callBack func(bool, int)) {
    83  	bf.handle(v, func(idx, beginIdx int) {
    84  		exist := true
    85  		vals := bf.vals[idx]
    86  		for j := 0; j < bf.valLength; j++ {
    87  			exist = bf.bitmap.Contains(vals[j])
    88  			if !exist {
    89  				break
    90  			}
    91  		}
    92  		callBack(exist, beginIdx+idx)
    93  	},
    94  	)
    95  }
    96  
    97  func (bf *BloomFilter) TestAndAdd(v *vector.Vector, callBack func(bool, int)) {
    98  	bf.handle(v, func(idx, beginIdx int) {
    99  		var contains bool
   100  		exist := true
   101  		vals := bf.vals[idx]
   102  		for j := 0; j < bf.valLength; j++ {
   103  			if exist {
   104  				contains = bf.bitmap.Contains(vals[j])
   105  				if !contains {
   106  					bf.bitmap.Add(vals[j])
   107  					exist = false
   108  				}
   109  			} else {
   110  				bf.bitmap.Add(vals[j])
   111  			}
   112  		}
   113  		callBack(exist, beginIdx+idx)
   114  	})
   115  
   116  }
   117  
   118  // for an incoming vector, compute the hash value of each of its elements, and manipulate it with func tf.fn
   119  func (bf *BloomFilter) handle(v *vector.Vector, callBack func(int, int)) {
   120  	length := v.Length()
   121  	bitSize := uint64(bf.bitmap.Len())
   122  	lastSeed := len(bf.hashSeed) - 1
   123  	step := hashmap.UnitLimit
   124  
   125  	var i, j, n, k, idx int
   126  	getIdxVal := func(v uint64) uint64 {
   127  		if v >= bitSize {
   128  			return v % bitSize
   129  		}
   130  		return v
   131  	}
   132  
   133  	// The reason we need to distinguish whether an operator is an Add or not is
   134  	// because it determines whether we can call tf.fn more efficiently or not.
   135  	//
   136  	// There is no question of correctness if no distinction is made. However, there is an unacceptable slowdown in calling the Add method.
   137  	for i = 0; i < length; i += step {
   138  		n = length - i
   139  		if n > step {
   140  			n = step
   141  		}
   142  
   143  		encodeHashKeys(bf.keys, v, i, n)
   144  
   145  		for k = 0; k < lastSeed; k++ {
   146  			hashtable.BytesBatchGenHashStatesWithSeed(&bf.keys[0], &bf.states[0], n, bf.hashSeed[k])
   147  			idx = k * 3
   148  			for j = 0; j < n; j++ {
   149  				bf.vals[j][idx] = getIdxVal(bf.states[j][0])
   150  				bf.vals[j][idx+1] = getIdxVal(bf.states[j][1])
   151  				bf.vals[j][idx+2] = getIdxVal(bf.states[j][2])
   152  			}
   153  		}
   154  		hashtable.BytesBatchGenHashStatesWithSeed(&bf.keys[0], &bf.states[0], n, bf.hashSeed[lastSeed])
   155  		idx = lastSeed * 3
   156  		for j = 0; j < n; j++ {
   157  			bf.vals[j][idx] = getIdxVal(bf.states[j][0])
   158  			bf.vals[j][idx+1] = getIdxVal(bf.states[j][1])
   159  			bf.vals[j][idx+2] = getIdxVal(bf.states[j][2])
   160  			bf.keys[j] = bf.keys[j][:0]
   161  			callBack(j, i)
   162  		}
   163  	}
   164  }