github.com/matrixorigin/matrixone@v1.2.0/pkg/common/bloomfilter/bloomfilter.go (about) 1 // Copyright 2021 - 2023 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package bloomfilter 16 17 import ( 18 "github.com/matrixorigin/matrixone/pkg/common/hashmap" 19 "github.com/matrixorigin/matrixone/pkg/container/hashtable" 20 "github.com/matrixorigin/matrixone/pkg/container/vector" 21 ) 22 23 func (bf *BloomFilter) Clean() { 24 bf.bitmap.Reset() 25 bf.bitmap = nil 26 bf.hashSeed = nil 27 bf.keys = nil 28 bf.states = nil 29 bf.vals = nil 30 bf.addVals = nil 31 } 32 33 func (bf *BloomFilter) Add(v *vector.Vector) { 34 length := v.Length() 35 bitSize := uint64(bf.bitmap.Len()) 36 step := hashmap.UnitLimit 37 lastSeed := len(bf.hashSeed) - 1 38 39 var i, j, n, k, idx int 40 getIdxVal := func(v uint64) uint64 { 41 if v >= bitSize { 42 return v % bitSize 43 } 44 return v 45 } 46 47 // There is no question of correctness if no distinction is made. 48 // However, there is an unacceptable slowdown in calling the Add method. 49 for i = 0; i < length; i += step { 50 n = length - i 51 if n > step { 52 n = step 53 } 54 encodeHashKeys(bf.keys, v, i, n) 55 56 idx = 0 57 for k = 0; k < lastSeed; k++ { 58 hashtable.BytesBatchGenHashStatesWithSeed(&bf.keys[0], &bf.states[0], n, bf.hashSeed[k]) 59 for j = 0; j < n; j++ { 60 bf.addVals[idx] = getIdxVal(bf.states[j][0]) 61 idx++ 62 bf.addVals[idx] = getIdxVal(bf.states[j][1]) 63 idx++ 64 bf.addVals[idx] = getIdxVal(bf.states[j][2]) 65 idx++ 66 } 67 } 68 hashtable.BytesBatchGenHashStatesWithSeed(&bf.keys[0], &bf.states[0], n, bf.hashSeed[lastSeed]) 69 for j = 0; j < n; j++ { 70 bf.addVals[idx] = getIdxVal(bf.states[j][0]) 71 idx++ 72 bf.addVals[idx] = getIdxVal(bf.states[j][1]) 73 idx++ 74 bf.addVals[idx] = getIdxVal(bf.states[j][2]) 75 idx++ 76 bf.keys[j] = bf.keys[j][:0] 77 } 78 bf.bitmap.AddMany(bf.addVals[:idx]) 79 } 80 } 81 82 func (bf *BloomFilter) Test(v *vector.Vector, callBack func(bool, int)) { 83 bf.handle(v, func(idx, beginIdx int) { 84 exist := true 85 vals := bf.vals[idx] 86 for j := 0; j < bf.valLength; j++ { 87 exist = bf.bitmap.Contains(vals[j]) 88 if !exist { 89 break 90 } 91 } 92 callBack(exist, beginIdx+idx) 93 }, 94 ) 95 } 96 97 func (bf *BloomFilter) TestAndAdd(v *vector.Vector, callBack func(bool, int)) { 98 bf.handle(v, func(idx, beginIdx int) { 99 var contains bool 100 exist := true 101 vals := bf.vals[idx] 102 for j := 0; j < bf.valLength; j++ { 103 if exist { 104 contains = bf.bitmap.Contains(vals[j]) 105 if !contains { 106 bf.bitmap.Add(vals[j]) 107 exist = false 108 } 109 } else { 110 bf.bitmap.Add(vals[j]) 111 } 112 } 113 callBack(exist, beginIdx+idx) 114 }) 115 116 } 117 118 // for an incoming vector, compute the hash value of each of its elements, and manipulate it with func tf.fn 119 func (bf *BloomFilter) handle(v *vector.Vector, callBack func(int, int)) { 120 length := v.Length() 121 bitSize := uint64(bf.bitmap.Len()) 122 lastSeed := len(bf.hashSeed) - 1 123 step := hashmap.UnitLimit 124 125 var i, j, n, k, idx int 126 getIdxVal := func(v uint64) uint64 { 127 if v >= bitSize { 128 return v % bitSize 129 } 130 return v 131 } 132 133 // The reason we need to distinguish whether an operator is an Add or not is 134 // because it determines whether we can call tf.fn more efficiently or not. 135 // 136 // There is no question of correctness if no distinction is made. However, there is an unacceptable slowdown in calling the Add method. 137 for i = 0; i < length; i += step { 138 n = length - i 139 if n > step { 140 n = step 141 } 142 143 encodeHashKeys(bf.keys, v, i, n) 144 145 for k = 0; k < lastSeed; k++ { 146 hashtable.BytesBatchGenHashStatesWithSeed(&bf.keys[0], &bf.states[0], n, bf.hashSeed[k]) 147 idx = k * 3 148 for j = 0; j < n; j++ { 149 bf.vals[j][idx] = getIdxVal(bf.states[j][0]) 150 bf.vals[j][idx+1] = getIdxVal(bf.states[j][1]) 151 bf.vals[j][idx+2] = getIdxVal(bf.states[j][2]) 152 } 153 } 154 hashtable.BytesBatchGenHashStatesWithSeed(&bf.keys[0], &bf.states[0], n, bf.hashSeed[lastSeed]) 155 idx = lastSeed * 3 156 for j = 0; j < n; j++ { 157 bf.vals[j][idx] = getIdxVal(bf.states[j][0]) 158 bf.vals[j][idx+1] = getIdxVal(bf.states[j][1]) 159 bf.vals[j][idx+2] = getIdxVal(bf.states[j][2]) 160 bf.keys[j] = bf.keys[j][:0] 161 callBack(j, i) 162 } 163 } 164 }