github.com/whtcorpsinc/MilevaDB-Prod@v0.0.0-20211104133533-f57f4be3b597/interlock/hash_table.go

// Copyright 2020 WHTCORPS INC, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package interlock

import (
	"fmt"
	"hash"
	"hash/fnv"
	"sync/atomic"
	"time"

	"github.com/whtcorpsinc/errors"
	"github.com/whtcorpsinc/milevadb/stochastikctx"
	"github.com/whtcorpsinc/milevadb/stochastikctx/stmtctx"
	"github.com/whtcorpsinc/milevadb/types"
	"github.com/whtcorpsinc/milevadb/soliton/chunk"
	"github.com/whtcorpsinc/milevadb/soliton/codec"
	"github.com/whtcorpsinc/milevadb/soliton/disk"
	"github.com/whtcorpsinc/milevadb/soliton/memory"
)

const (
	// estCountMaxFactor defines the factor of estCountMax with maxChunkSize.
	// estCountMax is maxChunkSize * estCountMaxFactor, the maximum threshold of estCount.
	// If estCount is larger than estCountMax, set estCount to estCountMax.
	// Set this threshold to prevent buildSideEstCount being too large and causing a performance and memory regression.
	estCountMaxFactor = 10 * 1024

	// estCountMinFactor defines the factor of estCountMin with maxChunkSize.
	// estCountMin is maxChunkSize * estCountMinFactor, the minimum threshold of estCount.
	// If estCount is smaller than estCountMin, set estCount to 0.
	// Set this threshold to prevent buildSideEstCount being too small and causing a performance regression.
	estCountMinFactor = 8

	// estCountDivisor defines the divisor of buildSideEstCount.
	// Set this divisor to prevent buildSideEstCount being too large and causing a performance regression.
	estCountDivisor = 8
)

// hashContext keeps the needed hash context of an EDB causet in hash join.
type hashContext struct {
	allTypes      []*types.FieldType
	keyDefCausIdx []int
	buf           []byte
	hashVals      []hash.Hash64
	hasNull       []bool
}

// initHash prepares the per-event hashers for a chunk of `rows` events. The fnv
// hashers are allocated only when the requested count grows; otherwise they are
// reset and reused.
func (hc *hashContext) initHash(rows int) {
	if hc.buf == nil {
		hc.buf = make([]byte, 1)
	}

	if len(hc.hashVals) < rows {
		hc.hasNull = make([]bool, rows)
		hc.hashVals = make([]hash.Hash64, rows)
		for i := 0; i < rows; i++ {
			hc.hashVals[i] = fnv.New64()
		}
	} else {
		for i := 0; i < rows; i++ {
			hc.hasNull[i] = false
			hc.hashVals[i].Reset()
		}
	}
}

type hashStatistic struct {
	probeDefCauslision int
	buildBlockElapse   time.Duration
}

func (s *hashStatistic) String() string {
	return fmt.Sprintf("probe_defCauslision:%v, build:%v", s.probeDefCauslision, s.buildBlockElapse)
}
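// The following is an illustrative sketch, not part of the original file: it
// shows the hasher-reuse behavior of hashContext.initHash. Hashers are created
// with fnv.New64 only when the requested event count grows; afterwards they are
// only Reset, so hashing a new chunk does not allocate per event.
//
//	hc := &hashContext{}
//	hc.initHash(4)                     // allocates 4 fnv hashers
//	hc.hashVals[0].Write([]byte("k1")) // hash a key for event 0
//	hc.initHash(4)                     // reuses the same hashers, Reset only
//	hc.initHash(8)                     // grows: allocates 8 fresh hashers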
// hashEventContainer handles the rows and the hash map of a causet.
type hashEventContainer struct {
	sc   *stmtctx.StatementContext
	hCtx *hashContext
	stat hashStatistic

	// hashBlock stores the map of hashKey and EventPtr
	hashBlock baseHashBlock

	rowContainer *chunk.EventContainer
}

func newHashEventContainer(sCtx stochastikctx.Context, estCount int, hCtx *hashContext) *hashEventContainer {
	maxChunkSize := sCtx.GetStochastikVars().MaxChunkSize
	rc := chunk.NewEventContainer(hCtx.allTypes, maxChunkSize)
	c := &hashEventContainer{
		sc:           sCtx.GetStochastikVars().StmtCtx,
		hCtx:         hCtx,
		hashBlock:    newConcurrentMapHashBlock(),
		rowContainer: rc,
	}
	return c
}

// GetMatchedEventsAndPtrs gets matched rows and EventPtrs for probeEvent. It can be
// called in multiple goroutines as long as each goroutine keeps its own hCtx
// (and therefore its own buf).
func (c *hashEventContainer) GetMatchedEventsAndPtrs(probeKey uint64, probeEvent chunk.Event, hCtx *hashContext) (matched []chunk.Event, matchedPtrs []chunk.EventPtr, err error) {
	innerPtrs := c.hashBlock.Get(probeKey)
	if len(innerPtrs) == 0 {
		return
	}
	matched = make([]chunk.Event, 0, len(innerPtrs))
	var matchedEvent chunk.Event
	matchedPtrs = make([]chunk.EventPtr, 0, len(innerPtrs))
	for _, ptr := range innerPtrs {
		matchedEvent, err = c.rowContainer.GetEvent(ptr)
		if err != nil {
			return
		}
		var ok bool
		ok, err = c.matchJoinKey(matchedEvent, probeEvent, hCtx)
		if err != nil {
			return
		}
		if !ok {
			c.stat.probeDefCauslision++
			continue
		}
		matched = append(matched, matchedEvent)
		matchedPtrs = append(matchedPtrs, ptr)
	}
	return
}

// matchJoinKey checks if the join keys of buildEvent and probeEvent are logically equal.
func (c *hashEventContainer) matchJoinKey(buildEvent, probeEvent chunk.Event, probeHCtx *hashContext) (ok bool, err error) {
	return codec.EqualChunkEvent(c.sc,
		buildEvent, c.hCtx.allTypes, c.hCtx.keyDefCausIdx,
		probeEvent, probeHCtx.allTypes, probeHCtx.keyDefCausIdx)
}

// alreadySpilledSafeForTest indicates whether records have spilled to disk. It's thread-safe.
func (c *hashEventContainer) alreadySpilledSafeForTest() bool {
	return c.rowContainer.AlreadySpilledSafeForTest()
}

// PutChunk puts a chunk into the hashEventContainer and builds the hash map. It's not thread-safe.
// key of the hash causet: hash value of the key columns
// value of the hash causet: EventPtr of the corresponding event
func (c *hashEventContainer) PutChunk(chk *chunk.Chunk, ignoreNulls []bool) error {
	return c.PutChunkSelected(chk, nil, ignoreNulls)
}
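// Illustrative sketch, not part of the original file: the typical build/probe
// sequence over a hashEventContainer. sctx, buildTypes, keyIdx, buildChk,
// probeHCtx and probeEvent are assumed to be supplied by the caller.
//
//	hCtx := &hashContext{allTypes: buildTypes, keyDefCausIdx: keyIdx}
//	c := newHashEventContainer(sctx, 0, hCtx)
//	if err := c.PutChunk(buildChk, nil); err != nil { // build phase, single goroutine
//		return err
//	}
//	hasNull, probeKey, err := c.getJoinKeyFromChkEvent(c.sc, probeEvent, probeHCtx)
//	if err == nil && !hasNull {
//		rows, ptrs, err := c.GetMatchedEventsAndPtrs(probeKey, probeEvent, probeHCtx)
//		_, _, _ = rows, ptrs, err // join the matched build-side events with probeEvent
//	}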
// PutChunkSelected selectively puts a chunk into the hashEventContainer and builds the hash map. It's not thread-safe.
// key of the hash causet: hash value of the key columns
// value of the hash causet: EventPtr of the corresponding event
func (c *hashEventContainer) PutChunkSelected(chk *chunk.Chunk, selected, ignoreNulls []bool) error {
	start := time.Now()
	defer func() { c.stat.buildBlockElapse += time.Since(start) }()

	chkIdx := uint32(c.rowContainer.NumChunks())
	err := c.rowContainer.Add(chk)
	if err != nil {
		return err
	}
	numEvents := chk.NumEvents()
	c.hCtx.initHash(numEvents)

	hCtx := c.hCtx
	for keyIdx, defCausIdx := range c.hCtx.keyDefCausIdx {
		ignoreNull := len(ignoreNulls) > keyIdx && ignoreNulls[keyIdx]
		err := codec.HashChunkSelected(c.sc, hCtx.hashVals, chk, hCtx.allTypes[defCausIdx], defCausIdx, hCtx.buf, hCtx.hasNull, selected, ignoreNull)
		if err != nil {
			return errors.Trace(err)
		}
	}
	for i := 0; i < numEvents; i++ {
		if (selected != nil && !selected[i]) || c.hCtx.hasNull[i] {
			continue
		}
		key := c.hCtx.hashVals[i].Sum64()
		rowPtr := chunk.EventPtr{ChkIdx: chkIdx, EventIdx: uint32(i)}
		c.hashBlock.Put(key, rowPtr)
	}
	return nil
}

// getJoinKeyFromChkEvent fetches the join keys from the event and calculates the hash value.
func (*hashEventContainer) getJoinKeyFromChkEvent(sc *stmtctx.StatementContext, event chunk.Event, hCtx *hashContext) (hasNull bool, key uint64, err error) {
	for _, i := range hCtx.keyDefCausIdx {
		if event.IsNull(i) {
			return true, 0, nil
		}
	}
	hCtx.initHash(1)
	err = codec.HashChunkEvent(sc, hCtx.hashVals[0], event, hCtx.allTypes, hCtx.keyDefCausIdx, hCtx.buf)
	return false, hCtx.hashVals[0].Sum64(), err
}

// NumChunks returns the number of chunks in the rowContainer.
func (c *hashEventContainer) NumChunks() int {
	return c.rowContainer.NumChunks()
}

// NumEventsOfChunk returns the number of rows in a chunk.
func (c *hashEventContainer) NumEventsOfChunk(chkID int) int {
	return c.rowContainer.NumEventsOfChunk(chkID)
}

// GetChunk returns the chkIdx-th chunk of the in-memory records; it only works if the rowContainer has not spilled.
func (c *hashEventContainer) GetChunk(chkIdx int) (*chunk.Chunk, error) {
	return c.rowContainer.GetChunk(chkIdx)
}

// GetEvent returns the event that ptr points to in the rowContainer.
func (c *hashEventContainer) GetEvent(ptr chunk.EventPtr) (chunk.Event, error) {
	return c.rowContainer.GetEvent(ptr)
}

// Len returns the number of records in the hash causet.
func (c *hashEventContainer) Len() uint64 {
	return c.hashBlock.Len()
}

func (c *hashEventContainer) Close() error {
	return c.rowContainer.Close()
}

// GetMemTracker returns the underlying memory usage tracker in hashEventContainer.
func (c *hashEventContainer) GetMemTracker() *memory.Tracker { return c.rowContainer.GetMemTracker() }

// GetDiskTracker returns the underlying disk usage tracker in hashEventContainer.
func (c *hashEventContainer) GetDiskTracker() *disk.Tracker { return c.rowContainer.GetDiskTracker() }
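// Illustrative sketch, not part of the original file: walking every stored
// build-side event through the accessors above. Unlike GetChunk, whose doc
// restricts it to in-memory records, GetEvent resolves an EventPtr through the
// underlying EventContainer.
//
//	for chkIdx := 0; chkIdx < c.NumChunks(); chkIdx++ {
//		for i := 0; i < c.NumEventsOfChunk(chkIdx); i++ {
//			ev, err := c.GetEvent(chunk.EventPtr{ChkIdx: uint32(chkIdx), EventIdx: uint32(i)})
//			if err != nil {
//				return err
//			}
//			_ = ev // process the event
//		}
//	}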
// CausetActionSpill returns a memory.SuperCowOrNoCausetOnExceed for spilling over to disk.
func (c *hashEventContainer) CausetActionSpill() memory.SuperCowOrNoCausetOnExceed {
	return c.rowContainer.CausetActionSpill()
}

const (
	initialEntrySliceLen = 64
	maxEntrySliceLen     = 8192
)

type entry struct {
	ptr  chunk.EventPtr
	next *entry
}

// entryStore allocates entries in growing slabs to reduce GC overhead and hands
// them out one by one from the newest slab.
type entryStore struct {
	slices [][]entry
	cursor int
}

func newEntryStore() *entryStore {
	es := new(entryStore)
	es.slices = [][]entry{make([]entry, initialEntrySliceLen)}
	es.cursor = 0
	return es
}

// GetStore returns a pointer to the next free entry, appending a new slab
// (doubling in size, capped at maxEntrySliceLen) when the current one is full.
func (es *entryStore) GetStore() (e *entry) {
	sliceIdx := uint32(len(es.slices) - 1)
	slice := es.slices[sliceIdx]
	if es.cursor >= cap(slice) {
		size := cap(slice) * 2
		if size >= maxEntrySliceLen {
			size = maxEntrySliceLen
		}
		slice = make([]entry, size)
		es.slices = append(es.slices, slice)
		sliceIdx++
		es.cursor = 0
	}
	e = &es.slices[sliceIdx][es.cursor]
	es.cursor++
	return
}

type baseHashBlock interface {
	Put(hashKey uint64, rowPtr chunk.EventPtr)
	Get(hashKey uint64) (rowPtrs []chunk.EventPtr)
	Len() uint64
}

// TODO (fangzhuhe) remove unsafeHashBlock later if it is not used anymore.
// unsafeHashBlock stores multiple rowPtrs of rows for a given key with minimum GC overhead.
// A given key can store multiple values.
// It is not thread-safe and should only be used in one goroutine.
type unsafeHashBlock struct {
	hashMap    map[uint64]*entry
	entryStore *entryStore
	length     uint64
}

// newUnsafeHashBlock creates a new unsafeHashBlock. estCount means the estimated size of the hashMap.
// If unknown, set it to 0.
func newUnsafeHashBlock(estCount int) *unsafeHashBlock {
	ht := new(unsafeHashBlock)
	ht.hashMap = make(map[uint64]*entry, estCount)
	ht.entryStore = newEntryStore()
	return ht
}

// Put puts the key/rowPtr pair into the unsafeHashBlock; multiple rowPtrs for the same key are stored in a list.
func (ht *unsafeHashBlock) Put(hashKey uint64, rowPtr chunk.EventPtr) {
	oldEntry := ht.hashMap[hashKey]
	newEntry := ht.entryStore.GetStore()
	newEntry.ptr = rowPtr
	newEntry.next = oldEntry
	ht.hashMap[hashKey] = newEntry
	ht.length++
}

// Get gets the values of the "key" and appends them to "values".
func (ht *unsafeHashBlock) Get(hashKey uint64) (rowPtrs []chunk.EventPtr) {
	entryAddr := ht.hashMap[hashKey]
	for entryAddr != nil {
		rowPtrs = append(rowPtrs, entryAddr.ptr)
		entryAddr = entryAddr.next
	}
	return
}

// Len returns the number of rowPtrs in the unsafeHashBlock; the number of keys may be less than Len
// if the same key is put more than once.
func (ht *unsafeHashBlock) Len() uint64 { return ht.length }

// concurrentMapHashBlock is a concurrent hash causet built on concurrentMap.
type concurrentMapHashBlock struct {
	hashMap    concurrentMap
	entryStore *entryStore
	length     uint64
}

// newConcurrentMapHashBlock creates a concurrentMapHashBlock.
func newConcurrentMapHashBlock() *concurrentMapHashBlock {
	ht := new(concurrentMapHashBlock)
	ht.hashMap = newConcurrentMap()
	ht.entryStore = newEntryStore()
	ht.length = 0
	return ht
}

// Len returns the number of rowPtrs in the concurrentMapHashBlock.
func (ht *concurrentMapHashBlock) Len() uint64 {
	return ht.length
}
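// Illustrative sketch, not part of the original file: both hash blocks keep a
// linked list of entries per hash key, so one key can map to several EventPtrs.
// For unsafeHashBlock the most recently inserted pointer is returned first.
//
//	ht := newUnsafeHashBlock(0)
//	ht.Put(42, chunk.EventPtr{ChkIdx: 0, EventIdx: 0})
//	ht.Put(42, chunk.EventPtr{ChkIdx: 0, EventIdx: 3})
//	ptrs := ht.Get(42) // [{0 3} {0 0}]
//	_ = ht.Len()       // 2 rowPtrs, 1 distinct key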
// Put puts the key/rowPtr pair into the concurrentMapHashBlock; multiple rowPtrs for the same key are stored in a list.
func (ht *concurrentMapHashBlock) Put(hashKey uint64, rowPtr chunk.EventPtr) {
	newEntry := ht.entryStore.GetStore()
	newEntry.ptr = rowPtr
	newEntry.next = nil
	ht.hashMap.Insert(hashKey, newEntry)
	atomic.AddUint64(&ht.length, 1)
}

// Get gets the values of the "key" and appends them to "values".
func (ht *concurrentMapHashBlock) Get(hashKey uint64) (rowPtrs []chunk.EventPtr) {
	entryAddr, _ := ht.hashMap.Get(hashKey)
	for entryAddr != nil {
		rowPtrs = append(rowPtrs, entryAddr.ptr)
		entryAddr = entryAddr.next
	}
	return
}
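// Illustrative sketch, not part of the original file: how the pieces above fit
// together in a hash join. The build side fills the concurrentMapHashBlock via
// hashEventContainer.PutChunk, and probe goroutines look keys up; because
// different join keys can hash to the same uint64, every candidate EventPtr is
// re-checked with matchJoinKey (a miss is counted in probeDefCauslision).
// hashKey and rowPtr below are assumed values for illustration only.
//
//	ht := newConcurrentMapHashBlock()
//	ht.Put(hashKey, rowPtr) // build side
//	for _, ptr := range ht.Get(hashKey) {
//		// probe side: fetch the event for ptr and verify the real join key
//		_ = ptr
//	}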