github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/hashtable.go (about) 1 // Copyright 2020 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package colexec 12 13 import ( 14 "context" 15 "fmt" 16 17 "github.com/cockroachdb/cockroach/pkg/col/coldata" 18 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase" 19 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror" 20 "github.com/cockroachdb/cockroach/pkg/sql/colmem" 21 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 22 "github.com/cockroachdb/cockroach/pkg/sql/types" 23 ) 24 25 // TODO(yuzefovich): support rehashing instead of large fixed bucket size. 26 const hashTableNumBuckets = 1 << 16 27 28 // hashTableBuildMode represents different modes in which the hashTable can be 29 // built. 30 type hashTableBuildMode int 31 32 const ( 33 // hashTableFullBuildMode is the mode where hashTable buffers all input 34 // tuples and populates first and next arrays for each hash bucket. 35 hashTableFullBuildMode hashTableBuildMode = iota 36 37 // hashTableDistinctBuildMode is the mode where hashTable only buffers 38 // distinct tuples and discards the duplicates. 39 hashTableDistinctBuildMode 40 ) 41 42 // hashTableProbeMode represents different modes of probing the hashTable. 43 type hashTableProbeMode int 44 45 const ( 46 // hashTableDefaultProbeMode is the default probing mode of the hashTable. 47 hashTableDefaultProbeMode hashTableProbeMode = iota 48 49 // hashTableDeletingProbeMode is the mode of probing the hashTable in which 50 // it "deletes" the tuples from itself once they are matched against 51 // probing tuples. 52 // For example, if we have a hashTable consisting of tuples {1, 1}, {1, 2}, 53 // {2, 3}, and the probing tuples are {1, 4}, {1, 5}, {1, 6}, then we get 54 // the following when probing on the first column only: 55 // {1, 4} -> {1, 1} | hashTable = {1, 2}, {2, 3} 56 // {1, 5} -> {1, 2} | hashTable = {2, 3} 57 // {1, 6} -> no match | hashTable = {2, 3} 58 // Note that the output of such probing is not fully deterministic when 59 // tuples contain non-equality columns. 60 hashTableDeletingProbeMode 61 ) 62 63 // hashTableBuildBuffer stores the information related to the build table. 64 type hashTableBuildBuffer struct { 65 // first stores the first keyID of the key that resides in each bucket. 66 // This keyID is used to determine the corresponding equality column key as 67 // well as output column values. 68 first []uint64 69 70 // next is a densely-packed list that stores the keyID of the next key in the 71 // hash table bucket chain, where an id of 0 is reserved to represent end of 72 // chain. 73 next []uint64 74 } 75 76 // hashTableProbeBuffer stores the information related to the probe table. 77 type hashTableProbeBuffer struct { 78 // first stores the first keyID of the key that resides in each bucket. 79 // This keyID is used to determine the corresponding equality column key as 80 // well as output column values. 81 first []uint64 82 83 // next is a densely-packed list that stores the keyID of the next key in the 84 // hash table bucket chain, where an id of 0 is reserved to represent end of 85 // chain. 86 next []uint64 87 88 // headID stores the first build table keyID that matched with the probe batch 89 // key at any given index. 90 headID []uint64 91 92 // differs stores whether the key at any index differs with the build table 93 // key. 94 differs []bool 95 96 // distinct stores whether the key in the probe batch is distinct in the build 97 // table. 98 distinct []bool 99 100 // keys stores the equality columns on the probe table for a single batch. 101 keys []coldata.Vec 102 // buckets is used to store the computed hash value of each key in a single 103 // batch. 104 buckets []uint64 105 106 // groupID stores the keyID that maps to the joining rows of the build table. 107 // The ith element of groupID stores the keyID of the build table that 108 // corresponds to the ith key in the probe table. 109 groupID []uint64 110 // toCheck stores the indices of the eqCol rows that have yet to be found or 111 // rejected. 112 toCheck []uint64 113 114 // hashBuffer stores the hash values of each tuple in the probe table. It will 115 // be dynamically updated when the hashTable is build in distinct mode. 116 hashBuffer []uint64 117 } 118 119 // hashTable is a structure used by the hash joiner to store the build table 120 // batches. Keys are stored according to the encoding of the equality column, 121 // which point to the corresponding output keyID. The keyID is calculated 122 // using the below equation: 123 // 124 // keyID = keys.indexOf(key) + 1 125 // 126 // and inversely: 127 // 128 // keys[keyID - 1] = key 129 // 130 // The table can then be probed in column batches to find at most one matching 131 // row per column batch row. 132 type hashTable struct { 133 allocator *colmem.Allocator 134 135 // buildScratch contains the scratch buffers required for the build table. 136 buildScratch hashTableBuildBuffer 137 138 // probeScratch contains the scratch buffers required for the probe table. 139 probeScratch hashTableProbeBuffer 140 141 // same and visited are only used when the hashTable contains non-distinct 142 // keys. 143 // 144 // same is a densely-packed list that stores the keyID of the next key in the 145 // hash table that has the same value as the current key. The headID of the key 146 // is the first key of that value found in the next linked list. This field 147 // will be lazily populated by the prober. 148 same []uint64 149 // visited represents whether each of the corresponding keys have been touched 150 // by the prober. 151 visited []bool 152 153 // vals stores the union of the equality and output columns of the build 154 // table. A key tuple is defined as the elements in each row of vals that 155 // makes up the equality columns. The ID of a key at any index of vals is 156 // index + 1. 157 vals *appendOnlyBufferedBatch 158 // keyCols stores the indices of vals which are key columns. 159 keyCols []uint32 160 161 // numBuckets returns the number of buckets the hashTable employs. This is 162 // equivalent to the size of first. 163 numBuckets uint64 164 165 // allowNullEquality determines if NULL keys should be treated as equal to 166 // each other. 167 allowNullEquality bool 168 169 overloadHelper overloadHelper 170 datumAlloc sqlbase.DatumAlloc 171 cancelChecker CancelChecker 172 173 buildMode hashTableBuildMode 174 probeMode hashTableProbeMode 175 } 176 177 var _ resetter = &hashTable{} 178 179 func newHashTable( 180 allocator *colmem.Allocator, 181 numBuckets uint64, 182 sourceTypes []*types.T, 183 eqCols []uint32, 184 allowNullEquality bool, 185 buildMode hashTableBuildMode, 186 probeMode hashTableProbeMode, 187 ) *hashTable { 188 if !allowNullEquality && probeMode == hashTableDeletingProbeMode { 189 // At the moment, we don't have a use case for such behavior, so let's 190 // assert that it is not requested. 191 colexecerror.InternalError("hashTableDeletingProbeMode is supported only when null equality is allowed") 192 } 193 ht := &hashTable{ 194 allocator: allocator, 195 196 buildScratch: hashTableBuildBuffer{ 197 first: make([]uint64, numBuckets), 198 }, 199 200 probeScratch: hashTableProbeBuffer{ 201 keys: make([]coldata.Vec, len(eqCols)), 202 buckets: make([]uint64, coldata.BatchSize()), 203 groupID: make([]uint64, coldata.BatchSize()), 204 headID: make([]uint64, coldata.BatchSize()), 205 toCheck: make([]uint64, coldata.BatchSize()), 206 differs: make([]bool, coldata.BatchSize()), 207 }, 208 209 vals: newAppendOnlyBufferedBatch(allocator, sourceTypes, 0 /* initialSize */), 210 keyCols: eqCols, 211 numBuckets: numBuckets, 212 allowNullEquality: allowNullEquality, 213 buildMode: buildMode, 214 probeMode: probeMode, 215 } 216 217 if buildMode == hashTableDistinctBuildMode { 218 ht.probeScratch.first = make([]uint64, numBuckets) 219 ht.probeScratch.next = make([]uint64, coldata.BatchSize()+1) 220 ht.buildScratch.next = make([]uint64, 1, coldata.BatchSize()+1) 221 ht.probeScratch.hashBuffer = make([]uint64, coldata.BatchSize()) 222 ht.probeScratch.distinct = make([]bool, coldata.BatchSize()) 223 } 224 225 return ht 226 } 227 228 // build executes the entirety of the hash table build phase using the input 229 // as the build source. The input is entirely consumed in the process. 230 func (ht *hashTable) build(ctx context.Context, input colexecbase.Operator) { 231 nKeyCols := len(ht.keyCols) 232 233 switch ht.buildMode { 234 case hashTableFullBuildMode: 235 for { 236 batch := input.Next(ctx) 237 if batch.Length() == 0 { 238 break 239 } 240 241 ht.allocator.PerformOperation(ht.vals.ColVecs(), func() { 242 ht.vals.append(batch, 0 /* startIdx */, batch.Length()) 243 }) 244 } 245 246 keyCols := make([]coldata.Vec, nKeyCols) 247 for i := 0; i < nKeyCols; i++ { 248 keyCols[i] = ht.vals.ColVec(int(ht.keyCols[i])) 249 } 250 251 // ht.next is used to store the computed hash value of each key. 252 ht.buildScratch.next = maybeAllocateUint64Array(ht.buildScratch.next, ht.vals.Length()+1) 253 ht.computeBuckets(ctx, ht.buildScratch.next[1:], keyCols, ht.vals.Length(), nil) 254 ht.buildNextChains(ctx, ht.buildScratch.first, ht.buildScratch.next, 1, ht.vals.Length()) 255 case hashTableDistinctBuildMode: 256 for { 257 batch := input.Next(ctx) 258 if batch.Length() == 0 { 259 break 260 } 261 262 srcVecs := batch.ColVecs() 263 264 for i := 0; i < nKeyCols; i++ { 265 ht.probeScratch.keys[i] = srcVecs[ht.keyCols[i]] 266 } 267 268 ht.computeBuckets(ctx, ht.probeScratch.next[1:], ht.probeScratch.keys, batch.Length(), batch.Selection()) 269 copy(ht.probeScratch.hashBuffer, ht.probeScratch.next[1:]) 270 271 // We should not zero out the entire `first` buffer here since the size of 272 // the `first` buffer same as the hash range (2^16) by default. The size 273 // of the hashBuffer is same as the batch size which is often a lot 274 // smaller than the hash range. Since we are only concerned with tuples 275 // inside the hashBuffer, we only need to zero out the corresponding 276 // entries in the `first` buffer that occurred in the hashBuffer. 277 for _, hash := range ht.probeScratch.hashBuffer[:batch.Length()] { 278 ht.probeScratch.first[hash] = 0 279 } 280 281 ht.buildNextChains(ctx, ht.probeScratch.first, ht.probeScratch.next, 1, batch.Length()) 282 283 ht.removeDuplicates(batch, ht.probeScratch.keys, ht.probeScratch.first, ht.probeScratch.next, ht.checkProbeForDistinct) 284 285 numBuffered := ht.vals.Length() 286 // We only check duplicates when there is at least one buffered 287 // tuple. 288 if numBuffered > 0 { 289 ht.removeDuplicates(batch, ht.probeScratch.keys, ht.buildScratch.first, ht.buildScratch.next, ht.checkBuildForDistinct) 290 } 291 292 ht.allocator.PerformOperation(ht.vals.ColVecs(), func() { 293 ht.vals.append(batch, 0 /* startIdx */, batch.Length()) 294 }) 295 296 ht.buildScratch.next = append(ht.buildScratch.next, ht.probeScratch.hashBuffer[:batch.Length()]...) 297 ht.buildNextChains(ctx, ht.buildScratch.first, ht.buildScratch.next, numBuffered+1, batch.Length()) 298 } 299 default: 300 colexecerror.InternalError(fmt.Sprintf("hashTable in unhandled state")) 301 } 302 } 303 304 // removeDuplicates checks the tuples in the probe table against another table 305 // and updates the selection vector of the probe table to only include distinct 306 // tuples. The table removeDuplicates will check against is specified by 307 // `first`, `next` vectors and `duplicatesChecker`. `duplicatesChecker` takes 308 // a slice of key columns of the probe table, number of tuple to check and the 309 // selection vector of the probe table and returns number of tuples that needs 310 // to be checked for next iteration. It populates the ht.probeScratch.headID to 311 // point to the keyIDs that need to be included in probe table's selection 312 // vector. 313 // NOTE: *first* and *next* vectors should be properly populated. 314 func (ht *hashTable) removeDuplicates( 315 batch coldata.Batch, 316 keyCols []coldata.Vec, 317 first, next []uint64, 318 duplicatesChecker func([]coldata.Vec, uint64, []int) uint64, 319 ) { 320 nToCheck := uint64(batch.Length()) 321 sel := batch.Selection() 322 323 for i := uint64(0); i < nToCheck; i++ { 324 ht.probeScratch.groupID[i] = first[ht.probeScratch.hashBuffer[i]] 325 ht.probeScratch.toCheck[i] = i 326 } 327 328 for nToCheck > 0 { 329 // Continue searching for the build table matching keys while the toCheck 330 // array is non-empty. 331 nToCheck = duplicatesChecker(keyCols, nToCheck, sel) 332 ht.findNext(next, nToCheck) 333 } 334 335 ht.updateSel(batch) 336 } 337 338 // checkCols performs a column by column checkCol on the key columns. 339 func (ht *hashTable) checkCols( 340 probeVecs, buildVecs []coldata.Vec, buildKeyCols []uint32, nToCheck uint64, probeSel []int, 341 ) { 342 switch ht.probeMode { 343 case hashTableDefaultProbeMode: 344 for i := range ht.keyCols { 345 ht.checkCol(probeVecs[i], buildVecs[buildKeyCols[i]], i, nToCheck, probeSel) 346 } 347 case hashTableDeletingProbeMode: 348 for i := range ht.keyCols { 349 ht.checkColDeleting(probeVecs[i], buildVecs[buildKeyCols[i]], i, nToCheck, probeSel) 350 } 351 default: 352 colexecerror.InternalError(fmt.Sprintf("unsupported hash table probe mode: %d", ht.probeMode)) 353 } 354 } 355 356 // checkColsForDistinctTuples performs a column by column check to find distinct 357 // tuples in the probe table that are not present in the build table. 358 func (ht *hashTable) checkColsForDistinctTuples( 359 probeVecs []coldata.Vec, nToCheck uint64, probeSel []int, 360 ) { 361 buildVecs := ht.vals.ColVecs() 362 for i := range ht.keyCols { 363 probeVec := probeVecs[i] 364 buildVec := buildVecs[ht.keyCols[i]] 365 366 ht.checkColForDistinctTuples(probeVec, buildVec, nToCheck, probeSel) 367 } 368 } 369 370 // computeBuckets computes the hash value of each key and stores the result in 371 // buckets. 372 func (ht *hashTable) computeBuckets( 373 ctx context.Context, buckets []uint64, keys []coldata.Vec, nKeys int, sel []int, 374 ) { 375 initHash(buckets, nKeys, defaultInitHashValue) 376 377 if nKeys == 0 { 378 // No work to do - avoid doing the loops below. 379 return 380 } 381 382 for i := range ht.keyCols { 383 rehash(ctx, buckets, keys[i], nKeys, sel, ht.cancelChecker, ht.overloadHelper, &ht.datumAlloc) 384 } 385 386 finalizeHash(buckets, nKeys, ht.numBuckets) 387 } 388 389 // buildNextChains builds the hash map from the computed hash values. 390 func (ht *hashTable) buildNextChains( 391 ctx context.Context, first, next []uint64, offset, batchSize int, 392 ) { 393 // The loop direction here is reversed to ensure that when we are building the 394 // next chain for the probe table, the keyID in each equality chain inside 395 // `next` is strictly in ascending order. This is crucial to ensure that when 396 // built in distinct mode, hash table will emit the first distinct tuple it 397 // encounters. When the next chain is built for build side, this invariant no 398 // longer holds for the equality chains inside `next`. This is ok however for 399 // the build side since all tuple that buffered on build side are already 400 // distinct, therefore we can be sure that when we emit a tuple, there cannot 401 // potentially be other tuples with the same key. 402 for id := offset + batchSize - 1; id >= offset; id-- { 403 ht.cancelChecker.check(ctx) 404 // keyID is stored into corresponding hash bucket at the front of the next 405 // chain. 406 hash := next[id] 407 firstKeyID := first[hash] 408 // This is to ensure that `first` always points to the tuple with smallest 409 // keyID in each equality chain. firstKeyID==0 means it is the first tuple 410 // that we have encountered with the given hash value. 411 if firstKeyID == 0 || uint64(id) < firstKeyID { 412 next[id] = first[hash] 413 first[hash] = uint64(id) 414 } else { 415 next[id] = next[firstKeyID] 416 next[firstKeyID] = uint64(id) 417 } 418 } 419 } 420 421 // maybeAllocate* methods make sure that the passed in array is allocated and 422 // of the desired length. 423 func maybeAllocateUint64Array(array []uint64, length int) []uint64 { 424 if array == nil || cap(array) < length { 425 return make([]uint64, length) 426 } 427 array = array[:length] 428 for n := 0; n < length; n += copy(array[n:], zeroUint64Column) { 429 } 430 return array 431 } 432 433 func maybeAllocateBoolArray(array []bool, length int) []bool { 434 if array == nil || cap(array) < length { 435 return make([]bool, length) 436 } 437 array = array[:length] 438 for n := 0; n < length; n += copy(array[n:], zeroBoolColumn) { 439 } 440 return array 441 } 442 443 func (ht *hashTable) maybeAllocateSameAndVisited() { 444 ht.same = maybeAllocateUint64Array(ht.same, ht.vals.Length()+1) 445 ht.visited = maybeAllocateBoolArray(ht.visited, ht.vals.Length()+1) 446 // Since keyID = 0 is reserved for end of list, it can be marked as visited 447 // at the beginning. 448 ht.visited[0] = true 449 } 450 451 // lookupInitial finds the corresponding hash table buckets for the equality 452 // column of the batch and stores the results in groupID. It also initializes 453 // toCheck with all indices in the range [0, batchSize). 454 func (ht *hashTable) lookupInitial(ctx context.Context, batchSize int, sel []int) { 455 ht.computeBuckets(ctx, ht.probeScratch.buckets, ht.probeScratch.keys, batchSize, sel) 456 for i := 0; i < batchSize; i++ { 457 ht.probeScratch.groupID[i] = ht.buildScratch.first[ht.probeScratch.buckets[i]] 458 ht.probeScratch.toCheck[i] = uint64(i) 459 } 460 } 461 462 // findNext determines the id of the next key inside the groupID buckets for 463 // each equality column key in toCheck. 464 func (ht *hashTable) findNext(next []uint64, nToCheck uint64) { 465 for i := uint64(0); i < nToCheck; i++ { 466 ht.probeScratch.groupID[ht.probeScratch.toCheck[i]] = 467 next[ht.probeScratch.groupID[ht.probeScratch.toCheck[i]]] 468 } 469 } 470 471 // reset resets the hashTable for reuse. 472 // NOTE: memory that already has been allocated for ht.vals is *not* released. 473 // However, resetting the length of ht.vals to zero doesn't confuse the 474 // allocator - it is smart enough to look at the capacities of the allocated 475 // vectors, and the capacities would stay the same until an actual new 476 // allocation is needed, and at that time the allocator will update the memory 477 // account accordingly. 478 func (ht *hashTable) reset(_ context.Context) { 479 for n := 0; n < len(ht.buildScratch.first); n += copy(ht.buildScratch.first[n:], zeroUint64Column) { 480 } 481 ht.vals.ResetInternalBatch() 482 ht.vals.SetLength(0) 483 // ht.next, ht.same and ht.visited are reset separately before 484 // they are used (these slices are not used in all of the code paths). 485 // ht.buckets doesn't need to be reset because buckets are always initialized 486 // when computing the hash. 487 copy(ht.probeScratch.groupID[:coldata.BatchSize()], zeroUint64Column) 488 // ht.toCheck doesn't need to be reset because it is populated manually every 489 // time before checking the columns. 490 copy(ht.probeScratch.headID[:coldata.BatchSize()], zeroUint64Column) 491 copy(ht.probeScratch.differs[:coldata.BatchSize()], zeroBoolColumn) 492 copy(ht.probeScratch.distinct, zeroBoolColumn) 493 }