github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/external_hash_joiner.go (about) 1 // Copyright 2020 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package colexec 12 13 import ( 14 "context" 15 "fmt" 16 "math" 17 18 "github.com/cockroachdb/cockroach/pkg/col/coldata" 19 "github.com/cockroachdb/cockroach/pkg/sql/colcontainer" 20 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase" 21 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror" 22 "github.com/cockroachdb/cockroach/pkg/sql/colmem" 23 "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb" 24 "github.com/cockroachdb/cockroach/pkg/sql/types" 25 "github.com/cockroachdb/cockroach/pkg/util/log" 26 "github.com/cockroachdb/cockroach/pkg/util/mon" 27 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 28 "github.com/cockroachdb/errors" 29 "github.com/marusama/semaphore" 30 ) 31 32 // externalHashJoinerState indicates the current state of the external hash 33 // joiner. 34 type externalHashJoinerState int 35 36 const ( 37 // externalHJInitialPartitioning indicates that the operator is currently 38 // reading batches from both inputs and distributing tuples to different 39 // partitions based on the hash values. Once both inputs are exhausted, the 40 // external hash joiner transitions to externalHJJoinNewPartition state. 41 externalHJInitialPartitioning externalHashJoinerState = iota 42 // externalHJRecursivePartitioning indicates that the operator is recursively 43 // partitioning one of the existing partitions (that is too big to join at 44 // once). It will do so using a different hash function and will spill newly 45 // created partitions to disk. We also keep track whether repartitioning 46 // reduces the size of the partitions in question - if we see that the newly 47 // created largest partition is about the same in size as the "parent" 48 // partition (the percentage difference is less than 49 // externalHJRecursivePartitioningSizeDecreaseThreshold), it is likely that 50 // the partition consists of the tuples not distinct on the equality columns, 51 // so we fall back to using a combination of sort and merge join to process 52 // such partition. After repartitioning, the operator transitions to 53 // externalHJJoinNewPartition state. 54 externalHJRecursivePartitioning 55 // externalHJJoinNewPartition indicates that the operator should choose a 56 // partition index and join the corresponding partitions from both sides 57 // using the in-memory hash joiner. We will only join the partitions if the 58 // right side partition fits into memory (because in-memory hash joiner will 59 // fully buffer the right side but will process left side in the streaming 60 // fashion). If there are no partition indices that the operator can join, it 61 // transitions into externalHJRecursivePartitioning state. If there are no 62 // partition indices to join using in-memory hash joiner, but there are 63 // indices to join using sort + merge join strategy, the operator transitions 64 // to externalHJSortMergeNewPartition state. If there are no partition 65 // indices left at all to join, the operator transitions to 66 // externalHJFinished state. 67 externalHJJoinNewPartition 68 // externalHJJoining indicates that the operator is currently joining tuples 69 // from the corresponding partitions from both sides. An in-memory hash join 70 // operator is used to perform the join. Once the in-memory operator returns 71 // a zero-length batch (indicating that full output for the current 72 // partitions has been emitted), the external hash joiner transitions to 73 // externalHJJoinNewPartition state. 74 externalHJJoining 75 // externalHJSortMergeNewPartition indicates that the operator should choose 76 // a partition index to join using sort + merge join strategy. If there are 77 // no partition indices for this strategy left, the operator transitions to 78 // externalHJFinished state. 79 externalHJSortMergeNewPartition 80 // externalHJSortMergeJoining indicates that the operator is currently 81 // joining tuples from the corresponding partitions from both sides using 82 // (disk-backed) sort + merge join strategy. Once the in-memory merge joiner 83 // returns a zero-length batch (indicating that full output for the current 84 // partitions has been emitted), the external hash joiner transitions to 85 // externalHJSortMergeNewPartition state. 86 externalHJSortMergeJoining 87 // externalHJFinished indicates that the external hash joiner has emitted all 88 // tuples already and only zero-length batch will be emitted from now on. 89 externalHJFinished 90 ) 91 92 const ( 93 // externalHJRecursivePartitioningSizeDecreaseThreshold determines by how 94 // much the newly-created partitions in the recursive partitioning stage 95 // should be smaller than the "parent" partition in order to consider the 96 // repartitioning "successful". If this threshold is not met, then this newly 97 // created partition will be added to sort + merge join list (which, in a 98 // sense, serves as the base case for "recursion"). 99 externalHJRecursivePartitioningSizeDecreaseThreshold = 0.05 100 // externalHJDiskQueuesMemFraction determines the fraction of the available 101 // RAM that is allocated for the in-memory cache of disk queues. 102 externalHJDiskQueuesMemFraction = 0.5 103 // We need at least two buckets per side to make progress. However, the 104 // minimum number of partitions necessary are the partitions in use during a 105 // fallback to sort and merge join. We'll be using the minimum necessary per 106 // input + 2 (1 for each spilling queue that the merge joiner uses). For 107 // clarity this is what happens: 108 // - The 2 partitions that need to be sorted + merged will use an FD each: 2 109 // FDs. Meanwhile, each sorter will use up to externalSorterMinPartitions to 110 // sort and partition this input. At this stage 2 + 2 * 111 // externalSorterMinPartitions FDs are used. 112 // - Once the inputs (the hash joiner partitions) are finished, both FDs will 113 // be released. The merge joiner will now be in use, which uses two 114 // spillingQueues with 1 FD each for a total of 2. Since each sorter will 115 // use externalSorterMinPartitions, the FDs used at this stage are 2 + 116 // (2 * externalSorterMinPartitions) as well. Note that as soon as the 117 // sorter emits its first batch, it must be the case that the input to it 118 // has returned a zero batch, and thus the FD has been closed. 119 sortMergeNonSortMinFDsOpen = 2 120 externalHJMinPartitions = sortMergeNonSortMinFDsOpen + (externalSorterMinPartitions * 2) 121 // externalHJMinimalMaxRightPartitionSize determines the minimum value for 122 // maxRightPartitionSizeToJoin variable of the external hash joiner. 123 externalHJMinimalMaxRightPartitionSize = 64 << 10 /* 64 KiB */ 124 ) 125 126 // externalHashJoiner is an operator that performs Grace hash join algorithm 127 // and can spill to disk. The high level view is that it partitions the left 128 // and right side into large buckets by a hash function A, writes those buckets 129 // to disk, then iterates through pairs of those buckets and does a normal hash 130 // join with a different hash function B. 131 // 132 // In order to get different hash functions, we're using the same family of 133 // hash functions that in-memory hash joiner uses, but we will seed it with a 134 // different initial hash value. 135 // 136 // The operator works in two phases. 137 // 138 // Phase 1: partitioning 139 // In this phase, we iterate through both sides of the join, hashing every row 140 // using a hash function A that produces n partitions. This will produce n 141 // partitions for each side of the join, which will be persisted to disk 142 // separately. As memory fills up, each of these partitions is flushed to disk 143 // repeatedly until the inputs are exhausted. 144 // 145 // Phase 2: join 146 // Now, we retrieve pairs of partitions from disk and join each pair using the 147 // ordinary hash join algorithm (and a different hash function B). Since we're 148 // performing an equality join, we can guarantee that each row on the left side 149 // of the join, if it has a match, will be in the same partition on the right 150 // side of the join. So, it's safe to do the join in pieces, partition by 151 // partition. 152 // 153 // If one of the partitions itself runs out of memory, we can recursively apply 154 // this algorithm. The partition will be divided into sub-partitions by a new 155 // hash function, spilled to disk, and so on. If repartitioning doesn't reduce 156 // size of the partitions sufficiently, then such partitions will be handled 157 // using the combination of disk-backed sort and merge join operators. 158 type externalHashJoiner struct { 159 twoInputNode 160 NonExplainable 161 closerHelper 162 163 // mu is used to protect against concurrent IdempotentClose and Next calls, 164 // which are currently allowed. 165 // TODO(asubiotto): Explore calling IdempotentClose from the same goroutine as 166 // Next, which will simplify this model. 167 mu syncutil.Mutex 168 169 state externalHashJoinerState 170 unlimitedAllocator *colmem.Allocator 171 spec hashJoinerSpec 172 diskQueueCfg colcontainer.DiskQueueCfg 173 174 // fdState is used to acquire file descriptors up front. 175 fdState struct { 176 fdSemaphore semaphore.Semaphore 177 acquiredFDs int 178 } 179 180 // Partitioning phase variables. 181 leftPartitioner colcontainer.PartitionedQueue 182 rightPartitioner colcontainer.PartitionedQueue 183 tupleDistributor *tupleHashDistributor 184 // maxNumberActivePartitions determines the maximum number of active 185 // partitions that the operator is allowed to have. This number is computed 186 // semi-dynamically and will influence the choice of numBuckets value. 187 maxNumberActivePartitions int 188 // numBuckets is the number of buckets that a partition is divided into. 189 numBuckets int 190 // partitionsToJoinUsingInMemHash is a map from partitionIdx to a utility 191 // struct. This map contains all partition indices that need to be joined 192 // using the in-memory hash joiner. If the partition is too big, it will be 193 // tried to be repartitioned; if during repartitioning the size doesn't 194 // decrease enough, it will be added to partitionsToJoinUsingSortMerge. 195 partitionsToJoinUsingInMemHash map[int]*externalHJPartitionInfo 196 // partitionsToJoinUsingSortMerge contains all partition indices that need to 197 // be joined using sort + merge join strategy. Partition indices will be 198 // added into this map if recursive partitioning doesn't seem to make 199 // progress on partition' size reduction. 200 partitionsToJoinUsingSortMerge []int 201 // partitionIdxOffset stores the first "available" partition index to use. 202 // During the partitioning step, all tuples will go into one of the buckets 203 // in [partitionIdxOffset, partitionIdxOffset + numBuckets) range. 204 partitionIdxOffset int 205 // numRepartitions tracks the number of times the external hash joiner had to 206 // recursively repartition another partition because the latter was too big 207 // to join. 208 numRepartitions int 209 // scratch and recursiveScratch are helper structs. 210 scratch, recursiveScratch struct { 211 // Input sources can have different schemas, so when distributing tuples 212 // (i.e. copying them into scratch batch to be spilled) we might need two 213 // different batches. 214 leftBatch, rightBatch coldata.Batch 215 } 216 217 // Join phase variables. 218 leftJoinerInput, rightJoinerInput *partitionerToOperator 219 inMemHashJoiner *hashJoiner 220 // diskBackedSortMerge is a side chain of disk-backed sorters that feed into 221 // disk-backed merge joiner which the external hash joiner can fall back to. 222 diskBackedSortMerge resettableOperator 223 224 memState struct { 225 // maxRightPartitionSizeToJoin indicates the maximum memory size of a 226 // partition on the right side that we're ok with joining without having to 227 // repartition it. We pay attention only to the right side because in-memory 228 // hash joiner will buffer the whole right input before processing the left 229 // input in a "streaming" fashion. 230 maxRightPartitionSizeToJoin int64 231 } 232 233 testingKnobs struct { 234 // numForcedRepartitions is a number of times that the external hash joiner 235 // is forced to recursively repartition (even if it is otherwise not 236 // needed) before it proceeds to actual join partitions. 237 numForcedRepartitions int 238 // delegateFDAcquisitions, if true, means that a test wants to force the 239 // PartitionedDiskQueues to track the number of file descriptors the hash 240 // joiner will open/close. This disables the default behavior of acquiring 241 // all file descriptors up front in Next. 242 delegateFDAcquisitions bool 243 } 244 } 245 246 var _ closableOperator = &externalHashJoiner{} 247 248 type externalHJPartitionInfo struct { 249 rightMemSize int64 250 rightParentMemSize int64 251 } 252 253 type joinSide int 254 255 const ( 256 leftSide joinSide = iota 257 rightSide 258 ) 259 260 // newExternalHashJoiner returns a disk-backed hash joiner. 261 // - unlimitedAllocator must have been created with a memory account derived 262 // from an unlimited memory monitor. It will be used by several internal 263 // components of the external hash joiner which is responsible for making sure 264 // that the components stay within the memory limit. 265 // - numForcedRepartitions is a number of times that the external hash joiner 266 // is forced to recursively repartition (even if it is otherwise not needed). 267 // This should be non-zero only in tests. 268 // - delegateFDAcquisitions specifies whether the external hash joiner should 269 // let the partitioned disk queues acquire file descriptors instead of acquiring 270 // them up front in Next. Should be true only in tests. 271 func newExternalHashJoiner( 272 unlimitedAllocator *colmem.Allocator, 273 spec hashJoinerSpec, 274 leftInput, rightInput colexecbase.Operator, 275 memoryLimit int64, 276 diskQueueCfg colcontainer.DiskQueueCfg, 277 fdSemaphore semaphore.Semaphore, 278 createReusableDiskBackedSorter func(input colexecbase.Operator, inputTypes []*types.T, orderingCols []execinfrapb.Ordering_Column, maxNumberPartitions int) (colexecbase.Operator, error), 279 numForcedRepartitions int, 280 delegateFDAcquisitions bool, 281 diskAcc *mon.BoundAccount, 282 ) colexecbase.Operator { 283 if diskQueueCfg.CacheMode != colcontainer.DiskQueueCacheModeClearAndReuseCache { 284 colexecerror.InternalError(errors.Errorf("external hash joiner instantiated with suboptimal disk queue cache mode: %d", diskQueueCfg.CacheMode)) 285 } 286 partitionedDiskQueueSemaphore := fdSemaphore 287 if !delegateFDAcquisitions { 288 // To avoid deadlocks with other disk queues, we manually attempt to acquire 289 // the maximum number of descriptors all at once in Next. Passing in a nil 290 // semaphore indicates that the caller will do the acquiring. 291 partitionedDiskQueueSemaphore = nil 292 } 293 leftPartitioner := colcontainer.NewPartitionedDiskQueue( 294 spec.left.sourceTypes, diskQueueCfg, partitionedDiskQueueSemaphore, colcontainer.PartitionerStrategyDefault, diskAcc, 295 ) 296 leftJoinerInput := newPartitionerToOperator( 297 unlimitedAllocator, spec.left.sourceTypes, leftPartitioner, 0, /* partitionIdx */ 298 ) 299 rightPartitioner := colcontainer.NewPartitionedDiskQueue( 300 spec.right.sourceTypes, diskQueueCfg, partitionedDiskQueueSemaphore, colcontainer.PartitionerStrategyDefault, diskAcc, 301 ) 302 rightJoinerInput := newPartitionerToOperator( 303 unlimitedAllocator, spec.right.sourceTypes, rightPartitioner, 0, /* partitionIdx */ 304 ) 305 // With the default limit of 256 file descriptors, this results in 16 306 // partitions. This is a hard maximum of partitions that will be used by the 307 // external hash joiner. Below we check whether we have enough RAM to support 308 // the caches of this number of partitions. 309 // TODO(yuzefovich): this number should be tuned. 310 maxNumberActivePartitions := fdSemaphore.GetLimit() / 16 311 if diskQueueCfg.BufferSizeBytes > 0 { 312 diskQueuesTotalMemLimit := int(float64(memoryLimit) * externalHJDiskQueuesMemFraction) 313 numDiskQueuesThatFit := diskQueuesTotalMemLimit / diskQueueCfg.BufferSizeBytes 314 if numDiskQueuesThatFit < maxNumberActivePartitions { 315 maxNumberActivePartitions = numDiskQueuesThatFit 316 } 317 } 318 if maxNumberActivePartitions < externalHJMinPartitions { 319 maxNumberActivePartitions = externalHJMinPartitions 320 } 321 diskQueuesMemUsed := maxNumberActivePartitions * diskQueueCfg.BufferSizeBytes 322 makeOrderingCols := func(eqCols []uint32) []execinfrapb.Ordering_Column { 323 res := make([]execinfrapb.Ordering_Column, len(eqCols)) 324 for i, colIdx := range eqCols { 325 res[i].ColIdx = colIdx 326 } 327 return res 328 } 329 // We need to allocate 2 FDs for reading the partitions (reused by the merge 330 // joiner) that we need to join using sort + merge join strategy, and all 331 // others are divided between the two inputs. 332 externalSorterMaxNumberPartitions := (maxNumberActivePartitions - sortMergeNonSortMinFDsOpen) / 2 333 if externalSorterMaxNumberPartitions < externalSorterMinPartitions { 334 // This code gets a maximum number of partitions based on the semaphore 335 // limit. In tests, this limit is set artificially low to catch any 336 // violations of the limit, resulting in possibly computing a low number of 337 // partitions for the sorter, which we overwrite here. 338 externalSorterMaxNumberPartitions = externalSorterMinPartitions 339 } 340 leftOrdering := makeOrderingCols(spec.left.eqCols) 341 leftPartitionSorter, err := createReusableDiskBackedSorter( 342 leftJoinerInput, spec.left.sourceTypes, leftOrdering, externalSorterMaxNumberPartitions, 343 ) 344 if err != nil { 345 colexecerror.InternalError(err) 346 } 347 rightOrdering := makeOrderingCols(spec.right.eqCols) 348 rightPartitionSorter, err := createReusableDiskBackedSorter( 349 rightJoinerInput, spec.right.sourceTypes, rightOrdering, externalSorterMaxNumberPartitions, 350 ) 351 if err != nil { 352 colexecerror.InternalError(err) 353 } 354 diskBackedSortMerge, err := newMergeJoinOp( 355 unlimitedAllocator, memoryLimit, diskQueueCfg, 356 partitionedDiskQueueSemaphore, spec.joinType, leftPartitionSorter, rightPartitionSorter, 357 spec.left.sourceTypes, spec.right.sourceTypes, leftOrdering, rightOrdering, 358 diskAcc, 359 ) 360 if err != nil { 361 colexecerror.InternalError(err) 362 } 363 ehj := &externalHashJoiner{ 364 twoInputNode: newTwoInputNode(leftInput, rightInput), 365 unlimitedAllocator: unlimitedAllocator, 366 spec: spec, 367 diskQueueCfg: diskQueueCfg, 368 leftPartitioner: leftPartitioner, 369 rightPartitioner: rightPartitioner, 370 maxNumberActivePartitions: maxNumberActivePartitions, 371 // In the initial partitioning state we will use half of available 372 // partitions to write the partitioned input from the left side and another 373 // half for the right side. 374 // TODO(yuzefovich): figure out whether we should care about 375 // hj.numBuckets being a power of two (finalizeHash step is faster if so). 376 numBuckets: maxNumberActivePartitions / 2, 377 partitionsToJoinUsingInMemHash: make(map[int]*externalHJPartitionInfo), 378 partitionsToJoinUsingSortMerge: make([]int, 0), 379 leftJoinerInput: leftJoinerInput, 380 rightJoinerInput: rightJoinerInput, 381 inMemHashJoiner: newHashJoiner( 382 unlimitedAllocator, spec, leftJoinerInput, rightJoinerInput, 383 ).(*hashJoiner), 384 diskBackedSortMerge: diskBackedSortMerge, 385 } 386 ehj.fdState.fdSemaphore = fdSemaphore 387 // To simplify the accounting, we will assume that the in-memory hash 388 // joiner's memory usage is equal to the size of the right partition to be 389 // joined (which will be fully buffered). This is an underestimate because a 390 // single batch from the left partition will be read at a time as well as an 391 // output batch will be used, but that shouldn't matter in the grand scheme 392 // of things. 393 ehj.memState.maxRightPartitionSizeToJoin = memoryLimit - int64(diskQueuesMemUsed) 394 if ehj.memState.maxRightPartitionSizeToJoin < externalHJMinimalMaxRightPartitionSize { 395 ehj.memState.maxRightPartitionSizeToJoin = externalHJMinimalMaxRightPartitionSize 396 } 397 ehj.scratch.leftBatch = unlimitedAllocator.NewMemBatch(spec.left.sourceTypes) 398 ehj.recursiveScratch.leftBatch = unlimitedAllocator.NewMemBatch(spec.left.sourceTypes) 399 sameSourcesSchema := len(spec.left.sourceTypes) == len(spec.right.sourceTypes) 400 for i, leftType := range spec.left.sourceTypes { 401 if i < len(spec.right.sourceTypes) && !leftType.Identical(spec.right.sourceTypes[i]) { 402 sameSourcesSchema = false 403 } 404 } 405 if sameSourcesSchema { 406 // The schemas of both sources are the same, so we can reuse the left 407 // scratch batch. 408 ehj.scratch.rightBatch = ehj.scratch.leftBatch 409 ehj.recursiveScratch.rightBatch = ehj.recursiveScratch.leftBatch 410 } else { 411 ehj.scratch.rightBatch = unlimitedAllocator.NewMemBatch(spec.right.sourceTypes) 412 ehj.recursiveScratch.rightBatch = unlimitedAllocator.NewMemBatch(spec.right.sourceTypes) 413 } 414 ehj.testingKnobs.numForcedRepartitions = numForcedRepartitions 415 ehj.testingKnobs.delegateFDAcquisitions = delegateFDAcquisitions 416 return ehj 417 } 418 419 func (hj *externalHashJoiner) Init() { 420 hj.inputOne.Init() 421 hj.inputTwo.Init() 422 // In the join phase, hash join operator will use the default init hash 423 // value, so in order to use a "different" hash function in the partitioning 424 // phase we use a different init hash value. 425 hj.tupleDistributor = newTupleHashDistributor( 426 defaultInitHashValue+1, hj.numBuckets, 427 ) 428 hj.state = externalHJInitialPartitioning 429 } 430 431 func (hj *externalHashJoiner) partitionBatch( 432 ctx context.Context, batch coldata.Batch, side joinSide, parentMemSize int64, 433 ) { 434 batchLen := batch.Length() 435 if batchLen == 0 { 436 return 437 } 438 scratchBatch := hj.scratch.leftBatch 439 sourceSpec := hj.spec.left 440 partitioner := hj.leftPartitioner 441 if side == rightSide { 442 scratchBatch = hj.scratch.rightBatch 443 sourceSpec = hj.spec.right 444 partitioner = hj.rightPartitioner 445 } 446 selections := hj.tupleDistributor.distribute( 447 ctx, batch, sourceSpec.sourceTypes, sourceSpec.eqCols, 448 ) 449 for idx, sel := range selections { 450 partitionIdx := hj.partitionIdxOffset + idx 451 if len(sel) > 0 { 452 scratchBatch.ResetInternalBatch() 453 // The partitioner expects the batches without a selection vector, so we 454 // need to copy the tuples according to the selection vector into a 455 // scratch batch. 456 colVecs := scratchBatch.ColVecs() 457 hj.unlimitedAllocator.PerformOperation(colVecs, func() { 458 for i, colvec := range colVecs { 459 colvec.Copy(coldata.CopySliceArgs{ 460 SliceArgs: coldata.SliceArgs{ 461 Src: batch.ColVec(i), 462 Sel: sel, 463 SrcEndIdx: len(sel), 464 }, 465 }) 466 } 467 scratchBatch.SetLength(len(sel)) 468 }) 469 if err := partitioner.Enqueue(ctx, partitionIdx, scratchBatch); err != nil { 470 colexecerror.InternalError(err) 471 } 472 partitionInfo, ok := hj.partitionsToJoinUsingInMemHash[partitionIdx] 473 if !ok { 474 partitionInfo = &externalHJPartitionInfo{} 475 hj.partitionsToJoinUsingInMemHash[partitionIdx] = partitionInfo 476 } 477 if side == rightSide { 478 partitionInfo.rightParentMemSize = parentMemSize 479 // We cannot use allocator's methods directly because those 480 // look at the capacities of the vectors, and in our case only 481 // first len(sel) tuples belong to the "current" batch. 482 partitionInfo.rightMemSize += colmem.GetProportionalBatchMemSize(scratchBatch, int64(len(sel))) 483 } 484 } 485 } 486 } 487 488 func (hj *externalHashJoiner) Next(ctx context.Context) coldata.Batch { 489 hj.mu.Lock() 490 defer hj.mu.Unlock() 491 StateChanged: 492 for { 493 switch hj.state { 494 case externalHJInitialPartitioning: 495 leftBatch := hj.inputOne.Next(ctx) 496 rightBatch := hj.inputTwo.Next(ctx) 497 if leftBatch.Length() == 0 && rightBatch.Length() == 0 { 498 // Both inputs have been partitioned and spilled, so we transition to 499 // "joining" phase. Close all the open write file descriptors. 500 // 501 // TODO(yuzefovich): this will also clear the cache once the new PR is 502 // in. This means we will reallocate a cache whenever reading from the 503 // partitions. What I think we might want to do is not close the 504 // partitions here. Instead, we move on to joining, which will switch 505 // all of these reserved file descriptors to read in the best case (no 506 // repartitioning) and reuse the cache. Only if we need to repartition 507 // should we CloseAllOpenWriteFileDescriptors of both sides. It might 508 // also be more efficient to Dequeue from the partitions you'll read 509 // from before doing that to exempt them from releasing their FDs to 510 // the semaphore. 511 if err := hj.leftPartitioner.CloseAllOpenWriteFileDescriptors(ctx); err != nil { 512 colexecerror.InternalError(err) 513 } 514 if err := hj.rightPartitioner.CloseAllOpenWriteFileDescriptors(ctx); err != nil { 515 colexecerror.InternalError(err) 516 } 517 hj.inMemHashJoiner.Init() 518 hj.partitionIdxOffset += hj.numBuckets 519 hj.state = externalHJJoinNewPartition 520 continue 521 } 522 if !hj.testingKnobs.delegateFDAcquisitions && hj.fdState.acquiredFDs == 0 { 523 toAcquire := hj.maxNumberActivePartitions 524 if err := hj.fdState.fdSemaphore.Acquire(ctx, toAcquire); err != nil { 525 colexecerror.InternalError(err) 526 } 527 hj.fdState.acquiredFDs = toAcquire 528 } 529 hj.partitionBatch(ctx, leftBatch, leftSide, math.MaxInt64) 530 hj.partitionBatch(ctx, rightBatch, rightSide, math.MaxInt64) 531 532 case externalHJRecursivePartitioning: 533 hj.numRepartitions++ 534 if log.V(2) && hj.numRepartitions%10 == 0 { 535 log.Infof(ctx, 536 "external hash joiner is performing %d'th repartition", hj.numRepartitions, 537 ) 538 } 539 // In order to use a different hash function when repartitioning, we need 540 // to increase the seed value of the tuple distributor. 541 hj.tupleDistributor.initHashValue++ 542 // We're actively will be using hj.numBuckets + 1 partitions (because 543 // we're repartitioning one side at a time), so we can set hj.numBuckets 544 // higher than in the initial partitioning step. 545 // TODO(yuzefovich): figure out whether we should care about 546 // hj.numBuckets being a power of two (finalizeHash step is faster if so). 547 hj.numBuckets = hj.maxNumberActivePartitions - 1 548 hj.tupleDistributor.resetNumOutputs(hj.numBuckets) 549 for parentPartitionIdx, parentPartitionInfo := range hj.partitionsToJoinUsingInMemHash { 550 for _, side := range []joinSide{leftSide, rightSide} { 551 batch := hj.recursiveScratch.leftBatch 552 partitioner := hj.leftPartitioner 553 memSize := int64(math.MaxInt64) 554 if side == rightSide { 555 batch = hj.recursiveScratch.rightBatch 556 partitioner = hj.rightPartitioner 557 memSize = parentPartitionInfo.rightMemSize 558 } 559 for { 560 if err := partitioner.Dequeue(ctx, parentPartitionIdx, batch); err != nil { 561 colexecerror.InternalError(err) 562 } 563 if batch.Length() == 0 { 564 break 565 } 566 hj.partitionBatch(ctx, batch, side, memSize) 567 } 568 // We're done reading from this partition, and it will never be read 569 // from again, so we can close it. 570 if err := partitioner.CloseInactiveReadPartitions(ctx); err != nil { 571 colexecerror.InternalError(err) 572 } 573 // We're done writing to the newly created partitions. 574 // TODO(yuzefovich): we should not release the descriptors here. The 575 // invariant should be: we're entering 576 // externalHJRecursivePartitioning, at that stage we have at most 577 // numBuckets*2 file descriptors open. At the top of the state 578 // transition, close all open write file descriptors, which should 579 // reduce the open descriptors to 0. Now we open the two read' 580 // partitions for 2 file descriptors and whatever number of write 581 // partitions we want. This'll allow us to remove the call to 582 // CloseAllOpen... in the first state as well. 583 if err := partitioner.CloseAllOpenWriteFileDescriptors(ctx); err != nil { 584 colexecerror.InternalError(err) 585 } 586 } 587 for idx := 0; idx < hj.numBuckets; idx++ { 588 newPartitionIdx := hj.partitionIdxOffset + idx 589 if partitionInfo, ok := hj.partitionsToJoinUsingInMemHash[newPartitionIdx]; ok { 590 before, after := partitionInfo.rightParentMemSize, partitionInfo.rightMemSize 591 if before > 0 { 592 sizeDecrease := 1.0 - float64(after)/float64(before) 593 if sizeDecrease < externalHJRecursivePartitioningSizeDecreaseThreshold { 594 // We will need to join this partition using sort + merge 595 // join strategy. 596 hj.partitionsToJoinUsingSortMerge = append(hj.partitionsToJoinUsingSortMerge, newPartitionIdx) 597 delete(hj.partitionsToJoinUsingInMemHash, newPartitionIdx) 598 } 599 } 600 } 601 } 602 // We have successfully repartitioned the partitions with index 603 // 'parentPartitionIdx' on both sides, so we delete that index from the 604 // map and proceed on joining the newly created partitions. 605 delete(hj.partitionsToJoinUsingInMemHash, parentPartitionIdx) 606 hj.partitionIdxOffset += hj.numBuckets 607 hj.state = externalHJJoinNewPartition 608 continue StateChanged 609 } 610 611 case externalHJJoinNewPartition: 612 if hj.testingKnobs.numForcedRepartitions > 0 && len(hj.partitionsToJoinUsingInMemHash) > 0 { 613 hj.testingKnobs.numForcedRepartitions-- 614 hj.state = externalHJRecursivePartitioning 615 continue 616 } 617 // Find next partition that we can join without having to recursively 618 // repartition. 619 for partitionIdx, partitionInfo := range hj.partitionsToJoinUsingInMemHash { 620 if partitionInfo.rightMemSize <= hj.memState.maxRightPartitionSizeToJoin { 621 // Update the inputs to in-memory hash joiner and reset the latter. 622 hj.leftJoinerInput.partitionIdx = partitionIdx 623 hj.rightJoinerInput.partitionIdx = partitionIdx 624 hj.inMemHashJoiner.reset(ctx) 625 delete(hj.partitionsToJoinUsingInMemHash, partitionIdx) 626 hj.state = externalHJJoining 627 continue StateChanged 628 } 629 } 630 if len(hj.partitionsToJoinUsingInMemHash) == 0 { 631 // All partitions to join using the hash joiner have been processed. 632 if len(hj.partitionsToJoinUsingSortMerge) > 0 { 633 // But there are still some partitions to join using sort + merge 634 // join strategy. 635 hj.diskBackedSortMerge.Init() 636 if log.V(2) { 637 log.Infof(ctx, 638 "external hash joiner will join %d partitions using sort + merge join", 639 len(hj.partitionsToJoinUsingSortMerge), 640 ) 641 } 642 hj.state = externalHJSortMergeNewPartition 643 continue 644 } 645 // All partitions have been processed, so we transition to finished 646 // state. 647 hj.state = externalHJFinished 648 continue 649 } 650 // We have partitions that we cannot join without recursively 651 // repartitioning first, so we transition to the corresponding state. 652 hj.state = externalHJRecursivePartitioning 653 continue 654 655 case externalHJJoining: 656 b := hj.inMemHashJoiner.Next(ctx) 657 if b.Length() == 0 { 658 // We're done joining these partitions, so we close them and transition 659 // to joining new ones. 660 if err := hj.leftPartitioner.CloseInactiveReadPartitions(ctx); err != nil { 661 colexecerror.InternalError(err) 662 } 663 if err := hj.rightPartitioner.CloseInactiveReadPartitions(ctx); err != nil { 664 colexecerror.InternalError(err) 665 } 666 hj.state = externalHJJoinNewPartition 667 continue 668 } 669 return b 670 671 case externalHJSortMergeNewPartition: 672 if len(hj.partitionsToJoinUsingSortMerge) == 0 { 673 // All partitions have been processed, so we transition to finished 674 // state. 675 hj.state = externalHJFinished 676 continue 677 } 678 partitionIdx := hj.partitionsToJoinUsingSortMerge[0] 679 hj.partitionsToJoinUsingSortMerge = hj.partitionsToJoinUsingSortMerge[1:] 680 // Update the inputs to sort + merge joiner and reset that chain. 681 hj.leftJoinerInput.partitionIdx = partitionIdx 682 hj.rightJoinerInput.partitionIdx = partitionIdx 683 hj.diskBackedSortMerge.reset(ctx) 684 hj.state = externalHJSortMergeJoining 685 continue 686 687 case externalHJSortMergeJoining: 688 b := hj.diskBackedSortMerge.Next(ctx) 689 if b.Length() == 0 { 690 // We're done joining these partitions, so we close them and transition 691 // to joining new ones. 692 if err := hj.leftPartitioner.CloseInactiveReadPartitions(ctx); err != nil { 693 colexecerror.InternalError(err) 694 } 695 if err := hj.rightPartitioner.CloseInactiveReadPartitions(ctx); err != nil { 696 colexecerror.InternalError(err) 697 } 698 hj.state = externalHJSortMergeNewPartition 699 continue 700 } 701 return b 702 703 case externalHJFinished: 704 if err := hj.idempotentCloseLocked(ctx); err != nil { 705 colexecerror.InternalError(err) 706 } 707 return coldata.ZeroBatch 708 default: 709 colexecerror.InternalError(fmt.Sprintf("unexpected externalHashJoinerState %d", hj.state)) 710 } 711 } 712 } 713 714 func (hj *externalHashJoiner) IdempotentClose(ctx context.Context) error { 715 hj.mu.Lock() 716 defer hj.mu.Unlock() 717 return hj.idempotentCloseLocked(ctx) 718 } 719 720 func (hj *externalHashJoiner) idempotentCloseLocked(ctx context.Context) error { 721 if !hj.close() { 722 return nil 723 } 724 var retErr error 725 if err := hj.leftPartitioner.Close(ctx); err != nil { 726 retErr = err 727 } 728 if err := hj.rightPartitioner.Close(ctx); err != nil && retErr == nil { 729 retErr = err 730 } 731 if c, ok := hj.diskBackedSortMerge.(IdempotentCloser); ok { 732 if err := c.IdempotentClose(ctx); err != nil && retErr == nil { 733 retErr = err 734 } 735 } 736 if !hj.testingKnobs.delegateFDAcquisitions && hj.fdState.acquiredFDs > 0 { 737 hj.fdState.fdSemaphore.Release(hj.fdState.acquiredFDs) 738 hj.fdState.acquiredFDs = 0 739 } 740 return retErr 741 }