github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/mergejoiner.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package colexec 12 13 import ( 14 "context" 15 "math" 16 "unsafe" 17 18 "github.com/cockroachdb/cockroach/pkg/col/coldata" 19 "github.com/cockroachdb/cockroach/pkg/col/typeconv" 20 "github.com/cockroachdb/cockroach/pkg/sql/colcontainer" 21 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase" 22 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror" 23 "github.com/cockroachdb/cockroach/pkg/sql/colmem" 24 "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb" 25 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 26 "github.com/cockroachdb/cockroach/pkg/sql/types" 27 "github.com/cockroachdb/cockroach/pkg/util/mon" 28 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 29 "github.com/cockroachdb/errors" 30 "github.com/marusama/semaphore" 31 ) 32 33 // group is an ADT representing a contiguous set of rows that match on their 34 // equality columns. 35 type group struct { 36 rowStartIdx int 37 rowEndIdx int 38 // numRepeats is used when expanding each group into a cross product in the 39 // build phase. 40 numRepeats int 41 // toBuild is used in the build phase to determine the right output count. 42 // This field should stay in sync with the builder over time. 43 toBuild int 44 // nullGroup indicates whether the output corresponding to the group should 45 // consist of all nulls. 46 nullGroup bool 47 // unmatched indicates that the rows in the group do not have matching rows 48 // from the other side (i.e. other side's group will be a null group). 49 // NOTE: during the probing phase, the assumption is that such group will 50 // consist of a single row. 51 unmatched bool 52 } 53 54 // mjBuildFrom is an indicator of which source we're building the output from. 55 type mjBuildFrom int 56 57 const ( 58 // mjBuildFromBatch indicates that we should be building from the current 59 // probing batches. Note that in such case we might have multiple groups to 60 // build. 61 mjBuildFromBatch mjBuildFrom = iota 62 // mjBuildFromBufferedGroup indicates that we should be building from the 63 // buffered group. Note that in such case we might have at most one group to 64 // build. 65 mjBuildFromBufferedGroup 66 ) 67 68 // mjBuilderState contains all the state required to execute the build phase. 69 type mjBuilderState struct { 70 buildFrom mjBuildFrom 71 72 // Fields to identify the groups in the input sources. 73 lGroups []group 74 rGroups []group 75 76 // outCount keeps record of the current number of rows in the output. 77 outCount int 78 // outFinished is used to determine if the builder is finished outputting 79 // the groups from input. 80 outFinished bool 81 82 // lBufferedGroupBatch and rBufferedGroupBatch are the current batches that 83 // we're building from when we're building the buffered group. 84 lBufferedGroupBatch coldata.Batch 85 rBufferedGroupBatch coldata.Batch 86 87 // Cross product materialization state. 88 left mjBuilderCrossProductState 89 right mjBuilderCrossProductState 90 } 91 92 // mjBuilderCrossProductState is used to keep track of builder state within the 93 // loops to materialize the cross product. Useful for picking up where we left 94 // off. 95 type mjBuilderCrossProductState struct { 96 groupsIdx int 97 curSrcStartIdx int 98 numRepeatsIdx int 99 // setOpLeftSrcIdx tracks the next tuple's index from the left buffered 100 // group for set operation joins. INTERSECT ALL and EXCEPT ALL joins are 101 // special because they need to emit the buffered group partially (namely, 102 // exactly group.rowEndIdx number of rows which could span multiple batches 103 // from the buffered group). 104 setOpLeftSrcIdx int 105 } 106 107 // mjBufferedGroup is a helper struct that stores information about the tuples 108 // from both inputs for the buffered group. 109 type mjBufferedGroup struct { 110 *spillingQueue 111 // firstTuple stores a single tuple that was first in the buffered group. 112 firstTuple []coldata.Vec 113 numTuples int 114 } 115 116 func (bg *mjBufferedGroup) reset(ctx context.Context) { 117 if bg.spillingQueue != nil { 118 bg.spillingQueue.reset(ctx) 119 } 120 bg.numTuples = 0 121 } 122 123 func (bg *mjBufferedGroup) close(ctx context.Context) error { 124 if bg.spillingQueue != nil { 125 if err := bg.spillingQueue.close(ctx); err != nil { 126 return err 127 } 128 bg.spillingQueue = nil 129 } 130 return nil 131 } 132 133 // mjProberState contains all the state required to execute in the probing 134 // phase. 135 type mjProberState struct { 136 // Fields to save the "working" batches to state in between outputs. 137 lBatch coldata.Batch 138 rBatch coldata.Batch 139 lIdx int 140 lLength int 141 rIdx int 142 rLength int 143 144 // Local buffer for the last left and right groups which is used when the 145 // group ends with a batch and the group on each side needs to be saved to 146 // state in order to be able to continue it in the next batch. 147 lBufferedGroup mjBufferedGroup 148 rBufferedGroup mjBufferedGroup 149 lBufferedGroupNeedToReset bool 150 rBufferedGroupNeedToReset bool 151 } 152 153 // mjState represents the state of the merge joiner. 154 type mjState int 155 156 const ( 157 // mjEntry is the entry state of the merge joiner where all the batches and 158 // indices are properly set, regardless if Next was called the first time or 159 // the 1000th time. This state also routes into the correct state based on 160 // the prober state after setup. 161 mjEntry mjState = iota 162 163 // mjSourceFinished is the state in which one of the input sources has no 164 // more available batches, thus signaling that the joiner should begin 165 // wrapping up execution by outputting any remaining groups in state. 166 mjSourceFinished 167 168 // mjFinishBufferedGroup is the state in which the previous state resulted in 169 // a group that ended with a batch. Such a group was buffered, and this state 170 // finishes that group and builds the output. 171 mjFinishBufferedGroup 172 173 // mjProbe is the main probing state in which the groups for the current 174 // batch are determined. 175 mjProbe 176 177 // mjBuild is the state in which the groups determined by the probing states 178 // are built, i.e. materialized to the output member by creating the cross 179 // product. 180 mjBuild 181 182 // mjDone is the final state of the merge joiner in which it'll be returning 183 // only zero-length batches. In this state, the disk infrastructure is 184 // cleaned up. 185 mjDone 186 ) 187 188 type mergeJoinInput struct { 189 // eqCols specify the indices of the source table equality columns during the 190 // merge join. 191 eqCols []uint32 192 193 // directions specifies the ordering direction of each column. Note that each 194 // direction corresponds to an equality column at the same location, i.e. the 195 // direction of eqCols[x] is encoded at directions[x], or 196 // len(eqCols) == len(directions). 197 directions []execinfrapb.Ordering_Column_Direction 198 199 // sourceTypes specify the types of the input columns of the source table for 200 // the merge joiner. 201 sourceTypes []*types.T 202 // canonicalTypeFamilies stores the canonical type families from 203 // sourceTypes. It is stored explicitly rather than being converted at 204 // runtime because that conversion would occur in tight loops and 205 // noticeably hurt the performance. 206 canonicalTypeFamilies []types.Family 207 208 // The distincter is used in the finishGroup phase, and is used only to 209 // determine where the current group ends, in the case that the group ended 210 // with a batch. 211 distincterInput *feedOperator 212 distincter colexecbase.Operator 213 distinctOutput []bool 214 215 // source specifies the input operator to the merge join. 216 source colexecbase.Operator 217 } 218 219 // The merge join operator uses a probe and build approach to generate the 220 // join. What this means is that instead of going through and expanding the 221 // cross product row by row, the operator performs two passes. 222 // The first pass generates a list of groups of matching rows based on the 223 // equality columns (where a "group" represents a contiguous set of rows that 224 // match on the equality columns). 225 // The second pass is where the groups and their associated cross products are 226 // materialized into the full output. 227 228 // Two buffers are used, one for the group on the left table and one for the 229 // group on the right table. These buffers are only used if the group ends with 230 // a batch, to make sure that we don't miss any cross product entries while 231 // expanding the groups (leftGroups and rightGroups) when a group spans 232 // multiple batches. 233 234 // newMergeJoinOp returns a new merge join operator with the given spec that 235 // implements sort-merge join. It performs a merge on the left and right input 236 // sources, based on the equality columns, assuming both inputs are in sorted 237 // order. 238 func newMergeJoinOp( 239 unlimitedAllocator *colmem.Allocator, 240 memoryLimit int64, 241 diskQueueCfg colcontainer.DiskQueueCfg, 242 fdSemaphore semaphore.Semaphore, 243 joinType sqlbase.JoinType, 244 left colexecbase.Operator, 245 right colexecbase.Operator, 246 leftTypes []*types.T, 247 rightTypes []*types.T, 248 leftOrdering []execinfrapb.Ordering_Column, 249 rightOrdering []execinfrapb.Ordering_Column, 250 diskAcc *mon.BoundAccount, 251 ) (resettableOperator, error) { 252 base, err := newMergeJoinBase( 253 unlimitedAllocator, memoryLimit, diskQueueCfg, fdSemaphore, joinType, 254 left, right, leftTypes, rightTypes, leftOrdering, rightOrdering, diskAcc, 255 ) 256 switch joinType { 257 case sqlbase.InnerJoin: 258 return &mergeJoinInnerOp{base}, err 259 case sqlbase.LeftOuterJoin: 260 return &mergeJoinLeftOuterOp{base}, err 261 case sqlbase.RightOuterJoin: 262 return &mergeJoinRightOuterOp{base}, err 263 case sqlbase.FullOuterJoin: 264 return &mergeJoinFullOuterOp{base}, err 265 case sqlbase.LeftSemiJoin: 266 return &mergeJoinLeftSemiOp{base}, err 267 case sqlbase.LeftAntiJoin: 268 return &mergeJoinLeftAntiOp{base}, err 269 case sqlbase.IntersectAllJoin: 270 return &mergeJoinIntersectAllOp{base}, err 271 case sqlbase.ExceptAllJoin: 272 return &mergeJoinExceptAllOp{base}, err 273 default: 274 return nil, errors.AssertionFailedf("merge join of type %s not supported", joinType) 275 } 276 } 277 278 // Const declarations for the merge joiner cross product (MJCP) zero state. 279 const ( 280 zeroMJCPGroupsIdx = 0 281 // The sentinel value for curSrcStartIdx is -1, as this: 282 // a) indicates that a src has not been started 283 // b) panics if the sentinel isn't checked 284 zeroMJCPCurSrcStartIdx = -1 285 zeroMJCPNumRepeatsIdx = 0 286 ) 287 288 // Package level struct for easy access to the MJCP zero state. 289 var zeroMJBuilderState = mjBuilderCrossProductState{ 290 groupsIdx: zeroMJCPGroupsIdx, 291 curSrcStartIdx: zeroMJCPCurSrcStartIdx, 292 numRepeatsIdx: zeroMJCPNumRepeatsIdx, 293 } 294 295 func (s *mjBuilderCrossProductState) reset() { 296 s.setBuilderColumnState(zeroMJBuilderState) 297 } 298 299 func (s *mjBuilderCrossProductState) setBuilderColumnState(target mjBuilderCrossProductState) { 300 s.groupsIdx = target.groupsIdx 301 s.curSrcStartIdx = target.curSrcStartIdx 302 s.numRepeatsIdx = target.numRepeatsIdx 303 s.setOpLeftSrcIdx = target.setOpLeftSrcIdx 304 } 305 306 func newMergeJoinBase( 307 unlimitedAllocator *colmem.Allocator, 308 memoryLimit int64, 309 diskQueueCfg colcontainer.DiskQueueCfg, 310 fdSemaphore semaphore.Semaphore, 311 joinType sqlbase.JoinType, 312 left colexecbase.Operator, 313 right colexecbase.Operator, 314 leftTypes []*types.T, 315 rightTypes []*types.T, 316 leftOrdering []execinfrapb.Ordering_Column, 317 rightOrdering []execinfrapb.Ordering_Column, 318 diskAcc *mon.BoundAccount, 319 ) (*mergeJoinBase, error) { 320 lEqCols := make([]uint32, len(leftOrdering)) 321 lDirections := make([]execinfrapb.Ordering_Column_Direction, len(leftOrdering)) 322 for i, c := range leftOrdering { 323 lEqCols[i] = c.ColIdx 324 lDirections[i] = c.Direction 325 } 326 327 rEqCols := make([]uint32, len(rightOrdering)) 328 rDirections := make([]execinfrapb.Ordering_Column_Direction, len(rightOrdering)) 329 for i, c := range rightOrdering { 330 rEqCols[i] = c.ColIdx 331 rDirections[i] = c.Direction 332 } 333 334 diskQueueCfg.CacheMode = colcontainer.DiskQueueCacheModeReuseCache 335 diskQueueCfg.SetDefaultBufferSizeBytesForCacheMode() 336 base := &mergeJoinBase{ 337 twoInputNode: newTwoInputNode(left, right), 338 unlimitedAllocator: unlimitedAllocator, 339 memoryLimit: memoryLimit, 340 diskQueueCfg: diskQueueCfg, 341 fdSemaphore: fdSemaphore, 342 joinType: joinType, 343 left: mergeJoinInput{ 344 source: left, 345 sourceTypes: leftTypes, 346 canonicalTypeFamilies: typeconv.ToCanonicalTypeFamilies(leftTypes), 347 eqCols: lEqCols, 348 directions: lDirections, 349 }, 350 right: mergeJoinInput{ 351 source: right, 352 sourceTypes: rightTypes, 353 canonicalTypeFamilies: typeconv.ToCanonicalTypeFamilies(rightTypes), 354 eqCols: rEqCols, 355 directions: rDirections, 356 }, 357 diskAcc: diskAcc, 358 } 359 var err error 360 base.left.distincterInput = &feedOperator{} 361 base.left.distincter, base.left.distinctOutput, err = OrderedDistinctColsToOperators( 362 base.left.distincterInput, lEqCols, leftTypes) 363 if err != nil { 364 return base, err 365 } 366 base.right.distincterInput = &feedOperator{} 367 base.right.distincter, base.right.distinctOutput, err = OrderedDistinctColsToOperators( 368 base.right.distincterInput, rEqCols, rightTypes) 369 if err != nil { 370 return base, err 371 } 372 return base, err 373 } 374 375 // mergeJoinBase extracts the common logic between all merge join operators. 376 type mergeJoinBase struct { 377 twoInputNode 378 closerHelper 379 380 // mu is used to protect against concurrent IdempotentClose and Next calls, 381 // which are currently allowed. 382 // TODO(asubiotto): Explore calling IdempotentClose from the same goroutine as 383 // Next, which will simplify this model. 384 mu syncutil.Mutex 385 386 unlimitedAllocator *colmem.Allocator 387 memoryLimit int64 388 diskQueueCfg colcontainer.DiskQueueCfg 389 fdSemaphore semaphore.Semaphore 390 joinType sqlbase.JoinType 391 left mergeJoinInput 392 right mergeJoinInput 393 394 // Output buffer definition. 395 output coldata.Batch 396 outputBatchSize int 397 // outputReady is a flag to indicate that merge joiner is ready to emit an 398 // output batch. 399 outputReady bool 400 401 // Local buffer for the "working" repeated groups. 402 groups circularGroupsBuffer 403 404 state mjState 405 proberState mjProberState 406 builderState mjBuilderState 407 scratch struct { 408 // tempVecs are temporary vectors that can be used during a cast 409 // operation in the probing phase. These vectors should *not* be 410 // exposed outside of the merge joiner. 411 tempVecs []coldata.Vec 412 // lBufferedGroupBatch and rBufferedGroupBatch are scratch batches that are 413 // used to select out the tuples that belong to the buffered batch before 414 // enqueueing them into corresponding mjBufferedGroups. These are lazily 415 // instantiated. 416 // TODO(yuzefovich): uncomment when spillingQueue actually copies the 417 // enqueued batches when those are kept in memory. 418 //lBufferedGroupBatch coldata.Batch 419 //rBufferedGroupBatch coldata.Batch 420 } 421 422 diskAcc *mon.BoundAccount 423 } 424 425 var _ resetter = &mergeJoinBase{} 426 var _ IdempotentCloser = &mergeJoinBase{} 427 428 func (o *mergeJoinBase) reset(ctx context.Context) { 429 if r, ok := o.left.source.(resetter); ok { 430 r.reset(ctx) 431 } 432 if r, ok := o.right.source.(resetter); ok { 433 r.reset(ctx) 434 } 435 o.outputReady = false 436 o.state = mjEntry 437 o.proberState.lBatch = nil 438 o.proberState.rBatch = nil 439 o.proberState.lBufferedGroup.reset(ctx) 440 o.proberState.rBufferedGroup.reset(ctx) 441 o.proberState.lBufferedGroupNeedToReset = false 442 o.proberState.rBufferedGroupNeedToReset = false 443 o.resetBuilderCrossProductState() 444 } 445 446 func (o *mergeJoinBase) InternalMemoryUsage() int { 447 const sizeOfGroup = int(unsafe.Sizeof(group{})) 448 return 8 * coldata.BatchSize() * sizeOfGroup // o.groups 449 } 450 451 func (o *mergeJoinBase) Init() { 452 o.initWithOutputBatchSize(coldata.BatchSize()) 453 } 454 455 func (o *mergeJoinBase) initWithOutputBatchSize(outBatchSize int) { 456 outputTypes := append([]*types.T{}, o.left.sourceTypes...) 457 if o.joinType.ShouldIncludeRightColsInOutput() { 458 outputTypes = append(outputTypes, o.right.sourceTypes...) 459 } 460 o.output = o.unlimitedAllocator.NewMemBatchWithSize(outputTypes, outBatchSize) 461 o.left.source.Init() 462 o.right.source.Init() 463 o.outputBatchSize = outBatchSize 464 // If there are no output columns, then the operator is for a COUNT query, 465 // in which case we treat the output batch size as the max int. 466 if o.output.Width() == 0 { 467 o.outputBatchSize = math.MaxInt64 468 } 469 470 o.proberState.lBufferedGroup.spillingQueue = newSpillingQueue( 471 o.unlimitedAllocator, o.left.sourceTypes, o.memoryLimit, 472 o.diskQueueCfg, o.fdSemaphore, coldata.BatchSize(), o.diskAcc, 473 ) 474 o.proberState.lBufferedGroup.firstTuple = make([]coldata.Vec, len(o.left.sourceTypes)) 475 for colIdx, t := range o.left.sourceTypes { 476 o.proberState.lBufferedGroup.firstTuple[colIdx] = o.unlimitedAllocator.NewMemColumn(t, 1) 477 } 478 o.proberState.rBufferedGroup.spillingQueue = newRewindableSpillingQueue( 479 o.unlimitedAllocator, o.right.sourceTypes, o.memoryLimit, 480 o.diskQueueCfg, o.fdSemaphore, coldata.BatchSize(), o.diskAcc, 481 ) 482 o.proberState.rBufferedGroup.firstTuple = make([]coldata.Vec, len(o.right.sourceTypes)) 483 for colIdx, t := range o.right.sourceTypes { 484 o.proberState.rBufferedGroup.firstTuple[colIdx] = o.unlimitedAllocator.NewMemColumn(t, 1) 485 } 486 487 o.builderState.lGroups = make([]group, 1) 488 o.builderState.rGroups = make([]group, 1) 489 490 o.groups = makeGroupsBuffer(coldata.BatchSize()) 491 o.resetBuilderCrossProductState() 492 } 493 494 func (o *mergeJoinBase) resetBuilderCrossProductState() { 495 o.builderState.left.reset() 496 o.builderState.right.reset() 497 } 498 499 // appendToBufferedGroup appends all the tuples from batch that are part of the 500 // same group as the ones in the buffered group that corresponds to the input 501 // source. This needs to happen when a group starts at the end of an input 502 // batch and can continue into the following batches. 503 func (o *mergeJoinBase) appendToBufferedGroup( 504 ctx context.Context, 505 input *mergeJoinInput, 506 batch coldata.Batch, 507 sel []int, 508 groupStartIdx int, 509 groupLength int, 510 ) { 511 if groupLength == 0 { 512 return 513 } 514 var ( 515 bufferedGroup *mjBufferedGroup 516 scratchBatch coldata.Batch 517 sourceTypes []*types.T 518 ) 519 if input == &o.left { 520 sourceTypes = o.left.sourceTypes 521 bufferedGroup = &o.proberState.lBufferedGroup 522 // TODO(yuzefovich): uncomment when spillingQueue actually copies the 523 // enqueued batches when those are kept in memory. 524 //if o.scratch.lBufferedGroupBatch == nil { 525 // o.scratch.lBufferedGroupBatch = o.unlimitedAllocator.NewMemBatch(o.left.sourceTypes) 526 //} 527 //scratchBatch = o.scratch.lBufferedGroupBatch 528 } else { 529 sourceTypes = o.right.sourceTypes 530 bufferedGroup = &o.proberState.rBufferedGroup 531 // TODO(yuzefovich): uncomment when spillingQueue actually copies the 532 // enqueued batches when those are kept in memory. 533 //if o.scratch.rBufferedGroupBatch == nil { 534 // o.scratch.rBufferedGroupBatch = o.unlimitedAllocator.NewMemBatch(o.right.sourceTypes) 535 //} 536 //scratchBatch = o.scratch.rBufferedGroupBatch 537 } 538 scratchBatch = o.unlimitedAllocator.NewMemBatchWithSize(sourceTypes, groupLength) 539 if bufferedGroup.numTuples == 0 { 540 o.unlimitedAllocator.PerformOperation(bufferedGroup.firstTuple, func() { 541 for colIdx := range sourceTypes { 542 bufferedGroup.firstTuple[colIdx].Copy( 543 coldata.CopySliceArgs{ 544 SliceArgs: coldata.SliceArgs{ 545 Src: batch.ColVec(colIdx), 546 Sel: sel, 547 DestIdx: 0, 548 SrcStartIdx: groupStartIdx, 549 SrcEndIdx: groupStartIdx + 1, 550 }, 551 }, 552 ) 553 } 554 }) 555 } 556 bufferedGroup.numTuples += groupLength 557 558 o.unlimitedAllocator.PerformOperation(scratchBatch.ColVecs(), func() { 559 for colIdx := range input.sourceTypes { 560 scratchBatch.ColVec(colIdx).Copy( 561 coldata.CopySliceArgs{ 562 SliceArgs: coldata.SliceArgs{ 563 Src: batch.ColVec(colIdx), 564 Sel: sel, 565 DestIdx: 0, 566 SrcStartIdx: groupStartIdx, 567 SrcEndIdx: groupStartIdx + groupLength, 568 }, 569 }, 570 ) 571 } 572 }) 573 scratchBatch.SetSelection(false) 574 scratchBatch.SetLength(groupLength) 575 if err := bufferedGroup.enqueue(ctx, scratchBatch); err != nil { 576 colexecerror.InternalError(err) 577 } 578 } 579 580 // setBuilderSourceToBatch sets the builder state to use groups from the 581 // circular group buffer and the batches from input. This happens when we have 582 // groups that are fully contained within a single input batch from each of the 583 // sources. 584 func (o *mergeJoinBase) setBuilderSourceToBatch() { 585 o.builderState.lGroups, o.builderState.rGroups = o.groups.getGroups() 586 o.builderState.buildFrom = mjBuildFromBatch 587 } 588 589 // initProberState sets the batches, lengths, and current indices to the right 590 // locations given the last iteration of the operator. 591 func (o *mergeJoinBase) initProberState(ctx context.Context) { 592 // If this is the first batch or we're done with the current batch, get the 593 // next batch. 594 if o.proberState.lBatch == nil || (o.proberState.lLength != 0 && o.proberState.lIdx == o.proberState.lLength) { 595 o.proberState.lIdx, o.proberState.lBatch = 0, o.left.source.Next(ctx) 596 o.proberState.lLength = o.proberState.lBatch.Length() 597 } 598 if o.proberState.rBatch == nil || (o.proberState.rLength != 0 && o.proberState.rIdx == o.proberState.rLength) { 599 o.proberState.rIdx, o.proberState.rBatch = 0, o.right.source.Next(ctx) 600 o.proberState.rLength = o.proberState.rBatch.Length() 601 } 602 if o.proberState.lBufferedGroupNeedToReset { 603 o.proberState.lBufferedGroup.reset(ctx) 604 o.proberState.lBufferedGroupNeedToReset = false 605 } 606 if o.proberState.rBufferedGroupNeedToReset { 607 o.proberState.rBufferedGroup.reset(ctx) 608 o.proberState.rBufferedGroupNeedToReset = false 609 } 610 } 611 612 // nonEmptyBufferedGroup returns true if there is a buffered group that needs 613 // to be finished. 614 func (o *mergeJoinBase) nonEmptyBufferedGroup() bool { 615 return o.proberState.lBufferedGroup.numTuples > 0 || o.proberState.rBufferedGroup.numTuples > 0 616 } 617 618 // sourceFinished returns true if either of input sources has no more rows. 619 func (o *mergeJoinBase) sourceFinished() bool { 620 return o.proberState.lLength == 0 || o.proberState.rLength == 0 621 } 622 623 // completeBufferedGroup extends the buffered group corresponding to input. 624 // First, we check that the first row in batch is still part of the same group. 625 // If this is the case, we use the Distinct operator to find the first 626 // occurrence in batch (or subsequent batches) that doesn't match the current 627 // group. 628 // NOTE: we will be buffering all batches until we find such non-matching tuple 629 // (or until we exhaust the input). 630 // TODO(yuzefovich): this can be refactored so that only the right side does 631 // unbounded buffering. 632 // SIDE EFFECT: can append to the buffered group corresponding to the source. 633 func (o *mergeJoinBase) completeBufferedGroup( 634 ctx context.Context, input *mergeJoinInput, batch coldata.Batch, rowIdx int, 635 ) (_ coldata.Batch, idx int, batchLength int) { 636 batchLength = batch.Length() 637 if o.isBufferedGroupFinished(input, batch, rowIdx) { 638 return batch, rowIdx, batchLength 639 } 640 641 isBufferedGroupComplete := false 642 input.distincter.(resetter).reset(ctx) 643 // Ignore the first row of the distincter in the first pass since we already 644 // know that we are in the same group and, thus, the row is not distinct, 645 // regardless of what the distincter outputs. 646 loopStartIndex := 1 647 var sel []int 648 for !isBufferedGroupComplete { 649 // Note that we're not resetting the distincter on every loop iteration 650 // because if we're doing the second, third, etc, iteration, then all the 651 // previous iterations had only the matching tuples to the buffered group, 652 // so the distincter - in a sense - compares the incoming tuples to the 653 // first tuple of the first iteration (which we know is the same group). 654 input.distincterInput.batch = batch 655 input.distincter.Next(ctx) 656 657 sel = batch.Selection() 658 var groupLength int 659 if sel != nil { 660 for groupLength = loopStartIndex; groupLength < batchLength; groupLength++ { 661 if input.distinctOutput[sel[groupLength]] { 662 // We found the beginning of a new group! 663 isBufferedGroupComplete = true 664 break 665 } 666 } 667 } else { 668 for groupLength = loopStartIndex; groupLength < batchLength; groupLength++ { 669 if input.distinctOutput[groupLength] { 670 // We found the beginning of a new group! 671 isBufferedGroupComplete = true 672 break 673 } 674 } 675 } 676 677 // Zero out the distinct output for the next pass. 678 copy(input.distinctOutput[:batchLength], zeroBoolColumn) 679 loopStartIndex = 0 680 681 // Buffer all the tuples that are part of the buffered group. 682 o.appendToBufferedGroup(ctx, input, batch, sel, rowIdx, groupLength) 683 rowIdx += groupLength 684 685 if !isBufferedGroupComplete { 686 // The buffered group is still not complete which means that we have 687 // just appended all the tuples from batch to it, so we need to get a 688 // fresh batch from the input. 689 rowIdx, batch = 0, input.source.Next(ctx) 690 batchLength = batch.Length() 691 if batchLength == 0 { 692 // The input has been exhausted, so the buffered group is now complete. 693 isBufferedGroupComplete = true 694 } 695 } 696 } 697 698 return batch, rowIdx, batchLength 699 } 700 701 // finishProbe completes the buffered groups on both sides of the input. 702 func (o *mergeJoinBase) finishProbe(ctx context.Context) { 703 o.proberState.lBatch, o.proberState.lIdx, o.proberState.lLength = o.completeBufferedGroup( 704 ctx, 705 &o.left, 706 o.proberState.lBatch, 707 o.proberState.lIdx, 708 ) 709 o.proberState.rBatch, o.proberState.rIdx, o.proberState.rLength = o.completeBufferedGroup( 710 ctx, 711 &o.right, 712 o.proberState.rBatch, 713 o.proberState.rIdx, 714 ) 715 } 716 717 func (o *mergeJoinBase) IdempotentClose(ctx context.Context) error { 718 o.mu.Lock() 719 defer o.mu.Unlock() 720 if !o.close() { 721 return nil 722 } 723 var lastErr error 724 for _, op := range []colexecbase.Operator{o.left.source, o.right.source} { 725 if c, ok := op.(IdempotentCloser); ok { 726 if err := c.IdempotentClose(ctx); err != nil { 727 lastErr = err 728 } 729 } 730 } 731 if err := o.proberState.lBufferedGroup.close(ctx); err != nil { 732 lastErr = err 733 } 734 if err := o.proberState.rBufferedGroup.close(ctx); err != nil { 735 lastErr = err 736 } 737 return lastErr 738 }