github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/mergejoiner.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colexec
    12  
    13  import (
    14  	"context"
    15  	"math"
    16  	"unsafe"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    19  	"github.com/cockroachdb/cockroach/pkg/col/typeconv"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/colcontainer"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    26  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    27  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    28  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    29  	"github.com/cockroachdb/errors"
    30  	"github.com/marusama/semaphore"
    31  )
    32  
    33  // group is an ADT representing a contiguous set of rows that match on their
    34  // equality columns.
    35  type group struct {
    36  	rowStartIdx int
    37  	rowEndIdx   int
    38  	// numRepeats is used when expanding each group into a cross product in the
    39  	// build phase.
    40  	numRepeats int
    41  	// toBuild is used in the build phase to determine the right output count.
    42  	// This field should stay in sync with the builder over time.
    43  	toBuild int
    44  	// nullGroup indicates whether the output corresponding to the group should
    45  	// consist of all nulls.
    46  	nullGroup bool
    47  	// unmatched indicates that the rows in the group do not have matching rows
    48  	// from the other side (i.e. other side's group will be a null group).
    49  	// NOTE: during the probing phase, the assumption is that such group will
    50  	// consist of a single row.
    51  	unmatched bool
    52  }
    53  
    54  // mjBuildFrom is an indicator of which source we're building the output from.
    55  type mjBuildFrom int
    56  
    57  const (
    58  	// mjBuildFromBatch indicates that we should be building from the current
    59  	// probing batches. Note that in such case we might have multiple groups to
    60  	// build.
    61  	mjBuildFromBatch mjBuildFrom = iota
    62  	// mjBuildFromBufferedGroup indicates that we should be building from the
    63  	// buffered group. Note that in such case we might have at most one group to
    64  	// build.
    65  	mjBuildFromBufferedGroup
    66  )
    67  
    68  // mjBuilderState contains all the state required to execute the build phase.
    69  type mjBuilderState struct {
    70  	buildFrom mjBuildFrom
    71  
    72  	// Fields to identify the groups in the input sources.
    73  	lGroups []group
    74  	rGroups []group
    75  
    76  	// outCount keeps record of the current number of rows in the output.
    77  	outCount int
    78  	// outFinished is used to determine if the builder is finished outputting
    79  	// the groups from input.
    80  	outFinished bool
    81  
    82  	// lBufferedGroupBatch and rBufferedGroupBatch are the current batches that
    83  	// we're building from when we're building the buffered group.
    84  	lBufferedGroupBatch coldata.Batch
    85  	rBufferedGroupBatch coldata.Batch
    86  
    87  	// Cross product materialization state.
    88  	left  mjBuilderCrossProductState
    89  	right mjBuilderCrossProductState
    90  }
    91  
    92  // mjBuilderCrossProductState is used to keep track of builder state within the
    93  // loops to materialize the cross product. Useful for picking up where we left
    94  // off.
    95  type mjBuilderCrossProductState struct {
    96  	groupsIdx      int
    97  	curSrcStartIdx int
    98  	numRepeatsIdx  int
    99  	// setOpLeftSrcIdx tracks the next tuple's index from the left buffered
   100  	// group for set operation joins. INTERSECT ALL and EXCEPT ALL joins are
   101  	// special because they need to emit the buffered group partially (namely,
   102  	// exactly group.rowEndIdx number of rows which could span multiple batches
   103  	// from the buffered group).
   104  	setOpLeftSrcIdx int
   105  }
   106  
   107  // mjBufferedGroup is a helper struct that stores information about the tuples
   108  // from both inputs for the buffered group.
   109  type mjBufferedGroup struct {
   110  	*spillingQueue
   111  	// firstTuple stores a single tuple that was first in the buffered group.
   112  	firstTuple []coldata.Vec
   113  	numTuples  int
   114  }
   115  
   116  func (bg *mjBufferedGroup) reset(ctx context.Context) {
   117  	if bg.spillingQueue != nil {
   118  		bg.spillingQueue.reset(ctx)
   119  	}
   120  	bg.numTuples = 0
   121  }
   122  
   123  func (bg *mjBufferedGroup) close(ctx context.Context) error {
   124  	if bg.spillingQueue != nil {
   125  		if err := bg.spillingQueue.close(ctx); err != nil {
   126  			return err
   127  		}
   128  		bg.spillingQueue = nil
   129  	}
   130  	return nil
   131  }
   132  
   133  // mjProberState contains all the state required to execute in the probing
   134  // phase.
   135  type mjProberState struct {
   136  	// Fields to save the "working" batches to state in between outputs.
   137  	lBatch  coldata.Batch
   138  	rBatch  coldata.Batch
   139  	lIdx    int
   140  	lLength int
   141  	rIdx    int
   142  	rLength int
   143  
   144  	// Local buffer for the last left and right groups which is used when the
   145  	// group ends with a batch and the group on each side needs to be saved to
   146  	// state in order to be able to continue it in the next batch.
   147  	lBufferedGroup            mjBufferedGroup
   148  	rBufferedGroup            mjBufferedGroup
   149  	lBufferedGroupNeedToReset bool
   150  	rBufferedGroupNeedToReset bool
   151  }
   152  
   153  // mjState represents the state of the merge joiner.
   154  type mjState int
   155  
   156  const (
   157  	// mjEntry is the entry state of the merge joiner where all the batches and
   158  	// indices are properly set, regardless if Next was called the first time or
   159  	// the 1000th time. This state also routes into the correct state based on
   160  	// the prober state after setup.
   161  	mjEntry mjState = iota
   162  
   163  	// mjSourceFinished is the state in which one of the input sources has no
   164  	// more available batches, thus signaling that the joiner should begin
   165  	// wrapping up execution by outputting any remaining groups in state.
   166  	mjSourceFinished
   167  
   168  	// mjFinishBufferedGroup is the state in which the previous state resulted in
   169  	// a group that ended with a batch. Such a group was buffered, and this state
   170  	// finishes that group and builds the output.
   171  	mjFinishBufferedGroup
   172  
   173  	// mjProbe is the main probing state in which the groups for the current
   174  	// batch are determined.
   175  	mjProbe
   176  
   177  	// mjBuild is the state in which the groups determined by the probing states
   178  	// are built, i.e. materialized to the output member by creating the cross
   179  	// product.
   180  	mjBuild
   181  
   182  	// mjDone is the final state of the merge joiner in which it'll be returning
   183  	// only zero-length batches. In this state, the disk infrastructure is
   184  	// cleaned up.
   185  	mjDone
   186  )
   187  
   188  type mergeJoinInput struct {
   189  	// eqCols specify the indices of the source table equality columns during the
   190  	// merge join.
   191  	eqCols []uint32
   192  
   193  	// directions specifies the ordering direction of each column. Note that each
   194  	// direction corresponds to an equality column at the same location, i.e. the
   195  	// direction of eqCols[x] is encoded at directions[x], or
   196  	// len(eqCols) == len(directions).
   197  	directions []execinfrapb.Ordering_Column_Direction
   198  
   199  	// sourceTypes specify the types of the input columns of the source table for
   200  	// the merge joiner.
   201  	sourceTypes []*types.T
   202  	// canonicalTypeFamilies stores the canonical type families from
   203  	// sourceTypes. It is stored explicitly rather than being converted at
   204  	// runtime because that conversion would occur in tight loops and
   205  	// noticeably hurt the performance.
   206  	canonicalTypeFamilies []types.Family
   207  
   208  	// The distincter is used in the finishGroup phase, and is used only to
   209  	// determine where the current group ends, in the case that the group ended
   210  	// with a batch.
   211  	distincterInput *feedOperator
   212  	distincter      colexecbase.Operator
   213  	distinctOutput  []bool
   214  
   215  	// source specifies the input operator to the merge join.
   216  	source colexecbase.Operator
   217  }
   218  
   219  // The merge join operator uses a probe and build approach to generate the
   220  // join. What this means is that instead of going through and expanding the
   221  // cross product row by row, the operator performs two passes.
   222  // The first pass generates a list of groups of matching rows based on the
   223  // equality columns (where a "group" represents a contiguous set of rows that
   224  // match on the equality columns).
   225  // The second pass is where the groups and their associated cross products are
   226  // materialized into the full output.
   227  
   228  // Two buffers are used, one for the group on the left table and one for the
   229  // group on the right table. These buffers are only used if the group ends with
   230  // a batch, to make sure that we don't miss any cross product entries while
   231  // expanding the groups (leftGroups and rightGroups) when a group spans
   232  // multiple batches.
   233  
   234  // newMergeJoinOp returns a new merge join operator with the given spec that
   235  // implements sort-merge join. It performs a merge on the left and right input
   236  // sources, based on the equality columns, assuming both inputs are in sorted
   237  // order.
   238  func newMergeJoinOp(
   239  	unlimitedAllocator *colmem.Allocator,
   240  	memoryLimit int64,
   241  	diskQueueCfg colcontainer.DiskQueueCfg,
   242  	fdSemaphore semaphore.Semaphore,
   243  	joinType sqlbase.JoinType,
   244  	left colexecbase.Operator,
   245  	right colexecbase.Operator,
   246  	leftTypes []*types.T,
   247  	rightTypes []*types.T,
   248  	leftOrdering []execinfrapb.Ordering_Column,
   249  	rightOrdering []execinfrapb.Ordering_Column,
   250  	diskAcc *mon.BoundAccount,
   251  ) (resettableOperator, error) {
   252  	base, err := newMergeJoinBase(
   253  		unlimitedAllocator, memoryLimit, diskQueueCfg, fdSemaphore, joinType,
   254  		left, right, leftTypes, rightTypes, leftOrdering, rightOrdering, diskAcc,
   255  	)
   256  	switch joinType {
   257  	case sqlbase.InnerJoin:
   258  		return &mergeJoinInnerOp{base}, err
   259  	case sqlbase.LeftOuterJoin:
   260  		return &mergeJoinLeftOuterOp{base}, err
   261  	case sqlbase.RightOuterJoin:
   262  		return &mergeJoinRightOuterOp{base}, err
   263  	case sqlbase.FullOuterJoin:
   264  		return &mergeJoinFullOuterOp{base}, err
   265  	case sqlbase.LeftSemiJoin:
   266  		return &mergeJoinLeftSemiOp{base}, err
   267  	case sqlbase.LeftAntiJoin:
   268  		return &mergeJoinLeftAntiOp{base}, err
   269  	case sqlbase.IntersectAllJoin:
   270  		return &mergeJoinIntersectAllOp{base}, err
   271  	case sqlbase.ExceptAllJoin:
   272  		return &mergeJoinExceptAllOp{base}, err
   273  	default:
   274  		return nil, errors.AssertionFailedf("merge join of type %s not supported", joinType)
   275  	}
   276  }
   277  
   278  // Const declarations for the merge joiner cross product (MJCP) zero state.
   279  const (
   280  	zeroMJCPGroupsIdx = 0
   281  	// The sentinel value for curSrcStartIdx is -1, as this:
   282  	// a) indicates that a src has not been started
   283  	// b) panics if the sentinel isn't checked
   284  	zeroMJCPCurSrcStartIdx = -1
   285  	zeroMJCPNumRepeatsIdx  = 0
   286  )
   287  
   288  // Package level struct for easy access to the MJCP zero state.
   289  var zeroMJBuilderState = mjBuilderCrossProductState{
   290  	groupsIdx:      zeroMJCPGroupsIdx,
   291  	curSrcStartIdx: zeroMJCPCurSrcStartIdx,
   292  	numRepeatsIdx:  zeroMJCPNumRepeatsIdx,
   293  }
   294  
   295  func (s *mjBuilderCrossProductState) reset() {
   296  	s.setBuilderColumnState(zeroMJBuilderState)
   297  }
   298  
   299  func (s *mjBuilderCrossProductState) setBuilderColumnState(target mjBuilderCrossProductState) {
   300  	s.groupsIdx = target.groupsIdx
   301  	s.curSrcStartIdx = target.curSrcStartIdx
   302  	s.numRepeatsIdx = target.numRepeatsIdx
   303  	s.setOpLeftSrcIdx = target.setOpLeftSrcIdx
   304  }
   305  
   306  func newMergeJoinBase(
   307  	unlimitedAllocator *colmem.Allocator,
   308  	memoryLimit int64,
   309  	diskQueueCfg colcontainer.DiskQueueCfg,
   310  	fdSemaphore semaphore.Semaphore,
   311  	joinType sqlbase.JoinType,
   312  	left colexecbase.Operator,
   313  	right colexecbase.Operator,
   314  	leftTypes []*types.T,
   315  	rightTypes []*types.T,
   316  	leftOrdering []execinfrapb.Ordering_Column,
   317  	rightOrdering []execinfrapb.Ordering_Column,
   318  	diskAcc *mon.BoundAccount,
   319  ) (*mergeJoinBase, error) {
   320  	lEqCols := make([]uint32, len(leftOrdering))
   321  	lDirections := make([]execinfrapb.Ordering_Column_Direction, len(leftOrdering))
   322  	for i, c := range leftOrdering {
   323  		lEqCols[i] = c.ColIdx
   324  		lDirections[i] = c.Direction
   325  	}
   326  
   327  	rEqCols := make([]uint32, len(rightOrdering))
   328  	rDirections := make([]execinfrapb.Ordering_Column_Direction, len(rightOrdering))
   329  	for i, c := range rightOrdering {
   330  		rEqCols[i] = c.ColIdx
   331  		rDirections[i] = c.Direction
   332  	}
   333  
   334  	diskQueueCfg.CacheMode = colcontainer.DiskQueueCacheModeReuseCache
   335  	diskQueueCfg.SetDefaultBufferSizeBytesForCacheMode()
   336  	base := &mergeJoinBase{
   337  		twoInputNode:       newTwoInputNode(left, right),
   338  		unlimitedAllocator: unlimitedAllocator,
   339  		memoryLimit:        memoryLimit,
   340  		diskQueueCfg:       diskQueueCfg,
   341  		fdSemaphore:        fdSemaphore,
   342  		joinType:           joinType,
   343  		left: mergeJoinInput{
   344  			source:                left,
   345  			sourceTypes:           leftTypes,
   346  			canonicalTypeFamilies: typeconv.ToCanonicalTypeFamilies(leftTypes),
   347  			eqCols:                lEqCols,
   348  			directions:            lDirections,
   349  		},
   350  		right: mergeJoinInput{
   351  			source:                right,
   352  			sourceTypes:           rightTypes,
   353  			canonicalTypeFamilies: typeconv.ToCanonicalTypeFamilies(rightTypes),
   354  			eqCols:                rEqCols,
   355  			directions:            rDirections,
   356  		},
   357  		diskAcc: diskAcc,
   358  	}
   359  	var err error
   360  	base.left.distincterInput = &feedOperator{}
   361  	base.left.distincter, base.left.distinctOutput, err = OrderedDistinctColsToOperators(
   362  		base.left.distincterInput, lEqCols, leftTypes)
   363  	if err != nil {
   364  		return base, err
   365  	}
   366  	base.right.distincterInput = &feedOperator{}
   367  	base.right.distincter, base.right.distinctOutput, err = OrderedDistinctColsToOperators(
   368  		base.right.distincterInput, rEqCols, rightTypes)
   369  	if err != nil {
   370  		return base, err
   371  	}
   372  	return base, err
   373  }
   374  
   375  // mergeJoinBase extracts the common logic between all merge join operators.
   376  type mergeJoinBase struct {
   377  	twoInputNode
   378  	closerHelper
   379  
   380  	// mu is used to protect against concurrent IdempotentClose and Next calls,
   381  	// which are currently allowed.
   382  	// TODO(asubiotto): Explore calling IdempotentClose from the same goroutine as
   383  	//  Next, which will simplify this model.
   384  	mu syncutil.Mutex
   385  
   386  	unlimitedAllocator *colmem.Allocator
   387  	memoryLimit        int64
   388  	diskQueueCfg       colcontainer.DiskQueueCfg
   389  	fdSemaphore        semaphore.Semaphore
   390  	joinType           sqlbase.JoinType
   391  	left               mergeJoinInput
   392  	right              mergeJoinInput
   393  
   394  	// Output buffer definition.
   395  	output          coldata.Batch
   396  	outputBatchSize int
   397  	// outputReady is a flag to indicate that merge joiner is ready to emit an
   398  	// output batch.
   399  	outputReady bool
   400  
   401  	// Local buffer for the "working" repeated groups.
   402  	groups circularGroupsBuffer
   403  
   404  	state        mjState
   405  	proberState  mjProberState
   406  	builderState mjBuilderState
   407  	scratch      struct {
   408  		// tempVecs are temporary vectors that can be used during a cast
   409  		// operation in the probing phase. These vectors should *not* be
   410  		// exposed outside of the merge joiner.
   411  		tempVecs []coldata.Vec
   412  		// lBufferedGroupBatch and rBufferedGroupBatch are scratch batches that are
   413  		// used to select out the tuples that belong to the buffered batch before
   414  		// enqueueing them into corresponding mjBufferedGroups. These are lazily
   415  		// instantiated.
   416  		// TODO(yuzefovich): uncomment when spillingQueue actually copies the
   417  		// enqueued batches when those are kept in memory.
   418  		//lBufferedGroupBatch coldata.Batch
   419  		//rBufferedGroupBatch coldata.Batch
   420  	}
   421  
   422  	diskAcc *mon.BoundAccount
   423  }
   424  
   425  var _ resetter = &mergeJoinBase{}
   426  var _ IdempotentCloser = &mergeJoinBase{}
   427  
   428  func (o *mergeJoinBase) reset(ctx context.Context) {
   429  	if r, ok := o.left.source.(resetter); ok {
   430  		r.reset(ctx)
   431  	}
   432  	if r, ok := o.right.source.(resetter); ok {
   433  		r.reset(ctx)
   434  	}
   435  	o.outputReady = false
   436  	o.state = mjEntry
   437  	o.proberState.lBatch = nil
   438  	o.proberState.rBatch = nil
   439  	o.proberState.lBufferedGroup.reset(ctx)
   440  	o.proberState.rBufferedGroup.reset(ctx)
   441  	o.proberState.lBufferedGroupNeedToReset = false
   442  	o.proberState.rBufferedGroupNeedToReset = false
   443  	o.resetBuilderCrossProductState()
   444  }
   445  
   446  func (o *mergeJoinBase) InternalMemoryUsage() int {
   447  	const sizeOfGroup = int(unsafe.Sizeof(group{}))
   448  	return 8 * coldata.BatchSize() * sizeOfGroup // o.groups
   449  }
   450  
   451  func (o *mergeJoinBase) Init() {
   452  	o.initWithOutputBatchSize(coldata.BatchSize())
   453  }
   454  
   455  func (o *mergeJoinBase) initWithOutputBatchSize(outBatchSize int) {
   456  	outputTypes := append([]*types.T{}, o.left.sourceTypes...)
   457  	if o.joinType.ShouldIncludeRightColsInOutput() {
   458  		outputTypes = append(outputTypes, o.right.sourceTypes...)
   459  	}
   460  	o.output = o.unlimitedAllocator.NewMemBatchWithSize(outputTypes, outBatchSize)
   461  	o.left.source.Init()
   462  	o.right.source.Init()
   463  	o.outputBatchSize = outBatchSize
   464  	// If there are no output columns, then the operator is for a COUNT query,
   465  	// in which case we treat the output batch size as the max int.
   466  	if o.output.Width() == 0 {
   467  		o.outputBatchSize = math.MaxInt64
   468  	}
   469  
   470  	o.proberState.lBufferedGroup.spillingQueue = newSpillingQueue(
   471  		o.unlimitedAllocator, o.left.sourceTypes, o.memoryLimit,
   472  		o.diskQueueCfg, o.fdSemaphore, coldata.BatchSize(), o.diskAcc,
   473  	)
   474  	o.proberState.lBufferedGroup.firstTuple = make([]coldata.Vec, len(o.left.sourceTypes))
   475  	for colIdx, t := range o.left.sourceTypes {
   476  		o.proberState.lBufferedGroup.firstTuple[colIdx] = o.unlimitedAllocator.NewMemColumn(t, 1)
   477  	}
   478  	o.proberState.rBufferedGroup.spillingQueue = newRewindableSpillingQueue(
   479  		o.unlimitedAllocator, o.right.sourceTypes, o.memoryLimit,
   480  		o.diskQueueCfg, o.fdSemaphore, coldata.BatchSize(), o.diskAcc,
   481  	)
   482  	o.proberState.rBufferedGroup.firstTuple = make([]coldata.Vec, len(o.right.sourceTypes))
   483  	for colIdx, t := range o.right.sourceTypes {
   484  		o.proberState.rBufferedGroup.firstTuple[colIdx] = o.unlimitedAllocator.NewMemColumn(t, 1)
   485  	}
   486  
   487  	o.builderState.lGroups = make([]group, 1)
   488  	o.builderState.rGroups = make([]group, 1)
   489  
   490  	o.groups = makeGroupsBuffer(coldata.BatchSize())
   491  	o.resetBuilderCrossProductState()
   492  }
   493  
   494  func (o *mergeJoinBase) resetBuilderCrossProductState() {
   495  	o.builderState.left.reset()
   496  	o.builderState.right.reset()
   497  }
   498  
   499  // appendToBufferedGroup appends all the tuples from batch that are part of the
   500  // same group as the ones in the buffered group that corresponds to the input
   501  // source. This needs to happen when a group starts at the end of an input
   502  // batch and can continue into the following batches.
   503  func (o *mergeJoinBase) appendToBufferedGroup(
   504  	ctx context.Context,
   505  	input *mergeJoinInput,
   506  	batch coldata.Batch,
   507  	sel []int,
   508  	groupStartIdx int,
   509  	groupLength int,
   510  ) {
   511  	if groupLength == 0 {
   512  		return
   513  	}
   514  	var (
   515  		bufferedGroup *mjBufferedGroup
   516  		scratchBatch  coldata.Batch
   517  		sourceTypes   []*types.T
   518  	)
   519  	if input == &o.left {
   520  		sourceTypes = o.left.sourceTypes
   521  		bufferedGroup = &o.proberState.lBufferedGroup
   522  		// TODO(yuzefovich): uncomment when spillingQueue actually copies the
   523  		// enqueued batches when those are kept in memory.
   524  		//if o.scratch.lBufferedGroupBatch == nil {
   525  		//	o.scratch.lBufferedGroupBatch = o.unlimitedAllocator.NewMemBatch(o.left.sourceTypes)
   526  		//}
   527  		//scratchBatch = o.scratch.lBufferedGroupBatch
   528  	} else {
   529  		sourceTypes = o.right.sourceTypes
   530  		bufferedGroup = &o.proberState.rBufferedGroup
   531  		// TODO(yuzefovich): uncomment when spillingQueue actually copies the
   532  		// enqueued batches when those are kept in memory.
   533  		//if o.scratch.rBufferedGroupBatch == nil {
   534  		//	o.scratch.rBufferedGroupBatch = o.unlimitedAllocator.NewMemBatch(o.right.sourceTypes)
   535  		//}
   536  		//scratchBatch = o.scratch.rBufferedGroupBatch
   537  	}
   538  	scratchBatch = o.unlimitedAllocator.NewMemBatchWithSize(sourceTypes, groupLength)
   539  	if bufferedGroup.numTuples == 0 {
   540  		o.unlimitedAllocator.PerformOperation(bufferedGroup.firstTuple, func() {
   541  			for colIdx := range sourceTypes {
   542  				bufferedGroup.firstTuple[colIdx].Copy(
   543  					coldata.CopySliceArgs{
   544  						SliceArgs: coldata.SliceArgs{
   545  							Src:         batch.ColVec(colIdx),
   546  							Sel:         sel,
   547  							DestIdx:     0,
   548  							SrcStartIdx: groupStartIdx,
   549  							SrcEndIdx:   groupStartIdx + 1,
   550  						},
   551  					},
   552  				)
   553  			}
   554  		})
   555  	}
   556  	bufferedGroup.numTuples += groupLength
   557  
   558  	o.unlimitedAllocator.PerformOperation(scratchBatch.ColVecs(), func() {
   559  		for colIdx := range input.sourceTypes {
   560  			scratchBatch.ColVec(colIdx).Copy(
   561  				coldata.CopySliceArgs{
   562  					SliceArgs: coldata.SliceArgs{
   563  						Src:         batch.ColVec(colIdx),
   564  						Sel:         sel,
   565  						DestIdx:     0,
   566  						SrcStartIdx: groupStartIdx,
   567  						SrcEndIdx:   groupStartIdx + groupLength,
   568  					},
   569  				},
   570  			)
   571  		}
   572  	})
   573  	scratchBatch.SetSelection(false)
   574  	scratchBatch.SetLength(groupLength)
   575  	if err := bufferedGroup.enqueue(ctx, scratchBatch); err != nil {
   576  		colexecerror.InternalError(err)
   577  	}
   578  }
   579  
   580  // setBuilderSourceToBatch sets the builder state to use groups from the
   581  // circular group buffer and the batches from input. This happens when we have
   582  // groups that are fully contained within a single input batch from each of the
   583  // sources.
   584  func (o *mergeJoinBase) setBuilderSourceToBatch() {
   585  	o.builderState.lGroups, o.builderState.rGroups = o.groups.getGroups()
   586  	o.builderState.buildFrom = mjBuildFromBatch
   587  }
   588  
   589  // initProberState sets the batches, lengths, and current indices to the right
   590  // locations given the last iteration of the operator.
   591  func (o *mergeJoinBase) initProberState(ctx context.Context) {
   592  	// If this is the first batch or we're done with the current batch, get the
   593  	// next batch.
   594  	if o.proberState.lBatch == nil || (o.proberState.lLength != 0 && o.proberState.lIdx == o.proberState.lLength) {
   595  		o.proberState.lIdx, o.proberState.lBatch = 0, o.left.source.Next(ctx)
   596  		o.proberState.lLength = o.proberState.lBatch.Length()
   597  	}
   598  	if o.proberState.rBatch == nil || (o.proberState.rLength != 0 && o.proberState.rIdx == o.proberState.rLength) {
   599  		o.proberState.rIdx, o.proberState.rBatch = 0, o.right.source.Next(ctx)
   600  		o.proberState.rLength = o.proberState.rBatch.Length()
   601  	}
   602  	if o.proberState.lBufferedGroupNeedToReset {
   603  		o.proberState.lBufferedGroup.reset(ctx)
   604  		o.proberState.lBufferedGroupNeedToReset = false
   605  	}
   606  	if o.proberState.rBufferedGroupNeedToReset {
   607  		o.proberState.rBufferedGroup.reset(ctx)
   608  		o.proberState.rBufferedGroupNeedToReset = false
   609  	}
   610  }
   611  
   612  // nonEmptyBufferedGroup returns true if there is a buffered group that needs
   613  // to be finished.
   614  func (o *mergeJoinBase) nonEmptyBufferedGroup() bool {
   615  	return o.proberState.lBufferedGroup.numTuples > 0 || o.proberState.rBufferedGroup.numTuples > 0
   616  }
   617  
   618  // sourceFinished returns true if either of input sources has no more rows.
   619  func (o *mergeJoinBase) sourceFinished() bool {
   620  	return o.proberState.lLength == 0 || o.proberState.rLength == 0
   621  }
   622  
   623  // completeBufferedGroup extends the buffered group corresponding to input.
   624  // First, we check that the first row in batch is still part of the same group.
   625  // If this is the case, we use the Distinct operator to find the first
   626  // occurrence in batch (or subsequent batches) that doesn't match the current
   627  // group.
   628  // NOTE: we will be buffering all batches until we find such non-matching tuple
   629  // (or until we exhaust the input).
   630  // TODO(yuzefovich): this can be refactored so that only the right side does
   631  // unbounded buffering.
   632  // SIDE EFFECT: can append to the buffered group corresponding to the source.
   633  func (o *mergeJoinBase) completeBufferedGroup(
   634  	ctx context.Context, input *mergeJoinInput, batch coldata.Batch, rowIdx int,
   635  ) (_ coldata.Batch, idx int, batchLength int) {
   636  	batchLength = batch.Length()
   637  	if o.isBufferedGroupFinished(input, batch, rowIdx) {
   638  		return batch, rowIdx, batchLength
   639  	}
   640  
   641  	isBufferedGroupComplete := false
   642  	input.distincter.(resetter).reset(ctx)
   643  	// Ignore the first row of the distincter in the first pass since we already
   644  	// know that we are in the same group and, thus, the row is not distinct,
   645  	// regardless of what the distincter outputs.
   646  	loopStartIndex := 1
   647  	var sel []int
   648  	for !isBufferedGroupComplete {
   649  		// Note that we're not resetting the distincter on every loop iteration
   650  		// because if we're doing the second, third, etc, iteration, then all the
   651  		// previous iterations had only the matching tuples to the buffered group,
   652  		// so the distincter - in a sense - compares the incoming tuples to the
   653  		// first tuple of the first iteration (which we know is the same group).
   654  		input.distincterInput.batch = batch
   655  		input.distincter.Next(ctx)
   656  
   657  		sel = batch.Selection()
   658  		var groupLength int
   659  		if sel != nil {
   660  			for groupLength = loopStartIndex; groupLength < batchLength; groupLength++ {
   661  				if input.distinctOutput[sel[groupLength]] {
   662  					// We found the beginning of a new group!
   663  					isBufferedGroupComplete = true
   664  					break
   665  				}
   666  			}
   667  		} else {
   668  			for groupLength = loopStartIndex; groupLength < batchLength; groupLength++ {
   669  				if input.distinctOutput[groupLength] {
   670  					// We found the beginning of a new group!
   671  					isBufferedGroupComplete = true
   672  					break
   673  				}
   674  			}
   675  		}
   676  
   677  		// Zero out the distinct output for the next pass.
   678  		copy(input.distinctOutput[:batchLength], zeroBoolColumn)
   679  		loopStartIndex = 0
   680  
   681  		// Buffer all the tuples that are part of the buffered group.
   682  		o.appendToBufferedGroup(ctx, input, batch, sel, rowIdx, groupLength)
   683  		rowIdx += groupLength
   684  
   685  		if !isBufferedGroupComplete {
   686  			// The buffered group is still not complete which means that we have
   687  			// just appended all the tuples from batch to it, so we need to get a
   688  			// fresh batch from the input.
   689  			rowIdx, batch = 0, input.source.Next(ctx)
   690  			batchLength = batch.Length()
   691  			if batchLength == 0 {
   692  				// The input has been exhausted, so the buffered group is now complete.
   693  				isBufferedGroupComplete = true
   694  			}
   695  		}
   696  	}
   697  
   698  	return batch, rowIdx, batchLength
   699  }
   700  
   701  // finishProbe completes the buffered groups on both sides of the input.
   702  func (o *mergeJoinBase) finishProbe(ctx context.Context) {
   703  	o.proberState.lBatch, o.proberState.lIdx, o.proberState.lLength = o.completeBufferedGroup(
   704  		ctx,
   705  		&o.left,
   706  		o.proberState.lBatch,
   707  		o.proberState.lIdx,
   708  	)
   709  	o.proberState.rBatch, o.proberState.rIdx, o.proberState.rLength = o.completeBufferedGroup(
   710  		ctx,
   711  		&o.right,
   712  		o.proberState.rBatch,
   713  		o.proberState.rIdx,
   714  	)
   715  }
   716  
   717  func (o *mergeJoinBase) IdempotentClose(ctx context.Context) error {
   718  	o.mu.Lock()
   719  	defer o.mu.Unlock()
   720  	if !o.close() {
   721  		return nil
   722  	}
   723  	var lastErr error
   724  	for _, op := range []colexecbase.Operator{o.left.source, o.right.source} {
   725  		if c, ok := op.(IdempotentCloser); ok {
   726  			if err := c.IdempotentClose(ctx); err != nil {
   727  				lastErr = err
   728  			}
   729  		}
   730  	}
   731  	if err := o.proberState.lBufferedGroup.close(ctx); err != nil {
   732  		lastErr = err
   733  	}
   734  	if err := o.proberState.rBufferedGroup.close(ctx); err != nil {
   735  		lastErr = err
   736  	}
   737  	return lastErr
   738  }