github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/hashjoiner.go

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colexec
    12  
    13  import (
    14  	"context"
    15  
    16  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    17  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
    18  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    22  	"github.com/cockroachdb/errors"
    23  )
    24  
    25  // hashJoinerState represents the state of the hash join columnar operator.
    26  type hashJoinerState int
    27  
    28  const (
    29  	// hjBuilding represents the state the hashJoiner is in when it is in the
    30  	// build phase. Output columns from the build table are stored and a hash
    31  	// map is constructed from its equality columns.
     32  	hjBuilding hashJoinerState = iota
    33  
    34  	// hjProbing represents the state the hashJoiner is in when it is in the
    35  	// probe phase. Probing is done in batches against the stored hash map.
    36  	hjProbing
    37  
    38  	// hjEmittingUnmatched represents the state the hashJoiner is in when it is
    39  	// emitting unmatched rows from its build table after having consumed the
    40  	// probe table. This happens in the case of an outer join on the build side.
    41  	hjEmittingUnmatched
    42  
    43  	// hjDone represents the state the hashJoiner is in when it has finished
    44  	// emitting all output rows. Note that the build side will have been fully
     45  // consumed in this state, but the probe side *might* not have been fully
    46  	// consumed.
    47  	hjDone
    48  )
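
// The states above are traversed by Next() roughly as
//
//   hjBuilding -> hjProbing -> hjEmittingUnmatched -> hjDone
//
// where hjEmittingUnmatched is entered only when spec.right.outer is true and
// is skipped otherwise (a simplified sketch; see Next() for the exact
// transitions).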
    49  
    50  // hashJoinerSpec is the specification for a hash join operator. The hash
     51  // joiner performs a join on the left and right's equality columns and returns
    52  // combined left and right output columns.
    53  type hashJoinerSpec struct {
    54  	joinType sqlbase.JoinType
    55  	// left and right are the specifications of the two input table sources to
    56  	// the hash joiner.
    57  	left  hashJoinerSourceSpec
    58  	right hashJoinerSourceSpec
    59  
    60  	// rightDistinct indicates whether or not the build table equality column
    61  	// tuples are distinct. If they are distinct, performance can be optimized.
    62  	rightDistinct bool
    63  }
    64  
    65  type hashJoinerSourceSpec struct {
     66  	// eqCols specifies the indices of the source table's equality columns used
     67  	// during the hash join.
    68  	eqCols []uint32
    69  
    70  	// sourceTypes specify the types of the input columns of the source table for
    71  	// the hash joiner.
    72  	sourceTypes []*types.T
    73  
    74  	// outer specifies whether an outer join is required over the input.
    75  	outer bool
    76  }
    77  
     78  // hashJoiner performs a hash join on the input tables' equality columns.
    79  // It requires that the output for every input batch in the probe phase fits
    80  // within coldata.BatchSize(), otherwise the behavior is undefined. A join is
    81  // performed and there is no guarantee on the ordering of the output columns.
    82  // The hash table will be built on the right side source, and the left side
    83  // source will be used for probing.
    84  //
    85  // Before the build phase, all equality and output columns from the build table
    86  // are collected and stored.
    87  //
    88  // In the vectorized implementation of the build phase, the following tasks are
    89  // performed:
    90  // 1. The bucket number (hash value) of each key tuple is computed and stored
    91  //    into a buckets array.
     92  // 2. The values in the buckets array are normalized to fit within the hash
     93  //    table's numBuckets (see the sketch below).
    94  // 3. The bucket-chaining hash table organization is prepared with the computed
    95  //    buckets.
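//
//    A rough illustration of steps 1 and 2 (a hedged sketch with hypothetical
//    names - the actual hashTable code differs):
//
//      for i := 0; i < nKeys; i++ {
//          buckets[i] = hash(keys[i]) % numBuckets
//      }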
    96  //
    97  // Depending on the value of the spec.rightDistinct flag, there are two
    98  // variations of the probe phase. The planner will set rightDistinct to true if
    99  // and only if the right equality columns make a distinct key.
   100  //
   101  // In the columnarized implementation of the distinct build table probe phase,
   102  // the following tasks are performed by the fastProbe function:
   103  //
   104  // 1. Compute the bucket number for each probe row's key tuple and store the
   105  //    results into the buckets array.
   106  // 2. In order to find the position of these key tuples in the hash table:
   107  // - First find the first element in the bucket's linked list for each key tuple
   108  //   and store it in the groupID array. Initialize the toCheck array with the
   109  //   full sequence of input indices (0...batchSize - 1).
   110  // - While toCheck is not empty, each element in toCheck represents a position
   111  //   of the key tuples for which the key has not yet been found in the hash
   112  //   table. Perform a multi-column equality check to see if the key columns
   113  //   match that of the build table's key columns at groupID.
   114  // - Update the differs array to store whether or not the probe's key tuple
   115  //   matched the corresponding build's key tuple.
   116  // - Select the indices that differed and store them into toCheck since they
   117  //   need to be further processed.
   118  // - For the differing tuples, find the next ID in that bucket of the hash table
   119  //   and put it into the groupID array.
   120  // 3. Now, groupID for every probe's key tuple contains the index of the
   121  //    matching build's key tuple in the hash table. Use it to project output
    122  //    columns from the hash table to build the resulting batch (sketch below).
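//
//    A condensed sketch of the lookup loop described in step 2 (illustrative
//    only; the real logic lives in the hashTable's distinctCheck and findNext
//    helpers):
//
//      for nToCheck > 0 {
//          // Prune the tuples whose key was found (or whose chain ended).
//          nToCheck = ht.distinctCheck(nToCheck, sel)
//          // Advance groupID along the bucket's next chain for the rest.
//          ht.findNext(ht.buildScratch.next, nToCheck)
//      }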
   123  //
   124  // In the columnarized implementation of the non-distinct build table probe
   125  // phase, the following tasks are performed by the probe function:
   126  //
   127  // 1. Compute the bucket number for each probe row's key tuple and store the
   128  //    results into the buckets array.
   129  // 2. In order to find the position of these key tuples in the hash table:
   130  // - First find the first element in the bucket's linked list for each key tuple
   131  //   and store it in the groupID array. Initialize the toCheck array with the
   132  //   full sequence of input indices (0...batchSize - 1).
   133  // - While toCheck is not empty, each element in toCheck represents a position
   134  //   of the key tuples for which the key has not yet been visited by any prior
   135  //   probe. Perform a multi-column equality check to see if the key columns
   136  //   match that of the build table's key columns at groupID.
   137  // - Update the differs array to store whether or not the probe's key tuple
   138  //   matched the corresponding build's key tuple.
   139  // - For the indices that did not differ, we can lazily update the hashTable's
   140  //   same linked list to store a list of all identical keys starting at head.
   141  //   Once a key has been added to ht.same, ht.visited is set to true. For the
   142  //   indices that have never been visited, we want to continue checking this
   143  //   bucket for identical values by adding this key to toCheck.
   144  // - Select the indices that differed and store them into toCheck since they
   145  //   need to be further processed.
   146  // - For the differing tuples, find the next ID in that bucket of the hash table
   147  //   and put it into the groupID array.
   148  // 3. Now, head stores the keyID of the first match in the build table for every
   149  //    probe table key. ht.same is used to select all build key matches for each
    150  //    probe key, which are added to the resulting batch (sketched below). Output
    151  //    batching is done to ensure that each batch is at most coldata.BatchSize().
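//
//    Conceptually, collecting all matches for a single probe tuple walks the
//    same chain (a hedged sketch with hypothetical names):
//
//      for keyID := headID[i]; keyID != 0; keyID = same[keyID] {
//          // emit the pair (probe row i, build row identified by keyID)
//      }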
   152  //
   153  // In the case that an outer join on the probe table side is performed, every
   154  // single probe row is kept even if its groupID is 0. If a groupID of 0 is
   155  // found, this means that the matching build table row should be all NULL. This
   156  // is done by setting probeRowUnmatched at that row to true.
   157  //
   158  // In the case that an outer join on the build table side is performed, an
   159  // emitUnmatched is performed after the probing ends. This is done by gathering
    160  // all build table rows that have never been matched and stitching them together
   161  // with NULL values on the probe side.
   162  type hashJoiner struct {
   163  	twoInputNode
   164  
   165  	allocator *colmem.Allocator
   166  	// spec holds the specification for the current hash join process.
   167  	spec hashJoinerSpec
   168  	// state stores the current state of the hash joiner.
   169  	state hashJoinerState
   170  	// ht holds the hashTable that is populated during the build phase and used
   171  	// during the probe phase.
   172  	ht *hashTable
   173  	// output stores the resulting output batch that is constructed and returned
   174  	// for every input batch during the probe phase.
   175  	output coldata.Batch
    176  	// outputBatchSize specifies the desired length of the output batch, which by
   177  	// default is coldata.BatchSize() but can be varied in tests.
   178  	outputBatchSize int
   179  
   180  	// probeState is used in hjProbing state.
   181  	probeState struct {
    182  		// buildIdx and probeIdx represent the matching row indices that are used to
   183  		// stitch together the join results.
   184  		buildIdx []int
   185  		probeIdx []int
   186  
    187  		// probeRowUnmatched is used in the case that spec.left.outer is true.
    188  		// This means that an outer join is performed on the probe side and we use
    189  		// probeRowUnmatched to indicate that the resulting columns from the build
    190  		// table should be NULL because the probe table row did not match any
    191  		// build table rows.
   192  		probeRowUnmatched []bool
    193  		// buildRowMatched is used in the case that spec.right.outer is true. This
   194  		// means that an outer join is performed on the build side and buildRowMatched
   195  		// marks all the build table rows that have been matched already. The rows
   196  		// that were unmatched are emitted during the emitUnmatched phase.
   197  		buildRowMatched []bool
   198  
   199  		// prevBatch, if not nil, indicates that the previous probe input batch has
   200  		// not been fully processed.
   201  		prevBatch coldata.Batch
   202  		// prevBatchResumeIdx indicates the index of the probe row to resume the
    203  		// collection from. It is used only in the case of a non-distinct build source
   204  		// (every probe row can have multiple matching build rows).
   205  		prevBatchResumeIdx int
   206  	}
   207  
   208  	// emittingUnmatchedState is used in hjEmittingUnmatched state.
   209  	emittingUnmatchedState struct {
   210  		rowIdx int
   211  	}
   212  
   213  	exportBufferedState struct {
   214  		rightExported      int
   215  		rightWindowedBatch coldata.Batch
   216  	}
   217  }
   218  
   219  var _ bufferingInMemoryOperator = &hashJoiner{}
   220  var _ resetter = &hashJoiner{}
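
// A rough usage sketch (assuming an allocator, input operators, and column
// metadata are already set up; error handling omitted):
//
//   spec, _ := makeHashJoinerSpec(
//       sqlbase.InnerJoin, leftEqCols, rightEqCols, leftTypes, rightTypes,
//       false /* rightDistinct */,
//   )
//   hj := newHashJoiner(allocator, spec, leftSource, rightSource)
//   hj.Init()
//   for b := hj.Next(ctx); b.Length() > 0; b = hj.Next(ctx) {
//       // consume b
//   }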
   221  
   222  func (hj *hashJoiner) Init() {
   223  	hj.inputOne.Init()
   224  	hj.inputTwo.Init()
   225  
   226  	allowNullEquality, probeMode := false, hashTableDefaultProbeMode
   227  	if hj.spec.joinType.IsSetOpJoin() {
   228  		allowNullEquality = true
   229  		probeMode = hashTableDeletingProbeMode
   230  	}
   231  	hj.ht = newHashTable(
   232  		hj.allocator,
   233  		hashTableNumBuckets,
   234  		hj.spec.right.sourceTypes,
   235  		hj.spec.right.eqCols,
   236  		allowNullEquality,
   237  		hashTableFullBuildMode,
   238  		probeMode,
   239  	)
   240  
   241  	hj.exportBufferedState.rightWindowedBatch = hj.allocator.NewMemBatchWithSize(hj.spec.right.sourceTypes, 0 /* size */)
   242  	hj.state = hjBuilding
   243  }
   244  
   245  func (hj *hashJoiner) Next(ctx context.Context) coldata.Batch {
   246  	hj.resetOutput()
   247  	for {
   248  		switch hj.state {
   249  		case hjBuilding:
   250  			hj.build(ctx)
   251  			if hj.ht.vals.Length() == 0 {
   252  				// The build side is empty, so we can short-circuit probing
   253  				// phase altogether for INNER, RIGHT OUTER, LEFT SEMI, and
   254  				// INTERSECT ALL joins.
   255  				if hj.spec.joinType == sqlbase.InnerJoin ||
   256  					hj.spec.joinType == sqlbase.RightOuterJoin ||
   257  					hj.spec.joinType == sqlbase.LeftSemiJoin ||
   258  					hj.spec.joinType == sqlbase.IntersectAllJoin {
   259  					// The short-circuiting behavior is temporarily disabled
   260  					// because it causes flakiness of some tests due to #48785
   261  					// (concurrent calls to DrainMeta and Next).
   262  					// TODO(asubiotto): remove this once the issue is resolved.
   263  					// hj.state = hjDone
   264  					continue
   265  				}
   266  			}
   267  			continue
   268  		case hjProbing:
   269  			hj.exec(ctx)
   270  
   271  			if hj.output.Length() == 0 {
   272  				if hj.spec.right.outer {
   273  					hj.state = hjEmittingUnmatched
   274  				} else {
   275  					hj.state = hjDone
   276  				}
   277  				continue
   278  			}
   279  			return hj.output
   280  		case hjEmittingUnmatched:
   281  			if hj.emittingUnmatchedState.rowIdx == hj.ht.vals.Length() {
   282  				hj.state = hjDone
   283  				continue
   284  			}
   285  			hj.emitUnmatched()
   286  			return hj.output
   287  		case hjDone:
   288  			return coldata.ZeroBatch
   289  		default:
   290  			colexecerror.InternalError("hash joiner in unhandled state")
   291  			// This code is unreachable, but the compiler cannot infer that.
   292  			return nil
   293  		}
   294  	}
   295  }
   296  
   297  func (hj *hashJoiner) build(ctx context.Context) {
   298  	hj.ht.build(ctx, hj.inputTwo)
   299  
   300  	if !hj.spec.rightDistinct {
   301  		hj.ht.maybeAllocateSameAndVisited()
   302  	}
   303  
   304  	if hj.spec.right.outer {
   305  		if cap(hj.probeState.buildRowMatched) < hj.ht.vals.Length() {
   306  			hj.probeState.buildRowMatched = make([]bool, hj.ht.vals.Length())
   307  		} else {
   308  			hj.probeState.buildRowMatched = hj.probeState.buildRowMatched[:hj.ht.vals.Length()]
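			// Zero out the reused slice by copying from zeroBoolColumn in chunks;
			// copy returns the number of elements copied, which advances n until
			// the whole slice has been reset.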
   309  			for n := 0; n < hj.ht.vals.Length(); n += copy(hj.probeState.buildRowMatched[n:], zeroBoolColumn) {
   310  			}
   311  		}
   312  	}
   313  
   314  	hj.state = hjProbing
   315  }
   316  
   317  // emitUnmatched populates the output batch to emit tuples from the build side
   318  // that didn't get a match. This will be called only for RIGHT OUTER and FULL
   319  // OUTER joins.
   320  func (hj *hashJoiner) emitUnmatched() {
   321  	// Set all elements in the probe columns of the output batch to null.
   322  	for i := range hj.spec.left.sourceTypes {
   323  		outCol := hj.output.ColVec(i)
   324  		outCol.Nulls().SetNulls()
   325  	}
   326  
   327  	nResults := 0
   328  
   329  	for nResults < hj.outputBatchSize && hj.emittingUnmatchedState.rowIdx < hj.ht.vals.Length() {
   330  		if !hj.probeState.buildRowMatched[hj.emittingUnmatchedState.rowIdx] {
   331  			hj.probeState.buildIdx[nResults] = hj.emittingUnmatchedState.rowIdx
   332  			nResults++
   333  		}
   334  		hj.emittingUnmatchedState.rowIdx++
   335  	}
   336  
   337  	outCols := hj.output.ColVecs()[len(hj.spec.left.sourceTypes) : len(hj.spec.left.sourceTypes)+len(hj.spec.right.sourceTypes)]
   338  	for i := range hj.spec.right.sourceTypes {
   339  		outCol := outCols[i]
   340  		valCol := hj.ht.vals.ColVec(i)
    341  		// NOTE: this Copy is not accounted for because we don't want a memory
    342  		// limit error to occur at this point - we have already built the hash
    343  		// table and are now only consuming the left source one batch at a time,
    344  		// so such behavior should be a minor deviation from the limit. If we were
    345  		// to hit the limit here, it would be very hard to fall back to the
    346  		// disk-backed hash joiner because we might have already emitted partial
    347  		// output. This behavior is acceptable - we have already allocated the
    348  		// hj.output batch, so the concern here is only for the variable-sized
    349  		// types that exceed our estimations.
   350  		outCol.Copy(
   351  			coldata.CopySliceArgs{
   352  				SliceArgs: coldata.SliceArgs{
   353  					Src:       valCol,
   354  					SrcEndIdx: nResults,
   355  					Sel:       hj.probeState.buildIdx,
   356  				},
   357  			},
   358  		)
   359  	}
   360  
   361  	hj.output.SetLength(nResults)
   362  }
   363  
   364  // exec is a general prober that works with non-distinct build table equality
   365  // columns. It returns a Batch with N + M columns where N is the number of
   366  // left source columns and M is the number of right source columns. The first N
   367  // columns correspond to the respective left source columns, followed by the
   368  // right source columns as the last M elements. Even though all the columns are
   369  // present in the result, only the specified output columns store relevant
   370  // information. The remaining columns are there as dummy columns and their
   371  // states are undefined.
   372  //
    373  // When spec.rightDistinct is true (i.e. the build table equality columns are
    374  // distinct), exec performs the same operation while taking a shortcut to
    375  // improve speed.
   376  func (hj *hashJoiner) exec(ctx context.Context) {
   377  	hj.output.SetLength(0)
   378  
   379  	if batch := hj.probeState.prevBatch; batch != nil {
   380  		// The previous result was bigger than the maximum batch size, so we didn't
   381  		// finish outputting it in the last call to probe. Continue outputting the
   382  		// result from the previous batch.
   383  		hj.probeState.prevBatch = nil
   384  		batchSize := batch.Length()
   385  		sel := batch.Selection()
   386  
   387  		nResults := hj.collect(batch, batchSize, sel)
   388  		hj.congregate(nResults, batch, batchSize)
   389  	} else {
   390  		for {
   391  			batch := hj.inputOne.Next(ctx)
   392  			batchSize := batch.Length()
   393  
   394  			if batchSize == 0 {
   395  				break
   396  			}
   397  
   398  			for i, colIdx := range hj.spec.left.eqCols {
   399  				hj.ht.probeScratch.keys[i] = batch.ColVec(int(colIdx))
   400  			}
   401  
   402  			sel := batch.Selection()
   403  
   404  			var nToCheck uint64
   405  			switch hj.spec.joinType {
   406  			case sqlbase.LeftAntiJoin, sqlbase.ExceptAllJoin:
   407  				// The setup of probing for LEFT ANTI and EXCEPT ALL joins
   408  				// needs a special treatment in order to reuse the same "check"
   409  				// functions below.
   410  				//
   411  				// First, we compute the hash values for all tuples in the batch.
   412  				hj.ht.computeBuckets(
   413  					ctx, hj.ht.probeScratch.buckets, hj.ht.probeScratch.keys, batchSize, sel,
   414  				)
   415  				// Then, we iterate over all tuples to see whether there is at least
   416  				// one tuple in the hash table that has the same hash value.
   417  				for i := 0; i < batchSize; i++ {
   418  					if hj.ht.buildScratch.first[hj.ht.probeScratch.buckets[i]] != 0 {
   419  						// Non-zero "first" key indicates that there is a match of hashes
   420  						// and we need to include the current tuple to check whether it is
   421  						// an actual match.
   422  						hj.ht.probeScratch.groupID[i] = hj.ht.buildScratch.first[hj.ht.probeScratch.buckets[i]]
   423  						hj.ht.probeScratch.toCheck[nToCheck] = uint64(i)
   424  						nToCheck++
   425  					}
   426  				}
   427  				// We need to reset headID for all tuples in the batch to remove any
   428  				// leftover garbage from the previous iteration. For tuples that need
   429  				// to be checked, headID will be updated accordingly; for tuples that
   430  				// definitely don't have a match, the zero value will remain until the
    431  				// "collecting" and "congregation" step in which such tuples will be
    432  				// included in the output.
   433  				copy(hj.ht.probeScratch.headID[:batchSize], zeroUint64Column)
   434  			default:
   435  				// Initialize groupID with the initial hash buckets and toCheck with all
   436  				// applicable indices.
   437  				hj.ht.lookupInitial(ctx, batchSize, sel)
   438  				nToCheck = uint64(batchSize)
   439  			}
   440  
   441  			var nResults int
   442  
   443  			if hj.spec.rightDistinct {
   444  				for nToCheck > 0 {
   445  					// Continue searching along the hash table next chains for the corresponding
    446  					// buckets. If the key is found or the end of the next chain is reached, the key is
   447  					// removed from the toCheck array.
   448  					nToCheck = hj.ht.distinctCheck(nToCheck, sel)
   449  					hj.ht.findNext(hj.ht.buildScratch.next, nToCheck)
   450  				}
   451  
   452  				nResults = hj.distinctCollect(batch, batchSize, sel)
   453  			} else {
   454  				for nToCheck > 0 {
   455  					// Continue searching for the build table matching keys while the toCheck
   456  					// array is non-empty.
   457  					nToCheck = hj.ht.check(hj.ht.probeScratch.keys, hj.ht.keyCols, nToCheck, sel)
   458  					hj.ht.findNext(hj.ht.buildScratch.next, nToCheck)
   459  				}
   460  
   461  				// We're processing a new batch, so we'll reset the index to start
   462  				// collecting from.
   463  				hj.probeState.prevBatchResumeIdx = 0
   464  				nResults = hj.collect(batch, batchSize, sel)
   465  			}
   466  
   467  			hj.congregate(nResults, batch, batchSize)
   468  
   469  			if hj.output.Length() > 0 {
   470  				break
   471  			}
   472  		}
   473  	}
   474  }
   475  
   476  // congregate uses the probeIdx and buildIdx pairs to stitch together the
   477  // resulting join rows and add them to the output batch with the left table
   478  // columns preceding the right table columns.
   479  func (hj *hashJoiner) congregate(nResults int, batch coldata.Batch, batchSize int) {
    480  	// NOTE: Copy() calls are not accounted for because we don't want a memory
    481  	// limit error to occur at this point - we have already built the hash
    482  	// table and are now only consuming the left source one batch at a time,
    483  	// so such behavior should be a minor deviation from the limit. If we were
    484  	// to hit the limit here, it would be very hard to fall back to the
    485  	// disk-backed hash joiner because we might have already emitted partial
    486  	// output. This behavior is acceptable - we have already allocated the
    487  	// hj.output batch, so the concern here is only for the variable-sized
    488  	// types that exceed our estimations.
   489  
   490  	if hj.spec.joinType.ShouldIncludeRightColsInOutput() {
   491  		rightColOffset := len(hj.spec.left.sourceTypes)
   492  		// If the hash table is empty, then there is nothing to copy. The nulls
   493  		// will be set below.
   494  		if hj.ht.vals.Length() > 0 {
   495  			outCols := hj.output.ColVecs()[rightColOffset : rightColOffset+len(hj.spec.right.sourceTypes)]
   496  			for i := range hj.spec.right.sourceTypes {
   497  				outCol := outCols[i]
   498  				valCol := hj.ht.vals.ColVec(i)
   499  				// Note that if for some index i, probeRowUnmatched[i] is true, then
    500  				// hj.probeState.buildIdx[i] == 0 which will copy the garbage zeroth row of the
   501  				// hash table, but we will set the NULL value below.
   502  				outCol.Copy(
   503  					coldata.CopySliceArgs{
   504  						SliceArgs: coldata.SliceArgs{
   505  							Src:       valCol,
   506  							SrcEndIdx: nResults,
   507  							Sel:       hj.probeState.buildIdx,
   508  						},
   509  					},
   510  				)
   511  			}
   512  		}
   513  		if hj.spec.left.outer {
   514  			// Add in the nulls we needed to set for the outer join.
   515  			for i := range hj.spec.right.sourceTypes {
   516  				outCol := hj.output.ColVec(i + rightColOffset)
   517  				nulls := outCol.Nulls()
   518  				for i, isNull := range hj.probeState.probeRowUnmatched {
   519  					if isNull {
   520  						nulls.SetNull(i)
   521  					}
   522  				}
   523  			}
   524  		}
   525  	}
   526  
   527  	outCols := hj.output.ColVecs()[:len(hj.spec.left.sourceTypes)]
   528  	for i := range hj.spec.left.sourceTypes {
   529  		outCol := outCols[i]
   530  		valCol := batch.ColVec(i)
   531  		outCol.Copy(
   532  			coldata.CopySliceArgs{
   533  				SliceArgs: coldata.SliceArgs{
   534  					Src:       valCol,
   535  					Sel:       hj.probeState.probeIdx,
   536  					SrcEndIdx: nResults,
   537  				},
   538  			},
   539  		)
   540  	}
   541  
   542  	if hj.spec.right.outer {
   543  		// In order to determine which rows to emit for the outer join on the build
   544  		// table in the end, we need to mark the matched build table rows.
   545  		if hj.spec.left.outer {
   546  			for i := 0; i < nResults; i++ {
   547  				if !hj.probeState.probeRowUnmatched[i] {
   548  					hj.probeState.buildRowMatched[hj.probeState.buildIdx[i]] = true
   549  				}
   550  			}
   551  		} else {
   552  			for i := 0; i < nResults; i++ {
   553  				hj.probeState.buildRowMatched[hj.probeState.buildIdx[i]] = true
   554  			}
   555  		}
   556  	}
   557  
   558  	hj.output.SetLength(nResults)
   559  }
   560  
   561  func (hj *hashJoiner) ExportBuffered(input colexecbase.Operator) coldata.Batch {
   562  	if hj.inputOne == input {
   563  		// We do not buffer anything from the left source. Furthermore, the memory
    564  		// limit can only be hit during the build phase of the hash table, at which
   565  		// point we haven't requested a single batch from the left.
   566  		return coldata.ZeroBatch
   567  	} else if hj.inputTwo == input {
   568  		if hj.exportBufferedState.rightExported == hj.ht.vals.Length() {
   569  			return coldata.ZeroBatch
   570  		}
   571  		newRightExported := hj.exportBufferedState.rightExported + coldata.BatchSize()
   572  		if newRightExported > hj.ht.vals.Length() {
   573  			newRightExported = hj.ht.vals.Length()
   574  		}
   575  		startIdx, endIdx := hj.exportBufferedState.rightExported, newRightExported
   576  		b := hj.exportBufferedState.rightWindowedBatch
   577  		// We don't need to worry about selection vectors on hj.ht.vals because the
    578  		// tuples have already been selected during the build of the hash table.
   579  		for i := range hj.spec.right.sourceTypes {
   580  			window := hj.ht.vals.ColVec(i).Window(startIdx, endIdx)
   581  			b.ReplaceCol(window, i)
   582  		}
   583  		b.SetLength(endIdx - startIdx)
   584  		hj.exportBufferedState.rightExported = newRightExported
   585  		return b
   586  	} else {
   587  		colexecerror.InternalError(errors.New(
   588  			"unexpectedly ExportBuffered is called with neither left nor right inputs to hash join",
   589  		))
   590  		// This code is unreachable, but the compiler cannot infer that.
   591  		return nil
   592  	}
   593  }
   594  
   595  func (hj *hashJoiner) resetOutput() {
   596  	if hj.output == nil {
   597  		outputTypes := append([]*types.T{}, hj.spec.left.sourceTypes...)
   598  		if hj.spec.joinType.ShouldIncludeRightColsInOutput() {
   599  			outputTypes = append(outputTypes, hj.spec.right.sourceTypes...)
   600  		}
   601  		hj.output = hj.allocator.NewMemBatch(outputTypes)
   602  	} else {
   603  		hj.output.ResetInternalBatch()
   604  	}
   605  }
   606  
   607  func (hj *hashJoiner) reset(ctx context.Context) {
   608  	for _, input := range []colexecbase.Operator{hj.inputOne, hj.inputTwo} {
   609  		if r, ok := input.(resetter); ok {
   610  			r.reset(ctx)
   611  		}
   612  	}
   613  	hj.state = hjBuilding
   614  	hj.ht.reset(ctx)
   615  	copy(hj.probeState.buildIdx[:coldata.BatchSize()], zeroIntColumn)
   616  	copy(hj.probeState.probeIdx[:coldata.BatchSize()], zeroIntColumn)
   617  	if hj.spec.left.outer {
   618  		copy(hj.probeState.probeRowUnmatched[:coldata.BatchSize()], zeroBoolColumn)
   619  	}
   620  	// hj.probeState.buildRowMatched is reset after building the hash table is
   621  	// complete in build() method.
   622  	hj.emittingUnmatchedState.rowIdx = 0
   623  	hj.exportBufferedState.rightExported = 0
   624  }
   625  
    626  // makeHashJoinerSpec creates a specification for the columnar hash join
    627  // operator. leftEqCols and rightEqCols specify the equality columns of the
    628  // two sources, and leftTypes and rightTypes specify their input column types.
    629  // rightDistinct indicates whether the equality columns of the right source
    630  // form a key.
   631  func makeHashJoinerSpec(
   632  	joinType sqlbase.JoinType,
   633  	leftEqCols []uint32,
   634  	rightEqCols []uint32,
   635  	leftTypes []*types.T,
   636  	rightTypes []*types.T,
   637  	rightDistinct bool,
   638  ) (hashJoinerSpec, error) {
   639  	var (
   640  		spec                  hashJoinerSpec
   641  		leftOuter, rightOuter bool
   642  	)
   643  	switch joinType {
   644  	case sqlbase.InnerJoin:
   645  	case sqlbase.RightOuterJoin:
   646  		rightOuter = true
   647  	case sqlbase.LeftOuterJoin:
   648  		leftOuter = true
   649  	case sqlbase.FullOuterJoin:
   650  		rightOuter = true
   651  		leftOuter = true
   652  	case sqlbase.LeftSemiJoin:
   653  		// In a semi-join, we don't need to store anything but a single row per
   654  		// build row, since all we care about is whether a row on the left matches
   655  		// any row on the right.
   656  		// Note that this is *not* the case if we have an ON condition, since we'll
   657  		// also need to make sure that a row on the left passes the ON condition
   658  		// with the row on the right to emit it. However, we don't support ON
   659  		// conditions just yet. When we do, we'll have a separate case for that.
   660  		rightDistinct = true
   661  	case sqlbase.LeftAntiJoin:
   662  	case sqlbase.IntersectAllJoin:
   663  	case sqlbase.ExceptAllJoin:
   664  	default:
   665  		return spec, errors.AssertionFailedf("hash join of type %s not supported", joinType)
   666  	}
   667  
   668  	left := hashJoinerSourceSpec{
   669  		eqCols:      leftEqCols,
   670  		sourceTypes: leftTypes,
   671  		outer:       leftOuter,
   672  	}
   673  	right := hashJoinerSourceSpec{
   674  		eqCols:      rightEqCols,
   675  		sourceTypes: rightTypes,
   676  		outer:       rightOuter,
   677  	}
   678  	spec = hashJoinerSpec{
   679  		joinType:      joinType,
   680  		left:          left,
   681  		right:         right,
   682  		rightDistinct: rightDistinct,
   683  	}
   684  	return spec, nil
   685  }
   686  
   687  // newHashJoiner creates a new equality hash join operator on the left and
   688  // right input tables.
   689  func newHashJoiner(
   690  	allocator *colmem.Allocator, spec hashJoinerSpec, leftSource, rightSource colexecbase.Operator,
   691  ) colexecbase.Operator {
   692  	hj := &hashJoiner{
   693  		twoInputNode:    newTwoInputNode(leftSource, rightSource),
   694  		allocator:       allocator,
   695  		spec:            spec,
   696  		outputBatchSize: coldata.BatchSize(),
   697  	}
   698  	hj.probeState.buildIdx = make([]int, coldata.BatchSize())
   699  	hj.probeState.probeIdx = make([]int, coldata.BatchSize())
   700  	if spec.left.outer {
   701  		hj.probeState.probeRowUnmatched = make([]bool, coldata.BatchSize())
   702  	}
   703  	return hj
   704  }