github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/hashjoiner.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rowexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    18  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/rowcontainer"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    23  	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
    24  	"github.com/cockroachdb/cockroach/pkg/util/log"
    25  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    26  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    27  	"github.com/opentracing/opentracing-go"
    28  )
    29  
// hashJoinerInitialBufferSize is the default number of bytes (4 MiB) that may
// be buffered from each input during the initial buffering phase (see
// hashJoiner.initialBufferSize and the hjBuilding state). newHashJoiner only
// keeps this default when falling back to disk is disabled; when temp storage
// is enabled it overrides the budget with half of the processor's memory
// limit.
const hashJoinerInitialBufferSize = 4 * 1024 * 1024
    33  
// hashJoinerState represents the state of the processor. hashJoiner.Next
// dispatches on the current state and each state handler returns the state to
// run next.
type hashJoinerState int

const (
	// hjStateUnknown is the zero value. State handlers return it whenever they
	// move the processor to draining.
	hjStateUnknown hashJoinerState = iota
	// hjBuilding represents the state the hashJoiner is in when it is trying to
	// determine which side to store (i.e. which side is smallest).
	// At most hashJoiner.initialBufferSize bytes are used to buffer rows from
	// either side. The first input to be finished within this limit is the
	// smallest side. If both inputs still have rows once both buffers have
	// reached the limit, the hashJoiner will default to storing the right side.
	// If buffering a row hits a memory limit while temp storage is enabled, the
	// side that row was read from becomes the stored side. When a side is
	// stored, a hash map is also constructed from the equality columns to the
	// rows.
	hjBuilding
	// hjConsumingStoredSide represents the state the hashJoiner is in once the
	// stored side has been chosen. In this state the hashJoiner fully consumes
	// whatever remains of the stored side, adding the rows to the hash row
	// container, and then pre-reserves the memory needed to mark rows. This
	// state is entered even when hjBuilding has already exhausted the stored
	// side; in that case the first read simply reports the end of the input.
	hjConsumingStoredSide
	// hjReadingProbeSide represents the state the hashJoiner is in when it reads
	// rows from the input that wasn't chosen to be stored. Rows that were
	// buffered from that input during hjBuilding are processed first.
	hjReadingProbeSide
	// hjProbingRow represents the state the hashJoiner is in when it uses a row
	// read in hjReadingProbeSide to probe the stored hash map with.
	hjProbingRow
	// hjEmittingUnmatched represents the state the hashJoiner is in when it is
	// emitting unmatched rows from its stored side after having consumed the
	// other side. This only happens when executing a FULL OUTER, LEFT/RIGHT
	// OUTER and ANTI joins (depending on which side we store).
	hjEmittingUnmatched
)
    64  
    65  // hashJoiner performs a hash join. There is no guarantee on the output
    66  // ordering.
    67  type hashJoiner struct {
    68  	joinerBase
    69  
    70  	runningState hashJoinerState
    71  
    72  	diskMonitor *mon.BytesMonitor
    73  
    74  	leftSource, rightSource execinfra.RowSource
    75  
    76  	// initialBufferSize is the maximum amount of data we buffer from each stream
    77  	// as part of the initial buffering phase. Normally
    78  	// hashJoinerInitialBufferSize, can be tweaked for tests.
    79  	// TODO(yuzefovich): remove buffering stage from the hash joiner and always
    80  	// build from the right stream.
    81  	initialBufferSize int64
    82  
    83  	// We read a portion of both streams, in the hope that one is small. One of
    84  	// the containers will contain the entire "stored" stream, the other just the
    85  	// start of the other stream.
    86  	rows [2]rowcontainer.MemRowContainer
    87  
    88  	// storedSide is set by the initial buffering phase and indicates which
    89  	// stream we store fully and build the hashRowContainer from.
    90  	storedSide joinSide
    91  
    92  	// nullEquality indicates that NULL = NULL should be considered true. Used for
    93  	// INTERSECT and EXCEPT.
    94  	nullEquality bool
    95  
    96  	disableTempStorage bool
    97  	storedRows         rowcontainer.HashRowContainer
    98  
    99  	// Used by tests to force a storedSide.
   100  	forcedStoredSide *joinSide
   101  
   102  	// probingRowState is state used when hjProbingRow.
   103  	probingRowState struct {
   104  		// row is the row being probed with.
   105  		row sqlbase.EncDatumRow
   106  		// iter is an iterator over the bucket that matches row on the equality
   107  		// columns.
   108  		iter rowcontainer.RowMarkerIterator
   109  		// matched represents whether any row that matches row on equality columns
   110  		// has also passed the ON condition.
   111  		matched bool
   112  	}
   113  
   114  	// emittingUnmatchedState is used when hjEmittingUnmatched.
   115  	emittingUnmatchedState struct {
   116  		iter rowcontainer.RowIterator
   117  	}
   118  
   119  	// Context cancellation checker.
   120  	cancelChecker *sqlbase.CancelChecker
   121  }
   122  
   123  var _ execinfra.Processor = &hashJoiner{}
   124  var _ execinfra.RowSource = &hashJoiner{}
   125  var _ execinfra.OpNode = &hashJoiner{}
   126  
   127  const hashJoinerProcName = "hash joiner"
   128  
   129  // newHashJoiner creates a new hash join processor.
   130  // - disableTempStorage determines whether the hash joiner is allowed to spill
   131  // to disk. It should only be set to 'true' in tests.
   132  func newHashJoiner(
   133  	flowCtx *execinfra.FlowCtx,
   134  	processorID int32,
   135  	spec *execinfrapb.HashJoinerSpec,
   136  	leftSource execinfra.RowSource,
   137  	rightSource execinfra.RowSource,
   138  	post *execinfrapb.PostProcessSpec,
   139  	output execinfra.RowReceiver,
   140  	disableTempStorage bool,
   141  ) (*hashJoiner, error) {
   142  	h := &hashJoiner{
   143  		initialBufferSize: hashJoinerInitialBufferSize,
   144  		leftSource:        leftSource,
   145  		rightSource:       rightSource,
   146  	}
   147  
   148  	numMergedColumns := 0
   149  	if spec.MergedColumns {
   150  		numMergedColumns = len(spec.LeftEqColumns)
   151  	}
   152  	if err := h.joinerBase.init(
   153  		h,
   154  		flowCtx,
   155  		processorID,
   156  		leftSource.OutputTypes(),
   157  		rightSource.OutputTypes(),
   158  		spec.Type,
   159  		spec.OnExpr,
   160  		spec.LeftEqColumns,
   161  		spec.RightEqColumns,
   162  		uint32(numMergedColumns),
   163  		post,
   164  		output,
   165  		execinfra.ProcStateOpts{
   166  			InputsToDrain: []execinfra.RowSource{h.leftSource, h.rightSource},
   167  			TrailingMetaCallback: func(context.Context) []execinfrapb.ProducerMetadata {
   168  				h.close()
   169  				return nil
   170  			},
   171  		},
   172  	); err != nil {
   173  		return nil, err
   174  	}
   175  
   176  	ctx := h.FlowCtx.EvalCtx.Ctx()
   177  	h.disableTempStorage = disableTempStorage
   178  	if !h.disableTempStorage {
   179  		// Limit the memory use by creating a child monitor with a hard limit.
   180  		// The hashJoiner will overflow to disk if this limit is not enough.
   181  		limit := execinfra.GetWorkMemLimit(flowCtx.Cfg)
   182  		if h.FlowCtx.Cfg.TestingKnobs.ForceDiskSpill {
   183  			limit = 1
   184  		}
   185  		h.MemMonitor = execinfra.NewLimitedMonitor(ctx, flowCtx.EvalCtx.Mon, flowCtx.Cfg, "hashjoiner-limited")
   186  		h.diskMonitor = execinfra.NewMonitor(ctx, flowCtx.Cfg.DiskMonitor, "hashjoiner-disk")
   187  		// Override initialBufferSize to be half of this processor's memory
   188  		// limit. We consume up to h.initialBufferSize bytes from each input
   189  		// stream.
   190  		h.initialBufferSize = limit / 2
   191  	} else {
   192  		h.MemMonitor = execinfra.NewMonitor(ctx, flowCtx.EvalCtx.Mon, "hashjoiner-mem")
   193  	}
   194  
   195  	// If the trace is recording, instrument the hashJoiner to collect stats.
   196  	if sp := opentracing.SpanFromContext(ctx); sp != nil && tracing.IsRecording(sp) {
   197  		h.leftSource = newInputStatCollector(h.leftSource)
   198  		h.rightSource = newInputStatCollector(h.rightSource)
   199  		h.FinishTrace = h.outputStatsToTrace
   200  	}
   201  
   202  	h.rows[leftSide].InitWithMon(
   203  		nil /* ordering */, h.leftSource.OutputTypes(), h.EvalCtx, h.MemMonitor, 0, /* rowCapacity */
   204  	)
   205  	h.rows[rightSide].InitWithMon(
   206  		nil /* ordering */, h.rightSource.OutputTypes(), h.EvalCtx, h.MemMonitor, 0, /* rowCapacity */
   207  	)
   208  
   209  	if h.joinType == sqlbase.IntersectAllJoin || h.joinType == sqlbase.ExceptAllJoin {
   210  		h.nullEquality = true
   211  	}
   212  
   213  	return h, nil
   214  }
   215  
   216  // Start is part of the RowSource interface.
   217  func (h *hashJoiner) Start(ctx context.Context) context.Context {
   218  	h.leftSource.Start(ctx)
   219  	h.rightSource.Start(ctx)
   220  	ctx = h.StartInternal(ctx, hashJoinerProcName)
   221  	h.cancelChecker = sqlbase.NewCancelChecker(ctx)
   222  	h.runningState = hjBuilding
   223  	return ctx
   224  }
   225  
   226  // Next is part of the RowSource interface.
   227  func (h *hashJoiner) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   228  	for h.State == execinfra.StateRunning {
   229  		var row sqlbase.EncDatumRow
   230  		var meta *execinfrapb.ProducerMetadata
   231  		switch h.runningState {
   232  		case hjBuilding:
   233  			h.runningState, row, meta = h.build()
   234  		case hjConsumingStoredSide:
   235  			h.runningState, row, meta = h.consumeStoredSide()
   236  		case hjReadingProbeSide:
   237  			h.runningState, row, meta = h.readProbeSide()
   238  		case hjProbingRow:
   239  			h.runningState, row, meta = h.probeRow()
   240  		case hjEmittingUnmatched:
   241  			h.runningState, row, meta = h.emitUnmatched()
   242  		default:
   243  			log.Fatalf(h.Ctx, "unsupported state: %d", h.runningState)
   244  		}
   245  
   246  		if row == nil && meta == nil {
   247  			continue
   248  		}
   249  		if meta != nil {
   250  			return nil, meta
   251  		}
   252  		if outRow := h.ProcessRowHelper(row); outRow != nil {
   253  			return outRow, nil
   254  		}
   255  	}
   256  	return nil, h.DrainHelper()
   257  }
   258  
// ConsumerClosed is part of the RowSource interface. It releases the
// hashJoiner's resources by delegating to close.
func (h *hashJoiner) ConsumerClosed() {
	h.close()
}
   263  
   264  func (h *hashJoiner) build() (hashJoinerState, sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   265  	// setStoredSideTransition is a helper function that sets storedSide on the
   266  	// hashJoiner and performs initialization before a transition to
   267  	// hjConsumingStoredSide.
   268  	setStoredSideTransition := func(
   269  		side joinSide,
   270  	) (hashJoinerState, sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   271  		h.storedSide = side
   272  		if err := h.initStoredRows(); err != nil {
   273  			h.MoveToDraining(err)
   274  			return hjStateUnknown, nil, h.DrainHelper()
   275  		}
   276  		return hjConsumingStoredSide, nil, nil
   277  	}
   278  
   279  	if h.forcedStoredSide != nil {
   280  		return setStoredSideTransition(*h.forcedStoredSide)
   281  	}
   282  
   283  	for {
   284  		leftUsage := h.rows[leftSide].MemUsage()
   285  		rightUsage := h.rows[rightSide].MemUsage()
   286  
   287  		if leftUsage >= h.initialBufferSize && rightUsage >= h.initialBufferSize {
   288  			// Both sides have reached the buffer size limit. Move on to storing and
   289  			// fully consuming the right side.
   290  			log.VEventf(h.Ctx, 1, "buffer phase found no short stream with buffer size %d", h.initialBufferSize)
   291  			return setStoredSideTransition(rightSide)
   292  		}
   293  
   294  		side := rightSide
   295  		if leftUsage < rightUsage {
   296  			side = leftSide
   297  		}
   298  
   299  		row, meta, emitDirectly, err := h.receiveNext(side)
   300  		if err != nil {
   301  			h.MoveToDraining(err)
   302  			return hjStateUnknown, nil, h.DrainHelper()
   303  		} else if meta != nil {
   304  			if meta.Err != nil {
   305  				h.MoveToDraining(nil /* err */)
   306  				return hjStateUnknown, nil, meta
   307  			}
   308  			return hjBuilding, nil, meta
   309  		} else if emitDirectly {
   310  			return hjBuilding, row, nil
   311  		}
   312  
   313  		if row == nil {
   314  			// This side has been fully consumed, it is the shortest side.
   315  			// If storedSide is empty, we might be able to short-circuit.
   316  			if h.rows[side].Len() == 0 &&
   317  				(h.joinType == sqlbase.InnerJoin ||
   318  					(h.joinType == sqlbase.LeftOuterJoin && side == leftSide) ||
   319  					(h.joinType == sqlbase.RightOuterJoin && side == rightSide)) {
   320  				h.MoveToDraining(nil /* err */)
   321  				return hjStateUnknown, nil, h.DrainHelper()
   322  			}
   323  			// We could skip hjConsumingStoredSide and move straight to
   324  			// hjReadingProbeSide apart from the fact that hjConsumingStoredSide
   325  			// pre-reserves mark memory. To keep the code simple and avoid
   326  			// duplication, we move to hjConsumingStoredSide.
   327  			return setStoredSideTransition(side)
   328  		}
   329  
   330  		// Add the row to the correct container.
   331  		if err := h.rows[side].AddRow(h.Ctx, row); err != nil {
   332  			// If this error is a memory limit error, move to hjConsumingStoredSide.
   333  			h.storedSide = side
   334  			if sqlbase.IsOutOfMemoryError(err) {
   335  				if h.disableTempStorage {
   336  					err = pgerror.Wrapf(err, pgcode.OutOfMemory,
   337  						"error while attempting hashJoiner disk spill: temp storage disabled")
   338  				} else {
   339  					if err := h.initStoredRows(); err != nil {
   340  						h.MoveToDraining(err)
   341  						return hjStateUnknown, nil, h.DrainHelper()
   342  					}
   343  					addErr := h.storedRows.AddRow(h.Ctx, row)
   344  					if addErr == nil {
   345  						return hjConsumingStoredSide, nil, nil
   346  					}
   347  					err = pgerror.Wrapf(addErr, pgcode.OutOfMemory, "while spilling: %v", err)
   348  				}
   349  			}
   350  			h.MoveToDraining(err)
   351  			return hjStateUnknown, nil, h.DrainHelper()
   352  		}
   353  	}
   354  }
   355  
   356  // consumeStoredSide fully consumes the stored side and adds the rows to
   357  // h.storedRows. It assumes that h.storedRows has been initialized through
   358  // h.initStoredRows().
   359  func (h *hashJoiner) consumeStoredSide() (
   360  	hashJoinerState,
   361  	sqlbase.EncDatumRow,
   362  	*execinfrapb.ProducerMetadata,
   363  ) {
   364  	side := h.storedSide
   365  	for {
   366  		row, meta, emitDirectly, err := h.receiveNext(side)
   367  		if err != nil {
   368  			h.MoveToDraining(err)
   369  			return hjStateUnknown, nil, h.DrainHelper()
   370  		} else if meta != nil {
   371  			if meta.Err != nil {
   372  				h.MoveToDraining(nil /* err */)
   373  				return hjStateUnknown, nil, meta
   374  			}
   375  			return hjConsumingStoredSide, nil, meta
   376  		} else if emitDirectly {
   377  			return hjConsumingStoredSide, row, nil
   378  		}
   379  
   380  		if row == nil {
   381  			// The stored side has been fully consumed, move on to hjReadingProbeSide.
   382  			// If storedRows is in-memory, pre-reserve the memory needed to mark.
   383  			if rc, ok := h.storedRows.(*rowcontainer.HashMemRowContainer); ok {
   384  				// h.storedRows is hashMemRowContainer and not a disk backed one, so
   385  				// h.disableTempStorage is true and we cannot spill to disk, so we simply
   386  				// will return an error if it occurs.
   387  				err = rc.ReserveMarkMemoryMaybe(h.Ctx)
   388  			} else if hdbrc, ok := h.storedRows.(*rowcontainer.HashDiskBackedRowContainer); ok {
   389  				err = hdbrc.ReserveMarkMemoryMaybe(h.Ctx)
   390  			} else {
   391  				panic("unexpected type of storedRows in hashJoiner")
   392  			}
   393  			if err != nil {
   394  				h.MoveToDraining(err)
   395  				return hjStateUnknown, nil, h.DrainHelper()
   396  			}
   397  			return hjReadingProbeSide, nil, nil
   398  		}
   399  
   400  		err = h.storedRows.AddRow(h.Ctx, row)
   401  		// Regardless of the underlying row container (disk backed or in-memory
   402  		// only), we cannot do anything about an error if it occurs.
   403  		if err != nil {
   404  			h.MoveToDraining(err)
   405  			return hjStateUnknown, nil, h.DrainHelper()
   406  		}
   407  	}
   408  }
   409  
   410  func (h *hashJoiner) readProbeSide() (
   411  	hashJoinerState,
   412  	sqlbase.EncDatumRow,
   413  	*execinfrapb.ProducerMetadata,
   414  ) {
   415  	side := otherSide(h.storedSide)
   416  
   417  	var row sqlbase.EncDatumRow
   418  	// First process the rows that were already buffered.
   419  	if h.rows[side].Len() > 0 {
   420  		row = h.rows[side].EncRow(0)
   421  		h.rows[side].PopFirst()
   422  	} else {
   423  		var meta *execinfrapb.ProducerMetadata
   424  		var emitDirectly bool
   425  		var err error
   426  		row, meta, emitDirectly, err = h.receiveNext(side)
   427  		if err != nil {
   428  			h.MoveToDraining(err)
   429  			return hjStateUnknown, nil, h.DrainHelper()
   430  		} else if meta != nil {
   431  			if meta.Err != nil {
   432  				h.MoveToDraining(nil /* err */)
   433  				return hjStateUnknown, nil, meta
   434  			}
   435  			return hjReadingProbeSide, nil, meta
   436  		} else if emitDirectly {
   437  			return hjReadingProbeSide, row, nil
   438  		}
   439  
   440  		if row == nil {
   441  			// The probe side has been fully consumed. Move on to hjEmittingUnmatched
   442  			// if unmatched rows on the stored side need to be emitted, otherwise
   443  			// finish.
   444  			if shouldEmitUnmatchedRow(h.storedSide, h.joinType) {
   445  				i := h.storedRows.NewUnmarkedIterator(h.Ctx)
   446  				i.Rewind()
   447  				h.emittingUnmatchedState.iter = i
   448  				return hjEmittingUnmatched, nil, nil
   449  			}
   450  			h.MoveToDraining(nil /* err */)
   451  			return hjStateUnknown, nil, h.DrainHelper()
   452  		}
   453  	}
   454  
   455  	// Probe with this row. Get the iterator over the matching bucket ready for
   456  	// hjProbingRow.
   457  	h.probingRowState.row = row
   458  	h.probingRowState.matched = false
   459  	if h.probingRowState.iter == nil {
   460  		i, err := h.storedRows.NewBucketIterator(h.Ctx, row, h.eqCols[side])
   461  		if err != nil {
   462  			h.MoveToDraining(err)
   463  			return hjStateUnknown, nil, h.DrainHelper()
   464  		}
   465  		h.probingRowState.iter = i
   466  	} else {
   467  		if err := h.probingRowState.iter.Reset(h.Ctx, row); err != nil {
   468  			h.MoveToDraining(err)
   469  			return hjStateUnknown, nil, h.DrainHelper()
   470  		}
   471  	}
   472  	h.probingRowState.iter.Rewind()
   473  	return hjProbingRow, nil, nil
   474  }
   475  
   476  func (h *hashJoiner) probeRow() (
   477  	hashJoinerState,
   478  	sqlbase.EncDatumRow,
   479  	*execinfrapb.ProducerMetadata,
   480  ) {
   481  	i := h.probingRowState.iter
   482  	if ok, err := i.Valid(); err != nil {
   483  		h.MoveToDraining(err)
   484  		return hjStateUnknown, nil, h.DrainHelper()
   485  	} else if !ok {
   486  		// In this case we have reached the end of the matching bucket. Check if any
   487  		// rows passed the ON condition. If they did, move back to
   488  		// hjReadingProbeSide to get the next probe row.
   489  		if h.probingRowState.matched {
   490  			return hjReadingProbeSide, nil, nil
   491  		}
   492  		// If not, this probe row is unmatched. Check if it needs to be emitted.
   493  		if renderedRow, shouldEmit := h.shouldEmitUnmatched(
   494  			h.probingRowState.row, otherSide(h.storedSide),
   495  		); shouldEmit {
   496  			return hjReadingProbeSide, renderedRow, nil
   497  		}
   498  		return hjReadingProbeSide, nil, nil
   499  	}
   500  
   501  	if err := h.cancelChecker.Check(); err != nil {
   502  		h.MoveToDraining(err)
   503  		return hjStateUnknown, nil, h.DrainHelper()
   504  	}
   505  
   506  	row := h.probingRowState.row
   507  	otherRow, err := i.Row()
   508  	if err != nil {
   509  		h.MoveToDraining(err)
   510  		return hjStateUnknown, nil, h.DrainHelper()
   511  	}
   512  	defer i.Next()
   513  
   514  	var renderedRow sqlbase.EncDatumRow
   515  	if h.storedSide == rightSide {
   516  		renderedRow, err = h.render(row, otherRow)
   517  	} else {
   518  		renderedRow, err = h.render(otherRow, row)
   519  	}
   520  	if err != nil {
   521  		h.MoveToDraining(err)
   522  		return hjStateUnknown, nil, h.DrainHelper()
   523  	}
   524  
   525  	// If the ON condition failed, renderedRow is nil.
   526  	if renderedRow == nil {
   527  		return hjProbingRow, nil, nil
   528  	}
   529  
   530  	h.probingRowState.matched = true
   531  	shouldEmit := h.joinType != sqlbase.LeftAntiJoin && h.joinType != sqlbase.ExceptAllJoin
   532  	if shouldMark(h.storedSide, h.joinType) {
   533  		// Matched rows are marked on the stored side for 2 reasons.
   534  		// 1: For outer joins, anti joins, and EXCEPT ALL to iterate through
   535  		// the unmarked rows.
   536  		// 2: For semi-joins and INTERSECT ALL where the left-side is stored,
   537  		// multiple rows from the right may match to the same row on the left.
   538  		// The rows on the left should only be emitted the first time
   539  		// a right row matches it, then marked to not be emitted again.
   540  		// (Note: an alternative is to remove the entry from the stored
   541  		// side, but our containers do not support that today).
   542  		// TODO(peter): figure out a way to reduce this special casing below.
   543  		if i.IsMarked(h.Ctx) {
   544  			switch h.joinType {
   545  			case sqlbase.LeftSemiJoin:
   546  				shouldEmit = false
   547  			case sqlbase.IntersectAllJoin:
   548  				shouldEmit = false
   549  			case sqlbase.ExceptAllJoin:
   550  				// We want to mark a stored row if possible, so move on to the next
   551  				// match. Reset h.probingRowState.matched in case we don't find any more
   552  				// matches and want to emit this row.
   553  				h.probingRowState.matched = false
   554  				return hjProbingRow, nil, nil
   555  			}
   556  		} else if err := i.Mark(h.Ctx, true); err != nil {
   557  			h.MoveToDraining(err)
   558  			return hjStateUnknown, nil, h.DrainHelper()
   559  		}
   560  	}
   561  	nextState := hjProbingRow
   562  	if shouldShortCircuit(h.storedSide, h.joinType) {
   563  		nextState = hjReadingProbeSide
   564  	}
   565  	if shouldEmit {
   566  		if h.joinType == sqlbase.IntersectAllJoin {
   567  			// We found a match, so we are done with this row.
   568  			return hjReadingProbeSide, renderedRow, nil
   569  		}
   570  		return nextState, renderedRow, nil
   571  	}
   572  
   573  	return nextState, nil, nil
   574  }
   575  
   576  func (h *hashJoiner) emitUnmatched() (
   577  	hashJoinerState,
   578  	sqlbase.EncDatumRow,
   579  	*execinfrapb.ProducerMetadata,
   580  ) {
   581  	i := h.emittingUnmatchedState.iter
   582  	if ok, err := i.Valid(); err != nil {
   583  		h.MoveToDraining(err)
   584  		return hjStateUnknown, nil, h.DrainHelper()
   585  	} else if !ok {
   586  		// Done.
   587  		h.MoveToDraining(nil /* err */)
   588  		return hjStateUnknown, nil, h.DrainHelper()
   589  	}
   590  
   591  	if err := h.cancelChecker.Check(); err != nil {
   592  		h.MoveToDraining(err)
   593  		return hjStateUnknown, nil, h.DrainHelper()
   594  	}
   595  
   596  	row, err := i.Row()
   597  	if err != nil {
   598  		h.MoveToDraining(err)
   599  		return hjStateUnknown, nil, h.DrainHelper()
   600  	}
   601  	defer i.Next()
   602  
   603  	return hjEmittingUnmatched, h.renderUnmatchedRow(row, h.storedSide), nil
   604  }
   605  
   606  func (h *hashJoiner) close() {
   607  	if h.InternalClose() {
   608  		// We need to close only memRowContainer of the probe side because the
   609  		// stored side container will be closed by closing h.storedRows.
   610  		if h.storedSide == rightSide {
   611  			h.rows[leftSide].Close(h.Ctx)
   612  		} else {
   613  			h.rows[rightSide].Close(h.Ctx)
   614  		}
   615  		if h.storedRows != nil {
   616  			h.storedRows.Close(h.Ctx)
   617  		} else {
   618  			// h.storedRows has not been initialized, so we need to close the stored
   619  			// side container explicitly.
   620  			h.rows[h.storedSide].Close(h.Ctx)
   621  		}
   622  		if h.probingRowState.iter != nil {
   623  			h.probingRowState.iter.Close()
   624  		}
   625  		if h.emittingUnmatchedState.iter != nil {
   626  			h.emittingUnmatchedState.iter.Close()
   627  		}
   628  		h.MemMonitor.Stop(h.Ctx)
   629  		if h.diskMonitor != nil {
   630  			h.diskMonitor.Stop(h.Ctx)
   631  		}
   632  	}
   633  }
   634  
   635  // receiveNext reads from the source specified by side and returns the next row
   636  // or metadata to be processed by the hashJoiner. Unless h.nullEquality is true,
   637  // rows with NULLs in their equality columns are only returned if the joinType
   638  // specifies that unmatched rows should be returned for the given side. In this
   639  // case, a rendered row and true is returned, notifying the caller that the
   640  // returned row may be emitted directly.
   641  func (h *hashJoiner) receiveNext(
   642  	side joinSide,
   643  ) (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata, bool, error) {
   644  	source := h.leftSource
   645  	if side == rightSide {
   646  		source = h.rightSource
   647  	}
   648  	for {
   649  		if err := h.cancelChecker.Check(); err != nil {
   650  			return nil, nil, false, err
   651  		}
   652  		row, meta := source.Next()
   653  		if meta != nil {
   654  			return nil, meta, false, nil
   655  		} else if row == nil {
   656  			return nil, nil, false, nil
   657  		}
   658  		// We make the explicit check for whether or not the row contained a NULL value
   659  		// on an equality column. The reasoning here is because of the way we expect
   660  		// NULL equality checks to behave (i.e. NULL != NULL) and the fact that we
   661  		// use the encoding of any given row as key into our bucket. Thus if we
   662  		// encountered a NULL row when building the hashmap we have to store in
   663  		// order to use it for RIGHT OUTER joins but if we encounter another
   664  		// NULL row when going through the left stream (probing phase), matching
   665  		// this with the first NULL row would be incorrect.
   666  		//
   667  		// If we have have the following:
   668  		// CREATE TABLE t(x INT); INSERT INTO t(x) VALUES (NULL);
   669  		//    |  x   |
   670  		//     ------
   671  		//    | NULL |
   672  		//
   673  		// For the following query:
   674  		// SELECT * FROM t AS a FULL OUTER JOIN t AS b USING(x);
   675  		//
   676  		// We expect:
   677  		//    |  x   |
   678  		//     ------
   679  		//    | NULL |
   680  		//    | NULL |
   681  		//
   682  		// The following examples illustrates the behavior when joining on two
   683  		// or more columns, and only one of them contains NULL.
   684  		// If we have have the following:
   685  		// CREATE TABLE t(x INT, y INT);
   686  		// INSERT INTO t(x, y) VALUES (44,51), (NULL,52);
   687  		//    |  x   |  y   |
   688  		//     ------
   689  		//    |  44  |  51  |
   690  		//    | NULL |  52  |
   691  		//
   692  		// For the following query:
   693  		// SELECT * FROM t AS a FULL OUTER JOIN t AS b USING(x, y);
   694  		//
   695  		// We expect:
   696  		//    |  x   |  y   |
   697  		//     ------
   698  		//    |  44  |  51  |
   699  		//    | NULL |  52  |
   700  		//    | NULL |  52  |
   701  		hasNull := false
   702  		for _, c := range h.eqCols[side] {
   703  			if row[c].IsNull() {
   704  				hasNull = true
   705  				break
   706  			}
   707  		}
   708  		// row has no NULLs in its equality columns (or we are considering NULLs to
   709  		// be equal), so it might match a row from the other side.
   710  		if !hasNull || h.nullEquality {
   711  			return row, nil, false, nil
   712  		}
   713  
   714  		if renderedRow, shouldEmit := h.shouldEmitUnmatched(row, side); shouldEmit {
   715  			return renderedRow, nil, true, nil
   716  		}
   717  
   718  		// If this point is reached, row had NULLs in its equality columns but
   719  		// should not be emitted. Throw it away and get the next row.
   720  	}
   721  }
   722  
   723  // shouldEmitUnmatched returns whether this row should be emitted if it doesn't
   724  // match. If this is the case, a rendered row ready for emitting is returned as
   725  // well.
   726  func (h *hashJoiner) shouldEmitUnmatched(
   727  	row sqlbase.EncDatumRow, side joinSide,
   728  ) (sqlbase.EncDatumRow, bool) {
   729  	if !shouldEmitUnmatchedRow(side, h.joinType) {
   730  		return nil, false
   731  	}
   732  	return h.renderUnmatchedRow(row, side), true
   733  }
   734  
   735  // initStoredRows initializes a hashRowContainer and sets h.storedRows.
   736  func (h *hashJoiner) initStoredRows() error {
   737  	if !h.disableTempStorage {
   738  		hrc := rowcontainer.NewHashDiskBackedRowContainer(
   739  			&h.rows[h.storedSide],
   740  			h.EvalCtx,
   741  			h.MemMonitor,
   742  			h.diskMonitor,
   743  			h.FlowCtx.Cfg.TempStorage,
   744  		)
   745  		h.storedRows = hrc
   746  	} else {
   747  		hrc := rowcontainer.MakeHashMemRowContainer(&h.rows[h.storedSide])
   748  		h.storedRows = &hrc
   749  	}
   750  	return h.storedRows.Init(
   751  		h.Ctx,
   752  		shouldMark(h.storedSide, h.joinType),
   753  		h.rows[h.storedSide].Types(),
   754  		h.eqCols[h.storedSide],
   755  		h.nullEquality,
   756  	)
   757  }
   758  
   759  var _ execinfrapb.DistSQLSpanStats = &HashJoinerStats{}
   760  
   761  const hashJoinerTagPrefix = "hashjoiner."
   762  
   763  // Stats implements the SpanStats interface.
   764  func (hjs *HashJoinerStats) Stats() map[string]string {
   765  	// statsMap starts off as the left input stats map.
   766  	statsMap := hjs.LeftInputStats.Stats(hashJoinerTagPrefix + "left.")
   767  	rightInputStatsMap := hjs.RightInputStats.Stats(hashJoinerTagPrefix + "right.")
   768  	// Merge the two input maps.
   769  	for k, v := range rightInputStatsMap {
   770  		statsMap[k] = v
   771  	}
   772  	statsMap[hashJoinerTagPrefix+"stored_side"] = hjs.StoredSide
   773  	statsMap[hashJoinerTagPrefix+MaxMemoryTagSuffix] = humanizeutil.IBytes(hjs.MaxAllocatedMem)
   774  	statsMap[hashJoinerTagPrefix+MaxDiskTagSuffix] = humanizeutil.IBytes(hjs.MaxAllocatedDisk)
   775  	return statsMap
   776  }
   777  
   778  // StatsForQueryPlan implements the DistSQLSpanStats interface.
   779  func (hjs *HashJoinerStats) StatsForQueryPlan() []string {
   780  	stats := hjs.LeftInputStats.StatsForQueryPlan("left ")
   781  	stats = append(stats, hjs.RightInputStats.StatsForQueryPlan("right ")...)
   782  	stats = append(stats, fmt.Sprintf("stored side: %s", hjs.StoredSide))
   783  
   784  	if hjs.MaxAllocatedMem != 0 {
   785  		stats = append(stats,
   786  			fmt.Sprintf("%s: %s", MaxMemoryQueryPlanSuffix, humanizeutil.IBytes(hjs.MaxAllocatedMem)))
   787  	}
   788  
   789  	if hjs.MaxAllocatedDisk != 0 {
   790  		stats = append(stats,
   791  			fmt.Sprintf("%s: %s", MaxDiskQueryPlanSuffix, humanizeutil.IBytes(hjs.MaxAllocatedDisk)))
   792  	}
   793  
   794  	return stats
   795  }
   796  
   797  // outputStatsToTrace outputs the collected hashJoiner stats to the trace. Will
   798  // fail silently if the hashJoiner is not collecting stats.
   799  func (h *hashJoiner) outputStatsToTrace() {
   800  	lis, ok := getInputStats(h.FlowCtx, h.leftSource)
   801  	if !ok {
   802  		return
   803  	}
   804  	ris, ok := getInputStats(h.FlowCtx, h.rightSource)
   805  	if !ok {
   806  		return
   807  	}
   808  	if sp := opentracing.SpanFromContext(h.Ctx); sp != nil {
   809  		tracing.SetSpanStats(
   810  			sp,
   811  			&HashJoinerStats{
   812  				LeftInputStats:   lis,
   813  				RightInputStats:  ris,
   814  				StoredSide:       h.storedSide.String(),
   815  				MaxAllocatedMem:  h.MemMonitor.MaximumBytes(),
   816  				MaxAllocatedDisk: h.diskMonitor.MaximumBytes(),
   817  			},
   818  		)
   819  	}
   820  }
   821  
   822  // Some types of joins need to mark rows that matched.
   823  func shouldMark(storedSide joinSide, joinType sqlbase.JoinType) bool {
   824  	switch {
   825  	case joinType == sqlbase.LeftSemiJoin && storedSide == leftSide:
   826  		return true
   827  	case joinType == sqlbase.LeftAntiJoin && storedSide == leftSide:
   828  		return true
   829  	case joinType == sqlbase.ExceptAllJoin:
   830  		return true
   831  	case joinType == sqlbase.IntersectAllJoin:
   832  		return true
   833  	case shouldEmitUnmatchedRow(storedSide, joinType):
   834  		return true
   835  	default:
   836  		return false
   837  	}
   838  }
   839  
   840  // Some types of joins only need to know of the existence of a matching row in
   841  // the storedSide, depending on the storedSide, and don't need to know all the
   842  // rows. These can 'short circuit' to avoid iterating through them all.
   843  func shouldShortCircuit(storedSide joinSide, joinType sqlbase.JoinType) bool {
   844  	switch joinType {
   845  	case sqlbase.LeftSemiJoin:
   846  		return storedSide == rightSide
   847  	case sqlbase.ExceptAllJoin:
   848  		return true
   849  	default:
   850  		return false
   851  	}
   852  }
   853  
   854  // ChildCount is part of the execinfra.OpNode interface.
   855  func (h *hashJoiner) ChildCount(verbose bool) int {
   856  	if _, ok := h.leftSource.(execinfra.OpNode); ok {
   857  		if _, ok := h.rightSource.(execinfra.OpNode); ok {
   858  			return 2
   859  		}
   860  	}
   861  	return 0
   862  }
   863  
   864  // Child is part of the execinfra.OpNode interface.
   865  func (h *hashJoiner) Child(nth int, verbose bool) execinfra.OpNode {
   866  	switch nth {
   867  	case 0:
   868  		if n, ok := h.leftSource.(execinfra.OpNode); ok {
   869  			return n
   870  		}
   871  		panic("left input to hashJoiner is not an execinfra.OpNode")
   872  	case 1:
   873  		if n, ok := h.rightSource.(execinfra.OpNode); ok {
   874  			return n
   875  		}
   876  		panic("right input to hashJoiner is not an execinfra.OpNode")
   877  	default:
   878  		panic(fmt.Sprintf("invalid index %d", nth))
   879  	}
   880  }