github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/joinreader.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rowexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"sort"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/row"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/rowcontainer"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/scrub"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/span"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    26  	"github.com/cockroachdb/cockroach/pkg/util"
    27  	"github.com/cockroachdb/cockroach/pkg/util/log"
    28  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    29  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    30  	"github.com/cockroachdb/errors"
    31  	"github.com/opentracing/opentracing-go"
    32  )
    33  
// joinReaderState identifies the phase of the joinReader's internal state
// machine. Next dispatches on it for as long as the processor is running.
type joinReaderState int

const (
	// jrStateUnknown is the zero value. The state handlers (readInput,
	// performLookup, emitRow) return it whenever they move the processor to
	// draining; Next treats it as an unsupported state if it is ever dispatched
	// on while still running.
	jrStateUnknown joinReaderState = iota
	// jrReadingInput means that a batch of rows is being read from the input.
	jrReadingInput
	// jrPerformingLookup means we are performing an index lookup for the current
	// input row batch.
	jrPerformingLookup
	// jrEmittingRows means we are emitting the results of the index lookup.
	jrEmittingRows
)
    47  
// joinReader performs a lookup join between `input` and the specified `index`.
// `lookupCols` specifies the input columns which will be used for the index
// lookup.
type joinReader struct {
	joinerBase

	// strategy is installed by initJoinReaderStrategy. It turns a batch of input
	// rows into lookup spans (processLookupRows), consumes the rows returned by
	// the fetcher (processLookedUpRow) and hands back the rows to emit
	// (nextRowToEmit).
	strategy joinReaderStrategy

	// runningState represents the state of the joinReader. This is in addition to
	// ProcessorBase.State - the runningState is only relevant when
	// ProcessorBase.State == StateRunning.
	runningState joinReaderState

	// diskMonitor is only created by initJoinReaderStrategy when ordering is
	// maintained; it is nil otherwise. It is stopped in close.
	diskMonitor *mon.BytesMonitor

	// desc is the descriptor of the table being looked up (spec.Table).
	desc sqlbase.TableDescriptor

	// index is the index of desc selected by spec.IndexIdx.
	index *sqlbase.IndexDescriptor

	// colIdxMap is desc.ColumnIdxMapWithMutations(...): it maps column IDs to
	// the ordinals used for the right side of the join.
	colIdxMap map[sqlbase.ColumnID]int

	// fetcher wraps the row.Fetcher used to perform lookups. This enables the
	// joinReader to wrap the fetcher with a stat collector when necessary.
	fetcher rowFetcher

	// alloc is the datum allocator handed to initRowFetcher.
	alloc sqlbase.DatumAlloc

	// rowAlloc is used by readInput to copy every input row before buffering it.
	rowAlloc sqlbase.EncDatumRowAlloc

	// shouldLimitBatches is forwarded to fetcher.StartScan. It is false when the
	// lookup columns form a key (spec.LookupColumnsAreKey).
	shouldLimitBatches bool

	// input is the left side of the join. It is wrapped in a stat collector
	// when the flow's tracing span is recording.
	input execinfra.RowSource

	// inputTypes holds input.OutputTypes() as captured at construction time.
	inputTypes []*types.T

	// Column indexes in the input stream specifying the columns which match with
	// the index columns. These are the equality columns of the join.
	lookupCols []uint32

	// Batch size for fetches. Not a constant so we can lower for testing.
	// Initialized from the strategy's getLookupRowsBatchSizeHint.
	batchSizeBytes int64

	// curBatchSizeBytes accumulates row.Size() of the rows buffered for the
	// current batch; readInput resets it once the batch has been processed.
	curBatchSizeBytes int64

	// State variables for each batch of input rows.
	// scratchInputRows buffers the copied input rows of the current batch and is
	// truncated (keeping its capacity) after the lookup spans are generated.
	scratchInputRows sqlbase.EncDatumRows
}
    86  
// Compile-time assertions that *joinReader satisfies the interfaces it is
// used through.
var _ execinfra.Processor = &joinReader{}
var _ execinfra.RowSource = &joinReader{}
var _ execinfrapb.MetadataSource = &joinReader{}
var _ execinfra.OpNode = &joinReader{}

// joinReaderProcName is the processor name passed to StartInternal in Start.
const joinReaderProcName = "join reader"
    93  
    94  // newJoinReader returns a new joinReader.
    95  func newJoinReader(
    96  	flowCtx *execinfra.FlowCtx,
    97  	processorID int32,
    98  	spec *execinfrapb.JoinReaderSpec,
    99  	input execinfra.RowSource,
   100  	post *execinfrapb.PostProcessSpec,
   101  	output execinfra.RowReceiver,
   102  ) (execinfra.RowSourcedProcessor, error) {
   103  	jr := &joinReader{
   104  		desc:       spec.Table,
   105  		input:      input,
   106  		inputTypes: input.OutputTypes(),
   107  		lookupCols: spec.LookupColumns,
   108  	}
   109  
   110  	var err error
   111  	var isSecondary bool
   112  	jr.index, isSecondary, err = jr.desc.FindIndexByIndexIdx(int(spec.IndexIdx))
   113  	if err != nil {
   114  		return nil, err
   115  	}
   116  	returnMutations := spec.Visibility == execinfra.ScanVisibilityPublicAndNotPublic
   117  	jr.colIdxMap = jr.desc.ColumnIdxMapWithMutations(returnMutations)
   118  
   119  	columnIDs, _ := jr.index.FullColumnIDs()
   120  	indexCols := make([]uint32, len(columnIDs))
   121  	columnTypes := jr.desc.ColumnTypesWithMutations(returnMutations)
   122  	for i, columnID := range columnIDs {
   123  		indexCols[i] = uint32(columnID)
   124  	}
   125  
   126  	// If the lookup columns form a key, there is only one result per lookup, so the fetcher
   127  	// should parallelize the key lookups it performs.
   128  	jr.shouldLimitBatches = !spec.LookupColumnsAreKey
   129  
   130  	if err := jr.joinerBase.init(
   131  		jr,
   132  		flowCtx,
   133  		processorID,
   134  		input.OutputTypes(),
   135  		columnTypes,
   136  		spec.Type,
   137  		spec.OnExpr,
   138  		jr.lookupCols,
   139  		indexCols,
   140  		0, /* numMergedColumns */
   141  		post,
   142  		output,
   143  		execinfra.ProcStateOpts{
   144  			InputsToDrain: []execinfra.RowSource{jr.input},
   145  			TrailingMetaCallback: func(ctx context.Context) []execinfrapb.ProducerMetadata {
   146  				jr.close()
   147  				return jr.generateMeta(ctx)
   148  			},
   149  		},
   150  	); err != nil {
   151  		return nil, err
   152  	}
   153  
   154  	collectingStats := false
   155  	if sp := opentracing.SpanFromContext(flowCtx.EvalCtx.Ctx()); sp != nil && tracing.IsRecording(sp) {
   156  		collectingStats = true
   157  	}
   158  
   159  	neededRightCols := jr.neededRightCols()
   160  	if isSecondary && !neededRightCols.SubsetOf(getIndexColSet(jr.index, jr.colIdxMap)) {
   161  		return nil, errors.Errorf("joinreader index does not cover all columns")
   162  	}
   163  
   164  	var fetcher row.Fetcher
   165  	_, _, err = initRowFetcher(
   166  		flowCtx, &fetcher, &jr.desc, int(spec.IndexIdx), jr.colIdxMap, false, /* reverse */
   167  		neededRightCols, false /* isCheck */, &jr.alloc, spec.Visibility, spec.LockingStrength,
   168  	)
   169  	if err != nil {
   170  		return nil, err
   171  	}
   172  	if collectingStats {
   173  		jr.input = newInputStatCollector(jr.input)
   174  		jr.fetcher = newRowFetcherStatCollector(&fetcher)
   175  		jr.FinishTrace = jr.outputStatsToTrace
   176  	} else {
   177  		jr.fetcher = &fetcher
   178  	}
   179  
   180  	jr.initJoinReaderStrategy(flowCtx, jr.desc.ColumnTypesWithMutations(returnMutations), len(columnIDs), spec.MaintainOrdering)
   181  	jr.batchSizeBytes = jr.strategy.getLookupRowsBatchSizeHint()
   182  
   183  	// TODO(radu): verify the input types match the index key types
   184  	return jr, nil
   185  }
   186  
   187  func (jr *joinReader) initJoinReaderStrategy(
   188  	flowCtx *execinfra.FlowCtx, typs []*types.T, numKeyCols int, maintainOrdering bool,
   189  ) {
   190  	spanBuilder := span.MakeBuilder(flowCtx.Codec(), &jr.desc, jr.index)
   191  	spanBuilder.SetNeededColumns(jr.neededRightCols())
   192  
   193  	spanGenerator := defaultSpanGenerator{
   194  		spanBuilder:          spanBuilder,
   195  		keyToInputRowIndices: make(map[string][]int),
   196  		numKeyCols:           numKeyCols,
   197  		lookupCols:           jr.lookupCols,
   198  	}
   199  	if !maintainOrdering {
   200  		jr.strategy = &joinReaderNoOrderingStrategy{
   201  			joinerBase:           &jr.joinerBase,
   202  			defaultSpanGenerator: spanGenerator,
   203  			isPartialJoin:        jr.joinType == sqlbase.LeftSemiJoin || jr.joinType == sqlbase.LeftAntiJoin,
   204  		}
   205  		return
   206  	}
   207  
   208  	ctx := flowCtx.EvalCtx.Ctx()
   209  	// Limit the memory use by creating a child monitor with a hard limit.
   210  	// joinReader will overflow to disk if this limit is not enough.
   211  	limit := execinfra.GetWorkMemLimit(flowCtx.Cfg)
   212  	if flowCtx.Cfg.TestingKnobs.ForceDiskSpill {
   213  		limit = 1
   214  	}
   215  	// Initialize memory monitors and row container for looked up rows.
   216  	jr.MemMonitor = execinfra.NewLimitedMonitor(ctx, flowCtx.EvalCtx.Mon, flowCtx.Cfg, "joiner-limited")
   217  	jr.diskMonitor = execinfra.NewMonitor(ctx, flowCtx.Cfg.DiskMonitor, "joinreader-disk")
   218  	drc := rowcontainer.NewDiskBackedNumberedRowContainer(
   219  		false, /* deDup */
   220  		typs,
   221  		jr.EvalCtx,
   222  		jr.FlowCtx.Cfg.TempStorage,
   223  		jr.MemMonitor,
   224  		jr.diskMonitor,
   225  		0, /* rowCapacity */
   226  	)
   227  	if limit < mon.DefaultPoolAllocationSize {
   228  		// The memory limit is too low for caching, most likely to force disk
   229  		// spilling for testing.
   230  		drc.DisableCache = true
   231  	}
   232  	jr.strategy = &joinReaderOrderingStrategy{
   233  		joinerBase:           &jr.joinerBase,
   234  		defaultSpanGenerator: spanGenerator,
   235  		isPartialJoin:        jr.joinType == sqlbase.LeftSemiJoin || jr.joinType == sqlbase.LeftAntiJoin,
   236  		lookedUpRows:         drc,
   237  	}
   238  }
   239  
   240  // getIndexColSet returns a set of all column indices for the given index.
   241  func getIndexColSet(
   242  	index *sqlbase.IndexDescriptor, colIdxMap map[sqlbase.ColumnID]int,
   243  ) util.FastIntSet {
   244  	cols := util.MakeFastIntSet()
   245  	err := index.RunOverAllColumns(func(id sqlbase.ColumnID) error {
   246  		cols.Add(colIdxMap[id])
   247  		return nil
   248  	})
   249  	if err != nil {
   250  		// This path should never be hit since the column function never returns an
   251  		// error.
   252  		panic(err)
   253  	}
   254  	return cols
   255  }
   256  
// SetBatchSizeBytes sets the desired batch size. It should only be used in tests.
//
// The value overrides the hint obtained from the strategy at construction
// time; readInput keeps buffering input rows while the accumulated row size
// is below it.
func (jr *joinReader) SetBatchSizeBytes(batchSize int64) {
	jr.batchSizeBytes = batchSize
}
   261  
// Spilled returns whether the joinReader spilled to disk. The answer is
// delegated to the installed strategy.
func (jr *joinReader) Spilled() bool {
	return jr.strategy.spilled()
}
   266  
   267  // neededRightCols returns the set of column indices which need to be fetched
   268  // from the right side of the join (jr.desc).
   269  func (jr *joinReader) neededRightCols() util.FastIntSet {
   270  	neededCols := jr.Out.NeededColumns()
   271  
   272  	// Get the columns from the right side of the join and shift them over by
   273  	// the size of the left side so the right side starts at 0.
   274  	neededRightCols := util.MakeFastIntSet()
   275  	for i, ok := neededCols.Next(len(jr.inputTypes)); ok; i, ok = neededCols.Next(i + 1) {
   276  		neededRightCols.Add(i - len(jr.inputTypes))
   277  	}
   278  
   279  	// Add columns needed by OnExpr.
   280  	for _, v := range jr.onCond.Vars.GetIndexedVars() {
   281  		rightIdx := v.Idx - len(jr.inputTypes)
   282  		if rightIdx >= 0 {
   283  			neededRightCols.Add(rightIdx)
   284  		}
   285  	}
   286  
   287  	return neededRightCols
   288  }
   289  
   290  // Next is part of the RowSource interface.
   291  func (jr *joinReader) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   292  	// The lookup join is implemented as follows:
   293  	// - Read the input rows in batches.
   294  	// - For each batch, map the rows onto index keys and perform an index
   295  	//   lookup for those keys. Note that multiple rows may map to the same key.
   296  	// - Retrieve the index lookup results in batches, since the index scan may
   297  	//   return more rows than the input batch size.
   298  	// - Join the index rows with the corresponding input rows and buffer the
   299  	//   results in jr.toEmit.
   300  	for jr.State == execinfra.StateRunning {
   301  		var row sqlbase.EncDatumRow
   302  		var meta *execinfrapb.ProducerMetadata
   303  		switch jr.runningState {
   304  		case jrReadingInput:
   305  			jr.runningState, meta = jr.readInput()
   306  		case jrPerformingLookup:
   307  			jr.runningState, meta = jr.performLookup()
   308  		case jrEmittingRows:
   309  			jr.runningState, row, meta = jr.emitRow()
   310  		default:
   311  			log.Fatalf(jr.Ctx, "unsupported state: %d", jr.runningState)
   312  		}
   313  		if row == nil && meta == nil {
   314  			continue
   315  		}
   316  		if meta != nil {
   317  			return nil, meta
   318  		}
   319  		if outRow := jr.ProcessRowHelper(row); outRow != nil {
   320  			return outRow, nil
   321  		}
   322  	}
   323  	return nil, jr.DrainHelper()
   324  }
   325  
   326  // readInput reads the next batch of input rows and starts an index scan.
   327  func (jr *joinReader) readInput() (joinReaderState, *execinfrapb.ProducerMetadata) {
   328  	// Read the next batch of input rows.
   329  	for jr.curBatchSizeBytes < jr.batchSizeBytes {
   330  		row, meta := jr.input.Next()
   331  		if meta != nil {
   332  			if meta.Err != nil {
   333  				jr.MoveToDraining(nil /* err */)
   334  				return jrStateUnknown, meta
   335  			}
   336  			return jrReadingInput, meta
   337  		}
   338  		if row == nil {
   339  			break
   340  		}
   341  		jr.curBatchSizeBytes += int64(row.Size())
   342  		jr.scratchInputRows = append(jr.scratchInputRows, jr.rowAlloc.CopyRow(row))
   343  	}
   344  
   345  	if len(jr.scratchInputRows) == 0 {
   346  		log.VEventf(jr.Ctx, 1, "no more input rows")
   347  		// We're done.
   348  		jr.MoveToDraining(nil)
   349  		return jrStateUnknown, jr.DrainHelper()
   350  	}
   351  	log.VEventf(jr.Ctx, 1, "read %d input rows", len(jr.scratchInputRows))
   352  
   353  	spans, err := jr.strategy.processLookupRows(jr.scratchInputRows)
   354  	if err != nil {
   355  		jr.MoveToDraining(err)
   356  		return jrStateUnknown, jr.DrainHelper()
   357  	}
   358  	jr.scratchInputRows = jr.scratchInputRows[:0]
   359  	jr.curBatchSizeBytes = 0
   360  	if len(spans) == 0 {
   361  		// All of the input rows were filtered out. Skip the index lookup.
   362  		return jrEmittingRows, nil
   363  	}
   364  	// Sort the spans so that we can rely upon the fetcher to limit the number of
   365  	// results per batch. It's safe to reorder the spans here because we already
   366  	// restore the original order of the output during the output collection
   367  	// phase.
   368  	sort.Sort(spans)
   369  	log.VEventf(jr.Ctx, 1, "scanning %d spans", len(spans))
   370  	if err := jr.fetcher.StartScan(
   371  		jr.Ctx, jr.FlowCtx.Txn, spans, jr.shouldLimitBatches, 0, /* limitHint */
   372  		jr.FlowCtx.TraceKV); err != nil {
   373  		jr.MoveToDraining(err)
   374  		return jrStateUnknown, jr.DrainHelper()
   375  	}
   376  
   377  	return jrPerformingLookup, nil
   378  }
   379  
   380  // performLookup reads the next batch of index rows.
   381  func (jr *joinReader) performLookup() (joinReaderState, *execinfrapb.ProducerMetadata) {
   382  	nCols := len(jr.lookupCols)
   383  
   384  	for {
   385  		// Construct a "partial key" of nCols, so we can match the key format that
   386  		// was stored in our keyToInputRowIndices map. This matches the format that
   387  		// is output in jr.generateSpan.
   388  		key, err := jr.fetcher.PartialKey(nCols)
   389  		if err != nil {
   390  			jr.MoveToDraining(err)
   391  			return jrStateUnknown, jr.DrainHelper()
   392  		}
   393  
   394  		// Fetch the next row and copy it into the row container.
   395  		lookedUpRow, _, _, err := jr.fetcher.NextRow(jr.Ctx)
   396  		if err != nil {
   397  			jr.MoveToDraining(scrub.UnwrapScrubError(err))
   398  			return jrStateUnknown, jr.DrainHelper()
   399  		}
   400  		if lookedUpRow == nil {
   401  			// Done with this input batch.
   402  			break
   403  		}
   404  
   405  		if nextState, err := jr.strategy.processLookedUpRow(jr.Ctx, lookedUpRow, key); err != nil {
   406  			jr.MoveToDraining(err)
   407  			return jrStateUnknown, jr.DrainHelper()
   408  		} else if nextState != jrPerformingLookup {
   409  			return nextState, nil
   410  		}
   411  	}
   412  	log.VEvent(jr.Ctx, 1, "done joining rows")
   413  	jr.strategy.prepareToEmit(jr.Ctx)
   414  
   415  	return jrEmittingRows, nil
   416  }
   417  
   418  // emitRow returns the next row from jr.toEmit, if present. Otherwise it
   419  // prepares for another input batch.
   420  func (jr *joinReader) emitRow() (
   421  	joinReaderState,
   422  	sqlbase.EncDatumRow,
   423  	*execinfrapb.ProducerMetadata,
   424  ) {
   425  	rowToEmit, nextState, err := jr.strategy.nextRowToEmit(jr.Ctx)
   426  	if err != nil {
   427  		jr.MoveToDraining(err)
   428  		return jrStateUnknown, nil, jr.DrainHelper()
   429  	}
   430  	return nextState, rowToEmit, nil
   431  }
   432  
   433  // Start is part of the RowSource interface.
   434  func (jr *joinReader) Start(ctx context.Context) context.Context {
   435  	jr.input.Start(ctx)
   436  	ctx = jr.StartInternal(ctx, joinReaderProcName)
   437  	jr.runningState = jrReadingInput
   438  	return ctx
   439  }
   440  
// ConsumerClosed is part of the RowSource interface.
//
// It releases the joinReader's resources via close.
func (jr *joinReader) ConsumerClosed() {
	// The consumer is done, Next() will not be called again.
	jr.close()
}
   446  
   447  func (jr *joinReader) close() {
   448  	if jr.InternalClose() {
   449  		jr.strategy.close(jr.Ctx)
   450  		if jr.MemMonitor != nil {
   451  			jr.MemMonitor.Stop(jr.Ctx)
   452  		}
   453  		if jr.diskMonitor != nil {
   454  			jr.diskMonitor.Stop(jr.Ctx)
   455  		}
   456  	}
   457  }
   458  
// Compile-time assertion that *JoinReaderStats implements DistSQLSpanStats.
var _ execinfrapb.DistSQLSpanStats = &JoinReaderStats{}

// joinReaderTagPrefix is the prefix given to the stats produced by
// JoinReaderStats.Stats.
const joinReaderTagPrefix = "joinreader."
   462  
   463  // Stats implements the SpanStats interface.
   464  func (jrs *JoinReaderStats) Stats() map[string]string {
   465  	statsMap := jrs.InputStats.Stats(joinReaderTagPrefix)
   466  	toMerge := jrs.IndexLookupStats.Stats(joinReaderTagPrefix + "index.")
   467  	for k, v := range toMerge {
   468  		statsMap[k] = v
   469  	}
   470  	return statsMap
   471  }
   472  
   473  // StatsForQueryPlan implements the DistSQLSpanStats interface.
   474  func (jrs *JoinReaderStats) StatsForQueryPlan() []string {
   475  	is := append(
   476  		jrs.InputStats.StatsForQueryPlan(""),
   477  		jrs.IndexLookupStats.StatsForQueryPlan("index ")...,
   478  	)
   479  	return is
   480  }
   481  
   482  // outputStatsToTrace outputs the collected joinReader stats to the trace. Will
   483  // fail silently if the joinReader is not collecting stats.
   484  func (jr *joinReader) outputStatsToTrace() {
   485  	is, ok := getInputStats(jr.FlowCtx, jr.input)
   486  	if !ok {
   487  		return
   488  	}
   489  	ils, ok := getFetcherInputStats(jr.FlowCtx, jr.fetcher)
   490  	if !ok {
   491  		return
   492  	}
   493  
   494  	// TODO(asubiotto): Add memory and disk usage to EXPLAIN ANALYZE.
   495  	jrs := &JoinReaderStats{
   496  		InputStats:       is,
   497  		IndexLookupStats: ils,
   498  	}
   499  	if sp := opentracing.SpanFromContext(jr.Ctx); sp != nil {
   500  		tracing.SetSpanStats(sp, jrs)
   501  	}
   502  }
   503  
   504  func (jr *joinReader) generateMeta(ctx context.Context) []execinfrapb.ProducerMetadata {
   505  	if tfs := execinfra.GetLeafTxnFinalState(ctx, jr.FlowCtx.Txn); tfs != nil {
   506  		return []execinfrapb.ProducerMetadata{{LeafTxnFinalState: tfs}}
   507  	}
   508  	return nil
   509  }
   510  
// DrainMeta is part of the MetadataSource interface.
//
// It returns the same metadata as generateMeta; unlike the trailing metadata
// callback installed in newJoinReader, it does not call close.
func (jr *joinReader) DrainMeta(ctx context.Context) []execinfrapb.ProducerMetadata {
	return jr.generateMeta(ctx)
}
   515  
   516  // ChildCount is part of the execinfra.OpNode interface.
   517  func (jr *joinReader) ChildCount(verbose bool) int {
   518  	if _, ok := jr.input.(execinfra.OpNode); ok {
   519  		return 1
   520  	}
   521  	return 0
   522  }
   523  
   524  // Child is part of the execinfra.OpNode interface.
   525  func (jr *joinReader) Child(nth int, verbose bool) execinfra.OpNode {
   526  	if nth == 0 {
   527  		if n, ok := jr.input.(execinfra.OpNode); ok {
   528  			return n
   529  		}
   530  		panic("input to joinReader is not an execinfra.OpNode")
   531  	}
   532  	panic(fmt.Sprintf("invalid index %d", nth))
   533  }