github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/interleaved_reader_joiner.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rowexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    18  	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/row"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/scrub"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry"
    25  	"github.com/cockroachdb/cockroach/pkg/util/log"
    26  	"github.com/cockroachdb/errors"
    27  )
    28  
// irjState represents the state of the processor. It selects which branch of
// the state machine in Next runs on the following iteration.
type irjState int

const (
	// irjStateUnknown is the zero value. nextRow returns it together with
	// draining metadata after it has moved the processor to draining because of
	// an error; Next treats it as unsupported if it is ever dispatched on.
	irjStateUnknown irjState = iota
	// irjReading causes the state machine to read the next row from the kvFetcher
	// and potentially output a merged row.
	irjReading
	// irjUnmatchedChild indicates that the state machine should output the
	// unmatched child row stored in the unmatchedChild field.
	irjUnmatchedChild
)
    41  
// tableInfo holds the per-table state of one side of the interleaved join.
//
// tableID and indexID identify the table-index pair; findTable compares them
// against the descriptors returned by the fetcher to decide which table a
// fetched row belongs to. post is the output helper that post-processes each
// fetched row of this table before it is joined. ordering is the column
// ordering handed to CompareEncDatumRowForMerge when ancestor and child rows
// are compared.
type tableInfo struct {
	tableID  sqlbase.ID
	indexID  sqlbase.IndexID
	post     execinfra.ProcOutputHelper
	ordering sqlbase.ColumnOrdering
}
    48  
// interleavedReaderJoiner is at the start of a computation flow: it performs KV
// operations to retrieve rows for two tables (ancestor and child), internally
// filters the rows, and performs a merge join with equality constraints.
// See docs/RFCS/20171025_interleaved_table_joins.md
type interleavedReaderJoiner struct {
	joinerBase

	// runningState represents the state of the processor. This is in addition to
	// ProcessorBase.State - the runningState is only relevant when
	// ProcessorBase.State == StateRunning.
	runningState irjState

	// Each tableInfo contains the output helper (for intermediate
	// filtering) and ordering info for each table-index being joined.
	// allSpans holds the merged spans of all tables and is what Start hands to
	// the fetcher; limitHint is passed to the fetcher alongside it.
	tables    []tableInfo
	allSpans  roachpb.Spans
	limitHint int64

	// fetcher reads the rows of both tables; alloc is the datum allocator given
	// to the fetcher at initialization and to the merge comparison in nextRow.
	fetcher row.Fetcher
	alloc   sqlbase.DatumAlloc

	// ancestorRow is a copy of the most recently read ancestor row, or nil if
	// none has been read yet or it was reset after a non-matching child row.
	// TODO(richardwu): If we need to buffer more than 1 ancestor row for
	// prefix joins, subset joins, and/or outer joins, we need to buffer an
	// arbitrary number of ancestor and child rows.
	// We can use streamMerger here for simplicity.
	ancestorRow sqlbase.EncDatumRow
	// These are required for OUTER joins where the ancestor needs to be
	// emitted regardless. ancestorJoined records whether ancestorRow has been
	// joined with at least one child row; ancestorJoinSide and
	// descendantJoinSide are the join sides of the ancestor and child tables;
	// unmatchedChild is the child row to emit while in the irjUnmatchedChild
	// state.
	ancestorJoined     bool
	ancestorJoinSide   joinSide
	descendantJoinSide joinSide
	unmatchedChild     sqlbase.EncDatumRow
	// ancestorTablePos is the corresponding index of the ancestor table in
	// tables.
	ancestorTablePos int
}
    85  
    86  func (irj *interleavedReaderJoiner) Start(ctx context.Context) context.Context {
    87  	irj.runningState = irjReading
    88  	ctx = irj.StartInternal(ctx, interleavedReaderJoinerProcName)
    89  	// TODO(radu,andrei,knz): set the traceKV flag when requested by the session.
    90  	if err := irj.fetcher.StartScan(
    91  		irj.Ctx, irj.FlowCtx.Txn, irj.allSpans, true /* limitBatches */, irj.limitHint, false, /* traceKV */
    92  	); err != nil {
    93  		irj.MoveToDraining(err)
    94  	}
    95  	return ctx
    96  }
    97  
    98  func (irj *interleavedReaderJoiner) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
    99  	// Next is implemented as a state machine. The states are represented by the
   100  	// irjState enum at the top of this file.
   101  	// Roughly, the state machine is either in an initialization phase, a steady
   102  	// state phase that outputs either 1 or 0 rows on every call, or a special
   103  	// unmatched child phase that outputs a child row that doesn't match the last
   104  	// seen ancestor if the join type calls for it.
   105  	for irj.State == execinfra.StateRunning {
   106  		var row sqlbase.EncDatumRow
   107  		var meta *execinfrapb.ProducerMetadata
   108  		switch irj.runningState {
   109  		case irjReading:
   110  			irj.runningState, row, meta = irj.nextRow()
   111  		case irjUnmatchedChild:
   112  			rendered := irj.renderUnmatchedRow(irj.unmatchedChild, irj.descendantJoinSide)
   113  			row = irj.ProcessRowHelper(rendered)
   114  			irj.unmatchedChild = nil
   115  			irj.runningState = irjReading
   116  		default:
   117  			log.Fatalf(irj.Ctx, "unsupported state: %d", irj.runningState)
   118  		}
   119  		if row != nil || meta != nil {
   120  			return row, meta
   121  		}
   122  	}
   123  	return nil, irj.DrainHelper()
   124  }
   125  
   126  // findTable returns the tableInfo for the given table and index descriptor,
   127  // along with a boolean that is true if the found tableInfo represents the
   128  // ancestor table in this join. err is non-nil if the table was missing from the
   129  // list.
   130  func (irj *interleavedReaderJoiner) findTable(
   131  	table *sqlbase.TableDescriptor, index *sqlbase.IndexDescriptor,
   132  ) (tInfo *tableInfo, isAncestorRow bool, err error) {
   133  	for i := range irj.tables {
   134  		tInfo = &irj.tables[i]
   135  		if table.ID == tInfo.tableID && index.ID == tInfo.indexID {
   136  			if i == irj.ancestorTablePos {
   137  				isAncestorRow = true
   138  			}
   139  			return tInfo, isAncestorRow, nil
   140  		}
   141  	}
   142  	return nil,
   143  		false,
   144  		errors.Errorf("index %q.%q missing from interleaved join",
   145  			table.Name, index.Name)
   146  }
   147  
   148  // nextRow implements the steady state of the interleavedReaderJoiner. It
   149  // requests the next row from its backing kv fetcher, determines whether its an
   150  // ancestor or child row, and conditionally merges and outputs a result.
   151  func (irj *interleavedReaderJoiner) nextRow() (
   152  	irjState,
   153  	sqlbase.EncDatumRow,
   154  	*execinfrapb.ProducerMetadata,
   155  ) {
   156  	row, desc, index, err := irj.fetcher.NextRow(irj.Ctx)
   157  	if err != nil {
   158  		irj.MoveToDraining(scrub.UnwrapScrubError(err))
   159  		return irjStateUnknown, nil, irj.DrainHelper()
   160  	}
   161  	if row == nil {
   162  		// All done - just finish maybe emitting our last ancestor.
   163  		lastAncestor := irj.maybeUnmatchedAncestor()
   164  		irj.MoveToDraining(nil)
   165  		return irjReading, lastAncestor, nil
   166  	}
   167  
   168  	// Lookup the helper that belongs to this row.
   169  	tInfo, isAncestorRow, err := irj.findTable(desc, index)
   170  	if err != nil {
   171  		irj.MoveToDraining(err)
   172  		return irjStateUnknown, nil, irj.DrainHelper()
   173  	}
   174  
   175  	// We post-process the intermediate row from either table.
   176  	tableRow, ok, err := tInfo.post.ProcessRow(irj.Ctx, row)
   177  	if err != nil {
   178  		irj.MoveToDraining(err)
   179  		return irjStateUnknown, nil, irj.DrainHelper()
   180  	}
   181  	if !ok {
   182  		irj.MoveToDraining(nil)
   183  	}
   184  
   185  	// Row was filtered out.
   186  	if tableRow == nil {
   187  		return irjReading, nil, nil
   188  	}
   189  
   190  	if isAncestorRow {
   191  		maybeAncestor := irj.maybeUnmatchedAncestor()
   192  
   193  		irj.ancestorJoined = false
   194  		irj.ancestorRow = tInfo.post.RowAlloc.CopyRow(tableRow)
   195  
   196  		// If maybeAncestor is nil, we'll loop back around and read the next row
   197  		// without returning a row to the caller.
   198  		return irjReading, maybeAncestor, nil
   199  	}
   200  
   201  	// A child row (tableRow) is fetched.
   202  
   203  	// TODO(richardwu): Generalize this to 2+ tables and sibling
   204  	// tables.
   205  	var lrow, rrow sqlbase.EncDatumRow
   206  	if irj.ancestorTablePos == 0 {
   207  		lrow, rrow = irj.ancestorRow, tableRow
   208  	} else {
   209  		lrow, rrow = tableRow, irj.ancestorRow
   210  	}
   211  
   212  	// TODO(richardwu): this is a very expensive comparison
   213  	// in the hot path. We can avoid this if there is a foreign
   214  	// key constraint between the merge columns.
   215  	// That is: any child rows can be joined with the most
   216  	// recent parent row without this comparison.
   217  	cmp, err := CompareEncDatumRowForMerge(
   218  		irj.tables[0].post.OutputTypes,
   219  		lrow,
   220  		rrow,
   221  		irj.tables[0].ordering,
   222  		irj.tables[1].ordering,
   223  		false, /* nullEquality */
   224  		&irj.alloc,
   225  		irj.FlowCtx.EvalCtx,
   226  	)
   227  	if err != nil {
   228  		irj.MoveToDraining(err)
   229  		return irjStateUnknown, nil, irj.DrainHelper()
   230  	}
   231  
   232  	// The child row match the most recent ancestorRow on the
   233  	// equality columns.
   234  	// Try to join/render and emit.
   235  	if cmp == 0 {
   236  		renderedRow, err := irj.render(lrow, rrow)
   237  		if err != nil {
   238  			irj.MoveToDraining(err)
   239  			return irjStateUnknown, nil, irj.DrainHelper()
   240  		}
   241  		if renderedRow != nil {
   242  			irj.ancestorJoined = true
   243  		}
   244  		return irjReading, irj.ProcessRowHelper(renderedRow), nil
   245  	}
   246  
   247  	// Child does not match previous ancestorRow.
   248  	// Try to emit the ancestor row.
   249  	unmatchedAncestor := irj.maybeUnmatchedAncestor()
   250  
   251  	// Reset the ancestorRow (we know there are no more
   252  	// corresponding children rows).
   253  	irj.ancestorRow = nil
   254  	irj.ancestorJoined = false
   255  
   256  	newState := irjReading
   257  	// Set the unmatched child if necessary (we'll pick it up again after we emit
   258  	// the ancestor).
   259  	if shouldEmitUnmatchedRow(irj.descendantJoinSide, irj.joinType) {
   260  		irj.unmatchedChild = row
   261  		newState = irjUnmatchedChild
   262  	}
   263  
   264  	return newState, unmatchedAncestor, nil
   265  }
   266  
// ConsumerClosed is called when the consumer will not request any more rows;
// it closes the processor by calling InternalClose.
func (irj *interleavedReaderJoiner) ConsumerClosed() {
	// The consumer is done, Next() will not be called again.
	irj.InternalClose()
}
   271  
   272  var _ execinfra.Processor = &interleavedReaderJoiner{}
   273  var _ execinfra.RowSource = &interleavedReaderJoiner{}
   274  var _ execinfrapb.MetadataSource = &interleavedReaderJoiner{}
   275  var _ execinfra.OpNode = &interleavedReaderJoiner{}
   276  
   277  // newInterleavedReaderJoiner creates a interleavedReaderJoiner.
   278  func newInterleavedReaderJoiner(
   279  	flowCtx *execinfra.FlowCtx,
   280  	processorID int32,
   281  	spec *execinfrapb.InterleavedReaderJoinerSpec,
   282  	post *execinfrapb.PostProcessSpec,
   283  	output execinfra.RowReceiver,
   284  ) (*interleavedReaderJoiner, error) {
   285  	// NB: we hit this with a zero NodeID (but !ok) with multi-tenancy.
   286  	if nodeID, ok := flowCtx.NodeID.OptionalNodeID(); nodeID == 0 && ok {
   287  		return nil, errors.AssertionFailedf("attempting to create an interleavedReaderJoiner with uninitialized NodeID")
   288  	}
   289  
   290  	// Increment some telemetry counters about use of the interleaved table join feature.
   291  	telemetry.Inc(sqltelemetry.InterleavedTableJoinCounter)
   292  
   293  	// TODO(richardwu): We can relax this to < 2 (i.e. permit 2+ tables).
   294  	// This will require modifying joinerBase init logic.
   295  	if len(spec.Tables) != 2 {
   296  		return nil, errors.AssertionFailedf("interleavedReaderJoiner only reads from two tables in an interleaved hierarchy")
   297  	}
   298  
   299  	// Ensure the column orderings of all tables being merged are in the
   300  	// same direction.
   301  	for i, c := range spec.Tables[0].Ordering.Columns {
   302  		for _, table := range spec.Tables[1:] {
   303  			if table.Ordering.Columns[i].Direction != c.Direction {
   304  				return nil, errors.AssertionFailedf("unmatched column orderings")
   305  			}
   306  		}
   307  	}
   308  
   309  	tables := make([]tableInfo, len(spec.Tables))
   310  	// We need to take spans from all tables and merge them together
   311  	// for Fetcher.
   312  	allSpans := make(roachpb.Spans, 0, len(spec.Tables))
   313  
   314  	// We need to figure out which table is the ancestor.
   315  	var ancestorTablePos int
   316  	var numAncestorPKCols int
   317  	minAncestors := -1
   318  	for i, table := range spec.Tables {
   319  		index, _, err := table.Desc.FindIndexByIndexIdx(int(table.IndexIdx))
   320  		if err != nil {
   321  			return nil, err
   322  		}
   323  
   324  		// The simplest way is to find the table with the fewest
   325  		// interleave ancestors.
   326  		// TODO(richardwu): Adapt this for sibling joins and multi-table joins.
   327  		if minAncestors == -1 || len(index.Interleave.Ancestors) < minAncestors {
   328  			minAncestors = len(index.Interleave.Ancestors)
   329  			ancestorTablePos = i
   330  			numAncestorPKCols = len(index.ColumnIDs)
   331  		}
   332  
   333  		if err := tables[i].post.Init(
   334  			&table.Post, table.Desc.ColumnTypes(), flowCtx.NewEvalCtx(), nil, /*output*/
   335  		); err != nil {
   336  			return nil, errors.NewAssertionErrorWithWrappedErrf(err,
   337  				"failed to initialize post-processing helper")
   338  		}
   339  
   340  		tables[i].tableID = table.Desc.ID
   341  		tables[i].indexID = index.ID
   342  		tables[i].ordering = execinfrapb.ConvertToColumnOrdering(table.Ordering)
   343  		for _, trSpan := range table.Spans {
   344  			allSpans = append(allSpans, trSpan.Span)
   345  		}
   346  	}
   347  
   348  	if len(spec.Tables[0].Ordering.Columns) != numAncestorPKCols {
   349  		return nil, errors.AssertionFailedf(
   350  			"interleavedReaderJoiner only supports joins on the entire interleaved prefix")
   351  	}
   352  
   353  	allSpans, _ = roachpb.MergeSpans(allSpans)
   354  
   355  	ancestorJoinSide := leftSide
   356  	descendantJoinSide := rightSide
   357  	if ancestorTablePos == 1 {
   358  		ancestorJoinSide = rightSide
   359  		descendantJoinSide = leftSide
   360  	}
   361  
   362  	irj := &interleavedReaderJoiner{
   363  		tables:             tables,
   364  		allSpans:           allSpans,
   365  		ancestorTablePos:   ancestorTablePos,
   366  		ancestorJoinSide:   ancestorJoinSide,
   367  		descendantJoinSide: descendantJoinSide,
   368  	}
   369  
   370  	if err := irj.initRowFetcher(
   371  		flowCtx, spec.Tables, tables, spec.Reverse, spec.LockingStrength, &irj.alloc,
   372  	); err != nil {
   373  		return nil, err
   374  	}
   375  
   376  	irj.limitHint = execinfra.LimitHint(spec.LimitHint, post)
   377  
   378  	// TODO(richardwu): Generalize this to 2+ tables.
   379  	if err := irj.joinerBase.init(
   380  		irj,
   381  		flowCtx,
   382  		processorID,
   383  		irj.tables[0].post.OutputTypes,
   384  		irj.tables[1].post.OutputTypes,
   385  		spec.Type,
   386  		spec.OnExpr,
   387  		nil, /*leftEqColumns*/
   388  		nil, /*rightEqColumns*/
   389  		0,   /*numMergedColumns*/
   390  		post,
   391  		output,
   392  		execinfra.ProcStateOpts{
   393  			InputsToDrain:        []execinfra.RowSource{},
   394  			TrailingMetaCallback: irj.generateTrailingMeta,
   395  		},
   396  	); err != nil {
   397  		return nil, err
   398  	}
   399  
   400  	return irj, nil
   401  }
   402  
   403  func (irj *interleavedReaderJoiner) initRowFetcher(
   404  	flowCtx *execinfra.FlowCtx,
   405  	tables []execinfrapb.InterleavedReaderJoinerSpec_Table,
   406  	tableInfos []tableInfo,
   407  	reverseScan bool,
   408  	lockStr sqlbase.ScanLockingStrength,
   409  	alloc *sqlbase.DatumAlloc,
   410  ) error {
   411  	args := make([]row.FetcherTableArgs, len(tables))
   412  
   413  	for i, table := range tables {
   414  		desc := sqlbase.NewImmutableTableDescriptor(table.Desc)
   415  		var err error
   416  		args[i].Index, args[i].IsSecondaryIndex, err = desc.FindIndexByIndexIdx(int(table.IndexIdx))
   417  		if err != nil {
   418  			return err
   419  		}
   420  
   421  		args[i].ValNeededForCol = tableInfos[i].post.NeededColumns()
   422  		args[i].ColIdxMap = desc.ColumnIdxMap()
   423  		args[i].Desc = desc
   424  		args[i].Cols = desc.Columns
   425  		args[i].Spans = make(roachpb.Spans, len(table.Spans))
   426  		for j, trSpan := range table.Spans {
   427  			args[i].Spans[j] = trSpan.Span
   428  		}
   429  	}
   430  
   431  	return irj.fetcher.Init(
   432  		flowCtx.Codec(),
   433  		reverseScan,
   434  		lockStr,
   435  		true, /* returnRangeInfo */
   436  		true, /* isCheck */
   437  		alloc,
   438  		args...,
   439  	)
   440  }
   441  
   442  func (irj *interleavedReaderJoiner) generateTrailingMeta(
   443  	ctx context.Context,
   444  ) []execinfrapb.ProducerMetadata {
   445  	trailingMeta := irj.generateMeta(ctx)
   446  	irj.InternalClose()
   447  	return trailingMeta
   448  }
   449  
   450  func (irj *interleavedReaderJoiner) generateMeta(
   451  	ctx context.Context,
   452  ) []execinfrapb.ProducerMetadata {
   453  	var trailingMeta []execinfrapb.ProducerMetadata
   454  	nodeID, ok := irj.FlowCtx.NodeID.OptionalNodeID()
   455  	if ok {
   456  		ranges := execinfra.MisplannedRanges(ctx, irj.fetcher.GetRangesInfo(), nodeID)
   457  		if ranges != nil {
   458  			trailingMeta = append(trailingMeta, execinfrapb.ProducerMetadata{Ranges: ranges})
   459  		}
   460  	}
   461  	if tfs := execinfra.GetLeafTxnFinalState(ctx, irj.FlowCtx.Txn); tfs != nil {
   462  		trailingMeta = append(trailingMeta, execinfrapb.ProducerMetadata{LeafTxnFinalState: tfs})
   463  	}
   464  	return trailingMeta
   465  }
   466  
   467  // DrainMeta is part of the MetadataSource interface.
   468  func (irj *interleavedReaderJoiner) DrainMeta(ctx context.Context) []execinfrapb.ProducerMetadata {
   469  	return irj.generateMeta(ctx)
   470  }
   471  
// interleavedReaderJoinerProcName is the processor name passed to
// StartInternal when the interleavedReaderJoiner is started.
const interleavedReaderJoinerProcName = "interleaved reader joiner"
   473  
   474  func (irj *interleavedReaderJoiner) maybeUnmatchedAncestor() sqlbase.EncDatumRow {
   475  	// We first try to emit the previous ancestor row if it
   476  	// was never joined with a child row.
   477  	if irj.ancestorRow != nil && !irj.ancestorJoined {
   478  		if !shouldEmitUnmatchedRow(irj.ancestorJoinSide, irj.joinType) {
   479  			return nil
   480  		}
   481  
   482  		rendered := irj.renderUnmatchedRow(irj.ancestorRow, irj.ancestorJoinSide)
   483  		return irj.ProcessRowHelper(rendered)
   484  	}
   485  	return nil
   486  }
   487  
// ChildCount is part of the execinfra.OpNode interface. It always returns 0,
// regardless of verbose.
func (irj *interleavedReaderJoiner) ChildCount(verbose bool) int {
	return 0
}
   492  
   493  // Child is part of the execinfra.OpNode interface.
   494  func (irj *interleavedReaderJoiner) Child(nth int, verbose bool) execinfra.OpNode {
   495  	panic(fmt.Sprintf("invalid index %d", nth))
   496  }