github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/zigzagjoiner.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rowexec
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  
    17  	"github.com/cockroachdb/cockroach/pkg/kv"
    18  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/row"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/scrub"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/span"
    25  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    26  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    27  	"github.com/cockroachdb/cockroach/pkg/util"
    28  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    29  	"github.com/cockroachdb/cockroach/pkg/util/log"
    30  	"github.com/cockroachdb/errors"
    31  )
    32  
    33  // Consider the schema:
    34  //
    35  // CREATE TABLE abcd (a INT, b INT, c INT, d INT, PRIMARY KEY (a, b),
    36  // INDEX c_idx (c), INDEX d_idx (d));
    37  //
    38  // and the query:
    39  //
    40  // SELECT * FROM abcd@c_idx WHERE c = 2 AND d = 3;
    41  //
    42  //
    43  // Without a zigzag joiner, this query would previously execute: index scan on
    44  // `c_idx`, followed by an index join on the primary index, then filter out rows
    45  // where `d ≠ 3`.
    46  // This plan scans through all values in `c_idx` where `c = 2`, however if among
    47  // these rows there are not many where `d = 3` a lot of rows are unnecessarily
    48  // scanned. A zigzag join allows us to skip many of these rows and many times
    49  // will also render the index join unnecessary, by making use of `d_idx`.
    50  //
    51  // To see how this query would be executed, consider the equivalent query:
    52  //
// SELECT t1.* FROM abcd@c_idx AS t1 JOIN abcd@d_idx AS t2 ON t1.a = t2.a AND
// t1.b = t2.b WHERE t1.c = 2 AND t2.d = 3;
    55  //
    56  // A zigzag joiner takes 2 sides as input. In the example above, the join would
    57  // be between `c_idx` and `d_idx`. Both sides will have the same equality
    58  // columns: (a, b) since that is the primary key of the table. The `c_idx` side
    59  // fixes a prefix (c) to a specific value (2), as does the `d_idx` side (d = 3).
    60  // This can be summarized as:
    61  // Side 1:
    62  //
    63  // - Index: `abcd@c_idx`, with columns (c | a, b)
    64  // - Equality columns: (a, b)
    65  // - Fixed columns: (c)
    66  // - Fixed values: (2)
    67  //
    68  // Side 2:
    69  //
    70  // - Index: `abcd@d_idx`, with columns (d | a, b)
    71  // - Equality columns: (a, b)
    72  // - Fixed columns: (d)
    73  // - Fixed values: (3)
    74  //
    75  // The actual execution can be visualized below :
    76  //
    77  //   c_idx         d_idx
    78  // c | a, b       d | a, b
    79  // ============= ============
    80  // --> 2   1  1 ----> 3   1  1 ---+ X
    81  //                                |
    82  // +----------------- 3   4  2 <--+
    83  // |                  3   4  3
    84  // |                  3   5  6
    85  // |                  3   7  2
    86  // +--> 2  8  2 -------------------+
    87  //                                 |
    88  // +----------------- 3   8  3 ----+
    89  // |
    90  // +-> 2  9  3 -----> 3   9  3 --+ X
    91  //                               |
    92  //                 nil (Done) <--+
    93  //
    94  //
// - The execution starts by fetching the (2, 1, 1) row from c_idx. This is the
// first row fetched when an index lookup is performed on `c_idx` where
// `c = 2`. Let this be the `baseRow`. This is the current contender for other
// rows to match.
    98  // - An index lookup is performed on `d_idx` for the first row where `d = 3`
    99  // that has equality column (a, b) values greater than or equal to (1, 1), which
   100  // are the values of the equality columns of the `baseRow`.
   101  // - The index lookup on `d_idx` retrieves the row (3, 1, 1)
   102  // - The equality columns of the (3, 1, 1) row are compared against the equality
   103  // columns of the base row (2, 1, 1). They are found to match.
   104  // - Since both indexes are sorted, once a match is found it is guaranteed that
   105  // all rows which match the `baseRow` will follow the two that were matched. All
   106  // of the possible matches are found and put into a container that is maintained
// by each side. Since the equality columns are the primary key, only one match
   108  // can be produced in this example. The left container now contains (2, 1, 1)
   109  // and the right container now contains (3, 1, 1).
   110  // - The cross-product of the containers is emitted. In this case, just the row
   111  // (1, 1, 2, 3).
   112  // - The side with the latest match, (3, 1, 1) in this case, will fetch the next
   113  // row in the index (3, 4, 2). This becomes the new `baseRow`.
   114  // - As before, an index lookup is performed on the other side `c_idx` for the
   115  // first row where `c = 2` that has the equality column (a, b) values greater
   116  // than or equal to (4, 2), which are the values of the equality columns of the
   117  // `baseRow`. In this example, the processor can skip a group of rows that are
   118  // guaranteed to not be in the output of the join.
   119  // - The first row found is (2, 8, 2). Since the equality columns do not match
   120  // to the base row ((8, 2) ≠ (4, 2)), this row becomes the new base row and the
   121  // process is repeated.
   122  // - We are done when the index lookup returns `nil`. There were no more rows in
   123  // this index that could satisfy the join.
   124  //
   125  //
   126  // When Can a Zigzag Join Be Planned:
   127  //
   128  // Every side of a zigzag join has fixed columns, equality columns, and index
   129  // columns.
   130  //
   131  // A zigzag join can be used when for each side, there exists an index with the
   132  // prefix (fixed columns + equality columns). This guarantees that the rows on
   133  // both sides of the join, when iterating through the index, will have both
   134  // sides of the join sorted by its equality columns.
   135  //
   136  // When Should a Zigzag Join Be Planned:
   137  //
// The intuition behind when a zigzag join should be used is when the
// cardinality of the output is much smaller than the size of either side of
// the join. If this is not the case, it may end up being slower than other
// joins because it is constantly alternating between sides of the join.
// Alternatively, the zigzag join should be used in cases where an index scan
// would be used with a filter on the results. Examples would be inverted index
// JSON queries and queries such as the
// `SELECT * FROM abcd@c_idx WHERE c = 2 AND d = 3;` example above.
   146  //
   147  // For a description of index columns, refer to Appendix A.
   148  //
   149  // Additional Cases
   150  //
   151  // Normal Joins
   152  // This algorithm can also be applied to normal joins such as:
   153  //
   154  // SELECT t1.a, t1.b FROM abcd t1 JOIN abcd t2 ON t1.b = t2.a WHERE t1.a = 3;
   155  //
   156  // (Using the same schema as above).
   157  //
   158  // The sides of this zigzag join would be:
   159  // Side 1:
   160  //
   161  // - Index: `abcd@primary`
   162  // - Equality columns: (b)
   163  // - Fixed columns: (a)
   164  // - Fixed values: (3)
   165  //
   166  // Side 2:
   167  //
   168  // - Index: `abcd@primary`
   169  // - Equality columns: (a)
   170  // - Fixed columns: None
// - Fixed values: None
   172  //
   173  // Note: If the query were to `SELECT *` instead of `SELECT a, b` a further
   174  // index join would be needed, but this index join would only be applied on the
   175  // necessary rows.
   176  //
   177  // No Fixed Columns
   178  // As shown above, a side can have no fixed columns. This means that the
   179  // equality columns will be a prefix of the index. Specifically this means that
   180  // all rows in the index will be considered rather than doing a lookup on a
   181  // specific prefix.
   182  //
   183  // Multi-Way Join [not implemented]:
   184  // Also note that this algorithm can be extended to support a multi-way join by
   185  // performing index lookups in a round-robin fashion iterating through all of
   186  // the sides until a match is found on all sides of the join. It is expected
   187  // that a zigzag join’s utility will increase as the number of sides increases
   188  // because more rows will be able to be skipped.
   189  //
   190  //
   191  // Appendix A: Indexes
   192  //
// The zigzag join makes use of multiple indexes. Each index is composed of a
   194  // set of explicit columns, and a set of implicit columns. The union of these
   195  // sets will be referred to as index columns.
   196  //
   197  // The purpose of implicit columns in indexes is to provide unique keys for
   198  // RocksDB as well as to be able to relate the specified row back to the primary
   199  // index where the full row is stored.
   200  //
   201  // Consider the schema:
   202  //
// CREATE TABLE abcd (a INT, b INT, c INT, d INT, PRIMARY KEY (a, b),
// INDEX c_idx (c), INDEX da_idx (d, a), INDEX db_idx (d, b));
//
// The following four indexes are created:
   207  //
   208  // - Primary Index: (Key format: `/Table/abcd/primary/<a_val>/<b_val>/`)
   209  // - Explicit columns: (a, b)
   210  // - Implicit columns: None
   211  // - Index columns: (a, b)
   212  // - c_idx: (Key format: `/Table/abcd/c_idx/<c_val>/<a_val>/<b_val>/`)
   213  // - Explicit columns: (c)
   214  // - Implicit columns: (a, b)
// - Index columns: (c, a, b)
// - da_idx: (Key format: `/Table/abcd/da_idx/<d_val>/<a_val>/<b_val>/`)
// - Explicit columns: (d, a)
// - Implicit columns: (b)
// - Index columns: (d, a, b)
// - db_idx: (Key format: `/Table/abcd/db_idx/<d_val>/<b_val>/<a_val>/`)
// - Explicit columns: (d, b)
// - Implicit columns: (a)
   223  // - Index columns: (d, b, a)
type zigzagJoiner struct {
	joinerBase

	// evalCtx and cancelChecker are assigned in Start().
	evalCtx       *tree.EvalContext
	cancelChecker *sqlbase.CancelChecker

	// numTables stores the number of tables involved in the join.
	numTables int
	// side keeps track of which side is being processed.
	side int

	// infos stores the relevant information for each side of the join,
	// including table descriptors, indexes, row fetchers, and more. See
	// zigzagJoinerInfo for more information.
	infos []*zigzagJoinerInfo

	// baseRow stores the row that newly fetched rows are compared against. It
	// is updated with every change of side.
	baseRow sqlbase.EncDatumRow

	// rowAlloc allocates the copies of matched rows that are pushed into the
	// per-side containers.
	rowAlloc sqlbase.EncDatumRowAlloc

	// TODO(andrei): get rid of this field and move the actions it gates into the
	// Start() method.
	started bool

	// returnedMeta contains all the metadata that zigzag joiner has emitted.
	returnedMeta []execinfrapb.ProducerMetadata
}
   253  
   254  // Batch size is a parameter which determines how many rows should be fetched
   255  // at a time. Increasing this will improve performance for when matched rows
   256  // are grouped together, but increasing this too much will result in fetching
   257  // too many rows and therefore skipping less rows.
   258  const zigzagJoinerBatchSize = 5
   259  
   260  var _ execinfra.Processor = &zigzagJoiner{}
   261  var _ execinfra.RowSource = &zigzagJoiner{}
   262  var _ execinfrapb.MetadataSource = &zigzagJoiner{}
   263  var _ execinfra.OpNode = &zigzagJoiner{}
   264  
   265  const zigzagJoinerProcName = "zigzagJoiner"
   266  
   267  // newZigzagJoiner creates a new zigzag joiner given a spec and an EncDatumRow
   268  // holding the values of the prefix columns of the index specified in the spec.
   269  func newZigzagJoiner(
   270  	flowCtx *execinfra.FlowCtx,
   271  	processorID int32,
   272  	spec *execinfrapb.ZigzagJoinerSpec,
   273  	fixedValues []sqlbase.EncDatumRow,
   274  	post *execinfrapb.PostProcessSpec,
   275  	output execinfra.RowReceiver,
   276  ) (*zigzagJoiner, error) {
   277  	z := &zigzagJoiner{}
   278  
   279  	leftColumnTypes := spec.Tables[0].ColumnTypes()
   280  	rightColumnTypes := spec.Tables[1].ColumnTypes()
   281  	leftEqCols := make([]uint32, 0, len(spec.EqColumns[0].Columns))
   282  	rightEqCols := make([]uint32, 0, len(spec.EqColumns[1].Columns))
   283  	err := z.joinerBase.init(
   284  		z, /* self */
   285  		flowCtx,
   286  		processorID,
   287  		leftColumnTypes,
   288  		rightColumnTypes,
   289  		spec.Type,
   290  		spec.OnExpr,
   291  		leftEqCols,
   292  		rightEqCols,
   293  		0, /* numMerged */
   294  		post,
   295  		output,
   296  		execinfra.ProcStateOpts{}, // zigzagJoiner doesn't have any inputs to drain.
   297  	)
   298  	if err != nil {
   299  		return nil, err
   300  	}
   301  
   302  	z.numTables = len(spec.Tables)
   303  	z.infos = make([]*zigzagJoinerInfo, z.numTables)
   304  	z.returnedMeta = make([]execinfrapb.ProducerMetadata, 0, 1)
   305  
   306  	for i := range z.infos {
   307  		z.infos[i] = &zigzagJoinerInfo{}
   308  	}
   309  
   310  	colOffset := 0
   311  	for i := 0; i < z.numTables; i++ {
   312  		if fixedValues != nil && i < len(fixedValues) {
   313  			// Useful for testing. In cases where we plan a zigzagJoin in
   314  			// the planner, we specify fixed values as ValuesCoreSpecs in
   315  			// the spec itself.
   316  			z.infos[i].fixedValues = fixedValues[i]
   317  		} else if i < len(spec.FixedValues) {
   318  			z.infos[i].fixedValues, err = valuesSpecToEncDatum(spec.FixedValues[i])
   319  			if err != nil {
   320  				return nil, err
   321  			}
   322  		}
   323  		if err := z.setupInfo(flowCtx, spec, i, colOffset); err != nil {
   324  			return nil, err
   325  		}
   326  		colOffset += len(z.infos[i].table.Columns)
   327  	}
   328  	z.side = 0
   329  	return z, nil
   330  }
   331  
   332  // Helper function to convert a values spec containing one tuple into EncDatums for
   333  // each cell. Note that this function assumes that there is only one tuple in the
   334  // ValuesSpec (i.e. the way fixed values are encoded in the ZigzagJoinSpec).
   335  func valuesSpecToEncDatum(
   336  	valuesSpec *execinfrapb.ValuesCoreSpec,
   337  ) (res []sqlbase.EncDatum, err error) {
   338  	res = make([]sqlbase.EncDatum, len(valuesSpec.Columns))
   339  	rem := valuesSpec.RawBytes[0]
   340  	for i, colInfo := range valuesSpec.Columns {
   341  		res[i], rem, err = sqlbase.EncDatumFromBuffer(colInfo.Type, colInfo.Encoding, rem)
   342  		if err != nil {
   343  			return nil, err
   344  		}
   345  	}
   346  	return res, nil
   347  }
   348  
   349  // Start is part of the RowSource interface.
   350  func (z *zigzagJoiner) Start(ctx context.Context) context.Context {
   351  	ctx = z.StartInternal(ctx, zigzagJoinerProcName)
   352  	z.evalCtx = z.FlowCtx.NewEvalCtx()
   353  	z.cancelChecker = sqlbase.NewCancelChecker(ctx)
   354  	log.VEventf(ctx, 2, "starting zigzag joiner run")
   355  	return ctx
   356  }
   357  
// zigzagJoinerInfo contains all the information that needs to be
// stored for each side of the join.
type zigzagJoinerInfo struct {
	// fetcher reads rows from this side's index and alloc is the datum
	// allocator handed to it. table and index identify what is scanned.
	// indexTypes and indexDirs hold the type and direction of every column
	// returned by index.FullColumnIDs(), in that order.
	fetcher    row.Fetcher
	alloc      *sqlbase.DatumAlloc
	table      *sqlbase.TableDescriptor
	index      *sqlbase.IndexDescriptor
	indexTypes []*types.T
	indexDirs  []sqlbase.IndexDescriptor_Direction

	// Stores one batch of matches at a time. When all the rows are collected
	// the cartesian product of the containers will be emitted.
	container sqlbase.EncDatumRowContainer

	// eqColumns is the ordinal positions of the equality columns within the
	// table's columns.
	eqColumns []uint32

	// Prefix of the index key that has fixed values.
	fixedValues sqlbase.EncDatumRow

	// The current key being fetched by this side.
	key roachpb.Key
	// The prefix of the key which includes the table and index IDs.
	prefix []byte
	// endKey marks where this side should stop fetching, taking into account the
	// fixedValues.
	endKey roachpb.Key

	// spanBuilder builds the lookup spans for this side's table and index.
	spanBuilder *span.Builder
}
   388  
   389  // Setup the curInfo struct for the current z.side, which specifies the side
   390  // number of the curInfo to set up.
   391  // Side specifies which the spec is associated with.
   392  // colOffset is specified to determine the appropriate range of output columns
   393  // to process. It is the number of columns in the tables of all previous sides
   394  // of the join.
   395  func (z *zigzagJoiner) setupInfo(
   396  	flowCtx *execinfra.FlowCtx, spec *execinfrapb.ZigzagJoinerSpec, side int, colOffset int,
   397  ) error {
   398  	z.side = side
   399  	info := z.infos[side]
   400  
   401  	info.alloc = &sqlbase.DatumAlloc{}
   402  	info.table = &spec.Tables[side]
   403  	info.eqColumns = spec.EqColumns[side].Columns
   404  	indexOrdinal := spec.IndexOrdinals[side]
   405  	if indexOrdinal == 0 {
   406  		info.index = &info.table.PrimaryIndex
   407  	} else {
   408  		info.index = &info.table.Indexes[indexOrdinal-1]
   409  	}
   410  
   411  	var columnIDs []sqlbase.ColumnID
   412  	columnIDs, info.indexDirs = info.index.FullColumnIDs()
   413  	info.indexTypes = make([]*types.T, len(columnIDs))
   414  	columnTypes := info.table.ColumnTypes()
   415  	colIdxMap := info.table.ColumnIdxMap()
   416  	for i, columnID := range columnIDs {
   417  		info.indexTypes[i] = columnTypes[colIdxMap[columnID]]
   418  	}
   419  
   420  	// Add the outputted columns.
   421  	neededCols := util.MakeFastIntSet()
   422  	outCols := z.Out.NeededColumns()
   423  	maxCol := colOffset + len(info.table.Columns)
   424  	for i, ok := outCols.Next(colOffset); ok && i < maxCol; i, ok = outCols.Next(i + 1) {
   425  		neededCols.Add(i - colOffset)
   426  	}
   427  
   428  	// Add the fixed columns.
   429  	for i := 0; i < len(info.fixedValues); i++ {
   430  		neededCols.Add(colIdxMap[columnIDs[i]])
   431  	}
   432  
   433  	// Add the equality columns.
   434  	for _, col := range info.eqColumns {
   435  		neededCols.Add(int(col))
   436  	}
   437  
   438  	// Setup the RowContainers.
   439  	info.container.Reset()
   440  
   441  	info.spanBuilder = span.MakeBuilder(flowCtx.Codec(), info.table, info.index)
   442  
   443  	// Setup the Fetcher.
   444  	_, _, err := initRowFetcher(
   445  		flowCtx,
   446  		&info.fetcher,
   447  		info.table,
   448  		int(indexOrdinal),
   449  		info.table.ColumnIdxMap(),
   450  		false, /* reverse */
   451  		neededCols,
   452  		false, /* check */
   453  		info.alloc,
   454  		execinfra.ScanVisibilityPublic,
   455  		// NB: zigzag joins are disabled when a row-level locking clause is
   456  		// supplied, so there is no locking strength on *ZigzagJoinerSpec.
   457  		sqlbase.ScanLockingStrength_FOR_NONE,
   458  	)
   459  	if err != nil {
   460  		return err
   461  	}
   462  
   463  	info.prefix = sqlbase.MakeIndexKeyPrefix(flowCtx.Codec(), info.table, info.index.ID)
   464  	span, err := z.produceSpanFromBaseRow()
   465  
   466  	if err != nil {
   467  		return err
   468  	}
   469  	info.key = span.Key
   470  	info.endKey = span.EndKey
   471  	return nil
   472  }
   473  
   474  func (z *zigzagJoiner) close() {
   475  	if z.InternalClose() {
   476  		log.VEventf(z.Ctx, 2, "exiting zigzag joiner run")
   477  	}
   478  }
   479  
   480  // producerMeta constructs the ProducerMetadata after consumption of rows has
   481  // terminated, either due to being indicated by the consumer, or because the
   482  // processor ran out of rows or encountered an error. It is ok for err to be
   483  // nil indicating that we're done producing rows even though no error occurred.
   484  func (z *zigzagJoiner) producerMeta(err error) *execinfrapb.ProducerMetadata {
   485  	var meta *execinfrapb.ProducerMetadata
   486  	if !z.Closed {
   487  		if err != nil {
   488  			meta = &execinfrapb.ProducerMetadata{Err: err}
   489  		} else if trace := execinfra.GetTraceData(z.Ctx); trace != nil {
   490  			meta = &execinfrapb.ProducerMetadata{TraceData: trace}
   491  		}
   492  		// We need to close as soon as we send producer metadata as we're done
   493  		// sending rows. The consumer is allowed to not call ConsumerDone().
   494  		z.close()
   495  	}
   496  	if meta != nil {
   497  		z.returnedMeta = append(z.returnedMeta, *meta)
   498  	}
   499  	return meta
   500  }
   501  
   502  func findColumnID(s []sqlbase.ColumnID, t sqlbase.ColumnID) int {
   503  	for i := range s {
   504  		if s[i] == t {
   505  			return i
   506  		}
   507  	}
   508  	return -1
   509  }
   510  
   511  // Fetches the first row from the current rowFetcher that does not have any of
   512  // the equality columns set to null.
   513  func (z *zigzagJoiner) fetchRow(ctx context.Context) (sqlbase.EncDatumRow, error) {
   514  	return z.fetchRowFromSide(ctx, z.side)
   515  }
   516  
   517  func (z *zigzagJoiner) fetchRowFromSide(
   518  	ctx context.Context, side int,
   519  ) (fetchedRow sqlbase.EncDatumRow, err error) {
   520  	// Keep fetching until a row is found that does not have null in an equality
   521  	// column.
   522  	hasNull := func(row sqlbase.EncDatumRow) bool {
   523  		for _, c := range z.infos[side].eqColumns {
   524  			if row[c].IsNull() {
   525  				return true
   526  			}
   527  		}
   528  		return false
   529  	}
   530  	for {
   531  		fetchedRow, _, _, err = z.infos[side].fetcher.NextRow(ctx)
   532  		if fetchedRow == nil || err != nil {
   533  			return fetchedRow, err
   534  		}
   535  		if !hasNull(fetchedRow) {
   536  			break
   537  		}
   538  	}
   539  	return fetchedRow, nil
   540  }
   541  
   542  // Return the datums from the equality columns from a given non-empty row
   543  // from the specified side.
   544  func (z *zigzagJoiner) extractEqDatums(row sqlbase.EncDatumRow, side int) sqlbase.EncDatumRow {
   545  	eqCols := z.infos[side].eqColumns
   546  	eqDatums := make(sqlbase.EncDatumRow, len(eqCols))
   547  	for i, col := range eqCols {
   548  		eqDatums[i] = row[col]
   549  	}
   550  	return eqDatums
   551  }
   552  
   553  // Generates a Key for an inverted index from the passed datums and side
   554  // info. Used by produceKeyFromBaseRow.
   555  func (z *zigzagJoiner) produceInvertedIndexKey(
   556  	info *zigzagJoinerInfo, datums sqlbase.EncDatumRow,
   557  ) (roachpb.Span, error) {
   558  	// For inverted indexes, the JSON field (first column in the index) is
   559  	// encoded a little differently. We need to explicitly call
   560  	// EncodeInvertedIndexKeys to generate the prefix. The rest of the
   561  	// index key containing the remaining neededDatums can be generated
   562  	// and appended using EncodeColumns.
   563  	colMap := make(map[sqlbase.ColumnID]int)
   564  	decodedDatums := make([]tree.Datum, len(datums))
   565  
   566  	// Ensure all EncDatums have been decoded.
   567  	for i, encDatum := range datums {
   568  		err := encDatum.EnsureDecoded(info.indexTypes[i], info.alloc)
   569  		if err != nil {
   570  			return roachpb.Span{}, err
   571  		}
   572  
   573  		decodedDatums[i] = encDatum.Datum
   574  		if i < len(info.index.ColumnIDs) {
   575  			colMap[info.index.ColumnIDs[i]] = i
   576  		} else {
   577  			// This column's value will be encoded in the second part (i.e.
   578  			// EncodeColumns).
   579  			colMap[info.index.ExtraColumnIDs[i-len(info.index.ColumnIDs)]] = i
   580  		}
   581  	}
   582  
   583  	keys, err := sqlbase.EncodeInvertedIndexKeys(
   584  		info.table,
   585  		info.index,
   586  		colMap,
   587  		decodedDatums,
   588  		info.prefix,
   589  	)
   590  	if err != nil {
   591  		return roachpb.Span{}, err
   592  	}
   593  	if len(keys) != 1 {
   594  		return roachpb.Span{}, errors.Errorf("%d fixed values passed in for inverted index", len(keys))
   595  	}
   596  
   597  	// Append remaining (non-JSON) datums to the key.
   598  	keyBytes, _, err := sqlbase.EncodeColumns(
   599  		info.index.ExtraColumnIDs[:len(datums)-1],
   600  		info.indexDirs[1:],
   601  		colMap,
   602  		decodedDatums,
   603  		keys[0],
   604  	)
   605  	key := roachpb.Key(keyBytes)
   606  	return roachpb.Span{Key: key, EndKey: key.PrefixEnd()}, err
   607  }
   608  
   609  // Generates a Key, corresponding to the current `z.baseRow` in
   610  // the index on the current side.
   611  func (z *zigzagJoiner) produceSpanFromBaseRow() (roachpb.Span, error) {
   612  	info := z.infos[z.side]
   613  	neededDatums := info.fixedValues
   614  	if z.baseRow != nil {
   615  		eqDatums := z.extractEqDatums(z.baseRow, z.prevSide())
   616  		neededDatums = append(neededDatums, eqDatums...)
   617  	}
   618  
   619  	// Construct correct row by concatenating right fixed datums with
   620  	// primary key extracted from `row`.
   621  	if info.index.Type == sqlbase.IndexDescriptor_INVERTED {
   622  		return z.produceInvertedIndexKey(info, neededDatums)
   623  	}
   624  
   625  	s, _, err := info.spanBuilder.SpanFromEncDatums(neededDatums, len(neededDatums))
   626  	return s, err
   627  }
   628  
   629  // Returns the column types of the equality columns.
   630  func (zi *zigzagJoinerInfo) eqColTypes() []*types.T {
   631  	eqColTypes := make([]*types.T, len(zi.eqColumns))
   632  	colTypes := zi.table.ColumnTypes()
   633  	for i := range eqColTypes {
   634  		eqColTypes[i] = colTypes[zi.eqColumns[i]]
   635  	}
   636  	return eqColTypes
   637  }
   638  
   639  // Returns the ordering of the equality columns.
   640  func (zi *zigzagJoinerInfo) eqOrdering() (sqlbase.ColumnOrdering, error) {
   641  	ordering := make(sqlbase.ColumnOrdering, len(zi.eqColumns))
   642  	for i := range zi.eqColumns {
   643  		colID := zi.table.Columns[zi.eqColumns[i]].ID
   644  		// Search the index columns, then the primary keys to find an ordering for
   645  		// the current column, 'colID'.
   646  		var direction encoding.Direction
   647  		var err error
   648  		if idx := findColumnID(zi.index.ColumnIDs, colID); idx != -1 {
   649  			direction, err = zi.index.ColumnDirections[idx].ToEncodingDirection()
   650  			if err != nil {
   651  				return nil, err
   652  			}
   653  		} else if idx := findColumnID(zi.table.PrimaryIndex.ColumnIDs, colID); idx != -1 {
   654  			direction, err = zi.table.PrimaryIndex.ColumnDirections[idx].ToEncodingDirection()
   655  			if err != nil {
   656  				return nil, err
   657  			}
   658  		} else {
   659  			return nil, errors.New("ordering of equality column not found in index or primary key")
   660  		}
   661  		ordering[i] = sqlbase.ColumnOrderInfo{ColIdx: i, Direction: direction}
   662  	}
   663  	return ordering, nil
   664  }
   665  
   666  // matchBase compares the equality columns of the given row to `z.baseRow`,
   667  // which is the previously fetched row. Returns whether or not the rows match
   668  // on the equality columns. The given row is from the specified `side`.
   669  func (z *zigzagJoiner) matchBase(curRow sqlbase.EncDatumRow, side int) (bool, error) {
   670  	if len(curRow) == 0 {
   671  		return false, nil
   672  	}
   673  
   674  	prevEqDatums := z.extractEqDatums(z.baseRow, z.prevSide())
   675  	curEqDatums := z.extractEqDatums(curRow, side)
   676  
   677  	eqColTypes := z.infos[side].eqColTypes()
   678  	ordering, err := z.infos[side].eqOrdering()
   679  	if err != nil {
   680  		return false, err
   681  	}
   682  
   683  	// Compare the equality columns of the baseRow to that of the curRow.
   684  	da := &sqlbase.DatumAlloc{}
   685  	cmp, err := prevEqDatums.Compare(eqColTypes, da, ordering, z.FlowCtx.EvalCtx, curEqDatums)
   686  	if err != nil {
   687  		return false, err
   688  	}
   689  	return cmp == 0, nil
   690  }
   691  
// emitFromContainers returns the next row that is to be emitted from those
// already stored in the containers.
// Since this is called after the side has been incremented, it produces the
// cartesian product of the previous side's container and the side before that
// one.
// It returns a nil row (and a nil error) once the previous side's container is
// empty; at that point both containers are reset.
func (z *zigzagJoiner) emitFromContainers() (sqlbase.EncDatumRow, error) {
	right := z.prevSide()
	left := z.sideBefore(right)
	for !z.infos[right].container.IsEmpty() {
		// Take the next row from the left container and pair it with the row
		// currently at the front of the right container.
		// NOTE(review): this appears to rely on the left container's Pop cycling
		// back to its first row once all rows have been popped, so that every
		// right row is paired with every left row — confirm against
		// EncDatumRowContainer.
		leftRow := z.infos[left].container.Pop()
		rightRow := z.infos[right].container.Peek()

		// TODO(pbardea): Extend this logic to support multi-way joins.
		// NOTE(review): rightSide is declared outside this view; presumably the
		// swap puts the rows in the (left, right) order that render expects.
		if left == int(rightSide) {
			leftRow, rightRow = rightRow, leftRow
		}
		renderedRow, err := z.render(leftRow, rightRow)
		if err != nil {
			return nil, err
		}
		// Advance the right container once the left container reports empty.
		if z.infos[left].container.IsEmpty() {
			z.infos[right].container.Pop()
		}
		if renderedRow != nil {
			// The pair satisfied the onExpr.
			return renderedRow, nil
		}
	}

	// The right container is empty, so there are no more pairs to consider.
	// Empty the containers to reset their contents.
	z.infos[left].container.Reset()
	z.infos[right].container.Reset()

	return nil, nil
}
   728  
   729  // nextRow fetches the nextRow to emit from the join. It iterates through all
   730  // sides until a match is found then emits the results of the match one result
   731  // at a time.
   732  func (z *zigzagJoiner) nextRow(
   733  	ctx context.Context, txn *kv.Txn,
   734  ) (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   735  	for {
   736  		if err := z.cancelChecker.Check(); err != nil {
   737  			return nil, &execinfrapb.ProducerMetadata{Err: err}
   738  		}
   739  
   740  		// Check if there are any rows built up in the containers that need to be
   741  		// emitted.
   742  		if rowToEmit, err := z.emitFromContainers(); err != nil {
   743  			return nil, z.producerMeta(err)
   744  		} else if rowToEmit != nil {
   745  			return rowToEmit, nil
   746  		}
   747  
   748  		// If the baseRow is nil, the last fetched row was nil. That means that
   749  		// that there are no more matches in the join so we break and return nil
   750  		// to indicate that we are done to the caller.
   751  		if len(z.baseRow) == 0 {
   752  			return nil, nil
   753  		}
   754  
   755  		curInfo := z.infos[z.side]
   756  
   757  		// Generate a key from the last row seen from the last side. We're about to
   758  		// use it to jump to the next possible match on the current side.
   759  		span, err := z.produceSpanFromBaseRow()
   760  		if err != nil {
   761  			return nil, z.producerMeta(err)
   762  		}
   763  		curInfo.key = span.Key
   764  
   765  		err = curInfo.fetcher.StartScan(
   766  			ctx,
   767  			txn,
   768  			roachpb.Spans{roachpb.Span{Key: curInfo.key, EndKey: curInfo.endKey}},
   769  			true, /* batch limit */
   770  			zigzagJoinerBatchSize,
   771  			z.FlowCtx.TraceKV,
   772  		)
   773  		if err != nil {
   774  			return nil, z.producerMeta(err)
   775  		}
   776  
   777  		fetchedRow, err := z.fetchRow(ctx)
   778  		if err != nil {
   779  			return nil, z.producerMeta(err)
   780  		}
   781  		// If the next possible match on the current side that matches the previous
   782  		// row is `nil`, that means that there are no more matches in the join so
   783  		// we return nil to indicate that to the caller.
   784  		if fetchedRow == nil {
   785  			return nil, nil
   786  		}
   787  
   788  		matched, err := z.matchBase(fetchedRow, z.side)
   789  		if err != nil {
   790  			return nil, z.producerMeta(err)
   791  		}
   792  		if matched {
   793  			// We've detected a match! Now, we collect all subsequent matches on both
   794  			// sides for the current equality column values and add them to our
   795  			// list of rows to emit.
   796  			prevSide := z.prevSide()
   797  
   798  			// Store the matched rows in the appropriate container to emit.
   799  			prevRow := z.rowAlloc.AllocRow(len(z.baseRow))
   800  			copy(prevRow, z.baseRow)
   801  			z.infos[prevSide].container.Push(prevRow)
   802  			curRow := z.rowAlloc.AllocRow(len(fetchedRow))
   803  			copy(curRow, fetchedRow)
   804  			curInfo.container.Push(curRow)
   805  
   806  			// After collecting all matches from each side, the first unmatched
   807  			// row from each side is returned. We want the new baseRow to be
   808  			// the latest of these rows since no match can occur before the latter
   809  			// of the two rows.
   810  			prevNext, err := z.collectAllMatches(ctx, prevSide)
   811  			if err != nil {
   812  				return nil, z.producerMeta(err)
   813  			}
   814  			curNext, err := z.collectAllMatches(ctx, z.side)
   815  			if err != nil {
   816  				return nil, z.producerMeta(err)
   817  			}
   818  
   819  			// No more matches, so set the baseRow to nil to indicate that we should
   820  			// terminate after emitting all the rows stored in the container.
   821  			if len(prevNext) == 0 || len(curNext) == 0 {
   822  				z.baseRow = nil
   823  				continue
   824  			}
   825  
   826  			prevEqCols := z.extractEqDatums(prevNext, prevSide)
   827  			currentEqCols := z.extractEqDatums(curNext, z.side)
   828  			eqColTypes := curInfo.eqColTypes()
   829  			ordering, err := curInfo.eqOrdering()
   830  			if err != nil {
   831  				return nil, z.producerMeta(err)
   832  			}
   833  			da := &sqlbase.DatumAlloc{}
   834  			cmp, err := prevEqCols.Compare(eqColTypes, da, ordering, z.FlowCtx.EvalCtx, currentEqCols)
   835  			if err != nil {
   836  				return nil, z.producerMeta(err)
   837  			}
   838  			// We want the new current side to be the one that has the latest key
   839  			// since we know that this key will not be able to match any previous
   840  			// key. The current side should be the side after the baseRow's side.
   841  			if cmp < 0 {
   842  				// The current side had the later row, so increment the side.
   843  				z.side = z.nextSide()
   844  				z.baseRow = curNext
   845  			} else {
   846  				// The previous side had the later row so the side doesn't change.
   847  				z.baseRow = prevNext
   848  			}
   849  		} else {
   850  			// The current row doesn't match the base row, so update the base row to
   851  			// the current row and increment the side to repeat the process.
   852  			z.baseRow = fetchedRow
   853  			z.baseRow = z.rowAlloc.AllocRow(len(fetchedRow))
   854  			copy(z.baseRow, fetchedRow)
   855  			z.side = z.nextSide()
   856  		}
   857  	}
   858  }
   859  
   860  // nextSide returns the side after the current side.
   861  func (z *zigzagJoiner) nextSide() int {
   862  	return (z.side + 1) % z.numTables
   863  }
   864  
   865  // prevSide returns the side before the current side.
   866  func (z *zigzagJoiner) prevSide() int {
   867  	return z.sideBefore(z.side)
   868  }
   869  
   870  // sideBefore returns the side before the given side.
   871  func (z *zigzagJoiner) sideBefore(side int) int {
   872  	return (side + z.numTables - 1) % z.numTables
   873  }
   874  
   875  // Adds all rows that match the current base row from the specified side into
   876  // the appropriate container.
   877  // Returns the first row that doesn't match.
   878  func (z *zigzagJoiner) collectAllMatches(
   879  	ctx context.Context, side int,
   880  ) (sqlbase.EncDatumRow, error) {
   881  	matched := true
   882  	var row sqlbase.EncDatumRow
   883  	for matched {
   884  		var err error
   885  		fetchedRow, err := z.fetchRowFromSide(ctx, side)
   886  		row = z.rowAlloc.AllocRow(len(fetchedRow))
   887  		copy(row, fetchedRow)
   888  		if err != nil {
   889  			return nil, err
   890  		}
   891  		matched, err = z.matchBase(row, side)
   892  		if err != nil {
   893  			return nil, err
   894  		}
   895  		if matched {
   896  			z.infos[side].container.Push(row)
   897  		}
   898  	}
   899  	return row, nil
   900  }
   901  
   902  // Next is part of the RowSource interface.
   903  func (z *zigzagJoiner) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) {
   904  	txn := z.FlowCtx.Txn
   905  
   906  	if !z.started {
   907  		z.started = true
   908  
   909  		curInfo := z.infos[z.side]
   910  		// Fetch initial batch.
   911  		err := curInfo.fetcher.StartScan(
   912  			z.Ctx,
   913  			txn,
   914  			roachpb.Spans{roachpb.Span{Key: curInfo.key, EndKey: curInfo.endKey}},
   915  			true, /* batch limit */
   916  			zigzagJoinerBatchSize,
   917  			z.FlowCtx.TraceKV,
   918  		)
   919  		if err != nil {
   920  			log.Errorf(z.Ctx, "scan error: %s", err)
   921  			return nil, z.producerMeta(err)
   922  		}
   923  		fetchedRow, err := z.fetchRow(z.Ctx)
   924  		if err != nil {
   925  			err = scrub.UnwrapScrubError(err)
   926  			return nil, z.producerMeta(err)
   927  		}
   928  		z.baseRow = z.rowAlloc.AllocRow(len(fetchedRow))
   929  		copy(z.baseRow, fetchedRow)
   930  		z.side = z.nextSide()
   931  	}
   932  
   933  	if z.Closed {
   934  		return nil, z.producerMeta(nil /* err */)
   935  	}
   936  
   937  	for {
   938  		row, meta := z.nextRow(z.Ctx, txn)
   939  		if z.Closed || meta != nil {
   940  			if meta != nil {
   941  				z.returnedMeta = append(z.returnedMeta, *meta)
   942  			}
   943  			return nil, meta
   944  		}
   945  		if row == nil {
   946  			z.MoveToDraining(nil /* err */)
   947  			break
   948  		}
   949  
   950  		outRow := z.ProcessRowHelper(row)
   951  		if outRow == nil {
   952  			continue
   953  		}
   954  		return outRow, nil
   955  	}
   956  	meta := z.DrainHelper()
   957  	if meta != nil {
   958  		z.returnedMeta = append(z.returnedMeta, *meta)
   959  	}
   960  	return nil, meta
   961  }
   962  
// ConsumerClosed is part of the RowSource interface. It delegates to
// z.close(), which is defined outside this block.
func (z *zigzagJoiner) ConsumerClosed() {
	// The consumer is done, Next() will not be called again.
	z.close()
}
   968  
// DrainMeta is part of the MetadataSource interface. It returns the metadata
// accumulated in z.returnedMeta; the context argument is unused.
func (z *zigzagJoiner) DrainMeta(_ context.Context) []execinfrapb.ProducerMetadata {
	return z.returnedMeta
}
   973  
// ChildCount is part of the execinfra.OpNode interface. It always reports
// zero children, regardless of the verbose argument.
func (z *zigzagJoiner) ChildCount(verbose bool) int {
	return 0
}
   978  
   979  // Child is part of the execinfra.OpNode interface.
   980  func (z *zigzagJoiner) Child(nth int, verbose bool) execinfra.OpNode {
   981  	panic(fmt.Sprintf("invalid index %d", nth))
   982  }