github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/distsql_plan_join.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/distsql_plan_join.go (about)

     1  // Copyright 2017 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package sql
    12  
    13  import (
    14  	"bytes"
    15  	"fmt"
    16  	"math"
    17  	"sort"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/settings"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/physicalplan"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    24  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    25  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    26  	"github.com/cockroachdb/errors"
    27  )
    28  
// planInterleavedJoins is the boolean setting registered under the key
// "sql.distsql.interleaved_joins.enabled". Per its description, when set,
// interleaved table joins are planned instead of merge joins when possible.
// The registered default is true.
var planInterleavedJoins = settings.RegisterBoolSetting(
	"sql.distsql.interleaved_joins.enabled",
	"if set we plan interleaved table joins instead of merge joins when possible",
	true,
)
    34  
    35  func (dsp *DistSQLPlanner) tryCreatePlanForInterleavedJoin(
    36  	planCtx *PlanningCtx, n *joinNode,
    37  ) (plan *PhysicalPlan, ok bool, err error) {
    38  	plan = &PhysicalPlan{}
    39  	if !useInterleavedJoin(n) {
    40  		return nil, false, nil
    41  	}
    42  
    43  	leftScan, leftOk := n.left.plan.(*scanNode)
    44  	rightScan, rightOk := n.right.plan.(*scanNode)
    45  
    46  	// We know they are scan nodes from useInterleaveJoin, but we add
    47  	// this check to prevent future panics.
    48  	if !leftOk || !rightOk {
    49  		return nil, false, errors.AssertionFailedf("left and right children of join node must be scan nodes to execute an interleaved join")
    50  	}
    51  
    52  	// We iterate through each table and collate their metadata for
    53  	// the InterleavedReaderJoinerSpec.
    54  	tables := make([]execinfrapb.InterleavedReaderJoinerSpec_Table, 2)
    55  	plans := make([]*PhysicalPlan, 2)
    56  	var totalLimitHint int64
    57  	for i, t := range []struct {
    58  		scan      *scanNode
    59  		eqIndices []int
    60  	}{
    61  		{
    62  			scan:      leftScan,
    63  			eqIndices: n.pred.leftEqualityIndices,
    64  		},
    65  		{
    66  			scan:      rightScan,
    67  			eqIndices: n.pred.rightEqualityIndices,
    68  		},
    69  	} {
    70  		// We don't really need to initialize a full-on plan to
    71  		// retrieve the metadata for each table reader, but this turns
    72  		// out to be very useful for computing ordering and remapping the
    73  		// onCond and columns.
    74  		var err error
    75  		if plans[i], err = dsp.createTableReaders(planCtx, t.scan); err != nil {
    76  			return nil, false, err
    77  		}
    78  
    79  		eqCols := eqCols(t.eqIndices, plans[i].PlanToStreamColMap)
    80  		ordering := distsqlOrdering(n.mergeJoinOrdering, eqCols)
    81  
    82  		// Doesn't matter which processor we choose since the metadata
    83  		// for TableReader is independent of node/processor instance.
    84  		tr := plans[i].Processors[0].Spec.Core.TableReader
    85  
    86  		tables[i] = execinfrapb.InterleavedReaderJoinerSpec_Table{
    87  			Desc:     tr.Table,
    88  			IndexIdx: tr.IndexIdx,
    89  			Post:     plans[i].GetLastStagePost(),
    90  			Ordering: ordering,
    91  		}
    92  
    93  		// We will set the limit hint of the final
    94  		// InterleavedReaderJoiner as the sum of the individual tables'
    95  		// limit hints.
    96  		// This is because the InterleavedReaderJoiner reads rows from
    97  		// all tables at the same time and so the hint applies to the
    98  		// total number of rows read from all tables.
    99  		if totalLimitHint >= math.MaxInt64-tr.LimitHint {
   100  			totalLimitHint = math.MaxInt64
   101  		} else {
   102  			totalLimitHint += tr.LimitHint
   103  		}
   104  	}
   105  
   106  	joinType := n.joinType
   107  
   108  	post, joinToStreamColMap := joinOutColumns(n, plans[0].PlanToStreamColMap, plans[1].PlanToStreamColMap)
   109  	onExpr, err := remapOnExpr(planCtx, n, plans[0].PlanToStreamColMap, plans[1].PlanToStreamColMap)
   110  	if err != nil {
   111  		return nil, false, err
   112  	}
   113  
   114  	ancestor, descendant := n.interleavedNodes()
   115  
   116  	// We partition each set of spans to their respective nodes.
   117  	ancsPartitions, err := dsp.PartitionSpans(planCtx, ancestor.spans)
   118  	if err != nil {
   119  		return nil, false, err
   120  	}
   121  	descPartitions, err := dsp.PartitionSpans(planCtx, descendant.spans)
   122  	if err != nil {
   123  		return nil, false, err
   124  	}
   125  
   126  	// We want to ensure that all child spans with a given interleave
   127  	// prefix value (which also happens to be our equality join columns)
   128  	// are read on the same node as the corresponding ancestor rows.
   129  	// We map all descendant spans in their partitions to the corresponding
   130  	// nodes of the ascendant spans.
   131  	//
   132  	// Example:
   133  	// Let PK1 and (PK1, PK2) be the primary keys of parent and child,
   134  	// respectively. PK1 is the interleave prefix.
   135  	// The filter WHERE PK1 = 1 AND PK2 IN (5, 7) will produce the
   136  	// parent and child spans
   137  	//   parent:  /1 - /2    (technically /1 - /1/#/8)
   138  	//   child:   /1/#/5 - /1/#/6, /1/#/7 - /1/#/8
   139  	// If the parent span is partitioned to node 1 and the child spans are
   140  	// partitioned to node 2 and 3, then we need to move the child spans
   141  	// to node 1 where the PK1 = 1 parent row is read.
   142  	if descPartitions, err = alignInterleavedSpans(n, ancsPartitions, descPartitions); err != nil {
   143  		return nil, false, err
   144  	}
   145  
   146  	// Figure out which nodes we need to schedule a processor on.
   147  	seen := make(map[roachpb.NodeID]struct{})
   148  	var nodes []roachpb.NodeID
   149  	for _, partitions := range [][]SpanPartition{ancsPartitions, descPartitions} {
   150  		for _, part := range partitions {
   151  			if _, ok := seen[part.Node]; !ok {
   152  				seen[part.Node] = struct{}{}
   153  				nodes = append(nodes, part.Node)
   154  			}
   155  		}
   156  	}
   157  
   158  	var ancsIdx, descIdx int
   159  	// The left table is in the 0th index, right table in the 1st index.
   160  	if leftScan == ancestor {
   161  		ancsIdx, descIdx = 0, 1
   162  	} else {
   163  		ancsIdx, descIdx = 1, 0
   164  	}
   165  
   166  	stageID := plan.NewStageID()
   167  
   168  	// We provision a separate InterleavedReaderJoiner per node that has
   169  	// rows from either table.
   170  	for _, nodeID := range nodes {
   171  		// Find the relevant span from each table for this node.
   172  		// Note it is possible that either set of spans can be empty
   173  		// (but not both).
   174  		var ancsSpans, descSpans roachpb.Spans
   175  		for _, part := range ancsPartitions {
   176  			if part.Node == nodeID {
   177  				ancsSpans = part.Spans
   178  				break
   179  			}
   180  		}
   181  		for _, part := range descPartitions {
   182  			if part.Node == nodeID {
   183  				descSpans = part.Spans
   184  				break
   185  			}
   186  		}
   187  		if len(ancsSpans) == 0 && len(descSpans) == 0 {
   188  			panic("cannot have empty set of spans for both tables for a given node")
   189  		}
   190  
   191  		// Make a copy of our spec for each table.
   192  		processorTables := make([]execinfrapb.InterleavedReaderJoinerSpec_Table, len(tables))
   193  		copy(processorTables, tables)
   194  		// We set the set of spans for each table to be read by the
   195  		// processor.
   196  		processorTables[ancsIdx].Spans = makeTableReaderSpans(ancsSpans)
   197  		processorTables[descIdx].Spans = makeTableReaderSpans(descSpans)
   198  
   199  		irj := &execinfrapb.InterleavedReaderJoinerSpec{
   200  			Tables: processorTables,
   201  			// We previously checked that both scans are in the
   202  			// same direction (useInterleavedJoin).
   203  			Reverse:           ancestor.reverse,
   204  			LimitHint:         totalLimitHint,
   205  			LockingStrength:   ancestor.lockingStrength,
   206  			LockingWaitPolicy: ancestor.lockingWaitPolicy,
   207  			OnExpr:            onExpr,
   208  			Type:              joinType,
   209  		}
   210  
   211  		proc := physicalplan.Processor{
   212  			Node: nodeID,
   213  			Spec: execinfrapb.ProcessorSpec{
   214  				Core:    execinfrapb.ProcessorCoreUnion{InterleavedReaderJoiner: irj},
   215  				Post:    post,
   216  				Output:  []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
   217  				StageID: stageID,
   218  			},
   219  		}
   220  
   221  		plan.Processors = append(plan.Processors, proc)
   222  	}
   223  
   224  	// Each result router correspond to each of the processors we appended.
   225  	plan.ResultRouters = make([]physicalplan.ProcessorIdx, len(nodes))
   226  	for i := 0; i < len(nodes); i++ {
   227  		plan.ResultRouters[i] = physicalplan.ProcessorIdx(i)
   228  	}
   229  
   230  	plan.PlanToStreamColMap = joinToStreamColMap
   231  	plan.ResultTypes, err = getTypesForPlanResult(n, joinToStreamColMap)
   232  	if err != nil {
   233  		return nil, false, err
   234  	}
   235  
   236  	plan.SetMergeOrdering(dsp.convertOrdering(n.reqOrdering, plan.PlanToStreamColMap))
   237  	return plan, true, nil
   238  }
   239  
   240  func joinOutColumns(
   241  	n *joinNode, leftPlanToStreamColMap, rightPlanToStreamColMap []int,
   242  ) (post execinfrapb.PostProcessSpec, joinToStreamColMap []int) {
   243  	joinToStreamColMap = makePlanToStreamColMap(len(n.columns))
   244  	post.Projection = true
   245  
   246  	// addOutCol appends to post.OutputColumns and returns the index
   247  	// in the slice of the added column.
   248  	addOutCol := func(col uint32) int {
   249  		idx := len(post.OutputColumns)
   250  		post.OutputColumns = append(post.OutputColumns, col)
   251  		return idx
   252  	}
   253  
   254  	// The join columns are in two groups:
   255  	//  - the columns on the left side (numLeftCols)
   256  	//  - the columns on the right side (numRightCols)
   257  	for i := 0; i < n.pred.numLeftCols; i++ {
   258  		joinToStreamColMap[i] = addOutCol(uint32(leftPlanToStreamColMap[i]))
   259  	}
   260  
   261  	if n.pred.joinType != sqlbase.LeftSemiJoin && n.pred.joinType != sqlbase.LeftAntiJoin {
   262  		for i := 0; i < n.pred.numRightCols; i++ {
   263  			joinToStreamColMap[n.pred.numLeftCols+i] = addOutCol(
   264  				uint32(n.pred.numLeftCols + rightPlanToStreamColMap[i]),
   265  			)
   266  		}
   267  	}
   268  
   269  	return post, joinToStreamColMap
   270  }
   271  
   272  // remapOnExpr remaps ordinal references in the on condition (which refer to the
   273  // join columns as described above) to values that make sense in the joiner (0
   274  // to N-1 for the left input columns, N to N+M-1 for the right input columns).
   275  func remapOnExpr(
   276  	planCtx *PlanningCtx, n *joinNode, leftPlanToStreamColMap, rightPlanToStreamColMap []int,
   277  ) (execinfrapb.Expression, error) {
   278  	if n.pred.onCond == nil {
   279  		return execinfrapb.Expression{}, nil
   280  	}
   281  
   282  	joinColMap := make([]int, n.pred.numLeftCols+n.pred.numRightCols)
   283  	idx := 0
   284  	leftCols := 0
   285  	for i := 0; i < n.pred.numLeftCols; i++ {
   286  		joinColMap[idx] = leftPlanToStreamColMap[i]
   287  		if leftPlanToStreamColMap[i] != -1 {
   288  			leftCols++
   289  		}
   290  		idx++
   291  	}
   292  	for i := 0; i < n.pred.numRightCols; i++ {
   293  		joinColMap[idx] = leftCols + rightPlanToStreamColMap[i]
   294  		idx++
   295  	}
   296  
   297  	return physicalplan.MakeExpression(n.pred.onCond, planCtx, joinColMap)
   298  }
   299  
   300  // eqCols produces a slice of ordinal references for the plan columns specified
   301  // in eqIndices using planToColMap.
   302  // That is: eqIndices contains a slice of plan column indexes and planToColMap
   303  // maps the plan column indexes to the ordinal references (index of the
   304  // intermediate row produced).
   305  func eqCols(eqIndices, planToColMap []int) []uint32 {
   306  	eqCols := make([]uint32, len(eqIndices))
   307  	for i, planCol := range eqIndices {
   308  		eqCols[i] = uint32(planToColMap[planCol])
   309  	}
   310  
   311  	return eqCols
   312  }
   313  
   314  // distsqlOrdering converts the ordering specified by mergeJoinOrdering in
   315  // terms of the index of eqCols to the ordinal references provided by eqCols.
   316  func distsqlOrdering(
   317  	mergeJoinOrdering sqlbase.ColumnOrdering, eqCols []uint32,
   318  ) execinfrapb.Ordering {
   319  	var ord execinfrapb.Ordering
   320  	ord.Columns = make([]execinfrapb.Ordering_Column, len(mergeJoinOrdering))
   321  	for i, c := range mergeJoinOrdering {
   322  		ord.Columns[i].ColIdx = eqCols[c.ColIdx]
   323  		dir := execinfrapb.Ordering_Column_ASC
   324  		if c.Direction == encoding.Descending {
   325  			dir = execinfrapb.Ordering_Column_DESC
   326  		}
   327  		ord.Columns[i].Direction = dir
   328  	}
   329  
   330  	return ord
   331  }
   332  
   333  func useInterleavedJoin(n *joinNode) bool {
   334  	// TODO(richardwu): We currently only do an interleave join on
   335  	// all equality columns. This can be relaxed once a hybrid
   336  	// hash-merge join is implemented in streamMerger.
   337  	if len(n.mergeJoinOrdering) != len(n.pred.leftEqualityIndices) {
   338  		return false
   339  	}
   340  
   341  	ancestor, descendant := n.interleavedNodes()
   342  
   343  	// There is no interleaved ancestor/descendant scan node and thus no
   344  	// interleaved relation.
   345  	if ancestor == nil || descendant == nil {
   346  		return false
   347  	}
   348  
   349  	// We cannot do an interleaved join if the tables require scanning in
   350  	// opposite directions.
   351  	if ancestor.reverse != descendant.reverse {
   352  		return false
   353  	}
   354  
   355  	var ancestorEqIndices []int
   356  	var descendantEqIndices []int
   357  	// We are guaranteed that both of the sources are scan nodes from
   358  	// n.interleavedNodes().
   359  	if ancestor == n.left.plan.(*scanNode) {
   360  		ancestorEqIndices = n.pred.leftEqualityIndices
   361  		descendantEqIndices = n.pred.rightEqualityIndices
   362  	} else {
   363  		ancestorEqIndices = n.pred.rightEqualityIndices
   364  		descendantEqIndices = n.pred.leftEqualityIndices
   365  	}
   366  
   367  	// We want full 1-1 correspondence between our join columns and the
   368  	// primary index of the ancestor.
   369  	//  TODO(richardwu): We can relax this once we implement a hybrid
   370  	//  hash/merge for interleaved joins after forming merge groups with the
   371  	//  interleave prefix (or when the merge join logic is combined with
   372  	//  the interleaved join logic).
   373  	if len(n.mergeJoinOrdering) != len(ancestor.index.ColumnIDs) {
   374  		return false
   375  	}
   376  
   377  	// We iterate through the ordering given by n.mergeJoinOrdering and check
   378  	// if the columns have a 1-1 correspondence to the interleaved
   379  	// ancestor's primary index columns (i.e. interleave prefix) as well as the
   380  	// descendant's primary index columns. We naively return false if any part
   381  	// of the ordering does not correspond.
   382  	for i, info := range n.mergeJoinOrdering {
   383  		colID := ancestor.index.ColumnIDs[i]
   384  		// info.ColIdx refers to i in ancestorEqIndices[i], which refers
   385  		// to the index of the source row. This corresponds to
   386  		// the index in scanNode.resultColumns. To convert the colID
   387  		// from the index descriptor, we can use the map provided by
   388  		// colIdxMap.
   389  		if ancestorEqIndices[info.ColIdx] != ancestor.colIdxMap[colID] ||
   390  			descendantEqIndices[info.ColIdx] != descendant.colIdxMap[colID] {
   391  			// The column in the ordering does not correspond to
   392  			// the column in the interleave prefix.
   393  			// We should not try to do an interleaved join.
   394  			return false
   395  		}
   396  	}
   397  
   398  	// The columns in n.mergeJoinOrdering has a 1-1 correspondence with the
   399  	// columns in the interleaved ancestor's primary index. We can indeed
   400  	// hint at the possibility of an interleaved join.
   401  	return true
   402  }
   403  
   404  // maximalJoinPrefix takes the common ancestor scanNode that the join is
   405  // defined on, the target scanNode that the index key belongs to and the index
   406  // key itself, and returns the maximal prefix of the key which is also a prefix
   407  // of all keys that need to be joined together.
   408  //
   409  // Let's denote a child key interleaved into a parent key in the following.
   410  // format:
   411  //   /table/index/<parent-pk1>/.../<parent-pkN>/#/<child-pk1>/.../<child-pkN>
   412  //
   413  // In the following examples, the ancestor is parent and the target is child.
   414  //
   415  // Let M be the longest prefix of the parent PK which is (equality) constrained
   416  // by the join. The maximal join prefix is:
   417  //   /table/index/<parent-pk1>/.../<parent-pkM>
   418  //
   419  // Examples (/table/index suppressed from keys):
   420  //  1. Full interleave (prefix) join:
   421  //
   422  //    1a. Parent table PK1
   423  //        Child table (PK1, PK2)
   424  //        Join on PK1
   425  //        For child key /5/#/42, the maximal join prefix is /5
   426  //
   427  //    1b. Parent table (PK1, PK2)
   428  //        Child table (PK1, PK2, PK3)
   429  //        Join on PK1, PK2
   430  //        for child key /5/6/#/42, the maximal join prefix is /5/6
   431  //
   432  //  2. Prefix joins:
   433  //        Parent table (PK1, PK2)
   434  //        Child table (PK1, PK2, PK3)
   435  //        Join on PK1 (this is a prefix of the parent PKs).
   436  //        For child key /5/6/#/42, the maximal join prefix is /5
   437  //
   438  //  3. Subset joins:
   439  //        Parent table (PK1, PK2, PK3)
   440  //        Child table (PK1, PK2, PK3, PK4)
   441  //        Join on PK1, PK3
   442  //        For child key /5/6/7/#/32, the maximal join prefix is /5
   443  //
   444  // This logic can also be extended in the general case to joins between sibling
   445  // joins with a common ancestor: the maximal join prefix will be applied to
   446  // both tables where each sibling scan is passed as the target scanNode.
   447  func maximalJoinPrefix(
   448  	ancestor *scanNode, target *scanNode, key roachpb.Key,
   449  ) (roachpb.Key, bool, error) {
   450  	// To calculate how long this prefix is, we take a look at the actual
   451  	// encoding of an interleaved table's key
   452  	//   /table/index/<parent-pk1>/.../<parent-pkN>/#/.../table/index/<child-pk1>/.../<child-pkN>
   453  	// For each ancestor (including parent), we have
   454  	//   table, index, '#' (interleaved sentinel)
   455  	// or 3 values to peek at.
   456  	// We truncate up to the key M which is the last column in our join.
   457  	//   /table/index/<parent-pk1>/.../<parent-pkM>
   458  	// For the full interleaved join case, we need to count the number of
   459  	// columns in the shared interleave prefix (pk1 to pkM). We traverse the
   460  	// InterleaveDescriptor and add up SharedPrefixLen.
   461  	// We finally subtract 1 since we do not want to include the last
   462  	// interleaved sentinel '#'.
   463  	// Thus we need to peek (encoding.PeekLength())
   464  	//    3 * count(interleaved ancestors) + sum(SharedPrefixLen) - 1
   465  	// times to get the actual byte length of the prefix.
   466  	//
   467  	// Example:
   468  	//
   469  	// Given the following interleaved hierarchy (where their primary keys
   470  	// are in parentheses)
   471  	//   parent	      (pid1)
   472  	//     child	      (pid1, cid1, cid2)
   473  	//        grandchild  (pid1, cid1, cid2, gcid1)
   474  	//
   475  	// Let our join be defined on (pid1, cid1, cid2) and we want to join
   476  	// the child and grandchild tables.
   477  	//
   478  	// A grandchild key could be (pid1=5, cid1=6, cid2=7, gcid1=8)
   479  	//    /<parent-id>/1/5/#/<child-id>/1/6/7/#/<gchild-id>/1/8
   480  	//
   481  	// We'd like to take the prefix up to and including <cid2> or
   482  	//    /<parent-id>/1/5/#/<child-id>/1/6/7
   483  	//
   484  	// We must call encoding.PeekLength() 8 times or
   485  	//   3 * nAncestors + sum(SharedPrefixLen) - 1 = 3 * 2 + (1 + 2) - 1 = 8
   486  	// where the ancestor is child.
   487  	//
   488  	// TODO(richardwu): this formula works only for full interleaved joins.
   489  	// For prefix/subset joins, instead of adding the SharedPrefixLen of
   490  	// the ancestor the join is defined on, we would add the number of
   491  	// prefix columns in our interleave prefix that we are joining on.
   492  	nAncestors := 0
   493  	sharedPrefixLen := 0
   494  	for _, targetAncs := range target.index.Interleave.Ancestors {
   495  		nAncestors++
   496  		sharedPrefixLen += int(targetAncs.SharedPrefixLen)
   497  		if targetAncs.TableID == ancestor.desc.ID && targetAncs.IndexID == ancestor.index.ID {
   498  			break
   499  		}
   500  	}
   501  
   502  	initialKey := key
   503  	prefixLen := 0
   504  	for i := 0; i < 3*nAncestors+sharedPrefixLen-1; i++ {
   505  		// It's possible for the span key to not contain the full join
   506  		// prefix (a key might refer to an ancestor further up the
   507  		// interleaved hierarchy).
   508  		if len(key) == 0 {
   509  			break
   510  		}
   511  		// Note: this key might have been edited with PrefixEnd. This can cause
   512  		// problems for certain datatypes, like strings, which have a sentinel byte
   513  		// sequence indicating the end of the type. In that case, PeekLength will
   514  		// fail. If that happens, we try to UndoPrefixEnd the key and check the
   515  		// length again.
   516  		// TODO(jordan): this function should be aware of whether a key has been
   517  		// PrefixEnd'd or not, and act accordingly.
   518  		valLen, err := encoding.PeekLength(key)
   519  		if err != nil {
   520  			key, ok := encoding.UndoPrefixEnd(key)
   521  			if !ok {
   522  				return nil, false, err
   523  			}
   524  			valLen, err = encoding.PeekLength(key)
   525  			if err != nil {
   526  				return nil, false, err
   527  			}
   528  		}
   529  		prefixLen += valLen
   530  		key = key[valLen:]
   531  	}
   532  
   533  	if len(key) > 0 {
   534  		// There are remaining bytes in the key: we truncate it and
   535  		// return true.
   536  		return initialKey[:prefixLen], true, nil
   537  	}
   538  
   539  	// The loop terminated early because the key is shorter than the
   540  	// full join prefix.
   541  	// We return false to denote that this key was not truncated to
   542  	// form the join prefix.
   543  	return initialKey, false, nil
   544  }
   545  
   546  // sortedSpanPartitions implements sort.Interface. Sorting is defined on the
   547  // node ID of each partition.
   548  type sortedSpanPartitions []SpanPartition
   549  
   550  func (s sortedSpanPartitions) Len() int           { return len(s) }
   551  func (s sortedSpanPartitions) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
   552  func (s sortedSpanPartitions) Less(i, j int) bool { return s[i].Node < s[j].Node }
   553  
   554  // alignInterleavedSpans takes the partitioned spans from both the parent
   555  // (parentSpans) and (not necessarily direct) child (childSpans), "aligns" them
   556  // and returns childSpans such that all child keys that need to be joined with
   557  // their corresponding parent keys are mapped to the parent keys' partition.
   558  // This ensures that we correctly join all parent-child rows within the
   559  // node-contained InterleavedReaderJoiner.
   560  //
   561  // For each parentSpan, a "join span" is computed.
   562  // The "join span" is a span that includes all child rows that need to be
   563  // joined with parent rows in the span.
   564  //
   565  // With the "join span" of each parent span, we can find any child spans that
   566  // need to be remapped to the same node as the parent span.
   567  //
   568  // We iterate through each child span and see which parent join span overlaps.
   569  //
   570  // If there is no overlap with any join span, there can't possibly be any join
   571  // results from this child span. We still need to keep it for outer joins, but
   572  // it doesn't need to be remapped.
   573  //
   574  // If there is overlap with some parent join span, there exist "some" child
   575  // keys in the span that need to be mapped to the parent span. The sections of
   576  // the child span that do not overlap need to be split off and potentially
   577  // remapped to other parent join spans.
   578  //
   579  // The child span gets split as necessary on the join span's boundaries. The
   580  // split that overlaps the join span is (re-)mapped to the parent span. Any
   581  // remaining splits are considered separately with the same logic.
   582  func alignInterleavedSpans(
   583  	n *joinNode, parentSpans []SpanPartition, childSpans []SpanPartition,
   584  ) ([]SpanPartition, error) {
   585  	mappedSpans := make(map[roachpb.NodeID]roachpb.Spans)
   586  
   587  	// Map parent spans to their join span.
   588  	joinSpans, err := joinSpans(n, parentSpans)
   589  	if err != nil {
   590  		return nil, err
   591  	}
   592  
   593  	// mapAndSplit takes a childSpan and finds the parentJoinSpan that has
   594  	// the parent row(s) with which the child row(s) are suppose to join.
   595  	// It does this by finding overlaps between childSpan and
   596  	// parentJoinSpan.
   597  	// It splits off the non-overlapping parts and appends them to
   598  	// the passed in nonOverlaps slice for repeated application.
   599  	mapAndSplit := func(curNodeID roachpb.NodeID, childSpan roachpb.Span, nonOverlaps roachpb.Spans) roachpb.Spans {
   600  		// TODO(richardwu): Instead of doing a linear search for each
   601  		// child span, we can make this O(logn) with binary search
   602  		// after pre-sorting the parent join spans.
   603  		for _, parentPart := range joinSpans {
   604  			for _, parentJoinSpan := range parentPart.Spans {
   605  				if parentJoinSpan.Overlaps(childSpan) {
   606  					// Initialize the overlap region
   607  					// as the entire childSpan.
   608  					overlap := childSpan
   609  					var nonOverlap roachpb.Span
   610  
   611  					// Check non-overlapping region
   612  					// before start key.
   613  					//	    |----parentJoinSpan----...
   614  					//  |----childSpan----...
   615  					if bytes.Compare(parentJoinSpan.Key, childSpan.Key) > 0 {
   616  						nonOverlap, overlap = overlap.SplitOnKey(parentJoinSpan.Key)
   617  						nonOverlaps = append(nonOverlaps, nonOverlap)
   618  					}
   619  
   620  					// Check non-overlapping region
   621  					// before end key.
   622  					//  ...----parentJoinSpan----|
   623  					//		  ...----childSpan----|
   624  					if bytes.Compare(parentJoinSpan.EndKey, childSpan.EndKey) < 0 {
   625  						overlap, nonOverlap = overlap.SplitOnKey(parentJoinSpan.EndKey)
   626  						nonOverlaps = append(nonOverlaps, nonOverlap)
   627  					}
   628  
   629  					// Map the overlap region to the
   630  					// partition/node of the
   631  					// parentJoinSpan.
   632  					mappedSpans[parentPart.Node] = append(mappedSpans[parentPart.Node], overlap)
   633  
   634  					return nonOverlaps
   635  				}
   636  			}
   637  		}
   638  
   639  		// There was no corresponding parentJoinSpan for this
   640  		// childSpan.  We simply map childSpan back to its current
   641  		// partition/node.
   642  		mappedSpans[curNodeID] = append(mappedSpans[curNodeID], childSpan)
   643  
   644  		return nonOverlaps
   645  	}
   646  
   647  	// Buffer to store spans that still need to be mapped.
   648  	// It is initialized with the initial childSpan and may be populated
   649  	// with non-overlapping sub-spans as mapAndSplit is invoked.
   650  	// Note this is unbounded since a mapAndSplit of one childSpan can
   651  	// cause two non-overlapping spans to be generated.
   652  	// We recurse on the non-overlapping spans until none are left before
   653  	// moving on to the next childSpan.
   654  	spansLeft := make(roachpb.Spans, 0, 2)
   655  	for _, childPart := range childSpans {
   656  		for _, childSpan := range childPart.Spans {
   657  			spansLeft = append(spansLeft, childSpan)
   658  			for len(spansLeft) > 0 {
   659  				// Copy out the last span in spansLeft to
   660  				// mapAndSplit.
   661  				spanToMap := spansLeft[len(spansLeft)-1]
   662  				// Discard the element from spansLeft and
   663  				// reclaim one buffer space.
   664  				spansLeft = spansLeft[:len(spansLeft)-1]
   665  				// We map every child span to its
   666  				// corresponding parent span.
   667  				// Splitting the child span may be
   668  				// necessary which may produce up to two
   669  				// non-overlapping sub-spans that are
   670  				// appended to spansLeft.
   671  				spansLeft = mapAndSplit(childPart.Node, spanToMap, spansLeft)
   672  			}
   673  		}
   674  	}
   675  
   676  	// It's possible from the mapAndSplit logic that we end up with
   677  	// adjacent spans on the same node. We want to clean this up by
   678  	// merging them.
   679  	alignedDescSpans := make(sortedSpanPartitions, 0, len(mappedSpans))
   680  	for nodeID, spans := range mappedSpans {
   681  		spans, _ = roachpb.MergeSpans(spans)
   682  		alignedDescSpans = append(
   683  			alignedDescSpans,
   684  			SpanPartition{
   685  				Node:  nodeID,
   686  				Spans: spans,
   687  			},
   688  		)
   689  	}
   690  
   691  	sort.Sort(alignedDescSpans)
   692  
   693  	return alignedDescSpans, nil
   694  }
   695  
   696  // The derivation of the "join span" for a parent span is as follows (see
   697  // comment above alignInterleaveSpans for why this is needed):
   698  //
   699  //   1. Start key of join span (the first parent key in parentSpan)
   700  //
   701  //      Take the maximalJoinPrefix (MJP) of parentSpan.Key. If the MJP Is
   702  //      the same with parentSpan.Key (no truncation occurred), then it is also
   703  //      the join span start key (examples A, B above).
   704  //      Otherwise, the parentSpan.Key contains more than parent keys, and
   705  //      because child rows come after parent rows, the join span start key is
   706  //      the PrefixEnd() of the MJP (examples C, D).
   707  //
   708  //   2. End key of the join span: the next parent key after the last parent key
   709  //      in parentSpan (it needs to be the next key because child rows come after
   710  //      the parent rows).
   711  //
   712  //      Take the maximalJoinPrefix (MJP) of parentSpan.EndKey. If the MJP
   713  //      is the same with parentSpan.EndKey (no truncation occurred), then it is
   714  //      also the join span end key (examples A, C).
   715  //      Otherwise, parentSpan.EndKey contains more than parent keys and needs to
   716  //      be extended to include all child rows for the last parent row; the join
   717  //      span end key is the PrefixEnd() of the MJP (examples B, D).
   718  //
   719  // To illustrate, we'll use some examples of parent spans (/table/index omitted
   720  // from keys):
   721  //   A. /1 - /3
   722  //      This span contains parent rows with primary keys 1, 2, and all
   723  //      corresponding child rows. The join span is the same: /1 - /3.
   724  //
   725  //   B. /1 - /3/#/1
   726  //      This span contains parent rows with primary key 1, 2, 3 and all child
   727  //      rows corresponding to 1, 2 (note that /3/#/1 comes after all the parent
   728  //      rows with 3 but before all corresponding child rows). The join span is:
   729  //      /1 - /4.
   730  //
   731  //   C. /1/#/1 - /4
   732  //      This span contains parent rows with primary key 2, 3 and all child rows
   733  //      corresponding to 1, 2, 3. The join span is: /2 - /4.
   734  //
   735  //   D. /1/#/1 - /2/#/1
   736  //      This span contains the parent row with primary key 2 and all child rows
   737  //      corresponding to 1, 2. The join span is: /2 - /3.
   738  //
   739  // The corresponding joinSpans for a set of parentSpans is disjoint if and only
   740  // if the parentSpans are disjoint in terms of the parent rows.
   741  // That is, as long as only 1 node reads a given parent row for all parent
   742  // rows, the joinSpans are guaranteed to be non-overlapping.
   743  // End keys are only pushed forward to the next parent row if the span contains
   744  // the previous parent row.
   745  // Since the previous row is read on that one node, it is not possible for the
   746  // subsequent span on a different node to contain the previous row.
   747  // The start key will be pushed forward to at least the next row, which
   748  // maintains the disjoint property.
   749  func joinSpans(n *joinNode, parentSpans []SpanPartition) ([]SpanPartition, error) {
   750  	joinSpans := make([]SpanPartition, len(parentSpans))
   751  
   752  	parent, child := n.interleavedNodes()
   753  
   754  	// Compute the join span for every parent span.
   755  	for i, parentPart := range parentSpans {
   756  		joinSpans[i].Node = parentPart.Node
   757  		joinSpans[i].Spans = make(roachpb.Spans, len(parentPart.Spans))
   758  
   759  		for j, parentSpan := range parentPart.Spans {
   760  			// Step 1: start key.
   761  			joinSpanStartKey, startTruncated, err := maximalJoinPrefix(parent, child, parentSpan.Key)
   762  			if err != nil {
   763  				return nil, err
   764  			}
   765  			if startTruncated {
   766  				// parentSpan.Key is a child key.
   767  				// Example C and D.
   768  				joinSpanStartKey = joinSpanStartKey.PrefixEnd()
   769  			}
   770  
   771  			// Step 2: end key.
   772  			joinSpanEndKey, endTruncated, err := maximalJoinPrefix(parent, child, parentSpan.EndKey)
   773  			if err != nil {
   774  				return nil, err
   775  			}
   776  
   777  			if endTruncated {
   778  				// parentSpan.EndKey is a child key.
   779  				// Example B and D.
   780  				joinSpanEndKey = joinSpanEndKey.PrefixEnd()
   781  			}
   782  
   783  			// We don't need to check if joinSpanStartKey <
   784  			// joinSpanEndKey since the invalid spans will be
   785  			// ignored during Span.Overlaps.
   786  			joinSpans[i].Spans[j] = roachpb.Span{
   787  				Key:    joinSpanStartKey,
   788  				EndKey: joinSpanEndKey,
   789  			}
   790  		}
   791  	}
   792  
   793  	return joinSpans, nil
   794  }
   795  
   796  func distsqlSetOpJoinType(setOpType tree.UnionType) sqlbase.JoinType {
   797  	switch setOpType {
   798  	case tree.ExceptOp:
   799  		return sqlbase.ExceptAllJoin
   800  	case tree.IntersectOp:
   801  		return sqlbase.IntersectAllJoin
   802  	default:
   803  		panic(fmt.Sprintf("set op type %v unsupported by joins", setOpType))
   804  	}
   805  }
   806  
   807  // getNodesOfRouters returns all nodes that routers are put on.
   808  func getNodesOfRouters(
   809  	routers []physicalplan.ProcessorIdx, processors []physicalplan.Processor,
   810  ) (nodes []roachpb.NodeID) {
   811  	seen := make(map[roachpb.NodeID]struct{})
   812  	for _, pIdx := range routers {
   813  		n := processors[pIdx].Node
   814  		if _, ok := seen[n]; !ok {
   815  			seen[n] = struct{}{}
   816  			nodes = append(nodes, n)
   817  		}
   818  	}
   819  	return nodes
   820  }
   821  
   822  func findJoinProcessorNodes(
   823  	leftRouters, rightRouters []physicalplan.ProcessorIdx, processors []physicalplan.Processor,
   824  ) (nodes []roachpb.NodeID) {
   825  	// TODO(radu): for now we run a join processor on every node that produces
   826  	// data for either source. In the future we should be smarter here.
   827  	return getNodesOfRouters(append(leftRouters, rightRouters...), processors)
   828  }