github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/distsql_plan_join.go

// Copyright 2017 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sql

import (
	"bytes"
	"fmt"
	"math"
	"sort"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
	"github.com/cockroachdb/cockroach/pkg/sql/physicalplan"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/util/encoding"
	"github.com/cockroachdb/errors"
)

var planInterleavedJoins = settings.RegisterBoolSetting(
	"sql.distsql.interleaved_joins.enabled",
	"if set we plan interleaved table joins instead of merge joins when possible",
	true,
)

func (dsp *DistSQLPlanner) tryCreatePlanForInterleavedJoin(
	planCtx *PlanningCtx, n *joinNode,
) (plan *PhysicalPlan, ok bool, err error) {
	plan = &PhysicalPlan{}
	if !useInterleavedJoin(n) {
		return nil, false, nil
	}

	leftScan, leftOk := n.left.plan.(*scanNode)
	rightScan, rightOk := n.right.plan.(*scanNode)

	// We know they are scan nodes from useInterleavedJoin, but we add
	// this check to prevent future panics.
	if !leftOk || !rightOk {
		return nil, false, errors.AssertionFailedf("left and right children of join node must be scan nodes to execute an interleaved join")
	}

	// We iterate through each table and collate their metadata for
	// the InterleavedReaderJoinerSpec.
	tables := make([]execinfrapb.InterleavedReaderJoinerSpec_Table, 2)
	plans := make([]*PhysicalPlan, 2)
	var totalLimitHint int64
	for i, t := range []struct {
		scan      *scanNode
		eqIndices []int
	}{
		{
			scan:      leftScan,
			eqIndices: n.pred.leftEqualityIndices,
		},
		{
			scan:      rightScan,
			eqIndices: n.pred.rightEqualityIndices,
		},
	} {
		// We don't really need to initialize a full-on plan to
		// retrieve the metadata for each table reader, but this turns
		// out to be very useful for computing ordering and remapping
		// the onCond and columns.
		var err error
		if plans[i], err = dsp.createTableReaders(planCtx, t.scan); err != nil {
			return nil, false, err
		}

		eqCols := eqCols(t.eqIndices, plans[i].PlanToStreamColMap)
		ordering := distsqlOrdering(n.mergeJoinOrdering, eqCols)

		// It doesn't matter which processor we choose since the
		// metadata for the TableReader is independent of the
		// node/processor instance.
		tr := plans[i].Processors[0].Spec.Core.TableReader

		tables[i] = execinfrapb.InterleavedReaderJoinerSpec_Table{
			Desc:     tr.Table,
			IndexIdx: tr.IndexIdx,
			Post:     plans[i].GetLastStagePost(),
			Ordering: ordering,
		}

		// We set the limit hint of the final InterleavedReaderJoiner
		// to the sum of the individual tables' limit hints, because
		// the InterleavedReaderJoiner reads rows from all tables at
		// the same time, so the hint applies to the total number of
		// rows read from all tables.
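		//
		// Illustrative (the numbers are hypothetical): with per-table
		// hints of 10 and 20 the combined hint is 30; if the sum would
		// overflow int64, the guard below saturates the total at
		// math.MaxInt64 instead.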
		if totalLimitHint >= math.MaxInt64-tr.LimitHint {
			totalLimitHint = math.MaxInt64
		} else {
			totalLimitHint += tr.LimitHint
		}
	}

	joinType := n.joinType

	post, joinToStreamColMap := joinOutColumns(n, plans[0].PlanToStreamColMap, plans[1].PlanToStreamColMap)
	onExpr, err := remapOnExpr(planCtx, n, plans[0].PlanToStreamColMap, plans[1].PlanToStreamColMap)
	if err != nil {
		return nil, false, err
	}

	ancestor, descendant := n.interleavedNodes()

	// We partition each set of spans to their respective nodes.
	ancsPartitions, err := dsp.PartitionSpans(planCtx, ancestor.spans)
	if err != nil {
		return nil, false, err
	}
	descPartitions, err := dsp.PartitionSpans(planCtx, descendant.spans)
	if err != nil {
		return nil, false, err
	}

	// We want to ensure that all child spans with a given interleave
	// prefix value (which also happens to be our equality join columns)
	// are read on the same node as the corresponding ancestor rows.
	// We map all descendant spans in their partitions to the
	// corresponding nodes of the ancestor spans.
	//
	// Example:
	// Let PK1 and (PK1, PK2) be the primary keys of parent and child,
	// respectively. PK1 is the interleave prefix.
	// The filter WHERE PK1 = 1 AND PK2 IN (5, 7) will produce the
	// parent and child spans
	//   parent: /1 - /2 (technically /1 - /1/#/8)
	//   child:  /1/#/5 - /1/#/6, /1/#/7 - /1/#/8
	// If the parent span is partitioned to node 1 and the child spans
	// are partitioned to nodes 2 and 3, then we need to move the child
	// spans to node 1, where the PK1 = 1 parent row is read.
	if descPartitions, err = alignInterleavedSpans(n, ancsPartitions, descPartitions); err != nil {
		return nil, false, err
	}

	// Figure out which nodes we need to schedule a processor on.
	seen := make(map[roachpb.NodeID]struct{})
	var nodes []roachpb.NodeID
	for _, partitions := range [][]SpanPartition{ancsPartitions, descPartitions} {
		for _, part := range partitions {
			if _, ok := seen[part.Node]; !ok {
				seen[part.Node] = struct{}{}
				nodes = append(nodes, part.Node)
			}
		}
	}

	var ancsIdx, descIdx int
	// The left table is at the 0th index, the right table at the 1st.
	if leftScan == ancestor {
		ancsIdx, descIdx = 0, 1
	} else {
		ancsIdx, descIdx = 1, 0
	}

	stageID := plan.NewStageID()

	// We provision a separate InterleavedReaderJoiner per node that has
	// rows from either table.
	for _, nodeID := range nodes {
		// Find the relevant spans from each table for this node.
		// Note that it is possible for either set of spans to be
		// empty (but not both).
		var ancsSpans, descSpans roachpb.Spans
		for _, part := range ancsPartitions {
			if part.Node == nodeID {
				ancsSpans = part.Spans
				break
			}
		}
		for _, part := range descPartitions {
			if part.Node == nodeID {
				descSpans = part.Spans
				break
			}
		}
		if len(ancsSpans) == 0 && len(descSpans) == 0 {
			panic("cannot have empty set of spans for both tables for a given node")
		}

		// Make a copy of our spec for each table.
		processorTables := make([]execinfrapb.InterleavedReaderJoinerSpec_Table, len(tables))
		copy(processorTables, tables)
		// We set the spans to be read by this node's processor for
		// each table.
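		// Note that only the Spans field is per-node; the rest of
		// each table's metadata (descriptor, index, post-processing,
		// ordering) is shared unchanged across processors via the
		// copy above.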
		processorTables[ancsIdx].Spans = makeTableReaderSpans(ancsSpans)
		processorTables[descIdx].Spans = makeTableReaderSpans(descSpans)

		irj := &execinfrapb.InterleavedReaderJoinerSpec{
			Tables: processorTables,
			// We previously checked that both scans are in the
			// same direction (useInterleavedJoin).
			Reverse:           ancestor.reverse,
			LimitHint:         totalLimitHint,
			LockingStrength:   ancestor.lockingStrength,
			LockingWaitPolicy: ancestor.lockingWaitPolicy,
			OnExpr:            onExpr,
			Type:              joinType,
		}

		proc := physicalplan.Processor{
			Node: nodeID,
			Spec: execinfrapb.ProcessorSpec{
				Core:    execinfrapb.ProcessorCoreUnion{InterleavedReaderJoiner: irj},
				Post:    post,
				Output:  []execinfrapb.OutputRouterSpec{{Type: execinfrapb.OutputRouterSpec_PASS_THROUGH}},
				StageID: stageID,
			},
		}

		plan.Processors = append(plan.Processors, proc)
	}

	// Each result router corresponds to one of the processors we
	// appended.
	plan.ResultRouters = make([]physicalplan.ProcessorIdx, len(nodes))
	for i := 0; i < len(nodes); i++ {
		plan.ResultRouters[i] = physicalplan.ProcessorIdx(i)
	}

	plan.PlanToStreamColMap = joinToStreamColMap
	plan.ResultTypes, err = getTypesForPlanResult(n, joinToStreamColMap)
	if err != nil {
		return nil, false, err
	}

	plan.SetMergeOrdering(dsp.convertOrdering(n.reqOrdering, plan.PlanToStreamColMap))
	return plan, true, nil
}

func joinOutColumns(
	n *joinNode, leftPlanToStreamColMap, rightPlanToStreamColMap []int,
) (post execinfrapb.PostProcessSpec, joinToStreamColMap []int) {
	joinToStreamColMap = makePlanToStreamColMap(len(n.columns))
	post.Projection = true

	// addOutCol appends to post.OutputColumns and returns the index
	// of the added column in that slice.
	addOutCol := func(col uint32) int {
		idx := len(post.OutputColumns)
		post.OutputColumns = append(post.OutputColumns, col)
		return idx
	}

	// The join columns are in two groups:
	//   - the columns on the left side (numLeftCols)
	//   - the columns on the right side (numRightCols)
	for i := 0; i < n.pred.numLeftCols; i++ {
		joinToStreamColMap[i] = addOutCol(uint32(leftPlanToStreamColMap[i]))
	}

	if n.pred.joinType != sqlbase.LeftSemiJoin && n.pred.joinType != sqlbase.LeftAntiJoin {
		for i := 0; i < n.pred.numRightCols; i++ {
			joinToStreamColMap[n.pred.numLeftCols+i] = addOutCol(
				uint32(n.pred.numLeftCols + rightPlanToStreamColMap[i]),
			)
		}
	}

	return post, joinToStreamColMap
}

// remapOnExpr remaps ordinal references in the ON condition (which refer to
// the join columns as described above) to values that make sense in the
// joiner (0 to N-1 for the left input columns, N to N+M-1 for the right
// input columns).
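//
// A worked example (the column maps here are hypothetical, not from the
// original source): with numLeftCols=2, numRightCols=1,
// leftPlanToStreamColMap={0, -1} (the second left column is not emitted) and
// rightPlanToStreamColMap={0}, we build joinColMap={0, -1, 1}: an ordinal
// reference to the right column in the ON condition is remapped to stream
// column 1, immediately after the single left column that is actually
// emitted.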
func remapOnExpr(
	planCtx *PlanningCtx, n *joinNode, leftPlanToStreamColMap, rightPlanToStreamColMap []int,
) (execinfrapb.Expression, error) {
	if n.pred.onCond == nil {
		return execinfrapb.Expression{}, nil
	}

	joinColMap := make([]int, n.pred.numLeftCols+n.pred.numRightCols)
	idx := 0
	leftCols := 0
	for i := 0; i < n.pred.numLeftCols; i++ {
		joinColMap[idx] = leftPlanToStreamColMap[i]
		if leftPlanToStreamColMap[i] != -1 {
			leftCols++
		}
		idx++
	}
	for i := 0; i < n.pred.numRightCols; i++ {
		joinColMap[idx] = leftCols + rightPlanToStreamColMap[i]
		idx++
	}

	return physicalplan.MakeExpression(n.pred.onCond, planCtx, joinColMap)
}

// eqCols produces a slice of ordinal references for the plan columns
// specified in eqIndices using planToColMap.
// That is: eqIndices contains a slice of plan column indexes and planToColMap
// maps the plan column indexes to the ordinal references (the index of the
// intermediate row produced).
func eqCols(eqIndices, planToColMap []int) []uint32 {
	eqCols := make([]uint32, len(eqIndices))
	for i, planCol := range eqIndices {
		eqCols[i] = uint32(planToColMap[planCol])
	}

	return eqCols
}

// distsqlOrdering converts the ordering specified by mergeJoinOrdering in
// terms of the indexes of eqCols to the ordinal references provided by
// eqCols.
func distsqlOrdering(
	mergeJoinOrdering sqlbase.ColumnOrdering, eqCols []uint32,
) execinfrapb.Ordering {
	var ord execinfrapb.Ordering
	ord.Columns = make([]execinfrapb.Ordering_Column, len(mergeJoinOrdering))
	for i, c := range mergeJoinOrdering {
		ord.Columns[i].ColIdx = eqCols[c.ColIdx]
		dir := execinfrapb.Ordering_Column_ASC
		if c.Direction == encoding.Descending {
			dir = execinfrapb.Ordering_Column_DESC
		}
		ord.Columns[i].Direction = dir
	}

	return ord
}

func useInterleavedJoin(n *joinNode) bool {
	// TODO(richardwu): We currently only do an interleaved join on
	// all equality columns. This can be relaxed once a hybrid
	// hash-merge join is implemented in streamMerger.
	if len(n.mergeJoinOrdering) != len(n.pred.leftEqualityIndices) {
		return false
	}

	ancestor, descendant := n.interleavedNodes()

	// There is no interleaved ancestor/descendant scan node and thus no
	// interleaved relation.
	if ancestor == nil || descendant == nil {
		return false
	}

	// We cannot do an interleaved join if the tables require scanning in
	// opposite directions.
	if ancestor.reverse != descendant.reverse {
		return false
	}

	var ancestorEqIndices []int
	var descendantEqIndices []int
	// We are guaranteed by n.interleavedNodes() that both of the sources
	// are scan nodes.
	if ancestor == n.left.plan.(*scanNode) {
		ancestorEqIndices = n.pred.leftEqualityIndices
		descendantEqIndices = n.pred.rightEqualityIndices
	} else {
		ancestorEqIndices = n.pred.rightEqualityIndices
		descendantEqIndices = n.pred.leftEqualityIndices
	}

	// We want a full 1-1 correspondence between our join columns and the
	// primary index of the ancestor.
	// TODO(richardwu): We can relax this once we implement a hybrid
	// hash/merge join for interleaved joins after forming merge groups
	// with the interleave prefix (or when the merge join logic is
	// combined with the interleaved join logic).
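	//
	// For instance (a sketch reusing the example tables from the
	// comments below): joining parent (PK1, PK2) with child
	// (PK1, PK2, PK3) on (PK1, PK2) yields a two-column
	// mergeJoinOrdering that matches the two columns of the parent's
	// primary index, so the check below passes; joining on PK1 alone
	// would fail it.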
	if len(n.mergeJoinOrdering) != len(ancestor.index.ColumnIDs) {
		return false
	}

	// We iterate through the ordering given by n.mergeJoinOrdering and
	// check that the columns have a 1-1 correspondence to the interleaved
	// ancestor's primary index columns (i.e. the interleave prefix) as
	// well as the descendant's primary index columns. We naively return
	// false if any part of the ordering does not correspond.
	for i, info := range n.mergeJoinOrdering {
		colID := ancestor.index.ColumnIDs[i]
		// info.ColIdx refers to i in ancestorEqIndices[i], which refers
		// to the index of the source row. This corresponds to
		// the index in scanNode.resultColumns. To convert the colID
		// from the index descriptor, we can use the map provided by
		// colIdxMap.
		if ancestorEqIndices[info.ColIdx] != ancestor.colIdxMap[colID] ||
			descendantEqIndices[info.ColIdx] != descendant.colIdxMap[colID] {
			// The column in the ordering does not correspond to
			// the column in the interleave prefix.
			// We should not try to do an interleaved join.
			return false
		}
	}

	// The columns in n.mergeJoinOrdering have a 1-1 correspondence with
	// the columns in the interleaved ancestor's primary index. We can
	// indeed hint at the possibility of an interleaved join.
	return true
}

// maximalJoinPrefix takes the common ancestor scanNode that the join is
// defined on, the target scanNode that the index key belongs to, and the
// index key itself, and returns the maximal prefix of the key which is also
// a prefix of all keys that need to be joined together.
//
// Let's denote a child key interleaved into a parent key in the following
// format:
//   /table/index/<parent-pk1>/.../<parent-pkN>/#/<child-pk1>/.../<child-pkN>
//
// In the following examples, the ancestor is parent and the target is child.
//
// Let M be the longest prefix of the parent PK which is (equality)
// constrained by the join. The maximal join prefix is:
//   /table/index/<parent-pk1>/.../<parent-pkM>
//
// Examples (/table/index suppressed from keys):
//
//  1. Full interleave (prefix) joins:
//
//   1a. Parent table PK1
//       Child table (PK1, PK2)
//       Join on PK1
//       For child key /5/#/42, the maximal join prefix is /5
//
//   1b. Parent table (PK1, PK2)
//       Child table (PK1, PK2, PK3)
//       Join on PK1, PK2
//       For child key /5/6/#/42, the maximal join prefix is /5/6
//
//  2. Prefix joins:
//       Parent table (PK1, PK2)
//       Child table (PK1, PK2, PK3)
//       Join on PK1 (this is a prefix of the parent PKs).
//       For child key /5/6/#/42, the maximal join prefix is /5
//
//  3. Subset joins:
//       Parent table (PK1, PK2, PK3)
//       Child table (PK1, PK2, PK3, PK4)
//       Join on PK1, PK3
//       For child key /5/6/7/#/32, the maximal join prefix is /5
//
// This logic can also be extended in the general case to joins between
// sibling tables with a common ancestor: the maximal join prefix will be
// applied to both tables, where each sibling scan is passed as the target
// scanNode.
func maximalJoinPrefix(
	ancestor *scanNode, target *scanNode, key roachpb.Key,
) (roachpb.Key, bool, error) {
	// To calculate how long this prefix is, we take a look at the actual
	// encoding of an interleaved table's key:
	//   /table/index/<parent-pk1>/.../<parent-pkN>/#/.../table/index/<child-pk1>/.../<child-pkN>
	// For each ancestor (including the parent), we have
	//   table, index, '#' (interleaved sentinel)
	// or 3 values to peek at.
	// We truncate up to the key M which is the last column in our join.
	//   /table/index/<parent-pk1>/.../<parent-pkM>
	// For the full interleaved join case, we need to count the number of
	// columns in the shared interleave prefix (pk1 to pkM). We traverse
	// the InterleaveDescriptor and add up SharedPrefixLen.
	// We finally subtract 1 since we do not want to include the last
	// interleaved sentinel '#'.
	// Thus we need to peek (encoding.PeekLength())
	//   3 * count(interleaved ancestors) + sum(SharedPrefixLen) - 1
	// times to get the actual byte length of the prefix.
	//
	// Example:
	//
	// Given the following interleaved hierarchy (where the primary keys
	// are in parentheses)
	//   parent     (pid1)
	//   child      (pid1, cid1, cid2)
	//   grandchild (pid1, cid1, cid2, gcid1)
	//
	// Let our join be defined on (pid1, cid1, cid2) and we want to join
	// the child and grandchild tables.
	//
	// A grandchild key could be (pid1=5, cid1=6, cid2=7, gcid1=8):
	//   /<parent-id>/1/5/#/<child-id>/1/6/7/#/<gchild-id>/1/8
	//
	// We'd like to take the prefix up to and including <cid2>, or
	//   /<parent-id>/1/5/#/<child-id>/1/6/7
	//
	// We must call encoding.PeekLength() 8 times, or
	//   3 * nAncestors + sum(SharedPrefixLen) - 1 = 3 * 2 + (1 + 2) - 1 = 8
	// where the ancestor is child.
	//
	// TODO(richardwu): this formula works only for full interleaved
	// joins. For prefix/subset joins, instead of adding the
	// SharedPrefixLen of the ancestor the join is defined on, we would
	// add the number of prefix columns in our interleave prefix that we
	// are joining on.
	nAncestors := 0
	sharedPrefixLen := 0
	for _, targetAncs := range target.index.Interleave.Ancestors {
		nAncestors++
		sharedPrefixLen += int(targetAncs.SharedPrefixLen)
		if targetAncs.TableID == ancestor.desc.ID && targetAncs.IndexID == ancestor.index.ID {
			break
		}
	}

	initialKey := key
	prefixLen := 0
	for i := 0; i < 3*nAncestors+sharedPrefixLen-1; i++ {
		// It's possible for the span key to not contain the full join
		// prefix (a key might refer to an ancestor further up the
		// interleaved hierarchy).
		if len(key) == 0 {
			break
		}
		// Note: this key might have been edited with PrefixEnd. This
		// can cause problems for certain datatypes, like strings,
		// which have a sentinel byte sequence indicating the end of
		// the type. In that case, PeekLength will fail. If that
		// happens, we try to UndoPrefixEnd the key and check the
		// length again.
		// TODO(jordan): this function should be aware of whether a key
		// has been PrefixEnd'd or not, and act accordingly.
		valLen, err := encoding.PeekLength(key)
		if err != nil {
			key, ok := encoding.UndoPrefixEnd(key)
			if !ok {
				return nil, false, err
			}
			valLen, err = encoding.PeekLength(key)
			if err != nil {
				return nil, false, err
			}
		}
		prefixLen += valLen
		key = key[valLen:]
	}

	if len(key) > 0 {
		// There are remaining bytes in the key: we truncate it and
		// return true.
		return initialKey[:prefixLen], true, nil
	}

	// The loop terminated early because the key is shorter than the
	// full join prefix.
	// We return false to denote that this key was not truncated to
	// form the join prefix.
	return initialKey, false, nil
}

// sortedSpanPartitions implements sort.Interface. Sorting is defined on the
// node ID of each partition.
type sortedSpanPartitions []SpanPartition

func (s sortedSpanPartitions) Len() int           { return len(s) }
func (s sortedSpanPartitions) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s sortedSpanPartitions) Less(i, j int) bool { return s[i].Node < s[j].Node }

// alignInterleavedSpans takes the partitioned spans from both the parent
// (parentSpans) and the (not necessarily direct) child (childSpans), "aligns"
// them, and returns childSpans such that all child keys that need to be
// joined with their corresponding parent keys are mapped to the parent keys'
// partition. This ensures that we correctly join all parent-child rows
// within the node-contained InterleavedReaderJoiner.
//
// For each parentSpan, a "join span" is computed.
// The "join span" is a span that includes all child rows that need to be
// joined with parent rows in the span.
//
// With the "join span" of each parent span, we can find any child spans that
// need to be remapped to the same node as the parent span.
//
// We iterate through each child span and see which parent join span
// overlaps it.
//
// If there is no overlap with any join span, there can't possibly be any
// join results from this child span. We still need to keep it for outer
// joins, but it doesn't need to be remapped.
//
// If there is overlap with some parent join span, there exist "some" child
// keys in the span that need to be mapped to the parent span. The sections
// of the child span that do not overlap need to be split off and potentially
// remapped to other parent join spans.
//
// The child span gets split as necessary on the join span's boundaries. The
// split that overlaps the join span is (re-)mapped to the parent span. Any
// remaining splits are considered separately with the same logic.
func alignInterleavedSpans(
	n *joinNode, parentSpans []SpanPartition, childSpans []SpanPartition,
) ([]SpanPartition, error) {
	mappedSpans := make(map[roachpb.NodeID]roachpb.Spans)

	// Map parent spans to their join spans.
	joinSpans, err := joinSpans(n, parentSpans)
	if err != nil {
		return nil, err
	}

	// mapAndSplit takes a childSpan and finds the parentJoinSpan that has
	// the parent row(s) with which the child row(s) are supposed to join.
	// It does this by finding overlaps between childSpan and
	// parentJoinSpan.
	// It splits off the non-overlapping parts and appends them to
	// the passed-in nonOverlaps slice for repeated application.
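	//
	// Illustrative (the keys are hypothetical): for childSpan /1 - /5
	// and a parentJoinSpan /2 - /4 on node 1, mapAndSplit maps the
	// overlap /2 - /4 to node 1 and appends the non-overlapping pieces
	// /1 - /2 and /4 - /5 to nonOverlaps to be mapped in later
	// iterations.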
	mapAndSplit := func(curNodeID roachpb.NodeID, childSpan roachpb.Span, nonOverlaps roachpb.Spans) roachpb.Spans {
		// TODO(richardwu): Instead of doing a linear search for each
		// child span, we can make this O(log n) with binary search
		// after pre-sorting the parent join spans.
		for _, parentPart := range joinSpans {
			for _, parentJoinSpan := range parentPart.Spans {
				if parentJoinSpan.Overlaps(childSpan) {
					// Initialize the overlap region as
					// the entire childSpan.
					overlap := childSpan
					var nonOverlap roachpb.Span

					// Check for a non-overlapping region
					// before the start key.
					//    |----parentJoinSpan----...
					// |----childSpan----...
					if bytes.Compare(parentJoinSpan.Key, childSpan.Key) > 0 {
						nonOverlap, overlap = overlap.SplitOnKey(parentJoinSpan.Key)
						nonOverlaps = append(nonOverlaps, nonOverlap)
					}

					// Check for a non-overlapping region
					// after the end key.
					// ...----parentJoinSpan----|
					// ...----childSpan-------------|
					if bytes.Compare(parentJoinSpan.EndKey, childSpan.EndKey) < 0 {
						overlap, nonOverlap = overlap.SplitOnKey(parentJoinSpan.EndKey)
						nonOverlaps = append(nonOverlaps, nonOverlap)
					}

					// Map the overlap region to the
					// partition/node of the
					// parentJoinSpan.
					mappedSpans[parentPart.Node] = append(mappedSpans[parentPart.Node], overlap)

					return nonOverlaps
				}
			}
		}

		// There was no corresponding parentJoinSpan for this
		// childSpan. We simply map childSpan back to its current
		// partition/node.
		mappedSpans[curNodeID] = append(mappedSpans[curNodeID], childSpan)

		return nonOverlaps
	}

	// Buffer to store spans that still need to be mapped.
	// It is initialized with the initial childSpan and may be populated
	// with non-overlapping sub-spans as mapAndSplit is invoked.
	// Note this is unbounded since a mapAndSplit of one childSpan can
	// cause two non-overlapping spans to be generated.
	// We recurse on the non-overlapping spans until none are left before
	// moving on to the next childSpan.
	spansLeft := make(roachpb.Spans, 0, 2)
	for _, childPart := range childSpans {
		for _, childSpan := range childPart.Spans {
			spansLeft = append(spansLeft, childSpan)
			for len(spansLeft) > 0 {
				// Copy out the last span in spansLeft to
				// mapAndSplit.
				spanToMap := spansLeft[len(spansLeft)-1]
				// Discard the element from spansLeft and
				// reclaim one buffer slot.
				spansLeft = spansLeft[:len(spansLeft)-1]
				// We map every child span to its
				// corresponding parent span. Splitting the
				// child span may be necessary, which may
				// produce up to two non-overlapping
				// sub-spans that are appended to spansLeft.
				spansLeft = mapAndSplit(childPart.Node, spanToMap, spansLeft)
			}
		}
	}

	// It's possible from the mapAndSplit logic that we end up with
	// adjacent spans on the same node. We want to clean this up by
	// merging them.
	alignedDescSpans := make(sortedSpanPartitions, 0, len(mappedSpans))
	for nodeID, spans := range mappedSpans {
		spans, _ = roachpb.MergeSpans(spans)
		alignedDescSpans = append(
			alignedDescSpans,
			SpanPartition{
				Node:  nodeID,
				Spans: spans,
			},
		)
	}

	sort.Sort(alignedDescSpans)

	return alignedDescSpans, nil
}

// The derivation of the "join span" for a parent span is as follows (see the
// comment above alignInterleavedSpans for why this is needed):
//
// 1. Start key of the join span (the first parent key in parentSpan):
//
//    Take the maximalJoinPrefix (MJP) of parentSpan.Key. If the MJP is
//    the same as parentSpan.Key (no truncation occurred), then it is also
//    the join span start key (examples A and B below).
//    Otherwise, parentSpan.Key contains more than just parent keys, and
//    because child rows come after parent rows, the join span start key is
//    the PrefixEnd() of the MJP (examples C and D).
//
// 2. End key of the join span: the next parent key after the last parent key
//    in parentSpan (it needs to be the next key because child rows come
//    after the parent rows).
//
//    Take the maximalJoinPrefix (MJP) of parentSpan.EndKey. If the MJP
//    is the same as parentSpan.EndKey (no truncation occurred), then it is
//    also the join span end key (examples A and C).
//    Otherwise, parentSpan.EndKey contains more than just parent keys and
//    needs to be extended to include all child rows for the last parent row;
//    the join span end key is the PrefixEnd() of the MJP (examples B and D).
//
// To illustrate, we'll use some examples of parent spans (/table/index
// omitted from keys):
//
// A. /1 - /3
//    This span contains parent rows with primary keys 1 and 2, and all
//    corresponding child rows. The join span is the same: /1 - /3.
//
// B. /1 - /3/#/1
//    This span contains parent rows with primary keys 1, 2, 3 and all child
//    rows corresponding to 1, 2 (note that /3/#/1 comes after all the parent
//    rows with 3 but before all corresponding child rows). The join span is:
//    /1 - /4.
//
// C. /1/#/1 - /4
//    This span contains parent rows with primary keys 2, 3 and all child
//    rows corresponding to 1, 2, 3. The join span is: /2 - /4.
//
// D. /1/#/1 - /2/#/1
//    This span contains the parent row with primary key 2 and all child
//    rows corresponding to 1, 2. The join span is: /2 - /3.
//
// The corresponding joinSpans for a set of parentSpans are disjoint if and
// only if the parentSpans are disjoint in terms of the parent rows.
// That is, as long as only one node reads a given parent row, for all parent
// rows, the joinSpans are guaranteed to be non-overlapping.
// End keys are only pushed forward to the next parent row if the span
// contains the previous parent row.
// Since the previous row is read on that one node, it is not possible for a
// subsequent span on a different node to contain the previous row.
// The start key will be pushed forward to at least the next row, which
// maintains the disjoint property.
func joinSpans(n *joinNode, parentSpans []SpanPartition) ([]SpanPartition, error) {
	joinSpans := make([]SpanPartition, len(parentSpans))

	parent, child := n.interleavedNodes()

	// Compute the join span for every parent span.
	for i, parentPart := range parentSpans {
		joinSpans[i].Node = parentPart.Node
		joinSpans[i].Spans = make(roachpb.Spans, len(parentPart.Spans))

		for j, parentSpan := range parentPart.Spans {
			// Step 1: start key.
			joinSpanStartKey, startTruncated, err := maximalJoinPrefix(parent, child, parentSpan.Key)
			if err != nil {
				return nil, err
			}
			if startTruncated {
				// parentSpan.Key is a child key.
				// Examples C and D.
				joinSpanStartKey = joinSpanStartKey.PrefixEnd()
			}

			// Step 2: end key.
			joinSpanEndKey, endTruncated, err := maximalJoinPrefix(parent, child, parentSpan.EndKey)
			if err != nil {
				return nil, err
			}
			if endTruncated {
				// parentSpan.EndKey is a child key.
				// Examples B and D.
				joinSpanEndKey = joinSpanEndKey.PrefixEnd()
			}

			// We don't need to check that joinSpanStartKey <
			// joinSpanEndKey since invalid spans are ignored
			// during Span.Overlaps.
			joinSpans[i].Spans[j] = roachpb.Span{
				Key:    joinSpanStartKey,
				EndKey: joinSpanEndKey,
			}
		}
	}

	return joinSpans, nil
}

func distsqlSetOpJoinType(setOpType tree.UnionType) sqlbase.JoinType {
	switch setOpType {
	case tree.ExceptOp:
		return sqlbase.ExceptAllJoin
	case tree.IntersectOp:
		return sqlbase.IntersectAllJoin
	default:
		panic(fmt.Sprintf("set op type %v unsupported by joins", setOpType))
	}
}

// getNodesOfRouters returns all nodes that routers are put on.
func getNodesOfRouters(
	routers []physicalplan.ProcessorIdx, processors []physicalplan.Processor,
) (nodes []roachpb.NodeID) {
	seen := make(map[roachpb.NodeID]struct{})
	for _, pIdx := range routers {
		n := processors[pIdx].Node
		if _, ok := seen[n]; !ok {
			seen[n] = struct{}{}
			nodes = append(nodes, n)
		}
	}
	return nodes
}

func findJoinProcessorNodes(
	leftRouters, rightRouters []physicalplan.ProcessorIdx, processors []physicalplan.Processor,
) (nodes []roachpb.NodeID) {
	// TODO(radu): for now we run a join processor on every node that
	// produces data for either source. In the future we should be
	// smarter here.
	return getNodesOfRouters(append(leftRouters, rightRouters...), processors)
}