github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/interleaved_reader_joiner.go (about) 1 // Copyright 2017 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package rowexec 12 13 import ( 14 "context" 15 "fmt" 16 17 "github.com/cockroachdb/cockroach/pkg/roachpb" 18 "github.com/cockroachdb/cockroach/pkg/server/telemetry" 19 "github.com/cockroachdb/cockroach/pkg/sql/execinfra" 20 "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb" 21 "github.com/cockroachdb/cockroach/pkg/sql/row" 22 "github.com/cockroachdb/cockroach/pkg/sql/scrub" 23 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 24 "github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry" 25 "github.com/cockroachdb/cockroach/pkg/util/log" 26 "github.com/cockroachdb/errors" 27 ) 28 29 // irjState represents the state of the processor. 30 type irjState int 31 32 const ( 33 irjStateUnknown irjState = iota 34 // irjReading causes the state machine to read the next row from the kvFetcher 35 // and potentially output a merged row. 36 irjReading 37 // irjUnmatchedChild indicates that the state machine should output the 38 // unmatched child row stored in the unmatchedChild field. 39 irjUnmatchedChild 40 ) 41 42 type tableInfo struct { 43 tableID sqlbase.ID 44 indexID sqlbase.IndexID 45 post execinfra.ProcOutputHelper 46 ordering sqlbase.ColumnOrdering 47 } 48 49 // interleavedReaderJoiner is at the start of a computation flow: it performs KV 50 // operations to retrieve rows for two tables (ancestor and child), internally 51 // filters the rows, performs a merge join with equality constraints. 52 // See docs/RFCS/20171025_interleaved_table_joins.md 53 type interleavedReaderJoiner struct { 54 joinerBase 55 56 // runningState represents the state of the processor. This is in addition to 57 // ProcessorBase.State - the runningState is only relevant when 58 // ProcessorBase.State == StateRunning. 59 runningState irjState 60 61 // Each tableInfo contains the output helper (for intermediate 62 // filtering) and ordering info for each table-index being joined. 63 tables []tableInfo 64 allSpans roachpb.Spans 65 limitHint int64 66 67 fetcher row.Fetcher 68 alloc sqlbase.DatumAlloc 69 70 // TODO(richardwu): If we need to buffer more than 1 ancestor row for 71 // prefix joins, subset joins, and/or outer joins, we need to buffer an 72 // arbitrary number of ancestor and child rows. 73 // We can use streamMerger here for simplicity. 74 ancestorRow sqlbase.EncDatumRow 75 // These are required for OUTER joins where the ancestor need to be 76 // emitted regardless. 77 ancestorJoined bool 78 ancestorJoinSide joinSide 79 descendantJoinSide joinSide 80 unmatchedChild sqlbase.EncDatumRow 81 // ancestorTablePos is the corresponding index of the ancestor table in 82 // tables. 83 ancestorTablePos int 84 } 85 86 func (irj *interleavedReaderJoiner) Start(ctx context.Context) context.Context { 87 irj.runningState = irjReading 88 ctx = irj.StartInternal(ctx, interleavedReaderJoinerProcName) 89 // TODO(radu,andrei,knz): set the traceKV flag when requested by the session. 90 if err := irj.fetcher.StartScan( 91 irj.Ctx, irj.FlowCtx.Txn, irj.allSpans, true /* limitBatches */, irj.limitHint, false, /* traceKV */ 92 ); err != nil { 93 irj.MoveToDraining(err) 94 } 95 return ctx 96 } 97 98 func (irj *interleavedReaderJoiner) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) { 99 // Next is implemented as a state machine. The states are represented by the 100 // irjState enum at the top of this file. 101 // Roughly, the state machine is either in an initialization phase, a steady 102 // state phase that outputs either 1 or 0 rows on every call, or a special 103 // unmatched child phase that outputs a child row that doesn't match the last 104 // seen ancestor if the join type calls for it. 105 for irj.State == execinfra.StateRunning { 106 var row sqlbase.EncDatumRow 107 var meta *execinfrapb.ProducerMetadata 108 switch irj.runningState { 109 case irjReading: 110 irj.runningState, row, meta = irj.nextRow() 111 case irjUnmatchedChild: 112 rendered := irj.renderUnmatchedRow(irj.unmatchedChild, irj.descendantJoinSide) 113 row = irj.ProcessRowHelper(rendered) 114 irj.unmatchedChild = nil 115 irj.runningState = irjReading 116 default: 117 log.Fatalf(irj.Ctx, "unsupported state: %d", irj.runningState) 118 } 119 if row != nil || meta != nil { 120 return row, meta 121 } 122 } 123 return nil, irj.DrainHelper() 124 } 125 126 // findTable returns the tableInfo for the given table and index descriptor, 127 // along with a boolean that is true if the found tableInfo represents the 128 // ancestor table in this join. err is non-nil if the table was missing from the 129 // list. 130 func (irj *interleavedReaderJoiner) findTable( 131 table *sqlbase.TableDescriptor, index *sqlbase.IndexDescriptor, 132 ) (tInfo *tableInfo, isAncestorRow bool, err error) { 133 for i := range irj.tables { 134 tInfo = &irj.tables[i] 135 if table.ID == tInfo.tableID && index.ID == tInfo.indexID { 136 if i == irj.ancestorTablePos { 137 isAncestorRow = true 138 } 139 return tInfo, isAncestorRow, nil 140 } 141 } 142 return nil, 143 false, 144 errors.Errorf("index %q.%q missing from interleaved join", 145 table.Name, index.Name) 146 } 147 148 // nextRow implements the steady state of the interleavedReaderJoiner. It 149 // requests the next row from its backing kv fetcher, determines whether its an 150 // ancestor or child row, and conditionally merges and outputs a result. 151 func (irj *interleavedReaderJoiner) nextRow() ( 152 irjState, 153 sqlbase.EncDatumRow, 154 *execinfrapb.ProducerMetadata, 155 ) { 156 row, desc, index, err := irj.fetcher.NextRow(irj.Ctx) 157 if err != nil { 158 irj.MoveToDraining(scrub.UnwrapScrubError(err)) 159 return irjStateUnknown, nil, irj.DrainHelper() 160 } 161 if row == nil { 162 // All done - just finish maybe emitting our last ancestor. 163 lastAncestor := irj.maybeUnmatchedAncestor() 164 irj.MoveToDraining(nil) 165 return irjReading, lastAncestor, nil 166 } 167 168 // Lookup the helper that belongs to this row. 169 tInfo, isAncestorRow, err := irj.findTable(desc, index) 170 if err != nil { 171 irj.MoveToDraining(err) 172 return irjStateUnknown, nil, irj.DrainHelper() 173 } 174 175 // We post-process the intermediate row from either table. 176 tableRow, ok, err := tInfo.post.ProcessRow(irj.Ctx, row) 177 if err != nil { 178 irj.MoveToDraining(err) 179 return irjStateUnknown, nil, irj.DrainHelper() 180 } 181 if !ok { 182 irj.MoveToDraining(nil) 183 } 184 185 // Row was filtered out. 186 if tableRow == nil { 187 return irjReading, nil, nil 188 } 189 190 if isAncestorRow { 191 maybeAncestor := irj.maybeUnmatchedAncestor() 192 193 irj.ancestorJoined = false 194 irj.ancestorRow = tInfo.post.RowAlloc.CopyRow(tableRow) 195 196 // If maybeAncestor is nil, we'll loop back around and read the next row 197 // without returning a row to the caller. 198 return irjReading, maybeAncestor, nil 199 } 200 201 // A child row (tableRow) is fetched. 202 203 // TODO(richardwu): Generalize this to 2+ tables and sibling 204 // tables. 205 var lrow, rrow sqlbase.EncDatumRow 206 if irj.ancestorTablePos == 0 { 207 lrow, rrow = irj.ancestorRow, tableRow 208 } else { 209 lrow, rrow = tableRow, irj.ancestorRow 210 } 211 212 // TODO(richardwu): this is a very expensive comparison 213 // in the hot path. We can avoid this if there is a foreign 214 // key constraint between the merge columns. 215 // That is: any child rows can be joined with the most 216 // recent parent row without this comparison. 217 cmp, err := CompareEncDatumRowForMerge( 218 irj.tables[0].post.OutputTypes, 219 lrow, 220 rrow, 221 irj.tables[0].ordering, 222 irj.tables[1].ordering, 223 false, /* nullEquality */ 224 &irj.alloc, 225 irj.FlowCtx.EvalCtx, 226 ) 227 if err != nil { 228 irj.MoveToDraining(err) 229 return irjStateUnknown, nil, irj.DrainHelper() 230 } 231 232 // The child row match the most recent ancestorRow on the 233 // equality columns. 234 // Try to join/render and emit. 235 if cmp == 0 { 236 renderedRow, err := irj.render(lrow, rrow) 237 if err != nil { 238 irj.MoveToDraining(err) 239 return irjStateUnknown, nil, irj.DrainHelper() 240 } 241 if renderedRow != nil { 242 irj.ancestorJoined = true 243 } 244 return irjReading, irj.ProcessRowHelper(renderedRow), nil 245 } 246 247 // Child does not match previous ancestorRow. 248 // Try to emit the ancestor row. 249 unmatchedAncestor := irj.maybeUnmatchedAncestor() 250 251 // Reset the ancestorRow (we know there are no more 252 // corresponding children rows). 253 irj.ancestorRow = nil 254 irj.ancestorJoined = false 255 256 newState := irjReading 257 // Set the unmatched child if necessary (we'll pick it up again after we emit 258 // the ancestor). 259 if shouldEmitUnmatchedRow(irj.descendantJoinSide, irj.joinType) { 260 irj.unmatchedChild = row 261 newState = irjUnmatchedChild 262 } 263 264 return newState, unmatchedAncestor, nil 265 } 266 267 func (irj *interleavedReaderJoiner) ConsumerClosed() { 268 // The consumer is done, Next() will not be called again. 269 irj.InternalClose() 270 } 271 272 var _ execinfra.Processor = &interleavedReaderJoiner{} 273 var _ execinfra.RowSource = &interleavedReaderJoiner{} 274 var _ execinfrapb.MetadataSource = &interleavedReaderJoiner{} 275 var _ execinfra.OpNode = &interleavedReaderJoiner{} 276 277 // newInterleavedReaderJoiner creates a interleavedReaderJoiner. 278 func newInterleavedReaderJoiner( 279 flowCtx *execinfra.FlowCtx, 280 processorID int32, 281 spec *execinfrapb.InterleavedReaderJoinerSpec, 282 post *execinfrapb.PostProcessSpec, 283 output execinfra.RowReceiver, 284 ) (*interleavedReaderJoiner, error) { 285 // NB: we hit this with a zero NodeID (but !ok) with multi-tenancy. 286 if nodeID, ok := flowCtx.NodeID.OptionalNodeID(); nodeID == 0 && ok { 287 return nil, errors.AssertionFailedf("attempting to create an interleavedReaderJoiner with uninitialized NodeID") 288 } 289 290 // Increment some telemetry counters about use of the interleaved table join feature. 291 telemetry.Inc(sqltelemetry.InterleavedTableJoinCounter) 292 293 // TODO(richardwu): We can relax this to < 2 (i.e. permit 2+ tables). 294 // This will require modifying joinerBase init logic. 295 if len(spec.Tables) != 2 { 296 return nil, errors.AssertionFailedf("interleavedReaderJoiner only reads from two tables in an interleaved hierarchy") 297 } 298 299 // Ensure the column orderings of all tables being merged are in the 300 // same direction. 301 for i, c := range spec.Tables[0].Ordering.Columns { 302 for _, table := range spec.Tables[1:] { 303 if table.Ordering.Columns[i].Direction != c.Direction { 304 return nil, errors.AssertionFailedf("unmatched column orderings") 305 } 306 } 307 } 308 309 tables := make([]tableInfo, len(spec.Tables)) 310 // We need to take spans from all tables and merge them together 311 // for Fetcher. 312 allSpans := make(roachpb.Spans, 0, len(spec.Tables)) 313 314 // We need to figure out which table is the ancestor. 315 var ancestorTablePos int 316 var numAncestorPKCols int 317 minAncestors := -1 318 for i, table := range spec.Tables { 319 index, _, err := table.Desc.FindIndexByIndexIdx(int(table.IndexIdx)) 320 if err != nil { 321 return nil, err 322 } 323 324 // The simplest way is to find the table with the fewest 325 // interleave ancestors. 326 // TODO(richardwu): Adapt this for sibling joins and multi-table joins. 327 if minAncestors == -1 || len(index.Interleave.Ancestors) < minAncestors { 328 minAncestors = len(index.Interleave.Ancestors) 329 ancestorTablePos = i 330 numAncestorPKCols = len(index.ColumnIDs) 331 } 332 333 if err := tables[i].post.Init( 334 &table.Post, table.Desc.ColumnTypes(), flowCtx.NewEvalCtx(), nil, /*output*/ 335 ); err != nil { 336 return nil, errors.NewAssertionErrorWithWrappedErrf(err, 337 "failed to initialize post-processing helper") 338 } 339 340 tables[i].tableID = table.Desc.ID 341 tables[i].indexID = index.ID 342 tables[i].ordering = execinfrapb.ConvertToColumnOrdering(table.Ordering) 343 for _, trSpan := range table.Spans { 344 allSpans = append(allSpans, trSpan.Span) 345 } 346 } 347 348 if len(spec.Tables[0].Ordering.Columns) != numAncestorPKCols { 349 return nil, errors.AssertionFailedf( 350 "interleavedReaderJoiner only supports joins on the entire interleaved prefix") 351 } 352 353 allSpans, _ = roachpb.MergeSpans(allSpans) 354 355 ancestorJoinSide := leftSide 356 descendantJoinSide := rightSide 357 if ancestorTablePos == 1 { 358 ancestorJoinSide = rightSide 359 descendantJoinSide = leftSide 360 } 361 362 irj := &interleavedReaderJoiner{ 363 tables: tables, 364 allSpans: allSpans, 365 ancestorTablePos: ancestorTablePos, 366 ancestorJoinSide: ancestorJoinSide, 367 descendantJoinSide: descendantJoinSide, 368 } 369 370 if err := irj.initRowFetcher( 371 flowCtx, spec.Tables, tables, spec.Reverse, spec.LockingStrength, &irj.alloc, 372 ); err != nil { 373 return nil, err 374 } 375 376 irj.limitHint = execinfra.LimitHint(spec.LimitHint, post) 377 378 // TODO(richardwu): Generalize this to 2+ tables. 379 if err := irj.joinerBase.init( 380 irj, 381 flowCtx, 382 processorID, 383 irj.tables[0].post.OutputTypes, 384 irj.tables[1].post.OutputTypes, 385 spec.Type, 386 spec.OnExpr, 387 nil, /*leftEqColumns*/ 388 nil, /*rightEqColumns*/ 389 0, /*numMergedColumns*/ 390 post, 391 output, 392 execinfra.ProcStateOpts{ 393 InputsToDrain: []execinfra.RowSource{}, 394 TrailingMetaCallback: irj.generateTrailingMeta, 395 }, 396 ); err != nil { 397 return nil, err 398 } 399 400 return irj, nil 401 } 402 403 func (irj *interleavedReaderJoiner) initRowFetcher( 404 flowCtx *execinfra.FlowCtx, 405 tables []execinfrapb.InterleavedReaderJoinerSpec_Table, 406 tableInfos []tableInfo, 407 reverseScan bool, 408 lockStr sqlbase.ScanLockingStrength, 409 alloc *sqlbase.DatumAlloc, 410 ) error { 411 args := make([]row.FetcherTableArgs, len(tables)) 412 413 for i, table := range tables { 414 desc := sqlbase.NewImmutableTableDescriptor(table.Desc) 415 var err error 416 args[i].Index, args[i].IsSecondaryIndex, err = desc.FindIndexByIndexIdx(int(table.IndexIdx)) 417 if err != nil { 418 return err 419 } 420 421 args[i].ValNeededForCol = tableInfos[i].post.NeededColumns() 422 args[i].ColIdxMap = desc.ColumnIdxMap() 423 args[i].Desc = desc 424 args[i].Cols = desc.Columns 425 args[i].Spans = make(roachpb.Spans, len(table.Spans)) 426 for j, trSpan := range table.Spans { 427 args[i].Spans[j] = trSpan.Span 428 } 429 } 430 431 return irj.fetcher.Init( 432 flowCtx.Codec(), 433 reverseScan, 434 lockStr, 435 true, /* returnRangeInfo */ 436 true, /* isCheck */ 437 alloc, 438 args..., 439 ) 440 } 441 442 func (irj *interleavedReaderJoiner) generateTrailingMeta( 443 ctx context.Context, 444 ) []execinfrapb.ProducerMetadata { 445 trailingMeta := irj.generateMeta(ctx) 446 irj.InternalClose() 447 return trailingMeta 448 } 449 450 func (irj *interleavedReaderJoiner) generateMeta( 451 ctx context.Context, 452 ) []execinfrapb.ProducerMetadata { 453 var trailingMeta []execinfrapb.ProducerMetadata 454 nodeID, ok := irj.FlowCtx.NodeID.OptionalNodeID() 455 if ok { 456 ranges := execinfra.MisplannedRanges(ctx, irj.fetcher.GetRangesInfo(), nodeID) 457 if ranges != nil { 458 trailingMeta = append(trailingMeta, execinfrapb.ProducerMetadata{Ranges: ranges}) 459 } 460 } 461 if tfs := execinfra.GetLeafTxnFinalState(ctx, irj.FlowCtx.Txn); tfs != nil { 462 trailingMeta = append(trailingMeta, execinfrapb.ProducerMetadata{LeafTxnFinalState: tfs}) 463 } 464 return trailingMeta 465 } 466 467 // DrainMeta is part of the MetadataSource interface. 468 func (irj *interleavedReaderJoiner) DrainMeta(ctx context.Context) []execinfrapb.ProducerMetadata { 469 return irj.generateMeta(ctx) 470 } 471 472 const interleavedReaderJoinerProcName = "interleaved reader joiner" 473 474 func (irj *interleavedReaderJoiner) maybeUnmatchedAncestor() sqlbase.EncDatumRow { 475 // We first try to emit the previous ancestor row if it 476 // was never joined with a child row. 477 if irj.ancestorRow != nil && !irj.ancestorJoined { 478 if !shouldEmitUnmatchedRow(irj.ancestorJoinSide, irj.joinType) { 479 return nil 480 } 481 482 rendered := irj.renderUnmatchedRow(irj.ancestorRow, irj.ancestorJoinSide) 483 return irj.ProcessRowHelper(rendered) 484 } 485 return nil 486 } 487 488 // ChildCount is part of the execinfra.OpNode interface. 489 func (irj *interleavedReaderJoiner) ChildCount(verbose bool) int { 490 return 0 491 } 492 493 // Child is part of the execinfra.OpNode interface. 494 func (irj *interleavedReaderJoiner) Child(nth int, verbose bool) execinfra.OpNode { 495 panic(fmt.Sprintf("invalid index %d", nth)) 496 }