github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/joinreader.go (about) 1 // Copyright 2016 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package rowexec 12 13 import ( 14 "context" 15 "fmt" 16 "sort" 17 18 "github.com/cockroachdb/cockroach/pkg/sql/execinfra" 19 "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb" 20 "github.com/cockroachdb/cockroach/pkg/sql/row" 21 "github.com/cockroachdb/cockroach/pkg/sql/rowcontainer" 22 "github.com/cockroachdb/cockroach/pkg/sql/scrub" 23 "github.com/cockroachdb/cockroach/pkg/sql/span" 24 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 25 "github.com/cockroachdb/cockroach/pkg/sql/types" 26 "github.com/cockroachdb/cockroach/pkg/util" 27 "github.com/cockroachdb/cockroach/pkg/util/log" 28 "github.com/cockroachdb/cockroach/pkg/util/mon" 29 "github.com/cockroachdb/cockroach/pkg/util/tracing" 30 "github.com/cockroachdb/errors" 31 "github.com/opentracing/opentracing-go" 32 ) 33 34 // joinReaderState represents the state of the processor. 35 type joinReaderState int 36 37 const ( 38 jrStateUnknown joinReaderState = iota 39 // jrReadingInput means that a batch of rows is being read from the input. 40 jrReadingInput 41 // jrPerformingLookup means we are performing an index lookup for the current 42 // input row batch. 43 jrPerformingLookup 44 // jrEmittingRows means we are emitting the results of the index lookup. 45 jrEmittingRows 46 ) 47 48 // joinReader performs a lookup join between `input` and the specified `index`. 49 // `lookupCols` specifies the input columns which will be used for the index 50 // lookup. 51 type joinReader struct { 52 joinerBase 53 strategy joinReaderStrategy 54 55 // runningState represents the state of the joinReader. This is in addition to 56 // ProcessorBase.State - the runningState is only relevant when 57 // ProcessorBase.State == StateRunning. 58 runningState joinReaderState 59 60 diskMonitor *mon.BytesMonitor 61 62 desc sqlbase.TableDescriptor 63 index *sqlbase.IndexDescriptor 64 colIdxMap map[sqlbase.ColumnID]int 65 66 // fetcher wraps the row.Fetcher used to perform lookups. This enables the 67 // joinReader to wrap the fetcher with a stat collector when necessary. 68 fetcher rowFetcher 69 alloc sqlbase.DatumAlloc 70 rowAlloc sqlbase.EncDatumRowAlloc 71 shouldLimitBatches bool 72 73 input execinfra.RowSource 74 inputTypes []*types.T 75 // Column indexes in the input stream specifying the columns which match with 76 // the index columns. These are the equality columns of the join. 77 lookupCols []uint32 78 79 // Batch size for fetches. Not a constant so we can lower for testing. 80 batchSizeBytes int64 81 curBatchSizeBytes int64 82 83 // State variables for each batch of input rows. 84 scratchInputRows sqlbase.EncDatumRows 85 } 86 87 var _ execinfra.Processor = &joinReader{} 88 var _ execinfra.RowSource = &joinReader{} 89 var _ execinfrapb.MetadataSource = &joinReader{} 90 var _ execinfra.OpNode = &joinReader{} 91 92 const joinReaderProcName = "join reader" 93 94 // newJoinReader returns a new joinReader. 95 func newJoinReader( 96 flowCtx *execinfra.FlowCtx, 97 processorID int32, 98 spec *execinfrapb.JoinReaderSpec, 99 input execinfra.RowSource, 100 post *execinfrapb.PostProcessSpec, 101 output execinfra.RowReceiver, 102 ) (execinfra.RowSourcedProcessor, error) { 103 jr := &joinReader{ 104 desc: spec.Table, 105 input: input, 106 inputTypes: input.OutputTypes(), 107 lookupCols: spec.LookupColumns, 108 } 109 110 var err error 111 var isSecondary bool 112 jr.index, isSecondary, err = jr.desc.FindIndexByIndexIdx(int(spec.IndexIdx)) 113 if err != nil { 114 return nil, err 115 } 116 returnMutations := spec.Visibility == execinfra.ScanVisibilityPublicAndNotPublic 117 jr.colIdxMap = jr.desc.ColumnIdxMapWithMutations(returnMutations) 118 119 columnIDs, _ := jr.index.FullColumnIDs() 120 indexCols := make([]uint32, len(columnIDs)) 121 columnTypes := jr.desc.ColumnTypesWithMutations(returnMutations) 122 for i, columnID := range columnIDs { 123 indexCols[i] = uint32(columnID) 124 } 125 126 // If the lookup columns form a key, there is only one result per lookup, so the fetcher 127 // should parallelize the key lookups it performs. 128 jr.shouldLimitBatches = !spec.LookupColumnsAreKey 129 130 if err := jr.joinerBase.init( 131 jr, 132 flowCtx, 133 processorID, 134 input.OutputTypes(), 135 columnTypes, 136 spec.Type, 137 spec.OnExpr, 138 jr.lookupCols, 139 indexCols, 140 0, /* numMergedColumns */ 141 post, 142 output, 143 execinfra.ProcStateOpts{ 144 InputsToDrain: []execinfra.RowSource{jr.input}, 145 TrailingMetaCallback: func(ctx context.Context) []execinfrapb.ProducerMetadata { 146 jr.close() 147 return jr.generateMeta(ctx) 148 }, 149 }, 150 ); err != nil { 151 return nil, err 152 } 153 154 collectingStats := false 155 if sp := opentracing.SpanFromContext(flowCtx.EvalCtx.Ctx()); sp != nil && tracing.IsRecording(sp) { 156 collectingStats = true 157 } 158 159 neededRightCols := jr.neededRightCols() 160 if isSecondary && !neededRightCols.SubsetOf(getIndexColSet(jr.index, jr.colIdxMap)) { 161 return nil, errors.Errorf("joinreader index does not cover all columns") 162 } 163 164 var fetcher row.Fetcher 165 _, _, err = initRowFetcher( 166 flowCtx, &fetcher, &jr.desc, int(spec.IndexIdx), jr.colIdxMap, false, /* reverse */ 167 neededRightCols, false /* isCheck */, &jr.alloc, spec.Visibility, spec.LockingStrength, 168 ) 169 if err != nil { 170 return nil, err 171 } 172 if collectingStats { 173 jr.input = newInputStatCollector(jr.input) 174 jr.fetcher = newRowFetcherStatCollector(&fetcher) 175 jr.FinishTrace = jr.outputStatsToTrace 176 } else { 177 jr.fetcher = &fetcher 178 } 179 180 jr.initJoinReaderStrategy(flowCtx, jr.desc.ColumnTypesWithMutations(returnMutations), len(columnIDs), spec.MaintainOrdering) 181 jr.batchSizeBytes = jr.strategy.getLookupRowsBatchSizeHint() 182 183 // TODO(radu): verify the input types match the index key types 184 return jr, nil 185 } 186 187 func (jr *joinReader) initJoinReaderStrategy( 188 flowCtx *execinfra.FlowCtx, typs []*types.T, numKeyCols int, maintainOrdering bool, 189 ) { 190 spanBuilder := span.MakeBuilder(flowCtx.Codec(), &jr.desc, jr.index) 191 spanBuilder.SetNeededColumns(jr.neededRightCols()) 192 193 spanGenerator := defaultSpanGenerator{ 194 spanBuilder: spanBuilder, 195 keyToInputRowIndices: make(map[string][]int), 196 numKeyCols: numKeyCols, 197 lookupCols: jr.lookupCols, 198 } 199 if !maintainOrdering { 200 jr.strategy = &joinReaderNoOrderingStrategy{ 201 joinerBase: &jr.joinerBase, 202 defaultSpanGenerator: spanGenerator, 203 isPartialJoin: jr.joinType == sqlbase.LeftSemiJoin || jr.joinType == sqlbase.LeftAntiJoin, 204 } 205 return 206 } 207 208 ctx := flowCtx.EvalCtx.Ctx() 209 // Limit the memory use by creating a child monitor with a hard limit. 210 // joinReader will overflow to disk if this limit is not enough. 211 limit := execinfra.GetWorkMemLimit(flowCtx.Cfg) 212 if flowCtx.Cfg.TestingKnobs.ForceDiskSpill { 213 limit = 1 214 } 215 // Initialize memory monitors and row container for looked up rows. 216 jr.MemMonitor = execinfra.NewLimitedMonitor(ctx, flowCtx.EvalCtx.Mon, flowCtx.Cfg, "joiner-limited") 217 jr.diskMonitor = execinfra.NewMonitor(ctx, flowCtx.Cfg.DiskMonitor, "joinreader-disk") 218 drc := rowcontainer.NewDiskBackedNumberedRowContainer( 219 false, /* deDup */ 220 typs, 221 jr.EvalCtx, 222 jr.FlowCtx.Cfg.TempStorage, 223 jr.MemMonitor, 224 jr.diskMonitor, 225 0, /* rowCapacity */ 226 ) 227 if limit < mon.DefaultPoolAllocationSize { 228 // The memory limit is too low for caching, most likely to force disk 229 // spilling for testing. 230 drc.DisableCache = true 231 } 232 jr.strategy = &joinReaderOrderingStrategy{ 233 joinerBase: &jr.joinerBase, 234 defaultSpanGenerator: spanGenerator, 235 isPartialJoin: jr.joinType == sqlbase.LeftSemiJoin || jr.joinType == sqlbase.LeftAntiJoin, 236 lookedUpRows: drc, 237 } 238 } 239 240 // getIndexColSet returns a set of all column indices for the given index. 241 func getIndexColSet( 242 index *sqlbase.IndexDescriptor, colIdxMap map[sqlbase.ColumnID]int, 243 ) util.FastIntSet { 244 cols := util.MakeFastIntSet() 245 err := index.RunOverAllColumns(func(id sqlbase.ColumnID) error { 246 cols.Add(colIdxMap[id]) 247 return nil 248 }) 249 if err != nil { 250 // This path should never be hit since the column function never returns an 251 // error. 252 panic(err) 253 } 254 return cols 255 } 256 257 // SetBatchSizeBytes sets the desired batch size. It should only be used in tests. 258 func (jr *joinReader) SetBatchSizeBytes(batchSize int64) { 259 jr.batchSizeBytes = batchSize 260 } 261 262 // Spilled returns whether the joinReader spilled to disk. 263 func (jr *joinReader) Spilled() bool { 264 return jr.strategy.spilled() 265 } 266 267 // neededRightCols returns the set of column indices which need to be fetched 268 // from the right side of the join (jr.desc). 269 func (jr *joinReader) neededRightCols() util.FastIntSet { 270 neededCols := jr.Out.NeededColumns() 271 272 // Get the columns from the right side of the join and shift them over by 273 // the size of the left side so the right side starts at 0. 274 neededRightCols := util.MakeFastIntSet() 275 for i, ok := neededCols.Next(len(jr.inputTypes)); ok; i, ok = neededCols.Next(i + 1) { 276 neededRightCols.Add(i - len(jr.inputTypes)) 277 } 278 279 // Add columns needed by OnExpr. 280 for _, v := range jr.onCond.Vars.GetIndexedVars() { 281 rightIdx := v.Idx - len(jr.inputTypes) 282 if rightIdx >= 0 { 283 neededRightCols.Add(rightIdx) 284 } 285 } 286 287 return neededRightCols 288 } 289 290 // Next is part of the RowSource interface. 291 func (jr *joinReader) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) { 292 // The lookup join is implemented as follows: 293 // - Read the input rows in batches. 294 // - For each batch, map the rows onto index keys and perform an index 295 // lookup for those keys. Note that multiple rows may map to the same key. 296 // - Retrieve the index lookup results in batches, since the index scan may 297 // return more rows than the input batch size. 298 // - Join the index rows with the corresponding input rows and buffer the 299 // results in jr.toEmit. 300 for jr.State == execinfra.StateRunning { 301 var row sqlbase.EncDatumRow 302 var meta *execinfrapb.ProducerMetadata 303 switch jr.runningState { 304 case jrReadingInput: 305 jr.runningState, meta = jr.readInput() 306 case jrPerformingLookup: 307 jr.runningState, meta = jr.performLookup() 308 case jrEmittingRows: 309 jr.runningState, row, meta = jr.emitRow() 310 default: 311 log.Fatalf(jr.Ctx, "unsupported state: %d", jr.runningState) 312 } 313 if row == nil && meta == nil { 314 continue 315 } 316 if meta != nil { 317 return nil, meta 318 } 319 if outRow := jr.ProcessRowHelper(row); outRow != nil { 320 return outRow, nil 321 } 322 } 323 return nil, jr.DrainHelper() 324 } 325 326 // readInput reads the next batch of input rows and starts an index scan. 327 func (jr *joinReader) readInput() (joinReaderState, *execinfrapb.ProducerMetadata) { 328 // Read the next batch of input rows. 329 for jr.curBatchSizeBytes < jr.batchSizeBytes { 330 row, meta := jr.input.Next() 331 if meta != nil { 332 if meta.Err != nil { 333 jr.MoveToDraining(nil /* err */) 334 return jrStateUnknown, meta 335 } 336 return jrReadingInput, meta 337 } 338 if row == nil { 339 break 340 } 341 jr.curBatchSizeBytes += int64(row.Size()) 342 jr.scratchInputRows = append(jr.scratchInputRows, jr.rowAlloc.CopyRow(row)) 343 } 344 345 if len(jr.scratchInputRows) == 0 { 346 log.VEventf(jr.Ctx, 1, "no more input rows") 347 // We're done. 348 jr.MoveToDraining(nil) 349 return jrStateUnknown, jr.DrainHelper() 350 } 351 log.VEventf(jr.Ctx, 1, "read %d input rows", len(jr.scratchInputRows)) 352 353 spans, err := jr.strategy.processLookupRows(jr.scratchInputRows) 354 if err != nil { 355 jr.MoveToDraining(err) 356 return jrStateUnknown, jr.DrainHelper() 357 } 358 jr.scratchInputRows = jr.scratchInputRows[:0] 359 jr.curBatchSizeBytes = 0 360 if len(spans) == 0 { 361 // All of the input rows were filtered out. Skip the index lookup. 362 return jrEmittingRows, nil 363 } 364 // Sort the spans so that we can rely upon the fetcher to limit the number of 365 // results per batch. It's safe to reorder the spans here because we already 366 // restore the original order of the output during the output collection 367 // phase. 368 sort.Sort(spans) 369 log.VEventf(jr.Ctx, 1, "scanning %d spans", len(spans)) 370 if err := jr.fetcher.StartScan( 371 jr.Ctx, jr.FlowCtx.Txn, spans, jr.shouldLimitBatches, 0, /* limitHint */ 372 jr.FlowCtx.TraceKV); err != nil { 373 jr.MoveToDraining(err) 374 return jrStateUnknown, jr.DrainHelper() 375 } 376 377 return jrPerformingLookup, nil 378 } 379 380 // performLookup reads the next batch of index rows. 381 func (jr *joinReader) performLookup() (joinReaderState, *execinfrapb.ProducerMetadata) { 382 nCols := len(jr.lookupCols) 383 384 for { 385 // Construct a "partial key" of nCols, so we can match the key format that 386 // was stored in our keyToInputRowIndices map. This matches the format that 387 // is output in jr.generateSpan. 388 key, err := jr.fetcher.PartialKey(nCols) 389 if err != nil { 390 jr.MoveToDraining(err) 391 return jrStateUnknown, jr.DrainHelper() 392 } 393 394 // Fetch the next row and copy it into the row container. 395 lookedUpRow, _, _, err := jr.fetcher.NextRow(jr.Ctx) 396 if err != nil { 397 jr.MoveToDraining(scrub.UnwrapScrubError(err)) 398 return jrStateUnknown, jr.DrainHelper() 399 } 400 if lookedUpRow == nil { 401 // Done with this input batch. 402 break 403 } 404 405 if nextState, err := jr.strategy.processLookedUpRow(jr.Ctx, lookedUpRow, key); err != nil { 406 jr.MoveToDraining(err) 407 return jrStateUnknown, jr.DrainHelper() 408 } else if nextState != jrPerformingLookup { 409 return nextState, nil 410 } 411 } 412 log.VEvent(jr.Ctx, 1, "done joining rows") 413 jr.strategy.prepareToEmit(jr.Ctx) 414 415 return jrEmittingRows, nil 416 } 417 418 // emitRow returns the next row from jr.toEmit, if present. Otherwise it 419 // prepares for another input batch. 420 func (jr *joinReader) emitRow() ( 421 joinReaderState, 422 sqlbase.EncDatumRow, 423 *execinfrapb.ProducerMetadata, 424 ) { 425 rowToEmit, nextState, err := jr.strategy.nextRowToEmit(jr.Ctx) 426 if err != nil { 427 jr.MoveToDraining(err) 428 return jrStateUnknown, nil, jr.DrainHelper() 429 } 430 return nextState, rowToEmit, nil 431 } 432 433 // Start is part of the RowSource interface. 434 func (jr *joinReader) Start(ctx context.Context) context.Context { 435 jr.input.Start(ctx) 436 ctx = jr.StartInternal(ctx, joinReaderProcName) 437 jr.runningState = jrReadingInput 438 return ctx 439 } 440 441 // ConsumerClosed is part of the RowSource interface. 442 func (jr *joinReader) ConsumerClosed() { 443 // The consumer is done, Next() will not be called again. 444 jr.close() 445 } 446 447 func (jr *joinReader) close() { 448 if jr.InternalClose() { 449 jr.strategy.close(jr.Ctx) 450 if jr.MemMonitor != nil { 451 jr.MemMonitor.Stop(jr.Ctx) 452 } 453 if jr.diskMonitor != nil { 454 jr.diskMonitor.Stop(jr.Ctx) 455 } 456 } 457 } 458 459 var _ execinfrapb.DistSQLSpanStats = &JoinReaderStats{} 460 461 const joinReaderTagPrefix = "joinreader." 462 463 // Stats implements the SpanStats interface. 464 func (jrs *JoinReaderStats) Stats() map[string]string { 465 statsMap := jrs.InputStats.Stats(joinReaderTagPrefix) 466 toMerge := jrs.IndexLookupStats.Stats(joinReaderTagPrefix + "index.") 467 for k, v := range toMerge { 468 statsMap[k] = v 469 } 470 return statsMap 471 } 472 473 // StatsForQueryPlan implements the DistSQLSpanStats interface. 474 func (jrs *JoinReaderStats) StatsForQueryPlan() []string { 475 is := append( 476 jrs.InputStats.StatsForQueryPlan(""), 477 jrs.IndexLookupStats.StatsForQueryPlan("index ")..., 478 ) 479 return is 480 } 481 482 // outputStatsToTrace outputs the collected joinReader stats to the trace. Will 483 // fail silently if the joinReader is not collecting stats. 484 func (jr *joinReader) outputStatsToTrace() { 485 is, ok := getInputStats(jr.FlowCtx, jr.input) 486 if !ok { 487 return 488 } 489 ils, ok := getFetcherInputStats(jr.FlowCtx, jr.fetcher) 490 if !ok { 491 return 492 } 493 494 // TODO(asubiotto): Add memory and disk usage to EXPLAIN ANALYZE. 495 jrs := &JoinReaderStats{ 496 InputStats: is, 497 IndexLookupStats: ils, 498 } 499 if sp := opentracing.SpanFromContext(jr.Ctx); sp != nil { 500 tracing.SetSpanStats(sp, jrs) 501 } 502 } 503 504 func (jr *joinReader) generateMeta(ctx context.Context) []execinfrapb.ProducerMetadata { 505 if tfs := execinfra.GetLeafTxnFinalState(ctx, jr.FlowCtx.Txn); tfs != nil { 506 return []execinfrapb.ProducerMetadata{{LeafTxnFinalState: tfs}} 507 } 508 return nil 509 } 510 511 // DrainMeta is part of the MetadataSource interface. 512 func (jr *joinReader) DrainMeta(ctx context.Context) []execinfrapb.ProducerMetadata { 513 return jr.generateMeta(ctx) 514 } 515 516 // ChildCount is part of the execinfra.OpNode interface. 517 func (jr *joinReader) ChildCount(verbose bool) int { 518 if _, ok := jr.input.(execinfra.OpNode); ok { 519 return 1 520 } 521 return 0 522 } 523 524 // Child is part of the execinfra.OpNode interface. 525 func (jr *joinReader) Child(nth int, verbose bool) execinfra.OpNode { 526 if nth == 0 { 527 if n, ok := jr.input.(execinfra.OpNode); ok { 528 return n 529 } 530 panic("input to joinReader is not an execinfra.OpNode") 531 } 532 panic(fmt.Sprintf("invalid index %d", nth)) 533 }