github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowexec/hashjoiner.go (about) 1 // Copyright 2016 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package rowexec 12 13 import ( 14 "context" 15 "fmt" 16 17 "github.com/cockroachdb/cockroach/pkg/sql/execinfra" 18 "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb" 19 "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" 20 "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" 21 "github.com/cockroachdb/cockroach/pkg/sql/rowcontainer" 22 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 23 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" 24 "github.com/cockroachdb/cockroach/pkg/util/log" 25 "github.com/cockroachdb/cockroach/pkg/util/mon" 26 "github.com/cockroachdb/cockroach/pkg/util/tracing" 27 "github.com/opentracing/opentracing-go" 28 ) 29 30 // hashJoinerInitialBufferSize controls the size of the initial buffering phase 31 // (see hashJoiner). This only applies when falling back to disk is disabled. 32 const hashJoinerInitialBufferSize = 4 * 1024 * 1024 33 34 // hashJoinerState represents the state of the processor. 35 type hashJoinerState int 36 37 const ( 38 hjStateUnknown hashJoinerState = iota 39 // hjBuilding represents the state the hashJoiner is in when it is trying to 40 // determine which side to store (i.e. which side is smallest). 41 // At most hashJoinerInitialBufferSize is used to buffer rows from either 42 // side. The first input to be finished within this limit is the smallest 43 // side. If both inputs still have rows, the hashJoiner will default to 44 // storing the right side. When a side is stored, a hash map is also 45 // constructed from the equality columns to the rows. 46 hjBuilding 47 // hjConsumingStoredSide represents the state the hashJoiner is in if a small 48 // side was not found. In this case, the hashJoiner will fully consume the 49 // right side. This state is skipped if the hashJoiner determined the smallest 50 // side, since it must have fully consumed that side. 51 hjConsumingStoredSide 52 // hjReadingProbeSide represents the state the hashJoiner is in when it reads 53 // rows from the input that wasn't chosen to be stored. 54 hjReadingProbeSide 55 // hjProbingRow represents the state the hashJoiner is in when it uses a row 56 // read in hjReadingProbeSide to probe the stored hash map with. 57 hjProbingRow 58 // hjEmittingUnmatched represents the state the hashJoiner is in when it is 59 // emitting unmatched rows from its stored side after having consumed the 60 // other side. This only happens when executing a FULL OUTER, LEFT/RIGHT 61 // OUTER and ANTI joins (depending on which side we store). 62 hjEmittingUnmatched 63 ) 64 65 // hashJoiner performs a hash join. There is no guarantee on the output 66 // ordering. 67 type hashJoiner struct { 68 joinerBase 69 70 runningState hashJoinerState 71 72 diskMonitor *mon.BytesMonitor 73 74 leftSource, rightSource execinfra.RowSource 75 76 // initialBufferSize is the maximum amount of data we buffer from each stream 77 // as part of the initial buffering phase. Normally 78 // hashJoinerInitialBufferSize, can be tweaked for tests. 79 // TODO(yuzefovich): remove buffering stage from the hash joiner and always 80 // build from the right stream. 81 initialBufferSize int64 82 83 // We read a portion of both streams, in the hope that one is small. One of 84 // the containers will contain the entire "stored" stream, the other just the 85 // start of the other stream. 86 rows [2]rowcontainer.MemRowContainer 87 88 // storedSide is set by the initial buffering phase and indicates which 89 // stream we store fully and build the hashRowContainer from. 90 storedSide joinSide 91 92 // nullEquality indicates that NULL = NULL should be considered true. Used for 93 // INTERSECT and EXCEPT. 94 nullEquality bool 95 96 disableTempStorage bool 97 storedRows rowcontainer.HashRowContainer 98 99 // Used by tests to force a storedSide. 100 forcedStoredSide *joinSide 101 102 // probingRowState is state used when hjProbingRow. 103 probingRowState struct { 104 // row is the row being probed with. 105 row sqlbase.EncDatumRow 106 // iter is an iterator over the bucket that matches row on the equality 107 // columns. 108 iter rowcontainer.RowMarkerIterator 109 // matched represents whether any row that matches row on equality columns 110 // has also passed the ON condition. 111 matched bool 112 } 113 114 // emittingUnmatchedState is used when hjEmittingUnmatched. 115 emittingUnmatchedState struct { 116 iter rowcontainer.RowIterator 117 } 118 119 // Context cancellation checker. 120 cancelChecker *sqlbase.CancelChecker 121 } 122 123 var _ execinfra.Processor = &hashJoiner{} 124 var _ execinfra.RowSource = &hashJoiner{} 125 var _ execinfra.OpNode = &hashJoiner{} 126 127 const hashJoinerProcName = "hash joiner" 128 129 // newHashJoiner creates a new hash join processor. 130 // - disableTempStorage determines whether the hash joiner is allowed to spill 131 // to disk. It should only be set to 'true' in tests. 132 func newHashJoiner( 133 flowCtx *execinfra.FlowCtx, 134 processorID int32, 135 spec *execinfrapb.HashJoinerSpec, 136 leftSource execinfra.RowSource, 137 rightSource execinfra.RowSource, 138 post *execinfrapb.PostProcessSpec, 139 output execinfra.RowReceiver, 140 disableTempStorage bool, 141 ) (*hashJoiner, error) { 142 h := &hashJoiner{ 143 initialBufferSize: hashJoinerInitialBufferSize, 144 leftSource: leftSource, 145 rightSource: rightSource, 146 } 147 148 numMergedColumns := 0 149 if spec.MergedColumns { 150 numMergedColumns = len(spec.LeftEqColumns) 151 } 152 if err := h.joinerBase.init( 153 h, 154 flowCtx, 155 processorID, 156 leftSource.OutputTypes(), 157 rightSource.OutputTypes(), 158 spec.Type, 159 spec.OnExpr, 160 spec.LeftEqColumns, 161 spec.RightEqColumns, 162 uint32(numMergedColumns), 163 post, 164 output, 165 execinfra.ProcStateOpts{ 166 InputsToDrain: []execinfra.RowSource{h.leftSource, h.rightSource}, 167 TrailingMetaCallback: func(context.Context) []execinfrapb.ProducerMetadata { 168 h.close() 169 return nil 170 }, 171 }, 172 ); err != nil { 173 return nil, err 174 } 175 176 ctx := h.FlowCtx.EvalCtx.Ctx() 177 h.disableTempStorage = disableTempStorage 178 if !h.disableTempStorage { 179 // Limit the memory use by creating a child monitor with a hard limit. 180 // The hashJoiner will overflow to disk if this limit is not enough. 181 limit := execinfra.GetWorkMemLimit(flowCtx.Cfg) 182 if h.FlowCtx.Cfg.TestingKnobs.ForceDiskSpill { 183 limit = 1 184 } 185 h.MemMonitor = execinfra.NewLimitedMonitor(ctx, flowCtx.EvalCtx.Mon, flowCtx.Cfg, "hashjoiner-limited") 186 h.diskMonitor = execinfra.NewMonitor(ctx, flowCtx.Cfg.DiskMonitor, "hashjoiner-disk") 187 // Override initialBufferSize to be half of this processor's memory 188 // limit. We consume up to h.initialBufferSize bytes from each input 189 // stream. 190 h.initialBufferSize = limit / 2 191 } else { 192 h.MemMonitor = execinfra.NewMonitor(ctx, flowCtx.EvalCtx.Mon, "hashjoiner-mem") 193 } 194 195 // If the trace is recording, instrument the hashJoiner to collect stats. 196 if sp := opentracing.SpanFromContext(ctx); sp != nil && tracing.IsRecording(sp) { 197 h.leftSource = newInputStatCollector(h.leftSource) 198 h.rightSource = newInputStatCollector(h.rightSource) 199 h.FinishTrace = h.outputStatsToTrace 200 } 201 202 h.rows[leftSide].InitWithMon( 203 nil /* ordering */, h.leftSource.OutputTypes(), h.EvalCtx, h.MemMonitor, 0, /* rowCapacity */ 204 ) 205 h.rows[rightSide].InitWithMon( 206 nil /* ordering */, h.rightSource.OutputTypes(), h.EvalCtx, h.MemMonitor, 0, /* rowCapacity */ 207 ) 208 209 if h.joinType == sqlbase.IntersectAllJoin || h.joinType == sqlbase.ExceptAllJoin { 210 h.nullEquality = true 211 } 212 213 return h, nil 214 } 215 216 // Start is part of the RowSource interface. 217 func (h *hashJoiner) Start(ctx context.Context) context.Context { 218 h.leftSource.Start(ctx) 219 h.rightSource.Start(ctx) 220 ctx = h.StartInternal(ctx, hashJoinerProcName) 221 h.cancelChecker = sqlbase.NewCancelChecker(ctx) 222 h.runningState = hjBuilding 223 return ctx 224 } 225 226 // Next is part of the RowSource interface. 227 func (h *hashJoiner) Next() (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) { 228 for h.State == execinfra.StateRunning { 229 var row sqlbase.EncDatumRow 230 var meta *execinfrapb.ProducerMetadata 231 switch h.runningState { 232 case hjBuilding: 233 h.runningState, row, meta = h.build() 234 case hjConsumingStoredSide: 235 h.runningState, row, meta = h.consumeStoredSide() 236 case hjReadingProbeSide: 237 h.runningState, row, meta = h.readProbeSide() 238 case hjProbingRow: 239 h.runningState, row, meta = h.probeRow() 240 case hjEmittingUnmatched: 241 h.runningState, row, meta = h.emitUnmatched() 242 default: 243 log.Fatalf(h.Ctx, "unsupported state: %d", h.runningState) 244 } 245 246 if row == nil && meta == nil { 247 continue 248 } 249 if meta != nil { 250 return nil, meta 251 } 252 if outRow := h.ProcessRowHelper(row); outRow != nil { 253 return outRow, nil 254 } 255 } 256 return nil, h.DrainHelper() 257 } 258 259 // ConsumerClosed is part of the RowSource interface. 260 func (h *hashJoiner) ConsumerClosed() { 261 h.close() 262 } 263 264 func (h *hashJoiner) build() (hashJoinerState, sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) { 265 // setStoredSideTransition is a helper function that sets storedSide on the 266 // hashJoiner and performs initialization before a transition to 267 // hjConsumingStoredSide. 268 setStoredSideTransition := func( 269 side joinSide, 270 ) (hashJoinerState, sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata) { 271 h.storedSide = side 272 if err := h.initStoredRows(); err != nil { 273 h.MoveToDraining(err) 274 return hjStateUnknown, nil, h.DrainHelper() 275 } 276 return hjConsumingStoredSide, nil, nil 277 } 278 279 if h.forcedStoredSide != nil { 280 return setStoredSideTransition(*h.forcedStoredSide) 281 } 282 283 for { 284 leftUsage := h.rows[leftSide].MemUsage() 285 rightUsage := h.rows[rightSide].MemUsage() 286 287 if leftUsage >= h.initialBufferSize && rightUsage >= h.initialBufferSize { 288 // Both sides have reached the buffer size limit. Move on to storing and 289 // fully consuming the right side. 290 log.VEventf(h.Ctx, 1, "buffer phase found no short stream with buffer size %d", h.initialBufferSize) 291 return setStoredSideTransition(rightSide) 292 } 293 294 side := rightSide 295 if leftUsage < rightUsage { 296 side = leftSide 297 } 298 299 row, meta, emitDirectly, err := h.receiveNext(side) 300 if err != nil { 301 h.MoveToDraining(err) 302 return hjStateUnknown, nil, h.DrainHelper() 303 } else if meta != nil { 304 if meta.Err != nil { 305 h.MoveToDraining(nil /* err */) 306 return hjStateUnknown, nil, meta 307 } 308 return hjBuilding, nil, meta 309 } else if emitDirectly { 310 return hjBuilding, row, nil 311 } 312 313 if row == nil { 314 // This side has been fully consumed, it is the shortest side. 315 // If storedSide is empty, we might be able to short-circuit. 316 if h.rows[side].Len() == 0 && 317 (h.joinType == sqlbase.InnerJoin || 318 (h.joinType == sqlbase.LeftOuterJoin && side == leftSide) || 319 (h.joinType == sqlbase.RightOuterJoin && side == rightSide)) { 320 h.MoveToDraining(nil /* err */) 321 return hjStateUnknown, nil, h.DrainHelper() 322 } 323 // We could skip hjConsumingStoredSide and move straight to 324 // hjReadingProbeSide apart from the fact that hjConsumingStoredSide 325 // pre-reserves mark memory. To keep the code simple and avoid 326 // duplication, we move to hjConsumingStoredSide. 327 return setStoredSideTransition(side) 328 } 329 330 // Add the row to the correct container. 331 if err := h.rows[side].AddRow(h.Ctx, row); err != nil { 332 // If this error is a memory limit error, move to hjConsumingStoredSide. 333 h.storedSide = side 334 if sqlbase.IsOutOfMemoryError(err) { 335 if h.disableTempStorage { 336 err = pgerror.Wrapf(err, pgcode.OutOfMemory, 337 "error while attempting hashJoiner disk spill: temp storage disabled") 338 } else { 339 if err := h.initStoredRows(); err != nil { 340 h.MoveToDraining(err) 341 return hjStateUnknown, nil, h.DrainHelper() 342 } 343 addErr := h.storedRows.AddRow(h.Ctx, row) 344 if addErr == nil { 345 return hjConsumingStoredSide, nil, nil 346 } 347 err = pgerror.Wrapf(addErr, pgcode.OutOfMemory, "while spilling: %v", err) 348 } 349 } 350 h.MoveToDraining(err) 351 return hjStateUnknown, nil, h.DrainHelper() 352 } 353 } 354 } 355 356 // consumeStoredSide fully consumes the stored side and adds the rows to 357 // h.storedRows. It assumes that h.storedRows has been initialized through 358 // h.initStoredRows(). 359 func (h *hashJoiner) consumeStoredSide() ( 360 hashJoinerState, 361 sqlbase.EncDatumRow, 362 *execinfrapb.ProducerMetadata, 363 ) { 364 side := h.storedSide 365 for { 366 row, meta, emitDirectly, err := h.receiveNext(side) 367 if err != nil { 368 h.MoveToDraining(err) 369 return hjStateUnknown, nil, h.DrainHelper() 370 } else if meta != nil { 371 if meta.Err != nil { 372 h.MoveToDraining(nil /* err */) 373 return hjStateUnknown, nil, meta 374 } 375 return hjConsumingStoredSide, nil, meta 376 } else if emitDirectly { 377 return hjConsumingStoredSide, row, nil 378 } 379 380 if row == nil { 381 // The stored side has been fully consumed, move on to hjReadingProbeSide. 382 // If storedRows is in-memory, pre-reserve the memory needed to mark. 383 if rc, ok := h.storedRows.(*rowcontainer.HashMemRowContainer); ok { 384 // h.storedRows is hashMemRowContainer and not a disk backed one, so 385 // h.disableTempStorage is true and we cannot spill to disk, so we simply 386 // will return an error if it occurs. 387 err = rc.ReserveMarkMemoryMaybe(h.Ctx) 388 } else if hdbrc, ok := h.storedRows.(*rowcontainer.HashDiskBackedRowContainer); ok { 389 err = hdbrc.ReserveMarkMemoryMaybe(h.Ctx) 390 } else { 391 panic("unexpected type of storedRows in hashJoiner") 392 } 393 if err != nil { 394 h.MoveToDraining(err) 395 return hjStateUnknown, nil, h.DrainHelper() 396 } 397 return hjReadingProbeSide, nil, nil 398 } 399 400 err = h.storedRows.AddRow(h.Ctx, row) 401 // Regardless of the underlying row container (disk backed or in-memory 402 // only), we cannot do anything about an error if it occurs. 403 if err != nil { 404 h.MoveToDraining(err) 405 return hjStateUnknown, nil, h.DrainHelper() 406 } 407 } 408 } 409 410 func (h *hashJoiner) readProbeSide() ( 411 hashJoinerState, 412 sqlbase.EncDatumRow, 413 *execinfrapb.ProducerMetadata, 414 ) { 415 side := otherSide(h.storedSide) 416 417 var row sqlbase.EncDatumRow 418 // First process the rows that were already buffered. 419 if h.rows[side].Len() > 0 { 420 row = h.rows[side].EncRow(0) 421 h.rows[side].PopFirst() 422 } else { 423 var meta *execinfrapb.ProducerMetadata 424 var emitDirectly bool 425 var err error 426 row, meta, emitDirectly, err = h.receiveNext(side) 427 if err != nil { 428 h.MoveToDraining(err) 429 return hjStateUnknown, nil, h.DrainHelper() 430 } else if meta != nil { 431 if meta.Err != nil { 432 h.MoveToDraining(nil /* err */) 433 return hjStateUnknown, nil, meta 434 } 435 return hjReadingProbeSide, nil, meta 436 } else if emitDirectly { 437 return hjReadingProbeSide, row, nil 438 } 439 440 if row == nil { 441 // The probe side has been fully consumed. Move on to hjEmittingUnmatched 442 // if unmatched rows on the stored side need to be emitted, otherwise 443 // finish. 444 if shouldEmitUnmatchedRow(h.storedSide, h.joinType) { 445 i := h.storedRows.NewUnmarkedIterator(h.Ctx) 446 i.Rewind() 447 h.emittingUnmatchedState.iter = i 448 return hjEmittingUnmatched, nil, nil 449 } 450 h.MoveToDraining(nil /* err */) 451 return hjStateUnknown, nil, h.DrainHelper() 452 } 453 } 454 455 // Probe with this row. Get the iterator over the matching bucket ready for 456 // hjProbingRow. 457 h.probingRowState.row = row 458 h.probingRowState.matched = false 459 if h.probingRowState.iter == nil { 460 i, err := h.storedRows.NewBucketIterator(h.Ctx, row, h.eqCols[side]) 461 if err != nil { 462 h.MoveToDraining(err) 463 return hjStateUnknown, nil, h.DrainHelper() 464 } 465 h.probingRowState.iter = i 466 } else { 467 if err := h.probingRowState.iter.Reset(h.Ctx, row); err != nil { 468 h.MoveToDraining(err) 469 return hjStateUnknown, nil, h.DrainHelper() 470 } 471 } 472 h.probingRowState.iter.Rewind() 473 return hjProbingRow, nil, nil 474 } 475 476 func (h *hashJoiner) probeRow() ( 477 hashJoinerState, 478 sqlbase.EncDatumRow, 479 *execinfrapb.ProducerMetadata, 480 ) { 481 i := h.probingRowState.iter 482 if ok, err := i.Valid(); err != nil { 483 h.MoveToDraining(err) 484 return hjStateUnknown, nil, h.DrainHelper() 485 } else if !ok { 486 // In this case we have reached the end of the matching bucket. Check if any 487 // rows passed the ON condition. If they did, move back to 488 // hjReadingProbeSide to get the next probe row. 489 if h.probingRowState.matched { 490 return hjReadingProbeSide, nil, nil 491 } 492 // If not, this probe row is unmatched. Check if it needs to be emitted. 493 if renderedRow, shouldEmit := h.shouldEmitUnmatched( 494 h.probingRowState.row, otherSide(h.storedSide), 495 ); shouldEmit { 496 return hjReadingProbeSide, renderedRow, nil 497 } 498 return hjReadingProbeSide, nil, nil 499 } 500 501 if err := h.cancelChecker.Check(); err != nil { 502 h.MoveToDraining(err) 503 return hjStateUnknown, nil, h.DrainHelper() 504 } 505 506 row := h.probingRowState.row 507 otherRow, err := i.Row() 508 if err != nil { 509 h.MoveToDraining(err) 510 return hjStateUnknown, nil, h.DrainHelper() 511 } 512 defer i.Next() 513 514 var renderedRow sqlbase.EncDatumRow 515 if h.storedSide == rightSide { 516 renderedRow, err = h.render(row, otherRow) 517 } else { 518 renderedRow, err = h.render(otherRow, row) 519 } 520 if err != nil { 521 h.MoveToDraining(err) 522 return hjStateUnknown, nil, h.DrainHelper() 523 } 524 525 // If the ON condition failed, renderedRow is nil. 526 if renderedRow == nil { 527 return hjProbingRow, nil, nil 528 } 529 530 h.probingRowState.matched = true 531 shouldEmit := h.joinType != sqlbase.LeftAntiJoin && h.joinType != sqlbase.ExceptAllJoin 532 if shouldMark(h.storedSide, h.joinType) { 533 // Matched rows are marked on the stored side for 2 reasons. 534 // 1: For outer joins, anti joins, and EXCEPT ALL to iterate through 535 // the unmarked rows. 536 // 2: For semi-joins and INTERSECT ALL where the left-side is stored, 537 // multiple rows from the right may match to the same row on the left. 538 // The rows on the left should only be emitted the first time 539 // a right row matches it, then marked to not be emitted again. 540 // (Note: an alternative is to remove the entry from the stored 541 // side, but our containers do not support that today). 542 // TODO(peter): figure out a way to reduce this special casing below. 543 if i.IsMarked(h.Ctx) { 544 switch h.joinType { 545 case sqlbase.LeftSemiJoin: 546 shouldEmit = false 547 case sqlbase.IntersectAllJoin: 548 shouldEmit = false 549 case sqlbase.ExceptAllJoin: 550 // We want to mark a stored row if possible, so move on to the next 551 // match. Reset h.probingRowState.matched in case we don't find any more 552 // matches and want to emit this row. 553 h.probingRowState.matched = false 554 return hjProbingRow, nil, nil 555 } 556 } else if err := i.Mark(h.Ctx, true); err != nil { 557 h.MoveToDraining(err) 558 return hjStateUnknown, nil, h.DrainHelper() 559 } 560 } 561 nextState := hjProbingRow 562 if shouldShortCircuit(h.storedSide, h.joinType) { 563 nextState = hjReadingProbeSide 564 } 565 if shouldEmit { 566 if h.joinType == sqlbase.IntersectAllJoin { 567 // We found a match, so we are done with this row. 568 return hjReadingProbeSide, renderedRow, nil 569 } 570 return nextState, renderedRow, nil 571 } 572 573 return nextState, nil, nil 574 } 575 576 func (h *hashJoiner) emitUnmatched() ( 577 hashJoinerState, 578 sqlbase.EncDatumRow, 579 *execinfrapb.ProducerMetadata, 580 ) { 581 i := h.emittingUnmatchedState.iter 582 if ok, err := i.Valid(); err != nil { 583 h.MoveToDraining(err) 584 return hjStateUnknown, nil, h.DrainHelper() 585 } else if !ok { 586 // Done. 587 h.MoveToDraining(nil /* err */) 588 return hjStateUnknown, nil, h.DrainHelper() 589 } 590 591 if err := h.cancelChecker.Check(); err != nil { 592 h.MoveToDraining(err) 593 return hjStateUnknown, nil, h.DrainHelper() 594 } 595 596 row, err := i.Row() 597 if err != nil { 598 h.MoveToDraining(err) 599 return hjStateUnknown, nil, h.DrainHelper() 600 } 601 defer i.Next() 602 603 return hjEmittingUnmatched, h.renderUnmatchedRow(row, h.storedSide), nil 604 } 605 606 func (h *hashJoiner) close() { 607 if h.InternalClose() { 608 // We need to close only memRowContainer of the probe side because the 609 // stored side container will be closed by closing h.storedRows. 610 if h.storedSide == rightSide { 611 h.rows[leftSide].Close(h.Ctx) 612 } else { 613 h.rows[rightSide].Close(h.Ctx) 614 } 615 if h.storedRows != nil { 616 h.storedRows.Close(h.Ctx) 617 } else { 618 // h.storedRows has not been initialized, so we need to close the stored 619 // side container explicitly. 620 h.rows[h.storedSide].Close(h.Ctx) 621 } 622 if h.probingRowState.iter != nil { 623 h.probingRowState.iter.Close() 624 } 625 if h.emittingUnmatchedState.iter != nil { 626 h.emittingUnmatchedState.iter.Close() 627 } 628 h.MemMonitor.Stop(h.Ctx) 629 if h.diskMonitor != nil { 630 h.diskMonitor.Stop(h.Ctx) 631 } 632 } 633 } 634 635 // receiveNext reads from the source specified by side and returns the next row 636 // or metadata to be processed by the hashJoiner. Unless h.nullEquality is true, 637 // rows with NULLs in their equality columns are only returned if the joinType 638 // specifies that unmatched rows should be returned for the given side. In this 639 // case, a rendered row and true is returned, notifying the caller that the 640 // returned row may be emitted directly. 641 func (h *hashJoiner) receiveNext( 642 side joinSide, 643 ) (sqlbase.EncDatumRow, *execinfrapb.ProducerMetadata, bool, error) { 644 source := h.leftSource 645 if side == rightSide { 646 source = h.rightSource 647 } 648 for { 649 if err := h.cancelChecker.Check(); err != nil { 650 return nil, nil, false, err 651 } 652 row, meta := source.Next() 653 if meta != nil { 654 return nil, meta, false, nil 655 } else if row == nil { 656 return nil, nil, false, nil 657 } 658 // We make the explicit check for whether or not the row contained a NULL value 659 // on an equality column. The reasoning here is because of the way we expect 660 // NULL equality checks to behave (i.e. NULL != NULL) and the fact that we 661 // use the encoding of any given row as key into our bucket. Thus if we 662 // encountered a NULL row when building the hashmap we have to store in 663 // order to use it for RIGHT OUTER joins but if we encounter another 664 // NULL row when going through the left stream (probing phase), matching 665 // this with the first NULL row would be incorrect. 666 // 667 // If we have have the following: 668 // CREATE TABLE t(x INT); INSERT INTO t(x) VALUES (NULL); 669 // | x | 670 // ------ 671 // | NULL | 672 // 673 // For the following query: 674 // SELECT * FROM t AS a FULL OUTER JOIN t AS b USING(x); 675 // 676 // We expect: 677 // | x | 678 // ------ 679 // | NULL | 680 // | NULL | 681 // 682 // The following examples illustrates the behavior when joining on two 683 // or more columns, and only one of them contains NULL. 684 // If we have have the following: 685 // CREATE TABLE t(x INT, y INT); 686 // INSERT INTO t(x, y) VALUES (44,51), (NULL,52); 687 // | x | y | 688 // ------ 689 // | 44 | 51 | 690 // | NULL | 52 | 691 // 692 // For the following query: 693 // SELECT * FROM t AS a FULL OUTER JOIN t AS b USING(x, y); 694 // 695 // We expect: 696 // | x | y | 697 // ------ 698 // | 44 | 51 | 699 // | NULL | 52 | 700 // | NULL | 52 | 701 hasNull := false 702 for _, c := range h.eqCols[side] { 703 if row[c].IsNull() { 704 hasNull = true 705 break 706 } 707 } 708 // row has no NULLs in its equality columns (or we are considering NULLs to 709 // be equal), so it might match a row from the other side. 710 if !hasNull || h.nullEquality { 711 return row, nil, false, nil 712 } 713 714 if renderedRow, shouldEmit := h.shouldEmitUnmatched(row, side); shouldEmit { 715 return renderedRow, nil, true, nil 716 } 717 718 // If this point is reached, row had NULLs in its equality columns but 719 // should not be emitted. Throw it away and get the next row. 720 } 721 } 722 723 // shouldEmitUnmatched returns whether this row should be emitted if it doesn't 724 // match. If this is the case, a rendered row ready for emitting is returned as 725 // well. 726 func (h *hashJoiner) shouldEmitUnmatched( 727 row sqlbase.EncDatumRow, side joinSide, 728 ) (sqlbase.EncDatumRow, bool) { 729 if !shouldEmitUnmatchedRow(side, h.joinType) { 730 return nil, false 731 } 732 return h.renderUnmatchedRow(row, side), true 733 } 734 735 // initStoredRows initializes a hashRowContainer and sets h.storedRows. 736 func (h *hashJoiner) initStoredRows() error { 737 if !h.disableTempStorage { 738 hrc := rowcontainer.NewHashDiskBackedRowContainer( 739 &h.rows[h.storedSide], 740 h.EvalCtx, 741 h.MemMonitor, 742 h.diskMonitor, 743 h.FlowCtx.Cfg.TempStorage, 744 ) 745 h.storedRows = hrc 746 } else { 747 hrc := rowcontainer.MakeHashMemRowContainer(&h.rows[h.storedSide]) 748 h.storedRows = &hrc 749 } 750 return h.storedRows.Init( 751 h.Ctx, 752 shouldMark(h.storedSide, h.joinType), 753 h.rows[h.storedSide].Types(), 754 h.eqCols[h.storedSide], 755 h.nullEquality, 756 ) 757 } 758 759 var _ execinfrapb.DistSQLSpanStats = &HashJoinerStats{} 760 761 const hashJoinerTagPrefix = "hashjoiner." 762 763 // Stats implements the SpanStats interface. 764 func (hjs *HashJoinerStats) Stats() map[string]string { 765 // statsMap starts off as the left input stats map. 766 statsMap := hjs.LeftInputStats.Stats(hashJoinerTagPrefix + "left.") 767 rightInputStatsMap := hjs.RightInputStats.Stats(hashJoinerTagPrefix + "right.") 768 // Merge the two input maps. 769 for k, v := range rightInputStatsMap { 770 statsMap[k] = v 771 } 772 statsMap[hashJoinerTagPrefix+"stored_side"] = hjs.StoredSide 773 statsMap[hashJoinerTagPrefix+MaxMemoryTagSuffix] = humanizeutil.IBytes(hjs.MaxAllocatedMem) 774 statsMap[hashJoinerTagPrefix+MaxDiskTagSuffix] = humanizeutil.IBytes(hjs.MaxAllocatedDisk) 775 return statsMap 776 } 777 778 // StatsForQueryPlan implements the DistSQLSpanStats interface. 779 func (hjs *HashJoinerStats) StatsForQueryPlan() []string { 780 stats := hjs.LeftInputStats.StatsForQueryPlan("left ") 781 stats = append(stats, hjs.RightInputStats.StatsForQueryPlan("right ")...) 782 stats = append(stats, fmt.Sprintf("stored side: %s", hjs.StoredSide)) 783 784 if hjs.MaxAllocatedMem != 0 { 785 stats = append(stats, 786 fmt.Sprintf("%s: %s", MaxMemoryQueryPlanSuffix, humanizeutil.IBytes(hjs.MaxAllocatedMem))) 787 } 788 789 if hjs.MaxAllocatedDisk != 0 { 790 stats = append(stats, 791 fmt.Sprintf("%s: %s", MaxDiskQueryPlanSuffix, humanizeutil.IBytes(hjs.MaxAllocatedDisk))) 792 } 793 794 return stats 795 } 796 797 // outputStatsToTrace outputs the collected hashJoiner stats to the trace. Will 798 // fail silently if the hashJoiner is not collecting stats. 799 func (h *hashJoiner) outputStatsToTrace() { 800 lis, ok := getInputStats(h.FlowCtx, h.leftSource) 801 if !ok { 802 return 803 } 804 ris, ok := getInputStats(h.FlowCtx, h.rightSource) 805 if !ok { 806 return 807 } 808 if sp := opentracing.SpanFromContext(h.Ctx); sp != nil { 809 tracing.SetSpanStats( 810 sp, 811 &HashJoinerStats{ 812 LeftInputStats: lis, 813 RightInputStats: ris, 814 StoredSide: h.storedSide.String(), 815 MaxAllocatedMem: h.MemMonitor.MaximumBytes(), 816 MaxAllocatedDisk: h.diskMonitor.MaximumBytes(), 817 }, 818 ) 819 } 820 } 821 822 // Some types of joins need to mark rows that matched. 823 func shouldMark(storedSide joinSide, joinType sqlbase.JoinType) bool { 824 switch { 825 case joinType == sqlbase.LeftSemiJoin && storedSide == leftSide: 826 return true 827 case joinType == sqlbase.LeftAntiJoin && storedSide == leftSide: 828 return true 829 case joinType == sqlbase.ExceptAllJoin: 830 return true 831 case joinType == sqlbase.IntersectAllJoin: 832 return true 833 case shouldEmitUnmatchedRow(storedSide, joinType): 834 return true 835 default: 836 return false 837 } 838 } 839 840 // Some types of joins only need to know of the existence of a matching row in 841 // the storedSide, depending on the storedSide, and don't need to know all the 842 // rows. These can 'short circuit' to avoid iterating through them all. 843 func shouldShortCircuit(storedSide joinSide, joinType sqlbase.JoinType) bool { 844 switch joinType { 845 case sqlbase.LeftSemiJoin: 846 return storedSide == rightSide 847 case sqlbase.ExceptAllJoin: 848 return true 849 default: 850 return false 851 } 852 } 853 854 // ChildCount is part of the execinfra.OpNode interface. 855 func (h *hashJoiner) ChildCount(verbose bool) int { 856 if _, ok := h.leftSource.(execinfra.OpNode); ok { 857 if _, ok := h.rightSource.(execinfra.OpNode); ok { 858 return 2 859 } 860 } 861 return 0 862 } 863 864 // Child is part of the execinfra.OpNode interface. 865 func (h *hashJoiner) Child(nth int, verbose bool) execinfra.OpNode { 866 switch nth { 867 case 0: 868 if n, ok := h.leftSource.(execinfra.OpNode); ok { 869 return n 870 } 871 panic("left input to hashJoiner is not an execinfra.OpNode") 872 case 1: 873 if n, ok := h.rightSource.(execinfra.OpNode); ok { 874 return n 875 } 876 panic("right input to hashJoiner is not an execinfra.OpNode") 877 default: 878 panic(fmt.Sprintf("invalid index %d", nth)) 879 } 880 }