github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/parallel_unordered_synchronizer.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colexec
    12  
    13  import (
    14  	"context"
    15  	"sync"
    16  	"sync/atomic"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    19  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
    20  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    23  	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
    24  )
    25  
    26  // unorderedSynchronizerMsg is a light wrapper over a coldata.Batch sent over a
    27  // channel so that the main goroutine can know which input this message
    28  // originated from.
    29  type unorderedSynchronizerMsg struct {
    30  	inputIdx int
    31  	b        coldata.Batch
    32  }
    33  
    34  var _ colexecbase.Operator = &ParallelUnorderedSynchronizer{}
    35  var _ execinfra.OpNode = &ParallelUnorderedSynchronizer{}
    36  
    37  // ParallelUnorderedSynchronizer is an Operator that combines multiple Operator streams
    38  // into one.
    39  type ParallelUnorderedSynchronizer struct {
    40  	inputs []colexecbase.Operator
    41  	// readNextBatch is a slice of channels, where each channel corresponds to the
    42  	// input at the same index in inputs. It is used as a barrier for input
    43  	// goroutines to wait on until the Next goroutine signals that it is safe to
    44  	// retrieve the next batch. This is done so that inputs that are running
    45  	// asynchronously do not overwrite batches returned previously, given that
    46  	// batches must be safe for reuse until the next call to Next.
    47  	readNextBatch []chan struct{}
    48  	// numFinishedInputs is incremented atomically whenever one of the provided
    49  	// inputs exits from a goroutine (gracefully or otherwise).
    50  	numFinishedInputs uint32
    51  	// lastReadInputIdx is the index of the input whose batch we last returned.
    52  	// Used so that on the next call to Next, we can resume the input.
    53  	lastReadInputIdx int
    54  	// batches are the last batches read from the corresponding input.
    55  	batches []coldata.Batch
    56  	// nextBatch is a slice of functions each of which obtains a next batch from
    57  	// the corresponding to it input.
    58  	nextBatch []func()
    59  
    60  	initialized bool
    61  	done        bool
    62  	// externalWaitGroup refers to the WaitGroup passed in externally. Since the
    63  	// ParallelUnorderedSynchronizer spawns goroutines, this allows callers to
    64  	// wait for the completion of these goroutines.
    65  	externalWaitGroup *sync.WaitGroup
    66  	// internalWaitGroup refers to the WaitGroup internally managed by the
    67  	// ParallelUnorderedSynchronizer. This will only ever be incremented by the
    68  	// ParallelUnorderedSynchronizer and decremented by the input goroutines. This
    69  	// allows the ParallelUnorderedSynchronizer to wait only on internal
    70  	// goroutines.
    71  	internalWaitGroup *sync.WaitGroup
    72  	cancelFn          context.CancelFunc
    73  	batchCh           chan *unorderedSynchronizerMsg
    74  	errCh             chan error
    75  }
    76  
    77  // ChildCount implements the execinfra.OpNode interface.
    78  func (s *ParallelUnorderedSynchronizer) ChildCount(verbose bool) int {
    79  	return len(s.inputs)
    80  }
    81  
    82  // Child implements the execinfra.OpNode interface.
    83  func (s *ParallelUnorderedSynchronizer) Child(nth int, verbose bool) execinfra.OpNode {
    84  	return s.inputs[nth]
    85  }
    86  
    87  // NewParallelUnorderedSynchronizer creates a new ParallelUnorderedSynchronizer.
    88  // On the first call to Next, len(inputs) goroutines are spawned to read each
    89  // input asynchronously (to not be limited by a slow input). These will
    90  // increment the passed-in WaitGroup and decrement when done. It is also
    91  // guaranteed that these spawned goroutines will have completed on any error or
    92  // zero-length batch received from Next.
    93  func NewParallelUnorderedSynchronizer(
    94  	inputs []colexecbase.Operator, typs []*types.T, wg *sync.WaitGroup,
    95  ) *ParallelUnorderedSynchronizer {
    96  	readNextBatch := make([]chan struct{}, len(inputs))
    97  	for i := range readNextBatch {
    98  		// Buffer readNextBatch chans to allow for non-blocking writes. There will
    99  		// only be one message on the channel at a time.
   100  		readNextBatch[i] = make(chan struct{}, 1)
   101  	}
   102  	return &ParallelUnorderedSynchronizer{
   103  		inputs:            inputs,
   104  		readNextBatch:     readNextBatch,
   105  		batches:           make([]coldata.Batch, len(inputs)),
   106  		nextBatch:         make([]func(), len(inputs)),
   107  		externalWaitGroup: wg,
   108  		internalWaitGroup: &sync.WaitGroup{},
   109  		batchCh:           make(chan *unorderedSynchronizerMsg, len(inputs)),
   110  		// errCh is buffered so that writers do not block. If errCh is full, the
   111  		// input goroutines will not push an error and exit immediately, given that
   112  		// the Next goroutine will read an error and panic anyway.
   113  		errCh: make(chan error, 1),
   114  	}
   115  }
   116  
   117  // Init is part of the Operator interface.
   118  func (s *ParallelUnorderedSynchronizer) Init() {
   119  	for _, input := range s.inputs {
   120  		input.Init()
   121  	}
   122  }
   123  
   124  // init starts one goroutine per input to read from each input asynchronously
   125  // and push to batchCh. Canceling the context results in all goroutines
   126  // terminating, otherwise they keep on pushing batches until a zero-length batch
   127  // is encountered. Once all inputs terminate, s.batchCh is closed. If an error
   128  // occurs, the goroutines will make a non-blocking best effort to push that
   129  // error on s.errCh, resulting in the first error pushed to be observed by the
   130  // Next goroutine. Inputs are asynchronous so that the synchronizer is minimally
   131  // affected by slow inputs.
   132  func (s *ParallelUnorderedSynchronizer) init(ctx context.Context) {
   133  	ctx, s.cancelFn = contextutil.WithCancel(ctx)
   134  	for i, input := range s.inputs {
   135  		s.nextBatch[i] = func(input colexecbase.Operator, inputIdx int) func() {
   136  			return func() {
   137  				s.batches[inputIdx] = input.Next(ctx)
   138  			}
   139  		}(input, i)
   140  		s.externalWaitGroup.Add(1)
   141  		s.internalWaitGroup.Add(1)
   142  		// TODO(asubiotto): Most inputs are Inboxes, and these have handler
   143  		// goroutines just sitting around waiting for cancellation. I wonder if we
   144  		// could reuse those goroutines to push batches to batchCh directly.
   145  		go func(input colexecbase.Operator, inputIdx int) {
   146  			defer func() {
   147  				if int(atomic.AddUint32(&s.numFinishedInputs, 1)) == len(s.inputs) {
   148  					close(s.batchCh)
   149  				}
   150  				s.internalWaitGroup.Done()
   151  				s.externalWaitGroup.Done()
   152  			}()
   153  			msg := &unorderedSynchronizerMsg{
   154  				inputIdx: inputIdx,
   155  			}
   156  			for {
   157  				if err := colexecerror.CatchVectorizedRuntimeError(s.nextBatch[inputIdx]); err != nil {
   158  					select {
   159  					// Non-blocking write to errCh, if an error is present the main
   160  					// goroutine will use that and cancel all inputs.
   161  					case s.errCh <- err:
   162  					default:
   163  					}
   164  					return
   165  				}
   166  				if s.batches[inputIdx].Length() == 0 {
   167  					return
   168  				}
   169  				msg.b = s.batches[inputIdx]
   170  				select {
   171  				case <-ctx.Done():
   172  					select {
   173  					// Non-blocking write to errCh, if an error is present the main
   174  					// goroutine will use that and cancel all inputs.
   175  					case s.errCh <- ctx.Err():
   176  					default:
   177  					}
   178  					return
   179  				case s.batchCh <- msg:
   180  				}
   181  
   182  				// Wait until Next goroutine tells us we are good to go.
   183  				select {
   184  				case <-s.readNextBatch[inputIdx]:
   185  				case <-ctx.Done():
   186  					select {
   187  					// Non-blocking write to errCh, if an error is present the main
   188  					// goroutine will use that and cancel all inputs.
   189  					case s.errCh <- ctx.Err():
   190  					default:
   191  					}
   192  					return
   193  				}
   194  			}
   195  		}(input, i)
   196  	}
   197  	s.initialized = true
   198  }
   199  
   200  // Next is part of the Operator interface.
   201  func (s *ParallelUnorderedSynchronizer) Next(ctx context.Context) coldata.Batch {
   202  	if s.done {
   203  		return coldata.ZeroBatch
   204  	}
   205  	if !s.initialized {
   206  		s.init(ctx)
   207  	} else {
   208  		// Signal the input whose batch we returned in the last call to Next that it
   209  		// is safe to retrieve the next batch. Since Next has been called, we can
   210  		// reuse memory instead of making safe copies of batches returned.
   211  		s.readNextBatch[s.lastReadInputIdx] <- struct{}{}
   212  	}
   213  	select {
   214  	case err := <-s.errCh:
   215  		if err != nil {
   216  			// If we got an error from one of our inputs, cancel all inputs and
   217  			// propagate this error through a panic.
   218  			s.cancelFn()
   219  			s.internalWaitGroup.Wait()
   220  			colexecerror.InternalError(err)
   221  		}
   222  	case msg := <-s.batchCh:
   223  		if msg == nil {
   224  			// All inputs have exited, double check that this is indeed the case.
   225  			s.internalWaitGroup.Wait()
   226  			// Check if this was a graceful termination or not.
   227  			select {
   228  			case err := <-s.errCh:
   229  				if err != nil {
   230  					colexecerror.InternalError(err)
   231  				}
   232  			default:
   233  			}
   234  			s.done = true
   235  			return coldata.ZeroBatch
   236  		}
   237  		s.lastReadInputIdx = msg.inputIdx
   238  		return msg.b
   239  	}
   240  	return nil
   241  }