github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colflow/vectorized_flow.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package colflow
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"math"
    17  	"path/filepath"
    18  	"strconv"
    19  	"strings"
    20  	"sync"
    21  	"sync/atomic"
    22  
    23  	"github.com/cockroachdb/cockroach/pkg/col/coldata"
    24  	"github.com/cockroachdb/cockroach/pkg/col/coldataext"
    25  	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
    26  	"github.com/cockroachdb/cockroach/pkg/settings"
    27  	"github.com/cockroachdb/cockroach/pkg/sql/colcontainer"
    28  	"github.com/cockroachdb/cockroach/pkg/sql/colexec"
    29  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase"
    30  	"github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror"
    31  	"github.com/cockroachdb/cockroach/pkg/sql/colflow/colrpc"
    32  	"github.com/cockroachdb/cockroach/pkg/sql/colmem"
    33  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    34  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    35  	"github.com/cockroachdb/cockroach/pkg/sql/flowinfra"
    36  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
    37  	"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror"
    38  	"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
    39  	"github.com/cockroachdb/cockroach/pkg/sql/sessiondata"
    40  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    41  	"github.com/cockroachdb/cockroach/pkg/util"
    42  	"github.com/cockroachdb/cockroach/pkg/util/log"
    43  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    44  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    45  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    46  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    47  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    48  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    49  	"github.com/cockroachdb/errors"
    50  	"github.com/cockroachdb/logtags"
    51  	"github.com/marusama/semaphore"
    52  	opentracing "github.com/opentracing/opentracing-go"
    53  )
    54  
    55  // countingSemaphore is a semaphore that keeps track of the semaphore count from
    56  // its perspective.
    57  type countingSemaphore struct {
    58  	semaphore.Semaphore
    59  	globalCount *metric.Gauge
    60  	count       int64
    61  }
    62  
    63  func (s *countingSemaphore) Acquire(ctx context.Context, n int) error {
    64  	if err := s.Semaphore.Acquire(ctx, n); err != nil {
    65  		return err
    66  	}
    67  	atomic.AddInt64(&s.count, int64(n))
    68  	s.globalCount.Inc(int64(n))
    69  	return nil
    70  }
    71  
    72  func (s *countingSemaphore) TryAcquire(n int) bool {
    73  	success := s.Semaphore.TryAcquire(n)
    74  	if !success {
    75  		return false
    76  	}
    77  	atomic.AddInt64(&s.count, int64(n))
    78  	s.globalCount.Inc(int64(n))
    79  	return success
    80  }
    81  
    82  func (s *countingSemaphore) Release(n int) int {
    83  	atomic.AddInt64(&s.count, int64(-n))
    84  	s.globalCount.Dec(int64(n))
    85  	return s.Semaphore.Release(n)
    86  }
    87  
// vectorizedFlow is the flowinfra.Flow implementation used by the vectorized
// execution engine. It embeds a FlowBase and additionally keeps track of the
// resources (memory/disk accounts and monitors, temporary storage, file
// descriptors) that have to be released in Cleanup.
type vectorizedFlow struct {
	*flowinfra.FlowBase
	// operatorConcurrency is set if any operators are executed in parallel.
	operatorConcurrency bool

	// countingSemaphore is a wrapper over a semaphore.Semaphore that keeps track
	// of the number of resources held in a semaphore.Semaphore requested from the
	// context of this flow so that these can be released unconditionally upon
	// Cleanup.
	countingSemaphore *countingSemaphore

	// streamingMemAccounts are the memory accounts that are tracking the static
	// memory usage of the whole vectorized flow as well as all dynamic memory of
	// the streaming components.
	streamingMemAccounts []*mon.BoundAccount

	// monitors are the monitors (of both memory and disk usage) of the
	// buffering components.
	monitors []*mon.BytesMonitor
	// accounts are the accounts that are tracking the dynamic memory and disk
	// usage of the buffering components.
	accounts []*mon.BoundAccount

	// tempStorage describes this flow's temporary storage directory, which is
	// created lazily (on the first disk queue) and removed in Cleanup.
	tempStorage struct {
		// path is the path to this flow's temporary storage directory.
		path           string
		createdStateMu struct {
			syncutil.Mutex
			// created is a protected boolean that is true when the flow's temporary
			// storage directory has been created.
			created bool
		}
	}

	// testingKnobs holds hooks that are only set by tests.
	testingKnobs struct {
		// onSetupFlow is a testing knob that is called before calling
		// creator.setupFlow with the given creator.
		onSetupFlow func(*vectorizedFlowCreator)
	}
}
   128  
   129  var _ flowinfra.Flow = &vectorizedFlow{}
   130  
   131  var vectorizedFlowPool = sync.Pool{
   132  	New: func() interface{} {
   133  		return &vectorizedFlow{}
   134  	},
   135  }
   136  
   137  // NewVectorizedFlow creates a new vectorized flow given the flow base.
   138  func NewVectorizedFlow(base *flowinfra.FlowBase) flowinfra.Flow {
   139  	vf := vectorizedFlowPool.Get().(*vectorizedFlow)
   140  	vf.FlowBase = base
   141  	return vf
   142  }
   143  
   144  // VectorizeTestingBatchSize is a testing cluster setting that sets the default
   145  // batch size used by the vectorized execution engine. A low batch size is
   146  // useful to test batch reuse.
   147  var VectorizeTestingBatchSize = settings.RegisterValidatedIntSetting(
   148  	"sql.testing.vectorize.batch_size",
   149  	fmt.Sprintf("the size of a batch of rows in the vectorized engine (0=default, value must be less than %d)", coldata.MaxBatchSize),
   150  	0,
   151  	func(newBatchSize int64) error {
   152  		if newBatchSize > coldata.MaxBatchSize {
   153  			return pgerror.Newf(pgcode.InvalidParameterValue, "batch size %d may not be larger than %d", newBatchSize, coldata.MaxBatchSize)
   154  		}
   155  		return nil
   156  	},
   157  )
   158  
   159  // Setup is part of the flowinfra.Flow interface.
   160  func (f *vectorizedFlow) Setup(
   161  	ctx context.Context, spec *execinfrapb.FlowSpec, opt flowinfra.FuseOpt,
   162  ) (context.Context, error) {
   163  	var err error
   164  	ctx, err = f.FlowBase.Setup(ctx, spec, opt)
   165  	if err != nil {
   166  		return ctx, err
   167  	}
   168  	log.VEventf(ctx, 1, "setting up vectorize flow %s", f.ID.Short())
   169  	recordingStats := false
   170  	if sp := opentracing.SpanFromContext(ctx); sp != nil && tracing.IsRecording(sp) {
   171  		recordingStats = true
   172  	}
   173  	helper := &vectorizedFlowCreatorHelper{f: f.FlowBase}
   174  
   175  	testingBatchSize := int64(0)
   176  	if f.FlowCtx.Cfg.Settings != nil {
   177  		testingBatchSize = VectorizeTestingBatchSize.Get(&f.FlowCtx.Cfg.Settings.SV)
   178  	}
   179  	if testingBatchSize != 0 {
   180  		if err := coldata.SetBatchSizeForTests(int(testingBatchSize)); err != nil {
   181  			return ctx, err
   182  		}
   183  	} else {
   184  		coldata.ResetBatchSizeForTests()
   185  	}
   186  
   187  	// Create a name for this flow's temporary directory. Note that this directory
   188  	// is lazily created when necessary and cleaned up in Cleanup(). The directory
   189  	// name is the flow's ID in most cases apart from when the flow's ID is unset
   190  	// (in the case of local flows). In this case the directory will be prefixed
   191  	// with "local-flow" and a uuid is generated on the spot to provide a unique
   192  	// name.
   193  	tempDirName := f.GetID().String()
   194  	if f.GetID().Equal(uuid.Nil) {
   195  		tempDirName = "local-flow" + uuid.FastMakeV4().String()
   196  	}
   197  	f.tempStorage.path = filepath.Join(f.Cfg.TempStoragePath, tempDirName)
   198  	diskQueueCfg := colcontainer.DiskQueueCfg{
   199  		FS:   f.Cfg.TempFS,
   200  		Path: f.tempStorage.path,
   201  		OnNewDiskQueueCb: func() {
   202  			f.tempStorage.createdStateMu.Lock()
   203  			defer f.tempStorage.createdStateMu.Unlock()
   204  			if f.tempStorage.createdStateMu.created {
   205  				// The temporary storage directory has already been created.
   206  				return
   207  			}
   208  			log.VEventf(ctx, 1, "flow %s spilled to disk, stack trace: %s", f.ID, util.GetSmallTrace(2))
   209  			if err := f.Cfg.TempFS.MkdirAll(f.tempStorage.path); err != nil {
   210  				colexecerror.InternalError(errors.Errorf("unable to create temporary storage directory: %v", err))
   211  			}
   212  			f.tempStorage.createdStateMu.created = true
   213  		},
   214  	}
   215  	if err := diskQueueCfg.EnsureDefaults(); err != nil {
   216  		return ctx, err
   217  	}
   218  	f.countingSemaphore = &countingSemaphore{Semaphore: f.Cfg.VecFDSemaphore, globalCount: f.Cfg.Metrics.VecOpenFDs}
   219  	creator := newVectorizedFlowCreator(
   220  		helper,
   221  		vectorizedRemoteComponentCreator{},
   222  		recordingStats,
   223  		f.GetWaitGroup(),
   224  		f.GetSyncFlowConsumer(),
   225  		f.GetFlowCtx().Cfg.NodeDialer,
   226  		f.GetID(),
   227  		diskQueueCfg,
   228  		f.countingSemaphore,
   229  	)
   230  	if f.testingKnobs.onSetupFlow != nil {
   231  		f.testingKnobs.onSetupFlow(creator)
   232  	}
   233  	_, err = creator.setupFlow(ctx, f.GetFlowCtx(), spec.Processors, opt)
   234  	if err == nil {
   235  		f.operatorConcurrency = creator.operatorConcurrency
   236  		f.streamingMemAccounts = append(f.streamingMemAccounts, creator.streamingMemAccounts...)
   237  		f.monitors = append(f.monitors, creator.monitors...)
   238  		f.accounts = append(f.accounts, creator.accounts...)
   239  		log.VEventf(ctx, 1, "vectorized flow setup succeeded")
   240  		return ctx, nil
   241  	}
   242  	// It is (theoretically) possible that some of the memory monitoring
   243  	// infrastructure was created even in case of an error, and we need to clean
   244  	// that up.
   245  	for _, acc := range creator.streamingMemAccounts {
   246  		acc.Close(ctx)
   247  	}
   248  	for _, acc := range creator.accounts {
   249  		acc.Close(ctx)
   250  	}
   251  	for _, mon := range creator.monitors {
   252  		mon.Stop(ctx)
   253  	}
   254  	log.VEventf(ctx, 1, "failed to vectorize: %s", err)
   255  	return ctx, err
   256  }
   257  
// IsVectorized is part of the flowinfra.Flow interface. It unconditionally
// reports true for a vectorizedFlow.
func (f *vectorizedFlow) IsVectorized() bool {
	return true
}
   262  
   263  // ConcurrentExecution is part of the flowinfra.Flow interface.
   264  func (f *vectorizedFlow) ConcurrentExecution() bool {
   265  	return f.operatorConcurrency || f.FlowBase.ConcurrentExecution()
   266  }
   267  
// Release releases this vectorizedFlow back to the pool. The flow is reset to
// its zero value first, so no state (and no reference held by it) survives
// into the next use of the object.
func (f *vectorizedFlow) Release() {
	*f = vectorizedFlow{}
	vectorizedFlowPool.Put(f)
}
   273  
   274  // Cleanup is part of the flowinfra.Flow interface.
   275  func (f *vectorizedFlow) Cleanup(ctx context.Context) {
   276  	// This cleans up all the memory and disk monitoring of the vectorized flow.
   277  	for _, acc := range f.streamingMemAccounts {
   278  		acc.Close(ctx)
   279  	}
   280  	for _, acc := range f.accounts {
   281  		acc.Close(ctx)
   282  	}
   283  	for _, mon := range f.monitors {
   284  		mon.Stop(ctx)
   285  	}
   286  
   287  	f.tempStorage.createdStateMu.Lock()
   288  	created := f.tempStorage.createdStateMu.created
   289  	f.tempStorage.createdStateMu.Unlock()
   290  	if created {
   291  		if err := f.Cfg.TempFS.RemoveAll(f.tempStorage.path); err != nil {
   292  			// Log error as a Warning but keep on going to close the memory
   293  			// infrastructure.
   294  			log.Warningf(
   295  				ctx,
   296  				"unable to remove flow %s's temporary directory at %s, files may be left over: %v",
   297  				f.GetID().Short(),
   298  				f.tempStorage.path,
   299  				err,
   300  			)
   301  		}
   302  	}
   303  	// Release any leftover temporary storage file descriptors from this flow.
   304  	if unreleased := atomic.LoadInt64(&f.countingSemaphore.count); unreleased > 0 {
   305  		f.countingSemaphore.Release(int(unreleased))
   306  	}
   307  	f.FlowBase.Cleanup(ctx)
   308  	f.Release()
   309  }
   310  
   311  // wrapWithVectorizedStatsCollector creates a new
   312  // colexec.VectorizedStatsCollector that wraps op and connects the newly
   313  // created wrapper with those corresponding to operators in inputs (the latter
   314  // must have already been wrapped).
   315  func (s *vectorizedFlowCreator) wrapWithVectorizedStatsCollector(
   316  	op colexecbase.Operator,
   317  	inputs []colexecbase.Operator,
   318  	id int32,
   319  	idTagKey string,
   320  	monitors []*mon.BytesMonitor,
   321  ) (*colexec.VectorizedStatsCollector, error) {
   322  	inputWatch := timeutil.NewStopWatch()
   323  	var memMonitors, diskMonitors []*mon.BytesMonitor
   324  	for _, m := range monitors {
   325  		if m.Resource() == mon.DiskResource {
   326  			diskMonitors = append(diskMonitors, m)
   327  		} else {
   328  			memMonitors = append(memMonitors, m)
   329  		}
   330  	}
   331  	vsc := colexec.NewVectorizedStatsCollector(
   332  		op, id, idTagKey, len(inputs) == 0, inputWatch, memMonitors, diskMonitors,
   333  	)
   334  	for _, input := range inputs {
   335  		sc, ok := input.(*colexec.VectorizedStatsCollector)
   336  		if !ok {
   337  			return nil, errors.New("unexpectedly an input is not collecting stats")
   338  		}
   339  		sc.SetOutputWatch(inputWatch)
   340  	}
   341  	s.vectorizedStatsCollectorsQueue = append(s.vectorizedStatsCollectorsQueue, vsc)
   342  	return vsc, nil
   343  }
   344  
   345  // finishVectorizedStatsCollectors finishes the given stats collectors and
   346  // outputs their stats to the trace contained in the ctx's span.
   347  func finishVectorizedStatsCollectors(
   348  	ctx context.Context,
   349  	flowID execinfrapb.FlowID,
   350  	deterministicStats bool,
   351  	vectorizedStatsCollectors []*colexec.VectorizedStatsCollector,
   352  ) {
   353  	flowIDString := flowID.String()
   354  	for _, vsc := range vectorizedStatsCollectors {
   355  		vsc.OutputStats(ctx, flowIDString, deterministicStats)
   356  	}
   357  }
   358  
// runFn is the signature of a component that is run asynchronously (see
// flowCreatorHelper.accumulateAsyncComponent). It receives the context to run
// in and a cancellation function.
type runFn func(context.Context, context.CancelFunc)
   360  
// flowCreatorHelper contains all the logic needed to add the vectorized
// infrastructure to be run asynchronously as well as to perform some sanity
// checks. It is embedded into vectorizedFlowCreator, which calls these methods
// while setting up the flow.
type flowCreatorHelper interface {
	// addStreamEndpoint stores information about an inbound stream.
	addStreamEndpoint(execinfrapb.StreamID, *colrpc.Inbox, *sync.WaitGroup)
	// checkInboundStreamID checks that the provided stream ID has not been seen
	// yet.
	checkInboundStreamID(execinfrapb.StreamID) error
	// accumulateAsyncComponent stores a component (either a router or an outbox)
	// to be run asynchronously.
	accumulateAsyncComponent(runFn)
	// addMaterializer adds a materializer to the flow.
	addMaterializer(*colexec.Materializer)
	// getCancelFlowFn returns a flow cancellation function.
	getCancelFlowFn() context.CancelFunc
}
   378  
// opDAGWithMetaSources is a helper struct that stores an operator DAG as well
// as the metadataSources and closers in this DAG that need to be drained and
// closed.
type opDAGWithMetaSources struct {
	// rootOperator is the root of the operator DAG.
	rootOperator    colexecbase.Operator
	// metadataSources are the metadata sources of the DAG that need to be
	// drained.
	metadataSources []execinfrapb.MetadataSource
	// toClose are the closers of the DAG that need to be closed.
	toClose         []colexec.IdempotentCloser
}
   387  
// remoteComponentCreator is an interface that abstracts the constructors for
// several components in a remote flow. Mostly for testing purposes.
type remoteComponentCreator interface {
	// newOutbox constructs an outbox that reads from input, whose batches have
	// the given types. The metadata sources and closers are handed over to the
	// outbox.
	newOutbox(
		allocator *colmem.Allocator,
		input colexecbase.Operator,
		typs []*types.T,
		metadataSources []execinfrapb.MetadataSource,
		toClose []colexec.IdempotentCloser,
	) (*colrpc.Outbox, error)
	// newInbox constructs an inbox for the stream with the given ID and column
	// types.
	newInbox(allocator *colmem.Allocator, typs []*types.T, streamID execinfrapb.StreamID) (*colrpc.Inbox, error)
}
   400  
// vectorizedRemoteComponentCreator is the remoteComponentCreator that
// constructs the actual colrpc outboxes and inboxes. It carries no state.
type vectorizedRemoteComponentCreator struct{}
   402  
// newOutbox is part of the remoteComponentCreator interface. It forwards all
// of its arguments to colrpc.NewOutbox and returns its result unchanged.
func (vectorizedRemoteComponentCreator) newOutbox(
	allocator *colmem.Allocator,
	input colexecbase.Operator,
	typs []*types.T,
	metadataSources []execinfrapb.MetadataSource,
	toClose []colexec.IdempotentCloser,
) (*colrpc.Outbox, error) {
	return colrpc.NewOutbox(allocator, input, typs, metadataSources, toClose)
}
   412  
// newInbox is part of the remoteComponentCreator interface. It forwards all of
// its arguments to colrpc.NewInbox and returns its result unchanged.
func (vectorizedRemoteComponentCreator) newInbox(
	allocator *colmem.Allocator, typs []*types.T, streamID execinfrapb.StreamID,
) (*colrpc.Inbox, error) {
	return colrpc.NewInbox(allocator, typs, streamID)
}
   418  
// vectorizedFlowCreator performs all the setup of vectorized flows. Depending
// on embedded flowCreatorHelper, it can either do the actual setup in order
// to run the flow or do the setup needed to check that the flow is supported
// through the vectorized engine.
type vectorizedFlowCreator struct {
	flowCreatorHelper
	remoteComponentCreator

	// streamIDToInputOp maps the ID of a local stream to the operator DAG that
	// produces it, so that the DAG can be used as an input in further planning.
	streamIDToInputOp              map[execinfrapb.StreamID]opDAGWithMetaSources
	recordingStats                 bool
	vectorizedStatsCollectorsQueue []*colexec.VectorizedStatsCollector
	waitGroup                      *sync.WaitGroup
	syncFlowConsumer               execinfra.RowReceiver
	nodeDialer                     *nodedialer.Dialer
	flowID                         execinfrapb.FlowID

	// numOutboxes counts how many colrpc.Outboxes have been set up on this
	// node. It must be accessed atomically.
	numOutboxes       int32
	materializerAdded bool

	// leaves accumulates all operators that have no further outputs on the
	// current node, for the purposes of EXPLAIN output.
	leaves []execinfra.OpNode
	// operatorConcurrency is set if any operators are executed in parallel.
	operatorConcurrency bool
	// streamingMemAccounts contains all memory accounts of the non-buffering
	// components in the vectorized flow.
	streamingMemAccounts []*mon.BoundAccount
	// monitors contains all monitors (for both memory and disk usage) of the
	// buffering components in the vectorized flow.
	monitors []*mon.BytesMonitor
	// accounts contains all accounts (for both memory and disk usage) of the
	// buffering components in the vectorized flow.
	accounts []*mon.BoundAccount

	// diskQueueCfg and fdSemaphore are handed to the components that can spill
	// to disk (e.g. the hash router).
	diskQueueCfg colcontainer.DiskQueueCfg
	fdSemaphore  semaphore.Semaphore
}
   458  
   459  func newVectorizedFlowCreator(
   460  	helper flowCreatorHelper,
   461  	componentCreator remoteComponentCreator,
   462  	recordingStats bool,
   463  	waitGroup *sync.WaitGroup,
   464  	syncFlowConsumer execinfra.RowReceiver,
   465  	nodeDialer *nodedialer.Dialer,
   466  	flowID execinfrapb.FlowID,
   467  	diskQueueCfg colcontainer.DiskQueueCfg,
   468  	fdSemaphore semaphore.Semaphore,
   469  ) *vectorizedFlowCreator {
   470  	return &vectorizedFlowCreator{
   471  		flowCreatorHelper:              helper,
   472  		remoteComponentCreator:         componentCreator,
   473  		streamIDToInputOp:              make(map[execinfrapb.StreamID]opDAGWithMetaSources),
   474  		recordingStats:                 recordingStats,
   475  		vectorizedStatsCollectorsQueue: make([]*colexec.VectorizedStatsCollector, 0, 2),
   476  		waitGroup:                      waitGroup,
   477  		syncFlowConsumer:               syncFlowConsumer,
   478  		nodeDialer:                     nodeDialer,
   479  		flowID:                         flowID,
   480  		diskQueueCfg:                   diskQueueCfg,
   481  		fdSemaphore:                    fdSemaphore,
   482  	}
   483  }
   484  
   485  // createBufferingUnlimitedMemMonitor instantiates an unlimited memory monitor.
   486  // These should only be used when spilling to disk and an operator is made aware
   487  // of a memory usage limit separately.
   488  // The receiver is updated to have a reference to the unlimited memory monitor.
   489  // TODO(asubiotto): This identical to the helper function in
   490  //  NewColOperatorResult, meaning that we should probably find a way to refactor
   491  //  this.
   492  func (s *vectorizedFlowCreator) createBufferingUnlimitedMemMonitor(
   493  	ctx context.Context, flowCtx *execinfra.FlowCtx, name string,
   494  ) *mon.BytesMonitor {
   495  	bufferingOpUnlimitedMemMonitor := execinfra.NewMonitor(
   496  		ctx, flowCtx.EvalCtx.Mon, name+"-unlimited",
   497  	)
   498  	s.monitors = append(s.monitors, bufferingOpUnlimitedMemMonitor)
   499  	return bufferingOpUnlimitedMemMonitor
   500  }
   501  
   502  // createDiskAccounts instantiates an unlimited disk monitor and disk accounts
   503  // to be used for disk spilling infrastructure in vectorized engine.
   504  // TODO(azhng): consolidate all allocation monitors/account management into one
   505  // place after branch cut for 20.1.
   506  func (s *vectorizedFlowCreator) createDiskAccounts(
   507  	ctx context.Context, flowCtx *execinfra.FlowCtx, name string, numAccounts int,
   508  ) (*mon.BytesMonitor, []*mon.BoundAccount) {
   509  	diskMonitor := execinfra.NewMonitor(ctx, flowCtx.Cfg.DiskMonitor, name)
   510  	s.monitors = append(s.monitors, diskMonitor)
   511  	diskAccounts := make([]*mon.BoundAccount, numAccounts)
   512  	for i := range diskAccounts {
   513  		diskAcc := diskMonitor.MakeBoundAccount()
   514  		diskAccounts[i] = &diskAcc
   515  	}
   516  	s.accounts = append(s.accounts, diskAccounts...)
   517  	return diskMonitor, diskAccounts
   518  }
   519  
   520  // newStreamingMemAccount creates a new memory account bound to the monitor in
   521  // flowCtx and accumulates it into streamingMemAccounts slice.
   522  func (s *vectorizedFlowCreator) newStreamingMemAccount(
   523  	flowCtx *execinfra.FlowCtx,
   524  ) *mon.BoundAccount {
   525  	streamingMemAccount := flowCtx.EvalCtx.Mon.MakeBoundAccount()
   526  	s.streamingMemAccounts = append(s.streamingMemAccounts, &streamingMemAccount)
   527  	return &streamingMemAccount
   528  }
   529  
   530  // setupRemoteOutputStream sets up an Outbox that will operate according to
   531  // the given StreamEndpointSpec. It will also drain all MetadataSources in the
   532  // metadataSourcesQueue.
   533  func (s *vectorizedFlowCreator) setupRemoteOutputStream(
   534  	ctx context.Context,
   535  	flowCtx *execinfra.FlowCtx,
   536  	op colexecbase.Operator,
   537  	outputTyps []*types.T,
   538  	stream *execinfrapb.StreamEndpointSpec,
   539  	metadataSourcesQueue []execinfrapb.MetadataSource,
   540  	toClose []colexec.IdempotentCloser,
   541  	factory coldata.ColumnFactory,
   542  ) (execinfra.OpNode, error) {
   543  	// TODO(yuzefovich): we should collect some statistics on the outbox (e.g.
   544  	// number of bytes sent).
   545  	outbox, err := s.remoteComponentCreator.newOutbox(
   546  		colmem.NewAllocator(ctx, s.newStreamingMemAccount(flowCtx), factory),
   547  		op, outputTyps, metadataSourcesQueue, toClose,
   548  	)
   549  	if err != nil {
   550  		return nil, err
   551  	}
   552  	atomic.AddInt32(&s.numOutboxes, 1)
   553  	run := func(ctx context.Context, cancelFn context.CancelFunc) {
   554  		outbox.Run(ctx, s.nodeDialer, stream.TargetNodeID, s.flowID, stream.StreamID, cancelFn)
   555  		currentOutboxes := atomic.AddInt32(&s.numOutboxes, -1)
   556  		// When the last Outbox on this node exits, we want to make sure that
   557  		// everything is shutdown; namely, we need to call cancelFn if:
   558  		// - it is the last Outbox
   559  		// - there is no root materializer on this node (if it were, it would take
   560  		// care of the cancellation itself)
   561  		// - cancelFn is non-nil (it can be nil in tests).
   562  		// Calling cancelFn will cancel the context that all infrastructure on this
   563  		// node is listening on, so it will shut everything down.
   564  		if currentOutboxes == 0 && !s.materializerAdded && cancelFn != nil {
   565  			cancelFn()
   566  		}
   567  	}
   568  	s.accumulateAsyncComponent(run)
   569  	return outbox, nil
   570  }
   571  
   572  // setupRouter sets up a vectorized hash router according to the output router
   573  // spec. If the outputs are local, these are added to s.streamIDToInputOp to be
   574  // used as inputs in further planning. metadataSourcesQueue is passed along to
   575  // any outboxes created to be drained, or stored in streamIDToInputOp for any
   576  // local outputs to pass that responsibility along. In any case,
   577  // metadataSourcesQueue will always be fully consumed.
   578  // NOTE: This method supports only BY_HASH routers. Callers should handle
   579  // PASS_THROUGH routers separately.
   580  func (s *vectorizedFlowCreator) setupRouter(
   581  	ctx context.Context,
   582  	flowCtx *execinfra.FlowCtx,
   583  	input colexecbase.Operator,
   584  	outputTyps []*types.T,
   585  	output *execinfrapb.OutputRouterSpec,
   586  	metadataSourcesQueue []execinfrapb.MetadataSource,
   587  	toClose []colexec.IdempotentCloser,
   588  	factory coldata.ColumnFactory,
   589  ) error {
   590  	if output.Type != execinfrapb.OutputRouterSpec_BY_HASH {
   591  		return errors.Errorf("vectorized output router type %s unsupported", output.Type)
   592  	}
   593  
   594  	// HashRouter memory monitor names are the concatenated output stream IDs.
   595  	streamIDs := make([]string, len(output.Streams))
   596  	for i, s := range output.Streams {
   597  		streamIDs[i] = strconv.Itoa(int(s.StreamID))
   598  	}
   599  	mmName := "hash-router-[" + strings.Join(streamIDs, ",") + "]"
   600  
   601  	hashRouterMemMonitor := s.createBufferingUnlimitedMemMonitor(ctx, flowCtx, mmName)
   602  	allocators := make([]*colmem.Allocator, len(output.Streams))
   603  	for i := range allocators {
   604  		acc := hashRouterMemMonitor.MakeBoundAccount()
   605  		allocators[i] = colmem.NewAllocator(ctx, &acc, factory)
   606  		s.accounts = append(s.accounts, &acc)
   607  	}
   608  	limit := execinfra.GetWorkMemLimit(flowCtx.Cfg)
   609  	if flowCtx.Cfg.TestingKnobs.ForceDiskSpill {
   610  		limit = 1
   611  	}
   612  	diskMon, diskAccounts := s.createDiskAccounts(ctx, flowCtx, mmName, len(output.Streams))
   613  	router, outputs := colexec.NewHashRouter(
   614  		allocators, input, outputTyps, output.HashColumns, limit,
   615  		s.diskQueueCfg, s.fdSemaphore, diskAccounts, toClose,
   616  	)
   617  	runRouter := func(ctx context.Context, _ context.CancelFunc) {
   618  		logtags.AddTag(ctx, "hashRouterID", mmName)
   619  		router.Run(ctx)
   620  	}
   621  	s.accumulateAsyncComponent(runRouter)
   622  
   623  	// Append the router to the metadata sources.
   624  	metadataSourcesQueue = append(metadataSourcesQueue, router)
   625  
   626  	foundLocalOutput := false
   627  	for i, op := range outputs {
   628  		stream := &output.Streams[i]
   629  		switch stream.Type {
   630  		case execinfrapb.StreamEndpointSpec_SYNC_RESPONSE:
   631  			return errors.Errorf("unexpected sync response output when setting up router")
   632  		case execinfrapb.StreamEndpointSpec_REMOTE:
   633  			// Note that here we pass in nil 'toClose' slice because hash
   634  			// router is responsible for closing all of the idempotent closers.
   635  			if _, err := s.setupRemoteOutputStream(
   636  				ctx, flowCtx, op, outputTyps, stream, metadataSourcesQueue, nil /* toClose */, factory,
   637  			); err != nil {
   638  				return err
   639  			}
   640  		case execinfrapb.StreamEndpointSpec_LOCAL:
   641  			foundLocalOutput = true
   642  			if s.recordingStats {
   643  				mons := []*mon.BytesMonitor{hashRouterMemMonitor, diskMon}
   644  				// Wrap local outputs with vectorized stats collectors when recording
   645  				// stats. This is mostly for compatibility but will provide some useful
   646  				// information (e.g. output stall time).
   647  				var err error
   648  				op, err = s.wrapWithVectorizedStatsCollector(
   649  					op, nil /* inputs */, int32(stream.StreamID),
   650  					execinfrapb.StreamIDTagKey, mons,
   651  				)
   652  				if err != nil {
   653  					return err
   654  				}
   655  			}
   656  			s.streamIDToInputOp[stream.StreamID] = opDAGWithMetaSources{
   657  				rootOperator: op, metadataSources: metadataSourcesQueue, toClose: toClose,
   658  			}
   659  		}
   660  		// Either the metadataSourcesQueue will be drained by an outbox or we
   661  		// created an opDAGWithMetaSources to pass along these metadataSources. We don't need to
   662  		// worry about metadata sources for following iterations of the loop.
   663  		metadataSourcesQueue = nil
   664  	}
   665  	if !foundLocalOutput {
   666  		// No local output means that our router is a leaf node.
   667  		s.leaves = append(s.leaves, router)
   668  	}
   669  	return nil
   670  }
   671  
   672  // setupInput sets up one or more input operators (local or remote) and a
   673  // synchronizer to expose these separate streams as one exec.Operator which is
   674  // returned. If s.recordingStats is true, these inputs and synchronizer are
   675  // wrapped in stats collectors if not done so, although these stats are not
   676  // exposed as of yet. Inboxes that are created are also returned as
   677  // []distqlpb.MetadataSource so that any remote metadata can be read through
   678  // calling DrainMeta.
   679  func (s *vectorizedFlowCreator) setupInput(
   680  	ctx context.Context,
   681  	flowCtx *execinfra.FlowCtx,
   682  	input execinfrapb.InputSyncSpec,
   683  	opt flowinfra.FuseOpt,
   684  	factory coldata.ColumnFactory,
   685  ) (op colexecbase.Operator, _ []execinfrapb.MetadataSource, _ error) {
   686  	inputStreamOps := make([]colexecbase.Operator, 0, len(input.Streams))
   687  	metaSources := make([]execinfrapb.MetadataSource, 0, len(input.Streams))
   688  	// Before we can safely use types we received over the wire in the
   689  	// operators, we need to make sure they are hydrated. In row execution
   690  	// engine it is done during the processor initialization, but operators
   691  	// don't do that. However, all operators (apart from the colBatchScan) get
   692  	// their types from InputSyncSpec, so this is a convenient place to do the
   693  	// hydration so that all operators get the valid types.
   694  	if err := execinfrapb.HydrateTypeSlice(flowCtx.EvalCtx, input.ColumnTypes); err != nil {
   695  		return nil, nil, err
   696  	}
   697  	for _, inputStream := range input.Streams {
   698  		switch inputStream.Type {
   699  		case execinfrapb.StreamEndpointSpec_LOCAL:
   700  			in := s.streamIDToInputOp[inputStream.StreamID]
   701  			inputStreamOps = append(inputStreamOps, in.rootOperator)
   702  			metaSources = append(metaSources, in.metadataSources...)
   703  		case execinfrapb.StreamEndpointSpec_REMOTE:
   704  			// If the input is remote, the input operator does not exist in
   705  			// streamIDToInputOp. Create an inbox.
   706  			if err := s.checkInboundStreamID(inputStream.StreamID); err != nil {
   707  				return nil, nil, err
   708  			}
   709  			inbox, err := s.remoteComponentCreator.newInbox(
   710  				colmem.NewAllocator(ctx, s.newStreamingMemAccount(flowCtx), factory),
   711  				input.ColumnTypes, inputStream.StreamID,
   712  			)
   713  			if err != nil {
   714  				return nil, nil, err
   715  			}
   716  			s.addStreamEndpoint(inputStream.StreamID, inbox, s.waitGroup)
   717  			metaSources = append(metaSources, inbox)
   718  			op = inbox
   719  			if s.recordingStats {
   720  				op, err = s.wrapWithVectorizedStatsCollector(
   721  					inbox, nil /* inputs */, int32(inputStream.StreamID),
   722  					execinfrapb.StreamIDTagKey, nil, /* monitors */
   723  				)
   724  				if err != nil {
   725  					return nil, nil, err
   726  				}
   727  			}
   728  			inputStreamOps = append(inputStreamOps, op)
   729  		default:
   730  			return nil, nil, errors.Errorf("unsupported input stream type %s", inputStream.Type)
   731  		}
   732  	}
   733  	op = inputStreamOps[0]
   734  	if len(inputStreamOps) > 1 {
   735  		var err error
   736  		statsInputs := inputStreamOps
   737  		if input.Type == execinfrapb.InputSyncSpec_ORDERED {
   738  			op, err = colexec.NewOrderedSynchronizer(
   739  				colmem.NewAllocator(ctx, s.newStreamingMemAccount(flowCtx), factory),
   740  				inputStreamOps, input.ColumnTypes, execinfrapb.ConvertToColumnOrdering(input.Ordering),
   741  			)
   742  			if err != nil {
   743  				return nil, nil, err
   744  			}
   745  		} else {
   746  			if opt == flowinfra.FuseAggressively {
   747  				op = colexec.NewSerialUnorderedSynchronizer(inputStreamOps, input.ColumnTypes)
   748  			} else {
   749  				op = colexec.NewParallelUnorderedSynchronizer(inputStreamOps, input.ColumnTypes, s.waitGroup)
   750  				s.operatorConcurrency = true
   751  			}
   752  			// Don't use the unordered synchronizer's inputs for stats collection
   753  			// given that they run concurrently. The stall time will be collected
   754  			// instead.
   755  			statsInputs = nil
   756  		}
   757  		if s.recordingStats {
   758  			// TODO(asubiotto): Once we have IDs for synchronizers, plumb them into
   759  			// this stats collector to display stats.
   760  			op, err = s.wrapWithVectorizedStatsCollector(
   761  				op, statsInputs, -1 /* id */, "" /* idTagKey */, nil, /* monitors */
   762  			)
   763  			if err != nil {
   764  				return nil, nil, err
   765  			}
   766  		}
   767  	}
   768  	return op, metaSources, nil
   769  }
   770  
   771  // setupOutput sets up any necessary infrastructure according to the output
   772  // spec of pspec. The metadataSourcesQueue is fully consumed by either
   773  // connecting it to a component that can drain these MetadataSources (root
   774  // materializer or outbox) or storing it in streamIDToInputOp with the given op
   775  // to be processed later.
   776  // NOTE: The caller must not reuse the metadataSourcesQueue.
   777  func (s *vectorizedFlowCreator) setupOutput(
   778  	ctx context.Context,
   779  	flowCtx *execinfra.FlowCtx,
   780  	pspec *execinfrapb.ProcessorSpec,
   781  	op colexecbase.Operator,
   782  	opOutputTypes []*types.T,
   783  	metadataSourcesQueue []execinfrapb.MetadataSource,
   784  	toClose []colexec.IdempotentCloser,
   785  	factory coldata.ColumnFactory,
   786  ) error {
   787  	output := &pspec.Output[0]
   788  	if output.Type != execinfrapb.OutputRouterSpec_PASS_THROUGH {
   789  		return s.setupRouter(
   790  			ctx,
   791  			flowCtx,
   792  			op,
   793  			opOutputTypes,
   794  			output,
   795  			// Pass in a copy of the queue to reset metadataSourcesQueue for
   796  			// further appends without overwriting.
   797  			metadataSourcesQueue,
   798  			toClose,
   799  			factory,
   800  		)
   801  	}
   802  
   803  	if len(output.Streams) != 1 {
   804  		return errors.Errorf("unsupported multi outputstream proc (%d streams)", len(output.Streams))
   805  	}
   806  	outputStream := &output.Streams[0]
   807  	switch outputStream.Type {
   808  	case execinfrapb.StreamEndpointSpec_LOCAL:
   809  		s.streamIDToInputOp[outputStream.StreamID] = opDAGWithMetaSources{
   810  			rootOperator: op, metadataSources: metadataSourcesQueue, toClose: toClose,
   811  		}
   812  	case execinfrapb.StreamEndpointSpec_REMOTE:
   813  		// Set up an Outbox. Note that we pass in a copy of metadataSourcesQueue
   814  		// so that we can reset it below and keep on writing to it.
   815  		if s.recordingStats {
   816  			// If recording stats, we add a metadata source that will generate all
   817  			// stats data as metadata for the stats collectors created so far.
   818  			vscs := append([]*colexec.VectorizedStatsCollector(nil), s.vectorizedStatsCollectorsQueue...)
   819  			s.vectorizedStatsCollectorsQueue = s.vectorizedStatsCollectorsQueue[:0]
   820  			metadataSourcesQueue = append(
   821  				metadataSourcesQueue,
   822  				execinfrapb.CallbackMetadataSource{
   823  					DrainMetaCb: func(ctx context.Context) []execinfrapb.ProducerMetadata {
   824  						// TODO(asubiotto): Who is responsible for the recording of the
   825  						// parent context?
   826  						// Start a separate recording so that GetRecording will return
   827  						// the recordings for only the child spans containing stats.
   828  						ctx, span := tracing.ChildSpanSeparateRecording(ctx, "")
   829  						finishVectorizedStatsCollectors(
   830  							ctx, flowCtx.ID, flowCtx.Cfg.TestingKnobs.DeterministicStats, vscs,
   831  						)
   832  						return []execinfrapb.ProducerMetadata{{TraceData: tracing.GetRecording(span)}}
   833  					},
   834  				},
   835  			)
   836  		}
   837  		outbox, err :=
   838  			s.setupRemoteOutputStream(ctx, flowCtx, op, opOutputTypes, outputStream, metadataSourcesQueue, toClose, factory)
   839  		if err != nil {
   840  			return err
   841  		}
   842  		// An outbox is a leaf: there's nothing that sees it as an input on this
   843  		// node.
   844  		s.leaves = append(s.leaves, outbox)
   845  	case execinfrapb.StreamEndpointSpec_SYNC_RESPONSE:
   846  		if s.syncFlowConsumer == nil {
   847  			return errors.New("syncFlowConsumer unset, unable to create materializer")
   848  		}
   849  		// Make the materializer, which will write to the given receiver.
   850  		columnTypes := s.syncFlowConsumer.Types()
   851  		if err := assertTypesMatch(columnTypes, opOutputTypes); err != nil {
   852  			return err
   853  		}
   854  		var outputStatsToTrace func()
   855  		if s.recordingStats {
   856  			// Make a copy given that vectorizedStatsCollectorsQueue is reset and
   857  			// appended to.
   858  			vscq := append([]*colexec.VectorizedStatsCollector(nil), s.vectorizedStatsCollectorsQueue...)
   859  			outputStatsToTrace = func() {
   860  				finishVectorizedStatsCollectors(
   861  					ctx, flowCtx.ID, flowCtx.Cfg.TestingKnobs.DeterministicStats, vscq,
   862  				)
   863  			}
   864  		}
   865  		proc, err := colexec.NewMaterializer(
   866  			flowCtx,
   867  			pspec.ProcessorID,
   868  			op,
   869  			columnTypes,
   870  			s.syncFlowConsumer,
   871  			metadataSourcesQueue,
   872  			toClose,
   873  			outputStatsToTrace,
   874  			s.getCancelFlowFn,
   875  		)
   876  		if err != nil {
   877  			return err
   878  		}
   879  		s.vectorizedStatsCollectorsQueue = s.vectorizedStatsCollectorsQueue[:0]
   880  		// A materializer is a leaf.
   881  		s.leaves = append(s.leaves, proc)
   882  		s.addMaterializer(proc)
   883  		s.materializerAdded = true
   884  	default:
   885  		return errors.Errorf("unsupported output stream type %s", outputStream.Type)
   886  	}
   887  	return nil
   888  }
   889  
   890  func (s *vectorizedFlowCreator) setupFlow(
   891  	ctx context.Context,
   892  	flowCtx *execinfra.FlowCtx,
   893  	processorSpecs []execinfrapb.ProcessorSpec,
   894  	opt flowinfra.FuseOpt,
   895  ) (leaves []execinfra.OpNode, err error) {
   896  	streamIDToSpecIdx := make(map[execinfrapb.StreamID]int)
   897  	factory := coldataext.NewExtendedColumnFactory(flowCtx.NewEvalCtx())
   898  	// queue is a queue of indices into processorSpecs, for topologically
   899  	// ordered processing.
   900  	queue := make([]int, 0, len(processorSpecs))
   901  	for i := range processorSpecs {
   902  		hasLocalInput := false
   903  		for j := range processorSpecs[i].Input {
   904  			input := &processorSpecs[i].Input[j]
   905  			for k := range input.Streams {
   906  				stream := &input.Streams[k]
   907  				streamIDToSpecIdx[stream.StreamID] = i
   908  				if stream.Type != execinfrapb.StreamEndpointSpec_REMOTE {
   909  					hasLocalInput = true
   910  				}
   911  			}
   912  		}
   913  		if hasLocalInput {
   914  			continue
   915  		}
   916  		// Queue all processors with either no inputs or remote inputs.
   917  		queue = append(queue, i)
   918  	}
   919  
   920  	inputs := make([]colexecbase.Operator, 0, 2)
   921  	for len(queue) > 0 {
   922  		pspec := &processorSpecs[queue[0]]
   923  		queue = queue[1:]
   924  		if len(pspec.Output) > 1 {
   925  			return nil, errors.Errorf("unsupported multi-output proc (%d outputs)", len(pspec.Output))
   926  		}
   927  
   928  		// metadataSourcesQueue contains all the MetadataSources that need to be
   929  		// drained. If in a given loop iteration no component that can drain
   930  		// metadata from these sources is found, the metadataSourcesQueue should be
   931  		// added as part of one of the last unconnected inputDAGs in
   932  		// streamIDToInputOp. This is to avoid cycles.
   933  		metadataSourcesQueue := make([]execinfrapb.MetadataSource, 0, 1)
   934  		// toClose is similar to metadataSourcesQueue with the difference that these
   935  		// components do not produce metadata and should be Closed even during
   936  		// non-graceful termination.
   937  		toClose := make([]colexec.IdempotentCloser, 0, 1)
   938  		inputs = inputs[:0]
   939  		for i := range pspec.Input {
   940  			input, metadataSources, err := s.setupInput(ctx, flowCtx, pspec.Input[i], opt, factory)
   941  			if err != nil {
   942  				return nil, err
   943  			}
   944  			metadataSourcesQueue = append(metadataSourcesQueue, metadataSources...)
   945  			inputs = append(inputs, input)
   946  		}
   947  
   948  		args := colexec.NewColOperatorArgs{
   949  			Spec:                 pspec,
   950  			Inputs:               inputs,
   951  			StreamingMemAccount:  s.newStreamingMemAccount(flowCtx),
   952  			ProcessorConstructor: rowexec.NewProcessor,
   953  			DiskQueueCfg:         s.diskQueueCfg,
   954  			FDSemaphore:          s.fdSemaphore,
   955  		}
   956  		result, err := colexec.NewColOperator(ctx, flowCtx, args)
   957  		// Even when err is non-nil, it is possible that the buffering memory
   958  		// monitor and account have been created, so we always want to accumulate
   959  		// them for a proper cleanup.
   960  		s.monitors = append(s.monitors, result.OpMonitors...)
   961  		s.accounts = append(s.accounts, result.OpAccounts...)
   962  		if err != nil {
   963  			return nil, errors.Wrapf(err, "unable to vectorize execution plan")
   964  		}
   965  		if flowCtx.Cfg != nil && flowCtx.Cfg.TestingKnobs.EnableVectorizedInvariantsChecker {
   966  			result.Op = colexec.NewInvariantsChecker(result.Op)
   967  		}
   968  		if flowCtx.EvalCtx.SessionData.VectorizeMode == sessiondata.Vectorize201Auto &&
   969  			!result.IsStreaming {
   970  			return nil, errors.Errorf("non-streaming operator encountered when vectorize=201auto")
   971  		}
   972  		// We created a streaming memory account when calling NewColOperator above,
   973  		// so there is definitely at least one memory account, and it doesn't
   974  		// matter which one we grow.
   975  		if err = s.streamingMemAccounts[0].Grow(ctx, int64(result.InternalMemUsage)); err != nil {
   976  			return nil, errors.Wrapf(err, "not enough memory to setup vectorized plan")
   977  		}
   978  		metadataSourcesQueue = append(metadataSourcesQueue, result.MetadataSources...)
   979  		toClose = append(toClose, result.ToClose...)
   980  
   981  		op := result.Op
   982  		if s.recordingStats {
   983  			op, err = s.wrapWithVectorizedStatsCollector(
   984  				op, inputs, pspec.ProcessorID, execinfrapb.ProcessorIDTagKey, result.OpMonitors,
   985  			)
   986  			if err != nil {
   987  				return nil, err
   988  			}
   989  		}
   990  
   991  		if (flowCtx.EvalCtx.SessionData.VectorizeMode == sessiondata.Vectorize201Auto) &&
   992  			pspec.Output[0].Type == execinfrapb.OutputRouterSpec_BY_HASH {
   993  			// colexec.HashRouter is not supported when vectorize=auto since it can
   994  			// buffer an unlimited number of tuples, even though it falls back to
   995  			// disk. vectorize=on does support this.
   996  			return nil, errors.Errorf("hash router encountered when vectorize=201auto")
   997  		}
   998  		if err = s.setupOutput(
   999  			ctx, flowCtx, pspec, op, result.ColumnTypes, metadataSourcesQueue, toClose, factory,
  1000  		); err != nil {
  1001  			return nil, err
  1002  		}
  1003  
  1004  		// Now queue all outputs from this op whose inputs are already all
  1005  		// populated.
  1006  	NEXTOUTPUT:
  1007  		for i := range pspec.Output {
  1008  			for j := range pspec.Output[i].Streams {
  1009  				outputStream := &pspec.Output[i].Streams[j]
  1010  				if outputStream.Type != execinfrapb.StreamEndpointSpec_LOCAL {
  1011  					continue
  1012  				}
  1013  				procIdx, ok := streamIDToSpecIdx[outputStream.StreamID]
  1014  				if !ok {
  1015  					return nil, errors.Errorf("couldn't find stream %d", outputStream.StreamID)
  1016  				}
  1017  				outputSpec := &processorSpecs[procIdx]
  1018  				for k := range outputSpec.Input {
  1019  					for l := range outputSpec.Input[k].Streams {
  1020  						inputStream := outputSpec.Input[k].Streams[l]
  1021  						if inputStream.StreamID == outputStream.StreamID {
  1022  							if err := assertTypesMatch(outputSpec.Input[k].ColumnTypes, result.ColumnTypes); err != nil {
  1023  								return nil, err
  1024  							}
  1025  						}
  1026  						if inputStream.Type == execinfrapb.StreamEndpointSpec_REMOTE {
  1027  							// Remote streams are not present in streamIDToInputOp. The
  1028  							// Inboxes that consume these streams are created at the same time
  1029  							// as the operator that needs them, so skip the creation check for
  1030  							// this input.
  1031  							continue
  1032  						}
  1033  						if _, ok := s.streamIDToInputOp[inputStream.StreamID]; !ok {
  1034  							continue NEXTOUTPUT
  1035  						}
  1036  					}
  1037  				}
  1038  				// We found an input op for every single stream in this output. Queue
  1039  				// it for processing.
  1040  				queue = append(queue, procIdx)
  1041  			}
  1042  		}
  1043  	}
  1044  
  1045  	if len(s.vectorizedStatsCollectorsQueue) > 0 {
  1046  		colexecerror.InternalError("not all vectorized stats collectors have been processed")
  1047  	}
  1048  	return s.leaves, nil
  1049  }
  1050  
  1051  // assertTypesMatch checks whether expected types match with actual types and
  1052  // returns an error if not.
  1053  func assertTypesMatch(expected []*types.T, actual []*types.T) error {
  1054  	for i := range expected {
  1055  		if !expected[i].Identical(actual[i]) {
  1056  			return errors.Errorf("mismatched types at index %d: expected %v\tactual %v ",
  1057  				i, expected, actual,
  1058  			)
  1059  		}
  1060  	}
  1061  	return nil
  1062  }
  1063  
// vectorizedInboundStreamHandler wraps a colrpc.Inbox so that it can be used
// as a flowinfra.InboundStreamHandler.
type vectorizedInboundStreamHandler struct {
	*colrpc.Inbox
}

// Compile-time check that vectorizedInboundStreamHandler implements
// flowinfra.InboundStreamHandler.
var _ flowinfra.InboundStreamHandler = vectorizedInboundStreamHandler{}
  1069  
  1070  // Run is part of the flowinfra.InboundStreamHandler interface.
  1071  func (s vectorizedInboundStreamHandler) Run(
  1072  	ctx context.Context,
  1073  	stream execinfrapb.DistSQL_FlowStreamServer,
  1074  	_ *execinfrapb.ProducerMessage,
  1075  	_ *flowinfra.FlowBase,
  1076  ) error {
  1077  	return s.RunWithStream(ctx, stream)
  1078  }
  1079  
// Timeout is part of the flowinfra.InboundStreamHandler interface. It forwards
// err to the Timeout method of the wrapped Inbox.
func (s vectorizedInboundStreamHandler) Timeout(err error) {
	s.Inbox.Timeout(err)
}
  1084  
// vectorizedFlowCreatorHelper is a flowCreatorHelper that sets up all the
// vectorized infrastructure to be actually run.
type vectorizedFlowCreatorHelper struct {
	// f is the flow on which the created components are registered.
	f *flowinfra.FlowBase
}

// Compile-time check that *vectorizedFlowCreatorHelper implements
// flowCreatorHelper.
var _ flowCreatorHelper = &vectorizedFlowCreatorHelper{}
  1092  
  1093  func (r *vectorizedFlowCreatorHelper) addStreamEndpoint(
  1094  	streamID execinfrapb.StreamID, inbox *colrpc.Inbox, wg *sync.WaitGroup,
  1095  ) {
  1096  	r.f.AddRemoteStream(streamID, flowinfra.NewInboundStreamInfo(
  1097  		vectorizedInboundStreamHandler{inbox},
  1098  		wg,
  1099  	))
  1100  }
  1101  
// checkInboundStreamID delegates the check of sid to the CheckInboundStreamID
// method of the underlying flow and returns its result.
func (r *vectorizedFlowCreatorHelper) checkInboundStreamID(sid execinfrapb.StreamID) error {
	return r.f.CheckInboundStreamID(sid)
}
  1105  
  1106  func (r *vectorizedFlowCreatorHelper) accumulateAsyncComponent(run runFn) {
  1107  	r.f.AddStartable(
  1108  		flowinfra.StartableFn(func(ctx context.Context, wg *sync.WaitGroup, cancelFn context.CancelFunc) {
  1109  			if wg != nil {
  1110  				wg.Add(1)
  1111  			}
  1112  			go func() {
  1113  				run(ctx, cancelFn)
  1114  				if wg != nil {
  1115  					wg.Done()
  1116  				}
  1117  			}()
  1118  		}))
  1119  }
  1120  
  1121  func (r *vectorizedFlowCreatorHelper) addMaterializer(m *colexec.Materializer) {
  1122  	processors := make([]execinfra.Processor, 1)
  1123  	processors[0] = m
  1124  	r.f.SetProcessors(processors)
  1125  }
  1126  
// getCancelFlowFn returns the cancellation function obtained from the
// underlying flow.
func (r *vectorizedFlowCreatorHelper) getCancelFlowFn() context.CancelFunc {
	return r.f.GetCancelFlowFn()
}
  1130  
// noopFlowCreatorHelper is a flowCreatorHelper that only performs sanity
// checks.
type noopFlowCreatorHelper struct {
	// inboundStreams is the set of stream IDs that have been passed to
	// addStreamEndpoint; it is consulted by checkInboundStreamID.
	inboundStreams map[execinfrapb.StreamID]struct{}
}

// Compile-time check that *noopFlowCreatorHelper implements
// flowCreatorHelper.
var _ flowCreatorHelper = &noopFlowCreatorHelper{}
  1138  
  1139  func newNoopFlowCreatorHelper() *noopFlowCreatorHelper {
  1140  	return &noopFlowCreatorHelper{
  1141  		inboundStreams: make(map[execinfrapb.StreamID]struct{}),
  1142  	}
  1143  }
  1144  
// addStreamEndpoint only records streamID in the set of inbound streams; the
// inbox and the wait group are ignored.
func (r *noopFlowCreatorHelper) addStreamEndpoint(
	streamID execinfrapb.StreamID, _ *colrpc.Inbox, _ *sync.WaitGroup,
) {
	r.inboundStreams[streamID] = struct{}{}
}
  1150  
  1151  func (r *noopFlowCreatorHelper) checkInboundStreamID(sid execinfrapb.StreamID) error {
  1152  	if _, found := r.inboundStreams[sid]; found {
  1153  		return errors.Errorf("inbound stream %d already exists in map", sid)
  1154  	}
  1155  	return nil
  1156  }
  1157  
// accumulateAsyncComponent is a no-op: the given function is never run.
func (r *noopFlowCreatorHelper) accumulateAsyncComponent(runFn) {}
  1159  
// addMaterializer is a no-op: the materializer is not registered anywhere.
func (r *noopFlowCreatorHelper) addMaterializer(*colexec.Materializer) {}
  1161  
// getCancelFlowFn always returns nil.
func (r *noopFlowCreatorHelper) getCancelFlowFn() context.CancelFunc {
	return nil
}
  1165  
  1166  // SupportsVectorized checks whether flow is supported by the vectorized engine
  1167  // and returns an error if it isn't. Note that it does so by setting up the
  1168  // full flow without running the components asynchronously.
  1169  // It returns a list of the leaf operators of all flows for the purposes of
  1170  // EXPLAIN output.
  1171  // Note that passed-in output can be nil, but if it is non-nil, only Types()
  1172  // method on it might be called (nothing will actually get Push()'ed into it).
  1173  func SupportsVectorized(
  1174  	ctx context.Context,
  1175  	flowCtx *execinfra.FlowCtx,
  1176  	processorSpecs []execinfrapb.ProcessorSpec,
  1177  	fuseOpt flowinfra.FuseOpt,
  1178  	output execinfra.RowReceiver,
  1179  ) (leaves []execinfra.OpNode, err error) {
  1180  	if output == nil {
  1181  		output = &execinfra.RowChannel{}
  1182  	}
  1183  	creator := newVectorizedFlowCreator(newNoopFlowCreatorHelper(), vectorizedRemoteComponentCreator{}, false, nil, output, nil, execinfrapb.FlowID{}, colcontainer.DiskQueueCfg{}, flowCtx.Cfg.VecFDSemaphore)
  1184  	// We create an unlimited memory account because we're interested whether the
  1185  	// flow is supported via the vectorized engine in general (without paying
  1186  	// attention to the memory since it is node-dependent in the distributed
  1187  	// case).
  1188  	memoryMonitor := mon.MakeMonitor(
  1189  		"supports-vectorized",
  1190  		mon.MemoryResource,
  1191  		nil,           /* curCount */
  1192  		nil,           /* maxHist */
  1193  		-1,            /* increment */
  1194  		math.MaxInt64, /* noteworthy */
  1195  		flowCtx.Cfg.Settings,
  1196  	)
  1197  	memoryMonitor.Start(ctx, nil, mon.MakeStandaloneBudget(math.MaxInt64))
  1198  	defer memoryMonitor.Stop(ctx)
  1199  	defer func() {
  1200  		for _, acc := range creator.streamingMemAccounts {
  1201  			acc.Close(ctx)
  1202  		}
  1203  		for _, acc := range creator.accounts {
  1204  			acc.Close(ctx)
  1205  		}
  1206  		for _, mon := range creator.monitors {
  1207  			mon.Stop(ctx)
  1208  		}
  1209  	}()
  1210  	if vecErr := colexecerror.CatchVectorizedRuntimeError(func() {
  1211  		leaves, err = creator.setupFlow(ctx, flowCtx, processorSpecs, fuseOpt)
  1212  	}); vecErr != nil {
  1213  		return leaves, vecErr
  1214  	}
  1215  	return leaves, err
  1216  }
  1217  
// VectorizeAlwaysException is an object that returns whether or not execution
// should continue if vectorize=experimental_always and an error occurred when
// setting up the vectorized flow. Consider the case in which
// vectorize=experimental_always. The user must be able to unset this session
// variable without getting an error.
type VectorizeAlwaysException interface {
	// IsException returns whether this object should be an exception to the rule
	// that an inability to run this node in a vectorized flow should produce an
	// error.
	// TODO(asubiotto): This is the cleanest way I can think of to not error out
	// on SET statements when running with vectorize = experimental_always. If
	// there is a better way, we should get rid of this interface.
	IsException() bool
}