github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/flowinfra/flow_registry.go (about)

     1  // Copyright 2016 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package flowinfra
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"sync"
    17  	"time"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/base"
    20  	"github.com/cockroachdb/cockroach/pkg/settings"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    23  	"github.com/cockroachdb/cockroach/pkg/util/log"
    24  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    25  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    26  	"github.com/cockroachdb/errors"
    27  	"github.com/opentracing/opentracing-go"
    28  )
    29  
// errNoInboundStreamConnection is the error passed to the Timeout method of
// the receivers of inbound streams that were still unconnected when the
// flow's stream timer (set up in RegisterFlow) fired.
var errNoInboundStreamConnection = errors.New("no inbound stream connection")

// SettingFlowStreamTimeout is a cluster setting that sets the default flow
// stream timeout. It defaults to 10 seconds.
var SettingFlowStreamTimeout = settings.RegisterNonNegativeDurationSetting(
	"sql.distsql.flow_stream_timeout",
	"amount of time incoming streams wait for a flow to be set up before erroring out",
	10*time.Second,
)

// expectedConnectionTime is the expected time taken by a flow to connect to its
// consumers. Drain sleeps for this long before returning if flows are still
// present in the registry.
const expectedConnectionTime time.Duration = 500 * time.Millisecond
    43  
// InboundStreamInfo represents the endpoint where a data stream from another
// node connects to a flow. The external node initiates this process through a
// FlowStream RPC, which uses (*Flow).connectInboundStream() to associate the
// stream to a receiver to push rows to.
//
// Apart from receiver, the fields are mutated by the FlowRegistry while it
// holds its mutex (see the comment on flowEntry.inboundStreams).
type InboundStreamInfo struct {
	// receiver is the entity that will receive rows from another host, which is
	// part of a processor (normally an input synchronizer) for row-based
	// execution and a colrpc.Inbox for vectorized execution.
	//
	// During a FlowStream RPC, the stream is handed off to this strategy to
	// process.
	receiver  InboundStreamHandler
	connected bool // set by ConnectInboundStream; reset if its final handshake fails
	// if set, indicates that we waited too long for an inbound connection, or
	// we don't want this stream to connect anymore due to flow cancellation.
	canceled bool
	// finished is set if we have signaled that the stream is done transferring
	// rows (to the flow's wait group).
	finished bool

	// waitGroup to signal on when finished. Done() is called on it exactly once
	// per stream, either when the stream finishes or when RegisterFlow fails.
	waitGroup *sync.WaitGroup
}
    67  
    68  // NewInboundStreamInfo returns a new InboundStreamInfo.
    69  func NewInboundStreamInfo(
    70  	receiver InboundStreamHandler, waitGroup *sync.WaitGroup,
    71  ) *InboundStreamInfo {
    72  	return &InboundStreamInfo{
    73  		receiver:  receiver,
    74  		waitGroup: waitGroup,
    75  	}
    76  }
    77  
// flowEntry is a structure associated with a (potential) flow. An entry can
// exist before its flow is registered, when clients are already waiting for
// that flow to show up.
type flowEntry struct {
	// waitCh is set if one or more clients are waiting for the flow; the
	// channel gets closed when the flow is registered.
	waitCh chan struct{}

	// refCount is used to allow multiple clients to wait for a flow - if the
	// flow never shows up, the refCount is used to decide which client cleans
	// up the entry. RegisterFlow also takes a reference, which is released by
	// UnregisterFlow.
	refCount int

	// flow is the registered flow; it is nil until RegisterFlow is called for
	// this entry's id.
	flow *FlowBase

	// inboundStreams are streams that receive data from other hosts, through the
	// FlowStream API. All fields in the inboundStreamInfos are protected by the
	// FlowRegistry mutex (except the receiver, whose methods can be called
	// freely).
	inboundStreams map[execinfrapb.StreamID]*InboundStreamInfo

	// streamTimer is a timer that fires after a timeout and verifies that all
	// inbound streams have been connected. It is only created by RegisterFlow
	// when the flow has inbound streams, and it is stopped by UnregisterFlow.
	streamTimer *time.Timer
}
   101  
// FlowRegistry allows clients to look up flows by ID and to wait for flows to
// be registered. Multiple clients can wait concurrently for the same flow.
type FlowRegistry struct {
	// The embedded mutex protects the fields below; it is also the Locker used
	// by the flowDone condition variable.
	syncutil.Mutex

	// All fields in the flowEntry's are protected by the FlowRegistry mutex,
	// except flow, whose methods can be called freely.
	flows map[execinfrapb.FlowID]*flowEntry

	// draining specifies whether the FlowRegistry is in drain mode. If it is,
	// the FlowRegistry will not accept new flows. It is set at the end of Drain
	// and cleared by Undrain.
	draining bool

	// flowDone is signaled whenever the size of flows decreases.
	flowDone *sync.Cond

	// testingRunBeforeDrainSleep is a testing knob executed when a draining
	// FlowRegistry has no registered flows but must still wait for a minimum time
	// for any incoming flows to register.
	testingRunBeforeDrainSleep func()
}
   123  
   124  // NewFlowRegistry creates a new FlowRegistry.
   125  //
   126  // instID is the ID of the current node. Used for debugging; pass 0 if you don't
   127  // care.
   128  func NewFlowRegistry(instID base.SQLInstanceID) *FlowRegistry {
   129  	fr := &FlowRegistry{flows: make(map[execinfrapb.FlowID]*flowEntry)}
   130  	fr.flowDone = sync.NewCond(fr)
   131  	return fr
   132  }
   133  
   134  // getEntryLocked returns the flowEntry associated with the id. If the entry
   135  // doesn't exist, one is created and inserted into the map.
   136  // It should only be called while holding the mutex.
   137  func (fr *FlowRegistry) getEntryLocked(id execinfrapb.FlowID) *flowEntry {
   138  	entry, ok := fr.flows[id]
   139  	if !ok {
   140  		entry = &flowEntry{}
   141  		fr.flows[id] = entry
   142  	}
   143  	return entry
   144  }
   145  
   146  // releaseEntryLocked decreases the refCount in the entry for the given id, and
   147  // cleans up the entry if the refCount reaches 0.
   148  // It should only be called while holding the mutex.
   149  func (fr *FlowRegistry) releaseEntryLocked(id execinfrapb.FlowID) {
   150  	entry := fr.flows[id]
   151  	if entry.refCount > 1 {
   152  		entry.refCount--
   153  	} else {
   154  		if entry.refCount != 1 {
   155  			panic(fmt.Sprintf("invalid refCount: %d", entry.refCount))
   156  		}
   157  		delete(fr.flows, id)
   158  		fr.flowDone.Signal()
   159  	}
   160  }
   161  
   162  type flowRetryableError struct {
   163  	cause error
   164  }
   165  
   166  func (e *flowRetryableError) Error() string {
   167  	return fmt.Sprintf("flow retryable error: %+v", e.cause)
   168  }
   169  
   170  // IsFlowRetryableError returns true if an error represents a retryable
   171  // flow error.
   172  func IsFlowRetryableError(e error) bool {
   173  	return errors.HasType(e, (*flowRetryableError)(nil))
   174  }
   175  
   176  // RegisterFlow makes a flow accessible to ConnectInboundStream. Any concurrent
   177  // ConnectInboundStream calls that are waiting for this flow are woken up.
   178  //
   179  // It is expected that UnregisterFlow will be called at some point to remove the
   180  // flow from the registry.
   181  //
   182  // inboundStreams are all the remote streams that will be connected into this
   183  // flow. If any of them is not connected within timeout, errors are propagated.
   184  // The inboundStreams are expected to have been initialized with their
   185  // WaitGroups (the group should have been incremented). RegisterFlow takes
   186  // responsibility for calling Done() on that WaitGroup; this responsibility will
   187  // be forwarded forward by ConnectInboundStream. In case this method returns an
   188  // error, the WaitGroup will be decremented.
   189  func (fr *FlowRegistry) RegisterFlow(
   190  	ctx context.Context,
   191  	id execinfrapb.FlowID,
   192  	f *FlowBase,
   193  	inboundStreams map[execinfrapb.StreamID]*InboundStreamInfo,
   194  	timeout time.Duration,
   195  ) (retErr error) {
   196  	fr.Lock()
   197  	defer fr.Unlock()
   198  	defer func() {
   199  		if retErr != nil {
   200  			for _, stream := range inboundStreams {
   201  				stream.waitGroup.Done()
   202  			}
   203  		}
   204  	}()
   205  
   206  	draining := fr.draining
   207  	if f.Cfg != nil {
   208  		if knobs, ok := f.Cfg.TestingKnobs.Flowinfra.(*TestingKnobs); ok && knobs != nil && knobs.FlowRegistryDraining != nil {
   209  			draining = knobs.FlowRegistryDraining()
   210  		}
   211  	}
   212  
   213  	if draining {
   214  		return &flowRetryableError{cause: errors.Errorf(
   215  			"could not register flowID %d because the registry is draining",
   216  			id,
   217  		)}
   218  	}
   219  	entry := fr.getEntryLocked(id)
   220  	if entry.flow != nil {
   221  		return errors.Errorf(
   222  			"flow already registered: flowID: %s.\n"+
   223  				"Current flow: %+v\nExisting flow: %+v",
   224  			f.spec.FlowID, f.spec, entry.flow.spec)
   225  	}
   226  	// Take a reference that will be removed by UnregisterFlow.
   227  	entry.refCount++
   228  	entry.flow = f
   229  	entry.inboundStreams = inboundStreams
   230  	// If there are any waiters, wake them up by closing waitCh.
   231  	if entry.waitCh != nil {
   232  		close(entry.waitCh)
   233  	}
   234  
   235  	if len(inboundStreams) > 0 {
   236  		// Set up a function to time out inbound streams after a while.
   237  		entry.streamTimer = time.AfterFunc(timeout, func() {
   238  			fr.Lock()
   239  			// We're giving up waiting for these inbound streams. We will push an
   240  			// error to its consumer after fr.Unlock; the error will propagate and
   241  			// eventually drain all the processors.
   242  			timedOutReceivers := fr.cancelPendingStreamsLocked(id)
   243  			fr.Unlock()
   244  			if len(timedOutReceivers) != 0 {
   245  				// The span in the context might be finished by the time this runs. In
   246  				// principle, we could ForkCtxSpan() beforehand, but we don't want to
   247  				// create the extra span every time.
   248  				timeoutCtx := opentracing.ContextWithSpan(ctx, nil)
   249  				log.Errorf(
   250  					timeoutCtx,
   251  					"flow id:%s : %d inbound streams timed out after %s; propagated error throughout flow",
   252  					id,
   253  					len(timedOutReceivers),
   254  					timeout,
   255  				)
   256  			}
   257  			for _, r := range timedOutReceivers {
   258  				go func(r InboundStreamHandler) {
   259  					r.Timeout(errNoInboundStreamConnection)
   260  				}(r)
   261  			}
   262  		})
   263  	}
   264  	return nil
   265  }
   266  
   267  // cancelPendingStreamsLocked cancels all of the streams that haven't been
   268  // connected yet in this flow, by setting them to finished and ending their
   269  // wait group. The method returns the list of RowReceivers corresponding to the
   270  // streams that were canceled. The caller is expected to send those
   271  // RowReceivers a cancellation message - this method can't do it because sending
   272  // those messages shouldn't happen under the flow registry's lock.
   273  func (fr *FlowRegistry) cancelPendingStreamsLocked(id execinfrapb.FlowID) []InboundStreamHandler {
   274  	entry := fr.flows[id]
   275  	if entry == nil || entry.flow == nil {
   276  		return nil
   277  	}
   278  	pendingReceivers := make([]InboundStreamHandler, 0)
   279  	for streamID, is := range entry.inboundStreams {
   280  		// Connected, non-finished inbound streams will get an error
   281  		// returned in ProcessInboundStream(). Non-connected streams
   282  		// are handled below.
   283  		if !is.connected && !is.finished && !is.canceled {
   284  			is.canceled = true
   285  			pendingReceivers = append(pendingReceivers, is.receiver)
   286  			fr.finishInboundStreamLocked(id, streamID)
   287  		}
   288  	}
   289  	return pendingReceivers
   290  }
   291  
   292  // UnregisterFlow removes a flow from the registry. Any subsequent
   293  // ConnectInboundStream calls for the flow will fail to find it and time out.
   294  func (fr *FlowRegistry) UnregisterFlow(id execinfrapb.FlowID) {
   295  	fr.Lock()
   296  	entry := fr.flows[id]
   297  	if entry.streamTimer != nil {
   298  		entry.streamTimer.Stop()
   299  		entry.streamTimer = nil
   300  	}
   301  	fr.releaseEntryLocked(id)
   302  	fr.Unlock()
   303  }
   304  
   305  // waitForFlowLocked  waits until the flow with the given id gets registered -
   306  // up to the given timeout - and returns the flowEntry. If the timeout elapses,
   307  // returns nil. It should only be called while holding the mutex. The mutex is
   308  // temporarily unlocked if we need to wait.
   309  // It is illegal to call this if the flow is already connected.
   310  func (fr *FlowRegistry) waitForFlowLocked(
   311  	ctx context.Context, id execinfrapb.FlowID, timeout time.Duration,
   312  ) *flowEntry {
   313  	entry := fr.getEntryLocked(id)
   314  	if entry.flow != nil {
   315  		log.Fatalf(ctx, "waitForFlowLocked called for a flow that's already registered: %d", id)
   316  	}
   317  
   318  	// Flow not registered (at least not yet).
   319  
   320  	// Set up a channel that gets closed when the flow shows up, or when the
   321  	// timeout elapses. The channel might have been created already if there are
   322  	// other waiters for the same id.
   323  	waitCh := entry.waitCh
   324  	if waitCh == nil {
   325  		waitCh = make(chan struct{})
   326  		entry.waitCh = waitCh
   327  	}
   328  	entry.refCount++
   329  	fr.Unlock()
   330  
   331  	select {
   332  	case <-waitCh:
   333  	case <-time.After(timeout):
   334  	case <-ctx.Done():
   335  	}
   336  
   337  	fr.Lock()
   338  
   339  	fr.releaseEntryLocked(id)
   340  	if entry.flow == nil {
   341  		return nil
   342  	}
   343  
   344  	return entry
   345  }
   346  
// Drain waits at most flowDrainWait for currently running flows to finish and
// at least minFlowDrainWait for any incoming flows to be registered. If there
// are still flows active after flowDrainWait, Drain waits an extra
// expectedConnectionTime so that any flows that were registered at the end of
// the time window have a reasonable amount of time to connect to their
// consumers, thus unblocking them.
// The FlowRegistry rejects any new flows once it has finished draining.
//
// Note that since local flows are not added to the registry, they are not
// waited for. However, this is fine since there should be no local flows
// running when the FlowRegistry drains as the draining logic starts with
// draining all client connections to a node.
//
// The reporter callback, if non-nil, is called on a best effort basis
// to report work that needed to be done and which may or may not have
// been done by the time this call returns. See the explanation in
// pkg/server/drain.go for details.
func (fr *FlowRegistry) Drain(
	flowDrainWait time.Duration, minFlowDrainWait time.Duration, reporter func(int, string),
) {
	// allFlowsDone tells the timeout goroutine below to exit. It has a buffer
	// of one so that the send at the end of this method never blocks, even if
	// that goroutine has already exited through its timeout branch.
	allFlowsDone := make(chan struct{}, 1)
	start := timeutil.Now()
	// stopWaiting is set once flowDrainWait has elapsed. It is only read and
	// written while holding the registry mutex.
	stopWaiting := false

	// sleep runs the testing knob, if any, before sleeping for t.
	sleep := func(t time.Duration) {
		if fr.testingRunBeforeDrainSleep != nil {
			fr.testingRunBeforeDrainSleep()
		}
		time.Sleep(t)
	}

	// This deferred function runs on every return path, including the early
	// return below.
	defer func() {
		// At this stage, we have either hit the flowDrainWait timeout or we have no
		// flows left. We wait for an expectedConnectionTime longer so that we give
		// any flows that were registered in the
		// flowDrainWait - expectedConnectionTime window enough time to establish
		// connections to their consumers so that the consumers do not block for a
		// long time waiting for a connection to be established.
		fr.Lock()
		fr.draining = true
		if len(fr.flows) > 0 {
			// Sleep without holding the mutex.
			fr.Unlock()
			time.Sleep(expectedConnectionTime)
			fr.Lock()
		}
		fr.Unlock()
	}()

	fr.Lock()
	if len(fr.flows) == 0 {
		// Nothing is registered right now; still give incoming flows the
		// minimum time to show up, sleeping without holding the mutex.
		fr.Unlock()
		sleep(minFlowDrainWait)
		fr.Lock()
		// No flows were registered, return.
		if len(fr.flows) == 0 {
			fr.Unlock()
			return
		}
	}
	if reporter != nil {
		// Report progress to the Drain RPC.
		reporter(len(fr.flows), "distSQL execution flows")
	}

	// Once flowDrainWait elapses, this goroutine sets stopWaiting and signals
	// flowDone so that the wait loops below stop. It exits early if Drain
	// finishes first.
	go func() {
		select {
		case <-time.After(flowDrainWait):
			fr.Lock()
			stopWaiting = true
			fr.flowDone.Signal()
			fr.Unlock()
		case <-allFlowsDone:
		}
	}()

	// The mutex is held here; flowDone.Wait releases it while blocked and
	// reacquires it before returning.
	for !(stopWaiting || len(fr.flows) == 0) {
		fr.flowDone.Wait()
	}
	fr.Unlock()

	// If we spent less time waiting for all registered flows to finish, wait
	// for the minimum time for any new incoming flows and wait for these to
	// finish.
	waitTime := timeutil.Since(start)
	if waitTime < minFlowDrainWait {
		sleep(minFlowDrainWait - waitTime)
		fr.Lock()
		for !(stopWaiting || len(fr.flows) == 0) {
			fr.flowDone.Wait()
		}
		fr.Unlock()
	}

	allFlowsDone <- struct{}{}
}
   442  
   443  // Undrain causes the FlowRegistry to start accepting flows again.
   444  func (fr *FlowRegistry) Undrain() {
   445  	fr.Lock()
   446  	fr.draining = false
   447  	fr.Unlock()
   448  }
   449  
   450  // ConnectInboundStream finds the InboundStreamInfo for the given
   451  // <flowID,streamID> pair and marks it as connected. It waits up to timeout for
   452  // the stream to be registered with the registry. It also sends the handshake
   453  // messages to the producer of the stream.
   454  //
   455  // stream is the inbound stream.
   456  //
   457  // It returns the Flow that the stream is connecting to, the receiver that the
   458  // stream must push data to and a cleanup function that must be called to
   459  // unregister the flow from the registry after all the data has been pushed.
   460  //
   461  // The cleanup function will decrement the flow's WaitGroup, so that Flow.Wait()
   462  // is not blocked on this stream any more.
   463  // In case an error is returned, the cleanup function is nil, the Flow is not
   464  // considered connected and is not cleaned up.
   465  func (fr *FlowRegistry) ConnectInboundStream(
   466  	ctx context.Context,
   467  	flowID execinfrapb.FlowID,
   468  	streamID execinfrapb.StreamID,
   469  	stream execinfrapb.DistSQL_FlowStreamServer,
   470  	timeout time.Duration,
   471  ) (_ *FlowBase, _ InboundStreamHandler, _ func(), retErr error) {
   472  	fr.Lock()
   473  	defer fr.Unlock()
   474  
   475  	entry := fr.getEntryLocked(flowID)
   476  	if entry.flow == nil {
   477  		// Send the handshake message informing the producer that the consumer has
   478  		// not been scheduled yet. Another handshake will be sent below once the
   479  		// consumer has been connected.
   480  		deadline := timeutil.Now().Add(timeout)
   481  		if err := stream.Send(&execinfrapb.ConsumerSignal{
   482  			Handshake: &execinfrapb.ConsumerHandshake{
   483  				ConsumerScheduled:        false,
   484  				ConsumerScheduleDeadline: &deadline,
   485  				Version:                  execinfra.Version,
   486  				MinAcceptedVersion:       execinfra.MinAcceptedVersion,
   487  			},
   488  		}); err != nil {
   489  			// TODO(andrei): We failed to send a message to the producer; we'll return
   490  			// an error and leave this stream with connected == false so it times out
   491  			// later. We could call finishInboundStreamLocked() now so that the flow
   492  			// doesn't wait for the timeout and we could remember the error for the
   493  			// consumer if the consumer comes later, but I'm not sure what the best
   494  			// way to do that is. Similarly for the 2nd handshake message below,
   495  			// except there we already have the consumer and we can push the error.
   496  			return nil, nil, nil, err
   497  		}
   498  		entry = fr.waitForFlowLocked(ctx, flowID, timeout)
   499  		if entry == nil {
   500  			return nil, nil, nil, errors.Errorf("flow %s not found", flowID)
   501  		}
   502  	}
   503  
   504  	s, ok := entry.inboundStreams[streamID]
   505  	if !ok {
   506  		return nil, nil, nil, errors.Errorf("flow %s: no inbound stream %d", flowID, streamID)
   507  	}
   508  	if s.connected {
   509  		return nil, nil, nil, errors.Errorf("flow %s: inbound stream %d already connected", flowID, streamID)
   510  	}
   511  	if s.canceled {
   512  		return nil, nil, nil, errors.Errorf("flow %s: inbound stream %d came too late", flowID, streamID)
   513  	}
   514  
   515  	// We now mark the stream as connected but, if an error happens later because
   516  	// the handshake fails, we reset the state; we want the stream to be
   517  	// considered timed out when the moment comes just as if this connection
   518  	// attempt never happened.
   519  	s.connected = true
   520  	defer func() {
   521  		if retErr != nil {
   522  			s.connected = false
   523  		}
   524  	}()
   525  
   526  	if err := stream.Send(&execinfrapb.ConsumerSignal{
   527  		Handshake: &execinfrapb.ConsumerHandshake{
   528  			ConsumerScheduled:  true,
   529  			Version:            execinfra.Version,
   530  			MinAcceptedVersion: execinfra.MinAcceptedVersion,
   531  		},
   532  	}); err != nil {
   533  		return nil, nil, nil, err
   534  	}
   535  
   536  	cleanup := func() {
   537  		fr.Lock()
   538  		fr.finishInboundStreamLocked(flowID, streamID)
   539  		fr.Unlock()
   540  	}
   541  	return entry.flow, s.receiver, cleanup, nil
   542  }
   543  
   544  func (fr *FlowRegistry) finishInboundStreamLocked(
   545  	fid execinfrapb.FlowID, sid execinfrapb.StreamID,
   546  ) {
   547  	flowEntry := fr.getEntryLocked(fid)
   548  	streamEntry := flowEntry.inboundStreams[sid]
   549  
   550  	if !streamEntry.connected && !streamEntry.canceled {
   551  		panic("finising inbound stream that didn't connect or time out")
   552  	}
   553  	if streamEntry.finished {
   554  		panic("double finish")
   555  	}
   556  
   557  	streamEntry.finished = true
   558  	streamEntry.waitGroup.Done()
   559  }