google.golang.org/grpc@v1.74.2/xds/internal/clients/xdsclient/ads_stream.go (about)

     1  /*
     2   *
     3   * Copyright 2025 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  package xdsclient
    20  
    21  import (
    22  	"context"
    23  	"fmt"
    24  	"sync"
    25  	"sync/atomic"
    26  	"time"
    27  
    28  	"google.golang.org/grpc/grpclog"
    29  	igrpclog "google.golang.org/grpc/internal/grpclog"
    30  	"google.golang.org/grpc/xds/internal/clients"
    31  	"google.golang.org/grpc/xds/internal/clients/internal/backoff"
    32  	"google.golang.org/grpc/xds/internal/clients/internal/buffer"
    33  	"google.golang.org/grpc/xds/internal/clients/internal/pretty"
    34  	"google.golang.org/grpc/xds/internal/clients/xdsclient/internal/xdsresource"
    35  
    36  	"google.golang.org/protobuf/proto"
    37  	"google.golang.org/protobuf/types/known/anypb"
    38  
    39  	v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
    40  	v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
    41  	cpb "google.golang.org/genproto/googleapis/rpc/code"
    42  	statuspb "google.golang.org/genproto/googleapis/rpc/status"
    43  )
    44  
    45  const (
    46  	// Any per-RPC level logs which print complete request or response messages
    47  	// should be gated at this verbosity level. Other per-RPC level logs which print
    48  	// terse output should be at `INFO` and verbosity 2.
    49  	perRPCVerbosityLevel = 9
    50  )
    51  
// response represents a response received on the ADS stream. It carries the
// type URL, version, and raw resources taken from the DiscoveryResponse, and is
// the value handed to the event handler's onResponse callback.
type response struct {
	typeURL   string       // Type URL of the resources contained in the response.
	version   string       // Version info reported by the server for this response.
	resources []*anypb.Any // Resources from the response, still wrapped as Any.
}
    59  
// dataAndErrTuple is a struct that holds a resource and an error. It is used to
// return a resource and any associated error from a function as a single
// value.
type dataAndErrTuple struct {
	Resource ResourceData // The resource, if any.
	Err      error        // The error associated with the resource, if any.
}
    66  
// adsStreamEventHandler is an interface that defines the callbacks for events that
// occur on the ADS stream. Methods on this interface may be invoked
// concurrently and implementations need to handle them in a thread-safe manner.
//
// For onResponse, the func argument is the flow control "done" callback, and
// the returned error decides whether the response is ACKed (nil) or NACKed
// (non-nil); the returned names are the resources found in the response.
type adsStreamEventHandler interface {
	onStreamError(error)                           // Called when the ADS stream breaks.
	onWatchExpiry(ResourceType, string)            // Called when the watch timer expires for a resource.
	onResponse(response, func()) ([]string, error) // Called when a response is received on the ADS stream.
}
    75  
// resourceTypeState is the per-resource-type state tracked by the ADS stream:
// the ACKed version, the latest nonce, the set of subscribed resources, and
// bookkeeping for writes that are pending or were buffered due to flow control.
type resourceTypeState struct {
	version             string                                     // Last acked version. Should not be reset when the stream breaks.
	nonce               string                                     // Last received nonce. Should be reset when the stream breaks.
	bufferedRequests    chan struct{}                              // Channel to buffer requests when writing is blocked.
	subscribedResources map[string]*xdsresource.ResourceWatchState // Map of subscribed resource names to their state.
	pendingWrite        bool                                       // True if there is a pending write for this resource type.
}
    84  
// adsStreamImpl provides the functionality associated with an ADS (Aggregated
// Discovery Service) stream on the client side. It manages the lifecycle of the
// ADS stream, including creating the stream, sending requests, and handling
// responses. It also handles flow control and retries for the stream.
//
// Two goroutines cooperate: `runner` creates streams and reads from them, and
// `send` writes discovery requests on the most recently created stream.
type adsStreamImpl struct {
	// The following fields are initialized from arguments passed to the
	// constructor and are read-only afterwards, and hence can be accessed
	// without a mutex.
	transport          clients.Transport       // Transport to use for ADS stream.
	eventHandler       adsStreamEventHandler   // Callbacks into the xdsChannel.
	backoff            func(int) time.Duration // Backoff for retries, after stream failures.
	nodeProto          *v3corepb.Node          // Identifies the gRPC application.
	watchExpiryTimeout time.Duration           // Resource watch expiry timeout
	logger             *igrpclog.PrefixLogger

	// The following fields are initialized in the constructor and are not
	// written to afterwards, and hence can be accessed without a mutex.
	streamCh     chan clients.Stream // New ADS streams are pushed here.
	requestCh    *buffer.Unbounded   // Subscriptions and unsubscriptions are pushed here.
	runnerDoneCh chan struct{}       // Notify completion of runner goroutine.
	cancel       context.CancelFunc  // To cancel the context passed to the runner goroutine.

	// Guards access to the below fields (and to the contents of the map).
	mu                sync.Mutex
	resourceTypeState map[ResourceType]*resourceTypeState // Map of resource types to their state.
	fc                *adsFlowControl                     // Flow control for ADS stream.
	firstRequest      bool                                // False after the first request is sent out.
}
   113  
// adsStreamOpts contains the options for creating a new ADS Stream. All fields
// are copied into the adsStreamImpl by newADSStreamImpl, except logPrefix which
// is used to build the stream's logger.
type adsStreamOpts struct {
	transport          clients.Transport       // xDS transport to create the stream on.
	eventHandler       adsStreamEventHandler   // Callbacks for stream events.
	backoff            func(int) time.Duration // Backoff for retries, after stream failures.
	nodeProto          *v3corepb.Node          // Node proto to identify the gRPC application.
	watchExpiryTimeout time.Duration           // Resource watch expiry timeout.
	logPrefix          string                  // Prefix to be used for log messages.
}
   123  
   124  // newADSStreamImpl initializes a new adsStreamImpl instance using the given
   125  // parameters.  It also launches goroutines responsible for managing reads and
   126  // writes for messages of the underlying stream.
   127  func newADSStreamImpl(opts adsStreamOpts) *adsStreamImpl {
   128  	s := &adsStreamImpl{
   129  		transport:          opts.transport,
   130  		eventHandler:       opts.eventHandler,
   131  		backoff:            opts.backoff,
   132  		nodeProto:          opts.nodeProto,
   133  		watchExpiryTimeout: opts.watchExpiryTimeout,
   134  
   135  		streamCh:          make(chan clients.Stream, 1),
   136  		requestCh:         buffer.NewUnbounded(),
   137  		runnerDoneCh:      make(chan struct{}),
   138  		resourceTypeState: make(map[ResourceType]*resourceTypeState),
   139  	}
   140  
   141  	l := grpclog.Component("xds")
   142  	s.logger = igrpclog.NewPrefixLogger(l, opts.logPrefix+fmt.Sprintf("[ads-stream %p] ", s))
   143  
   144  	ctx, cancel := context.WithCancel(context.Background())
   145  	s.cancel = cancel
   146  	go s.runner(ctx)
   147  	return s
   148  }
   149  
// Stop shuts down the ADS stream. It cancels the context passed to the runner
// goroutine, closes the request channel, and then blocks until the runner
// goroutine signals completion on runnerDoneCh.
//
// NOTE(review): only the runner goroutine is explicitly awaited here. The
// send goroutine started by runner returns on context cancellation or when
// the request channel is closed, but nothing in this method waits for it.
func (s *adsStreamImpl) Stop() {
	s.cancel()
	s.requestCh.Close()
	<-s.runnerDoneCh
	s.logger.Infof("Shutdown ADS stream")
}
   157  
   158  // subscribe subscribes to the given resource. It is assumed that multiple
   159  // subscriptions for the same resource is deduped at the caller. A discovery
   160  // request is sent out on the underlying stream for the resource type when there
   161  // is sufficient flow control quota.
   162  func (s *adsStreamImpl) subscribe(typ ResourceType, name string) {
   163  	if s.logger.V(2) {
   164  		s.logger.Infof("Subscribing to resource %q of type %q", name, typ.TypeName)
   165  	}
   166  
   167  	s.mu.Lock()
   168  	defer s.mu.Unlock()
   169  
   170  	state, ok := s.resourceTypeState[typ]
   171  	if !ok {
   172  		// An entry in the type state map is created as part of the first
   173  		// subscription request for this type.
   174  		state = &resourceTypeState{
   175  			subscribedResources: make(map[string]*xdsresource.ResourceWatchState),
   176  			bufferedRequests:    make(chan struct{}, 1),
   177  		}
   178  		s.resourceTypeState[typ] = state
   179  	}
   180  
   181  	// Create state for the newly subscribed resource. The watch timer will
   182  	// be started when a request for this resource is actually sent out.
   183  	state.subscribedResources[name] = &xdsresource.ResourceWatchState{State: xdsresource.ResourceWatchStateStarted}
   184  	state.pendingWrite = true
   185  
   186  	// Send a request for the resource type with updated subscriptions.
   187  	s.requestCh.Put(typ)
   188  }
   189  
   190  // Unsubscribe cancels the subscription to the given resource. It is a no-op if
   191  // the given resource does not exist. The watch expiry timer associated with the
   192  // resource is stopped if one is active. A discovery request is sent out on the
   193  // stream for the resource type when there is sufficient flow control quota.
   194  func (s *adsStreamImpl) Unsubscribe(typ ResourceType, name string) {
   195  	if s.logger.V(2) {
   196  		s.logger.Infof("Unsubscribing to resource %q of type %q", name, typ.TypeName)
   197  	}
   198  
   199  	s.mu.Lock()
   200  	defer s.mu.Unlock()
   201  
   202  	state, ok := s.resourceTypeState[typ]
   203  	if !ok {
   204  		return
   205  	}
   206  
   207  	rs, ok := state.subscribedResources[name]
   208  	if !ok {
   209  		return
   210  	}
   211  	if rs.ExpiryTimer != nil {
   212  		rs.ExpiryTimer.Stop()
   213  	}
   214  	delete(state.subscribedResources, name)
   215  	state.pendingWrite = true
   216  
   217  	// Send a request for the resource type with updated subscriptions.
   218  	s.requestCh.Put(typ)
   219  }
   220  
   221  // runner is a long-running goroutine that handles the lifecycle of the ADS
   222  // stream. It spwans another goroutine to handle writes of discovery request
   223  // messages on the stream. Whenever an existing stream fails, it performs
   224  // exponential backoff (if no messages were received on that stream) before
   225  // creating a new stream.
   226  func (s *adsStreamImpl) runner(ctx context.Context) {
   227  	defer close(s.runnerDoneCh)
   228  
   229  	go s.send(ctx)
   230  
   231  	runStreamWithBackoff := func() error {
   232  		stream, err := s.transport.NewStream(ctx, "/envoy.service.discovery.v3.AggregatedDiscoveryService/StreamAggregatedResources")
   233  		if err != nil {
   234  			s.logger.Warningf("Failed to create a new ADS streaming RPC: %v", err)
   235  			s.onError(err, false)
   236  			return nil
   237  		}
   238  		if s.logger.V(2) {
   239  			s.logger.Infof("ADS stream created")
   240  		}
   241  
   242  		s.mu.Lock()
   243  		// Flow control is a property of the underlying streaming RPC call and
   244  		// needs to be initialized everytime a new one is created.
   245  		s.fc = newADSFlowControl(s.logger)
   246  		s.firstRequest = true
   247  		s.mu.Unlock()
   248  
   249  		// Ensure that the most recently created stream is pushed on the
   250  		// channel for the `send` goroutine to consume.
   251  		select {
   252  		case <-s.streamCh:
   253  		default:
   254  		}
   255  		s.streamCh <- stream
   256  
   257  		// Backoff state is reset upon successful receipt of at least one
   258  		// message from the server.
   259  		if s.recv(ctx, stream) {
   260  			return backoff.ErrResetBackoff
   261  		}
   262  		return nil
   263  	}
   264  	backoff.RunF(ctx, runStreamWithBackoff, s.backoff)
   265  }
   266  
   267  // send is a long running goroutine that handles sending discovery requests for
   268  // two scenarios:
   269  // - a new subscription or unsubscription request is received
   270  // - a new stream is created after the previous one failed
   271  func (s *adsStreamImpl) send(ctx context.Context) {
   272  	// Stores the most recent stream instance received on streamCh.
   273  	var stream clients.Stream
   274  	for {
   275  		select {
   276  		case <-ctx.Done():
   277  			return
   278  		case stream = <-s.streamCh:
   279  			if err := s.sendExisting(stream); err != nil {
   280  				// Send failed, clear the current stream. Attempt to resend will
   281  				// only be made after a new stream is created.
   282  				stream = nil
   283  				continue
   284  			}
   285  		case req, ok := <-s.requestCh.Get():
   286  			if !ok {
   287  				return
   288  			}
   289  			s.requestCh.Load()
   290  
   291  			typ := req.(ResourceType)
   292  			if err := s.sendNew(stream, typ); err != nil {
   293  				stream = nil
   294  				continue
   295  			}
   296  		}
   297  	}
   298  }
   299  
   300  // sendNew attempts to send a discovery request based on a new subscription or
   301  // unsubscription. If there is no flow control quota, the request is buffered
   302  // and will be sent later. This method also starts the watch expiry timer for
   303  // resources that were sent in the request for the first time, i.e. their watch
   304  // state is `watchStateStarted`.
   305  func (s *adsStreamImpl) sendNew(stream clients.Stream, typ ResourceType) error {
   306  	s.mu.Lock()
   307  	defer s.mu.Unlock()
   308  
   309  	// If there's no stream yet, skip the request. This request will be resent
   310  	// when a new stream is created. If no stream is created, the watcher will
   311  	// timeout (same as server not sending response back).
   312  	if stream == nil {
   313  		return nil
   314  	}
   315  
   316  	// If local processing of the most recently received response is not yet
   317  	// complete, i.e. fc.pending == true, queue this write and return early.
   318  	// This allows us to batch writes for requests which are generated as part
   319  	// of local processing of a received response.
   320  	state := s.resourceTypeState[typ]
   321  	if s.fc.pending.Load() {
   322  		select {
   323  		case state.bufferedRequests <- struct{}{}:
   324  		default:
   325  		}
   326  		return nil
   327  	}
   328  
   329  	return s.sendMessageIfWritePendingLocked(stream, typ, state)
   330  }
   331  
   332  // sendExisting sends out discovery requests for existing resources when
   333  // recovering from a broken stream.
   334  //
   335  // The stream argument is guaranteed to be non-nil.
   336  func (s *adsStreamImpl) sendExisting(stream clients.Stream) error {
   337  	s.mu.Lock()
   338  	defer s.mu.Unlock()
   339  
   340  	for typ, state := range s.resourceTypeState {
   341  		// Reset only the nonces map when the stream restarts.
   342  		//
   343  		// xDS spec says the following. See section:
   344  		// https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol#ack-nack-and-resource-type-instance-version
   345  		//
   346  		// Note that the version for a resource type is not a property of an
   347  		// individual xDS stream but rather a property of the resources
   348  		// themselves. If the stream becomes broken and the client creates a new
   349  		// stream, the client’s initial request on the new stream should
   350  		// indicate the most recent version seen by the client on the previous
   351  		// stream
   352  		state.nonce = ""
   353  
   354  		if len(state.subscribedResources) == 0 {
   355  			continue
   356  		}
   357  
   358  		state.pendingWrite = true
   359  		if err := s.sendMessageIfWritePendingLocked(stream, typ, state); err != nil {
   360  			return err
   361  		}
   362  	}
   363  	return nil
   364  }
   365  
   366  // sendBuffered sends out discovery requests for resources that were buffered
   367  // when they were subscribed to, because local processing of the previously
   368  // received response was not yet complete.
   369  //
   370  // The stream argument is guaranteed to be non-nil.
   371  func (s *adsStreamImpl) sendBuffered(stream clients.Stream) error {
   372  	s.mu.Lock()
   373  	defer s.mu.Unlock()
   374  
   375  	for typ, state := range s.resourceTypeState {
   376  		select {
   377  		case <-state.bufferedRequests:
   378  			if err := s.sendMessageIfWritePendingLocked(stream, typ, state); err != nil {
   379  				return err
   380  			}
   381  		default:
   382  			// No buffered request.
   383  			continue
   384  		}
   385  	}
   386  	return nil
   387  }
   388  
   389  // sendMessageIfWritePendingLocked attempts to sends a discovery request to the
   390  // server, if there is a pending write for the given resource type.
   391  //
   392  // If the request is successfully sent, the pending write field is cleared and
   393  // watch timers are started for the resources in the request.
   394  //
   395  // Caller needs to hold c.mu.
   396  func (s *adsStreamImpl) sendMessageIfWritePendingLocked(stream clients.Stream, typ ResourceType, state *resourceTypeState) error {
   397  	if !state.pendingWrite {
   398  		if s.logger.V(2) {
   399  			s.logger.Infof("Skipping sending request for type %q, because all subscribed resources were already sent", typ.TypeURL)
   400  		}
   401  		return nil
   402  	}
   403  
   404  	names := resourceNames(state.subscribedResources)
   405  	if err := s.sendMessageLocked(stream, names, typ.TypeURL, state.version, state.nonce, nil); err != nil {
   406  		return err
   407  	}
   408  	state.pendingWrite = false
   409  
   410  	// Drain the buffered requests channel because we just sent a request for this
   411  	// resource type.
   412  	select {
   413  	case <-state.bufferedRequests:
   414  	default:
   415  	}
   416  
   417  	s.startWatchTimersLocked(typ, names)
   418  	return nil
   419  }
   420  
   421  // sendMessageLocked sends a discovery request to the server, populating the
   422  // different fields of the message with the given parameters. Returns a non-nil
   423  // error if the request could not be sent.
   424  //
   425  // Caller needs to hold c.mu.
   426  func (s *adsStreamImpl) sendMessageLocked(stream clients.Stream, names []string, url, version, nonce string, nackErr error) error {
   427  	req := &v3discoverypb.DiscoveryRequest{
   428  		ResourceNames: names,
   429  		TypeUrl:       url,
   430  		VersionInfo:   version,
   431  		ResponseNonce: nonce,
   432  	}
   433  
   434  	// The xDS protocol only requires that we send the node proto in the first
   435  	// discovery request on every stream. Sending the node proto in every
   436  	// request wastes CPU resources on the client and the server.
   437  	if s.firstRequest {
   438  		req.Node = s.nodeProto
   439  	}
   440  
   441  	if nackErr != nil {
   442  		req.ErrorDetail = &statuspb.Status{
   443  			Code: int32(cpb.Code_INVALID_ARGUMENT), Message: nackErr.Error(),
   444  		}
   445  	}
   446  
   447  	msg, err := proto.Marshal(req)
   448  	if err != nil {
   449  		s.logger.Warningf("Failed to marshal DiscoveryRequest: %v", err)
   450  		return err
   451  	}
   452  	if err := stream.Send(msg); err != nil {
   453  		s.logger.Warningf("Sending ADS request for type %q, resources: %v, version: %q, nonce: %q failed: %v", url, names, version, nonce, err)
   454  		return err
   455  	}
   456  	s.firstRequest = false
   457  
   458  	if s.logger.V(perRPCVerbosityLevel) {
   459  		s.logger.Infof("ADS request sent: %v", pretty.ToJSON(req))
   460  	} else if s.logger.V(2) {
   461  		s.logger.Warningf("ADS request sent for type %q, resources: %v, version: %q, nonce: %q", url, names, version, nonce)
   462  	}
   463  	return nil
   464  }
   465  
   466  // recv is responsible for receiving messages from the ADS stream.
   467  //
   468  // It performs the following actions:
   469  //   - Waits for local flow control to be available before sending buffered
   470  //     requests, if any.
   471  //   - Receives a message from the ADS stream. If an error is encountered here,
   472  //     it is handled by the onError method which propagates the error to all
   473  //     watchers.
   474  //   - Invokes the event handler's OnADSResponse method to process the message.
   475  //   - Sends an ACK or NACK to the server based on the response.
   476  //
   477  // It returns a boolean indicating whether at least one message was received
   478  // from the server.
   479  func (s *adsStreamImpl) recv(ctx context.Context, stream clients.Stream) bool {
   480  	msgReceived := false
   481  	for {
   482  		// Wait for ADS stream level flow control to be available, and send out
   483  		// a request if anything was buffered while we were waiting for local
   484  		// processing of the previous response to complete.
   485  		if !s.fc.wait(ctx) {
   486  			if s.logger.V(2) {
   487  				s.logger.Infof("ADS stream context canceled")
   488  			}
   489  			return msgReceived
   490  		}
   491  		s.sendBuffered(stream)
   492  
   493  		resources, url, version, nonce, err := s.recvMessage(stream)
   494  		if err != nil {
   495  			s.onError(err, msgReceived)
   496  			s.logger.Warningf("ADS stream closed: %v", err)
   497  			return msgReceived
   498  		}
   499  		msgReceived = true
   500  
   501  		// Invoke the onResponse event handler to parse the incoming message and
   502  		// decide whether to send an ACK or NACK.
   503  		resp := response{
   504  			resources: resources,
   505  			typeURL:   url,
   506  			version:   version,
   507  		}
   508  		var resourceNames []string
   509  		var nackErr error
   510  		s.fc.setPending()
   511  		resourceNames, nackErr = s.eventHandler.onResponse(resp, s.fc.onDone)
   512  		if xdsresource.ErrType(nackErr) == xdsresource.ErrorTypeResourceTypeUnsupported {
   513  			// A general guiding principle is that if the server sends
   514  			// something the client didn't actually subscribe to, then the
   515  			// client ignores it. Here, we have received a response with
   516  			// resources of a type that we don't know about.
   517  			//
   518  			// Sending a NACK doesn't really seem appropriate here, since we're
   519  			// not actually validating what the server sent and therefore don't
   520  			// know that it's invalid.  But we shouldn't ACK either, because we
   521  			// don't know that it is valid.
   522  			s.logger.Warningf("%v", nackErr)
   523  			continue
   524  		}
   525  
   526  		s.onRecv(stream, resourceNames, url, version, nonce, nackErr)
   527  	}
   528  }
   529  
   530  func (s *adsStreamImpl) recvMessage(stream clients.Stream) (resources []*anypb.Any, url, version, nonce string, err error) {
   531  	r, err := stream.Recv()
   532  	if err != nil {
   533  		return nil, "", "", "", err
   534  	}
   535  	var resp v3discoverypb.DiscoveryResponse
   536  	if err := proto.Unmarshal(r, &resp); err != nil {
   537  		s.logger.Infof("Failed to unmarshal response to DiscoveryResponse: %v", err)
   538  		return nil, "", "", "", fmt.Errorf("unexpected message type %T", r)
   539  	}
   540  	if s.logger.V(perRPCVerbosityLevel) {
   541  		s.logger.Infof("ADS response received: %v", pretty.ToJSON(&resp))
   542  	} else if s.logger.V(2) {
   543  		s.logger.Infof("ADS response received for type %q, version %q, nonce %q", resp.GetTypeUrl(), resp.GetVersionInfo(), resp.GetNonce())
   544  	}
   545  	return resp.GetResources(), resp.GetTypeUrl(), resp.GetVersionInfo(), resp.GetNonce(), nil
   546  }
   547  
   548  // onRecv is invoked when a response is received from the server. The arguments
   549  // passed to this method correspond to the most recently received response.
   550  //
   551  // It performs the following actions:
   552  //   - updates resource type specific state
   553  //   - updates resource specific state for resources in the response
   554  //   - sends an ACK or NACK to the server based on the response
   555  func (s *adsStreamImpl) onRecv(stream clients.Stream, names []string, url, version, nonce string, nackErr error) {
   556  	s.mu.Lock()
   557  	defer s.mu.Unlock()
   558  
   559  	// Lookup the resource type specific state based on the type URL.
   560  	var typ ResourceType
   561  	for t := range s.resourceTypeState {
   562  		if t.TypeURL == url {
   563  			typ = t
   564  			break
   565  		}
   566  	}
   567  	typeState, ok := s.resourceTypeState[typ]
   568  	if !ok {
   569  		s.logger.Warningf("ADS stream received a response for type %q, but no state exists for it", url)
   570  		return
   571  	}
   572  
   573  	// Update the resource type specific state. This includes:
   574  	//   - updating the nonce unconditionally
   575  	//   - updating the version only if the response is to be ACKed
   576  	previousVersion := typeState.version
   577  	typeState.nonce = nonce
   578  	if nackErr == nil {
   579  		typeState.version = version
   580  	}
   581  
   582  	// Update the resource specific state. For all resources received as
   583  	// part of this response that are in state `started` or `requested`,
   584  	// this includes:
   585  	//   - setting the watch state to watchstateReceived
   586  	//   - stopping the expiry timer, if one exists
   587  	for _, name := range names {
   588  		rs, ok := typeState.subscribedResources[name]
   589  		if !ok {
   590  			s.logger.Warningf("ADS stream received a response for resource %q, but no state exists for it", name)
   591  			continue
   592  		}
   593  		if ws := rs.State; ws == xdsresource.ResourceWatchStateStarted || ws == xdsresource.ResourceWatchStateRequested {
   594  			rs.State = xdsresource.ResourceWatchStateReceived
   595  			if rs.ExpiryTimer != nil {
   596  				rs.ExpiryTimer.Stop()
   597  				rs.ExpiryTimer = nil
   598  			}
   599  		}
   600  	}
   601  
   602  	// Send an ACK or NACK.
   603  	subscribedResourceNames := resourceNames(typeState.subscribedResources)
   604  	if nackErr != nil {
   605  		s.logger.Warningf("Sending NACK for resource type: %q, version: %q, nonce: %q, reason: %v", url, version, nonce, nackErr)
   606  		s.sendMessageLocked(stream, subscribedResourceNames, url, previousVersion, nonce, nackErr)
   607  		return
   608  	}
   609  
   610  	if s.logger.V(2) {
   611  		s.logger.Infof("Sending ACK for resource type: %q, version: %q, nonce: %q", url, version, nonce)
   612  	}
   613  	s.sendMessageLocked(stream, subscribedResourceNames, url, version, nonce, nil)
   614  }
   615  
   616  // onError is called when an error occurs on the ADS stream. It stops any
   617  // outstanding resource timers and resets the watch state to started for any
   618  // resources that were in the requested state. It also handles the case where
   619  // the ADS stream was closed after receiving a response, which is not
   620  // considered an error.
   621  func (s *adsStreamImpl) onError(err error, msgReceived bool) {
   622  	// For resources that been requested but not yet responded to by the
   623  	// management server, stop the resource timers and reset the watch state to
   624  	// watchStateStarted. This is because we don't want the expiry timer to be
   625  	// running when we don't have a stream open to the management server.
   626  	s.mu.Lock()
   627  	for _, state := range s.resourceTypeState {
   628  		for _, rs := range state.subscribedResources {
   629  			if rs.State != xdsresource.ResourceWatchStateRequested {
   630  				continue
   631  			}
   632  			if rs.ExpiryTimer != nil {
   633  				rs.ExpiryTimer.Stop()
   634  				rs.ExpiryTimer = nil
   635  			}
   636  			rs.State = xdsresource.ResourceWatchStateStarted
   637  		}
   638  	}
   639  	s.mu.Unlock()
   640  
   641  	// Note that we do not consider it an error if the ADS stream was closed
   642  	// after having received a response on the stream. This is because there
   643  	// are legitimate reasons why the server may need to close the stream during
   644  	// normal operations, such as needing to rebalance load or the underlying
   645  	// connection hitting its max connection age limit.
   646  	// (see [gRFC A9](https://github.com/grpc/proposal/blob/master/A9-server-side-conn-mgt.md)).
   647  	if msgReceived {
   648  		err = xdsresource.NewError(xdsresource.ErrTypeStreamFailedAfterRecv, err.Error())
   649  	}
   650  
   651  	s.eventHandler.onStreamError(err)
   652  }
   653  
   654  // startWatchTimersLocked starts the expiry timers for the given resource names
   655  // of the specified resource type.  For each resource name, if the resource
   656  // watch state is in the "started" state, it transitions the state to
   657  // "requested" and starts an expiry timer. When the timer expires, the resource
   658  // watch state is set to "timeout" and the event handler callback is called.
   659  //
   660  // The caller must hold the s.mu lock.
   661  func (s *adsStreamImpl) startWatchTimersLocked(typ ResourceType, names []string) {
   662  	typeState := s.resourceTypeState[typ]
   663  	for _, name := range names {
   664  		resourceState, ok := typeState.subscribedResources[name]
   665  		if !ok {
   666  			continue
   667  		}
   668  		if resourceState.State != xdsresource.ResourceWatchStateStarted {
   669  			continue
   670  		}
   671  		resourceState.State = xdsresource.ResourceWatchStateRequested
   672  
   673  		rs := resourceState
   674  		resourceState.ExpiryTimer = time.AfterFunc(s.watchExpiryTimeout, func() {
   675  			s.mu.Lock()
   676  			rs.State = xdsresource.ResourceWatchStateTimeout
   677  			rs.ExpiryTimer = nil
   678  			s.mu.Unlock()
   679  			s.eventHandler.onWatchExpiry(typ, name)
   680  		})
   681  	}
   682  }
   683  
   684  func (s *adsStreamImpl) adsResourceWatchStateForTesting(rType ResourceType, resourceName string) (xdsresource.ResourceWatchState, error) {
   685  	s.mu.Lock()
   686  	defer s.mu.Unlock()
   687  
   688  	state, ok := s.resourceTypeState[rType]
   689  	if !ok {
   690  		return xdsresource.ResourceWatchState{}, fmt.Errorf("unknown resource type: %v", rType)
   691  	}
   692  	resourceState, ok := state.subscribedResources[resourceName]
   693  	if !ok {
   694  		return xdsresource.ResourceWatchState{}, fmt.Errorf("unknown resource name: %v", resourceName)
   695  	}
   696  	return *resourceState, nil
   697  }
   698  
   699  func resourceNames(m map[string]*xdsresource.ResourceWatchState) []string {
   700  	ret := make([]string, len(m))
   701  	idx := 0
   702  	for name := range m {
   703  		ret[idx] = name
   704  		idx++
   705  	}
   706  	return ret
   707  }
   708  
// adsFlowControl implements ADS stream level flow control that enables the
// transport to block the reading of the next message off of the stream until
// the previous update is consumed by all watchers.
//
// The lifetime of the flow control is tied to the lifetime of the stream: a
// new instance is created every time a new stream is created.
type adsFlowControl struct {
	logger *igrpclog.PrefixLogger

	// Whether the most recent update is pending consumption by all watchers.
	// Set by setPending() and cleared by onDone().
	pending atomic.Bool
	// Channel used to notify when all the watchers have consumed the most
	// recent update. wait() blocks on reading a value from this channel.
	readyCh chan struct{}
}
   723  
   724  // newADSFlowControl returns a new adsFlowControl.
   725  func newADSFlowControl(logger *igrpclog.PrefixLogger) *adsFlowControl {
   726  	return &adsFlowControl{
   727  		logger:  logger,
   728  		readyCh: make(chan struct{}, 1),
   729  	}
   730  }
   731  
// setPending changes the internal state to indicate that there is an update
// pending consumption by all watchers. While this state is set, wait() blocks
// until onDone() is called or the context passed to it is done.
func (fc *adsFlowControl) setPending() {
	fc.pending.Store(true)
}
   737  
   738  // wait blocks until all the watchers have consumed the most recent update and
   739  // returns true. If the context expires before that, it returns false.
   740  func (fc *adsFlowControl) wait(ctx context.Context) bool {
   741  	// If there is no pending update, there is no need to block.
   742  	if !fc.pending.Load() {
   743  		// If all watchers finished processing the most recent update before the
   744  		// `recv` goroutine made the next call to `Wait()`, there would be an
   745  		// entry in the readyCh channel that needs to be drained to ensure that
   746  		// the next call to `Wait()` doesn't unblock before it actually should.
   747  		select {
   748  		case <-fc.readyCh:
   749  		default:
   750  		}
   751  		return true
   752  	}
   753  
   754  	select {
   755  	case <-ctx.Done():
   756  		return false
   757  	case <-fc.readyCh:
   758  		return true
   759  	}
   760  }
   761  
// onDone indicates that all watchers have consumed the most recent update. It
// posts a notification on readyCh without blocking, and then clears the
// pending state.
func (fc *adsFlowControl) onDone() {
	select {
	// Writes to the readyCh channel should not block ideally. The default
	// branch here is to appease the paranoid mind: if the channel already
	// holds a notification, the new one is dropped and this is only logged.
	case fc.readyCh <- struct{}{}:
	default:
		if fc.logger.V(2) {
			fc.logger.Infof("ADS stream flow control readyCh is full")
		}
	}
	fc.pending.Store(false)
}