google.golang.org/grpc@v1.72.2/xds/internal/clients/xdsclient/ads_stream.go (about) 1 /* 2 * 3 * Copyright 2025 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19 package xdsclient 20 21 import ( 22 "context" 23 "fmt" 24 "sync" 25 "sync/atomic" 26 "time" 27 28 "google.golang.org/grpc/grpclog" 29 igrpclog "google.golang.org/grpc/internal/grpclog" 30 "google.golang.org/grpc/xds/internal/clients" 31 "google.golang.org/grpc/xds/internal/clients/internal/backoff" 32 "google.golang.org/grpc/xds/internal/clients/internal/buffer" 33 "google.golang.org/grpc/xds/internal/clients/internal/pretty" 34 "google.golang.org/grpc/xds/internal/clients/xdsclient/internal/xdsresource" 35 36 "google.golang.org/protobuf/proto" 37 "google.golang.org/protobuf/types/known/anypb" 38 39 v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" 40 v3discoverypb "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" 41 cpb "google.golang.org/genproto/googleapis/rpc/code" 42 statuspb "google.golang.org/genproto/googleapis/rpc/status" 43 ) 44 45 const ( 46 // Any per-RPC level logs which print complete request or response messages 47 // should be gated at this verbosity level. Other per-RPC level logs which print 48 // terse output should be at `INFO` and verbosity 2. 49 perRPCVerbosityLevel = 9 50 ) 51 52 // response represents a response received on the ADS stream. It contains the 53 // type URL, version, and resources for the response. 54 type response struct { 55 typeURL string 56 version string 57 resources []*anypb.Any 58 } 59 60 // dataAndErrTuple is a struct that holds a resource and an error. It is used to 61 // return a resource and any associated error from a function. 62 type dataAndErrTuple struct { 63 Resource ResourceData 64 Err error 65 } 66 67 // adsStreamEventHandler is an interface that defines the callbacks for events that 68 // occur on the ADS stream. Methods on this interface may be invoked 69 // concurrently and implementations need to handle them in a thread-safe manner. 70 type adsStreamEventHandler interface { 71 onStreamError(error) // Called when the ADS stream breaks. 72 onWatchExpiry(ResourceType, string) // Called when the watch timer expires for a resource. 73 onResponse(response, func()) ([]string, error) // Called when a response is received on the ADS stream. 74 } 75 76 // watchState is a enum that describes the watch state of a particular 77 // resource. 78 type watchState int 79 80 const ( 81 // resourceWatchStateStarted is the state where a watch for a resource was 82 // started, but a request asking for that resource is yet to be sent to the 83 // management server. 84 resourceWatchStateStarted watchState = iota 85 // resourceWatchStateRequested is the state when a request has been sent for 86 // the resource being watched. 87 resourceWatchStateRequested 88 // ResourceWatchStateReceived is the state when a response has been received 89 // for the resource being watched. 90 resourceWatchStateReceived 91 // resourceWatchStateTimeout is the state when the watch timer associated 92 // with the resource expired because no response was received. 93 resourceWatchStateTimeout 94 ) 95 96 // resourceWatchState is the state corresponding to a resource being watched. 97 type resourceWatchState struct { 98 State watchState // Watch state of the resource. 99 ExpiryTimer *time.Timer // Timer for the expiry of the watch. 100 } 101 102 // state corresponding to a resource type. 103 type resourceTypeState struct { 104 version string // Last acked version. Should not be reset when the stream breaks. 105 nonce string // Last received nonce. Should be reset when the stream breaks. 106 bufferedRequests chan struct{} // Channel to buffer requests when writing is blocked. 107 subscribedResources map[string]*resourceWatchState // Map of subscribed resource names to their state. 108 pendingWrite bool // True if there is a pending write for this resource type. 109 } 110 111 // adsStreamImpl provides the functionality associated with an ADS (Aggregated 112 // Discovery Service) stream on the client side. It manages the lifecycle of the 113 // ADS stream, including creating the stream, sending requests, and handling 114 // responses. It also handles flow control and retries for the stream. 115 type adsStreamImpl struct { 116 // The following fields are initialized from arguments passed to the 117 // constructor and are read-only afterwards, and hence can be accessed 118 // without a mutex. 119 transport clients.Transport // Transport to use for ADS stream. 120 eventHandler adsStreamEventHandler // Callbacks into the xdsChannel. 121 backoff func(int) time.Duration // Backoff for retries, after stream failures. 122 nodeProto *v3corepb.Node // Identifies the gRPC application. 123 watchExpiryTimeout time.Duration // Resource watch expiry timeout 124 logger *igrpclog.PrefixLogger 125 126 // The following fields are initialized in the constructor and are not 127 // written to afterwards, and hence can be accessed without a mutex. 128 streamCh chan clients.Stream // New ADS streams are pushed here. 129 requestCh *buffer.Unbounded // Subscriptions and unsubscriptions are pushed here. 130 runnerDoneCh chan struct{} // Notify completion of runner goroutine. 131 cancel context.CancelFunc // To cancel the context passed to the runner goroutine. 132 133 // Guards access to the below fields (and to the contents of the map). 134 mu sync.Mutex 135 resourceTypeState map[ResourceType]*resourceTypeState // Map of resource types to their state. 136 fc *adsFlowControl // Flow control for ADS stream. 137 firstRequest bool // False after the first request is sent out. 138 } 139 140 // adsStreamOpts contains the options for creating a new ADS Stream. 141 type adsStreamOpts struct { 142 transport clients.Transport // xDS transport to create the stream on. 143 eventHandler adsStreamEventHandler // Callbacks for stream events. 144 backoff func(int) time.Duration // Backoff for retries, after stream failures. 145 nodeProto *v3corepb.Node // Node proto to identify the gRPC application. 146 watchExpiryTimeout time.Duration // Resource watch expiry timeout. 147 logPrefix string // Prefix to be used for log messages. 148 } 149 150 // newADSStreamImpl initializes a new adsStreamImpl instance using the given 151 // parameters. It also launches goroutines responsible for managing reads and 152 // writes for messages of the underlying stream. 153 func newADSStreamImpl(opts adsStreamOpts) *adsStreamImpl { 154 s := &adsStreamImpl{ 155 transport: opts.transport, 156 eventHandler: opts.eventHandler, 157 backoff: opts.backoff, 158 nodeProto: opts.nodeProto, 159 watchExpiryTimeout: opts.watchExpiryTimeout, 160 161 streamCh: make(chan clients.Stream, 1), 162 requestCh: buffer.NewUnbounded(), 163 runnerDoneCh: make(chan struct{}), 164 resourceTypeState: make(map[ResourceType]*resourceTypeState), 165 } 166 167 l := grpclog.Component("xds") 168 s.logger = igrpclog.NewPrefixLogger(l, opts.logPrefix+fmt.Sprintf("[ads-stream %p] ", s)) 169 170 ctx, cancel := context.WithCancel(context.Background()) 171 s.cancel = cancel 172 go s.runner(ctx) 173 return s 174 } 175 176 // Stop blocks until the stream is closed and all spawned goroutines exit. 177 func (s *adsStreamImpl) Stop() { 178 s.cancel() 179 s.requestCh.Close() 180 <-s.runnerDoneCh 181 s.logger.Infof("Shutdown ADS stream") 182 } 183 184 // subscribe subscribes to the given resource. It is assumed that multiple 185 // subscriptions for the same resource is deduped at the caller. A discovery 186 // request is sent out on the underlying stream for the resource type when there 187 // is sufficient flow control quota. 188 func (s *adsStreamImpl) subscribe(typ ResourceType, name string) { 189 if s.logger.V(2) { 190 s.logger.Infof("Subscribing to resource %q of type %q", name, typ.TypeName) 191 } 192 193 s.mu.Lock() 194 defer s.mu.Unlock() 195 196 state, ok := s.resourceTypeState[typ] 197 if !ok { 198 // An entry in the type state map is created as part of the first 199 // subscription request for this type. 200 state = &resourceTypeState{ 201 subscribedResources: make(map[string]*resourceWatchState), 202 bufferedRequests: make(chan struct{}, 1), 203 } 204 s.resourceTypeState[typ] = state 205 } 206 207 // Create state for the newly subscribed resource. The watch timer will 208 // be started when a request for this resource is actually sent out. 209 state.subscribedResources[name] = &resourceWatchState{State: resourceWatchStateStarted} 210 state.pendingWrite = true 211 212 // Send a request for the resource type with updated subscriptions. 213 s.requestCh.Put(typ) 214 } 215 216 // Unsubscribe cancels the subscription to the given resource. It is a no-op if 217 // the given resource does not exist. The watch expiry timer associated with the 218 // resource is stopped if one is active. A discovery request is sent out on the 219 // stream for the resource type when there is sufficient flow control quota. 220 func (s *adsStreamImpl) Unsubscribe(typ ResourceType, name string) { 221 if s.logger.V(2) { 222 s.logger.Infof("Unsubscribing to resource %q of type %q", name, typ.TypeName) 223 } 224 225 s.mu.Lock() 226 defer s.mu.Unlock() 227 228 state, ok := s.resourceTypeState[typ] 229 if !ok { 230 return 231 } 232 233 rs, ok := state.subscribedResources[name] 234 if !ok { 235 return 236 } 237 if rs.ExpiryTimer != nil { 238 rs.ExpiryTimer.Stop() 239 } 240 delete(state.subscribedResources, name) 241 state.pendingWrite = true 242 243 // Send a request for the resource type with updated subscriptions. 244 s.requestCh.Put(typ) 245 } 246 247 // runner is a long-running goroutine that handles the lifecycle of the ADS 248 // stream. It spwans another goroutine to handle writes of discovery request 249 // messages on the stream. Whenever an existing stream fails, it performs 250 // exponential backoff (if no messages were received on that stream) before 251 // creating a new stream. 252 func (s *adsStreamImpl) runner(ctx context.Context) { 253 defer close(s.runnerDoneCh) 254 255 go s.send(ctx) 256 257 runStreamWithBackoff := func() error { 258 stream, err := s.transport.NewStream(ctx, "/envoy.service.discovery.v3.AggregatedDiscoveryService/StreamAggregatedResources") 259 if err != nil { 260 s.logger.Warningf("Failed to create a new ADS streaming RPC: %v", err) 261 s.onError(err, false) 262 return nil 263 } 264 if s.logger.V(2) { 265 s.logger.Infof("ADS stream created") 266 } 267 268 s.mu.Lock() 269 // Flow control is a property of the underlying streaming RPC call and 270 // needs to be initialized everytime a new one is created. 271 s.fc = newADSFlowControl(s.logger) 272 s.firstRequest = true 273 s.mu.Unlock() 274 275 // Ensure that the most recently created stream is pushed on the 276 // channel for the `send` goroutine to consume. 277 select { 278 case <-s.streamCh: 279 default: 280 } 281 s.streamCh <- stream 282 283 // Backoff state is reset upon successful receipt of at least one 284 // message from the server. 285 if s.recv(ctx, stream) { 286 return backoff.ErrResetBackoff 287 } 288 return nil 289 } 290 backoff.RunF(ctx, runStreamWithBackoff, s.backoff) 291 } 292 293 // send is a long running goroutine that handles sending discovery requests for 294 // two scenarios: 295 // - a new subscription or unsubscription request is received 296 // - a new stream is created after the previous one failed 297 func (s *adsStreamImpl) send(ctx context.Context) { 298 // Stores the most recent stream instance received on streamCh. 299 var stream clients.Stream 300 for { 301 select { 302 case <-ctx.Done(): 303 return 304 case stream = <-s.streamCh: 305 if err := s.sendExisting(stream); err != nil { 306 // Send failed, clear the current stream. Attempt to resend will 307 // only be made after a new stream is created. 308 stream = nil 309 continue 310 } 311 case req, ok := <-s.requestCh.Get(): 312 if !ok { 313 return 314 } 315 s.requestCh.Load() 316 317 typ := req.(ResourceType) 318 if err := s.sendNew(stream, typ); err != nil { 319 stream = nil 320 continue 321 } 322 } 323 } 324 } 325 326 // sendNew attempts to send a discovery request based on a new subscription or 327 // unsubscription. If there is no flow control quota, the request is buffered 328 // and will be sent later. This method also starts the watch expiry timer for 329 // resources that were sent in the request for the first time, i.e. their watch 330 // state is `watchStateStarted`. 331 func (s *adsStreamImpl) sendNew(stream clients.Stream, typ ResourceType) error { 332 s.mu.Lock() 333 defer s.mu.Unlock() 334 335 // If there's no stream yet, skip the request. This request will be resent 336 // when a new stream is created. If no stream is created, the watcher will 337 // timeout (same as server not sending response back). 338 if stream == nil { 339 return nil 340 } 341 342 // If local processing of the most recently received response is not yet 343 // complete, i.e. fc.pending == true, queue this write and return early. 344 // This allows us to batch writes for requests which are generated as part 345 // of local processing of a received response. 346 state := s.resourceTypeState[typ] 347 if s.fc.pending.Load() { 348 select { 349 case state.bufferedRequests <- struct{}{}: 350 default: 351 } 352 return nil 353 } 354 355 return s.sendMessageIfWritePendingLocked(stream, typ, state) 356 } 357 358 // sendExisting sends out discovery requests for existing resources when 359 // recovering from a broken stream. 360 // 361 // The stream argument is guaranteed to be non-nil. 362 func (s *adsStreamImpl) sendExisting(stream clients.Stream) error { 363 s.mu.Lock() 364 defer s.mu.Unlock() 365 366 for typ, state := range s.resourceTypeState { 367 // Reset only the nonces map when the stream restarts. 368 // 369 // xDS spec says the following. See section: 370 // https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol#ack-nack-and-resource-type-instance-version 371 // 372 // Note that the version for a resource type is not a property of an 373 // individual xDS stream but rather a property of the resources 374 // themselves. If the stream becomes broken and the client creates a new 375 // stream, the client’s initial request on the new stream should 376 // indicate the most recent version seen by the client on the previous 377 // stream 378 state.nonce = "" 379 380 if len(state.subscribedResources) == 0 { 381 continue 382 } 383 384 state.pendingWrite = true 385 if err := s.sendMessageIfWritePendingLocked(stream, typ, state); err != nil { 386 return err 387 } 388 } 389 return nil 390 } 391 392 // sendBuffered sends out discovery requests for resources that were buffered 393 // when they were subscribed to, because local processing of the previously 394 // received response was not yet complete. 395 // 396 // The stream argument is guaranteed to be non-nil. 397 func (s *adsStreamImpl) sendBuffered(stream clients.Stream) error { 398 s.mu.Lock() 399 defer s.mu.Unlock() 400 401 for typ, state := range s.resourceTypeState { 402 select { 403 case <-state.bufferedRequests: 404 if err := s.sendMessageIfWritePendingLocked(stream, typ, state); err != nil { 405 return err 406 } 407 default: 408 // No buffered request. 409 continue 410 } 411 } 412 return nil 413 } 414 415 // sendMessageIfWritePendingLocked attempts to sends a discovery request to the 416 // server, if there is a pending write for the given resource type. 417 // 418 // If the request is successfully sent, the pending write field is cleared and 419 // watch timers are started for the resources in the request. 420 // 421 // Caller needs to hold c.mu. 422 func (s *adsStreamImpl) sendMessageIfWritePendingLocked(stream clients.Stream, typ ResourceType, state *resourceTypeState) error { 423 if !state.pendingWrite { 424 if s.logger.V(2) { 425 s.logger.Infof("Skipping sending request for type %q, because all subscribed resources were already sent", typ.TypeURL) 426 } 427 return nil 428 } 429 430 names := resourceNames(state.subscribedResources) 431 if err := s.sendMessageLocked(stream, names, typ.TypeURL, state.version, state.nonce, nil); err != nil { 432 return err 433 } 434 state.pendingWrite = false 435 436 // Drain the buffered requests channel because we just sent a request for this 437 // resource type. 438 select { 439 case <-state.bufferedRequests: 440 default: 441 } 442 443 s.startWatchTimersLocked(typ, names) 444 return nil 445 } 446 447 // sendMessageLocked sends a discovery request to the server, populating the 448 // different fields of the message with the given parameters. Returns a non-nil 449 // error if the request could not be sent. 450 // 451 // Caller needs to hold c.mu. 452 func (s *adsStreamImpl) sendMessageLocked(stream clients.Stream, names []string, url, version, nonce string, nackErr error) error { 453 req := &v3discoverypb.DiscoveryRequest{ 454 ResourceNames: names, 455 TypeUrl: url, 456 VersionInfo: version, 457 ResponseNonce: nonce, 458 } 459 460 // The xDS protocol only requires that we send the node proto in the first 461 // discovery request on every stream. Sending the node proto in every 462 // request wastes CPU resources on the client and the server. 463 if s.firstRequest { 464 req.Node = s.nodeProto 465 } 466 467 if nackErr != nil { 468 req.ErrorDetail = &statuspb.Status{ 469 Code: int32(cpb.Code_INVALID_ARGUMENT), Message: nackErr.Error(), 470 } 471 } 472 473 msg, err := proto.Marshal(req) 474 if err != nil { 475 s.logger.Warningf("Failed to marshal DiscoveryRequest: %v", err) 476 return err 477 } 478 if err := stream.Send(msg); err != nil { 479 s.logger.Warningf("Sending ADS request for type %q, resources: %v, version: %q, nonce: %q failed: %v", url, names, version, nonce, err) 480 return err 481 } 482 s.firstRequest = false 483 484 if s.logger.V(perRPCVerbosityLevel) { 485 s.logger.Infof("ADS request sent: %v", pretty.ToJSON(req)) 486 } else if s.logger.V(2) { 487 s.logger.Warningf("ADS request sent for type %q, resources: %v, version: %q, nonce: %q", url, names, version, nonce) 488 } 489 return nil 490 } 491 492 // recv is responsible for receiving messages from the ADS stream. 493 // 494 // It performs the following actions: 495 // - Waits for local flow control to be available before sending buffered 496 // requests, if any. 497 // - Receives a message from the ADS stream. If an error is encountered here, 498 // it is handled by the onError method which propagates the error to all 499 // watchers. 500 // - Invokes the event handler's OnADSResponse method to process the message. 501 // - Sends an ACK or NACK to the server based on the response. 502 // 503 // It returns a boolean indicating whether at least one message was received 504 // from the server. 505 func (s *adsStreamImpl) recv(ctx context.Context, stream clients.Stream) bool { 506 msgReceived := false 507 for { 508 // Wait for ADS stream level flow control to be available, and send out 509 // a request if anything was buffered while we were waiting for local 510 // processing of the previous response to complete. 511 if !s.fc.wait(ctx) { 512 if s.logger.V(2) { 513 s.logger.Infof("ADS stream context canceled") 514 } 515 return msgReceived 516 } 517 s.sendBuffered(stream) 518 519 resources, url, version, nonce, err := s.recvMessage(stream) 520 if err != nil { 521 s.onError(err, msgReceived) 522 s.logger.Warningf("ADS stream closed: %v", err) 523 return msgReceived 524 } 525 msgReceived = true 526 527 // Invoke the onResponse event handler to parse the incoming message and 528 // decide whether to send an ACK or NACK. 529 resp := response{ 530 resources: resources, 531 typeURL: url, 532 version: version, 533 } 534 var resourceNames []string 535 var nackErr error 536 s.fc.setPending() 537 resourceNames, nackErr = s.eventHandler.onResponse(resp, s.fc.onDone) 538 if xdsresource.ErrType(nackErr) == xdsresource.ErrorTypeResourceTypeUnsupported { 539 // A general guiding principle is that if the server sends 540 // something the client didn't actually subscribe to, then the 541 // client ignores it. Here, we have received a response with 542 // resources of a type that we don't know about. 543 // 544 // Sending a NACK doesn't really seem appropriate here, since we're 545 // not actually validating what the server sent and therefore don't 546 // know that it's invalid. But we shouldn't ACK either, because we 547 // don't know that it is valid. 548 s.logger.Warningf("%v", nackErr) 549 continue 550 } 551 552 s.onRecv(stream, resourceNames, url, version, nonce, nackErr) 553 } 554 } 555 556 func (s *adsStreamImpl) recvMessage(stream clients.Stream) (resources []*anypb.Any, url, version, nonce string, err error) { 557 r, err := stream.Recv() 558 if err != nil { 559 return nil, "", "", "", err 560 } 561 var resp v3discoverypb.DiscoveryResponse 562 if err := proto.Unmarshal(r, &resp); err != nil { 563 s.logger.Infof("Failed to unmarshal response to DiscoveryResponse: %v", err) 564 return nil, "", "", "", fmt.Errorf("unexpected message type %T", r) 565 } 566 if s.logger.V(perRPCVerbosityLevel) { 567 s.logger.Infof("ADS response received: %v", pretty.ToJSON(&resp)) 568 } else if s.logger.V(2) { 569 s.logger.Infof("ADS response received for type %q, version %q, nonce %q", resp.GetTypeUrl(), resp.GetVersionInfo(), resp.GetNonce()) 570 } 571 return resp.GetResources(), resp.GetTypeUrl(), resp.GetVersionInfo(), resp.GetNonce(), nil 572 } 573 574 // onRecv is invoked when a response is received from the server. The arguments 575 // passed to this method correspond to the most recently received response. 576 // 577 // It performs the following actions: 578 // - updates resource type specific state 579 // - updates resource specific state for resources in the response 580 // - sends an ACK or NACK to the server based on the response 581 func (s *adsStreamImpl) onRecv(stream clients.Stream, names []string, url, version, nonce string, nackErr error) { 582 s.mu.Lock() 583 defer s.mu.Unlock() 584 585 // Lookup the resource type specific state based on the type URL. 586 var typ ResourceType 587 for t := range s.resourceTypeState { 588 if t.TypeURL == url { 589 typ = t 590 break 591 } 592 } 593 typeState, ok := s.resourceTypeState[typ] 594 if !ok { 595 s.logger.Warningf("ADS stream received a response for type %q, but no state exists for it", url) 596 return 597 } 598 599 // Update the resource type specific state. This includes: 600 // - updating the nonce unconditionally 601 // - updating the version only if the response is to be ACKed 602 previousVersion := typeState.version 603 typeState.nonce = nonce 604 if nackErr == nil { 605 typeState.version = version 606 } 607 608 // Update the resource specific state. For all resources received as 609 // part of this response that are in state `started` or `requested`, 610 // this includes: 611 // - setting the watch state to watchstateReceived 612 // - stopping the expiry timer, if one exists 613 for _, name := range names { 614 rs, ok := typeState.subscribedResources[name] 615 if !ok { 616 s.logger.Warningf("ADS stream received a response for resource %q, but no state exists for it", name) 617 continue 618 } 619 if ws := rs.State; ws == resourceWatchStateStarted || ws == resourceWatchStateRequested { 620 rs.State = resourceWatchStateReceived 621 if rs.ExpiryTimer != nil { 622 rs.ExpiryTimer.Stop() 623 rs.ExpiryTimer = nil 624 } 625 } 626 } 627 628 // Send an ACK or NACK. 629 subscribedResourceNames := resourceNames(typeState.subscribedResources) 630 if nackErr != nil { 631 s.logger.Warningf("Sending NACK for resource type: %q, version: %q, nonce: %q, reason: %v", url, version, nonce, nackErr) 632 s.sendMessageLocked(stream, subscribedResourceNames, url, previousVersion, nonce, nackErr) 633 return 634 } 635 636 if s.logger.V(2) { 637 s.logger.Infof("Sending ACK for resource type: %q, version: %q, nonce: %q", url, version, nonce) 638 } 639 s.sendMessageLocked(stream, subscribedResourceNames, url, version, nonce, nil) 640 } 641 642 // onError is called when an error occurs on the ADS stream. It stops any 643 // outstanding resource timers and resets the watch state to started for any 644 // resources that were in the requested state. It also handles the case where 645 // the ADS stream was closed after receiving a response, which is not 646 // considered an error. 647 func (s *adsStreamImpl) onError(err error, msgReceived bool) { 648 // For resources that been requested but not yet responded to by the 649 // management server, stop the resource timers and reset the watch state to 650 // watchStateStarted. This is because we don't want the expiry timer to be 651 // running when we don't have a stream open to the management server. 652 s.mu.Lock() 653 for _, state := range s.resourceTypeState { 654 for _, rs := range state.subscribedResources { 655 if rs.State != resourceWatchStateRequested { 656 continue 657 } 658 if rs.ExpiryTimer != nil { 659 rs.ExpiryTimer.Stop() 660 rs.ExpiryTimer = nil 661 } 662 rs.State = resourceWatchStateStarted 663 } 664 } 665 s.mu.Unlock() 666 667 // Note that we do not consider it an error if the ADS stream was closed 668 // after having received a response on the stream. This is because there 669 // are legitimate reasons why the server may need to close the stream during 670 // normal operations, such as needing to rebalance load or the underlying 671 // connection hitting its max connection age limit. 672 // (see [gRFC A9](https://github.com/grpc/proposal/blob/master/A9-server-side-conn-mgt.md)). 673 if msgReceived { 674 err = xdsresource.NewError(xdsresource.ErrTypeStreamFailedAfterRecv, err.Error()) 675 } 676 677 s.eventHandler.onStreamError(err) 678 } 679 680 // startWatchTimersLocked starts the expiry timers for the given resource names 681 // of the specified resource type. For each resource name, if the resource 682 // watch state is in the "started" state, it transitions the state to 683 // "requested" and starts an expiry timer. When the timer expires, the resource 684 // watch state is set to "timeout" and the event handler callback is called. 685 // 686 // The caller must hold the s.mu lock. 687 func (s *adsStreamImpl) startWatchTimersLocked(typ ResourceType, names []string) { 688 typeState := s.resourceTypeState[typ] 689 for _, name := range names { 690 resourceState, ok := typeState.subscribedResources[name] 691 if !ok { 692 continue 693 } 694 if resourceState.State != resourceWatchStateStarted { 695 continue 696 } 697 resourceState.State = resourceWatchStateRequested 698 699 rs := resourceState 700 resourceState.ExpiryTimer = time.AfterFunc(s.watchExpiryTimeout, func() { 701 s.mu.Lock() 702 rs.State = resourceWatchStateTimeout 703 rs.ExpiryTimer = nil 704 s.mu.Unlock() 705 s.eventHandler.onWatchExpiry(typ, name) 706 }) 707 } 708 } 709 710 func resourceNames(m map[string]*resourceWatchState) []string { 711 ret := make([]string, len(m)) 712 idx := 0 713 for name := range m { 714 ret[idx] = name 715 idx++ 716 } 717 return ret 718 } 719 720 // adsFlowControl implements ADS stream level flow control that enables the 721 // transport to block the reading of the next message off of the stream until 722 // the previous update is consumed by all watchers. 723 // 724 // The lifetime of the flow control is tied to the lifetime of the stream. 725 type adsFlowControl struct { 726 logger *igrpclog.PrefixLogger 727 728 // Whether the most recent update is pending consumption by all watchers. 729 pending atomic.Bool 730 // Channel used to notify when all the watchers have consumed the most 731 // recent update. Wait() blocks on reading a value from this channel. 732 readyCh chan struct{} 733 } 734 735 // newADSFlowControl returns a new adsFlowControl. 736 func newADSFlowControl(logger *igrpclog.PrefixLogger) *adsFlowControl { 737 return &adsFlowControl{ 738 logger: logger, 739 readyCh: make(chan struct{}, 1), 740 } 741 } 742 743 // setPending changes the internal state to indicate that there is an update 744 // pending consumption by all watchers. 745 func (fc *adsFlowControl) setPending() { 746 fc.pending.Store(true) 747 } 748 749 // wait blocks until all the watchers have consumed the most recent update and 750 // returns true. If the context expires before that, it returns false. 751 func (fc *adsFlowControl) wait(ctx context.Context) bool { 752 // If there is no pending update, there is no need to block. 753 if !fc.pending.Load() { 754 // If all watchers finished processing the most recent update before the 755 // `recv` goroutine made the next call to `Wait()`, there would be an 756 // entry in the readyCh channel that needs to be drained to ensure that 757 // the next call to `Wait()` doesn't unblock before it actually should. 758 select { 759 case <-fc.readyCh: 760 default: 761 } 762 return true 763 } 764 765 select { 766 case <-ctx.Done(): 767 return false 768 case <-fc.readyCh: 769 return true 770 } 771 } 772 773 // onDone indicates that all watchers have consumed the most recent update. 774 func (fc *adsFlowControl) onDone() { 775 select { 776 // Writes to the readyCh channel should not block ideally. The default 777 // branch here is to appease the paranoid mind. 778 case fc.readyCh <- struct{}{}: 779 default: 780 if fc.logger.V(2) { 781 fc.logger.Infof("ADS stream flow control readyCh is full") 782 } 783 } 784 fc.pending.Store(false) 785 }