google.golang.org/grpc@v1.62.1/xds/internal/xdsclient/authority.go (about) 1 /* 2 * 3 * Copyright 2021 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package xdsclient 19 20 import ( 21 "context" 22 "errors" 23 "fmt" 24 "strings" 25 "sync" 26 "time" 27 28 "google.golang.org/grpc/internal/grpclog" 29 "google.golang.org/grpc/internal/grpcsync" 30 "google.golang.org/grpc/xds/internal/xdsclient/bootstrap" 31 "google.golang.org/grpc/xds/internal/xdsclient/load" 32 "google.golang.org/grpc/xds/internal/xdsclient/transport" 33 "google.golang.org/grpc/xds/internal/xdsclient/xdsresource" 34 "google.golang.org/protobuf/types/known/anypb" 35 ) 36 37 type watchState int 38 39 const ( 40 watchStateStarted watchState = iota // Watch started, request not yet set. 41 watchStateRequested // Request sent for resource being watched. 42 watchStateReceived // Response received for resource being watched. 43 watchStateTimeout // Watch timer expired, no response. 44 watchStateCanceled // Watch cancelled. 45 ) 46 47 type resourceState struct { 48 watchers map[xdsresource.ResourceWatcher]bool // Set of watchers for this resource 49 cache xdsresource.ResourceData // Most recent ACKed update for this resource 50 md xdsresource.UpdateMetadata // Metadata for the most recent update 51 deletionIgnored bool // True if resource deletion was ignored for a prior update 52 53 // Common watch state for all watchers of this resource. 54 wTimer *time.Timer // Expiry timer 55 wState watchState // State of the watch 56 } 57 58 // authority wraps all state associated with a single management server. It 59 // contains the transport used to communicate with the management server and a 60 // cache of resource state for resources requested from the management server. 61 // 62 // Bootstrap configuration could contain multiple entries in the authorities map 63 // that share the same server config (server address and credentials to use). We 64 // share the same authority instance amongst these entries, and the reference 65 // counting is taken care of by the `clientImpl` type. 66 type authority struct { 67 serverCfg *bootstrap.ServerConfig // Server config for this authority 68 bootstrapCfg *bootstrap.Config // Full bootstrap configuration 69 refCount int // Reference count of watches referring to this authority 70 serializer *grpcsync.CallbackSerializer // Callback serializer for invoking watch callbacks 71 resourceTypeGetter func(string) xdsresource.Type // ResourceType registry lookup 72 transport *transport.Transport // Underlying xDS transport to the management server 73 watchExpiryTimeout time.Duration // Resource watch expiry timeout 74 logger *grpclog.PrefixLogger 75 76 // A two level map containing the state of all the resources being watched. 77 // 78 // The first level map key is the ResourceType (Listener, Route etc). This 79 // allows us to have a single map for all resources instead of having per 80 // resource-type maps. 81 // 82 // The second level map key is the resource name, with the value being the 83 // actual state of the resource. 84 resourcesMu sync.Mutex 85 resources map[xdsresource.Type]map[string]*resourceState 86 closed bool 87 } 88 89 // authorityArgs is a convenience struct to wrap arguments required to create a 90 // new authority. All fields here correspond directly to appropriate fields 91 // stored in the authority struct. 92 type authorityArgs struct { 93 // The reason for passing server config and bootstrap config separately 94 // (although the former is part of the latter) is because authorities in the 95 // bootstrap config might contain an empty server config, and in this case, 96 // the top-level server config is to be used. 97 // 98 // There are two code paths from where a new authority struct might be 99 // created. One is when a watch is registered for a resource, and one is 100 // when load reporting needs to be started. We have the authority name in 101 // the first case, but do in the second. We only have the server config in 102 // the second case. 103 serverCfg *bootstrap.ServerConfig 104 bootstrapCfg *bootstrap.Config 105 serializer *grpcsync.CallbackSerializer 106 resourceTypeGetter func(string) xdsresource.Type 107 watchExpiryTimeout time.Duration 108 logger *grpclog.PrefixLogger 109 } 110 111 func newAuthority(args authorityArgs) (*authority, error) { 112 ret := &authority{ 113 serverCfg: args.serverCfg, 114 bootstrapCfg: args.bootstrapCfg, 115 serializer: args.serializer, 116 resourceTypeGetter: args.resourceTypeGetter, 117 watchExpiryTimeout: args.watchExpiryTimeout, 118 logger: args.logger, 119 resources: make(map[xdsresource.Type]map[string]*resourceState), 120 } 121 122 tr, err := transport.New(transport.Options{ 123 ServerCfg: *args.serverCfg, 124 OnRecvHandler: ret.handleResourceUpdate, 125 OnErrorHandler: ret.newConnectionError, 126 OnSendHandler: ret.transportOnSendHandler, 127 Logger: args.logger, 128 NodeProto: args.bootstrapCfg.NodeProto, 129 }) 130 if err != nil { 131 return nil, fmt.Errorf("creating new transport to %q: %v", args.serverCfg, err) 132 } 133 ret.transport = tr 134 return ret, nil 135 } 136 137 // transportOnSendHandler is called by the underlying transport when it sends a 138 // resource request successfully. Timers are activated for resources waiting for 139 // a response. 140 func (a *authority) transportOnSendHandler(u *transport.ResourceSendInfo) { 141 rType := a.resourceTypeGetter(u.URL) 142 // Resource type not found is not expected under normal circumstances, since 143 // the resource type url passed to the transport is determined by the authority. 144 if rType == nil { 145 a.logger.Warningf("Unknown resource type url: %s.", u.URL) 146 return 147 } 148 a.resourcesMu.Lock() 149 defer a.resourcesMu.Unlock() 150 a.startWatchTimersLocked(rType, u.ResourceNames) 151 } 152 153 func (a *authority) handleResourceUpdate(resourceUpdate transport.ResourceUpdate) error { 154 rType := a.resourceTypeGetter(resourceUpdate.URL) 155 if rType == nil { 156 return xdsresource.NewErrorf(xdsresource.ErrorTypeResourceTypeUnsupported, "Resource URL %v unknown in response from server", resourceUpdate.URL) 157 } 158 159 opts := &xdsresource.DecodeOptions{BootstrapConfig: a.bootstrapCfg} 160 updates, md, err := decodeAllResources(opts, rType, resourceUpdate) 161 a.updateResourceStateAndScheduleCallbacks(rType, updates, md) 162 return err 163 } 164 165 func (a *authority) updateResourceStateAndScheduleCallbacks(rType xdsresource.Type, updates map[string]resourceDataErrTuple, md xdsresource.UpdateMetadata) { 166 a.resourcesMu.Lock() 167 defer a.resourcesMu.Unlock() 168 169 resourceStates := a.resources[rType] 170 for name, uErr := range updates { 171 if state, ok := resourceStates[name]; ok { 172 // Cancel the expiry timer associated with the resource once a 173 // response is received, irrespective of whether the update is a 174 // good one or not. 175 // 176 // We check for watch states `started` and `requested` here to 177 // accommodate for a race which can happen in the following 178 // scenario: 179 // - When a watch is registered, it is possible that the ADS stream 180 // is not yet created. In this case, the request for the resource 181 // is not sent out immediately. An entry in the `resourceStates` 182 // map is created with a watch state of `started`. 183 // - Once the stream is created, it is possible that the management 184 // server might respond with the requested resource before we send 185 // out request for the same. If we don't check for `started` here, 186 // and move the state to `received`, we will end up starting the 187 // timer when the request gets sent out. And since the mangement 188 // server already sent us the resource, there is a good chance 189 // that it will not send it again. This would eventually lead to 190 // the timer firing, even though we have the resource in the 191 // cache. 192 if state.wState == watchStateStarted || state.wState == watchStateRequested { 193 // It is OK to ignore the return value from Stop() here because 194 // if the timer has already fired, it means that the timer watch 195 // expiry callback is blocked on the same lock that we currently 196 // hold. Since we move the state to `received` here, the timer 197 // callback will be a no-op. 198 if state.wTimer != nil { 199 state.wTimer.Stop() 200 } 201 state.wState = watchStateReceived 202 } 203 204 if uErr.err != nil { 205 // On error, keep previous version of the resource. But update 206 // status and error. 207 state.md.ErrState = md.ErrState 208 state.md.Status = md.Status 209 for watcher := range state.watchers { 210 watcher := watcher 211 err := uErr.err 212 a.serializer.Schedule(func(context.Context) { watcher.OnError(err) }) 213 } 214 continue 215 } 216 217 if state.deletionIgnored { 218 state.deletionIgnored = false 219 a.logger.Infof("A valid update was received for resource %q of type %q after previously ignoring a deletion", name, rType.TypeName()) 220 } 221 // Notify watchers only if this is a first time update or it is different 222 // from the one currently cached. 223 if state.cache == nil || !state.cache.Equal(uErr.resource) { 224 for watcher := range state.watchers { 225 watcher := watcher 226 resource := uErr.resource 227 a.serializer.Schedule(func(context.Context) { watcher.OnUpdate(resource) }) 228 } 229 } 230 // Sync cache. 231 a.logger.Debugf("Resource type %q with name %q added to cache", rType.TypeName(), name) 232 state.cache = uErr.resource 233 // Set status to ACK, and clear error state. The metadata might be a 234 // NACK metadata because some other resources in the same response 235 // are invalid. 236 state.md = md 237 state.md.ErrState = nil 238 state.md.Status = xdsresource.ServiceStatusACKed 239 if md.ErrState != nil { 240 state.md.Version = md.ErrState.Version 241 } 242 } 243 } 244 245 // If this resource type requires that all resources be present in every 246 // SotW response from the server, a response that does not include a 247 // previously seen resource will be interpreted as a deletion of that 248 // resource unless ignore_resource_deletion option was set in the server 249 // config. 250 if !rType.AllResourcesRequiredInSotW() { 251 return 252 } 253 for name, state := range resourceStates { 254 if state.cache == nil { 255 // If the resource state does not contain a cached update, which can 256 // happen when: 257 // - resource was newly requested but has not yet been received, or, 258 // - resource was removed as part of a previous update, 259 // we don't want to generate an error for the watchers. 260 // 261 // For the first of the above two conditions, this ADS response may 262 // be in reaction to an earlier request that did not yet request the 263 // new resource, so its absence from the response does not 264 // necessarily indicate that the resource does not exist. For that 265 // case, we rely on the request timeout instead. 266 // 267 // For the second of the above two conditions, we already generated 268 // an error when we received the first response which removed this 269 // resource. So, there is no need to generate another one. 270 continue 271 } 272 if _, ok := updates[name]; !ok { 273 // The metadata status is set to "ServiceStatusNotExist" if a 274 // previous update deleted this resource, in which case we do not 275 // want to repeatedly call the watch callbacks with a 276 // "resource-not-found" error. 277 if state.md.Status == xdsresource.ServiceStatusNotExist { 278 continue 279 } 280 // Per A53, resource deletions are ignored if the `ignore_resource_deletion` 281 // server feature is enabled through the bootstrap configuration. If the 282 // resource deletion is to be ignored, the resource is not removed from 283 // the cache and the corresponding OnResourceDoesNotExist() callback is 284 // not invoked on the watchers. 285 if a.serverCfg.IgnoreResourceDeletion { 286 if !state.deletionIgnored { 287 state.deletionIgnored = true 288 a.logger.Warningf("Ignoring resource deletion for resource %q of type %q", name, rType.TypeName()) 289 } 290 continue 291 } 292 // If resource exists in cache, but not in the new update, delete 293 // the resource from cache, and also send a resource not found error 294 // to indicate resource removed. Metadata for the resource is still 295 // maintained, as this is required by CSDS. 296 state.cache = nil 297 state.md = xdsresource.UpdateMetadata{Status: xdsresource.ServiceStatusNotExist} 298 for watcher := range state.watchers { 299 watcher := watcher 300 a.serializer.Schedule(func(context.Context) { watcher.OnResourceDoesNotExist() }) 301 } 302 } 303 } 304 } 305 306 type resourceDataErrTuple struct { 307 resource xdsresource.ResourceData 308 err error 309 } 310 311 func decodeAllResources(opts *xdsresource.DecodeOptions, rType xdsresource.Type, update transport.ResourceUpdate) (map[string]resourceDataErrTuple, xdsresource.UpdateMetadata, error) { 312 timestamp := time.Now() 313 md := xdsresource.UpdateMetadata{ 314 Version: update.Version, 315 Timestamp: timestamp, 316 } 317 318 topLevelErrors := make([]error, 0) // Tracks deserialization errors, where we don't have a resource name. 319 perResourceErrors := make(map[string]error) // Tracks resource validation errors, where we have a resource name. 320 ret := make(map[string]resourceDataErrTuple) // Return result, a map from resource name to either resource data or error. 321 for _, r := range update.Resources { 322 result, err := rType.Decode(opts, r) 323 324 // Name field of the result is left unpopulated only when resource 325 // deserialization fails. 326 name := "" 327 if result != nil { 328 name = xdsresource.ParseName(result.Name).String() 329 } 330 if err == nil { 331 ret[name] = resourceDataErrTuple{resource: result.Resource} 332 continue 333 } 334 if name == "" { 335 topLevelErrors = append(topLevelErrors, err) 336 continue 337 } 338 perResourceErrors[name] = err 339 // Add place holder in the map so we know this resource name was in 340 // the response. 341 ret[name] = resourceDataErrTuple{err: err} 342 } 343 344 if len(topLevelErrors) == 0 && len(perResourceErrors) == 0 { 345 md.Status = xdsresource.ServiceStatusACKed 346 return ret, md, nil 347 } 348 349 md.Status = xdsresource.ServiceStatusNACKed 350 errRet := combineErrors(rType.TypeName(), topLevelErrors, perResourceErrors) 351 md.ErrState = &xdsresource.UpdateErrorMetadata{ 352 Version: update.Version, 353 Err: errRet, 354 Timestamp: timestamp, 355 } 356 return ret, md, errRet 357 } 358 359 // startWatchTimersLocked is invoked upon transport.OnSend() callback with resources 360 // requested on the underlying ADS stream. This satisfies the conditions to start 361 // watch timers per A57 [https://github.com/grpc/proposal/blob/master/A57-xds-client-failure-mode-behavior.md#handling-resources-that-do-not-exist] 362 // 363 // Caller must hold a.resourcesMu. 364 func (a *authority) startWatchTimersLocked(rType xdsresource.Type, resourceNames []string) { 365 resourceStates := a.resources[rType] 366 for _, resourceName := range resourceNames { 367 if state, ok := resourceStates[resourceName]; ok { 368 if state.wState != watchStateStarted { 369 continue 370 } 371 state.wTimer = time.AfterFunc(a.watchExpiryTimeout, func() { 372 a.handleWatchTimerExpiry(rType, resourceName, state) 373 }) 374 state.wState = watchStateRequested 375 } 376 } 377 } 378 379 // stopWatchTimersLocked is invoked upon connection errors to stops watch timers 380 // for resources that have been requested, but not yet responded to by the management 381 // server. 382 // 383 // Caller must hold a.resourcesMu. 384 func (a *authority) stopWatchTimersLocked() { 385 for _, rType := range a.resources { 386 for resourceName, state := range rType { 387 if state.wState != watchStateRequested { 388 continue 389 } 390 if !state.wTimer.Stop() { 391 // If the timer has already fired, it means that the timer watch expiry 392 // callback is blocked on the same lock that we currently hold. Don't change 393 // the watch state and instead let the watch expiry callback handle it. 394 a.logger.Warningf("Watch timer for resource %v already fired. Ignoring here.", resourceName) 395 continue 396 } 397 state.wTimer = nil 398 state.wState = watchStateStarted 399 } 400 } 401 } 402 403 // newConnectionError is called by the underlying transport when it receives a 404 // connection error. The error will be forwarded to all the resource watchers. 405 func (a *authority) newConnectionError(err error) { 406 a.resourcesMu.Lock() 407 defer a.resourcesMu.Unlock() 408 409 a.stopWatchTimersLocked() 410 411 // We do not consider it an error if the ADS stream was closed after having received 412 // a response on the stream. This is because there are legitimate reasons why the server 413 // may need to close the stream during normal operations, such as needing to rebalance 414 // load or the underlying connection hitting its max connection age limit. 415 // See gRFC A57 for more details. 416 if xdsresource.ErrType(err) == xdsresource.ErrTypeStreamFailedAfterRecv { 417 a.logger.Warningf("Watchers not notified since ADS stream failed after having received at least one response: %v", err) 418 return 419 } 420 421 for _, rType := range a.resources { 422 for _, state := range rType { 423 // Propagate the connection error from the transport layer to all watchers. 424 for watcher := range state.watchers { 425 watcher := watcher 426 a.serializer.Schedule(func(context.Context) { 427 watcher.OnError(xdsresource.NewErrorf(xdsresource.ErrorTypeConnection, "xds: error received from xDS stream: %v", err)) 428 }) 429 } 430 } 431 } 432 } 433 434 // Increments the reference count. Caller must hold parent's authorityMu. 435 func (a *authority) refLocked() { 436 a.refCount++ 437 } 438 439 // Decrements the reference count. Caller must hold parent's authorityMu. 440 func (a *authority) unrefLocked() int { 441 a.refCount-- 442 return a.refCount 443 } 444 445 func (a *authority) close() { 446 a.transport.Close() 447 448 a.resourcesMu.Lock() 449 a.closed = true 450 a.resourcesMu.Unlock() 451 } 452 453 func (a *authority) watchResource(rType xdsresource.Type, resourceName string, watcher xdsresource.ResourceWatcher) func() { 454 a.logger.Debugf("New watch for type %q, resource name %q", rType.TypeName(), resourceName) 455 a.resourcesMu.Lock() 456 defer a.resourcesMu.Unlock() 457 458 // Lookup the ResourceType specific resources from the top-level map. If 459 // there is no entry for this ResourceType, create one. 460 resources := a.resources[rType] 461 if resources == nil { 462 resources = make(map[string]*resourceState) 463 a.resources[rType] = resources 464 } 465 466 // Lookup the resourceState for the particular resource that the watch is 467 // being registered for. If this is the first watch for this resource, 468 // instruct the transport layer to send a DiscoveryRequest for the same. 469 state := resources[resourceName] 470 if state == nil { 471 a.logger.Debugf("First watch for type %q, resource name %q", rType.TypeName(), resourceName) 472 state = &resourceState{ 473 watchers: make(map[xdsresource.ResourceWatcher]bool), 474 md: xdsresource.UpdateMetadata{Status: xdsresource.ServiceStatusRequested}, 475 wState: watchStateStarted, 476 } 477 resources[resourceName] = state 478 a.sendDiscoveryRequestLocked(rType, resources) 479 } 480 // Always add the new watcher to the set of watchers. 481 state.watchers[watcher] = true 482 483 // If we have a cached copy of the resource, notify the new watcher. 484 if state.cache != nil { 485 a.logger.Debugf("Resource type %q with resource name %q found in cache: %s", rType.TypeName(), resourceName, state.cache.ToJSON()) 486 resource := state.cache 487 a.serializer.Schedule(func(context.Context) { watcher.OnUpdate(resource) }) 488 } 489 490 return func() { 491 a.resourcesMu.Lock() 492 defer a.resourcesMu.Unlock() 493 494 // We already have a reference to the resourceState for this particular 495 // resource. Avoid indexing into the two-level map to figure this out. 496 497 // Delete this particular watcher from the list of watchers, so that its 498 // callback will not be invoked in the future. 499 state.wState = watchStateCanceled 500 delete(state.watchers, watcher) 501 if len(state.watchers) > 0 { 502 return 503 } 504 505 // There are no more watchers for this resource, delete the state 506 // associated with it, and instruct the transport to send a request 507 // which does not include this resource name. 508 a.logger.Debugf("Removing last watch for type %q, resource name %q", rType.TypeName(), resourceName) 509 delete(resources, resourceName) 510 a.sendDiscoveryRequestLocked(rType, resources) 511 } 512 } 513 514 func (a *authority) handleWatchTimerExpiry(rType xdsresource.Type, resourceName string, state *resourceState) { 515 a.resourcesMu.Lock() 516 defer a.resourcesMu.Unlock() 517 518 if a.closed { 519 return 520 } 521 a.logger.Warningf("Watch for resource %q of type %s timed out", resourceName, rType.TypeName()) 522 523 switch state.wState { 524 case watchStateRequested: 525 // This is the only state where we need to handle the timer expiry by 526 // invoking appropriate watch callbacks. This is handled outside the switch. 527 case watchStateCanceled: 528 return 529 default: 530 a.logger.Warningf("Unexpected watch state %q for resource %q.", state.wState, resourceName) 531 return 532 } 533 534 state.wState = watchStateTimeout 535 // With the watch timer firing, it is safe to assume that the resource does 536 // not exist on the management server. 537 state.cache = nil 538 state.md = xdsresource.UpdateMetadata{Status: xdsresource.ServiceStatusNotExist} 539 for watcher := range state.watchers { 540 watcher := watcher 541 a.serializer.Schedule(func(context.Context) { watcher.OnResourceDoesNotExist() }) 542 } 543 } 544 545 func (a *authority) triggerResourceNotFoundForTesting(rType xdsresource.Type, resourceName string) { 546 a.resourcesMu.Lock() 547 defer a.resourcesMu.Unlock() 548 549 if a.closed { 550 return 551 } 552 resourceStates := a.resources[rType] 553 state, ok := resourceStates[resourceName] 554 if !ok { 555 return 556 } 557 // if watchStateTimeout already triggered resource not found above from 558 // normal watch expiry. 559 if state.wState == watchStateCanceled || state.wState == watchStateTimeout { 560 return 561 } 562 state.wState = watchStateTimeout 563 state.cache = nil 564 state.md = xdsresource.UpdateMetadata{Status: xdsresource.ServiceStatusNotExist} 565 for watcher := range state.watchers { 566 watcher := watcher 567 a.serializer.Schedule(func(context.Context) { watcher.OnResourceDoesNotExist() }) 568 } 569 } 570 571 // sendDiscoveryRequestLocked sends a discovery request for the specified 572 // resource type and resource names. Even though this method does not directly 573 // access the resource cache, it is important that `resourcesMu` be beld when 574 // calling this method to ensure that a consistent snapshot of resource names is 575 // being requested. 576 func (a *authority) sendDiscoveryRequestLocked(rType xdsresource.Type, resources map[string]*resourceState) { 577 resourcesToRequest := make([]string, len(resources)) 578 i := 0 579 for name := range resources { 580 resourcesToRequest[i] = name 581 i++ 582 } 583 a.transport.SendRequest(rType.TypeURL(), resourcesToRequest) 584 } 585 586 func (a *authority) reportLoad() (*load.Store, func()) { 587 return a.transport.ReportLoad() 588 } 589 590 func (a *authority) dumpResources() map[string]map[string]xdsresource.UpdateWithMD { 591 a.resourcesMu.Lock() 592 defer a.resourcesMu.Unlock() 593 594 dump := make(map[string]map[string]xdsresource.UpdateWithMD) 595 for rType, resourceStates := range a.resources { 596 states := make(map[string]xdsresource.UpdateWithMD) 597 for name, state := range resourceStates { 598 var raw *anypb.Any 599 if state.cache != nil { 600 raw = state.cache.Raw() 601 } 602 states[name] = xdsresource.UpdateWithMD{ 603 MD: state.md, 604 Raw: raw, 605 } 606 } 607 dump[rType.TypeURL()] = states 608 } 609 return dump 610 } 611 612 func combineErrors(rType string, topLevelErrors []error, perResourceErrors map[string]error) error { 613 var errStrB strings.Builder 614 errStrB.WriteString(fmt.Sprintf("error parsing %q response: ", rType)) 615 if len(topLevelErrors) > 0 { 616 errStrB.WriteString("top level errors: ") 617 for i, err := range topLevelErrors { 618 if i != 0 { 619 errStrB.WriteString(";\n") 620 } 621 errStrB.WriteString(err.Error()) 622 } 623 } 624 if len(perResourceErrors) > 0 { 625 var i int 626 for name, err := range perResourceErrors { 627 if i != 0 { 628 errStrB.WriteString(";\n") 629 } 630 i++ 631 errStrB.WriteString(fmt.Sprintf("resource %q: %v", name, err.Error())) 632 } 633 } 634 return errors.New(errStrB.String()) 635 }