/*
 *
 * Copyright 2020 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Package clusterimpl implements the xds_cluster_impl balancing policy. It
// handles the cluster features (e.g. circuit_breaking, RPC dropping).
//
// Note that it doesn't handle name resolution, which is done by policy
// xds_cluster_resolver.
package clusterimpl

import (
	"context"
	"encoding/json"
	"fmt"
	"sync"
	"sync/atomic"
	"time"

	"google.golang.org/grpc/balancer"
	"google.golang.org/grpc/connectivity"
	"google.golang.org/grpc/internal"
	"google.golang.org/grpc/internal/balancer/gracefulswitch"
	"google.golang.org/grpc/internal/grpclog"
	"google.golang.org/grpc/internal/pretty"
	"google.golang.org/grpc/internal/xds"
	"google.golang.org/grpc/internal/xds/bootstrap"
	"google.golang.org/grpc/resolver"
	"google.golang.org/grpc/serviceconfig"
	xdsinternal "google.golang.org/grpc/xds/internal"
	"google.golang.org/grpc/xds/internal/balancer/loadstore"
	"google.golang.org/grpc/xds/internal/clients"
	"google.golang.org/grpc/xds/internal/clients/lrsclient"
	"google.golang.org/grpc/xds/internal/xdsclient"
)

const (
	// Name is the name of the cluster_impl balancer.
	Name = "xds_cluster_impl_experimental"
	// defaultRequestCountMax is the maximum number of concurrent requests
	// used when the LB config does not specify MaxConcurrentRequests.
	defaultRequestCountMax = 1024
	// loadStoreStopTimeout bounds how long Close/updateLoadStore wait for the
	// load reporting stream to stop.
	loadStoreStopTimeout = 1 * time.Second
)

var (
	// connectedAddress extracts the connected address from a SubConn state
	// update, via the internal package's injected accessor.
	connectedAddress = internal.ConnectedAddress.(func(balancer.SubConnState) resolver.Address)
	// Below function is no-op in actual code, but can be overridden in
	// tests to give tests visibility into exactly when certain events happen.
	clientConnUpdateHook = func() {}
	pickerUpdateHook     = func() {}
)

func init() {
	balancer.Register(bb{})
}

// bb is the balancer builder for the cluster_impl policy.
type bb struct{}

// Build creates a new clusterImplBalancer wrapping the given ClientConn, with
// a graceful-switch child balancer and the default request-count limit.
func (bb) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer {
	b := &clusterImplBalancer{
		ClientConn:      cc,
		loadWrapper:     loadstore.NewWrapper(),
		requestCountMax: defaultRequestCountMax,
	}
	b.logger = prefixLogger(b)
	b.child = gracefulswitch.NewBalancer(b, bOpts)
	b.logger.Infof("Created")
	return b
}

// Name returns the registered name of this balancer.
func (bb) Name() string {
	return Name
}

// ParseConfig parses the JSON LB policy config into an LBConfig.
func (bb) ParseConfig(c json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
	return parseConfig(c)
}

// clusterImplBalancer implements the xds_cluster_impl LB policy. It embeds the
// parent ClientConn and intercepts SubConn creation and picker updates to add
// drop, circuit-breaking, and load-reporting behavior.
type clusterImplBalancer struct {
	balancer.ClientConn

	// The following fields are set at creation time, and are read-only after
	// that, and therefore need not be protected by a mutex.
	logger *grpclog.PrefixLogger
	// TODO: #8366 - Refactor usage of loadWrapper to easily plugin a test
	// load reporter from tests.
	loadWrapper *loadstore.Wrapper

	// The following fields are only accessed from balancer API methods, which
	// are guaranteed to be called serially by gRPC.
	xdsClient        xdsclient.XDSClient     // Sent down in ResolverState attributes.
	cancelLoadReport func(context.Context)   // To stop reporting load through the above xDS client.
	edsServiceName   string                  // EDS service name to report load for.
	lrsServer        *bootstrap.ServerConfig // Load reporting server configuration.
	dropCategories   []DropConfig            // The categories for drops.
	child            *gracefulswitch.Balancer

	// The following fields are protected by mu, since they are accessed in
	// balancer API methods and in methods called from the child policy.
	mu                    sync.Mutex
	clusterName           string         // The cluster name for credentials handshaking.
	inhibitPickerUpdates  bool           // Inhibits state updates from child policy when processing an update from the parent.
	pendingPickerUpdates  bool           // True if a picker update from the child policy was inhibited when processing an update from the parent.
	childState            balancer.State // Most recent state update from the child policy.
	drops                 []*dropper     // Drops implementation.
	requestCounterCluster string         // The cluster name for the request counter, from LB config.
	requestCounterService string         // The service name for the request counter, from LB config.
	requestCountMax       uint32         // Max concurrent requests, from LB config.
	requestCounter        *xdsclient.ClusterRequestsCounter // Tracks total inflight requests for a given service.
	telemetryLabels       map[string]string                  // Telemetry labels to set on picks, from LB config.
}

// handleDropAndRequestCountLocked compares drop and request counter in newConfig with
// the one currently used by picker, and is protected by b.mu. It returns a boolean
// indicating if a new picker needs to be generated.
129 func (b *clusterImplBalancer) handleDropAndRequestCountLocked(newConfig *LBConfig) bool { 130 var updatePicker bool 131 if !equalDropCategories(b.dropCategories, newConfig.DropCategories) { 132 b.dropCategories = newConfig.DropCategories 133 b.drops = make([]*dropper, 0, len(newConfig.DropCategories)) 134 for _, c := range newConfig.DropCategories { 135 b.drops = append(b.drops, newDropper(c)) 136 } 137 updatePicker = true 138 } 139 140 if b.requestCounterCluster != newConfig.Cluster || b.requestCounterService != newConfig.EDSServiceName { 141 b.requestCounterCluster = newConfig.Cluster 142 b.requestCounterService = newConfig.EDSServiceName 143 b.requestCounter = xdsclient.GetClusterRequestsCounter(newConfig.Cluster, newConfig.EDSServiceName) 144 updatePicker = true 145 } 146 var newRequestCountMax uint32 = 1024 147 if newConfig.MaxConcurrentRequests != nil { 148 newRequestCountMax = *newConfig.MaxConcurrentRequests 149 } 150 if b.requestCountMax != newRequestCountMax { 151 b.requestCountMax = newRequestCountMax 152 updatePicker = true 153 } 154 155 return updatePicker 156 } 157 158 func (b *clusterImplBalancer) newPickerLocked() *picker { 159 return &picker{ 160 drops: b.drops, 161 s: b.childState, 162 loadStore: b.loadWrapper, 163 counter: b.requestCounter, 164 countMax: b.requestCountMax, 165 telemetryLabels: b.telemetryLabels, 166 } 167 } 168 169 // updateLoadStore checks the config for load store, and decides whether it 170 // needs to restart the load reporting stream. 171 func (b *clusterImplBalancer) updateLoadStore(newConfig *LBConfig) error { 172 var updateLoadClusterAndService bool 173 174 // ClusterName is different, restart. ClusterName is from ClusterName and 175 // EDSServiceName. 
176 clusterName := b.getClusterName() 177 if clusterName != newConfig.Cluster { 178 updateLoadClusterAndService = true 179 b.setClusterName(newConfig.Cluster) 180 clusterName = newConfig.Cluster 181 } 182 if b.edsServiceName != newConfig.EDSServiceName { 183 updateLoadClusterAndService = true 184 b.edsServiceName = newConfig.EDSServiceName 185 } 186 if updateLoadClusterAndService { 187 // This updates the clusterName and serviceName that will be reported 188 // for the loads. The update here is too early, the perfect timing is 189 // when the picker is updated with the new connection. But from this 190 // balancer's point of view, it's impossible to tell. 191 // 192 // On the other hand, this will almost never happen. Each LRS policy 193 // shouldn't get updated config. The parent should do a graceful switch 194 // when the clusterName or serviceName is changed. 195 b.loadWrapper.UpdateClusterAndService(clusterName, b.edsServiceName) 196 } 197 198 var ( 199 stopOldLoadReport bool 200 startNewLoadReport bool 201 ) 202 203 // Check if it's necessary to restart load report. 204 if b.lrsServer == nil { 205 if newConfig.LoadReportingServer != nil { 206 // Old is nil, new is not nil, start new LRS. 207 b.lrsServer = newConfig.LoadReportingServer 208 startNewLoadReport = true 209 } 210 // Old is nil, new is nil, do nothing. 211 } else if newConfig.LoadReportingServer == nil { 212 // Old is not nil, new is nil, stop old, don't start new. 213 b.lrsServer = newConfig.LoadReportingServer 214 stopOldLoadReport = true 215 } else { 216 // Old is not nil, new is not nil, compare string values, if 217 // different, stop old and start new. 
218 if !b.lrsServer.Equal(newConfig.LoadReportingServer) { 219 b.lrsServer = newConfig.LoadReportingServer 220 stopOldLoadReport = true 221 startNewLoadReport = true 222 } 223 } 224 225 if stopOldLoadReport { 226 if b.cancelLoadReport != nil { 227 stopCtx, stopCancel := context.WithTimeout(context.Background(), loadStoreStopTimeout) 228 defer stopCancel() 229 b.cancelLoadReport(stopCtx) 230 b.cancelLoadReport = nil 231 if !startNewLoadReport { 232 // If a new LRS stream will be started later, no need to update 233 // it to nil here. 234 b.loadWrapper.UpdateLoadStore(nil) 235 } 236 } 237 } 238 if startNewLoadReport { 239 var loadStore *lrsclient.LoadStore 240 if b.xdsClient != nil { 241 loadStore, b.cancelLoadReport = b.xdsClient.ReportLoad(b.lrsServer) 242 } 243 b.loadWrapper.UpdateLoadStore(loadStore) 244 } 245 246 return nil 247 } 248 249 func (b *clusterImplBalancer) UpdateClientConnState(s balancer.ClientConnState) error { 250 defer clientConnUpdateHook() 251 252 b.mu.Lock() 253 b.inhibitPickerUpdates = true 254 b.mu.Unlock() 255 if b.logger.V(2) { 256 b.logger.Infof("Received configuration: %s", pretty.ToJSON(s.BalancerConfig)) 257 } 258 newConfig, ok := s.BalancerConfig.(*LBConfig) 259 if !ok { 260 return fmt.Errorf("unexpected balancer config with type: %T", s.BalancerConfig) 261 } 262 263 // Need to check for potential errors at the beginning of this function, so 264 // that on errors, we reject the whole config, instead of applying part of 265 // it. 266 bb := balancer.Get(newConfig.ChildPolicy.Name) 267 if bb == nil { 268 return fmt.Errorf("child policy %q not registered", newConfig.ChildPolicy.Name) 269 } 270 271 if b.xdsClient == nil { 272 c := xdsclient.FromResolverState(s.ResolverState) 273 if c == nil { 274 return balancer.ErrBadResolverState 275 } 276 b.xdsClient = c 277 } 278 279 // Update load reporting config. 
This needs to be done before updating the 280 // child policy because we need the loadStore from the updated client to be 281 // passed to the ccWrapper, so that the next picker from the child policy 282 // will pick up the new loadStore. 283 if err := b.updateLoadStore(newConfig); err != nil { 284 return err 285 } 286 287 // Build config for the gracefulswitch balancer. It is safe to ignore JSON 288 // marshaling errors here, since the config was already validated as part of 289 // ParseConfig(). 290 cfg := []map[string]any{{newConfig.ChildPolicy.Name: newConfig.ChildPolicy.Config}} 291 cfgJSON, _ := json.Marshal(cfg) 292 parsedCfg, err := gracefulswitch.ParseConfig(cfgJSON) 293 if err != nil { 294 return err 295 } 296 297 // Addresses and sub-balancer config are sent to sub-balancer. 298 err = b.child.UpdateClientConnState(balancer.ClientConnState{ 299 ResolverState: s.ResolverState, 300 BalancerConfig: parsedCfg, 301 }) 302 303 b.mu.Lock() 304 b.telemetryLabels = newConfig.TelemetryLabels 305 // We want to send a picker update to the parent if one of the two 306 // conditions are met: 307 // - drop/request config has changed *and* there is already a picker from 308 // the child, or 309 // - there is a pending picker update from the child (and this covers the 310 // case where the drop/request config has not changed, but the child sent 311 // a picker update while we were still processing config from our parent). 
312 if (b.handleDropAndRequestCountLocked(newConfig) && b.childState.Picker != nil) || b.pendingPickerUpdates { 313 b.pendingPickerUpdates = false 314 b.ClientConn.UpdateState(balancer.State{ 315 ConnectivityState: b.childState.ConnectivityState, 316 Picker: b.newPickerLocked(), 317 }) 318 } 319 b.inhibitPickerUpdates = false 320 b.mu.Unlock() 321 pickerUpdateHook() 322 return err 323 } 324 325 func (b *clusterImplBalancer) ResolverError(err error) { 326 b.child.ResolverError(err) 327 } 328 329 func (b *clusterImplBalancer) updateSubConnState(_ balancer.SubConn, s balancer.SubConnState, cb func(balancer.SubConnState)) { 330 // Trigger re-resolution when a SubConn turns transient failure. This is 331 // necessary for the LogicalDNS in cluster_resolver policy to re-resolve. 332 // 333 // Note that this happens not only for the addresses from DNS, but also for 334 // EDS (cluster_impl doesn't know if it's DNS or EDS, only the parent 335 // knows). The parent priority policy is configured to ignore re-resolution 336 // signal from the EDS children. 337 if s.ConnectivityState == connectivity.TransientFailure { 338 b.ClientConn.ResolveNow(resolver.ResolveNowOptions{}) 339 } 340 341 if cb != nil { 342 cb(s) 343 } 344 } 345 346 func (b *clusterImplBalancer) UpdateSubConnState(sc balancer.SubConn, s balancer.SubConnState) { 347 b.logger.Errorf("UpdateSubConnState(%v, %+v) called unexpectedly", sc, s) 348 } 349 350 func (b *clusterImplBalancer) Close() { 351 b.child.Close() 352 b.childState = balancer.State{} 353 354 if b.cancelLoadReport != nil { 355 stopCtx, stopCancel := context.WithTimeout(context.Background(), loadStoreStopTimeout) 356 defer stopCancel() 357 b.cancelLoadReport(stopCtx) 358 b.cancelLoadReport = nil 359 } 360 b.logger.Infof("Shutdown") 361 } 362 363 func (b *clusterImplBalancer) ExitIdle() { 364 b.child.ExitIdle() 365 } 366 367 // Override methods to accept updates from the child LB. 
// UpdateState receives a new state (connectivity state + picker) from the
// child policy. It either forwards a wrapped picker to the parent, or — if an
// update from the parent is currently being processed — records that a picker
// update is pending so UpdateClientConnState can flush it.
func (b *clusterImplBalancer) UpdateState(state balancer.State) {
	b.mu.Lock()
	defer b.mu.Unlock()

	// Inhibit sending a picker update to our parent as part of handling new
	// state from the child, if we are currently handling an update from our
	// parent. Update the childState field regardless.
	b.childState = state
	if b.inhibitPickerUpdates {
		b.pendingPickerUpdates = true
		if b.logger.V(2) {
			b.logger.Infof("Received a picker update from the child when processing an update from the parent")
		}
		return
	}

	b.ClientConn.UpdateState(balancer.State{
		ConnectivityState: state.ConnectivityState,
		Picker:            b.newPickerLocked(),
	})
	pickerUpdateHook()
}

// setClusterName updates the cluster name used for credentials handshaking,
// under b.mu.
func (b *clusterImplBalancer) setClusterName(n string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.clusterName = n
}

// getClusterName returns the cluster name used for credentials handshaking,
// under b.mu.
func (b *clusterImplBalancer) getClusterName() string {
	b.mu.Lock()
	defer b.mu.Unlock()
	return b.clusterName
}

// scWrapper is a wrapper of SubConn with locality ID. The locality ID can be
// retrieved from the addresses when creating SubConn.
//
// All SubConns passed to the child policies are wrapped in this, so that the
// picker can get the localityID from the picked SubConn, and do load reporting.
//
// After wrapping, all SubConns to and from the parent ClientConn (e.g. for
// SubConn state update, update/remove SubConn) must be the original SubConns.
// All SubConns to and from the child policy (NewSubConn, forwarding SubConn
// state update) must be the wrapper. The balancer keeps a map from the original
// SubConn to the wrapper for this purpose.
type scWrapper struct {
	balancer.SubConn
	// locality needs to be atomic because it can be updated while being read by
	// the picker.
	locality atomic.Pointer[clients.Locality]
}

// updateLocalityID atomically records the locality of this SubConn.
func (scw *scWrapper) updateLocalityID(lID clients.Locality) {
	scw.locality.Store(&lID)
}

// localityID returns the last recorded locality, or the zero Locality if none
// has been set yet.
func (scw *scWrapper) localityID() clients.Locality {
	lID := scw.locality.Load()
	if lID == nil {
		return clients.Locality{}
	}
	return *lID
}

// NewSubConn creates a SubConn on the parent ClientConn with the cluster name
// attached to each address (for credentials handshaking), and returns it
// wrapped in an scWrapper. The installed StateListener records the connected
// address's locality on the wrapper once the SubConn becomes Ready.
func (b *clusterImplBalancer) NewSubConn(addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
	clusterName := b.getClusterName()
	newAddrs := make([]resolver.Address, len(addrs))
	for i, addr := range addrs {
		newAddrs[i] = xds.SetXDSHandshakeClusterName(addr, clusterName)
	}
	// sc is captured by the listener closure below and assigned after the
	// SubConn is created; the listener only fires once creation succeeds.
	var sc balancer.SubConn
	scw := &scWrapper{}
	oldListener := opts.StateListener
	opts.StateListener = func(state balancer.SubConnState) {
		b.updateSubConnState(sc, state, oldListener)
		if state.ConnectivityState != connectivity.Ready {
			return
		}
		// Read connected address and call updateLocalityID() based on the connected
		// address's locality. https://github.com/grpc/grpc-go/issues/7339
		addr := connectedAddress(state)
		lID := xdsinternal.GetLocalityID(addr)
		if (lID == clients.Locality{}) {
			if b.logger.V(2) {
				b.logger.Infof("Locality ID for %s unexpectedly empty", addr)
			}
			return
		}
		scw.updateLocalityID(lID)
	}
	sc, err := b.ClientConn.NewSubConn(newAddrs, opts)
	if err != nil {
		return nil, err
	}
	scw.SubConn = sc
	return scw, nil
}

// RemoveSubConn is unexpected: SubConns are shut down via sc.Shutdown in
// current gRPC versions.
func (b *clusterImplBalancer) RemoveSubConn(sc balancer.SubConn) {
	b.logger.Errorf("RemoveSubConn(%v) called unexpectedly", sc)
}

// UpdateAddresses re-attaches the cluster name to the new addresses, updates
// the wrapper's locality, and forwards the unwrapped SubConn to the parent.
func (b *clusterImplBalancer) UpdateAddresses(sc balancer.SubConn, addrs []resolver.Address) {
	clusterName := b.getClusterName()
	newAddrs := make([]resolver.Address, len(addrs))
	var lID clients.Locality
	for i, addr := range addrs {
		newAddrs[i] = xds.SetXDSHandshakeClusterName(addr, clusterName)
		// NOTE(review): lID ends up as the locality of the last address;
		// presumably all addresses for one SubConn share a locality — confirm.
		lID = xdsinternal.GetLocalityID(newAddrs[i])
	}
	if scw, ok := sc.(*scWrapper); ok {
		scw.updateLocalityID(lID)
		// Need to get the original SubConn from the wrapper before calling
		// parent ClientConn.
		sc = scw.SubConn
	}
	b.ClientConn.UpdateAddresses(sc, newAddrs)
}