google.golang.org/grpc@v1.72.2/xds/internal/balancer/clusterimpl/clusterimpl.go

/*
 *
 * Copyright 2020 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Package clusterimpl implements the xds_cluster_impl balancing policy. It
// handles the cluster features (e.g. circuit_breaking, RPC dropping).
//
// Note that it doesn't handle name resolution, which is done by the
// xds_cluster_resolver policy.
package clusterimpl

import (
	"encoding/json"
	"fmt"
	"sync"
	"sync/atomic"

	"google.golang.org/grpc/balancer"
	"google.golang.org/grpc/connectivity"
	"google.golang.org/grpc/internal"
	"google.golang.org/grpc/internal/balancer/gracefulswitch"
	"google.golang.org/grpc/internal/grpclog"
	"google.golang.org/grpc/internal/pretty"
	"google.golang.org/grpc/internal/xds"
	"google.golang.org/grpc/internal/xds/bootstrap"
	"google.golang.org/grpc/resolver"
	"google.golang.org/grpc/serviceconfig"
	xdsinternal "google.golang.org/grpc/xds/internal"
	"google.golang.org/grpc/xds/internal/balancer/loadstore"
	"google.golang.org/grpc/xds/internal/xdsclient"
	"google.golang.org/grpc/xds/internal/xdsclient/load"
)

const (
	// Name is the name of the cluster_impl balancer.
	Name                   = "xds_cluster_impl_experimental"
	defaultRequestCountMax = 1024
)

var (
	connectedAddress = internal.ConnectedAddress.(func(balancer.SubConnState) resolver.Address)
	// The following hooks are no-ops in production code, but can be overridden
	// in tests to give them visibility into exactly when certain events happen.
	clientConnUpdateHook = func() {}
	pickerUpdateHook     = func() {}
)

func init() {
	balancer.Register(bb{})
}

type bb struct{}

func (bb) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer {
	b := &clusterImplBalancer{
		ClientConn:      cc,
		loadWrapper:     loadstore.NewWrapper(),
		requestCountMax: defaultRequestCountMax,
	}
	b.logger = prefixLogger(b)
	b.child = gracefulswitch.NewBalancer(b, bOpts)
	b.logger.Infof("Created")
	return b
}

func (bb) Name() string {
	return Name
}
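
// ParseConfig parses the JSON configuration for this LB policy. As an
// illustrative, non-authoritative sketch (config.go in this package defines
// the exact field names and types), a config looks roughly like:
//
//	{
//	  "cluster": "cluster_a",
//	  "edsServiceName": "service_a",
//	  "maxConcurrentRequests": 1024,
//	  "dropCategories": [...],
//	  "lrsLoadReportingServer": {...},
//	  "childPolicy": [{"round_robin": {}}]
//	}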
func (bb) ParseConfig(c json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
	return parseConfig(c)
}

type clusterImplBalancer struct {
	balancer.ClientConn

	// The following fields are set at creation time, and are read-only after
	// that, and therefore need not be protected by a mutex.
	logger      *grpclog.PrefixLogger
	loadWrapper *loadstore.Wrapper

	// The following fields are only accessed from balancer API methods, which
	// are guaranteed to be called serially by gRPC.
	xdsClient        xdsclient.XDSClient     // Sent down in ResolverState attributes.
	cancelLoadReport func()                  // To stop reporting load through the above xDS client.
	edsServiceName   string                  // EDS service name to report load for.
	lrsServer        *bootstrap.ServerConfig // Load reporting server configuration.
	dropCategories   []DropConfig            // The categories for drops.
	child            *gracefulswitch.Balancer

	// The following fields are protected by mu, since they are accessed in
	// balancer API methods and in methods called from the child policy.
	mu                    sync.Mutex
	clusterName           string                            // The cluster name for credentials handshaking.
	inhibitPickerUpdates  bool                              // Inhibits state updates from the child policy while processing an update from the parent.
	pendingPickerUpdates  bool                              // True if a picker update from the child policy was inhibited while processing an update from the parent.
	childState            balancer.State                    // Most recent state update from the child policy.
	drops                 []*dropper                        // Drops implementation.
	requestCounterCluster string                            // The cluster name for the request counter, from LB config.
	requestCounterService string                            // The service name for the request counter, from LB config.
	requestCountMax       uint32                            // Max concurrent requests, from LB config.
	requestCounter        *xdsclient.ClusterRequestsCounter // Tracks total inflight requests for a given service.
	telemetryLabels       map[string]string                 // Telemetry labels to set on picks, from LB config.
}

// handleDropAndRequestCountLocked compares the drop and request-counter
// configuration in newConfig with what the picker currently uses. It must only
// be called with b.mu held. It returns true if a new picker needs to be
// generated.
func (b *clusterImplBalancer) handleDropAndRequestCountLocked(newConfig *LBConfig) bool {
	var updatePicker bool
	if !equalDropCategories(b.dropCategories, newConfig.DropCategories) {
		b.dropCategories = newConfig.DropCategories
		b.drops = make([]*dropper, 0, len(newConfig.DropCategories))
		for _, c := range newConfig.DropCategories {
			b.drops = append(b.drops, newDropper(c))
		}
		updatePicker = true
	}

	if b.requestCounterCluster != newConfig.Cluster || b.requestCounterService != newConfig.EDSServiceName {
		b.requestCounterCluster = newConfig.Cluster
		b.requestCounterService = newConfig.EDSServiceName
		b.requestCounter = xdsclient.GetClusterRequestsCounter(newConfig.Cluster, newConfig.EDSServiceName)
		updatePicker = true
	}
	var newRequestCountMax uint32 = 1024
	if newConfig.MaxConcurrentRequests != nil {
		newRequestCountMax = *newConfig.MaxConcurrentRequests
	}
	if b.requestCountMax != newRequestCountMax {
		b.requestCountMax = newRequestCountMax
		updatePicker = true
	}

	return updatePicker
}
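
// newPickerLocked builds a picker from the most recent child state and the
// cluster-level configuration held by this balancer. The picker itself lives
// in picker.go; as a simplified sketch (not the exact implementation), each
// pick roughly does the following:
//
//	- if a configured drop category says to drop, report the drop to the load
//	  store and fail the RPC;
//	- if a request counter is configured and the number of in-flight requests
//	  has reached countMax, fail the RPC (circuit breaking);
//	- otherwise delegate to the child picker, attach the telemetry labels, and
//	  record the call in the load store for per-locality load reporting.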
func (b *clusterImplBalancer) newPickerLocked() *picker {
	return &picker{
		drops:           b.drops,
		s:               b.childState,
		loadStore:       b.loadWrapper,
		counter:         b.requestCounter,
		countMax:        b.requestCountMax,
		telemetryLabels: b.telemetryLabels,
	}
}

// updateLoadStore checks the load reporting configuration in newConfig and
// decides whether the load reporting stream needs to be restarted.
func (b *clusterImplBalancer) updateLoadStore(newConfig *LBConfig) error {
	var updateLoadClusterAndService bool

	// The name that loads are reported under is built from the cluster name
	// and the EDS service name. If either changes, the load wrapper needs to
	// be told about the new names.
	clusterName := b.getClusterName()
	if clusterName != newConfig.Cluster {
		updateLoadClusterAndService = true
		b.setClusterName(newConfig.Cluster)
		clusterName = newConfig.Cluster
	}
	if b.edsServiceName != newConfig.EDSServiceName {
		updateLoadClusterAndService = true
		b.edsServiceName = newConfig.EDSServiceName
	}
	if updateLoadClusterAndService {
		// This updates the clusterName and serviceName that will be reported
		// for the loads. The update here is slightly early: the ideal timing
		// would be when the picker is updated with the new connection, but
		// from this balancer's point of view, it's impossible to tell when
		// that happens.
		//
		// On the other hand, this should almost never happen: this policy is
		// not expected to receive a config that changes these names. The
		// parent should do a graceful switch when the clusterName or
		// serviceName changes.
		b.loadWrapper.UpdateClusterAndService(clusterName, b.edsServiceName)
	}

	var (
		stopOldLoadReport  bool
		startNewLoadReport bool
	)

	// Check if it's necessary to restart the load reporting stream.
	if b.lrsServer == nil {
		if newConfig.LoadReportingServer != nil {
			// Old is nil, new is not nil, start new LRS.
			b.lrsServer = newConfig.LoadReportingServer
			startNewLoadReport = true
		}
		// Old is nil, new is nil, do nothing.
	} else if newConfig.LoadReportingServer == nil {
		// Old is not nil, new is nil, stop old, don't start new.
		b.lrsServer = newConfig.LoadReportingServer
		stopOldLoadReport = true
	} else {
		// Old is not nil, new is not nil; if they are different, stop the old
		// stream and start a new one.
		if !b.lrsServer.Equal(newConfig.LoadReportingServer) {
			b.lrsServer = newConfig.LoadReportingServer
			stopOldLoadReport = true
			startNewLoadReport = true
		}
	}

	if stopOldLoadReport {
		if b.cancelLoadReport != nil {
			b.cancelLoadReport()
			b.cancelLoadReport = nil
			if !startNewLoadReport {
				// If a new LRS stream will be started later, there is no need
				// to set the load store to nil here.
				b.loadWrapper.UpdateLoadStore(nil)
			}
		}
	}
	if startNewLoadReport {
		var loadStore *load.Store
		if b.xdsClient != nil {
			loadStore, b.cancelLoadReport = b.xdsClient.ReportLoad(b.lrsServer)
		}
		b.loadWrapper.UpdateLoadStore(loadStore)
	}

	return nil
}
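
// UpdateClientConnState handles a new config from the parent policy. It
// validates the config (child policy registered, xDS client available in the
// resolver state), updates load reporting, forwards the addresses and child
// config to the gracefulswitch child, and finally regenerates the picker if
// the drop/circuit-breaking configuration changed or a picker update from the
// child was inhibited while this update was being processed.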
func (b *clusterImplBalancer) UpdateClientConnState(s balancer.ClientConnState) error {
	defer clientConnUpdateHook()

	b.mu.Lock()
	b.inhibitPickerUpdates = true
	b.mu.Unlock()
	if b.logger.V(2) {
		b.logger.Infof("Received configuration: %s", pretty.ToJSON(s.BalancerConfig))
	}
	newConfig, ok := s.BalancerConfig.(*LBConfig)
	if !ok {
		return fmt.Errorf("unexpected balancer config with type: %T", s.BalancerConfig)
	}

	// Check for potential errors at the beginning of this function, so that
	// on error we reject the whole config instead of applying only part of it.
	bb := balancer.Get(newConfig.ChildPolicy.Name)
	if bb == nil {
		return fmt.Errorf("child policy %q not registered", newConfig.ChildPolicy.Name)
	}

	if b.xdsClient == nil {
		c := xdsclient.FromResolverState(s.ResolverState)
		if c == nil {
			return balancer.ErrBadResolverState
		}
		b.xdsClient = c
	}

	// Update load reporting config. This needs to be done before updating the
	// child policy because we need the loadStore from the updated client to be
	// passed to the ccWrapper, so that the next picker from the child policy
	// will pick up the new loadStore.
	if err := b.updateLoadStore(newConfig); err != nil {
		return err
	}

	// Build config for the gracefulswitch balancer. It is safe to ignore JSON
	// marshaling errors here, since the config was already validated as part of
	// ParseConfig().
	cfg := []map[string]any{{newConfig.ChildPolicy.Name: newConfig.ChildPolicy.Config}}
	cfgJSON, _ := json.Marshal(cfg)
	parsedCfg, err := gracefulswitch.ParseConfig(cfgJSON)
	if err != nil {
		return err
	}

	// Addresses and sub-balancer config are sent to the sub-balancer.
	err = b.child.UpdateClientConnState(balancer.ClientConnState{
		ResolverState:  s.ResolverState,
		BalancerConfig: parsedCfg,
	})

	b.mu.Lock()
	b.telemetryLabels = newConfig.TelemetryLabels
	// We want to send a picker update to the parent if one of the two
	// conditions is met:
	//   - the drop/request config has changed *and* there is already a picker
	//     from the child, or
	//   - there is a pending picker update from the child (this covers the
	//     case where the drop/request config has not changed, but the child
	//     sent a picker update while we were still processing config from our
	//     parent).
	if (b.handleDropAndRequestCountLocked(newConfig) && b.childState.Picker != nil) || b.pendingPickerUpdates {
		b.pendingPickerUpdates = false
		b.ClientConn.UpdateState(balancer.State{
			ConnectivityState: b.childState.ConnectivityState,
			Picker:            b.newPickerLocked(),
		})
	}
	b.inhibitPickerUpdates = false
	b.mu.Unlock()
	pickerUpdateHook()
	return err
}

func (b *clusterImplBalancer) ResolverError(err error) {
	b.child.ResolverError(err)
}

func (b *clusterImplBalancer) updateSubConnState(_ balancer.SubConn, s balancer.SubConnState, cb func(balancer.SubConnState)) {
	// Trigger re-resolution when a SubConn enters TRANSIENT_FAILURE. This is
	// necessary for the LogicalDNS in the cluster_resolver policy to
	// re-resolve.
	//
	// Note that this happens not only for addresses from DNS, but also for
	// EDS (cluster_impl doesn't know whether it's DNS or EDS, only the parent
	// knows). The parent priority policy is configured to ignore the
	// re-resolution signal from the EDS children.
	if s.ConnectivityState == connectivity.TransientFailure {
		b.ClientConn.ResolveNow(resolver.ResolveNowOptions{})
	}

	if cb != nil {
		cb(s)
	}
}

func (b *clusterImplBalancer) UpdateSubConnState(sc balancer.SubConn, s balancer.SubConnState) {
	b.logger.Errorf("UpdateSubConnState(%v, %+v) called unexpectedly", sc, s)
}

func (b *clusterImplBalancer) Close() {
	b.child.Close()
	b.childState = balancer.State{}

	if b.cancelLoadReport != nil {
		b.cancelLoadReport()
		b.cancelLoadReport = nil
	}
	b.logger.Infof("Shutdown")
}

func (b *clusterImplBalancer) ExitIdle() {
	b.child.ExitIdle()
}

// Override methods to accept updates from the child LB.
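//
// UpdateState propagates a new picker and connectivity state from the child
// policy to the parent ClientConn. While UpdateClientConnState is processing a
// config from the parent it sets inhibitPickerUpdates; a child update arriving
// in that window is recorded in childState and pendingPickerUpdates, and a
// single combined picker update is sent to the parent at the end of
// UpdateClientConnState instead.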
func (b *clusterImplBalancer) UpdateState(state balancer.State) {
	b.mu.Lock()
	defer b.mu.Unlock()

	// Inhibit sending a picker update to our parent as part of handling new
	// state from the child, if we are currently handling an update from our
	// parent. Update the childState field regardless.
	b.childState = state
	if b.inhibitPickerUpdates {
		b.pendingPickerUpdates = true
		if b.logger.V(2) {
			b.logger.Infof("Received a picker update from the child when processing an update from the parent")
		}
		return
	}

	b.ClientConn.UpdateState(balancer.State{
		ConnectivityState: state.ConnectivityState,
		Picker:            b.newPickerLocked(),
	})
	pickerUpdateHook()
}

func (b *clusterImplBalancer) setClusterName(n string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.clusterName = n
}

func (b *clusterImplBalancer) getClusterName() string {
	b.mu.Lock()
	defer b.mu.Unlock()
	return b.clusterName
}

// scWrapper is a wrapper of SubConn with locality ID. The locality ID can be
// retrieved from the addresses when creating SubConn.
//
// All SubConns passed to the child policies are wrapped in this, so that the
// picker can get the localityID from the picked SubConn, and do load reporting.
//
// After wrapping, all SubConns to and from the parent ClientConn (e.g. for
// SubConn state updates, or when updating addresses) must be the original
// SubConns. All SubConns to and from the child policy (NewSubConn, forwarding
// SubConn state updates) must be the wrapper. The wrapper embeds the original
// SubConn, and the StateListener installed in NewSubConn captures the wrapper,
// which keeps the two associated.
type scWrapper struct {
	balancer.SubConn
	// locality needs to be atomic because it can be updated while being read by
	// the picker.
	locality atomic.Value // type xdsinternal.LocalityID
}

func (scw *scWrapper) updateLocalityID(lID xdsinternal.LocalityID) {
	scw.locality.Store(lID)
}

func (scw *scWrapper) localityID() xdsinternal.LocalityID {
	lID, _ := scw.locality.Load().(xdsinternal.LocalityID)
	return lID
}
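
// NewSubConn intercepts SubConn creation from the child policy. It stamps the
// cluster name onto each address (for the xDS credentials handshake), wraps
// the new SubConn in an scWrapper, and installs a state listener that records
// the connected address's locality once the SubConn becomes READY. As an
// illustrative sketch (the actual consumer is the picker in picker.go), a
// picker can recover the locality from a picked SubConn like so:
//
//	if scw, ok := picked.(*scWrapper); ok {
//		locality := scw.localityID()
//		// Report the call's load against this locality.
//	}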
func (b *clusterImplBalancer) NewSubConn(addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
	clusterName := b.getClusterName()
	newAddrs := make([]resolver.Address, len(addrs))
	for i, addr := range addrs {
		newAddrs[i] = xds.SetXDSHandshakeClusterName(addr, clusterName)
	}
	var sc balancer.SubConn
	scw := &scWrapper{}
	oldListener := opts.StateListener
	opts.StateListener = func(state balancer.SubConnState) {
		b.updateSubConnState(sc, state, oldListener)
		if state.ConnectivityState != connectivity.Ready {
			return
		}
		// Read connected address and call updateLocalityID() based on the
		// connected address's locality.
		// https://github.com/grpc/grpc-go/issues/7339
		addr := connectedAddress(state)
		lID := xdsinternal.GetLocalityID(addr)
		if lID.Empty() {
			if b.logger.V(2) {
				b.logger.Infof("Locality ID for %s unexpectedly empty", addr)
			}
			return
		}
		scw.updateLocalityID(lID)
	}
	sc, err := b.ClientConn.NewSubConn(newAddrs, opts)
	if err != nil {
		return nil, err
	}
	scw.SubConn = sc
	return scw, nil
}

func (b *clusterImplBalancer) RemoveSubConn(sc balancer.SubConn) {
	b.logger.Errorf("RemoveSubConn(%v) called unexpectedly", sc)
}

func (b *clusterImplBalancer) UpdateAddresses(sc balancer.SubConn, addrs []resolver.Address) {
	clusterName := b.getClusterName()
	newAddrs := make([]resolver.Address, len(addrs))
	var lID xdsinternal.LocalityID
	for i, addr := range addrs {
		newAddrs[i] = xds.SetXDSHandshakeClusterName(addr, clusterName)
		lID = xdsinternal.GetLocalityID(newAddrs[i])
	}
	if scw, ok := sc.(*scWrapper); ok {
		scw.updateLocalityID(lID)
		// Need to get the original SubConn from the wrapper before calling
		// parent ClientConn.
		sc = scw.SubConn
	}
	b.ClientConn.UpdateAddresses(sc, newAddrs)
}