google.golang.org/grpc@v1.62.1/xds/internal/balancer/clusterimpl/clusterimpl.go

/*
 *
 * Copyright 2020 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Package clusterimpl implements the xds_cluster_impl balancing policy. It
// handles the cluster features (e.g. circuit breaking, RPC dropping).
//
// Note that it doesn't handle name resolution, which is done by policy
// xds_cluster_resolver.
package clusterimpl

import (
	"encoding/json"
	"fmt"
	"sync"
	"sync/atomic"

	"google.golang.org/grpc/balancer"
	"google.golang.org/grpc/connectivity"
	"google.golang.org/grpc/internal"
	"google.golang.org/grpc/internal/balancer/gracefulswitch"
	"google.golang.org/grpc/internal/buffer"
	"google.golang.org/grpc/internal/grpclog"
	"google.golang.org/grpc/internal/grpcsync"
	"google.golang.org/grpc/internal/pretty"
	"google.golang.org/grpc/resolver"
	"google.golang.org/grpc/serviceconfig"
	xdsinternal "google.golang.org/grpc/xds/internal"
	"google.golang.org/grpc/xds/internal/balancer/loadstore"
	"google.golang.org/grpc/xds/internal/xdsclient"
	"google.golang.org/grpc/xds/internal/xdsclient/bootstrap"
	"google.golang.org/grpc/xds/internal/xdsclient/load"
)

const (
	// Name is the name of the cluster_impl balancer.
	Name                   = "xds_cluster_impl_experimental"
	defaultRequestCountMax = 1024
)

func init() {
	balancer.Register(bb{})
}

type bb struct{}

func (bb) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer {
	b := &clusterImplBalancer{
		ClientConn:      cc,
		bOpts:           bOpts,
		closed:          grpcsync.NewEvent(),
		done:            grpcsync.NewEvent(),
		loadWrapper:     loadstore.NewWrapper(),
		pickerUpdateCh:  buffer.NewUnbounded(),
		requestCountMax: defaultRequestCountMax,
	}
	b.logger = prefixLogger(b)
	b.child = gracefulswitch.NewBalancer(b, bOpts)
	go b.run()
	b.logger.Infof("Created")
	return b
}

func (bb) Name() string {
	return Name
}

func (bb) ParseConfig(c json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
	return parseConfig(c)
}

type clusterImplBalancer struct {
	balancer.ClientConn

	// mu guarantees mutual exclusion between Close() and the handling of
	// picker updates to the parent ClientConn in run(). It makes sure that
	// the run() goroutine doesn't send a picker update to the parent after
	// the balancer is closed.
	//
	// It's only used by the run() goroutine, not by the other exported
	// functions, because those are guaranteed to be synchronized with
	// Close().
	mu     sync.Mutex
	closed *grpcsync.Event
	done   *grpcsync.Event

	bOpts     balancer.BuildOptions
	logger    *grpclog.PrefixLogger
	xdsClient xdsclient.XDSClient

	config           *LBConfig
	child            *gracefulswitch.Balancer
	cancelLoadReport func()
	edsServiceName   string
	lrsServer        *bootstrap.ServerConfig
	loadWrapper      *loadstore.Wrapper

	clusterNameMu sync.Mutex
	clusterName   string

	// childState/drops/requestCounter keep the state used by the most
	// recently generated picker. All fields can only be accessed in run(),
	// and run() is the only goroutine that sends pickers to the parent
	// ClientConn. All requests to update the picker need to be sent to
	// pickerUpdateCh.
	childState            balancer.State
	dropCategories        []DropConfig // The categories for drops.
	drops                 []*dropper
	requestCounterCluster string // The cluster name for the request counter.
	requestCounterService string // The service name for the request counter.
	requestCounter        *xdsclient.ClusterRequestsCounter
	requestCountMax       uint32
	pickerUpdateCh        *buffer.Unbounded
}

// updateLoadStore checks the config for load store, and decides whether it
// needs to restart the load reporting stream.
func (b *clusterImplBalancer) updateLoadStore(newConfig *LBConfig) error {
	var updateLoadClusterAndService bool

	// The load reporting name is built from the Cluster and EDSServiceName
	// in the config. If either changes, the reported cluster/service needs
	// to be updated.
	clusterName := b.getClusterName()
	if clusterName != newConfig.Cluster {
		updateLoadClusterAndService = true
		b.setClusterName(newConfig.Cluster)
		clusterName = newConfig.Cluster
	}
	if b.edsServiceName != newConfig.EDSServiceName {
		updateLoadClusterAndService = true
		b.edsServiceName = newConfig.EDSServiceName
	}
	if updateLoadClusterAndService {
		// This updates the clusterName and serviceName that will be reported
		// for the loads. The update here is too early; the perfect timing
		// would be when the picker is updated with the new connection. But
		// from this balancer's point of view, it's impossible to tell.
		//
		// On the other hand, this will almost never happen. Each LRS policy
		// shouldn't get an updated config; the parent should do a graceful
		// switch when the clusterName or serviceName changes.
		b.loadWrapper.UpdateClusterAndService(clusterName, b.edsServiceName)
	}

	var (
		stopOldLoadReport  bool
		startNewLoadReport bool
	)

	// Check if it's necessary to restart load reporting.
	if b.lrsServer == nil {
		if newConfig.LoadReportingServer != nil {
			// Old is nil, new is not nil, start new LRS.
			b.lrsServer = newConfig.LoadReportingServer
			startNewLoadReport = true
		}
		// Old is nil, new is nil, do nothing.
	} else if newConfig.LoadReportingServer == nil {
		// Old is not nil, new is nil, stop old, don't start new.
		b.lrsServer = newConfig.LoadReportingServer
		stopOldLoadReport = true
	} else {
		// Old is not nil, new is not nil, compare string values, if
		// different, stop old and start new.
		if !b.lrsServer.Equal(newConfig.LoadReportingServer) {
			b.lrsServer = newConfig.LoadReportingServer
			stopOldLoadReport = true
			startNewLoadReport = true
		}
	}

	if stopOldLoadReport {
		if b.cancelLoadReport != nil {
			b.cancelLoadReport()
			b.cancelLoadReport = nil
			if !startNewLoadReport {
				// If a new LRS stream will be started later, no need to update
				// it to nil here.
				b.loadWrapper.UpdateLoadStore(nil)
			}
		}
	}
	if startNewLoadReport {
		var loadStore *load.Store
		if b.xdsClient != nil {
			loadStore, b.cancelLoadReport = b.xdsClient.ReportLoad(b.lrsServer)
		}
		b.loadWrapper.UpdateLoadStore(loadStore)
	}

	return nil
}

func (b *clusterImplBalancer) UpdateClientConnState(s balancer.ClientConnState) error {
	if b.closed.HasFired() {
		b.logger.Warningf("xds: received ClientConnState {%+v} after clusterImplBalancer was closed", s)
		return nil
	}

	b.logger.Infof("Received update from resolver, balancer config: %+v", pretty.ToJSON(s.BalancerConfig))
	newConfig, ok := s.BalancerConfig.(*LBConfig)
	if !ok {
		return fmt.Errorf("unexpected balancer config with type: %T", s.BalancerConfig)
	}

	// Need to check for potential errors at the beginning of this function,
	// so that on errors, we reject the whole config, instead of applying
	// part of it.
	bb := balancer.Get(newConfig.ChildPolicy.Name)
	if bb == nil {
		return fmt.Errorf("balancer %q not registered", newConfig.ChildPolicy.Name)
	}

	if b.xdsClient == nil {
		c := xdsclient.FromResolverState(s.ResolverState)
		if c == nil {
			return balancer.ErrBadResolverState
		}
		b.xdsClient = c
	}

	// Update load reporting config. This needs to be done before updating
	// the child policy because we need the loadStore from the updated client
	// to be passed to the ccWrapper, so that the next picker from the child
	// policy will pick up the new loadStore.
	if err := b.updateLoadStore(newConfig); err != nil {
		return err
	}

	if b.config == nil || b.config.ChildPolicy.Name != newConfig.ChildPolicy.Name {
		if err := b.child.SwitchTo(bb); err != nil {
			return fmt.Errorf("error switching to child of type %q: %v", newConfig.ChildPolicy.Name, err)
		}
	}
	b.config = newConfig

	// Notify run() of this new config, in case the drops and request counter
	// need an update (which means a new picker needs to be generated).
	b.pickerUpdateCh.Put(newConfig)

	// Addresses and sub-balancer config are sent to the sub-balancer.
	return b.child.UpdateClientConnState(balancer.ClientConnState{
		ResolverState:  s.ResolverState,
		BalancerConfig: b.config.ChildPolicy.Config,
	})
}

func (b *clusterImplBalancer) ResolverError(err error) {
	if b.closed.HasFired() {
		b.logger.Warningf("xds: received resolver error {%+v} after clusterImplBalancer was closed", err)
		return
	}
	b.child.ResolverError(err)
}

func (b *clusterImplBalancer) updateSubConnState(sc balancer.SubConn, s balancer.SubConnState, cb func(balancer.SubConnState)) {
	if b.closed.HasFired() {
		b.logger.Warningf("xds: received subconn state change {%+v, %+v} after clusterImplBalancer was closed", sc, s)
		return
	}

	// Trigger re-resolution when a SubConn enters TransientFailure. This is
	// necessary for the LogicalDNS cluster in the cluster_resolver policy to
	// re-resolve.
	//
	// Note that this happens not only for the addresses from DNS, but also
	// for EDS (cluster_impl doesn't know if it's DNS or EDS, only the parent
	// knows). The parent priority policy is configured to ignore the
	// re-resolution signal from the EDS children.
	if s.ConnectivityState == connectivity.TransientFailure {
		b.ClientConn.ResolveNow(resolver.ResolveNowOptions{})
	}

	if cb != nil {
		cb(s)
	}
}

func (b *clusterImplBalancer) UpdateSubConnState(sc balancer.SubConn, s balancer.SubConnState) {
	b.logger.Errorf("UpdateSubConnState(%v, %+v) called unexpectedly", sc, s)
}

func (b *clusterImplBalancer) Close() {
	b.mu.Lock()
	b.closed.Fire()
	b.mu.Unlock()

	b.child.Close()
	b.childState = balancer.State{}
	b.pickerUpdateCh.Close()
	<-b.done.Done()
	b.logger.Infof("Shutdown")
}

func (b *clusterImplBalancer) ExitIdle() {
	b.child.ExitIdle()
}

// Override methods to accept updates from the child LB.

func (b *clusterImplBalancer) UpdateState(state balancer.State) {
	// Instead of updating the parent ClientConn inline, send the state to
	// run().
	b.pickerUpdateCh.Put(state)
}

func (b *clusterImplBalancer) setClusterName(n string) {
	b.clusterNameMu.Lock()
	defer b.clusterNameMu.Unlock()
	b.clusterName = n
}

func (b *clusterImplBalancer) getClusterName() string {
	b.clusterNameMu.Lock()
	defer b.clusterNameMu.Unlock()
	return b.clusterName
}

// scWrapper is a wrapper of SubConn with locality ID. The locality ID can be
// retrieved from the addresses when creating the SubConn.
//
// All SubConns passed to the child policies are wrapped in this, so that the
// picker can get the localityID from the picked SubConn, and do load
// reporting.
//
// After wrapping, all SubConns to and from the parent ClientConn (e.g. for
// SubConn state update, update/remove SubConn) must be the original SubConns.
// All SubConns to and from the child policy (NewSubConn, forwarding SubConn
// state update) must be the wrapper. The original SubConn is recovered from
// the wrapper (e.g. in UpdateAddresses) before calls are forwarded to the
// parent ClientConn.
type scWrapper struct {
	balancer.SubConn
	// locality needs to be atomic because it can be updated while being read
	// by the picker.
	locality atomic.Value // type xdsinternal.LocalityID
}

func (scw *scWrapper) updateLocalityID(lID xdsinternal.LocalityID) {
	scw.locality.Store(lID)
}

func (scw *scWrapper) localityID() xdsinternal.LocalityID {
	lID, _ := scw.locality.Load().(xdsinternal.LocalityID)
	return lID
}

func (b *clusterImplBalancer) NewSubConn(addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
	clusterName := b.getClusterName()
	newAddrs := make([]resolver.Address, len(addrs))
	var lID xdsinternal.LocalityID
	for i, addr := range addrs {
		newAddrs[i] = internal.SetXDSHandshakeClusterName(addr, clusterName)
		lID = xdsinternal.GetLocalityID(newAddrs[i])
	}
	var sc balancer.SubConn
	oldListener := opts.StateListener
	opts.StateListener = func(state balancer.SubConnState) { b.updateSubConnState(sc, state, oldListener) }
	sc, err := b.ClientConn.NewSubConn(newAddrs, opts)
	if err != nil {
		return nil, err
	}
	// Wrap this SubConn in a scWrapper so the picker can read its locality
	// ID.
	ret := &scWrapper{SubConn: sc}
	ret.updateLocalityID(lID)
	return ret, nil
}

func (b *clusterImplBalancer) RemoveSubConn(sc balancer.SubConn) {
	b.logger.Errorf("RemoveSubConn(%v) called unexpectedly", sc)
}

func (b *clusterImplBalancer) UpdateAddresses(sc balancer.SubConn, addrs []resolver.Address) {
	clusterName := b.getClusterName()
	newAddrs := make([]resolver.Address, len(addrs))
	var lID xdsinternal.LocalityID
	for i, addr := range addrs {
		newAddrs[i] = internal.SetXDSHandshakeClusterName(addr, clusterName)
		lID = xdsinternal.GetLocalityID(newAddrs[i])
	}
	if scw, ok := sc.(*scWrapper); ok {
		scw.updateLocalityID(lID)
		// Need to get the original SubConn from the wrapper before calling
		// the parent ClientConn.
		sc = scw.SubConn
	}
	b.ClientConn.UpdateAddresses(sc, newAddrs)
}

type dropConfigs struct {
	drops           []*dropper
	requestCounter  *xdsclient.ClusterRequestsCounter
	requestCountMax uint32
}

// handleDropAndRequestCount compares the drop and request counter config in
// newConfig with those currently used by the picker. It returns a new
// dropConfigs if a new picker needs to be generated, otherwise it returns
// nil.
func (b *clusterImplBalancer) handleDropAndRequestCount(newConfig *LBConfig) *dropConfigs {
	// Compare the new drop config, and update the picker if it changed.
	var updatePicker bool
	if !equalDropCategories(b.dropCategories, newConfig.DropCategories) {
		b.dropCategories = newConfig.DropCategories
		b.drops = make([]*dropper, 0, len(newConfig.DropCategories))
		for _, c := range newConfig.DropCategories {
			b.drops = append(b.drops, newDropper(c))
		}
		updatePicker = true
	}

	// Compare the cluster name, and update the picker if it changed, because
	// circuit breaking's stream counter will be different.
	if b.requestCounterCluster != newConfig.Cluster || b.requestCounterService != newConfig.EDSServiceName {
		b.requestCounterCluster = newConfig.Cluster
		b.requestCounterService = newConfig.EDSServiceName
		b.requestCounter = xdsclient.GetClusterRequestsCounter(newConfig.Cluster, newConfig.EDSServiceName)
		updatePicker = true
	}
	// Compare the upper bound of the stream count, and update the picker if
	// it changed. This is also for circuit breaking.
	var newRequestCountMax uint32 = defaultRequestCountMax
	if newConfig.MaxConcurrentRequests != nil {
		newRequestCountMax = *newConfig.MaxConcurrentRequests
	}
	if b.requestCountMax != newRequestCountMax {
		b.requestCountMax = newRequestCountMax
		updatePicker = true
	}

	if !updatePicker {
		return nil
	}
	return &dropConfigs{
		drops:           b.drops,
		requestCounter:  b.requestCounter,
		requestCountMax: b.requestCountMax,
	}
}

func (b *clusterImplBalancer) run() {
	defer b.done.Fire()
	for {
		select {
		case update, ok := <-b.pickerUpdateCh.Get():
			if !ok {
				return
			}
			b.pickerUpdateCh.Load()
			b.mu.Lock()
			if b.closed.HasFired() {
				b.mu.Unlock()
				return
			}
			switch u := update.(type) {
			case balancer.State:
				b.childState = u
				b.ClientConn.UpdateState(balancer.State{
					ConnectivityState: b.childState.ConnectivityState,
					Picker: newPicker(b.childState, &dropConfigs{
						drops:           b.drops,
						requestCounter:  b.requestCounter,
						requestCountMax: b.requestCountMax,
					}, b.loadWrapper),
				})
			case *LBConfig:
				dc := b.handleDropAndRequestCount(u)
				if dc != nil && b.childState.Picker != nil {
					b.ClientConn.UpdateState(balancer.State{
						ConnectivityState: b.childState.ConnectivityState,
						Picker:            newPicker(b.childState, dc, b.loadWrapper),
					})
				}
			}
			b.mu.Unlock()
		case <-b.closed.Done():
			if b.cancelLoadReport != nil {
				b.cancelLoadReport()
				b.cancelLoadReport = nil
			}
			return
		}
	}
}
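
// For illustration: the LBConfig consumed by UpdateClientConnState above is
// produced by parseConfig (see config.go) from the xds_cluster_impl entry of
// the service config. A config of roughly the following shape would exercise
// the drop and circuit-breaking handling in this file. The field names are
// meant to follow LBConfig's JSON tags in config.go and the values are made
// up, so treat this as an approximate sketch rather than a normative example:
//
//	{
//	  "cluster": "cluster_a",
//	  "edsServiceName": "service_a",
//	  "maxConcurrentRequests": 512,
//	  "dropCategories": [
//	    {"category": "throttle", "requestsPerMillion": 100000}
//	  ],
//	  "childPolicy": [{"round_robin": {}}]
//	}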
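
// The picker built by newPicker (see picker.go) combines the child policy's
// picker with the dropConfigs assembled in run() above. As a simplified
// sketch of the intended behavior (not the actual implementation), each Pick
// roughly does:
//
//	for _, d := range drops {
//		if d.drop() {
//			return dropped // RPC dropped for this category
//		}
//	}
//	if requestCounter != nil {
//		if err := requestCounter.StartRequest(requestCountMax); err != nil {
//			return dropped // circuit breaking: too many concurrent requests
//		}
//	}
//	res := childPicker.Pick(info) // delegate to the child policy's picker
//	// ... then wrap res.Done so the request counter is decremented and the
//	// per-locality load is reported through loadWrapper when the RPC ends.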