dubbo.apache.org/dubbo-go/v3@v3.1.1/xds/balancer/clusterimpl/clusterimpl.go

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 *
 * Copyright 2020 gRPC authors.
 *
 */

// Package clusterimpl implements the xds_cluster_impl balancing policy. It
// handles the cluster features (e.g. circuit_breaking, RPC dropping).
//
// Note that it doesn't handle name resolution, which is done by the
// xds_cluster_resolver policy.
package clusterimpl

import (
    "encoding/json"
    "fmt"
    "sync"
    "sync/atomic"
)

import (
    dubbogoLogger "github.com/dubbogo/gost/log/logger"

    "google.golang.org/grpc/balancer"

    "google.golang.org/grpc/connectivity"

    "google.golang.org/grpc/resolver"

    "google.golang.org/grpc/serviceconfig"
)

import (
    internal "dubbo.apache.org/dubbo-go/v3/xds"
    "dubbo.apache.org/dubbo-go/v3/xds/balancer/loadstore"
    "dubbo.apache.org/dubbo-go/v3/xds/client"
    "dubbo.apache.org/dubbo-go/v3/xds/client/load"
    "dubbo.apache.org/dubbo-go/v3/xds/client/resource"
    "dubbo.apache.org/dubbo-go/v3/xds/utils/buffer"
    "dubbo.apache.org/dubbo-go/v3/xds/utils/grpcsync"
    "dubbo.apache.org/dubbo-go/v3/xds/utils/pretty"
)

const (
    // Name is the name of the cluster_impl balancer.
    Name                   = "xds_cluster_impl_experimental"
    defaultRequestCountMax = 1024
)

func init() {
    balancer.Register(bb{})
}

type bb struct{}

func (bb) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer {
    b := &clusterImplBalancer{
        ClientConn:      cc,
        bOpts:           bOpts,
        closed:          grpcsync.NewEvent(),
        done:            grpcsync.NewEvent(),
        loadWrapper:     loadstore.NewWrapper(),
        scWrappers:      make(map[balancer.SubConn]*scWrapper),
        pickerUpdateCh:  buffer.NewUnbounded(),
        requestCountMax: defaultRequestCountMax,
    }
    b.logger = dubbogoLogger.GetLogger()
    go b.run()
    b.logger.Infof("Created")
    return b
}

func (bb) Name() string {
    return Name
}

func (bb) ParseConfig(c json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
    return parseConfig(c)
}

type clusterImplBalancer struct {
    balancer.ClientConn

    // mu guarantees mutual exclusion between Close() and the handling of
    // picker updates to the parent ClientConn in run(). It makes sure that
    // the run() goroutine doesn't send a picker update to the parent after
    // the balancer is closed.
    //
    // It's only used by the run() goroutine, not by the other exported
    // methods, because those are already synchronized with Close().
    mu     sync.Mutex
    closed *grpcsync.Event
    done   *grpcsync.Event

    bOpts     balancer.BuildOptions
    logger    dubbogoLogger.Logger
    xdsClient client.XDSClient

    config           *LBConfig
    childLB          balancer.Balancer
    cancelLoadReport func()
    edsServiceName   string
    lrsServerName    *string
    loadWrapper      *loadstore.Wrapper

    clusterNameMu sync.Mutex
    clusterName   string

    scWrappersMu sync.Mutex
    // The SubConns passed to the child policy are wrapped in a wrapper, to
    // keep the locality ID. But when the parent ClientConn sends updates, it
    // gives the original SubConn, not the wrapper. The child policy only
    // knows about the wrappers, so SubConn updates forwarded to it must be
    // sent for the wrappers.
    //
    // This map goes from the original SubConn to its wrapper, so that when a
    // SubConn state update is forwarded, the child policy gets the wrapper.
    scWrappers map[balancer.SubConn]*scWrapper

    // childState/drops/requestCounter keep the state used by the most
    // recently generated picker. All fields can only be accessed in run(),
    // which is the only goroutine that sends pickers to the parent
    // ClientConn. All requests to update the picker need to be sent to
    // pickerUpdateCh.
    childState            balancer.State
    dropCategories        []DropConfig // The categories for drops.
    drops                 []*dropper
    requestCounterCluster string // The cluster name for the request counter.
    requestCounterService string // The service name for the request counter.
    requestCounter        *client.ClusterRequestsCounter
    requestCountMax       uint32
    pickerUpdateCh        *buffer.Unbounded
}

// updateLoadStore checks the config for the load store, and decides whether it
// needs to restart the load reporting stream.
func (b *clusterImplBalancer) updateLoadStore(newConfig *LBConfig) error {
    var updateLoadClusterAndService bool

    // The names reported for loads are built from ClusterName and
    // EDSServiceName; if either changes, the load store must be updated.
    clusterName := b.getClusterName()
    if clusterName != newConfig.Cluster {
        updateLoadClusterAndService = true
        b.setClusterName(newConfig.Cluster)
        clusterName = newConfig.Cluster
    }
    if b.edsServiceName != newConfig.EDSServiceName {
        updateLoadClusterAndService = true
        b.edsServiceName = newConfig.EDSServiceName
    }
    if updateLoadClusterAndService {
        // This updates the clusterName and serviceName that will be reported
        // for the loads. The update here is slightly too early; the perfect
        // timing would be when the picker is updated with the new connection,
        // but from this balancer's point of view it's impossible to tell.
        //
        // On the other hand, this will almost never happen. Each LRS policy
        // shouldn't receive an updated config; the parent is expected to do a
        // graceful switch when the clusterName or serviceName changes.
        b.loadWrapper.UpdateClusterAndService(clusterName, b.edsServiceName)
    }

    var (
        stopOldLoadReport  bool
        startNewLoadReport bool
    )

    // Check if it's necessary to restart the load report.
    if b.lrsServerName == nil {
        if newConfig.LoadReportingServerName != nil {
            // Old is nil, new is not nil, start new LRS.
            b.lrsServerName = newConfig.LoadReportingServerName
            startNewLoadReport = true
        }
        // Old is nil, new is nil, do nothing.
    } else if newConfig.LoadReportingServerName == nil {
        // Old is not nil, new is nil, stop old, don't start new.
        b.lrsServerName = newConfig.LoadReportingServerName
        stopOldLoadReport = true
    } else {
        // Old is not nil, new is not nil, compare string values; if
        // different, stop old and start new.
        if *b.lrsServerName != *newConfig.LoadReportingServerName {
            b.lrsServerName = newConfig.LoadReportingServerName
            stopOldLoadReport = true
            startNewLoadReport = true
        }
    }

    if stopOldLoadReport {
        if b.cancelLoadReport != nil {
            b.cancelLoadReport()
            b.cancelLoadReport = nil
            if !startNewLoadReport {
                // If a new LRS stream will be started later, there is no need
                // to set the load store to nil here.
                b.loadWrapper.UpdateLoadStore(nil)
            }
        }
    }
    if startNewLoadReport {
        var loadStore *load.Store
        if b.xdsClient != nil {
            loadStore, b.cancelLoadReport = b.xdsClient.ReportLoad(*b.lrsServerName)
        }
        b.loadWrapper.UpdateLoadStore(loadStore)
    }

    return nil
}

func (b *clusterImplBalancer) UpdateClientConnState(s balancer.ClientConnState) error {
    if b.closed.HasFired() {
        b.logger.Warnf("xds: received ClientConnState {%+v} after clusterImplBalancer was closed", s)
        return nil
    }

    b.logger.Infof("Received update from resolver, balancer config: %+v", pretty.ToJSON(s.BalancerConfig))
    newConfig, ok := s.BalancerConfig.(*LBConfig)
    if !ok {
        return fmt.Errorf("unexpected balancer config with type: %T", s.BalancerConfig)
    }

    // Check for potential errors at the beginning of this function, so that
    // on error we reject the whole config instead of applying part of it.
    bb := balancer.Get(newConfig.ChildPolicy.Name)
    if bb == nil {
        return fmt.Errorf("balancer %q not registered", newConfig.ChildPolicy.Name)
    }

    if b.xdsClient == nil {
        c := client.FromResolverState(s.ResolverState)
        if c == nil {
            return balancer.ErrBadResolverState
        }
        b.xdsClient = c
    }

    // Update the load reporting config. This needs to be done before updating
    // the child policy because we need the loadStore from the updated client
    // to be passed to the ccWrapper, so that the next picker from the child
    // policy will pick up the new loadStore.
    if err := b.updateLoadStore(newConfig); err != nil {
        return err
    }

    // If the child policy is a different type, recreate the sub-balancer.
    if b.config == nil || b.config.ChildPolicy.Name != newConfig.ChildPolicy.Name {
        if b.childLB != nil {
            b.childLB.Close()
        }
        b.childLB = bb.Build(b, b.bOpts)
    }
    b.config = newConfig

    if b.childLB == nil {
        // This is not an expected situation, and should be super rare in
        // practice.
        //
        // When this happens, we have already applied all the other
        // configuration (drop/circuit breaking), but there's no child policy.
        // This balancer will be stuck, and we report the error to the parent.
        return fmt.Errorf("child policy is nil, this means balancer %q's Build() returned nil", newConfig.ChildPolicy.Name)
    }

    // Notify run() of this new config, in case the drops and the request
    // counter need an update (which means a new picker needs to be generated).
    b.pickerUpdateCh.Put(newConfig)

    // Addresses and sub-balancer config are sent to the sub-balancer.
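    // The resolver state is forwarded unchanged; only the child policy's own
    // config (ChildPolicy.Config) is passed down, since the cluster-level
    // features (drops, circuit breaking, load reporting) were already
    // consumed above.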
    return b.childLB.UpdateClientConnState(balancer.ClientConnState{
        ResolverState:  s.ResolverState,
        BalancerConfig: b.config.ChildPolicy.Config,
    })
}

func (b *clusterImplBalancer) ResolverError(err error) {
    if b.closed.HasFired() {
        b.logger.Warnf("xds: received resolver error {%+v} after clusterImplBalancer was closed", err)
        return
    }

    if b.childLB != nil {
        b.childLB.ResolverError(err)
    }
}

func (b *clusterImplBalancer) UpdateSubConnState(sc balancer.SubConn, s balancer.SubConnState) {
    if b.closed.HasFired() {
        b.logger.Warnf("xds: received subconn state change {%+v, %+v} after clusterImplBalancer was closed", sc, s)
        return
    }

    // Trigger re-resolution when a SubConn turns transient failure. This is
    // necessary for the LogicalDNS in the cluster_resolver policy to
    // re-resolve.
    //
    // Note that this happens not only for addresses from DNS, but also for
    // EDS (cluster_impl doesn't know whether it's DNS or EDS, only the parent
    // knows). The parent priority policy is configured to ignore the
    // re-resolution signal from the EDS children.
    if s.ConnectivityState == connectivity.TransientFailure {
        b.ClientConn.ResolveNow(resolver.ResolveNowOptions{})
    }

    b.scWrappersMu.Lock()
    if scw, ok := b.scWrappers[sc]; ok {
        sc = scw
        if s.ConnectivityState == connectivity.Shutdown {
            // Remove this SubConn from the map on Shutdown.
            delete(b.scWrappers, scw.SubConn)
        }
    }
    b.scWrappersMu.Unlock()
    if b.childLB != nil {
        b.childLB.UpdateSubConnState(sc, s)
    }
}

func (b *clusterImplBalancer) Close() {
    b.mu.Lock()
    b.closed.Fire()
    b.mu.Unlock()

    if b.childLB != nil {
        b.childLB.Close()
        b.childLB = nil
    }
    <-b.done.Done()
    b.logger.Infof("Shutdown")
}

func (b *clusterImplBalancer) ExitIdle() {
    if b.childLB == nil {
        return
    }
    if ei, ok := b.childLB.(balancer.ExitIdler); ok {
        ei.ExitIdle()
        return
    }
    // Fallback for children that don't support ExitIdle -- connect to all
    // SubConns.
    for _, sc := range b.scWrappers {
        sc.Connect()
    }
}

// Override methods to accept updates from the child LB.

func (b *clusterImplBalancer) UpdateState(state balancer.State) {
    // Instead of updating the parent ClientConn inline, send the state to run().
    b.pickerUpdateCh.Put(state)
}

func (b *clusterImplBalancer) setClusterName(n string) {
    b.clusterNameMu.Lock()
    defer b.clusterNameMu.Unlock()
    b.clusterName = n
}

func (b *clusterImplBalancer) getClusterName() string {
    b.clusterNameMu.Lock()
    defer b.clusterNameMu.Unlock()
    return b.clusterName
}

// scWrapper is a wrapper of SubConn with a locality ID. The locality ID can be
// retrieved from the addresses when the SubConn is created.
//
// All SubConns passed to the child policies are wrapped in this, so that the
// picker can get the localityID from the picked SubConn and do load reporting.
//
// After wrapping, all SubConns to and from the parent ClientConn (e.g. for
// SubConn state updates, update/remove SubConn) must be the original SubConns.
// All SubConns to and from the child policy (NewSubConn, forwarding SubConn
// state updates) must be the wrappers. The balancer keeps a map from the
// original SubConn to the wrapper for this purpose.
type scWrapper struct {
    balancer.SubConn
    // locality needs to be atomic because it can be updated while being read
    // by the picker.
    locality atomic.Value // type resource.LocalityID
}

func (scw *scWrapper) updateLocalityID(lID resource.LocalityID) {
    scw.locality.Store(lID)
}

func (scw *scWrapper) localityID() resource.LocalityID {
    lID, _ := scw.locality.Load().(resource.LocalityID)
    return lID
}

func (b *clusterImplBalancer) NewSubConn(addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
    clusterName := b.getClusterName()
    newAddrs := make([]resolver.Address, len(addrs))
    var lID resource.LocalityID
    for i, addr := range addrs {
        newAddrs[i] = internal.SetXDSHandshakeClusterName(addr, clusterName)
        lID = resource.GetLocalityID(newAddrs[i])
    }
    sc, err := b.ClientConn.NewSubConn(newAddrs, opts)
    if err != nil {
        return nil, err
    }
    // Wrap this SubConn in a wrapper, and add it to the map.
    b.scWrappersMu.Lock()
    ret := &scWrapper{SubConn: sc}
    ret.updateLocalityID(lID)
    b.scWrappers[sc] = ret
    b.scWrappersMu.Unlock()
    return ret, nil
}

func (b *clusterImplBalancer) RemoveSubConn(sc balancer.SubConn) {
    scw, ok := sc.(*scWrapper)
    if !ok {
        b.ClientConn.RemoveSubConn(sc)
        return
    }
    // Remove the original SubConn from the parent ClientConn.
    //
    // Note that we don't remove this SubConn from the scWrappers map. We will
    // need it to forward the final SubConn state (Shutdown) to the child
    // policy.
    //
    // The entry is kept in the map until its state changes to Shutdown, and
    // is deleted in UpdateSubConnState().
    b.ClientConn.RemoveSubConn(scw.SubConn)
}

func (b *clusterImplBalancer) UpdateAddresses(sc balancer.SubConn, addrs []resolver.Address) {
    clusterName := b.getClusterName()
    newAddrs := make([]resolver.Address, len(addrs))
    var lID resource.LocalityID
    for i, addr := range addrs {
        newAddrs[i] = internal.SetXDSHandshakeClusterName(addr, clusterName)
        lID = resource.GetLocalityID(newAddrs[i])
    }
    if scw, ok := sc.(*scWrapper); ok {
        scw.updateLocalityID(lID)
        // Need to get the original SubConn from the wrapper before calling
        // the parent ClientConn.
        sc = scw.SubConn
    }
    b.ClientConn.UpdateAddresses(sc, newAddrs)
}

type dropConfigs struct {
    drops           []*dropper
    requestCounter  *client.ClusterRequestsCounter
    requestCountMax uint32
}

// handleDropAndRequestCount compares the drop and request counter config in
// newConfig with the one currently used by the picker. It returns a new
// dropConfigs if a new picker needs to be generated, otherwise it returns nil.
func (b *clusterImplBalancer) handleDropAndRequestCount(newConfig *LBConfig) *dropConfigs {
    // Compare the new drop config, and update the picker if it has changed.
    var updatePicker bool
    if !equalDropCategories(b.dropCategories, newConfig.DropCategories) {
        b.dropCategories = newConfig.DropCategories
        b.drops = make([]*dropper, 0, len(newConfig.DropCategories))
        for _, c := range newConfig.DropCategories {
            b.drops = append(b.drops, newDropper(c))
        }
        updatePicker = true
    }

    // Compare the cluster name, and update the picker if it has changed,
    // because circuit breaking's stream counter will be different.
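    // The counter returned by GetClusterRequestsCounter is keyed by the
    // (cluster, EDS service) pair, so a change in either name means the
    // picker must count in-flight requests against a different counter.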
    if b.requestCounterCluster != newConfig.Cluster || b.requestCounterService != newConfig.EDSServiceName {
        b.requestCounterCluster = newConfig.Cluster
        b.requestCounterService = newConfig.EDSServiceName
        b.requestCounter = client.GetClusterRequestsCounter(newConfig.Cluster, newConfig.EDSServiceName)
        updatePicker = true
    }
    // Compare the upper bound of the stream count, and update the picker if
    // it has changed. This is also for circuit breaking.
    var newRequestCountMax uint32 = defaultRequestCountMax
    if newConfig.MaxConcurrentRequests != nil {
        newRequestCountMax = *newConfig.MaxConcurrentRequests
    }
    if b.requestCountMax != newRequestCountMax {
        b.requestCountMax = newRequestCountMax
        updatePicker = true
    }

    if !updatePicker {
        return nil
    }
    return &dropConfigs{
        drops:           b.drops,
        requestCounter:  b.requestCounter,
        requestCountMax: b.requestCountMax,
    }
}

func (b *clusterImplBalancer) run() {
    defer b.done.Fire()
    for {
        select {
        case update := <-b.pickerUpdateCh.Get():
            b.pickerUpdateCh.Load()
            b.mu.Lock()
            if b.closed.HasFired() {
                b.mu.Unlock()
                return
            }
            switch u := update.(type) {
            case balancer.State:
                b.childState = u
                b.ClientConn.UpdateState(balancer.State{
                    ConnectivityState: b.childState.ConnectivityState,
                    Picker: newPicker(b.childState, &dropConfigs{
                        drops:           b.drops,
                        requestCounter:  b.requestCounter,
                        requestCountMax: b.requestCountMax,
                    }, b.loadWrapper),
                })
            case *LBConfig:
                dc := b.handleDropAndRequestCount(u)
                if dc != nil && b.childState.Picker != nil {
                    b.ClientConn.UpdateState(balancer.State{
                        ConnectivityState: b.childState.ConnectivityState,
                        Picker:            newPicker(b.childState, dc, b.loadWrapper),
                    })
                }
            }
            b.mu.Unlock()
        case <-b.closed.Done():
            if b.cancelLoadReport != nil {
                b.cancelLoadReport()
                b.cancelLoadReport = nil
            }
            return
        }
    }
}
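
// exampleLookupClusterImplBuilder is a minimal usage sketch, assuming only
// what this file shows: init() registers bb under Name, and bb implements
// balancer.ConfigParser through ParseConfig. The empty JSON payload is a
// placeholder; the keys parseConfig accepts are defined by LBConfig elsewhere
// in this package. The function name itself is illustrative, not an API.
func exampleLookupClusterImplBuilder() error {
    // Look up the builder registered in init() via gRPC's balancer registry.
    builder := balancer.Get(Name) // Name == "xds_cluster_impl_experimental"
    if builder == nil {
        return fmt.Errorf("balancer %q not registered", Name)
    }
    // bb also implements balancer.ConfigParser, so the builder can parse the
    // JSON form of LBConfig that the parent policy hands down.
    parser, ok := builder.(balancer.ConfigParser)
    if !ok {
        return fmt.Errorf("builder %q does not implement balancer.ConfigParser", Name)
    }
    _, err := parser.ParseConfig(json.RawMessage(`{}`))
    return err
}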