dubbo.apache.org/dubbo-go/v3@v3.1.1/xds/balancer/clusterimpl/clusterimpl.go (about)

     1  /*
     2   * Licensed to the Apache Software Foundation (ASF) under one or more
     3   * contributor license agreements.  See the NOTICE file distributed with
     4   * this work for additional information regarding copyright ownership.
     5   * The ASF licenses this file to You under the Apache License, Version 2.0
     6   * (the "License"); you may not use this file except in compliance with
     7   * the License.  You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   */
    17  
    18  /*
    19   *
    20   * Copyright 2020 gRPC authors.
    21   *
    22   */
    23  
    24  // Package clusterimpl implements the xds_cluster_impl balancing policy. It
    25  // handles the cluster features (e.g. circuit_breaking, RPC dropping).
    26  //
     27  // Note that it doesn't handle name resolution, which is done by the
     28  // xds_cluster_resolver policy.
    29  package clusterimpl
    30  
    31  import (
    32  	"encoding/json"
    33  	"fmt"
    34  	"sync"
    35  	"sync/atomic"
    36  )
    37  
    38  import (
    39  	dubbogoLogger "github.com/dubbogo/gost/log/logger"
    40  
    41  	"google.golang.org/grpc/balancer"
    42  
    43  	"google.golang.org/grpc/connectivity"
    44  
    45  	"google.golang.org/grpc/resolver"
    46  
    47  	"google.golang.org/grpc/serviceconfig"
    48  )
    49  
    50  import (
    51  	internal "dubbo.apache.org/dubbo-go/v3/xds"
    52  	"dubbo.apache.org/dubbo-go/v3/xds/balancer/loadstore"
    53  	"dubbo.apache.org/dubbo-go/v3/xds/client"
    54  	"dubbo.apache.org/dubbo-go/v3/xds/client/load"
    55  	"dubbo.apache.org/dubbo-go/v3/xds/client/resource"
    56  	"dubbo.apache.org/dubbo-go/v3/xds/utils/buffer"
    57  	"dubbo.apache.org/dubbo-go/v3/xds/utils/grpcsync"
    58  	"dubbo.apache.org/dubbo-go/v3/xds/utils/pretty"
    59  )
    60  
    61  const (
    62  	// Name is the name of the cluster_impl balancer.
    63  	Name                   = "xds_cluster_impl_experimental"
    64  	defaultRequestCountMax = 1024
    65  )
    66  
    67  func init() {
    68  	balancer.Register(bb{})
    69  }
    70  
    71  type bb struct{}
    72  
    73  func (bb) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer {
    74  	b := &clusterImplBalancer{
    75  		ClientConn:      cc,
    76  		bOpts:           bOpts,
    77  		closed:          grpcsync.NewEvent(),
    78  		done:            grpcsync.NewEvent(),
    79  		loadWrapper:     loadstore.NewWrapper(),
    80  		scWrappers:      make(map[balancer.SubConn]*scWrapper),
    81  		pickerUpdateCh:  buffer.NewUnbounded(),
    82  		requestCountMax: defaultRequestCountMax,
    83  	}
    84  	b.logger = dubbogoLogger.GetLogger()
    85  	go b.run()
    86  	b.logger.Infof("Created")
    87  	return b
    88  }
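
// Illustrative sketch (not part of the original source): a parent xDS policy
// would normally obtain this balancer through the registry by name rather than
// constructing bb directly. parentCC and the empty BuildOptions below are
// hypothetical placeholders for whatever the caller already has.
//
//	var parentCC balancer.ClientConn // provided by the parent policy
//	builder := balancer.Get(Name)    // "xds_cluster_impl_experimental", registered in init()
//	child := builder.Build(parentCC, balancer.BuildOptions{})
//	defer child.Close()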
    89  
    90  func (bb) Name() string {
    91  	return Name
    92  }
    93  
    94  func (bb) ParseConfig(c json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
    95  	return parseConfig(c)
    96  }
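
// A hypothetical service-config fragment that ParseConfig is expected to
// accept, assuming the JSON field names mirror the upstream gRPC
// xds_cluster_impl config (LBConfig itself is defined in this package's
// config.go, not in this file); shown here only as a sketch:
//
//	{
//	  "cluster": "cluster-a",
//	  "edsServiceName": "service-a",
//	  "maxConcurrentRequests": 512,
//	  "dropCategories": [{"category": "throttle", "requestsPerMillion": 1000}],
//	  "childPolicy": [{"round_robin": {}}]
//	}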
    97  
    98  type clusterImplBalancer struct {
    99  	balancer.ClientConn
   100  
    101  	// mu guarantees mutual exclusion between Close() and the handling of
    102  	// picker updates to the parent ClientConn in run(). It makes sure that the
    103  	// run() goroutine doesn't send a picker update to the parent after the
    104  	// balancer is closed.
    105  	//
    106  	// It is only used by the run() goroutine, not by the other exported
    107  	// functions, because those are already guaranteed to be synchronized
    108  	// with Close().
   109  	mu     sync.Mutex
   110  	closed *grpcsync.Event
   111  	done   *grpcsync.Event
   112  
   113  	bOpts     balancer.BuildOptions
   114  	logger    dubbogoLogger.Logger
   115  	xdsClient client.XDSClient
   116  
   117  	config           *LBConfig
   118  	childLB          balancer.Balancer
   119  	cancelLoadReport func()
   120  	edsServiceName   string
   121  	lrsServerName    *string
   122  	loadWrapper      *loadstore.Wrapper
   123  
   124  	clusterNameMu sync.Mutex
   125  	clusterName   string
   126  
   127  	scWrappersMu sync.Mutex
    128  	// The SubConns passed to the child policy are wrapped in a wrapper, to keep
    129  	// the locality ID. But when the parent ClientConn sends updates, it gives
    130  	// the original SubConn, not the wrapper. Since the child policies only
    131  	// know about the wrapper, forwarded SubConn updates must be sent for the
    132  	// wrappers.
    133  	//
    134  	// This keeps a map from the original SubConn to the wrapper, so that when
    135  	// forwarding a SubConn state update, the child policy will get the
    136  	// wrapper.
   137  	scWrappers map[balancer.SubConn]*scWrapper
   138  
    139  	// childState/drops/requestCounter keep the state used by the most recently
    140  	// generated picker. These fields can only be accessed in run(), which is
    141  	// the only goroutine that sends pickers to the parent ClientConn. All
    142  	// requests to update the picker need to be sent to pickerUpdateCh.
   143  	childState            balancer.State
   144  	dropCategories        []DropConfig // The categories for drops.
   145  	drops                 []*dropper
   146  	requestCounterCluster string // The cluster name for the request counter.
   147  	requestCounterService string // The service name for the request counter.
   148  	requestCounter        *client.ClusterRequestsCounter
   149  	requestCountMax       uint32
   150  	pickerUpdateCh        *buffer.Unbounded
   151  }
   152  
    153  // updateLoadStore checks the load reporting config, and decides whether it
    154  // needs to restart the load reporting stream.
   155  func (b *clusterImplBalancer) updateLoadStore(newConfig *LBConfig) error {
   156  	var updateLoadClusterAndService bool
   157  
    158  	// If the cluster name is different, restart load reporting. The reported
    159  	// name is derived from the config's Cluster and EDSServiceName fields.
   160  	clusterName := b.getClusterName()
   161  	if clusterName != newConfig.Cluster {
   162  		updateLoadClusterAndService = true
   163  		b.setClusterName(newConfig.Cluster)
   164  		clusterName = newConfig.Cluster
   165  	}
   166  	if b.edsServiceName != newConfig.EDSServiceName {
   167  		updateLoadClusterAndService = true
   168  		b.edsServiceName = newConfig.EDSServiceName
   169  	}
   170  	if updateLoadClusterAndService {
    171  		// This updates the clusterName and serviceName that will be reported
    172  		// for the loads. The update here is too early; the perfect timing is
    173  		// when the picker is updated with the new connection. But from this
    174  		// balancer's point of view, it's impossible to tell.
    175  		//
    176  		// On the other hand, this will almost never happen. Each LRS policy
    177  		// shouldn't get an updated config. The parent should do a graceful
    178  		// switch when the clusterName or serviceName changes.
   179  		b.loadWrapper.UpdateClusterAndService(clusterName, b.edsServiceName)
   180  	}
   181  
   182  	var (
   183  		stopOldLoadReport  bool
   184  		startNewLoadReport bool
   185  	)
   186  
   187  	// Check if it's necessary to restart load report.
   188  	if b.lrsServerName == nil {
   189  		if newConfig.LoadReportingServerName != nil {
   190  			// Old is nil, new is not nil, start new LRS.
   191  			b.lrsServerName = newConfig.LoadReportingServerName
   192  			startNewLoadReport = true
   193  		}
   194  		// Old is nil, new is nil, do nothing.
   195  	} else if newConfig.LoadReportingServerName == nil {
   196  		// Old is not nil, new is nil, stop old, don't start new.
   197  		b.lrsServerName = newConfig.LoadReportingServerName
   198  		stopOldLoadReport = true
   199  	} else {
   200  		// Old is not nil, new is not nil, compare string values, if
   201  		// different, stop old and start new.
   202  		if *b.lrsServerName != *newConfig.LoadReportingServerName {
   203  			b.lrsServerName = newConfig.LoadReportingServerName
   204  			stopOldLoadReport = true
   205  			startNewLoadReport = true
   206  		}
   207  	}
   208  
   209  	if stopOldLoadReport {
   210  		if b.cancelLoadReport != nil {
   211  			b.cancelLoadReport()
   212  			b.cancelLoadReport = nil
   213  			if !startNewLoadReport {
    214  				// If a new LRS stream will be started later, there's no
    215  				// need to reset the load store to nil here.
   216  				b.loadWrapper.UpdateLoadStore(nil)
   217  			}
   218  		}
   219  	}
   220  	if startNewLoadReport {
   221  		var loadStore *load.Store
   222  		if b.xdsClient != nil {
   223  			loadStore, b.cancelLoadReport = b.xdsClient.ReportLoad(*b.lrsServerName)
   224  		}
   225  		b.loadWrapper.UpdateLoadStore(loadStore)
   226  	}
   227  
   228  	return nil
   229  }
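
// A compact summary of the restart decisions made above (derived from the
// code; the example server names are purely illustrative):
//
//	old lrsServerName | new LoadReportingServerName | action
//	nil               | nil                         | nothing
//	nil               | "lrs-a"                     | start new report
//	"lrs-a"           | nil                         | stop old report
//	"lrs-a"           | "lrs-b"                     | stop old, start new
//	"lrs-a"           | "lrs-a"                     | nothing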
   230  
   231  func (b *clusterImplBalancer) UpdateClientConnState(s balancer.ClientConnState) error {
   232  	if b.closed.HasFired() {
   233  		b.logger.Warnf("xds: received ClientConnState {%+v} after clusterImplBalancer was closed", s)
   234  		return nil
   235  	}
   236  
   237  	b.logger.Infof("Received update from resolver, balancer config: %+v", pretty.ToJSON(s.BalancerConfig))
   238  	newConfig, ok := s.BalancerConfig.(*LBConfig)
   239  	if !ok {
   240  		return fmt.Errorf("unexpected balancer config with type: %T", s.BalancerConfig)
   241  	}
   242  
   243  	// Need to check for potential errors at the beginning of this function, so
   244  	// that on errors, we reject the whole config, instead of applying part of
   245  	// it.
   246  	bb := balancer.Get(newConfig.ChildPolicy.Name)
   247  	if bb == nil {
   248  		return fmt.Errorf("balancer %q not registered", newConfig.ChildPolicy.Name)
   249  	}
   250  
   251  	if b.xdsClient == nil {
   252  		c := client.FromResolverState(s.ResolverState)
   253  		if c == nil {
   254  			return balancer.ErrBadResolverState
   255  		}
   256  		b.xdsClient = c
   257  	}
   258  
   259  	// Update load reporting config. This needs to be done before updating the
   260  	// child policy because we need the loadStore from the updated client to be
   261  	// passed to the ccWrapper, so that the next picker from the child policy
   262  	// will pick up the new loadStore.
   263  	if err := b.updateLoadStore(newConfig); err != nil {
   264  		return err
   265  	}
   266  
   267  	// If child policy is a different type, recreate the sub-balancer.
   268  	if b.config == nil || b.config.ChildPolicy.Name != newConfig.ChildPolicy.Name {
   269  		if b.childLB != nil {
   270  			b.childLB.Close()
   271  		}
   272  		b.childLB = bb.Build(b, b.bOpts)
   273  	}
   274  	b.config = newConfig
   275  
   276  	if b.childLB == nil {
   277  		// This is not an expected situation, and should be super rare in
   278  		// practice.
   279  		//
   280  		// When this happens, we already applied all the other configurations
   281  		// (drop/circuit breaking), but there's no child policy. This balancer
   282  		// will be stuck, and we report the error to the parent.
   283  		return fmt.Errorf("child policy is nil, this means balancer %q's Build() returned nil", newConfig.ChildPolicy.Name)
   284  	}
   285  
   286  	// Notify run() of this new config, in case drop and request counter need
   287  	// update (which means a new picker needs to be generated).
   288  	b.pickerUpdateCh.Put(newConfig)
   289  
   290  	// Addresses and sub-balancer config are sent to sub-balancer.
   291  	return b.childLB.UpdateClientConnState(balancer.ClientConnState{
   292  		ResolverState:  s.ResolverState,
   293  		BalancerConfig: b.config.ChildPolicy.Config,
   294  	})
   295  }
   296  
   297  func (b *clusterImplBalancer) ResolverError(err error) {
   298  	if b.closed.HasFired() {
   299  		b.logger.Warnf("xds: received resolver error {%+v} after clusterImplBalancer was closed", err)
   300  		return
   301  	}
   302  
   303  	if b.childLB != nil {
   304  		b.childLB.ResolverError(err)
   305  	}
   306  }
   307  
   308  func (b *clusterImplBalancer) UpdateSubConnState(sc balancer.SubConn, s balancer.SubConnState) {
   309  	if b.closed.HasFired() {
   310  		b.logger.Warnf("xds: received subconn state change {%+v, %+v} after clusterImplBalancer was closed", sc, s)
   311  		return
   312  	}
   313  
    314  	// Trigger re-resolution when a SubConn enters TRANSIENT_FAILURE. This is
    315  	// necessary for the LogicalDNS in the cluster_resolver policy to re-resolve.
    316  	//
    317  	// Note that this happens not only for addresses from DNS, but also for
    318  	// EDS (cluster_impl doesn't know whether it's DNS or EDS; only the parent
    319  	// knows). The parent priority policy is configured to ignore re-resolution
    320  	// signals from its EDS children.
   321  	if s.ConnectivityState == connectivity.TransientFailure {
   322  		b.ClientConn.ResolveNow(resolver.ResolveNowOptions{})
   323  	}
   324  
   325  	b.scWrappersMu.Lock()
   326  	if scw, ok := b.scWrappers[sc]; ok {
   327  		sc = scw
   328  		if s.ConnectivityState == connectivity.Shutdown {
   329  			// Remove this SubConn from the map on Shutdown.
   330  			delete(b.scWrappers, scw.SubConn)
   331  		}
   332  	}
   333  	b.scWrappersMu.Unlock()
   334  	if b.childLB != nil {
   335  		b.childLB.UpdateSubConnState(sc, s)
   336  	}
   337  }
   338  
   339  func (b *clusterImplBalancer) Close() {
   340  	b.mu.Lock()
   341  	b.closed.Fire()
   342  	b.mu.Unlock()
   343  
   344  	if b.childLB != nil {
   345  		b.childLB.Close()
   346  		b.childLB = nil
   347  	}
   348  	<-b.done.Done()
   349  	b.logger.Infof("Shutdown")
   350  }
   351  
   352  func (b *clusterImplBalancer) ExitIdle() {
   353  	if b.childLB == nil {
   354  		return
   355  	}
   356  	if ei, ok := b.childLB.(balancer.ExitIdler); ok {
   357  		ei.ExitIdle()
   358  		return
   359  	}
   360  	// Fallback for children that don't support ExitIdle -- connect to all
   361  	// SubConns.
   362  	for _, sc := range b.scWrappers {
   363  		sc.Connect()
   364  	}
   365  }
   366  
   367  // Override methods to accept updates from the child LB.
   368  
   369  func (b *clusterImplBalancer) UpdateState(state balancer.State) {
   370  	// Instead of updating parent ClientConn inline, send state to run().
   371  	b.pickerUpdateCh.Put(state)
   372  }
   373  
   374  func (b *clusterImplBalancer) setClusterName(n string) {
   375  	b.clusterNameMu.Lock()
   376  	defer b.clusterNameMu.Unlock()
   377  	b.clusterName = n
   378  }
   379  
   380  func (b *clusterImplBalancer) getClusterName() string {
   381  	b.clusterNameMu.Lock()
   382  	defer b.clusterNameMu.Unlock()
   383  	return b.clusterName
   384  }
   385  
   386  // scWrapper is a wrapper of SubConn with locality ID. The locality ID can be
   387  // retrieved from the addresses when creating SubConn.
   388  //
   389  // All SubConns passed to the child policies are wrapped in this, so that the
   390  // picker can get the localityID from the picked SubConn, and do load reporting.
   391  //
   392  // After wrapping, all SubConns to and from the parent ClientConn (e.g. for
   393  // SubConn state update, update/remove SubConn) must be the original SubConns.
   394  // All SubConns to and from the child policy (NewSubConn, forwarding SubConn
   395  // state update) must be the wrapper. The balancer keeps a map from the original
   396  // SubConn to the wrapper for this purpose.
   397  type scWrapper struct {
   398  	balancer.SubConn
   399  	// locality needs to be atomic because it can be updated while being read by
   400  	// the picker.
   401  	locality atomic.Value // type resource.LocalityID
   402  }
   403  
   404  func (scw *scWrapper) updateLocalityID(lID resource.LocalityID) {
   405  	scw.locality.Store(lID)
   406  }
   407  
   408  func (scw *scWrapper) localityID() resource.LocalityID {
   409  	lID, _ := scw.locality.Load().(resource.LocalityID)
   410  	return lID
   411  }
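
// Illustrative only (not in this file): because child policies only ever see
// *scWrapper values, a SubConn picked by the child policy can be type-asserted
// back to the wrapper to recover the locality for load reporting. Sketch, with
// pr as a hypothetical balancer.PickResult returned by the child picker:
//
//	if scw, ok := pr.SubConn.(*scWrapper); ok {
//		locality := scw.localityID() // resource.LocalityID stored via updateLocalityID
//		_ = locality                 // e.g. attach it to per-locality load reports
//	}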
   412  
   413  func (b *clusterImplBalancer) NewSubConn(addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
   414  	clusterName := b.getClusterName()
   415  	newAddrs := make([]resolver.Address, len(addrs))
   416  	var lID resource.LocalityID
   417  	for i, addr := range addrs {
   418  		newAddrs[i] = internal.SetXDSHandshakeClusterName(addr, clusterName)
   419  		lID = resource.GetLocalityID(newAddrs[i])
   420  	}
   421  	sc, err := b.ClientConn.NewSubConn(newAddrs, opts)
   422  	if err != nil {
   423  		return nil, err
   424  	}
   425  	// Wrap this SubConn in a wrapper, and add it to the map.
   426  	b.scWrappersMu.Lock()
   427  	ret := &scWrapper{SubConn: sc}
   428  	ret.updateLocalityID(lID)
   429  	b.scWrappers[sc] = ret
   430  	b.scWrappersMu.Unlock()
   431  	return ret, nil
   432  }
   433  
   434  func (b *clusterImplBalancer) RemoveSubConn(sc balancer.SubConn) {
   435  	scw, ok := sc.(*scWrapper)
   436  	if !ok {
   437  		b.ClientConn.RemoveSubConn(sc)
   438  		return
   439  	}
   440  	// Remove the original SubConn from the parent ClientConn.
   441  	//
    442  	// Note that we don't remove this SubConn from the scWrappers map. We will
    443  	// need it to forward the final Shutdown state to the child policy.
    444  	//
    445  	// The entry is kept in the map until its state changes to Shutdown, at
    446  	// which point it is deleted in UpdateSubConnState().
   447  	b.ClientConn.RemoveSubConn(scw.SubConn)
   448  }
   449  
   450  func (b *clusterImplBalancer) UpdateAddresses(sc balancer.SubConn, addrs []resolver.Address) {
   451  	clusterName := b.getClusterName()
   452  	newAddrs := make([]resolver.Address, len(addrs))
   453  	var lID resource.LocalityID
   454  	for i, addr := range addrs {
   455  		newAddrs[i] = internal.SetXDSHandshakeClusterName(addr, clusterName)
   456  		lID = resource.GetLocalityID(newAddrs[i])
   457  	}
   458  	if scw, ok := sc.(*scWrapper); ok {
   459  		scw.updateLocalityID(lID)
   460  		// Need to get the original SubConn from the wrapper before calling
   461  		// parent ClientConn.
   462  		sc = scw.SubConn
   463  	}
   464  	b.ClientConn.UpdateAddresses(sc, newAddrs)
   465  }
   466  
   467  type dropConfigs struct {
   468  	drops           []*dropper
   469  	requestCounter  *client.ClusterRequestsCounter
   470  	requestCountMax uint32
   471  }
   472  
    473  // handleDropAndRequestCount compares the drop and request-counter settings in
    474  // newConfig with those currently used by the picker. It returns a new
    475  // dropConfigs if a new picker needs to be generated; otherwise it returns nil.
   476  func (b *clusterImplBalancer) handleDropAndRequestCount(newConfig *LBConfig) *dropConfigs {
    477  	// Compare the new drop config, and update the picker if it has changed.
   478  	var updatePicker bool
   479  	if !equalDropCategories(b.dropCategories, newConfig.DropCategories) {
   480  		b.dropCategories = newConfig.DropCategories
   481  		b.drops = make([]*dropper, 0, len(newConfig.DropCategories))
   482  		for _, c := range newConfig.DropCategories {
   483  			b.drops = append(b.drops, newDropper(c))
   484  		}
   485  		updatePicker = true
   486  	}
   487  
    488  	// Compare the cluster name, and update the picker if it has changed,
    489  	// because circuit breaking's stream counter will be different.
   490  	if b.requestCounterCluster != newConfig.Cluster || b.requestCounterService != newConfig.EDSServiceName {
   491  		b.requestCounterCluster = newConfig.Cluster
   492  		b.requestCounterService = newConfig.EDSServiceName
   493  		b.requestCounter = client.GetClusterRequestsCounter(newConfig.Cluster, newConfig.EDSServiceName)
   494  		updatePicker = true
   495  	}
    496  	// Compare the upper bound of the stream count, and update the picker if it
    497  	// has changed. This is also for circuit breaking.
    498  	var newRequestCountMax uint32 = defaultRequestCountMax
   499  	if newConfig.MaxConcurrentRequests != nil {
   500  		newRequestCountMax = *newConfig.MaxConcurrentRequests
   501  	}
   502  	if b.requestCountMax != newRequestCountMax {
   503  		b.requestCountMax = newRequestCountMax
   504  		updatePicker = true
   505  	}
   506  
   507  	if !updatePicker {
   508  		return nil
   509  	}
   510  	return &dropConfigs{
   511  		drops:           b.drops,
   512  		requestCounter:  b.requestCounter,
   513  		requestCountMax: b.requestCountMax,
   514  	}
   515  }
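
// For example (hypothetical values; the field names are the ones used
// elsewhere in this file), an update that only raises the circuit-breaking
// limit still forces a new picker:
//
//	max := uint32(2048)
//	dc := b.handleDropAndRequestCount(&LBConfig{
//		Cluster:               "cluster-a",
//		EDSServiceName:        "service-a",
//		MaxConcurrentRequests: &max,
//	})
//	// dc is non-nil whenever 2048 differs from the previous requestCountMax,
//	// so run() regenerates the picker with the new limit.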
   516  
   517  func (b *clusterImplBalancer) run() {
   518  	defer b.done.Fire()
   519  	for {
   520  		select {
   521  		case update := <-b.pickerUpdateCh.Get():
   522  			b.pickerUpdateCh.Load()
   523  			b.mu.Lock()
   524  			if b.closed.HasFired() {
   525  				b.mu.Unlock()
   526  				return
   527  			}
   528  			switch u := update.(type) {
   529  			case balancer.State:
   530  				b.childState = u
   531  				b.ClientConn.UpdateState(balancer.State{
   532  					ConnectivityState: b.childState.ConnectivityState,
   533  					Picker: newPicker(b.childState, &dropConfigs{
   534  						drops:           b.drops,
   535  						requestCounter:  b.requestCounter,
   536  						requestCountMax: b.requestCountMax,
   537  					}, b.loadWrapper),
   538  				})
   539  			case *LBConfig:
   540  				dc := b.handleDropAndRequestCount(u)
   541  				if dc != nil && b.childState.Picker != nil {
   542  					b.ClientConn.UpdateState(balancer.State{
   543  						ConnectivityState: b.childState.ConnectivityState,
   544  						Picker:            newPicker(b.childState, dc, b.loadWrapper),
   545  					})
   546  				}
   547  			}
   548  			b.mu.Unlock()
   549  		case <-b.closed.Done():
   550  			if b.cancelLoadReport != nil {
   551  				b.cancelLoadReport()
   552  				b.cancelLoadReport = nil
   553  			}
   554  			return
   555  		}
   556  	}
   557  }
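
// The pickerUpdateCh pattern used by run() above, shown in isolation (a sketch
// based only on the buffer.Unbounded calls that appear in this file:
// NewUnbounded, Put, Get and Load):
//
//	ch := buffer.NewUnbounded()
//	ch.Put(update)   // producer side: UpdateState / UpdateClientConnState
//	v := <-ch.Get()  // consumer side: run() receives the oldest buffered item
//	ch.Load()        // tell the buffer to make the next item available
//	_ = v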