github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/grpc/xds/internal/balancer/clusterimpl/clusterimpl.go

     1  /*
     2   *
     3   * Copyright 2020 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  // Package clusterimpl implements the xds_cluster_impl balancing policy. It
    20  // handles the cluster features (e.g. circuit_breaking, RPC dropping).
    21  //
    22  // Note that it doesn't handle name resolution, which is done by the
    23  // xds_cluster_resolver policy.
    24  package clusterimpl
    25  
    26  import (
    27  	"encoding/json"
    28  	"fmt"
    29  	"sync"
    30  	"sync/atomic"
    31  
    32  	"github.com/hxx258456/ccgo/grpc/balancer"
    33  	"github.com/hxx258456/ccgo/grpc/connectivity"
    34  	"github.com/hxx258456/ccgo/grpc/internal"
    35  	"github.com/hxx258456/ccgo/grpc/internal/buffer"
    36  	"github.com/hxx258456/ccgo/grpc/internal/grpclog"
    37  	"github.com/hxx258456/ccgo/grpc/internal/grpcsync"
    38  	"github.com/hxx258456/ccgo/grpc/internal/pretty"
    39  	"github.com/hxx258456/ccgo/grpc/resolver"
    40  	"github.com/hxx258456/ccgo/grpc/serviceconfig"
    41  	xdsinternal "github.com/hxx258456/ccgo/grpc/xds/internal"
    42  	"github.com/hxx258456/ccgo/grpc/xds/internal/balancer/loadstore"
    43  	"github.com/hxx258456/ccgo/grpc/xds/internal/xdsclient"
    44  	"github.com/hxx258456/ccgo/grpc/xds/internal/xdsclient/load"
    45  )
    46  
    47  const (
    48  	// Name is the name of the cluster_impl balancer.
    49  	Name                   = "xds_cluster_impl_experimental"
    50  	defaultRequestCountMax = 1024
    51  )
    52  
    53  func init() {
    54  	balancer.Register(bb{})
    55  }
    56  
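        // bb is the builder for the cluster_impl balancer. It implements
        // balancer.Builder (Build/Name) and balancer.ConfigParser (ParseConfig).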
    57  type bb struct{}
    58  
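        // Build creates a clusterImplBalancer, starts its run() goroutine, and
        // returns it as the balancer for the given ClientConn.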
    59  func (bb) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer {
    60  	b := &clusterImplBalancer{
    61  		ClientConn:      cc,
    62  		bOpts:           bOpts,
    63  		closed:          grpcsync.NewEvent(),
    64  		done:            grpcsync.NewEvent(),
    65  		loadWrapper:     loadstore.NewWrapper(),
    66  		scWrappers:      make(map[balancer.SubConn]*scWrapper),
    67  		pickerUpdateCh:  buffer.NewUnbounded(),
    68  		requestCountMax: defaultRequestCountMax,
    69  	}
    70  	b.logger = prefixLogger(b)
    71  	go b.run()
    72  	b.logger.Infof("Created")
    73  	return b
    74  }
    75  
    76  func (bb) Name() string {
    77  	return Name
    78  }
    79  
    80  func (bb) ParseConfig(c json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
    81  	return parseConfig(c)
    82  }
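
        // For illustration only, a hedged sketch of how a parent might exercise
        // ParseConfig. The JSON keys below are assumptions; the authoritative field
        // tags live on LBConfig in this package's config.go:
        //
        //	builder := balancer.Get(Name)
        //	parser := builder.(balancer.ConfigParser)
        //	cfg, err := parser.ParseConfig(json.RawMessage(`{
        //	  "cluster": "cluster_a",
        //	  "edsServiceName": "service_a",
        //	  "maxConcurrentRequests": 512,
        //	  "dropCategories": [{"category": "throttle", "requestsPerMillion": 1000}],
        //	  "childPolicy": [{"round_robin": {}}]
        //	}`))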
    83  
    84  type clusterImplBalancer struct {
    85  	balancer.ClientConn
    86  
    87  	// mu guarantees mutual exclusion between Close() and the handling of
    88  	// picker updates to the parent ClientConn in run(). It makes sure the
    89  	// run() goroutine doesn't send a picker update to the parent after the
    90  	// balancer is closed.
    91  	//
    92  	// Besides Close(), it's only used by the run() goroutine; the other
    93  	// exported methods don't need it because they are guaranteed to be
    94  	// synchronized with Close().
    95  	mu     sync.Mutex
    96  	closed *grpcsync.Event
    97  	done   *grpcsync.Event
    98  
    99  	bOpts     balancer.BuildOptions
   100  	logger    *grpclog.PrefixLogger
   101  	xdsClient xdsclient.XDSClient
   102  
   103  	config           *LBConfig
   104  	childLB          balancer.Balancer
   105  	cancelLoadReport func()
   106  	edsServiceName   string
   107  	lrsServerName    *string
   108  	loadWrapper      *loadstore.Wrapper
   109  
   110  	clusterNameMu sync.Mutex
   111  	clusterName   string
   112  
   113  	scWrappersMu sync.Mutex
   114  	// The SubConns passed to the child policy are wrapped in scWrapper to
   115  	// carry the locality ID. When the parent ClientConn sends a SubConn
   116  	// state update, it refers to the original SubConn, not the wrapper, but
   117  	// the child policy only knows about the wrapper, so updates must be
   118  	// forwarded using the wrapper.
   119  	//
   120  	// scWrappers maps each original SubConn to its wrapper, so that when a
   121  	// SubConn state update is forwarded, the child policy receives the
   122  	// wrapper.
   123  	scWrappers map[balancer.SubConn]*scWrapper
   124  
   125  	// childState/drops/requestCounter keep the state used by the most
   126  	// recently generated picker. These fields may only be accessed in run(),
   127  	// which is the only goroutine that sends pickers to the parent
   128  	// ClientConn. All requests to update the picker go through pickerUpdateCh.
   129  	childState            balancer.State
   130  	dropCategories        []DropConfig // The categories for drops.
   131  	drops                 []*dropper
   132  	requestCounterCluster string // The cluster name for the request counter.
   133  	requestCounterService string // The service name for the request counter.
   134  	requestCounter        *xdsclient.ClusterRequestsCounter
   135  	requestCountMax       uint32
   136  	pickerUpdateCh        *buffer.Unbounded
   137  }
   138  
   139  // updateLoadStore checks the load reporting fields of the config and decides
   140  // whether the load reporting stream needs to be restarted.
   141  func (b *clusterImplBalancer) updateLoadStore(newConfig *LBConfig) error {
   142  	var updateLoadClusterAndService bool
   143  
   144  	// The load reporting cluster and service names come from Cluster and
   145  	// EDSServiceName; if either changes, update them on the load wrapper.
   146  	clusterName := b.getClusterName()
   147  	if clusterName != newConfig.Cluster {
   148  		updateLoadClusterAndService = true
   149  		b.setClusterName(newConfig.Cluster)
   150  		clusterName = newConfig.Cluster
   151  	}
   152  	if b.edsServiceName != newConfig.EDSServiceName {
   153  		updateLoadClusterAndService = true
   154  		b.edsServiceName = newConfig.EDSServiceName
   155  	}
   156  	if updateLoadClusterAndService {
   157  		// This updates the clusterName and serviceName that will be reported
   158  		// for the loads. The update happens a bit early: ideally it would be
   159  		// applied when the picker is updated with the new connections, but
   160  		// this balancer has no way to tell when that happens.
   161  		//
   162  		// In practice this should almost never matter, because a policy isn't
   163  		// expected to receive a config with a new clusterName or serviceName;
   164  		// the parent should do a graceful switch instead.
   165  		b.loadWrapper.UpdateClusterAndService(clusterName, b.edsServiceName)
   166  	}
   167  
   168  	var (
   169  		stopOldLoadReport  bool
   170  		startNewLoadReport bool
   171  	)
   172  
   173  	// Check if it's necessary to restart load report.
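        	// Summarizing the cases handled below:
        	//
        	//	old lrsServerName | new LoadReportingServerName | action
        	//	------------------+-----------------------------+--------------------
        	//	nil               | nil                         | nothing
        	//	nil               | non-nil                     | start new stream
        	//	non-nil           | nil                         | stop old stream
        	//	non-nil           | same non-nil value          | nothing
        	//	non-nil           | different non-nil value     | stop old, start new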
   174  	if b.lrsServerName == nil {
   175  		if newConfig.LoadReportingServerName != nil {
   176  			// Old is nil, new is not nil, start new LRS.
   177  			b.lrsServerName = newConfig.LoadReportingServerName
   178  			startNewLoadReport = true
   179  		}
   180  		// Old is nil, new is nil, do nothing.
   181  	} else if newConfig.LoadReportingServerName == nil {
   182  		// Old is not nil, new is nil, stop old, don't start new.
   183  		b.lrsServerName = newConfig.LoadReportingServerName
   184  		stopOldLoadReport = true
   185  	} else {
   186  		// Old is not nil, new is not nil, compare string values, if
   187  		// different, stop old and start new.
   188  		if *b.lrsServerName != *newConfig.LoadReportingServerName {
   189  			b.lrsServerName = newConfig.LoadReportingServerName
   190  			stopOldLoadReport = true
   191  			startNewLoadReport = true
   192  		}
   193  	}
   194  
   195  	if stopOldLoadReport {
   196  		if b.cancelLoadReport != nil {
   197  			b.cancelLoadReport()
   198  			b.cancelLoadReport = nil
   199  			if !startNewLoadReport {
   200  				// If a new LRS stream will be started later, no need to update
   201  				// it to nil here.
   202  				b.loadWrapper.UpdateLoadStore(nil)
   203  			}
   204  		}
   205  	}
   206  	if startNewLoadReport {
   207  		var loadStore *load.Store
   208  		if b.xdsClient != nil {
   209  			loadStore, b.cancelLoadReport = b.xdsClient.ReportLoad(*b.lrsServerName)
   210  		}
   211  		b.loadWrapper.UpdateLoadStore(loadStore)
   212  	}
   213  
   214  	return nil
   215  }
   216  
   217  func (b *clusterImplBalancer) UpdateClientConnState(s balancer.ClientConnState) error {
   218  	if b.closed.HasFired() {
   219  		b.logger.Warningf("xds: received ClientConnState {%+v} after clusterImplBalancer was closed", s)
   220  		return nil
   221  	}
   222  
   223  	b.logger.Infof("Received update from resolver, balancer config: %+v", pretty.ToJSON(s.BalancerConfig))
   224  	newConfig, ok := s.BalancerConfig.(*LBConfig)
   225  	if !ok {
   226  		return fmt.Errorf("unexpected balancer config with type: %T", s.BalancerConfig)
   227  	}
   228  
   229  	// Need to check for potential errors at the beginning of this function, so
   230  	// that on errors, we reject the whole config, instead of applying part of
   231  	// it.
   232  	bb := balancer.Get(newConfig.ChildPolicy.Name)
   233  	if bb == nil {
   234  		return fmt.Errorf("balancer %q not registered", newConfig.ChildPolicy.Name)
   235  	}
   236  
   237  	if b.xdsClient == nil {
   238  		c := xdsclient.FromResolverState(s.ResolverState)
   239  		if c == nil {
   240  			return balancer.ErrBadResolverState
   241  		}
   242  		b.xdsClient = c
   243  	}
   244  
   245  	// Update the load reporting config. This needs to be done before updating
   246  	// the child policy because the loadStore from the updated client must be
   247  	// set on the loadWrapper, so that the next picker generated for the child
   248  	// policy's state picks up the new loadStore.
   249  	if err := b.updateLoadStore(newConfig); err != nil {
   250  		return err
   251  	}
   252  
   253  	// If child policy is a different type, recreate the sub-balancer.
   254  	if b.config == nil || b.config.ChildPolicy.Name != newConfig.ChildPolicy.Name {
   255  		if b.childLB != nil {
   256  			b.childLB.Close()
   257  		}
   258  		b.childLB = bb.Build(b, b.bOpts)
   259  	}
   260  	b.config = newConfig
   261  
   262  	if b.childLB == nil {
   263  		// This is not an expected situation, and should be super rare in
   264  		// practice.
   265  		//
   266  		// When this happens, we already applied all the other configurations
   267  		// (drop/circuit breaking), but there's no child policy. This balancer
   268  		// will be stuck, and we report the error to the parent.
   269  		return fmt.Errorf("child policy is nil, this means balancer %q's Build() returned nil", newConfig.ChildPolicy.Name)
   270  	}
   271  
   272  	// Notify run() of this new config, in case the drops and request counter
   273  	// need to be updated (which means a new picker needs to be generated).
   274  	b.pickerUpdateCh.Put(newConfig)
   275  
   276  	// Addresses and sub-balancer config are sent to sub-balancer.
   277  	return b.childLB.UpdateClientConnState(balancer.ClientConnState{
   278  		ResolverState:  s.ResolverState,
   279  		BalancerConfig: b.config.ChildPolicy.Config,
   280  	})
   281  }
   282  
   283  func (b *clusterImplBalancer) ResolverError(err error) {
   284  	if b.closed.HasFired() {
   285  		b.logger.Warningf("xds: received resolver error {%+v} after clusterImplBalancer was closed", err)
   286  		return
   287  	}
   288  
   289  	if b.childLB != nil {
   290  		b.childLB.ResolverError(err)
   291  	}
   292  }
   293  
   294  func (b *clusterImplBalancer) UpdateSubConnState(sc balancer.SubConn, s balancer.SubConnState) {
   295  	if b.closed.HasFired() {
   296  		b.logger.Warningf("xds: received subconn state change {%+v, %+v} after clusterImplBalancer was closed", sc, s)
   297  		return
   298  	}
   299  
   300  	// Trigger re-resolution when a SubConn enters TransientFailure. This is
   301  	// necessary for LogicalDNS clusters in the cluster_resolver policy to
   302  	// re-resolve.
   303  	//
   304  	// Note that this happens not only for addresses from DNS, but also for EDS
   305  	// (cluster_impl can't tell which; only the parent knows). The parent
   306  	// priority policy is configured to ignore re-resolution from EDS children.
   307  	if s.ConnectivityState == connectivity.TransientFailure {
   308  		b.ClientConn.ResolveNow(resolver.ResolveNowOptions{})
   309  	}
   310  
   311  	b.scWrappersMu.Lock()
   312  	if scw, ok := b.scWrappers[sc]; ok {
   313  		sc = scw
   314  		if s.ConnectivityState == connectivity.Shutdown {
   315  			// Remove this SubConn from the map on Shutdown.
   316  			delete(b.scWrappers, scw.SubConn)
   317  		}
   318  	}
   319  	b.scWrappersMu.Unlock()
   320  	if b.childLB != nil {
   321  		b.childLB.UpdateSubConnState(sc, s)
   322  	}
   323  }
   324  
   325  func (b *clusterImplBalancer) Close() {
   326  	b.mu.Lock()
   327  	b.closed.Fire()
   328  	b.mu.Unlock()
   329  
   330  	if b.childLB != nil {
   331  		b.childLB.Close()
   332  		b.childLB = nil
   333  	}
   334  	<-b.done.Done()
   335  	b.logger.Infof("Shutdown")
   336  }
   337  
   338  func (b *clusterImplBalancer) ExitIdle() {
   339  	if b.childLB == nil {
   340  		return
   341  	}
   342  	if ei, ok := b.childLB.(balancer.ExitIdler); ok {
   343  		ei.ExitIdle()
   344  		return
   345  	}
   346  	// Fallback for children that don't support ExitIdle -- connect to all
   347  	// SubConns.
   348  	for _, sc := range b.scWrappers {
   349  		sc.Connect()
   350  	}
   351  }
   352  
   353  // Override methods to accept updates from the child LB.
   354  
   355  func (b *clusterImplBalancer) UpdateState(state balancer.State) {
   356  	// Instead of updating parent ClientConn inline, send state to run().
   357  	b.pickerUpdateCh.Put(state)
   358  }
   359  
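        // setClusterName/getClusterName guard clusterName with clusterNameMu because
        // it is written from UpdateClientConnState (via updateLoadStore) and read by
        // NewSubConn and UpdateAddresses.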
   360  func (b *clusterImplBalancer) setClusterName(n string) {
   361  	b.clusterNameMu.Lock()
   362  	defer b.clusterNameMu.Unlock()
   363  	b.clusterName = n
   364  }
   365  
   366  func (b *clusterImplBalancer) getClusterName() string {
   367  	b.clusterNameMu.Lock()
   368  	defer b.clusterNameMu.Unlock()
   369  	return b.clusterName
   370  }
   371  
   372  // scWrapper wraps a SubConn together with its locality ID, which is retrieved
   373  // from the addresses when the SubConn is created.
   374  //
   375  // All SubConns passed to the child policies are wrapped in this, so that the
   376  // picker can get the localityID from the picked SubConn, and do load reporting.
   377  //
   378  // After wrapping, all SubConns to and from the parent ClientConn (e.g. for
   379  // SubConn state update, update/remove SubConn) must be the original SubConns.
   380  // All SubConns to and from the child policy (NewSubConn, forwarding SubConn
   381  // state update) must be the wrapper. The balancer keeps a map from the original
   382  // SubConn to the wrapper for this purpose.
   383  type scWrapper struct {
   384  	balancer.SubConn
   385  	// locality needs to be atomic because it can be updated while being read by
   386  	// the picker.
   387  	locality atomic.Value // type xdsinternal.LocalityID
   388  }
   389  
   390  func (scw *scWrapper) updateLocalityID(lID xdsinternal.LocalityID) {
   391  	scw.locality.Store(lID)
   392  }
   393  
   394  func (scw *scWrapper) localityID() xdsinternal.LocalityID {
   395  	lID, _ := scw.locality.Load().(xdsinternal.LocalityID)
   396  	return lID
   397  }
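
        // A minimal sketch (not the actual picker in picker.go) of how a picker can
        // attribute a finished RPC to the locality of the picked SubConn. The
        // loadstore.Wrapper method name and the LocalityID.ToString signature used
        // below are assumptions:
        //
        //	func reportFinished(pr balancer.PickResult, load *loadstore.Wrapper, rpcErr error) {
        //		scw, ok := pr.SubConn.(*scWrapper)
        //		if !ok || load == nil {
        //			return
        //		}
        //		locality, _ := scw.localityID().ToString()
        //		load.CallFinished(locality, rpcErr)
        //	}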
   398  
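        // NewSubConn intercepts SubConn creation from the child policy: it attaches
        // the cluster name to each address for the xDS handshake, wraps the new
        // SubConn in an scWrapper carrying the locality ID, and records the wrapper
        // in scWrappers.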
   399  func (b *clusterImplBalancer) NewSubConn(addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
   400  	clusterName := b.getClusterName()
   401  	newAddrs := make([]resolver.Address, len(addrs))
   402  	var lID xdsinternal.LocalityID
   403  	for i, addr := range addrs {
   404  		newAddrs[i] = internal.SetXDSHandshakeClusterName(addr, clusterName)
   405  		lID = xdsinternal.GetLocalityID(newAddrs[i])
   406  	}
   407  	sc, err := b.ClientConn.NewSubConn(newAddrs, opts)
   408  	if err != nil {
   409  		return nil, err
   410  	}
   411  	// Wrap this SubConn in a wrapper, and add it to the map.
   412  	b.scWrappersMu.Lock()
   413  	ret := &scWrapper{SubConn: sc}
   414  	ret.updateLocalityID(lID)
   415  	b.scWrappers[sc] = ret
   416  	b.scWrappersMu.Unlock()
   417  	return ret, nil
   418  }
   419  
   420  func (b *clusterImplBalancer) RemoveSubConn(sc balancer.SubConn) {
   421  	scw, ok := sc.(*scWrapper)
   422  	if !ok {
   423  		b.ClientConn.RemoveSubConn(sc)
   424  		return
   425  	}
   426  	// Remove the original SubConn from the parent ClientConn.
   427  	//
   428  	// Note that we don't remove this SubConn from the scWrappers map. We will
   429  	// need it to forward the final Shutdown state to the child policy.
   430  	//
   431  	// The entry is kept in the map until its state changes to Shutdown, at
   432  	// which point it is deleted in UpdateSubConnState().
   433  	b.ClientConn.RemoveSubConn(scw.SubConn)
   434  }
   435  
   436  func (b *clusterImplBalancer) UpdateAddresses(sc balancer.SubConn, addrs []resolver.Address) {
   437  	clusterName := b.getClusterName()
   438  	newAddrs := make([]resolver.Address, len(addrs))
   439  	var lID xdsinternal.LocalityID
   440  	for i, addr := range addrs {
   441  		newAddrs[i] = internal.SetXDSHandshakeClusterName(addr, clusterName)
   442  		lID = xdsinternal.GetLocalityID(newAddrs[i])
   443  	}
   444  	if scw, ok := sc.(*scWrapper); ok {
   445  		scw.updateLocalityID(lID)
   446  		// Need to get the original SubConn from the wrapper before calling
   447  		// parent ClientConn.
   448  		sc = scw.SubConn
   449  	}
   450  	b.ClientConn.UpdateAddresses(sc, newAddrs)
   451  }
   452  
   453  type dropConfigs struct {
   454  	drops           []*dropper
   455  	requestCounter  *xdsclient.ClusterRequestsCounter
   456  	requestCountMax uint32
   457  }
   458  
   459  // handleDropAndRequestCount compares the drop and request counter config in
   460  // newConfig with the config currently used by the picker. It returns a new
   461  // dropConfigs if a new picker needs to be generated, and nil otherwise.
   462  func (b *clusterImplBalancer) handleDropAndRequestCount(newConfig *LBConfig) *dropConfigs {
   463  	// Compare the new drop config, and update the picker if it changed.
   464  	var updatePicker bool
   465  	if !equalDropCategories(b.dropCategories, newConfig.DropCategories) {
   466  		b.dropCategories = newConfig.DropCategories
   467  		b.drops = make([]*dropper, 0, len(newConfig.DropCategories))
   468  		for _, c := range newConfig.DropCategories {
   469  			b.drops = append(b.drops, newDropper(c))
   470  		}
   471  		updatePicker = true
   472  	}
   473  
   474  	// Compare the cluster and service names, and update the picker if either
   475  	// changed, because circuit breaking's request counter will be different.
   476  	if b.requestCounterCluster != newConfig.Cluster || b.requestCounterService != newConfig.EDSServiceName {
   477  		b.requestCounterCluster = newConfig.Cluster
   478  		b.requestCounterService = newConfig.EDSServiceName
   479  		b.requestCounter = xdsclient.GetClusterRequestsCounter(newConfig.Cluster, newConfig.EDSServiceName)
   480  		updatePicker = true
   481  	}
   482  	// Compare the upper bound of the concurrent request count, and update the
   483  	// picker if it changed. This is also for circuit breaking.
   484  	var newRequestCountMax uint32 = defaultRequestCountMax
   485  	if newConfig.MaxConcurrentRequests != nil {
   486  		newRequestCountMax = *newConfig.MaxConcurrentRequests
   487  	}
   488  	if b.requestCountMax != newRequestCountMax {
   489  		b.requestCountMax = newRequestCountMax
   490  		updatePicker = true
   491  	}
   492  
   493  	if !updatePicker {
   494  		return nil
   495  	}
   496  	return &dropConfigs{
   497  		drops:           b.drops,
   498  		requestCounter:  b.requestCounter,
   499  		requestCountMax: b.requestCountMax,
   500  	}
   501  }
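
        // A rough sketch of how a picker might consume a *dropConfigs; the real logic
        // lives in picker.go, and the dropper.drop and ClusterRequestsCounter
        // StartRequest/EndRequest names used below are assumptions:
        //
        //	func checkDropAndCircuitBreaking(dc *dropConfigs) error {
        //		for _, d := range dc.drops {
        //			if d.drop() {
        //				return status.Error(codes.Unavailable, "RPC dropped by cluster_impl")
        //			}
        //		}
        //		if dc.requestCounter != nil {
        //			// The caller must call EndRequest when the accepted RPC finishes.
        //			return dc.requestCounter.StartRequest(dc.requestCountMax)
        //		}
        //		return nil
        //	}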
   502  
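        // run drains pickerUpdateCh and is the only goroutine that sends pickers to
        // the parent ClientConn. It regenerates the picker on child state updates and
        // on LBConfig changes that affect drops or circuit breaking, and cancels load
        // reporting when the balancer is closed.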
   503  func (b *clusterImplBalancer) run() {
   504  	defer b.done.Fire()
   505  	for {
   506  		select {
   507  		case update := <-b.pickerUpdateCh.Get():
   508  			b.pickerUpdateCh.Load() // Must be called after reading from Get() to release the next item.
   509  			b.mu.Lock()
   510  			if b.closed.HasFired() {
   511  				b.mu.Unlock()
   512  				return
   513  			}
   514  			switch u := update.(type) {
   515  			case balancer.State:
   516  				b.childState = u
   517  				b.ClientConn.UpdateState(balancer.State{
   518  					ConnectivityState: b.childState.ConnectivityState,
   519  					Picker: newPicker(b.childState, &dropConfigs{
   520  						drops:           b.drops,
   521  						requestCounter:  b.requestCounter,
   522  						requestCountMax: b.requestCountMax,
   523  					}, b.loadWrapper),
   524  				})
   525  			case *LBConfig:
   526  				dc := b.handleDropAndRequestCount(u)
   527  				if dc != nil && b.childState.Picker != nil {
   528  					b.ClientConn.UpdateState(balancer.State{
   529  						ConnectivityState: b.childState.ConnectivityState,
   530  						Picker:            newPicker(b.childState, dc, b.loadWrapper),
   531  					})
   532  				}
   533  			}
   534  			b.mu.Unlock()
   535  		case <-b.closed.Done():
   536  			if b.cancelLoadReport != nil {
   537  				b.cancelLoadReport()
   538  				b.cancelLoadReport = nil
   539  			}
   540  			return
   541  		}
   542  	}
   543  }