google.golang.org/grpc@v1.62.1/xds/internal/balancer/clusterimpl/clusterimpl.go

     1  /*
     2   *
     3   * Copyright 2020 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
     19  // Package clusterimpl implements the xds_cluster_impl balancing policy. It
     20  // handles cluster-level features such as circuit breaking and RPC dropping.
     21  //
     22  // Note that it does not handle name resolution, which is done by the
     23  // xds_cluster_resolver policy.
    24  package clusterimpl
    25  
    26  import (
    27  	"encoding/json"
    28  	"fmt"
    29  	"sync"
    30  	"sync/atomic"
    31  
    32  	"google.golang.org/grpc/balancer"
    33  	"google.golang.org/grpc/connectivity"
    34  	"google.golang.org/grpc/internal"
    35  	"google.golang.org/grpc/internal/balancer/gracefulswitch"
    36  	"google.golang.org/grpc/internal/buffer"
    37  	"google.golang.org/grpc/internal/grpclog"
    38  	"google.golang.org/grpc/internal/grpcsync"
    39  	"google.golang.org/grpc/internal/pretty"
    40  	"google.golang.org/grpc/resolver"
    41  	"google.golang.org/grpc/serviceconfig"
    42  	xdsinternal "google.golang.org/grpc/xds/internal"
    43  	"google.golang.org/grpc/xds/internal/balancer/loadstore"
    44  	"google.golang.org/grpc/xds/internal/xdsclient"
    45  	"google.golang.org/grpc/xds/internal/xdsclient/bootstrap"
    46  	"google.golang.org/grpc/xds/internal/xdsclient/load"
    47  )
    48  
    49  const (
    50  	// Name is the name of the cluster_impl balancer.
    51  	Name                   = "xds_cluster_impl_experimental"
    52  	defaultRequestCountMax = 1024
    53  )
    54  
    55  func init() {
    56  	balancer.Register(bb{})
    57  }
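
        // Note: because registration happens in init(), importing this package
        // (typically transitively via google.golang.org/grpc/xds) is sufficient
        // for a parent policy to look up this builder with balancer.Get(Name).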
    58  
    59  type bb struct{}
    60  
    61  func (bb) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer {
    62  	b := &clusterImplBalancer{
    63  		ClientConn:      cc,
    64  		bOpts:           bOpts,
    65  		closed:          grpcsync.NewEvent(),
    66  		done:            grpcsync.NewEvent(),
    67  		loadWrapper:     loadstore.NewWrapper(),
    68  		pickerUpdateCh:  buffer.NewUnbounded(),
    69  		requestCountMax: defaultRequestCountMax,
    70  	}
    71  	b.logger = prefixLogger(b)
    72  	b.child = gracefulswitch.NewBalancer(b, bOpts)
    73  	go b.run()
    74  	b.logger.Infof("Created")
    75  	return b
    76  }
    77  
    78  func (bb) Name() string {
    79  	return Name
    80  }
    81  
    82  func (bb) ParseConfig(c json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
    83  	return parseConfig(c)
    84  }
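
        // For reference, a service config fragment for this policy looks roughly
        // like the sketch below. The JSON field names are assumptions based on the
        // exported LBConfig fields (the authoritative JSON tags live in config.go
        // in this package), and all values are purely illustrative:
        //
        //	{
        //	  "cluster": "cluster_a",
        //	  "edsServiceName": "service_a",
        //	  "maxConcurrentRequests": 512,
        //	  "dropCategories": [
        //	    {"category": "throttle", "requestsPerMillion": 1000}
        //	  ],
        //	  "childPolicy": [{"round_robin": {}}]
        //	}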
    85  
    86  type clusterImplBalancer struct {
    87  	balancer.ClientConn
    88  
     89  	// mu guarantees mutual exclusion between Close() and the handling of
     90  	// picker updates sent to the parent ClientConn in run(). It ensures that
     91  	// the run() goroutine does not send a picker update to the parent after
     92  	// the balancer has been closed.
     93  	//
     94  	// mu is only used by the run() goroutine and not by the other exported
     95  	// methods, because the exported methods are already guaranteed to be
     96  	// synchronized with Close().
    97  	mu     sync.Mutex
    98  	closed *grpcsync.Event
    99  	done   *grpcsync.Event
   100  
   101  	bOpts     balancer.BuildOptions
   102  	logger    *grpclog.PrefixLogger
   103  	xdsClient xdsclient.XDSClient
   104  
   105  	config           *LBConfig
   106  	child            *gracefulswitch.Balancer
   107  	cancelLoadReport func()
   108  	edsServiceName   string
   109  	lrsServer        *bootstrap.ServerConfig
   110  	loadWrapper      *loadstore.Wrapper
   111  
   112  	clusterNameMu sync.Mutex
   113  	clusterName   string
   114  
    115  	// childState/drops/requestCounter keep the state used by the most recently
    116  	// generated picker. These fields are only accessed in run(), which is the
    117  	// only goroutine that sends pickers to the parent ClientConn. All requests
    118  	// to update the picker must be sent on pickerUpdateCh.
   119  	childState            balancer.State
   120  	dropCategories        []DropConfig // The categories for drops.
   121  	drops                 []*dropper
   122  	requestCounterCluster string // The cluster name for the request counter.
   123  	requestCounterService string // The service name for the request counter.
   124  	requestCounter        *xdsclient.ClusterRequestsCounter
   125  	requestCountMax       uint32
   126  	pickerUpdateCh        *buffer.Unbounded
   127  }
   128  
    129  // updateLoadStore checks the load reporting configuration in newConfig and
    130  // decides whether the load reporting stream needs to be restarted.
   131  func (b *clusterImplBalancer) updateLoadStore(newConfig *LBConfig) error {
   132  	var updateLoadClusterAndService bool
   133  
    134  	// The name reported for loads is derived from the cluster name and the
    135  	// EDS service name; if either changes, the load wrapper must be updated.
   136  	clusterName := b.getClusterName()
   137  	if clusterName != newConfig.Cluster {
   138  		updateLoadClusterAndService = true
   139  		b.setClusterName(newConfig.Cluster)
   140  		clusterName = newConfig.Cluster
   141  	}
   142  	if b.edsServiceName != newConfig.EDSServiceName {
   143  		updateLoadClusterAndService = true
   144  		b.edsServiceName = newConfig.EDSServiceName
   145  	}
   146  	if updateLoadClusterAndService {
    147  		// This updates the clusterName and serviceName that will be reported
    148  		// for the loads. The update happens slightly too early: ideally it
    149  		// would coincide with the picker starting to use the new connections,
    150  		// but this balancer has no way to tell when that happens.
    151  		//
    152  		// In practice this should almost never happen, because a cluster_impl
    153  		// policy is not expected to see its cluster or service name change; the
    154  		// parent is expected to do a graceful switch instead.
   155  		b.loadWrapper.UpdateClusterAndService(clusterName, b.edsServiceName)
   156  	}
   157  
   158  	var (
   159  		stopOldLoadReport  bool
   160  		startNewLoadReport bool
   161  	)
   162  
   163  	// Check if it's necessary to restart load report.
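        	// The old/new LoadReportingServer comparison below reduces to the
        	// following cases:
        	//
        	//	old == nil, new == nil  -> nothing to do
        	//	old == nil, new != nil  -> start a new stream
        	//	old != nil, new == nil  -> stop the old stream
        	//	old != nil, new != nil  -> restart the stream only if the configs differ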
   164  	if b.lrsServer == nil {
   165  		if newConfig.LoadReportingServer != nil {
   166  			// Old is nil, new is not nil, start new LRS.
   167  			b.lrsServer = newConfig.LoadReportingServer
   168  			startNewLoadReport = true
   169  		}
   170  		// Old is nil, new is nil, do nothing.
   171  	} else if newConfig.LoadReportingServer == nil {
   172  		// Old is not nil, new is nil, stop old, don't start new.
   173  		b.lrsServer = newConfig.LoadReportingServer
   174  		stopOldLoadReport = true
   175  	} else {
    176  		// Old is not nil, new is not nil: compare the two server configs, and
    177  		// if they differ, stop the old stream and start a new one.
   178  		if !b.lrsServer.Equal(newConfig.LoadReportingServer) {
   179  			b.lrsServer = newConfig.LoadReportingServer
   180  			stopOldLoadReport = true
   181  			startNewLoadReport = true
   182  		}
   183  	}
   184  
   185  	if stopOldLoadReport {
   186  		if b.cancelLoadReport != nil {
   187  			b.cancelLoadReport()
   188  			b.cancelLoadReport = nil
   189  			if !startNewLoadReport {
   190  				// If a new LRS stream will be started later, no need to update
   191  				// it to nil here.
   192  				b.loadWrapper.UpdateLoadStore(nil)
   193  			}
   194  		}
   195  	}
   196  	if startNewLoadReport {
   197  		var loadStore *load.Store
   198  		if b.xdsClient != nil {
   199  			loadStore, b.cancelLoadReport = b.xdsClient.ReportLoad(b.lrsServer)
   200  		}
   201  		b.loadWrapper.UpdateLoadStore(loadStore)
   202  	}
   203  
   204  	return nil
   205  }
   206  
   207  func (b *clusterImplBalancer) UpdateClientConnState(s balancer.ClientConnState) error {
   208  	if b.closed.HasFired() {
   209  		b.logger.Warningf("xds: received ClientConnState {%+v} after clusterImplBalancer was closed", s)
   210  		return nil
   211  	}
   212  
   213  	b.logger.Infof("Received update from resolver, balancer config: %+v", pretty.ToJSON(s.BalancerConfig))
   214  	newConfig, ok := s.BalancerConfig.(*LBConfig)
   215  	if !ok {
   216  		return fmt.Errorf("unexpected balancer config with type: %T", s.BalancerConfig)
   217  	}
   218  
   219  	// Need to check for potential errors at the beginning of this function, so
   220  	// that on errors, we reject the whole config, instead of applying part of
   221  	// it.
   222  	bb := balancer.Get(newConfig.ChildPolicy.Name)
   223  	if bb == nil {
   224  		return fmt.Errorf("balancer %q not registered", newConfig.ChildPolicy.Name)
   225  	}
   226  
   227  	if b.xdsClient == nil {
   228  		c := xdsclient.FromResolverState(s.ResolverState)
   229  		if c == nil {
   230  			return balancer.ErrBadResolverState
   231  		}
   232  		b.xdsClient = c
   233  	}
   234  
    235  	// Update the load reporting config. This needs to be done before updating
    236  	// the child policy because the new loadStore must be plumbed into the
    237  	// load wrapper, so that the next picker generated by the child policy
    238  	// picks up the new loadStore.
   239  	if err := b.updateLoadStore(newConfig); err != nil {
   240  		return err
   241  	}
   242  
   243  	if b.config == nil || b.config.ChildPolicy.Name != newConfig.ChildPolicy.Name {
   244  		if err := b.child.SwitchTo(bb); err != nil {
   245  			return fmt.Errorf("error switching to child of type %q: %v", newConfig.ChildPolicy.Name, err)
   246  		}
   247  	}
   248  	b.config = newConfig
   249  
    250  	// Notify run() of this new config, in case the drops and request counter
    251  	// need to be updated (which means a new picker needs to be generated).
   252  	b.pickerUpdateCh.Put(newConfig)
   253  
   254  	// Addresses and sub-balancer config are sent to sub-balancer.
   255  	return b.child.UpdateClientConnState(balancer.ClientConnState{
   256  		ResolverState:  s.ResolverState,
   257  		BalancerConfig: b.config.ChildPolicy.Config,
   258  	})
   259  }
   260  
   261  func (b *clusterImplBalancer) ResolverError(err error) {
   262  	if b.closed.HasFired() {
   263  		b.logger.Warningf("xds: received resolver error {%+v} after clusterImplBalancer was closed", err)
   264  		return
   265  	}
   266  	b.child.ResolverError(err)
   267  }
   268  
   269  func (b *clusterImplBalancer) updateSubConnState(sc balancer.SubConn, s balancer.SubConnState, cb func(balancer.SubConnState)) {
   270  	if b.closed.HasFired() {
   271  		b.logger.Warningf("xds: received subconn state change {%+v, %+v} after clusterImplBalancer was closed", sc, s)
   272  		return
   273  	}
   274  
    275  	// Trigger re-resolution when a SubConn enters TransientFailure. This is
    276  	// necessary for the LogicalDNS discovery mechanism in the cluster_resolver
    277  	// policy to re-resolve.
    278  	//
    279  	// Note that this happens not only for addresses from DNS, but also for those
    280  	// from EDS (cluster_impl can't tell which; only the parent knows). The parent
    281  	// priority policy is configured to ignore re-resolution signals from its EDS children.
   282  	if s.ConnectivityState == connectivity.TransientFailure {
   283  		b.ClientConn.ResolveNow(resolver.ResolveNowOptions{})
   284  	}
   285  
   286  	if cb != nil {
   287  		cb(s)
   288  	}
   289  }
   290  
   291  func (b *clusterImplBalancer) UpdateSubConnState(sc balancer.SubConn, s balancer.SubConnState) {
   292  	b.logger.Errorf("UpdateSubConnState(%v, %+v) called unexpectedly", sc, s)
   293  }
   294  
   295  func (b *clusterImplBalancer) Close() {
   296  	b.mu.Lock()
   297  	b.closed.Fire()
   298  	b.mu.Unlock()
   299  
   300  	b.child.Close()
   301  	b.childState = balancer.State{}
   302  	b.pickerUpdateCh.Close()
   303  	<-b.done.Done()
   304  	b.logger.Infof("Shutdown")
   305  }
   306  
   307  func (b *clusterImplBalancer) ExitIdle() {
   308  	b.child.ExitIdle()
   309  }
   310  
   311  // Override methods to accept updates from the child LB.
   312  
   313  func (b *clusterImplBalancer) UpdateState(state balancer.State) {
   314  	// Instead of updating parent ClientConn inline, send state to run().
   315  	b.pickerUpdateCh.Put(state)
   316  }
   317  
   318  func (b *clusterImplBalancer) setClusterName(n string) {
   319  	b.clusterNameMu.Lock()
   320  	defer b.clusterNameMu.Unlock()
   321  	b.clusterName = n
   322  }
   323  
   324  func (b *clusterImplBalancer) getClusterName() string {
   325  	b.clusterNameMu.Lock()
   326  	defer b.clusterNameMu.Unlock()
   327  	return b.clusterName
   328  }
   329  
   330  // scWrapper is a wrapper of SubConn with locality ID. The locality ID can be
   331  // retrieved from the addresses when creating SubConn.
   332  //
   333  // All SubConns passed to the child policies are wrapped in this, so that the
   334  // picker can get the localityID from the picked SubConn, and do load reporting.
   335  //
    336  // After wrapping, all SubConns passed to or received from the parent
    337  // ClientConn (e.g. SubConn state updates, address updates) must be the
    338  // original SubConns. All SubConns passed to or received from the child policy
    339  // (NewSubConn, forwarded SubConn state updates) must be the wrapper; the
    340  // wrapper embeds the original SubConn so that it can be unwrapped when needed.
   341  type scWrapper struct {
   342  	balancer.SubConn
   343  	// locality needs to be atomic because it can be updated while being read by
   344  	// the picker.
   345  	locality atomic.Value // type xdsinternal.LocalityID
   346  }
   347  
   348  func (scw *scWrapper) updateLocalityID(lID xdsinternal.LocalityID) {
   349  	scw.locality.Store(lID)
   350  }
   351  
   352  func (scw *scWrapper) localityID() xdsinternal.LocalityID {
   353  	lID, _ := scw.locality.Load().(xdsinternal.LocalityID)
   354  	return lID
   355  }
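
        // For illustration, a picker that gets an *scWrapper back from a pick can
        // recover the locality roughly as in the minimal sketch below (pickedSubConn
        // is a hypothetical variable, not part of this file):
        //
        //	if scw, ok := pickedSubConn.(*scWrapper); ok {
        //		locality := scw.localityID() // attribute loads to this locality
        //		_ = locality
        //	}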
   356  
   357  func (b *clusterImplBalancer) NewSubConn(addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
   358  	clusterName := b.getClusterName()
   359  	newAddrs := make([]resolver.Address, len(addrs))
   360  	var lID xdsinternal.LocalityID
   361  	for i, addr := range addrs {
   362  		newAddrs[i] = internal.SetXDSHandshakeClusterName(addr, clusterName)
   363  		lID = xdsinternal.GetLocalityID(newAddrs[i])
   364  	}
   365  	var sc balancer.SubConn
   366  	oldListener := opts.StateListener
   367  	opts.StateListener = func(state balancer.SubConnState) { b.updateSubConnState(sc, state, oldListener) }
   368  	sc, err := b.ClientConn.NewSubConn(newAddrs, opts)
   369  	if err != nil {
   370  		return nil, err
   371  	}
   372  	// Wrap this SubConn in a wrapper, and add it to the map.
   373  	ret := &scWrapper{SubConn: sc}
   374  	ret.updateLocalityID(lID)
   375  	return ret, nil
   376  }
   377  
   378  func (b *clusterImplBalancer) RemoveSubConn(sc balancer.SubConn) {
   379  	b.logger.Errorf("RemoveSubConn(%v) called unexpectedly", sc)
   380  }
   381  
   382  func (b *clusterImplBalancer) UpdateAddresses(sc balancer.SubConn, addrs []resolver.Address) {
   383  	clusterName := b.getClusterName()
   384  	newAddrs := make([]resolver.Address, len(addrs))
   385  	var lID xdsinternal.LocalityID
   386  	for i, addr := range addrs {
   387  		newAddrs[i] = internal.SetXDSHandshakeClusterName(addr, clusterName)
   388  		lID = xdsinternal.GetLocalityID(newAddrs[i])
   389  	}
   390  	if scw, ok := sc.(*scWrapper); ok {
   391  		scw.updateLocalityID(lID)
   392  		// Need to get the original SubConn from the wrapper before calling
   393  		// parent ClientConn.
   394  		sc = scw.SubConn
   395  	}
   396  	b.ClientConn.UpdateAddresses(sc, newAddrs)
   397  }
   398  
   399  type dropConfigs struct {
   400  	drops           []*dropper
   401  	requestCounter  *xdsclient.ClusterRequestsCounter
   402  	requestCountMax uint32
   403  }
   404  
    405  // handleDropAndRequestCount compares the drop and request counter config in
    406  // newConfig with the config currently used by the picker. It returns a new
    407  // dropConfigs if a new picker needs to be generated, and nil otherwise.
   408  func (b *clusterImplBalancer) handleDropAndRequestCount(newConfig *LBConfig) *dropConfigs {
    409  	// Compare the new drop config, and update the picker if it has changed.
   410  	var updatePicker bool
   411  	if !equalDropCategories(b.dropCategories, newConfig.DropCategories) {
   412  		b.dropCategories = newConfig.DropCategories
   413  		b.drops = make([]*dropper, 0, len(newConfig.DropCategories))
   414  		for _, c := range newConfig.DropCategories {
   415  			b.drops = append(b.drops, newDropper(c))
   416  		}
   417  		updatePicker = true
   418  	}
   419  
    420  	// Compare the cluster and EDS service name, and update the picker if either
    421  	// has changed, because circuit breaking's request counter is keyed by them.
   422  	if b.requestCounterCluster != newConfig.Cluster || b.requestCounterService != newConfig.EDSServiceName {
   423  		b.requestCounterCluster = newConfig.Cluster
   424  		b.requestCounterService = newConfig.EDSServiceName
   425  		b.requestCounter = xdsclient.GetClusterRequestsCounter(newConfig.Cluster, newConfig.EDSServiceName)
   426  		updatePicker = true
   427  	}
    428  	// Compare the upper bound on concurrent requests, and update the picker if
    429  	// it has changed. This is also used for circuit breaking.
    430  	var newRequestCountMax uint32 = defaultRequestCountMax
    431  	if newConfig.MaxConcurrentRequests != nil {
    432  		newRequestCountMax = *newConfig.MaxConcurrentRequests
    433  	}
   434  	if b.requestCountMax != newRequestCountMax {
   435  		b.requestCountMax = newRequestCountMax
   436  		updatePicker = true
   437  	}
   438  
   439  	if !updatePicker {
   440  		return nil
   441  	}
   442  	return &dropConfigs{
   443  		drops:           b.drops,
   444  		requestCounter:  b.requestCounter,
   445  		requestCountMax: b.requestCountMax,
   446  	}
   447  }
   448  
   449  func (b *clusterImplBalancer) run() {
   450  	defer b.done.Fire()
   451  	for {
   452  		select {
   453  		case update, ok := <-b.pickerUpdateCh.Get():
   454  			if !ok {
   455  				return
   456  			}
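        			// Tell the buffer that this item has been consumed so that the
        			// next buffered item, if any, is made available on the channel
        			// returned by Get().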
   457  			b.pickerUpdateCh.Load()
   458  			b.mu.Lock()
   459  			if b.closed.HasFired() {
   460  				b.mu.Unlock()
   461  				return
   462  			}
   463  			switch u := update.(type) {
   464  			case balancer.State:
   465  				b.childState = u
   466  				b.ClientConn.UpdateState(balancer.State{
   467  					ConnectivityState: b.childState.ConnectivityState,
   468  					Picker: newPicker(b.childState, &dropConfigs{
   469  						drops:           b.drops,
   470  						requestCounter:  b.requestCounter,
   471  						requestCountMax: b.requestCountMax,
   472  					}, b.loadWrapper),
   473  				})
   474  			case *LBConfig:
   475  				dc := b.handleDropAndRequestCount(u)
   476  				if dc != nil && b.childState.Picker != nil {
   477  					b.ClientConn.UpdateState(balancer.State{
   478  						ConnectivityState: b.childState.ConnectivityState,
   479  						Picker:            newPicker(b.childState, dc, b.loadWrapper),
   480  					})
   481  				}
   482  			}
   483  			b.mu.Unlock()
   484  		case <-b.closed.Done():
   485  			if b.cancelLoadReport != nil {
   486  				b.cancelLoadReport()
   487  				b.cancelLoadReport = nil
   488  			}
   489  			return
   490  		}
   491  	}
   492  }