google.golang.org/grpc@v1.72.2/xds/internal/balancer/clusterimpl/clusterimpl.go (about)

/*
 *
 * Copyright 2020 gRPC authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

// Package clusterimpl implements the xds_cluster_impl balancing policy. It
// handles cluster-level features (e.g. circuit breaking and RPC dropping).
//
// Note that it does not handle name resolution, which is done by the
// xds_cluster_resolver policy.
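//
// As a rough illustration (the field names follow this package's LBConfig
// type; the values and the newUint32 helper are made up), a parsed
// configuration for this policy might carry:
//
//	LBConfig{
//		Cluster:               "cluster_a",
//		EDSServiceName:        "service_a",
//		MaxConcurrentRequests: newUint32(512),
//		DropCategories:        []DropConfig{{Category: "throttle", RequestsPerMillion: 100000}},
//	}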
package clusterimpl

import (
	"encoding/json"
	"fmt"
	"sync"
	"sync/atomic"

	"google.golang.org/grpc/balancer"
	"google.golang.org/grpc/connectivity"
	"google.golang.org/grpc/internal"
	"google.golang.org/grpc/internal/balancer/gracefulswitch"
	"google.golang.org/grpc/internal/grpclog"
	"google.golang.org/grpc/internal/pretty"
	"google.golang.org/grpc/internal/xds"
	"google.golang.org/grpc/internal/xds/bootstrap"
	"google.golang.org/grpc/resolver"
	"google.golang.org/grpc/serviceconfig"
	xdsinternal "google.golang.org/grpc/xds/internal"
	"google.golang.org/grpc/xds/internal/balancer/loadstore"
	"google.golang.org/grpc/xds/internal/xdsclient"
	"google.golang.org/grpc/xds/internal/xdsclient/load"
)

const (
	// Name is the name of the cluster_impl balancer.
	Name                   = "xds_cluster_impl_experimental"
	defaultRequestCountMax = 1024
)

var (
	connectedAddress = internal.ConnectedAddress.(func(balancer.SubConnState) resolver.Address)
	// The functions below are no-ops in production code, but can be
	// overridden in tests to give them visibility into exactly when certain
	// events happen.
	clientConnUpdateHook = func() {}
	pickerUpdateHook     = func() {}
)
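
// As a sketch of how a test might use the hooks above (the channel and the
// surrounding wiring are illustrative, not part of this package):
//
//	updateProcessed := make(chan struct{}, 1)
//	clientConnUpdateHook = func() { updateProcessed <- struct{}{} }
//	// Push a ClientConn update to the balancer, then block until the
//	// balancer has fully processed it:
//	<-updateProcessed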

func init() {
	balancer.Register(bb{})
}

type bb struct{}

func (bb) Build(cc balancer.ClientConn, bOpts balancer.BuildOptions) balancer.Balancer {
	b := &clusterImplBalancer{
		ClientConn:      cc,
		loadWrapper:     loadstore.NewWrapper(),
		requestCountMax: defaultRequestCountMax,
	}
	b.logger = prefixLogger(b)
	b.child = gracefulswitch.NewBalancer(b, bOpts)
	b.logger.Infof("Created")
	return b
}

func (bb) Name() string {
	return Name
}

func (bb) ParseConfig(c json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
	return parseConfig(c)
}

type clusterImplBalancer struct {
	balancer.ClientConn

	// The following fields are set at creation time, and are read-only after
	// that, and therefore need not be protected by a mutex.
	logger      *grpclog.PrefixLogger
	loadWrapper *loadstore.Wrapper

	// The following fields are only accessed from balancer API methods, which
	// are guaranteed to be called serially by gRPC.
	xdsClient        xdsclient.XDSClient     // Sent down in ResolverState attributes.
	cancelLoadReport func()                  // To stop reporting load through the above xDS client.
	edsServiceName   string                  // EDS service name to report load for.
	lrsServer        *bootstrap.ServerConfig // Load reporting server configuration.
	dropCategories   []DropConfig            // The categories for drops.
	child            *gracefulswitch.Balancer

	// The following fields are protected by mu, since they are accessed in
	// balancer API methods and in methods called from the child policy.
	mu                    sync.Mutex
	clusterName           string                            // The cluster name for credentials handshaking.
	inhibitPickerUpdates  bool                              // Inhibits state updates from child policy when processing an update from the parent.
	pendingPickerUpdates  bool                              // True if a picker update from the child policy was inhibited when processing an update from the parent.
	childState            balancer.State                    // Most recent state update from the child policy.
	drops                 []*dropper                        // Drops implementation.
	requestCounterCluster string                            // The cluster name for the request counter, from LB config.
	requestCounterService string                            // The service name for the request counter, from LB config.
	requestCountMax       uint32                            // Max concurrent requests, from LB config.
	requestCounter        *xdsclient.ClusterRequestsCounter // Tracks total inflight requests for a given service.
	telemetryLabels       map[string]string                 // Telemetry labels to set on picks, from LB config.
}

// handleDropAndRequestCountLocked compares the drop and request-counter
// configuration in newConfig with the one currently used by the picker, and
// returns true if a new picker needs to be generated. It must only be called
// with b.mu held.
func (b *clusterImplBalancer) handleDropAndRequestCountLocked(newConfig *LBConfig) bool {
	var updatePicker bool
	if !equalDropCategories(b.dropCategories, newConfig.DropCategories) {
		b.dropCategories = newConfig.DropCategories
		b.drops = make([]*dropper, 0, len(newConfig.DropCategories))
		for _, c := range newConfig.DropCategories {
			b.drops = append(b.drops, newDropper(c))
		}
		updatePicker = true
	}

	if b.requestCounterCluster != newConfig.Cluster || b.requestCounterService != newConfig.EDSServiceName {
		b.requestCounterCluster = newConfig.Cluster
		b.requestCounterService = newConfig.EDSServiceName
		b.requestCounter = xdsclient.GetClusterRequestsCounter(newConfig.Cluster, newConfig.EDSServiceName)
		updatePicker = true
	}
	var newRequestCountMax uint32 = defaultRequestCountMax
	if newConfig.MaxConcurrentRequests != nil {
		newRequestCountMax = *newConfig.MaxConcurrentRequests
	}
	if b.requestCountMax != newRequestCountMax {
		b.requestCountMax = newRequestCountMax
		updatePicker = true
	}

	return updatePicker
}
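
// As an illustration of the drop configuration (the numbers are made up): a
// DropConfig{Category: "throttle", RequestsPerMillion: 100000} entry installs
// a dropper that rejects roughly 10% of RPCs in the "throttle" category
// before they ever reach the child policy's picker.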

// newPickerLocked creates a picker from the balancer's current state. It must
// only be called with b.mu held.
func (b *clusterImplBalancer) newPickerLocked() *picker {
	return &picker{
		drops:           b.drops,
		s:               b.childState,
		loadStore:       b.loadWrapper,
		counter:         b.requestCounter,
		countMax:        b.requestCountMax,
		telemetryLabels: b.telemetryLabels,
	}
}

// updateLoadStore checks the new config's load reporting settings and decides
// whether the load reporting stream needs to be restarted.
func (b *clusterImplBalancer) updateLoadStore(newConfig *LBConfig) error {
	var updateLoadClusterAndService bool

	// Restart load reporting if the reporting name changes. The name used for
	// load reports is built from the cluster name and the EDS service name.
	clusterName := b.getClusterName()
	if clusterName != newConfig.Cluster {
		updateLoadClusterAndService = true
		b.setClusterName(newConfig.Cluster)
		clusterName = newConfig.Cluster
	}
	if b.edsServiceName != newConfig.EDSServiceName {
		updateLoadClusterAndService = true
		b.edsServiceName = newConfig.EDSServiceName
	}
	if updateLoadClusterAndService {
		// This updates the clusterName and serviceName that will be reported
		// for the loads. The update here is slightly too early: the ideal
		// timing would be when the picker is updated with the new connection,
		// but that moment is impossible to observe from this balancer.
		//
		// On the other hand, this will almost never happen. An LRS policy
		// shouldn't see its config change in this way, because the parent
		// does a graceful switch when the clusterName or serviceName changes.
		b.loadWrapper.UpdateClusterAndService(clusterName, b.edsServiceName)
	}

	var (
		stopOldLoadReport  bool
		startNewLoadReport bool
	)

	// Check if it's necessary to restart the load reporting stream.
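	//
	// In tabular form (old = b.lrsServer before this update, new =
	// newConfig.LoadReportingServer):
	//
	//	old   new   action
	//	nil   nil   nothing
	//	nil   set   start a new stream
	//	set   nil   stop the old stream
	//	set   set   stop and restart only if the two configs differ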
	if b.lrsServer == nil {
		if newConfig.LoadReportingServer != nil {
			// Old is nil, new is not nil: start a new LRS stream.
			b.lrsServer = newConfig.LoadReportingServer
			startNewLoadReport = true
		}
		// Old is nil, new is nil: do nothing.
	} else if newConfig.LoadReportingServer == nil {
		// Old is not nil, new is nil: stop the old stream and don't start a
		// new one.
		b.lrsServer = newConfig.LoadReportingServer
		stopOldLoadReport = true
	} else {
		// Old is not nil, new is not nil: compare the two configs, and if
		// they are different, stop the old stream and start a new one.
		if !b.lrsServer.Equal(newConfig.LoadReportingServer) {
			b.lrsServer = newConfig.LoadReportingServer
			stopOldLoadReport = true
			startNewLoadReport = true
		}
	}

	if stopOldLoadReport {
		if b.cancelLoadReport != nil {
			b.cancelLoadReport()
			b.cancelLoadReport = nil
			if !startNewLoadReport {
				// If a new LRS stream will be started later, there is no need
				// to clear the load store here.
				b.loadWrapper.UpdateLoadStore(nil)
			}
		}
	}
	if startNewLoadReport {
		var loadStore *load.Store
		if b.xdsClient != nil {
			loadStore, b.cancelLoadReport = b.xdsClient.ReportLoad(b.lrsServer)
		}
		b.loadWrapper.UpdateLoadStore(loadStore)
	}

	return nil
}

func (b *clusterImplBalancer) UpdateClientConnState(s balancer.ClientConnState) error {
	defer clientConnUpdateHook()

	b.mu.Lock()
	b.inhibitPickerUpdates = true
	b.mu.Unlock()
	if b.logger.V(2) {
		b.logger.Infof("Received configuration: %s", pretty.ToJSON(s.BalancerConfig))
	}
	newConfig, ok := s.BalancerConfig.(*LBConfig)
	if !ok {
		return fmt.Errorf("unexpected balancer config with type: %T", s.BalancerConfig)
	}

	// Need to check for potential errors at the beginning of this function, so
	// that on errors, we reject the whole config, instead of applying part of
	// it.
	bb := balancer.Get(newConfig.ChildPolicy.Name)
	if bb == nil {
		return fmt.Errorf("child policy %q not registered", newConfig.ChildPolicy.Name)
	}

	if b.xdsClient == nil {
		c := xdsclient.FromResolverState(s.ResolverState)
		if c == nil {
			return balancer.ErrBadResolverState
		}
		b.xdsClient = c
	}

	// Update load reporting config. This needs to be done before updating the
	// child policy because we need the loadStore from the updated client to be
	// passed to the ccWrapper, so that the next picker from the child policy
	// will pick up the new loadStore.
	if err := b.updateLoadStore(newConfig); err != nil {
		return err
	}

	// Build config for the gracefulswitch balancer. It is safe to ignore JSON
	// marshaling errors here, since the config was already validated as part of
	// ParseConfig().
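	//
	// For example, a child policy named "round_robin" with a nil config
	// produces the JSON `[{"round_robin":null}]` here (illustrative; the
	// actual child policy and config come from the parent's configuration).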
	cfg := []map[string]any{{newConfig.ChildPolicy.Name: newConfig.ChildPolicy.Config}}
	cfgJSON, _ := json.Marshal(cfg)
	parsedCfg, err := gracefulswitch.ParseConfig(cfgJSON)
	if err != nil {
		return err
	}

	// Addresses and sub-balancer config are sent to the sub-balancer.
	err = b.child.UpdateClientConnState(balancer.ClientConnState{
		ResolverState:  s.ResolverState,
		BalancerConfig: parsedCfg,
	})

	b.mu.Lock()
	b.telemetryLabels = newConfig.TelemetryLabels
	// We want to send a picker update to the parent if one of the following
	// two conditions is met:
	// - the drop/request config has changed *and* there is already a picker
	//   from the child, or
	// - there is a pending picker update from the child (this covers the case
	//   where the drop/request config has not changed, but the child sent a
	//   picker update while we were still processing config from our parent).
	if (b.handleDropAndRequestCountLocked(newConfig) && b.childState.Picker != nil) || b.pendingPickerUpdates {
		b.pendingPickerUpdates = false
		b.ClientConn.UpdateState(balancer.State{
			ConnectivityState: b.childState.ConnectivityState,
			Picker:            b.newPickerLocked(),
		})
	}
	b.inhibitPickerUpdates = false
	b.mu.Unlock()
	pickerUpdateHook()
	return err
}
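
// A typical inhibited-update sequence looks like this (illustrative):
//
//	parent calls UpdateClientConnState -> inhibitPickerUpdates = true
//	child calls UpdateState            -> pendingPickerUpdates = true (no send)
//	config processing finishes         -> a single combined picker update is
//	                                      sent to the parent ClientConn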

func (b *clusterImplBalancer) ResolverError(err error) {
	b.child.ResolverError(err)
}

func (b *clusterImplBalancer) updateSubConnState(_ balancer.SubConn, s balancer.SubConnState, cb func(balancer.SubConnState)) {
	// Trigger re-resolution when a SubConn transitions to TransientFailure.
	// This is necessary for the LogicalDNS mechanism in the cluster_resolver
	// policy to re-resolve.
	//
	// Note that this happens not only for addresses from DNS, but also for
	// those from EDS (cluster_impl doesn't know whether it's DNS or EDS; only
	// the parent knows). The parent priority policy is configured to ignore
	// re-resolution signals from its EDS children.
	if s.ConnectivityState == connectivity.TransientFailure {
		b.ClientConn.ResolveNow(resolver.ResolveNowOptions{})
	}

	if cb != nil {
		cb(s)
	}
}

func (b *clusterImplBalancer) UpdateSubConnState(sc balancer.SubConn, s balancer.SubConnState) {
	b.logger.Errorf("UpdateSubConnState(%v, %+v) called unexpectedly", sc, s)
}

func (b *clusterImplBalancer) Close() {
	b.child.Close()
	b.childState = balancer.State{}

	if b.cancelLoadReport != nil {
		b.cancelLoadReport()
		b.cancelLoadReport = nil
	}
	b.logger.Infof("Shutdown")
}

func (b *clusterImplBalancer) ExitIdle() {
	b.child.ExitIdle()
}

// Override methods to accept updates from the child LB.

func (b *clusterImplBalancer) UpdateState(state balancer.State) {
	b.mu.Lock()
	defer b.mu.Unlock()

	// Inhibit sending a picker update to our parent as part of handling new
	// state from the child, if we are currently handling an update from our
	// parent. Update the childState field regardless.
	b.childState = state
	if b.inhibitPickerUpdates {
		b.pendingPickerUpdates = true
		if b.logger.V(2) {
			b.logger.Infof("Received a picker update from the child when processing an update from the parent")
		}
		return
	}

	b.ClientConn.UpdateState(balancer.State{
		ConnectivityState: state.ConnectivityState,
		Picker:            b.newPickerLocked(),
	})
	pickerUpdateHook()
}

func (b *clusterImplBalancer) setClusterName(n string) {
	b.mu.Lock()
	defer b.mu.Unlock()
	b.clusterName = n
}

func (b *clusterImplBalancer) getClusterName() string {
	b.mu.Lock()
	defer b.mu.Unlock()
	return b.clusterName
}

// scWrapper is a wrapper of SubConn with locality ID. The locality ID can be
// retrieved from the addresses when creating the SubConn.
//
// All SubConns passed to the child policies are wrapped in this, so that the
// picker can get the localityID from the picked SubConn, and do load reporting.
//
// After wrapping, all SubConns to and from the parent ClientConn (e.g. for
// SubConn state updates) must be the original SubConns, and all SubConns to
// and from the child policy (NewSubConn, forwarding SubConn state updates)
// must be the wrapper. The StateListener closure set up in NewSubConn
// captures the wrapper, so no explicit mapping between the two is needed.
type scWrapper struct {
	balancer.SubConn
	// locality needs to be atomic because it can be updated while being read
	// by the picker.
	locality atomic.Value // type xdsinternal.LocalityID
}

func (scw *scWrapper) updateLocalityID(lID xdsinternal.LocalityID) {
	scw.locality.Store(lID)
}

func (scw *scWrapper) localityID() xdsinternal.LocalityID {
	lID, _ := scw.locality.Load().(xdsinternal.LocalityID)
	return lID
}

func (b *clusterImplBalancer) NewSubConn(addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
	clusterName := b.getClusterName()
	newAddrs := make([]resolver.Address, len(addrs))
	for i, addr := range addrs {
		newAddrs[i] = xds.SetXDSHandshakeClusterName(addr, clusterName)
	}
	var sc balancer.SubConn
	scw := &scWrapper{}
	oldListener := opts.StateListener
	opts.StateListener = func(state balancer.SubConnState) {
		b.updateSubConnState(sc, state, oldListener)
		if state.ConnectivityState != connectivity.Ready {
			return
		}
		// Read the connected address and call updateLocalityID() based on
		// that address's locality. See
		// https://github.com/grpc/grpc-go/issues/7339 for context.
		addr := connectedAddress(state)
		lID := xdsinternal.GetLocalityID(addr)
		if lID.Empty() {
			if b.logger.V(2) {
				b.logger.Infof("Locality ID for %s unexpectedly empty", addr)
			}
			return
		}
		scw.updateLocalityID(lID)
	}
	sc, err := b.ClientConn.NewSubConn(newAddrs, opts)
	if err != nil {
		return nil, err
	}
	scw.SubConn = sc
	return scw, nil
}

func (b *clusterImplBalancer) RemoveSubConn(sc balancer.SubConn) {
	b.logger.Errorf("RemoveSubConn(%v) called unexpectedly", sc)
}

func (b *clusterImplBalancer) UpdateAddresses(sc balancer.SubConn, addrs []resolver.Address) {
	clusterName := b.getClusterName()
	newAddrs := make([]resolver.Address, len(addrs))
	var lID xdsinternal.LocalityID
	for i, addr := range addrs {
		newAddrs[i] = xds.SetXDSHandshakeClusterName(addr, clusterName)
		// Note: this keeps the locality ID of the last address in the list;
		// all addresses for a SubConn are expected to share a locality.
		lID = xdsinternal.GetLocalityID(newAddrs[i])
	}
	if scw, ok := sc.(*scWrapper); ok {
		scw.updateLocalityID(lID)
		// Need to get the original SubConn from the wrapper before calling
		// the parent ClientConn.
		sc = scw.SubConn
	}
	b.ClientConn.UpdateAddresses(sc, newAddrs)
}