google.golang.org/grpc@v1.72.2/xds/internal/balancer/ringhash/ringhash.go (about)

     1  /*
     2   *
     3   * Copyright 2021 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   *
    17   */
    18  
    19  // Package ringhash implements the ringhash balancer.
    20  package ringhash
    21  
    22  import (
    23  	"encoding/json"
    24  	"errors"
    25  	"fmt"
    26  	"math/rand/v2"
    27  	"sort"
    28  	"sync"
    29  
    30  	"google.golang.org/grpc/balancer"
    31  	"google.golang.org/grpc/balancer/base"
    32  	"google.golang.org/grpc/balancer/endpointsharding"
    33  	"google.golang.org/grpc/balancer/lazy"
    34  	"google.golang.org/grpc/balancer/pickfirst/pickfirstleaf"
    35  	"google.golang.org/grpc/connectivity"
    36  	"google.golang.org/grpc/internal/balancer/weight"
    37  	"google.golang.org/grpc/internal/grpclog"
    38  	"google.golang.org/grpc/internal/pretty"
    39  	"google.golang.org/grpc/resolver"
    40  	"google.golang.org/grpc/resolver/ringhash"
    41  	"google.golang.org/grpc/serviceconfig"
    42  )
    43  
// Name is the name of the ring_hash balancer. It is the value reported by the
// builder's Name method.
const Name = "ring_hash_experimental"
    46  
    47  func lazyPickFirstBuilder(cc balancer.ClientConn, opts balancer.BuildOptions) balancer.Balancer {
    48  	return lazy.NewBalancer(cc, opts, balancer.Get(pickfirstleaf.Name).Build)
    49  }
    50  
// init registers the ring_hash balancer builder with the balancer registry
// when this package is initialized.
func init() {
	balancer.Register(bb{})
}
    54  
// bb is the builder registered for the ring_hash balancer. It carries no
// state.
type bb struct{}
    56  
    57  func (bb) Build(cc balancer.ClientConn, opts balancer.BuildOptions) balancer.Balancer {
    58  	b := &ringhashBalancer{
    59  		ClientConn:     cc,
    60  		endpointStates: resolver.NewEndpointMap[*endpointState](),
    61  	}
    62  	esOpts := endpointsharding.Options{DisableAutoReconnect: true}
    63  	b.child = endpointsharding.NewBalancer(b, opts, lazyPickFirstBuilder, esOpts)
    64  	b.logger = prefixLogger(b)
    65  	b.logger.Infof("Created")
    66  	return b
    67  }
    68  
// Name returns the name of the ring_hash balancer, i.e. the Name constant.
func (bb) Name() string {
	return Name
}
    72  
// ParseConfig parses the JSON load balancing config c by delegating to
// parseConfig and returns its result unchanged.
func (bb) ParseConfig(c json.RawMessage) (serviceconfig.LoadBalancingConfig, error) {
	return parseConfig(c)
}
    76  
// ringhashBalancer is the balancer created by bb.Build. It delegates resolver
// updates to a child endpointsharding balancer and intercepts that child's
// state updates (through the embedded ClientConn) to track per-endpoint state,
// maintain the ring, and publish pickers to the parent ClientConn.
type ringhashBalancer struct {
	// The following fields are initialized at build time and read-only after
	// that and therefore do not need to be guarded by a mutex.

	// ClientConn is embedded to intercept UpdateState calls from the child
	// endpointsharding balancer.
	balancer.ClientConn
	logger *grpclog.PrefixLogger
	child  balancer.Balancer

	// mu guards the fields below. inhibitChildUpdates suppresses picker
	// updates to the parent while a resolver update is being processed, and
	// shouldRegenerateRing records that the ring must be rebuilt the next time
	// a picker is sent.
	mu                   sync.Mutex
	config               *LBConfig
	inhibitChildUpdates  bool
	shouldRegenerateRing bool
	endpointStates       *resolver.EndpointMap[*endpointState]

	// ring is always in sync with endpoints. When endpoints change, a new ring
	// is generated. Note that an update to an endpoint's weight also
	// regenerates the ring.
	ring *ring
}
    98  
    99  // hashKey returns the hash key to use for an endpoint. Per gRFC A61, each entry
   100  // in the ring is a hash of the endpoint's hash key concatenated with a
   101  // per-entry unique suffix.
   102  func hashKey(endpoint resolver.Endpoint) string {
   103  	if hk := ringhash.HashKey(endpoint); hk != "" {
   104  		return hk
   105  	}
   106  	// If no hash key is set, use the endpoint's first address as the hash key.
   107  	// This is the default behavior when no hash key is set.
   108  	return endpoint.Addresses[0].Addr
   109  }
   110  
   111  // UpdateState intercepts child balancer state updates. It updates the
   112  // per-endpoint state stored in the ring, and also the aggregated state based on
   113  // the child picker. It also reconciles the endpoint list. It sets
   114  // `b.shouldRegenerateRing` to true if the new endpoint list is different from
   115  // the previous, i.e. any of the following is true:
   116  // - an endpoint was added
   117  // - an endpoint was removed
   118  // - an endpoint's weight was updated
   119  // - the first addresses of the endpoint has changed
   120  func (b *ringhashBalancer) UpdateState(state balancer.State) {
   121  	b.mu.Lock()
   122  	defer b.mu.Unlock()
   123  	childStates := endpointsharding.ChildStatesFromPicker(state.Picker)
   124  	// endpointsSet is the set converted from endpoints, used for quick lookup.
   125  	endpointsSet := resolver.NewEndpointMap[bool]()
   126  
   127  	for _, childState := range childStates {
   128  		endpoint := childState.Endpoint
   129  		endpointsSet.Set(endpoint, true)
   130  		newWeight := getWeightAttribute(endpoint)
   131  		hk := hashKey(endpoint)
   132  		es, ok := b.endpointStates.Get(endpoint)
   133  		if !ok {
   134  			es := &endpointState{
   135  				balancer: childState.Balancer,
   136  				hashKey:  hk,
   137  				weight:   newWeight,
   138  				state:    childState.State,
   139  			}
   140  			b.endpointStates.Set(endpoint, es)
   141  			b.shouldRegenerateRing = true
   142  		} else {
   143  			// We have seen this endpoint before and created a `endpointState`
   144  			// object for it. If the weight or the hash key of the endpoint has
   145  			// changed, update the endpoint state map with the new weight or
   146  			// hash key. This will be used when a new ring is created.
   147  			if oldWeight := es.weight; oldWeight != newWeight {
   148  				b.shouldRegenerateRing = true
   149  				es.weight = newWeight
   150  			}
   151  			if es.hashKey != hk {
   152  				b.shouldRegenerateRing = true
   153  				es.hashKey = hk
   154  			}
   155  			es.state = childState.State
   156  		}
   157  	}
   158  
   159  	for _, endpoint := range b.endpointStates.Keys() {
   160  		if _, ok := endpointsSet.Get(endpoint); ok {
   161  			continue
   162  		}
   163  		// endpoint was removed by resolver.
   164  		b.endpointStates.Delete(endpoint)
   165  		b.shouldRegenerateRing = true
   166  	}
   167  
   168  	b.updatePickerLocked()
   169  }
   170  
// UpdateClientConnState handles an update from the resolver. It rejects a
// balancer config that is not an *LBConfig, forwards the resolver state (with
// health listeners enabled) to the child balancer, and then stores the new
// config, flagging the ring for regeneration when there was no previous config
// or the min/max ring size changed.
//
// Picker updates to the parent are inhibited while the child processes the
// update; the deferred function re-enables them and sends a single picker
// update when this method returns. If the child returns an error, that error
// is returned and the stored config is left unchanged.
func (b *ringhashBalancer) UpdateClientConnState(ccs balancer.ClientConnState) error {
	if b.logger.V(2) {
		b.logger.Infof("Received update from resolver, balancer config: %+v", pretty.ToJSON(ccs.BalancerConfig))
	}

	newConfig, ok := ccs.BalancerConfig.(*LBConfig)
	if !ok {
		return fmt.Errorf("unexpected balancer config with type: %T", ccs.BalancerConfig)
	}

	// Suppress picker updates triggered by the child while it processes the
	// resolver update. The lock is released before calling into the child
	// because the child calls back into UpdateState, which takes b.mu.
	b.mu.Lock()
	b.inhibitChildUpdates = true
	b.mu.Unlock()

	// Runs on every return path below, after b.config has been updated on the
	// success path.
	defer func() {
		b.mu.Lock()
		b.inhibitChildUpdates = false
		b.updatePickerLocked()
		b.mu.Unlock()
	}()

	if err := b.child.UpdateClientConnState(balancer.ClientConnState{
		// Make pickfirst children use health listeners for outlier detection
		// and health checking to work.
		ResolverState: pickfirstleaf.EnableHealthListener(ccs.ResolverState),
	}); err != nil {
		return err
	}

	b.mu.Lock()
	// Ring updates can happen due to the following:
	// 1. Addition or deletion of endpoints: The synchronous picker update from
	//    the child endpointsharding balancer would contain the list of updated
	//    endpoints.  Updates triggered by the child after handling the
	//    `UpdateClientConnState` call will not change the endpoint list.
	// 2. Change in the `LoadBalancerConfig`: Ring config such as max/min ring
	//    size.
	// To avoid extra ring updates, a boolean is used to track the need for a
	// ring update and the update is done only once at the end.
	//
	// If the ring configuration has changed, we need to regenerate the ring
	// while sending a new picker.
	if b.config == nil || b.config.MinRingSize != newConfig.MinRingSize || b.config.MaxRingSize != newConfig.MaxRingSize {
		b.shouldRegenerateRing = true
	}
	b.config = newConfig
	b.mu.Unlock()
	return nil
}
   220  
// ResolverError forwards the resolver error to the child balancer.
func (b *ringhashBalancer) ResolverError(err error) {
	b.child.ResolverError(err)
}
   224  
// UpdateSubConnState is not expected to be called on this balancer. Any call
// is logged as an error and otherwise ignored.
func (b *ringhashBalancer) UpdateSubConnState(sc balancer.SubConn, state balancer.SubConnState) {
	b.logger.Errorf("UpdateSubConnState(%v, %+v) called unexpectedly", sc, state)
}
   228  
   229  func (b *ringhashBalancer) updatePickerLocked() {
   230  	state := b.aggregatedStateLocked()
   231  	// Start connecting to new endpoints if necessary.
   232  	if state == connectivity.Connecting || state == connectivity.TransientFailure {
   233  		// When overall state is TransientFailure, we need to make sure at least
   234  		// one endpoint is attempting to connect, otherwise this balancer may
   235  		// never get picks if the parent is priority.
   236  		//
   237  		// Because we report Connecting as the overall state when only one
   238  		// endpoint is in TransientFailure, we do the same check for Connecting
   239  		// here.
   240  		//
   241  		// Note that this check also covers deleting endpoints. E.g. if the
   242  		// endpoint attempting to connect is deleted, and the overall state is
   243  		// TF. Since there must be at least one endpoint attempting to connect,
   244  		// we need to trigger one.
   245  		//
   246  		// After calling `ExitIdle` on a child balancer, the child will send a
   247  		// picker update asynchronously. A race condition may occur if another
   248  		// picker update from endpointsharding arrives before the child's
   249  		// picker update. The received picker may trigger a re-execution of the
   250  		// loop below to find an idle child. Since map iteration order is
   251  		// non-deterministic, the list of `endpointState`s must be sorted to
   252  		// ensure `ExitIdle` is called on the same child, preventing unnecessary
   253  		// connections.
   254  		var endpointStates = make([]*endpointState, b.endpointStates.Len())
   255  		for i, s := range b.endpointStates.Values() {
   256  			endpointStates[i] = s
   257  		}
   258  		sort.Slice(endpointStates, func(i, j int) bool {
   259  			return endpointStates[i].hashKey < endpointStates[j].hashKey
   260  		})
   261  		var idleBalancer balancer.ExitIdler
   262  		for _, es := range endpointStates {
   263  			connState := es.state.ConnectivityState
   264  			if connState == connectivity.Connecting {
   265  				idleBalancer = nil
   266  				break
   267  			}
   268  			if idleBalancer == nil && connState == connectivity.Idle {
   269  				idleBalancer = es.balancer
   270  			}
   271  		}
   272  		if idleBalancer != nil {
   273  			idleBalancer.ExitIdle()
   274  		}
   275  	}
   276  
   277  	if b.inhibitChildUpdates {
   278  		return
   279  	}
   280  
   281  	// Update the channel.
   282  	if b.endpointStates.Len() > 0 && b.shouldRegenerateRing {
   283  		// with a non-empty list of endpoints.
   284  		b.ring = newRing(b.endpointStates, b.config.MinRingSize, b.config.MaxRingSize, b.logger)
   285  	}
   286  	b.shouldRegenerateRing = false
   287  	var newPicker balancer.Picker
   288  	if b.endpointStates.Len() == 0 {
   289  		newPicker = base.NewErrPicker(errors.New("produced zero addresses"))
   290  	} else {
   291  		newPicker = b.newPickerLocked()
   292  	}
   293  	b.ClientConn.UpdateState(balancer.State{
   294  		ConnectivityState: state,
   295  		Picker:            newPicker,
   296  	})
   297  }
   298  
// Close logs the shutdown and closes the child balancer.
func (b *ringhashBalancer) Close() {
	b.logger.Infof("Shutdown")
	b.child.Close()
}
   303  
// ExitIdle is a no-op for this balancer.
func (b *ringhashBalancer) ExitIdle() {
	// ExitIdle implementation is a no-op because connections are either
	// triggered by picks or by child balancer state changes.
}
   308  
   309  // newPickerLocked generates a picker. The picker copies the endpoint states
   310  // over to avoid locking the mutex at RPC time. The picker should be
   311  // re-generated every time an endpoint state is updated.
   312  func (b *ringhashBalancer) newPickerLocked() *picker {
   313  	states := make(map[string]endpointState)
   314  	hasEndpointConnecting := false
   315  	for _, epState := range b.endpointStates.Values() {
   316  		// Copy the endpoint state to avoid races, since ring hash
   317  		// mutates the state, weight and hash key in place.
   318  		states[epState.hashKey] = *epState
   319  		if epState.state.ConnectivityState == connectivity.Connecting {
   320  			hasEndpointConnecting = true
   321  		}
   322  	}
   323  	return &picker{
   324  		ring:                         b.ring,
   325  		endpointStates:               states,
   326  		requestHashHeader:            b.config.RequestHashHeader,
   327  		hasEndpointInConnectingState: hasEndpointConnecting,
   328  		randUint64:                   rand.Uint64,
   329  	}
   330  }
   331  
   332  // aggregatedStateLocked returns the aggregated child balancers state
   333  // based on the following rules.
   334  //   - If there is at least one endpoint in READY state, report READY.
   335  //   - If there are 2 or more endpoints in TRANSIENT_FAILURE state, report
   336  //     TRANSIENT_FAILURE.
   337  //   - If there is at least one endpoint in CONNECTING state, report CONNECTING.
   338  //   - If there is one endpoint in TRANSIENT_FAILURE and there is more than one
   339  //     endpoint, report state CONNECTING.
   340  //   - If there is at least one endpoint in Idle state, report Idle.
   341  //   - Otherwise, report TRANSIENT_FAILURE.
   342  //
   343  // Note that if there are 1 connecting, 2 transient failure, the overall state
   344  // is transient failure. This is because the second transient failure is a
   345  // fallback of the first failing endpoint, and we want to report transient
   346  // failure to failover to the lower priority.
   347  func (b *ringhashBalancer) aggregatedStateLocked() connectivity.State {
   348  	var nums [5]int
   349  	for _, es := range b.endpointStates.Values() {
   350  		nums[es.state.ConnectivityState]++
   351  	}
   352  
   353  	if nums[connectivity.Ready] > 0 {
   354  		return connectivity.Ready
   355  	}
   356  	if nums[connectivity.TransientFailure] > 1 {
   357  		return connectivity.TransientFailure
   358  	}
   359  	if nums[connectivity.Connecting] > 0 {
   360  		return connectivity.Connecting
   361  	}
   362  	if nums[connectivity.TransientFailure] == 1 && b.endpointStates.Len() > 1 {
   363  		return connectivity.Connecting
   364  	}
   365  	if nums[connectivity.Idle] > 0 {
   366  		return connectivity.Idle
   367  	}
   368  	return connectivity.TransientFailure
   369  }
   370  
   371  // getWeightAttribute is a convenience function which returns the value of the
   372  // weight endpoint Attribute.
   373  //
   374  // When used in the xDS context, the weight attribute is guaranteed to be
   375  // non-zero. But, when used in a non-xDS context, the weight attribute could be
   376  // unset. A Default of 1 is used in the latter case.
   377  func getWeightAttribute(e resolver.Endpoint) uint32 {
   378  	w := weight.FromEndpoint(e).Weight
   379  	if w == 0 {
   380  		return 1
   381  	}
   382  	return w
   383  }
   384  
// endpointState is the balancer's per-endpoint bookkeeping: the endpoint's
// position key and weight on the ring, its child balancer, and the latest
// state reported for it.
type endpointState struct {
	// hashKey is the hash key of the endpoint. Per gRFC A61, each entry in the
	// ring is an endpoint, positioned based on the hash of the endpoint's first
	// address by default. Per gRFC A76, the hash key of an endpoint may be
	// overridden, for example based on EDS endpoint metadata.
	hashKey  string
	weight   uint32             // weight from getWeightAttribute; never zero
	balancer balancer.ExitIdler // child balancer; updatePickerLocked may call ExitIdle on it

	// state is updated by the balancer while receiving resolver updates from
	// the channel and picker updates from its children. Access to it is guarded
	// by ringhashBalancer.mu.
	state balancer.State
}