google.golang.org/grpc@v1.72.2/xds/internal/balancer/ringhash/ringhash.go (about) 1 /* 2 * 3 * Copyright 2021 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 */ 18 19 // Package ringhash implements the ringhash balancer. 20 package ringhash 21 22 import ( 23 "encoding/json" 24 "errors" 25 "fmt" 26 "math/rand/v2" 27 "sort" 28 "sync" 29 30 "google.golang.org/grpc/balancer" 31 "google.golang.org/grpc/balancer/base" 32 "google.golang.org/grpc/balancer/endpointsharding" 33 "google.golang.org/grpc/balancer/lazy" 34 "google.golang.org/grpc/balancer/pickfirst/pickfirstleaf" 35 "google.golang.org/grpc/connectivity" 36 "google.golang.org/grpc/internal/balancer/weight" 37 "google.golang.org/grpc/internal/grpclog" 38 "google.golang.org/grpc/internal/pretty" 39 "google.golang.org/grpc/resolver" 40 "google.golang.org/grpc/resolver/ringhash" 41 "google.golang.org/grpc/serviceconfig" 42 ) 43 44 // Name is the name of the ring_hash balancer. 45 const Name = "ring_hash_experimental" 46 47 func lazyPickFirstBuilder(cc balancer.ClientConn, opts balancer.BuildOptions) balancer.Balancer { 48 return lazy.NewBalancer(cc, opts, balancer.Get(pickfirstleaf.Name).Build) 49 } 50 51 func init() { 52 balancer.Register(bb{}) 53 } 54 55 type bb struct{} 56 57 func (bb) Build(cc balancer.ClientConn, opts balancer.BuildOptions) balancer.Balancer { 58 b := &ringhashBalancer{ 59 ClientConn: cc, 60 endpointStates: resolver.NewEndpointMap[*endpointState](), 61 } 62 esOpts := endpointsharding.Options{DisableAutoReconnect: true} 63 b.child = endpointsharding.NewBalancer(b, opts, lazyPickFirstBuilder, esOpts) 64 b.logger = prefixLogger(b) 65 b.logger.Infof("Created") 66 return b 67 } 68 69 func (bb) Name() string { 70 return Name 71 } 72 73 func (bb) ParseConfig(c json.RawMessage) (serviceconfig.LoadBalancingConfig, error) { 74 return parseConfig(c) 75 } 76 77 type ringhashBalancer struct { 78 // The following fields are initialized at build time and read-only after 79 // that and therefore do not need to be guarded by a mutex. 80 81 // ClientConn is embedded to intercept UpdateState calls from the child 82 // endpointsharding balancer. 83 balancer.ClientConn 84 logger *grpclog.PrefixLogger 85 child balancer.Balancer 86 87 mu sync.Mutex 88 config *LBConfig 89 inhibitChildUpdates bool 90 shouldRegenerateRing bool 91 endpointStates *resolver.EndpointMap[*endpointState] 92 93 // ring is always in sync with endpoints. When endpoints change, a new ring 94 // is generated. Note that address weights updates also regenerates the 95 // ring. 96 ring *ring 97 } 98 99 // hashKey returns the hash key to use for an endpoint. Per gRFC A61, each entry 100 // in the ring is a hash of the endpoint's hash key concatenated with a 101 // per-entry unique suffix. 102 func hashKey(endpoint resolver.Endpoint) string { 103 if hk := ringhash.HashKey(endpoint); hk != "" { 104 return hk 105 } 106 // If no hash key is set, use the endpoint's first address as the hash key. 107 // This is the default behavior when no hash key is set. 108 return endpoint.Addresses[0].Addr 109 } 110 111 // UpdateState intercepts child balancer state updates. It updates the 112 // per-endpoint state stored in the ring, and also the aggregated state based on 113 // the child picker. It also reconciles the endpoint list. It sets 114 // `b.shouldRegenerateRing` to true if the new endpoint list is different from 115 // the previous, i.e. any of the following is true: 116 // - an endpoint was added 117 // - an endpoint was removed 118 // - an endpoint's weight was updated 119 // - the first addresses of the endpoint has changed 120 func (b *ringhashBalancer) UpdateState(state balancer.State) { 121 b.mu.Lock() 122 defer b.mu.Unlock() 123 childStates := endpointsharding.ChildStatesFromPicker(state.Picker) 124 // endpointsSet is the set converted from endpoints, used for quick lookup. 125 endpointsSet := resolver.NewEndpointMap[bool]() 126 127 for _, childState := range childStates { 128 endpoint := childState.Endpoint 129 endpointsSet.Set(endpoint, true) 130 newWeight := getWeightAttribute(endpoint) 131 hk := hashKey(endpoint) 132 es, ok := b.endpointStates.Get(endpoint) 133 if !ok { 134 es := &endpointState{ 135 balancer: childState.Balancer, 136 hashKey: hk, 137 weight: newWeight, 138 state: childState.State, 139 } 140 b.endpointStates.Set(endpoint, es) 141 b.shouldRegenerateRing = true 142 } else { 143 // We have seen this endpoint before and created a `endpointState` 144 // object for it. If the weight or the hash key of the endpoint has 145 // changed, update the endpoint state map with the new weight or 146 // hash key. This will be used when a new ring is created. 147 if oldWeight := es.weight; oldWeight != newWeight { 148 b.shouldRegenerateRing = true 149 es.weight = newWeight 150 } 151 if es.hashKey != hk { 152 b.shouldRegenerateRing = true 153 es.hashKey = hk 154 } 155 es.state = childState.State 156 } 157 } 158 159 for _, endpoint := range b.endpointStates.Keys() { 160 if _, ok := endpointsSet.Get(endpoint); ok { 161 continue 162 } 163 // endpoint was removed by resolver. 164 b.endpointStates.Delete(endpoint) 165 b.shouldRegenerateRing = true 166 } 167 168 b.updatePickerLocked() 169 } 170 171 func (b *ringhashBalancer) UpdateClientConnState(ccs balancer.ClientConnState) error { 172 if b.logger.V(2) { 173 b.logger.Infof("Received update from resolver, balancer config: %+v", pretty.ToJSON(ccs.BalancerConfig)) 174 } 175 176 newConfig, ok := ccs.BalancerConfig.(*LBConfig) 177 if !ok { 178 return fmt.Errorf("unexpected balancer config with type: %T", ccs.BalancerConfig) 179 } 180 181 b.mu.Lock() 182 b.inhibitChildUpdates = true 183 b.mu.Unlock() 184 185 defer func() { 186 b.mu.Lock() 187 b.inhibitChildUpdates = false 188 b.updatePickerLocked() 189 b.mu.Unlock() 190 }() 191 192 if err := b.child.UpdateClientConnState(balancer.ClientConnState{ 193 // Make pickfirst children use health listeners for outlier detection 194 // and health checking to work. 195 ResolverState: pickfirstleaf.EnableHealthListener(ccs.ResolverState), 196 }); err != nil { 197 return err 198 } 199 200 b.mu.Lock() 201 // Ring updates can happen due to the following: 202 // 1. Addition or deletion of endpoints: The synchronous picker update from 203 // the child endpointsharding balancer would contain the list of updated 204 // endpoints. Updates triggered by the child after handling the 205 // `UpdateClientConnState` call will not change the endpoint list. 206 // 2. Change in the `LoadBalancerConfig`: Ring config such as max/min ring 207 // size. 208 // To avoid extra ring updates, a boolean is used to track the need for a 209 // ring update and the update is done only once at the end. 210 // 211 // If the ring configuration has changed, we need to regenerate the ring 212 // while sending a new picker. 213 if b.config == nil || b.config.MinRingSize != newConfig.MinRingSize || b.config.MaxRingSize != newConfig.MaxRingSize { 214 b.shouldRegenerateRing = true 215 } 216 b.config = newConfig 217 b.mu.Unlock() 218 return nil 219 } 220 221 func (b *ringhashBalancer) ResolverError(err error) { 222 b.child.ResolverError(err) 223 } 224 225 func (b *ringhashBalancer) UpdateSubConnState(sc balancer.SubConn, state balancer.SubConnState) { 226 b.logger.Errorf("UpdateSubConnState(%v, %+v) called unexpectedly", sc, state) 227 } 228 229 func (b *ringhashBalancer) updatePickerLocked() { 230 state := b.aggregatedStateLocked() 231 // Start connecting to new endpoints if necessary. 232 if state == connectivity.Connecting || state == connectivity.TransientFailure { 233 // When overall state is TransientFailure, we need to make sure at least 234 // one endpoint is attempting to connect, otherwise this balancer may 235 // never get picks if the parent is priority. 236 // 237 // Because we report Connecting as the overall state when only one 238 // endpoint is in TransientFailure, we do the same check for Connecting 239 // here. 240 // 241 // Note that this check also covers deleting endpoints. E.g. if the 242 // endpoint attempting to connect is deleted, and the overall state is 243 // TF. Since there must be at least one endpoint attempting to connect, 244 // we need to trigger one. 245 // 246 // After calling `ExitIdle` on a child balancer, the child will send a 247 // picker update asynchronously. A race condition may occur if another 248 // picker update from endpointsharding arrives before the child's 249 // picker update. The received picker may trigger a re-execution of the 250 // loop below to find an idle child. Since map iteration order is 251 // non-deterministic, the list of `endpointState`s must be sorted to 252 // ensure `ExitIdle` is called on the same child, preventing unnecessary 253 // connections. 254 var endpointStates = make([]*endpointState, b.endpointStates.Len()) 255 for i, s := range b.endpointStates.Values() { 256 endpointStates[i] = s 257 } 258 sort.Slice(endpointStates, func(i, j int) bool { 259 return endpointStates[i].hashKey < endpointStates[j].hashKey 260 }) 261 var idleBalancer balancer.ExitIdler 262 for _, es := range endpointStates { 263 connState := es.state.ConnectivityState 264 if connState == connectivity.Connecting { 265 idleBalancer = nil 266 break 267 } 268 if idleBalancer == nil && connState == connectivity.Idle { 269 idleBalancer = es.balancer 270 } 271 } 272 if idleBalancer != nil { 273 idleBalancer.ExitIdle() 274 } 275 } 276 277 if b.inhibitChildUpdates { 278 return 279 } 280 281 // Update the channel. 282 if b.endpointStates.Len() > 0 && b.shouldRegenerateRing { 283 // with a non-empty list of endpoints. 284 b.ring = newRing(b.endpointStates, b.config.MinRingSize, b.config.MaxRingSize, b.logger) 285 } 286 b.shouldRegenerateRing = false 287 var newPicker balancer.Picker 288 if b.endpointStates.Len() == 0 { 289 newPicker = base.NewErrPicker(errors.New("produced zero addresses")) 290 } else { 291 newPicker = b.newPickerLocked() 292 } 293 b.ClientConn.UpdateState(balancer.State{ 294 ConnectivityState: state, 295 Picker: newPicker, 296 }) 297 } 298 299 func (b *ringhashBalancer) Close() { 300 b.logger.Infof("Shutdown") 301 b.child.Close() 302 } 303 304 func (b *ringhashBalancer) ExitIdle() { 305 // ExitIdle implementation is a no-op because connections are either 306 // triggers from picks or from child balancer state changes. 307 } 308 309 // newPickerLocked generates a picker. The picker copies the endpoint states 310 // over to avoid locking the mutex at RPC time. The picker should be 311 // re-generated every time an endpoint state is updated. 312 func (b *ringhashBalancer) newPickerLocked() *picker { 313 states := make(map[string]endpointState) 314 hasEndpointConnecting := false 315 for _, epState := range b.endpointStates.Values() { 316 // Copy the endpoint state to avoid races, since ring hash 317 // mutates the state, weight and hash key in place. 318 states[epState.hashKey] = *epState 319 if epState.state.ConnectivityState == connectivity.Connecting { 320 hasEndpointConnecting = true 321 } 322 } 323 return &picker{ 324 ring: b.ring, 325 endpointStates: states, 326 requestHashHeader: b.config.RequestHashHeader, 327 hasEndpointInConnectingState: hasEndpointConnecting, 328 randUint64: rand.Uint64, 329 } 330 } 331 332 // aggregatedStateLocked returns the aggregated child balancers state 333 // based on the following rules. 334 // - If there is at least one endpoint in READY state, report READY. 335 // - If there are 2 or more endpoints in TRANSIENT_FAILURE state, report 336 // TRANSIENT_FAILURE. 337 // - If there is at least one endpoint in CONNECTING state, report CONNECTING. 338 // - If there is one endpoint in TRANSIENT_FAILURE and there is more than one 339 // endpoint, report state CONNECTING. 340 // - If there is at least one endpoint in Idle state, report Idle. 341 // - Otherwise, report TRANSIENT_FAILURE. 342 // 343 // Note that if there are 1 connecting, 2 transient failure, the overall state 344 // is transient failure. This is because the second transient failure is a 345 // fallback of the first failing endpoint, and we want to report transient 346 // failure to failover to the lower priority. 347 func (b *ringhashBalancer) aggregatedStateLocked() connectivity.State { 348 var nums [5]int 349 for _, es := range b.endpointStates.Values() { 350 nums[es.state.ConnectivityState]++ 351 } 352 353 if nums[connectivity.Ready] > 0 { 354 return connectivity.Ready 355 } 356 if nums[connectivity.TransientFailure] > 1 { 357 return connectivity.TransientFailure 358 } 359 if nums[connectivity.Connecting] > 0 { 360 return connectivity.Connecting 361 } 362 if nums[connectivity.TransientFailure] == 1 && b.endpointStates.Len() > 1 { 363 return connectivity.Connecting 364 } 365 if nums[connectivity.Idle] > 0 { 366 return connectivity.Idle 367 } 368 return connectivity.TransientFailure 369 } 370 371 // getWeightAttribute is a convenience function which returns the value of the 372 // weight endpoint Attribute. 373 // 374 // When used in the xDS context, the weight attribute is guaranteed to be 375 // non-zero. But, when used in a non-xDS context, the weight attribute could be 376 // unset. A Default of 1 is used in the latter case. 377 func getWeightAttribute(e resolver.Endpoint) uint32 { 378 w := weight.FromEndpoint(e).Weight 379 if w == 0 { 380 return 1 381 } 382 return w 383 } 384 385 type endpointState struct { 386 // hashKey is the hash key of the endpoint. Per gRFC A61, each entry in the 387 // ring is an endpoint, positioned based on the hash of the endpoint's first 388 // address by default. Per gRFC A76, the hash key of an endpoint may be 389 // overridden, for example based on EDS endpoint metadata. 390 hashKey string 391 weight uint32 392 balancer balancer.ExitIdler 393 394 // state is updated by the balancer while receiving resolver updates from 395 // the channel and picker updates from its children. Access to it is guarded 396 // by ringhashBalancer.mu. 397 state balancer.State 398 }