/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package leaderelection implements leader election of a set of endpoints.
// It uses an annotation in the endpoints object to store the record of the
// election state. This implementation does not guarantee that only one
// client is acting as a leader (a.k.a. fencing).
//
// A client only acts on timestamps captured locally to infer the state of the
// leader election. The client does not consider timestamps in the leader
// election record to be accurate because these timestamps may not have been
// produced by a local clock. The implementation does not depend on their
// accuracy and only uses their change to indicate that another client has
// renewed the leader lease. Thus the implementation is tolerant to arbitrary
// clock skew, but is not tolerant to arbitrary clock skew rate.
//
// However the level of tolerance to skew rate can be configured by setting
// RenewDeadline and LeaseDuration appropriately. The tolerance expressed as a
// maximum tolerated ratio of time passed on the fastest node to time passed on
// the slowest node can be approximately achieved with a configuration that sets
// the same ratio of LeaseDuration to RenewDeadline. For example if a user wanted
// to tolerate some nodes progressing forward in time twice as fast as other nodes,
// the user could set LeaseDuration to 60 seconds and RenewDeadline to 30 seconds.
//
// While not required, some method of clock synchronization between nodes in the
// cluster is highly recommended. It's important to keep in mind when configuring
// this client that the tolerance to skew rate varies inversely to master
// availability.
//
// Larger clusters often have a more lenient SLA for API latency. This should be
// taken into account when configuring the client. The rate of leader transitions
// should be monitored and RetryPeriod and LeaseDuration should be increased
// until the rate is stable and acceptably low. It's important to keep in mind
// when configuring this client that the tolerance to API latency varies inversely
// to master availability.
//
// DISCLAIMER: this is an alpha API. This library will likely change significantly
// or even be removed entirely in subsequent releases. Depend on this API at
// your own risk.
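//
// For illustration only, the 2x skew-rate example above, combined with the core
// client default retry period documented on LeaderElectionConfig below, would be:
//
//	LeaseDuration: 60 * time.Second
//	RenewDeadline: 30 * time.Second
//	RetryPeriod:   2 * time.Second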
// nolint
package k8sleaderelection

import (
	"bytes"
	"context"
	"fmt"
	"sync"
	"time"

	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/klog/v2"
	"k8s.io/utils/clock"

	"istio.io/istio/pilot/pkg/leaderelection/k8sleaderelection/k8sresourcelock"
)

const (
	JitterFactor = 1.2
)

// NewLeaderElector creates a LeaderElector from a LeaderElectionConfig
func NewLeaderElector(lec LeaderElectionConfig) (*LeaderElector, error) {
	if lec.LeaseDuration <= lec.RenewDeadline {
		return nil, fmt.Errorf("leaseDuration must be greater than renewDeadline")
	}
	if lec.RenewDeadline <= time.Duration(JitterFactor*float64(lec.RetryPeriod)) {
		return nil, fmt.Errorf("renewDeadline must be greater than retryPeriod*JitterFactor")
	}
	if lec.LeaseDuration < 1 {
		return nil, fmt.Errorf("leaseDuration must be greater than zero")
	}
	if lec.RenewDeadline < 1 {
		return nil, fmt.Errorf("renewDeadline must be greater than zero")
	}
	if lec.RetryPeriod < 1 {
		return nil, fmt.Errorf("retryPeriod must be greater than zero")
	}
	if lec.Callbacks.OnStartedLeading == nil {
		return nil, fmt.Errorf("callback OnStartedLeading must not be nil")
	}
	if lec.Callbacks.OnStoppedLeading == nil {
		return nil, fmt.Errorf("callback OnStoppedLeading must not be nil")
	}

	if lec.Lock == nil {
		return nil, fmt.Errorf("lock must not be nil")
	}
	le := LeaderElector{
		config:  lec,
		clock:   clock.RealClock{},
		metrics: globalMetricsFactory.newLeaderMetrics(),
	}
	le.metrics.leaderOff(le.config.Name)
	return &le, nil
}

type KeyComparisonFunc func(existingKey string) bool

type LeaderElectionConfig struct {
	// Lock is the resource that will be used for locking
	Lock k8sresourcelock.Interface

	// LeaseDuration is the duration that non-leader candidates will
	// wait to force acquire leadership. This is measured against time of
	// last observed ack.
	//
	// A client needs to wait a full LeaseDuration without observing a change to
	// the record before it can attempt to take over. When all clients are
	// shutdown and a new set of clients are started with different names against
	// the same leader record, they must wait the full LeaseDuration before
	// attempting to acquire the lease. Thus LeaseDuration should be as short as
	// possible (within your tolerance for clock skew rate) to avoid possible
	// long waits in this scenario.
	//
	// Core clients default this value to 15 seconds.
	LeaseDuration time.Duration
	// RenewDeadline is the duration that the acting master will retry
	// refreshing leadership before giving up.
	//
	// Core clients default this value to 10 seconds.
	RenewDeadline time.Duration
	// RetryPeriod is the duration the LeaderElector clients should wait
	// between tries of actions.
	//
	// Core clients default this value to 2 seconds.
	RetryPeriod time.Duration

	// KeyComparison defines a function to compare the existing leader's key to our own.
	// If the function returns true, indicating our key has higher precedence, we will take over
	// leadership even if there is another un-expired leader.
	//
	// This can be used to implement a prioritized leader election.
	// For example, if multiple versions of the same application run simultaneously,
	// we can ensure the newest version will become the leader. An illustrative
	// sketch, exampleKeyComparison, follows the Run method below.
	//
	// It is the responsibility of the caller to ensure that all KeyComparison functions are
	// logically consistent across all clients participating in the leader election, to avoid
	// multiple clients claiming to have high precedence and constantly pre-empting the
	// existing leader.
	//
	// KeyComparison functions should ensure they handle an empty existingKey, as "key" is not a required field.
	//
	// Warning: when a lock is stolen (from KeyComparison returning true), the old leader may not
	// immediately be notified they have lost the leader election.
	KeyComparison KeyComparisonFunc

	// Callbacks are callbacks that are triggered during certain lifecycle
	// events of the LeaderElector
	Callbacks LeaderCallbacks

	// WatchDog is the associated health checker
	// WatchDog may be nil if it's not needed/configured.
	WatchDog *HealthzAdaptor

	// ReleaseOnCancel should be set true if the lock should be released
	// when the run context is canceled. If you set this to true, you must
	// ensure all code guarded by this lease has successfully completed
	// prior to canceling the context, or you may have two processes
	// simultaneously acting on the critical path.
	ReleaseOnCancel bool

	// Name is the name of the resource lock for debugging
	Name string
}

// LeaderCallbacks are callbacks that are triggered during certain
// lifecycle events of the LeaderElector. These are invoked asynchronously.
//
// possible future callbacks:
//   - OnChallenge()
type LeaderCallbacks struct {
	// OnStartedLeading is called when a LeaderElector client starts leading
	OnStartedLeading func(context.Context)
	// OnStoppedLeading is called when a LeaderElector client stops leading
	OnStoppedLeading func()
	// OnNewLeader is called when the client observes a leader that is
	// not the previously observed leader. This includes the first observed
	// leader when the client starts.
	OnNewLeader func(identity string)
}

// LeaderElector is a leader election client.
type LeaderElector struct {
	config LeaderElectionConfig
	// internal bookkeeping
	observedRecord    k8sresourcelock.LeaderElectionRecord
	observedRawRecord []byte
	observedTime      time.Time
	// used to implement OnNewLeader(), may lag slightly from the
	// value observedRecord.HolderIdentity if the transition has
	// not yet been reported.
	reportedLeader string

	// clock is a wrapper around time to allow for less flaky testing
	clock clock.Clock

	// used to lock the observedRecord
	observedRecordLock sync.Mutex

	metrics leaderMetricsAdapter
}

// Run starts the leader election loop. Run will not return
// before the leader election loop is stopped by ctx or it has
// stopped holding the leader lease
func (le *LeaderElector) Run(ctx context.Context) {
	defer runtime.HandleCrash()
	defer func() {
		le.config.Callbacks.OnStoppedLeading()
	}()

	if !le.acquire(ctx) {
		return // ctx signaled done
	}
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	go le.config.Callbacks.OnStartedLeading(ctx)
	le.renew(ctx)
}
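// exampleKeyComparison is an illustrative sketch, not used anywhere in this
// package, of a KeyComparisonFunc implementing the prioritized election
// described on LeaderElectionConfig.KeyComparison. It assumes keys are
// revision-like strings whose lexicographic order matches precedence, and it
// treats an empty existingKey (no key recorded by the current holder) as
// lower precedence than any non-empty key.
func exampleKeyComparison(myKey string) KeyComparisonFunc {
	return func(existingKey string) bool {
		// Claim precedence only when our key strictly sorts after the
		// holder's key; equal keys do not pre-empt the existing leader.
		return myKey > existingKey
	}
}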
// RunOrDie starts a client with the provided config or panics if the config
// fails to validate. RunOrDie blocks until the leader election loop is
// stopped by ctx or it has stopped holding the leader lease
func RunOrDie(ctx context.Context, lec LeaderElectionConfig) {
	le, err := NewLeaderElector(lec)
	if err != nil {
		panic(err)
	}
	if lec.WatchDog != nil {
		lec.WatchDog.SetLeaderElection(le)
	}
	le.Run(ctx)
}

// GetLeader returns the identity of the last observed leader or returns the empty string if
// no leader has yet been observed.
// This function is for informational purposes. (e.g. monitoring, logs, etc.)
func (le *LeaderElector) GetLeader() string {
	return le.getObservedRecord().HolderIdentity
}

// IsLeader returns true if the last observed leader was this client, else returns false.
func (le *LeaderElector) IsLeader() bool {
	return le.getObservedRecord().HolderIdentity == le.config.Lock.Identity()
}

// acquire loops calling tryAcquireOrRenew and returns true immediately when tryAcquireOrRenew succeeds.
// Returns false if ctx signals done.
func (le *LeaderElector) acquire(ctx context.Context) bool {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	succeeded := false
	desc := le.config.Lock.Describe()
	klog.Infof("attempting to acquire leader lease %v...", desc)
	wait.JitterUntil(func() {
		succeeded = le.tryAcquireOrRenew(ctx)
		le.maybeReportTransition()
		if !succeeded {
			klog.V(4).Infof("failed to acquire lease %v", desc)
			return
		}
		le.config.Lock.RecordEvent("became leader")
		le.metrics.leaderOn(le.config.Name)
		klog.Infof("successfully acquired lease %v", desc)
		cancel()
	}, le.config.RetryPeriod, JitterFactor, true, ctx.Done())
	return succeeded
}

// renew loops calling tryAcquireOrRenew and returns immediately when tryAcquireOrRenew fails or ctx signals done.
func (le *LeaderElector) renew(ctx context.Context) {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	wait.Until(func() {
		timeoutCtx, timeoutCancel := context.WithTimeout(ctx, le.config.RenewDeadline)
		defer timeoutCancel()
		err := wait.PollImmediateUntil(le.config.RetryPeriod, func() (bool, error) {
			return le.tryAcquireOrRenew(timeoutCtx), nil
		}, timeoutCtx.Done())

		le.maybeReportTransition()
		desc := le.config.Lock.Describe()
		if err == nil {
			klog.V(5).Infof("successfully renewed lease %v", desc)
			return
		}
		le.config.Lock.RecordEvent("stopped leading")
		le.metrics.leaderOff(le.config.Name)
		klog.Infof("failed to renew lease %v: %v", desc, err)
		cancel()
	}, le.config.RetryPeriod, ctx.Done())

	// if we hold the lease, give it up
	if le.config.ReleaseOnCancel {
		le.release()
	}
}
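// exampleRunOrDie is an illustrative sketch, not used anywhere in this
// package, showing how a caller might wire up a LeaderElectionConfig and run
// the election loop until ctx is done. The lock argument stands for any
// k8sresourcelock.Interface implementation, and the durations simply echo the
// core client defaults documented on LeaderElectionConfig.
func exampleRunOrDie(ctx context.Context, lock k8sresourcelock.Interface) {
	RunOrDie(ctx, LeaderElectionConfig{
		Lock:          lock,
		LeaseDuration: 15 * time.Second,
		RenewDeadline: 10 * time.Second,
		RetryPeriod:   2 * time.Second,
		Callbacks: LeaderCallbacks{
			OnStartedLeading: func(ctx context.Context) {
				// Start leader-only work here; stop it when ctx is done.
			},
			OnStoppedLeading: func() {
				// Leadership was lost; ensure leader-only work has stopped.
			},
			OnNewLeader: func(identity string) {
				klog.Infof("observed new leader: %v", identity)
			},
		},
		// ReleaseOnCancel gives up the lease when ctx is canceled; only set it
		// if all leader-only work is guaranteed to have finished by then.
		ReleaseOnCancel: true,
		Name:            "example",
	})
}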
// release attempts to release the leader lease if we have acquired it.
func (le *LeaderElector) release() bool {
	if !le.IsLeader() {
		return true
	}
	now := metav1.Now()
	leaderElectionRecord := k8sresourcelock.LeaderElectionRecord{
		LeaderTransitions:    le.observedRecord.LeaderTransitions,
		LeaseDurationSeconds: 1,
		RenewTime:            now,
		AcquireTime:          now,
	}
	if err := le.config.Lock.Update(context.TODO(), leaderElectionRecord); err != nil {
		klog.Errorf("Failed to release lock: %v", err)
		return false
	}

	le.setObservedRecord(&leaderElectionRecord)
	return true
}

// tryAcquireOrRenew tries to acquire a leader lease if it is not already acquired,
// else it tries to renew the lease if it has already been acquired. Returns true
// on success else returns false.
func (le *LeaderElector) tryAcquireOrRenew(ctx context.Context) bool {
	now := metav1.Now()
	leaderElectionRecord := k8sresourcelock.LeaderElectionRecord{
		HolderIdentity:       le.config.Lock.Identity(),
		HolderKey:            le.config.Lock.Key(),
		LeaseDurationSeconds: int(le.config.LeaseDuration / time.Second),
		RenewTime:            now,
		AcquireTime:          now,
	}

	// 1. obtain or create the ElectionRecord
	oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get(ctx)
	if err != nil {
		if !errors.IsNotFound(err) {
			klog.Errorf("error retrieving resource lock %v: %v", le.config.Lock.Describe(), err)
			return false
		}
		if err = le.config.Lock.Create(ctx, leaderElectionRecord); err != nil {
			klog.Errorf("error initially creating leader election record: %v", err)
			return false
		}

		le.setObservedRecord(&leaderElectionRecord)

		return true
	}

	// 2. Record obtained, check the Identity & Time
	if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) {
		le.setObservedRecord(oldLeaderElectionRecord)

		le.observedRawRecord = oldLeaderElectionRawRecord
	}
	if len(oldLeaderElectionRecord.HolderIdentity) > 0 &&
		le.observedTime.Add(le.config.LeaseDuration).After(now.Time) &&
		!le.IsLeader() {
		if le.config.KeyComparison != nil && le.config.KeyComparison(oldLeaderElectionRecord.HolderKey) {
			// Lock is held and not expired, but our key is higher than the existing one.
			// We will pre-empt the existing leader.
			// nolint: lll
			klog.V(4).Infof("lock is held by %v with key %v, but our key (%v) evicts it", oldLeaderElectionRecord.HolderIdentity, oldLeaderElectionRecord.HolderKey, le.config.Lock.Key())
		} else {
			klog.V(4).Infof("lock is held by %v and has not yet expired", oldLeaderElectionRecord.HolderIdentity)
			return false
		}
	}

	// 3. We're going to try to update. The leaderElectionRecord is set to its default
	// here. Let's correct it before updating.
	if le.IsLeader() {
		leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime
		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions
	} else {
		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1
	}

	// update the lock itself
	if err = le.config.Lock.Update(ctx, leaderElectionRecord); err != nil {
		klog.Errorf("Failed to update lock: %v", err)
		return false
	}

	le.setObservedRecord(&leaderElectionRecord)
	return true
}

func (le *LeaderElector) maybeReportTransition() {
	if le.observedRecord.HolderIdentity == le.reportedLeader {
		return
	}
	le.reportedLeader = le.observedRecord.HolderIdentity
	if le.config.Callbacks.OnNewLeader != nil {
		go le.config.Callbacks.OnNewLeader(le.reportedLeader)
	}
}

// Check will determine if the current lease is expired by more than maxTolerableExpiredLease.
func (le *LeaderElector) Check(maxTolerableExpiredLease time.Duration) error {
	if !le.IsLeader() {
		// Currently not concerned with the case that we are hot standby
		return nil
	}
	// If we are more than maxTolerableExpiredLease past the lease duration, the lease
	// renew has effectively timed out and it is time to start reporting ourselves as
	// unhealthy. We should have died, but conditions like deadlock can prevent this.
	// (See #70819)
	if le.clock.Since(le.observedTime) > le.config.LeaseDuration+maxTolerableExpiredLease {
		return fmt.Errorf("failed election to renew leadership on lease %s", le.config.Name)
	}

	return nil
}

// setObservedRecord will set a new observedRecord and update observedTime to the current time.
// Protect critical sections with lock.
func (le *LeaderElector) setObservedRecord(observedRecord *k8sresourcelock.LeaderElectionRecord) {
	le.observedRecordLock.Lock()
	defer le.observedRecordLock.Unlock()

	le.observedRecord = *observedRecord
	le.observedTime = le.clock.Now()
}

// getObservedRecord returns observedRecord.
// Protect critical sections with lock.
func (le *LeaderElector) getObservedRecord() k8sresourcelock.LeaderElectionRecord {
	le.observedRecordLock.Lock()
	defer le.observedRecordLock.Unlock()

	return le.observedRecord
}
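// exampleHealthCheck is an illustrative sketch, not used anywhere in this
// package, of wiring LeaderElector.Check into a liveness probe: the process
// reports itself unhealthy when it still believes it is the leader but has
// not managed to renew the lease for longer than the chosen tolerance. The
// 10 second tolerance here is an arbitrary value for the example.
func exampleHealthCheck(le *LeaderElector) error {
	const maxTolerableExpiredLease = 10 * time.Second
	if err := le.Check(maxTolerableExpiredLease); err != nil {
		return fmt.Errorf("leader election health check failed for %q: %v", le.config.Name, err)
	}
	return nil
}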