k8s.io/client-go@v0.31.1/tools/leaderelection/leaderelection.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package leaderelection implements leader election of a set of endpoints.
// It uses an annotation in the endpoints object to store the record of the
// election state. This implementation does not guarantee that only one
// client is acting as a leader (a.k.a. fencing).
//
// A client only acts on timestamps captured locally to infer the state of the
// leader election. The client does not consider timestamps in the leader
// election record to be accurate because these timestamps may not have been
// produced by a local clock. The implementation does not depend on their
// accuracy and only uses their change to indicate that another client has
// renewed the leader lease. Thus the implementation is tolerant to arbitrary
// clock skew, but is not tolerant to arbitrary clock skew rate.
//
// However the level of tolerance to skew rate can be configured by setting
// RenewDeadline and LeaseDuration appropriately. The tolerance, expressed as a
// maximum tolerated ratio of time passed on the fastest node to time passed on
// the slowest node, can be approximately achieved with a configuration that sets
// the same ratio of LeaseDuration to RenewDeadline. For example, if a user wanted
// to tolerate some nodes progressing forward in time twice as fast as other nodes,
// the user could set LeaseDuration to 60 seconds and RenewDeadline to 30 seconds.
//
// While not required, some method of clock synchronization between nodes in the
// cluster is highly recommended. It's important to keep in mind when configuring
// this client that the tolerance to skew rate varies inversely to master
// availability.
//
// Larger clusters often have a more lenient SLA for API latency. This should be
// taken into account when configuring the client. The rate of leader transitions
// should be monitored and RetryPeriod and LeaseDuration should be increased
// until the rate is stable and acceptably low. It's important to keep in mind
// when configuring this client that the tolerance to API latency varies inversely
// to master availability.
//
// DISCLAIMER: this is an alpha API. This library will likely change significantly
// or even be removed entirely in subsequent releases. Depend on this API at
// your own risk.
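//
// A minimal usage sketch follows. The surrounding wiring is an assumption made
// for illustration: kubeClient, namespace, id, and ctx are supplied by the
// caller and are not part of this package.
//
//    lock := &resourcelock.LeaseLock{
//        LeaseMeta:  metav1.ObjectMeta{Name: "example-lock", Namespace: namespace},
//        Client:     kubeClient.CoordinationV1(),
//        LockConfig: resourcelock.ResourceLockConfig{Identity: id},
//    }
//    leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{
//        Lock:          lock,
//        LeaseDuration: 15 * time.Second,
//        RenewDeadline: 10 * time.Second,
//        RetryPeriod:   2 * time.Second,
//        Callbacks: leaderelection.LeaderCallbacks{
//            OnStartedLeading: func(ctx context.Context) { /* start the work guarded by the lease */ },
//            OnStoppedLeading: func() { /* stop the guarded work, typically by exiting */ },
//        },
//    })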
package leaderelection

import (
    "bytes"
    "context"
    "fmt"
    "sync"
    "time"

    "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/util/runtime"
    "k8s.io/apimachinery/pkg/util/wait"
    rl "k8s.io/client-go/tools/leaderelection/resourcelock"
    "k8s.io/klog/v2"
    "k8s.io/utils/clock"
)

const (
    JitterFactor = 1.2
)

// NewLeaderElector creates a LeaderElector from a LeaderElectionConfig
func NewLeaderElector(lec LeaderElectionConfig) (*LeaderElector, error) {
    if lec.LeaseDuration <= lec.RenewDeadline {
        return nil, fmt.Errorf("leaseDuration must be greater than renewDeadline")
    }
    if lec.RenewDeadline <= time.Duration(JitterFactor*float64(lec.RetryPeriod)) {
        return nil, fmt.Errorf("renewDeadline must be greater than retryPeriod*JitterFactor")
    }
    if lec.LeaseDuration < 1 {
        return nil, fmt.Errorf("leaseDuration must be greater than zero")
    }
    if lec.RenewDeadline < 1 {
        return nil, fmt.Errorf("renewDeadline must be greater than zero")
    }
    if lec.RetryPeriod < 1 {
        return nil, fmt.Errorf("retryPeriod must be greater than zero")
    }
    if lec.Callbacks.OnStartedLeading == nil {
        return nil, fmt.Errorf("OnStartedLeading callback must not be nil")
    }
    if lec.Callbacks.OnStoppedLeading == nil {
        return nil, fmt.Errorf("OnStoppedLeading callback must not be nil")
    }

    if lec.Lock == nil {
        return nil, fmt.Errorf("Lock must not be nil.")
    }
    id := lec.Lock.Identity()
    if id == "" {
        return nil, fmt.Errorf("Lock identity is empty")
    }

    le := LeaderElector{
        config:  lec,
        clock:   clock.RealClock{},
        metrics: globalMetricsFactory.newLeaderMetrics(),
    }
    le.metrics.leaderOff(le.config.Name)
    return &le, nil
}
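
// As a worked illustration of the checks above (these durations are only an
// example, not values enforced by this constructor): LeaseDuration=15s,
// RenewDeadline=10s and RetryPeriod=2s are accepted, because 15s > 10s,
// 10s > JitterFactor*2s = 2.4s, and all three durations are positive.
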
type LeaderElectionConfig struct {
    // Lock is the resource that will be used for locking
    Lock rl.Interface

    // LeaseDuration is the duration that non-leader candidates will
    // wait to force acquire leadership. This is measured against time of
    // last observed ack.
    //
    // A client needs to wait a full LeaseDuration without observing a change to
    // the record before it can attempt to take over. When all clients are
    // shutdown and a new set of clients are started with different names against
    // the same leader record, they must wait the full LeaseDuration before
    // attempting to acquire the lease. Thus LeaseDuration should be as short as
    // possible (within your tolerance for clock skew rate) to avoid possible
    // long waits in this scenario.
    //
    // Core clients default this value to 15 seconds.
    LeaseDuration time.Duration
    // RenewDeadline is the duration that the acting master will retry
    // refreshing leadership before giving up.
    //
    // Core clients default this value to 10 seconds.
    RenewDeadline time.Duration
    // RetryPeriod is the duration the LeaderElector clients should wait
    // between tries of actions.
    //
    // Core clients default this value to 2 seconds.
    RetryPeriod time.Duration

    // Callbacks are callbacks that are triggered during certain lifecycle
    // events of the LeaderElector
    Callbacks LeaderCallbacks

    // WatchDog is the associated health checker
    // WatchDog may be nil if it's not needed/configured.
    WatchDog *HealthzAdaptor

    // ReleaseOnCancel should be set true if the lock should be released
    // when the run context is cancelled. If you set this to true, you must
    // ensure all code guarded by this lease has successfully completed
    // prior to cancelling the context, or you may have two processes
    // simultaneously acting on the critical path.
    ReleaseOnCancel bool

    // Name is the name of the resource lock for debugging
    Name string

    // Coordinated will use the Coordinated Leader Election feature
    // WARNING: Coordinated leader election is ALPHA.
    Coordinated bool
}

// LeaderCallbacks are callbacks that are triggered during certain
// lifecycle events of the LeaderElector. These are invoked asynchronously.
//
// possible future callbacks:
//   - OnChallenge()
type LeaderCallbacks struct {
    // OnStartedLeading is called when a LeaderElector client starts leading
    OnStartedLeading func(context.Context)
    // OnStoppedLeading is called when a LeaderElector client stops leading
    OnStoppedLeading func()
    // OnNewLeader is called when the client observes a leader that is
    // not the previously observed leader. This includes the first observed
    // leader when the client starts.
    OnNewLeader func(identity string)
}

// LeaderElector is a leader election client.
type LeaderElector struct {
    config LeaderElectionConfig
    // internal bookkeeping
    observedRecord    rl.LeaderElectionRecord
    observedRawRecord []byte
    observedTime      time.Time
    // used to implement OnNewLeader(); may lag slightly behind
    // observedRecord.HolderIdentity if the transition has
    // not yet been reported.
    reportedLeader string

    // clock is a wrapper around time to allow for less flaky testing
    clock clock.Clock

    // used to lock the observedRecord
    observedRecordLock sync.Mutex

    metrics leaderMetricsAdapter
}

// Run starts the leader election loop. Run will not return
// before the leader election loop is stopped by ctx or it has
// stopped holding the leader lease.
func (le *LeaderElector) Run(ctx context.Context) {
    defer runtime.HandleCrash()
    defer le.config.Callbacks.OnStoppedLeading()

    if !le.acquire(ctx) {
        return // ctx signalled done
    }
    ctx, cancel := context.WithCancel(ctx)
    defer cancel()
    go le.config.Callbacks.OnStartedLeading(ctx)
    le.renew(ctx)
}

// RunOrDie starts a client with the provided config or panics if the config
// fails to validate. RunOrDie blocks until the leader election loop is
// stopped by ctx or it has stopped holding the leader lease.
func RunOrDie(ctx context.Context, lec LeaderElectionConfig) {
    le, err := NewLeaderElector(lec)
    if err != nil {
        panic(err)
    }
    if lec.WatchDog != nil {
        lec.WatchDog.SetLeaderElection(le)
    }
    le.Run(ctx)
}
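
// A hedged lifecycle sketch. The surrounding wiring is an assumption made for
// illustration: lec is an already populated LeaderElectionConfig, and the
// caller decides when to cancel.
//
//    ctx, cancel := context.WithCancel(context.Background())
//    lec.ReleaseOnCancel = true // only safe if the guarded work finishes before cancel()
//    lec.Callbacks.OnNewLeader = func(identity string) {
//        klog.Infof("observed leader: %s", identity)
//    }
//    go RunOrDie(ctx, lec) // returns, firing OnStoppedLeading, once ctx is done or the lease is lost
//    // ... later, on shutdown, after the guarded work has stopped:
//    cancel()
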
// GetLeader returns the identity of the last observed leader or returns the empty string if
// no leader has yet been observed.
// This function is for informational purposes (e.g. monitoring, logs, etc.).
func (le *LeaderElector) GetLeader() string {
    return le.getObservedRecord().HolderIdentity
}

// IsLeader returns true if the last observed leader was this client else returns false.
func (le *LeaderElector) IsLeader() bool {
    return le.getObservedRecord().HolderIdentity == le.config.Lock.Identity()
}

// acquire loops calling tryAcquireOrRenew and returns true immediately when tryAcquireOrRenew succeeds.
// Returns false if ctx signals done.
func (le *LeaderElector) acquire(ctx context.Context) bool {
    ctx, cancel := context.WithCancel(ctx)
    defer cancel()
    succeeded := false
    desc := le.config.Lock.Describe()
    klog.Infof("attempting to acquire leader lease %v...", desc)
    wait.JitterUntil(func() {
        if !le.config.Coordinated {
            succeeded = le.tryAcquireOrRenew(ctx)
        } else {
            succeeded = le.tryCoordinatedRenew(ctx)
        }
        le.maybeReportTransition()
        if !succeeded {
            klog.V(4).Infof("failed to acquire lease %v", desc)
            return
        }
        le.config.Lock.RecordEvent("became leader")
        le.metrics.leaderOn(le.config.Name)
        klog.Infof("successfully acquired lease %v", desc)
        cancel()
    }, le.config.RetryPeriod, JitterFactor, true, ctx.Done())
    return succeeded
}

// renew loops calling tryAcquireOrRenew and returns immediately when tryAcquireOrRenew fails or ctx signals done.
func (le *LeaderElector) renew(ctx context.Context) {
    defer le.config.Lock.RecordEvent("stopped leading")
    ctx, cancel := context.WithCancel(ctx)
    defer cancel()
    wait.Until(func() {
        timeoutCtx, timeoutCancel := context.WithTimeout(ctx, le.config.RenewDeadline)
        defer timeoutCancel()
        err := wait.PollImmediateUntil(le.config.RetryPeriod, func() (bool, error) {
            if !le.config.Coordinated {
                return le.tryAcquireOrRenew(timeoutCtx), nil
            } else {
                return le.tryCoordinatedRenew(timeoutCtx), nil
            }
        }, timeoutCtx.Done())

        le.maybeReportTransition()
        desc := le.config.Lock.Describe()
        if err == nil {
            klog.V(5).Infof("successfully renewed lease %v", desc)
            return
        }
        le.metrics.leaderOff(le.config.Name)
        klog.Infof("failed to renew lease %v: %v", desc, err)
        cancel()
    }, le.config.RetryPeriod, ctx.Done())

    // if we hold the lease, give it up
    if le.config.ReleaseOnCancel {
        le.release()
    }
}

// release attempts to release the leader lease if we have acquired it.
func (le *LeaderElector) release() bool {
    if !le.IsLeader() {
        return true
    }
    now := metav1.NewTime(le.clock.Now())
    // Write a record with an empty holder identity and a one-second lease so
    // that another candidate can take over promptly.
    leaderElectionRecord := rl.LeaderElectionRecord{
        LeaderTransitions:    le.observedRecord.LeaderTransitions,
        LeaseDurationSeconds: 1,
        RenewTime:            now,
        AcquireTime:          now,
    }
    timeoutCtx, timeoutCancel := context.WithTimeout(context.Background(), le.config.RenewDeadline)
    defer timeoutCancel()
    if err := le.config.Lock.Update(timeoutCtx, leaderElectionRecord); err != nil {
        klog.Errorf("Failed to release lock: %v", err)
        return false
    }

    le.setObservedRecord(&leaderElectionRecord)
    return true
}

// tryCoordinatedRenew checks if it acquired a lease and tries to renew the
// lease if it has already been acquired. Returns true on success else returns
// false.
func (le *LeaderElector) tryCoordinatedRenew(ctx context.Context) bool {
    now := metav1.NewTime(le.clock.Now())
    leaderElectionRecord := rl.LeaderElectionRecord{
        HolderIdentity:       le.config.Lock.Identity(),
        LeaseDurationSeconds: int(le.config.LeaseDuration / time.Second),
        RenewTime:            now,
        AcquireTime:          now,
    }

    // 1. obtain the electionRecord
    oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get(ctx)
    if err != nil {
        if !errors.IsNotFound(err) {
            klog.Errorf("error retrieving resource lock %v: %v", le.config.Lock.Describe(), err)
            return false
        }
        klog.Infof("lease lock not found: %v", le.config.Lock.Describe())
        return false
    }

    // 2. Record obtained, check the Identity & Time
    if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) {
        le.setObservedRecord(oldLeaderElectionRecord)

        le.observedRawRecord = oldLeaderElectionRawRecord
    }

    hasExpired := le.observedTime.Add(time.Second * time.Duration(oldLeaderElectionRecord.LeaseDurationSeconds)).Before(now.Time)
    if hasExpired {
        klog.Infof("lock has expired: %v", le.config.Lock.Describe())
        return false
    }

    if !le.IsLeader() {
        klog.V(6).Infof("lock is held by %v and has not yet expired: %v", oldLeaderElectionRecord.HolderIdentity, le.config.Lock.Describe())
        return false
    }

    // 2b. If the lease has been marked as "end of term", don't renew it
    if le.IsLeader() && oldLeaderElectionRecord.PreferredHolder != "" {
        klog.V(4).Infof("lock is marked as 'end of term': %v", le.config.Lock.Describe())
        // TODO: Instead of letting the lease expire, the holder may delete it directly.
        // This will not be compatible with all controllers, so it needs to be opt-in behavior.
        // We must ensure all code guarded by this lease has successfully completed
        // prior to releasing or there may be two processes
        // simultaneously acting on the critical path.
        // Usually once this returns false, the process is terminated.
        // xref: OnStoppedLeading
        return false
    }

    // 3. We're going to try to update. The leaderElectionRecord is set to its default
    // here. Let's correct it before updating.
    if le.IsLeader() {
        leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime
        leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions
        leaderElectionRecord.Strategy = oldLeaderElectionRecord.Strategy
        le.metrics.slowpathExercised(le.config.Name)
    } else {
        leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1
    }

    // update the lock itself
    if err = le.config.Lock.Update(ctx, leaderElectionRecord); err != nil {
        klog.Errorf("Failed to update lock: %v", err)
        return false
    }

    le.setObservedRecord(&leaderElectionRecord)
    return true
}

// tryAcquireOrRenew tries to acquire a leader lease if it is not already acquired,
// else it tries to renew the lease if it has already been acquired. Returns true
// on success else returns false.
func (le *LeaderElector) tryAcquireOrRenew(ctx context.Context) bool {
    now := metav1.NewTime(le.clock.Now())
    leaderElectionRecord := rl.LeaderElectionRecord{
        HolderIdentity:       le.config.Lock.Identity(),
        LeaseDurationSeconds: int(le.config.LeaseDuration / time.Second),
        RenewTime:            now,
        AcquireTime:          now,
    }

    // 1. fast path for the leader to update optimistically assuming that the record observed
    // last time is the current version.
    if le.IsLeader() && le.isLeaseValid(now.Time) {
        oldObservedRecord := le.getObservedRecord()
        leaderElectionRecord.AcquireTime = oldObservedRecord.AcquireTime
        leaderElectionRecord.LeaderTransitions = oldObservedRecord.LeaderTransitions

        err := le.config.Lock.Update(ctx, leaderElectionRecord)
        if err == nil {
            le.setObservedRecord(&leaderElectionRecord)
            return true
        }
        klog.Errorf("Failed to update lock optimistically: %v, falling back to slow path", err)
    }

    // 2. obtain or create the ElectionRecord
    oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get(ctx)
    if err != nil {
        if !errors.IsNotFound(err) {
            klog.Errorf("error retrieving resource lock %v: %v", le.config.Lock.Describe(), err)
            return false
        }
        if err = le.config.Lock.Create(ctx, leaderElectionRecord); err != nil {
            klog.Errorf("error initially creating leader election record: %v", err)
            return false
        }

        le.setObservedRecord(&leaderElectionRecord)

        return true
    }

    // 3. Record obtained, check the Identity & Time
    if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) {
        le.setObservedRecord(oldLeaderElectionRecord)

        le.observedRawRecord = oldLeaderElectionRawRecord
    }
    if len(oldLeaderElectionRecord.HolderIdentity) > 0 && le.isLeaseValid(now.Time) && !le.IsLeader() {
        klog.V(4).Infof("lock is held by %v and has not yet expired", oldLeaderElectionRecord.HolderIdentity)
        return false
    }

    // 4. We're going to try to update. The leaderElectionRecord is set to its default
    // here. Let's correct it before updating.
    if le.IsLeader() {
        leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime
        leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions
        le.metrics.slowpathExercised(le.config.Name)
    } else {
        leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1
    }

    // update the lock itself
    if err = le.config.Lock.Update(ctx, leaderElectionRecord); err != nil {
        klog.Errorf("Failed to update lock: %v", err)
        return false
    }

    le.setObservedRecord(&leaderElectionRecord)
    return true
}

func (le *LeaderElector) maybeReportTransition() {
    if le.observedRecord.HolderIdentity == le.reportedLeader {
        return
    }
    le.reportedLeader = le.observedRecord.HolderIdentity
    if le.config.Callbacks.OnNewLeader != nil {
        go le.config.Callbacks.OnNewLeader(le.reportedLeader)
    }
}
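
// A hedged sketch of surfacing the health check below through the WatchDog
// adaptor. Only NewLeaderHealthzAdaptor and the LeaderElectionConfig.WatchDog
// field come from this package; lec and the HTTP wiring are assumptions made
// for illustration.
//
//    adaptor := NewLeaderHealthzAdaptor(20 * time.Second) // maxTolerableExpiredLease
//    lec.WatchDog = adaptor                               // RunOrDie calls SetLeaderElection for us
//    mux := http.NewServeMux()
//    mux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
//        if err := adaptor.Check(r); err != nil {
//            http.Error(w, err.Error(), http.StatusInternalServerError)
//            return
//        }
//        w.Write([]byte("ok"))
//    })
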
// Check will determine if the current lease is expired by more than timeout.
func (le *LeaderElector) Check(maxTolerableExpiredLease time.Duration) error {
    if !le.IsLeader() {
        // Currently not concerned with the case that we are hot standby
        return nil
    }
    // If the lease has been expired, relative to our last observation, for more
    // than maxTolerableExpiredLease, start reporting ourselves as unhealthy. We
    // should have died by then, but conditions like deadlock can prevent this.
    // (See #70819)
    if le.clock.Since(le.observedTime) > le.config.LeaseDuration+maxTolerableExpiredLease {
        return fmt.Errorf("failed election to renew leadership on lease %s", le.config.Name)
    }

    return nil
}

func (le *LeaderElector) isLeaseValid(now time.Time) bool {
    return le.observedTime.Add(time.Second * time.Duration(le.getObservedRecord().LeaseDurationSeconds)).After(now)
}

// setObservedRecord will set a new observedRecord and update observedTime to the current time.
// The critical section is protected by observedRecordLock.
func (le *LeaderElector) setObservedRecord(observedRecord *rl.LeaderElectionRecord) {
    le.observedRecordLock.Lock()
    defer le.observedRecordLock.Unlock()

    le.observedRecord = *observedRecord
    le.observedTime = le.clock.Now()
}

// getObservedRecord returns the observed record.
// The critical section is protected by observedRecordLock.
func (le *LeaderElector) getObservedRecord() rl.LeaderElectionRecord {
    le.observedRecordLock.Lock()
    defer le.observedRecordLock.Unlock()

    return le.observedRecord
}