k8s.io/client-go@v0.31.1/tools/leaderelection/leaderelection.go (about)

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Package leaderelection implements leader election of a set of endpoints.
    18  // It uses an annotation in the endpoints object to store the record of the
    19  // election state. This implementation does not guarantee that only one
    20  // client is acting as a leader (a.k.a. fencing).
    21  //
    22  // A client only acts on timestamps captured locally to infer the state of the
    23  // leader election. The client does not consider timestamps in the leader
    24  // election record to be accurate because these timestamps may not have been
    25  // produced by a local clock. The implementation does not depend on their
    26  // accuracy and only uses their change to indicate that another client has
    27  // renewed the leader lease. Thus the implementation is tolerant to arbitrary
    28  // clock skew, but is not tolerant to arbitrary clock skew rate.
    29  //
    30  // However the level of tolerance to skew rate can be configured by setting
    31  // RenewDeadline and LeaseDuration appropriately. The tolerance expressed as a
    32  // maximum tolerated ratio of time passed on the fastest node to time passed on
    33  // the slowest node can be approximately achieved with a configuration that sets
    34  // the same ratio of LeaseDuration to RenewDeadline. For example if a user wanted
    35  // to tolerate some nodes progressing forward in time twice as fast as other nodes,
    36  // the user could set LeaseDuration to 60 seconds and RenewDeadline to 30 seconds.
    37  //
    38  // While not required, some method of clock synchronization between nodes in the
    39  // cluster is highly recommended. It's important to keep in mind when configuring
    40  // this client that the tolerance to skew rate varies inversely to master
    41  // availability.
    42  //
    43  // Larger clusters often have a more lenient SLA for API latency. This should be
    44  // taken into account when configuring the client. The rate of leader transitions
    45  // should be monitored and RetryPeriod and LeaseDuration should be increased
    46  // until the rate is stable and acceptably low. It's important to keep in mind
    47  // when configuring this client that the tolerance to API latency varies inversely
    48  // to master availability.
    49  //
    50  // DISCLAIMER: this is an alpha API. This library will likely change significantly
    51  // or even be removed entirely in subsequent releases. Depend on this API at
    52  // your own risk.
    53  package leaderelection
    54  
    55  import (
    56  	"bytes"
    57  	"context"
    58  	"fmt"
    59  	"sync"
    60  	"time"
    61  
    62  	"k8s.io/apimachinery/pkg/api/errors"
    63  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    64  	"k8s.io/apimachinery/pkg/util/runtime"
    65  	"k8s.io/apimachinery/pkg/util/wait"
    66  	rl "k8s.io/client-go/tools/leaderelection/resourcelock"
    67  	"k8s.io/klog/v2"
    68  	"k8s.io/utils/clock"
    69  )
    70  
    71  const (
    72  	JitterFactor = 1.2
    73  )
    74  
    75  // NewLeaderElector creates a LeaderElector from a LeaderElectionConfig
    76  func NewLeaderElector(lec LeaderElectionConfig) (*LeaderElector, error) {
    77  	if lec.LeaseDuration <= lec.RenewDeadline {
    78  		return nil, fmt.Errorf("leaseDuration must be greater than renewDeadline")
    79  	}
    80  	if lec.RenewDeadline <= time.Duration(JitterFactor*float64(lec.RetryPeriod)) {
    81  		return nil, fmt.Errorf("renewDeadline must be greater than retryPeriod*JitterFactor")
    82  	}
    83  	if lec.LeaseDuration < 1 {
    84  		return nil, fmt.Errorf("leaseDuration must be greater than zero")
    85  	}
    86  	if lec.RenewDeadline < 1 {
    87  		return nil, fmt.Errorf("renewDeadline must be greater than zero")
    88  	}
    89  	if lec.RetryPeriod < 1 {
    90  		return nil, fmt.Errorf("retryPeriod must be greater than zero")
    91  	}
    92  	if lec.Callbacks.OnStartedLeading == nil {
    93  		return nil, fmt.Errorf("OnStartedLeading callback must not be nil")
    94  	}
    95  	if lec.Callbacks.OnStoppedLeading == nil {
    96  		return nil, fmt.Errorf("OnStoppedLeading callback must not be nil")
    97  	}
    98  
    99  	if lec.Lock == nil {
   100  		return nil, fmt.Errorf("Lock must not be nil.")
   101  	}
   102  	id := lec.Lock.Identity()
   103  	if id == "" {
   104  		return nil, fmt.Errorf("Lock identity is empty")
   105  	}
   106  
   107  	le := LeaderElector{
   108  		config:  lec,
   109  		clock:   clock.RealClock{},
   110  		metrics: globalMetricsFactory.newLeaderMetrics(),
   111  	}
   112  	le.metrics.leaderOff(le.config.Name)
   113  	return &le, nil
   114  }
   115  
// LeaderElectionConfig holds the settings for a LeaderElector. It is
// validated by NewLeaderElector, which requires
// LeaseDuration > RenewDeadline > RetryPeriod*JitterFactor, all three
// durations to be greater than zero, a non-nil Lock with a non-empty
// identity, and the OnStartedLeading/OnStoppedLeading callbacks to be set.
type LeaderElectionConfig struct {
	// Lock is the resource that will be used for locking
	Lock rl.Interface

	// LeaseDuration is the duration that non-leader candidates will
	// wait to force acquire leadership. This is measured against time of
	// last observed ack.
	//
	// A client needs to wait a full LeaseDuration without observing a change to
	// the record before it can attempt to take over. When all clients are
	// shutdown and a new set of clients are started with different names against
	// the same leader record, they must wait the full LeaseDuration before
	// attempting to acquire the lease. Thus LeaseDuration should be as short as
	// possible (within your tolerance for clock skew rate) to avoid possibly
	// long waits in that scenario.
	//
	// The value written to the election record is truncated to whole seconds.
	//
	// Core clients default this value to 15 seconds.
	LeaseDuration time.Duration
	// RenewDeadline is the duration that the acting master will retry
	// refreshing leadership before giving up.
	//
	// Core clients default this value to 10 seconds.
	RenewDeadline time.Duration
	// RetryPeriod is the duration the LeaderElector clients should wait
	// between tries of actions.
	//
	// Core clients default this value to 2 seconds.
	RetryPeriod time.Duration

	// Callbacks are callbacks that are triggered during certain lifecycle
	// events of the LeaderElector
	Callbacks LeaderCallbacks

	// WatchDog is the associated health checker
	// WatchDog may be nil if it's not needed/configured.
	// RunOrDie registers the created LeaderElector with it when it is set.
	WatchDog *HealthzAdaptor

	// ReleaseOnCancel should be set true if the lock should be released
	// when the run context is cancelled. If you set this to true, you must
	// ensure all code guarded by this lease has successfully completed
	// prior to cancelling the context, or you may have two processes
	// simultaneously acting on the critical path.
	ReleaseOnCancel bool

	// Name is the name of the resource lock for debugging. It is also the
	// name passed to the leader metrics and included in the error returned
	// by Check.
	Name string

	// Coordinated will use the Coordinated Leader Election feature
	// WARNING: Coordinated leader election is ALPHA.
	Coordinated bool
}
   167  
// LeaderCallbacks are callbacks that are triggered during certain
// lifecycle events of the LeaderElector. OnStartedLeading and OnNewLeader
// are each invoked in their own goroutine; OnStoppedLeading is invoked
// synchronously (deferred) when Run returns.
//
// possible future callbacks:
//   - OnChallenge()
type LeaderCallbacks struct {
	// OnStartedLeading is called when a LeaderElector client starts leading.
	// The context it receives is cancelled when Run returns.
	OnStartedLeading func(context.Context)
	// OnStoppedLeading is called when a LeaderElector client stops leading.
	// Run defers it before trying to acquire the lease, so it is also called
	// when Run returns without ever having become leader.
	OnStoppedLeading func()
	// OnNewLeader is called when the client observes a leader that is
	// not the previously observed leader. This includes the first observed
	// leader when the client starts. It may be left nil.
	OnNewLeader func(identity string)
}
   183  
// LeaderElector is a leader election client. Create one with
// NewLeaderElector and drive it with Run.
type LeaderElector struct {
	// config is the configuration the elector was created with.
	config LeaderElectionConfig
	// internal bookkeeping: the last election record seen or written by this
	// client, the raw bytes of the last record fetched from the lock (used to
	// detect changes), and the local time at which observedRecord was last set.
	observedRecord    rl.LeaderElectionRecord
	observedRawRecord []byte
	observedTime      time.Time
	// used to implement OnNewLeader(), may lag slightly from the
	// value observedRecord.HolderIdentity if the transition has
	// not yet been reported.
	reportedLeader string

	// clock is wrapper around time to allow for less flaky testing
	clock clock.Clock

	// used to lock the observedRecord; setObservedRecord also updates
	// observedTime while holding it.
	observedRecordLock sync.Mutex

	// metrics records leader on/off state and slow-path renewals.
	metrics leaderMetricsAdapter
}
   204  
   205  // Run starts the leader election loop. Run will not return
   206  // before leader election loop is stopped by ctx or it has
   207  // stopped holding the leader lease
   208  func (le *LeaderElector) Run(ctx context.Context) {
   209  	defer runtime.HandleCrash()
   210  	defer le.config.Callbacks.OnStoppedLeading()
   211  
   212  	if !le.acquire(ctx) {
   213  		return // ctx signalled done
   214  	}
   215  	ctx, cancel := context.WithCancel(ctx)
   216  	defer cancel()
   217  	go le.config.Callbacks.OnStartedLeading(ctx)
   218  	le.renew(ctx)
   219  }
   220  
   221  // RunOrDie starts a client with the provided config or panics if the config
   222  // fails to validate. RunOrDie blocks until leader election loop is
   223  // stopped by ctx or it has stopped holding the leader lease
   224  func RunOrDie(ctx context.Context, lec LeaderElectionConfig) {
   225  	le, err := NewLeaderElector(lec)
   226  	if err != nil {
   227  		panic(err)
   228  	}
   229  	if lec.WatchDog != nil {
   230  		lec.WatchDog.SetLeaderElection(le)
   231  	}
   232  	le.Run(ctx)
   233  }
   234  
   235  // GetLeader returns the identity of the last observed leader or returns the empty string if
   236  // no leader has yet been observed.
   237  // This function is for informational purposes. (e.g. monitoring, logs, etc.)
   238  func (le *LeaderElector) GetLeader() string {
   239  	return le.getObservedRecord().HolderIdentity
   240  }
   241  
   242  // IsLeader returns true if the last observed leader was this client else returns false.
   243  func (le *LeaderElector) IsLeader() bool {
   244  	return le.getObservedRecord().HolderIdentity == le.config.Lock.Identity()
   245  }
   246  
   247  // acquire loops calling tryAcquireOrRenew and returns true immediately when tryAcquireOrRenew succeeds.
   248  // Returns false if ctx signals done.
   249  func (le *LeaderElector) acquire(ctx context.Context) bool {
   250  	ctx, cancel := context.WithCancel(ctx)
   251  	defer cancel()
   252  	succeeded := false
   253  	desc := le.config.Lock.Describe()
   254  	klog.Infof("attempting to acquire leader lease %v...", desc)
   255  	wait.JitterUntil(func() {
   256  		if !le.config.Coordinated {
   257  			succeeded = le.tryAcquireOrRenew(ctx)
   258  		} else {
   259  			succeeded = le.tryCoordinatedRenew(ctx)
   260  		}
   261  		le.maybeReportTransition()
   262  		if !succeeded {
   263  			klog.V(4).Infof("failed to acquire lease %v", desc)
   264  			return
   265  		}
   266  		le.config.Lock.RecordEvent("became leader")
   267  		le.metrics.leaderOn(le.config.Name)
   268  		klog.Infof("successfully acquired lease %v", desc)
   269  		cancel()
   270  	}, le.config.RetryPeriod, JitterFactor, true, ctx.Done())
   271  	return succeeded
   272  }
   273  
   274  // renew loops calling tryAcquireOrRenew and returns immediately when tryAcquireOrRenew fails or ctx signals done.
   275  func (le *LeaderElector) renew(ctx context.Context) {
   276  	defer le.config.Lock.RecordEvent("stopped leading")
   277  	ctx, cancel := context.WithCancel(ctx)
   278  	defer cancel()
   279  	wait.Until(func() {
   280  		timeoutCtx, timeoutCancel := context.WithTimeout(ctx, le.config.RenewDeadline)
   281  		defer timeoutCancel()
   282  		err := wait.PollImmediateUntil(le.config.RetryPeriod, func() (bool, error) {
   283  			if !le.config.Coordinated {
   284  				return le.tryAcquireOrRenew(timeoutCtx), nil
   285  			} else {
   286  				return le.tryCoordinatedRenew(timeoutCtx), nil
   287  			}
   288  		}, timeoutCtx.Done())
   289  
   290  		le.maybeReportTransition()
   291  		desc := le.config.Lock.Describe()
   292  		if err == nil {
   293  			klog.V(5).Infof("successfully renewed lease %v", desc)
   294  			return
   295  		}
   296  		le.metrics.leaderOff(le.config.Name)
   297  		klog.Infof("failed to renew lease %v: %v", desc, err)
   298  		cancel()
   299  	}, le.config.RetryPeriod, ctx.Done())
   300  
   301  	// if we hold the lease, give it up
   302  	if le.config.ReleaseOnCancel {
   303  		le.release()
   304  	}
   305  }
   306  
   307  // release attempts to release the leader lease if we have acquired it.
   308  func (le *LeaderElector) release() bool {
   309  	if !le.IsLeader() {
   310  		return true
   311  	}
   312  	now := metav1.NewTime(le.clock.Now())
   313  	leaderElectionRecord := rl.LeaderElectionRecord{
   314  		LeaderTransitions:    le.observedRecord.LeaderTransitions,
   315  		LeaseDurationSeconds: 1,
   316  		RenewTime:            now,
   317  		AcquireTime:          now,
   318  	}
   319  	timeoutCtx, timeoutCancel := context.WithTimeout(context.Background(), le.config.RenewDeadline)
   320  	defer timeoutCancel()
   321  	if err := le.config.Lock.Update(timeoutCtx, leaderElectionRecord); err != nil {
   322  		klog.Errorf("Failed to release lock: %v", err)
   323  		return false
   324  	}
   325  
   326  	le.setObservedRecord(&leaderElectionRecord)
   327  	return true
   328  }
   329  
   330  // tryCoordinatedRenew checks if it acquired a lease and tries to renew the
   331  // lease if it has already been acquired. Returns true on success else returns
   332  // false.
   333  func (le *LeaderElector) tryCoordinatedRenew(ctx context.Context) bool {
   334  	now := metav1.NewTime(le.clock.Now())
   335  	leaderElectionRecord := rl.LeaderElectionRecord{
   336  		HolderIdentity:       le.config.Lock.Identity(),
   337  		LeaseDurationSeconds: int(le.config.LeaseDuration / time.Second),
   338  		RenewTime:            now,
   339  		AcquireTime:          now,
   340  	}
   341  
   342  	// 1. obtain the electionRecord
   343  	oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get(ctx)
   344  	if err != nil {
   345  		if !errors.IsNotFound(err) {
   346  			klog.Errorf("error retrieving resource lock %v: %v", le.config.Lock.Describe(), err)
   347  			return false
   348  		}
   349  		klog.Infof("lease lock not found: %v", le.config.Lock.Describe())
   350  		return false
   351  	}
   352  
   353  	// 2. Record obtained, check the Identity & Time
   354  	if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) {
   355  		le.setObservedRecord(oldLeaderElectionRecord)
   356  
   357  		le.observedRawRecord = oldLeaderElectionRawRecord
   358  	}
   359  
   360  	hasExpired := le.observedTime.Add(time.Second * time.Duration(oldLeaderElectionRecord.LeaseDurationSeconds)).Before(now.Time)
   361  	if hasExpired {
   362  		klog.Infof("lock has expired: %v", le.config.Lock.Describe())
   363  		return false
   364  	}
   365  
   366  	if !le.IsLeader() {
   367  		klog.V(6).Infof("lock is held by %v and has not yet expired: %v", oldLeaderElectionRecord.HolderIdentity, le.config.Lock.Describe())
   368  		return false
   369  	}
   370  
   371  	// 2b. If the lease has been marked as "end of term", don't renew it
   372  	if le.IsLeader() && oldLeaderElectionRecord.PreferredHolder != "" {
   373  		klog.V(4).Infof("lock is marked as 'end of term': %v", le.config.Lock.Describe())
   374  		// TODO: Instead of letting lease expire, the holder may deleted it directly
   375  		// This will not be compatible with all controllers, so it needs to be opt-in behavior.
   376  		// We must ensure all code guarded by this lease has successfully completed
   377  		// prior to releasing or there may be two processes
   378  		// simultaneously acting on the critical path.
   379  		// Usually once this returns false, the process is terminated..
   380  		// xref: OnStoppedLeading
   381  		return false
   382  	}
   383  
   384  	// 3. We're going to try to update. The leaderElectionRecord is set to it's default
   385  	// here. Let's correct it before updating.
   386  	if le.IsLeader() {
   387  		leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime
   388  		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions
   389  		leaderElectionRecord.Strategy = oldLeaderElectionRecord.Strategy
   390  		le.metrics.slowpathExercised(le.config.Name)
   391  	} else {
   392  		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1
   393  	}
   394  
   395  	// update the lock itself
   396  	if err = le.config.Lock.Update(ctx, leaderElectionRecord); err != nil {
   397  		klog.Errorf("Failed to update lock: %v", err)
   398  		return false
   399  	}
   400  
   401  	le.setObservedRecord(&leaderElectionRecord)
   402  	return true
   403  }
   404  
// tryAcquireOrRenew tries to acquire a leader lease if it is not already acquired,
// else it tries to renew the lease if it has already been acquired. Returns true
// on success else returns false.
//
// Every failure (lock read, create or update error, or a valid lease held by
// another identity) is logged and reported as false; no error is returned.
func (le *LeaderElector) tryAcquireOrRenew(ctx context.Context) bool {
	now := metav1.NewTime(le.clock.Now())
	// Candidate record, initialised as if this were a fresh acquisition; the
	// acquire time and transition count are corrected below when renewing.
	leaderElectionRecord := rl.LeaderElectionRecord{
		HolderIdentity:       le.config.Lock.Identity(),
		LeaseDurationSeconds: int(le.config.LeaseDuration / time.Second),
		RenewTime:            now,
		AcquireTime:          now,
	}

	// 1. fast path for the leader to update optimistically assuming that the record observed
	// last time is the current version.
	if le.IsLeader() && le.isLeaseValid(now.Time) {
		oldObservedRecord := le.getObservedRecord()
		leaderElectionRecord.AcquireTime = oldObservedRecord.AcquireTime
		leaderElectionRecord.LeaderTransitions = oldObservedRecord.LeaderTransitions

		err := le.config.Lock.Update(ctx, leaderElectionRecord)
		if err == nil {
			le.setObservedRecord(&leaderElectionRecord)
			return true
		}
		// Not fatal: fall through and re-read the lock before retrying.
		klog.Errorf("Failed to update lock optimitically: %v, falling back to slow path", err)
	}

	// 2. obtain or create the ElectionRecord
	oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get(ctx)
	if err != nil {
		if !errors.IsNotFound(err) {
			klog.Errorf("error retrieving resource lock %v: %v", le.config.Lock.Describe(), err)
			return false
		}
		// The lock does not exist yet: creating it makes this client the holder.
		if err = le.config.Lock.Create(ctx, leaderElectionRecord); err != nil {
			klog.Errorf("error initially creating leader election record: %v", err)
			return false
		}

		le.setObservedRecord(&leaderElectionRecord)

		return true
	}

	// 3. Record obtained, check the Identity & Time
	// setObservedRecord also resets observedTime, so the validity check below
	// is measured from the local time at which a change was last seen.
	if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) {
		le.setObservedRecord(oldLeaderElectionRecord)

		le.observedRawRecord = oldLeaderElectionRawRecord
	}
	// Back off while another identity holds a lease that is still valid.
	if len(oldLeaderElectionRecord.HolderIdentity) > 0 && le.isLeaseValid(now.Time) && !le.IsLeader() {
		klog.V(4).Infof("lock is held by %v and has not yet expired", oldLeaderElectionRecord.HolderIdentity)
		return false
	}

	// 4. We're going to try to update. The leaderElectionRecord is set to it's default
	// here. Let's correct it before updating.
	if le.IsLeader() {
		// Renewal: keep the original acquire time and transition count.
		leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime
		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions
		le.metrics.slowpathExercised(le.config.Name)
	} else {
		// Takeover: count a new leader transition.
		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1
	}

	// update the lock itself
	if err = le.config.Lock.Update(ctx, leaderElectionRecord); err != nil {
		klog.Errorf("Failed to update lock: %v", err)
		return false
	}

	le.setObservedRecord(&leaderElectionRecord)
	return true
}
   479  
   480  func (le *LeaderElector) maybeReportTransition() {
   481  	if le.observedRecord.HolderIdentity == le.reportedLeader {
   482  		return
   483  	}
   484  	le.reportedLeader = le.observedRecord.HolderIdentity
   485  	if le.config.Callbacks.OnNewLeader != nil {
   486  		go le.config.Callbacks.OnNewLeader(le.reportedLeader)
   487  	}
   488  }
   489  
   490  // Check will determine if the current lease is expired by more than timeout.
   491  func (le *LeaderElector) Check(maxTolerableExpiredLease time.Duration) error {
   492  	if !le.IsLeader() {
   493  		// Currently not concerned with the case that we are hot standby
   494  		return nil
   495  	}
   496  	// If we are more than timeout seconds after the lease duration that is past the timeout
   497  	// on the lease renew. Time to start reporting ourselves as unhealthy. We should have
   498  	// died but conditions like deadlock can prevent this. (See #70819)
   499  	if le.clock.Since(le.observedTime) > le.config.LeaseDuration+maxTolerableExpiredLease {
   500  		return fmt.Errorf("failed election to renew leadership on lease %s", le.config.Name)
   501  	}
   502  
   503  	return nil
   504  }
   505  
   506  func (le *LeaderElector) isLeaseValid(now time.Time) bool {
   507  	return le.observedTime.Add(time.Second * time.Duration(le.getObservedRecord().LeaseDurationSeconds)).After(now)
   508  }
   509  
   510  // setObservedRecord will set a new observedRecord and update observedTime to the current time.
   511  // Protect critical sections with lock.
   512  func (le *LeaderElector) setObservedRecord(observedRecord *rl.LeaderElectionRecord) {
   513  	le.observedRecordLock.Lock()
   514  	defer le.observedRecordLock.Unlock()
   515  
   516  	le.observedRecord = *observedRecord
   517  	le.observedTime = le.clock.Now()
   518  }
   519  
   520  // getObservedRecord returns observersRecord.
   521  // Protect critical sections with lock.
   522  func (le *LeaderElector) getObservedRecord() rl.LeaderElectionRecord {
   523  	le.observedRecordLock.Lock()
   524  	defer le.observedRecordLock.Unlock()
   525  
   526  	return le.observedRecord
   527  }