     5  // Package leaderelection implements leader election of a set of endpoints.
     6  // It uses an annotation in the endpoints object to store the record of the
     7  // election state. This implementation does not guarantee that only one
     8  // client is acting as a leader (a.k.a. fencing).
     9  //
    10  // A client only acts on timestamps captured locally to infer the state of the
    11  // leader election. The client does not consider timestamps in the leader
    12  // election record to be accurate because these timestamps may not have been
    13  // produced by a local clock. The implementation does not depend on their
    14  // accuracy and only uses their change to indicate that another client has
    15  // renewed the leader lease. Thus, the implementation is tolerant to arbitrary
    16  // clock skew, but is not tolerant to arbitrary clock skew rate.
    17  //
    18  // However, the level of tolerance to skew rate can be configured by setting
    19  // RenewDeadline and LeaseDuration appropriately. The tolerance expressed as a
    20  // maximum tolerated ratio of time passed on the fastest node to time passed on
    21  // the slowest node can be approximately achieved with a configuration that sets
    22  // the same ratio of LeaseDuration to RenewDeadline. For example if a user wanted
    23  // to tolerate some nodes progressing forward in time twice as fast as other nodes,
    24  // the user could set LeaseDuration to 60 seconds and RenewDeadline to 30 seconds.
    25  //
    26  // While not required, some method of clock synchronization between nodes in the
    27  // cluster is highly recommended. It's important to keep in mind when configuring
    28  // this client that the tolerance to skew rate varies inversely to master
    29  // availability.
    30  //
    31  // Larger clusters often have a more lenient SLA for API latency. This should be
    32  // taken into account when configuring the client. The rate of leader transitions
    33  // should be monitored and RetryPeriod and LeaseDuration should be increased
    34  // until the rate is stable and acceptably low. It's important to keep in mind
    35  // when configuring this client that the tolerance to API latency varies inversely
    36  // to master availability.
    37  package leaderelection
    39  import (
    40  	"bytes"
    41  	"context"
    42  	"fmt"
    43  	"log"
    44  	"sync"
    45  	"time"
    47  	""
    48  	time_ ""
    49  )
    51  const (
    52  	JitterFactor        = 1.2
    53  	EventBecameLeader   = "became leader"
    54  	EventStoppedLeading = "stopped leading"
    55  )
    57  // NewLeaderElector creates a LeaderElector from a LeaderElectionConfig
    58  func NewLeaderElector(lec Config) (*LeaderElector, error) {
    59  	if lec.LeaseDuration <= lec.RenewTimeout {
    60  		return nil, fmt.Errorf("leaseDuration must be greater than renewDeadline")
    61  	}
    62  	if lec.RenewTimeout <= time.Duration(JitterFactor*float64(lec.RetryPeriod)) {
    63  		lec.RenewTimeout = time.Duration(JitterFactor * float64(lec.RetryPeriod))
    64  		return nil, fmt.Errorf("renewDeadline must be greater than retryPeriod*JitterFactor")
    65  	}
    66  	if lec.LeaseDuration < 1 {
    67  		return nil, fmt.Errorf("leaseDuration must be greater than zero")
    68  	}
    69  	if lec.RenewTimeout < 1 {
    70  		return nil, fmt.Errorf("renewDeadline must be greater than zero")
    71  	}
    72  	if lec.RetryPeriod < 1 {
    73  		return nil, fmt.Errorf("retryPeriod must be greater than zero")
    74  	}
    76  	if lec.Lock == nil {
    77  		return nil, fmt.Errorf("lock must not be nil")
    78  	}
    79  	le := LeaderElector{
    80  		config: lec,
    81  	}
    82  	le.config.Lock.RecordEvent(le.config.Name, EventStoppedLeading)
    84  	return &le, nil
    85  }
    87  // LeaderElector is a leader election client.
    88  type LeaderElector struct {
    89  	// ErrorLog specifies an optional logger for errors accepting
    90  	// connections, unexpected behavior from handlers, and
    91  	// underlying FileSystem errors.
    92  	// If nil, logging is done via the log package's standard logger.
    93  	ErrorLog *log.Logger
    95  	config Config
    96  	// internal bookkeeping
    97  	observedRecord    Record
    98  	observedRawRecord []byte
    99  	observedTime      time.Time // Time when setObservedRecord is called
   100  	// used to implement OnNewLeader(), may lag slightly from the
   101  	// value observedRecord.HolderIdentity if the transition has
   102  	// not yet been reported.
   103  	reportedLeader string
   105  	// used to lock the observedRecord
   106  	observedRecordLock sync.Mutex
   107  }
   109  func (le *LeaderElector) logf(format string, args ...any) {
   110  	if le.ErrorLog != nil {
   111  		le.ErrorLog.Printf(format, args...)
   112  	} else {
   113  		log.Printf(format, args...)
   114  	}
   115  }
   117  // Run starts the leader election loop. Run will not return
   118  // before leader election loop is stopped by ctx, or it has
   119  // stopped holding the leader lease
   120  func (le *LeaderElector) Run(ctx context.Context) {
   121  	defer runtime.DefaultPanic.Recover()
   122  	if le.config.Callbacks.OnStoppedLeading != nil {
   123  		defer func() {
   124  			le.config.Callbacks.OnStoppedLeading()
   125  		}()
   126  	}
   128  	// wait until we are a leader
   129  	if !le.acquire(ctx) {
   130  		return // ctx signalled done
   131  	}
   132  	ctx, cancel := context.WithCancel(ctx)
   133  	defer cancel()
   134  	if le.config.Callbacks.OnStartedLeading != nil {
   135  		go le.config.Callbacks.OnStartedLeading(ctx)
   136  	}
   137  	le.renew(ctx)
   138  }
   140  // Run starts a client with the provided config or failed if the config
   141  // fails to validate. RunOrDie blocks until leader election loop is
   142  // stopped by ctx or it has stopped holding the leader lease
   143  func Run(ctx context.Context, lec Config) (*LeaderElector, error) {
   144  	le, err := NewLeaderElector(lec)
   145  	if err != nil {
   146  		return nil, err
   147  	}
   148  	le.Run(ctx)
   149  	return nil, err
   150  }
   152  // GetLeader returns the identity of the last observed leader or returns the empty string if
   153  // no leader has yet been observed.
   154  // This function is for informational purposes. (e.g. monitoring, logs, etc.)
   155  func (le *LeaderElector) GetLeader() string {
   156  	return le.getObservedRecord().HolderIdentity
   157  }
   159  // IsLeader returns true if the last observed leader was this client else returns false.
   160  func (le *LeaderElector) IsLeader() bool {
   161  	return le.getObservedRecord().HolderIdentity == le.config.Lock.Identity()
   162  }
   164  // acquire loops calling tryAcquireOrRenew and returns true immediately when tryAcquireOrRenew succeeds.
   165  // Returns false if ctx signals done.
   166  func (le *LeaderElector) acquire(ctx context.Context) bool {
   167  	ctx, cancel := context.WithCancel(ctx)
   168  	defer cancel()
   169  	succeeded := false
   170  	desc := le.config.Lock.Describe()
   171  	le.logf("attempting to acquire leader lease %v...", desc)
   172  	time_.JitterUntil(ctx, func(ctx context.Context) {
   173  		// try lock
   174  		succeeded = le.tryAcquireOrRenew(ctx)
   175  		le.maybeReportTransition()
   176  		if !succeeded {
   177  			le.logf("failed to acquire lease %v", desc)
   178  			return
   179  		}
   180  		le.config.Lock.RecordEvent(le.config.Name, EventBecameLeader)
   181  		le.logf("successfully acquired lease %v", desc)
   182  		cancel()
   183  	}, true, time_.WithExponentialBackOffOptionRandomizationFactor(JitterFactor))
   184  	return succeeded
   185  }
   187  // renew loops calling tryAcquireOrRenew and returns immediately when tryAcquireOrRenew fails or ctx signals done.
   188  func (le *LeaderElector) renew(ctx context.Context) {
   189  	ctx, cancel := context.WithCancel(ctx)
   190  	defer cancel()
   191  	time_.Until(ctx, func(ctx context.Context) {
   192  		timeoutCtx, timeoutCancel := context.WithTimeout(ctx, le.config.RenewTimeout)
   193  		defer timeoutCancel()
   194  		var leader bool
   195  		// block until leader elector finished
   196  		time_.Until(timeoutCtx, func(ctx context.Context) {
   197  			if le.tryAcquireOrRenew(timeoutCtx) {
   198  				leader = true
   199  				timeoutCancel()
   200  			}
   201  		}, le.config.RenewTimeout)
   202  		// leader elector finished
   204  		// maybe report leader changed
   205  		le.maybeReportTransition()
   206  		desc := le.config.Lock.Describe()
   207  		if leader {
   208  			// I'm a leader now
   209  			le.logf("successfully renewed lease %v", desc)
   210  			return
   211  		}
   212  		// I'm a follower now
   213  		le.config.Lock.RecordEvent(le.config.Name, EventStoppedLeading)
   214  		le.logf("failed to renew lease %v: %v", desc, timeoutCtx.Err())
   215  		cancel()
   217  	}, le.config.RetryPeriod)
   219  	// if we hold the lease, give it up
   220  	// unlock if I'm a leader
   221  	if le.config.ReleaseOnCancel {
   222  		le.release()
   223  	}
   224  }
   226  // release attempts to release the leader lease if we have acquired it.
   227  // unlock is we hold this lock
   228  func (le *LeaderElector) release() bool {
   229  	if !le.IsLeader() {
   230  		return true
   231  	}
   232  	now := time.Now()
   233  	leaderElectionRecord := Record{
   234  		LeaderTransitions: le.observedRecord.LeaderTransitions,
   235  		LeaseDuration:     time.Second,
   236  		RenewTime:         now,
   237  		AcquireTime:       now,
   238  	}
   239  	if err := le.config.Lock.Update(context.TODO(), leaderElectionRecord); err != nil {
   240  		le.logf("Failed to release lock: %v", err)
   241  		return false
   242  	}
   244  	le.setObservedRecord(&leaderElectionRecord)
   245  	return true
   246  }
   248  // tryAcquireOrRenew tries to acquire a leader lease if it is not already acquired,
   249  // else it tries to renew the lease if it has already been acquired. Returns true
   250  // on success else returns false.
   251  func (le *LeaderElector) tryAcquireOrRenew(ctx context.Context) bool {
   252  	now := time.Now()
   253  	leaderElectionRecord := Record{
   254  		HolderIdentity: le.config.Lock.Identity(),
   255  		LeaseDuration:  le.config.LeaseDuration,
   256  		RenewTime:      now,
   257  		AcquireTime:    now,
   258  	}
   260  	// 1. obtain or create the ElectionRecord
   261  	oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get(ctx)
   262  	if err != nil {
   263  		// Lock, try to lock as I'm a leader
   264  		if err = le.config.Lock.Create(ctx, leaderElectionRecord); err != nil {
   265  			le.logf("error initially creating leader election record: %v", err)
   266  			return false
   267  		}
   269  		le.setObservedRecord(&leaderElectionRecord)
   270  		return true
   271  	}
   272  	// renew
   274  	// 2. Record obtained, check the Identity & Time
   275  	if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) {
   276  		le.setObservedRecord(oldLeaderElectionRecord)
   277  		le.observedRawRecord = oldLeaderElectionRawRecord
   278  	}
   280  	if len(oldLeaderElectionRecord.HolderIdentity) > 0 &&
   281  		!le.observedRecordExpired(now) && !le.IsLeader() {
   282  		le.logf("lock is held by %v and has not yet expired", oldLeaderElectionRecord.HolderIdentity)
   283  		// return as a follower
   284  		return false
   285  	}
   287  	// 3. We're going to try to update. The leaderElectionRecord is set to it's default
   288  	// here. Let's correct it before updating.
   289  	if le.IsLeader() {
   290  		// refresh the locker by leader self
   291  		// relock, the lock is inherited, so AcquireTime is kept
   292  		// refresh the lock by RenewTime refreshed
   293  		leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime
   294  		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions
   295  	} else {
   296  		// try to lock as a leader
   297  		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1
   298  	}
   300  	// update the lock as a leader
   301  	if err = le.config.Lock.Update(ctx, leaderElectionRecord); err != nil {
   302  		le.logf("Failed to update lock: %v", err)
   303  		return false
   304  	}
   306  	le.setObservedRecord(&leaderElectionRecord)
   307  	return true
   308  }
   310  // maybeReportTransition call OnNewLeader when the client observes a leader that is
   311  //
   312  //	not the previously observed leader.
   313  func (le *LeaderElector) maybeReportTransition() {
   314  	if le.observedRecord.HolderIdentity == le.reportedLeader {
   315  		return
   316  	}
   317  	le.reportedLeader = le.observedRecord.HolderIdentity
   318  	if le.config.Callbacks.OnNewLeader != nil {
   319  		go le.config.Callbacks.OnNewLeader(le.reportedLeader)
   320  	}
   321  }
   323  // Check will determine if the current lease is expired by more than timeout.
   324  func (le *LeaderElector) Check(maxTolerableExpiredLease time.Duration) error {
   325  	if !le.IsLeader() {
   326  		// Currently, not concerned with the case that we are hot standby
   327  		return nil
   328  	}
   329  	// If we are more than timeout seconds after the lease duration that is past the timeout
   330  	// on the lease renew. Time to start reporting ourselves as unhealthy. We should have
   331  	// died but conditions like deadlock can prevent this. (See #70819)
   332  	if time.Since(le.observedTime) > le.config.LeaseDuration+maxTolerableExpiredLease {
   333  		return fmt.Errorf("failed election to renew leadership on lease %s", le.config.Name)
   334  	}
   336  	return nil
   337  }
   339  // setObservedRecord will set a new observedRecord and update observedTime to the current time.
   340  // Protect critical sections with lock.
   341  func (le *LeaderElector) setObservedRecord(observedRecord *Record) {
   342  	le.observedRecordLock.Lock()
   343  	defer le.observedRecordLock.Unlock()
   345  	le.observedRecord = *observedRecord
   346  	le.observedTime = time.Now()
   347  }
   349  // getObservedRecord returns observersRecord.
   350  // Protect critical sections with lock.
   351  func (le *LeaderElector) getObservedRecord() Record {
   352  	le.observedRecordLock.Lock()
   353  	defer le.observedRecordLock.Unlock()
   355  	return le.observedRecord
   356  }
   358  // observedRecordExpired returns true if observersRecord expired.
   359  // Protect critical sections with lock.
   360  func (le *LeaderElector) observedRecordExpired(now time.Time) bool {
   361  	return le.observedTime.Add(le.config.LeaseDuration).After(now)
   362  }