k8s.io/client-go@v0.22.2/tools/leaderelection/leaderelection.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Package leaderelection implements leader election of a set of endpoints.
// It uses an annotation in the endpoints object to store the record of the
// election state. This implementation does not guarantee that only one
// client is acting as a leader (a.k.a. fencing).
//
// A client only acts on timestamps captured locally to infer the state of the
// leader election. The client does not consider timestamps in the leader
// election record to be accurate because these timestamps may not have been
// produced by a local clock. The implementation does not depend on their
// accuracy and only uses their change to indicate that another client has
// renewed the leader lease. Thus the implementation is tolerant to arbitrary
// clock skew, but is not tolerant to arbitrary clock skew rate.
//
// However the level of tolerance to skew rate can be configured by setting
// RenewDeadline and LeaseDuration appropriately. The tolerance, expressed as a
// maximum tolerated ratio of time passed on the fastest node to time passed on
// the slowest node, can be approximately achieved with a configuration that sets
// the same ratio of LeaseDuration to RenewDeadline. For example, if a user wanted
// to tolerate some nodes progressing forward in time twice as fast as other nodes,
// the user could set LeaseDuration to 60 seconds and RenewDeadline to 30 seconds.
//
// While not required, some method of clock synchronization between nodes in the
// cluster is highly recommended. It's important to keep in mind when configuring
// this client that the tolerance to skew rate varies inversely to master
// availability.
//
// Larger clusters often have a more lenient SLA for API latency. This should be
// taken into account when configuring the client. The rate of leader transitions
// should be monitored and RetryPeriod and LeaseDuration should be increased
// until the rate is stable and acceptably low. It's important to keep in mind
// when configuring this client that the tolerance to API latency varies inversely
// to master availability.
//
// DISCLAIMER: this is an alpha API. This library will likely change significantly
// or even be removed entirely in subsequent releases. Depend on this API at
// your own risk.
package leaderelection

import (
	"bytes"
	"context"
	"fmt"
	"sync"
	"time"

	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/clock"
	"k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	rl "k8s.io/client-go/tools/leaderelection/resourcelock"

	"k8s.io/klog/v2"
)

const (
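	// JitterFactor is the multiplier applied to RetryPeriod when jittering
	// retry intervals (see the wait.JitterUntil call in acquire below).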
	JitterFactor = 1.2
)

// NewLeaderElector creates a LeaderElector from a LeaderElectionConfig.
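// The configuration must satisfy LeaseDuration > RenewDeadline >
// JitterFactor*RetryPeriod, with all three durations positive, and must
// provide a Lock plus OnStartedLeading/OnStoppedLeading callbacks; any
// violation is reported as an error.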
func NewLeaderElector(lec LeaderElectionConfig) (*LeaderElector, error) {
	if lec.LeaseDuration <= lec.RenewDeadline {
		return nil, fmt.Errorf("leaseDuration must be greater than renewDeadline")
	}
	if lec.RenewDeadline <= time.Duration(JitterFactor*float64(lec.RetryPeriod)) {
		return nil, fmt.Errorf("renewDeadline must be greater than retryPeriod*JitterFactor")
	}
	if lec.LeaseDuration < 1 {
		return nil, fmt.Errorf("leaseDuration must be greater than zero")
	}
	if lec.RenewDeadline < 1 {
		return nil, fmt.Errorf("renewDeadline must be greater than zero")
	}
	if lec.RetryPeriod < 1 {
		return nil, fmt.Errorf("retryPeriod must be greater than zero")
	}
	if lec.Callbacks.OnStartedLeading == nil {
		return nil, fmt.Errorf("OnStartedLeading callback must not be nil")
	}
	if lec.Callbacks.OnStoppedLeading == nil {
		return nil, fmt.Errorf("OnStoppedLeading callback must not be nil")
	}

	if lec.Lock == nil {
		return nil, fmt.Errorf("lock must not be nil")
	}
	le := LeaderElector{
		config:  lec,
		clock:   clock.RealClock{},
		metrics: globalMetricsFactory.newLeaderMetrics(),
	}
	le.metrics.leaderOff(le.config.Name)
	return &le, nil
}

type LeaderElectionConfig struct {
	// Lock is the resource that will be used for locking
	Lock rl.Interface

	// LeaseDuration is the duration that non-leader candidates will
	// wait to force acquire leadership. This is measured against time of
	// last observed ack.
	//
	// A client needs to wait a full LeaseDuration without observing a change to
	// the record before it can attempt to take over. When all clients are
	// shut down and a new set of clients are started with different names against
	// the same leader record, they must wait the full LeaseDuration before
	// attempting to acquire the lease. Thus LeaseDuration should be as short as
	// possible (within your tolerance for clock skew rate) to avoid possible
	// long waits in this scenario.
	//
	// Core clients default this value to 15 seconds.
	LeaseDuration time.Duration
	// RenewDeadline is the duration that the acting master will retry
	// refreshing leadership before giving up.
	//
	// Core clients default this value to 10 seconds.
	RenewDeadline time.Duration
	// RetryPeriod is the duration the LeaderElector clients should wait
	// between tries of actions.
	//
	// Core clients default this value to 2 seconds.
	RetryPeriod time.Duration

	// Callbacks are callbacks that are triggered during certain lifecycle
	// events of the LeaderElector
	Callbacks LeaderCallbacks

	// WatchDog is the associated health checker
	// WatchDog may be nil if it's not needed/configured.
	WatchDog *HealthzAdaptor

	// ReleaseOnCancel should be set true if the lock should be released
	// when the run context is cancelled. If you set this to true, you must
	// ensure all code guarded by this lease has successfully completed
	// prior to cancelling the context, or you may have two processes
	// simultaneously acting on the critical path.
	ReleaseOnCancel bool

	// Name is the name of the resource lock for debugging
	Name string
}
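
// As a rough illustration (not part of the original file), a configuration
// using the core-client defaults documented above might look like:
//
//	lec := leaderelection.LeaderElectionConfig{
//		Lock:            lock, // some rl.Interface, e.g. a *resourcelock.LeaseLock
//		LeaseDuration:   15 * time.Second,
//		RenewDeadline:   10 * time.Second,
//		RetryPeriod:     2 * time.Second,
//		ReleaseOnCancel: true,
//		Callbacks: leaderelection.LeaderCallbacks{
//			OnStartedLeading: func(ctx context.Context) { /* do leader work until ctx is done */ },
//			OnStoppedLeading: func() { /* step down cleanly */ },
//		},
//	}
//
// These values satisfy the NewLeaderElector invariants (15s > 10s > 1.2*2s).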

// LeaderCallbacks are callbacks that are triggered during certain
// lifecycle events of the LeaderElector. These are invoked asynchronously.
//
// possible future callbacks:
//  * OnChallenge()
type LeaderCallbacks struct {
	// OnStartedLeading is called when a LeaderElector client starts leading
	OnStartedLeading func(context.Context)
	// OnStoppedLeading is called when a LeaderElector client stops leading
	OnStoppedLeading func()
	// OnNewLeader is called when the client observes a leader that is
	// not the previously observed leader. This includes the first observed
	// leader when the client starts.
	OnNewLeader func(identity string)
}

// LeaderElector is a leader election client.
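//
// Construct it with NewLeaderElector. GetLeader and IsLeader are safe for
// concurrent use, since the observed record they read is guarded by
// observedRecordLock.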
type LeaderElector struct {
	config LeaderElectionConfig
	// internal bookkeeping
	observedRecord    rl.LeaderElectionRecord
	observedRawRecord []byte
	observedTime      time.Time
	// used to implement OnNewLeader(), may lag slightly from the
	// value observedRecord.HolderIdentity if the transition has
	// not yet been reported.
	reportedLeader string

	// clock is a wrapper around time to allow for less flaky testing
	clock clock.Clock

	// used to lock the observedRecord
	observedRecordLock sync.Mutex

	metrics leaderMetricsAdapter
}

// Run starts the leader election loop. Run will not return
// before the leader election loop is stopped by ctx or it has
// stopped holding the leader lease.
func (le *LeaderElector) Run(ctx context.Context) {
	defer runtime.HandleCrash()
	defer func() {
		le.config.Callbacks.OnStoppedLeading()
	}()

	if !le.acquire(ctx) {
		return // ctx signalled done
	}
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	go le.config.Callbacks.OnStartedLeading(ctx)
	le.renew(ctx)
}

// RunOrDie starts a client with the provided config or panics if the config
// fails to validate. RunOrDie blocks until the leader election loop is
// stopped by ctx or it has stopped holding the leader lease.
func RunOrDie(ctx context.Context, lec LeaderElectionConfig) {
	le, err := NewLeaderElector(lec)
	if err != nil {
		panic(err)
	}
	if lec.WatchDog != nil {
		lec.WatchDog.SetLeaderElection(le)
	}
	le.Run(ctx)
}
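
// A minimal end-to-end sketch (not part of the original file; the namespace,
// lease name, identity, and client below are illustrative, and resourcelock.New
// refers to the resourcelock package imported above as rl):
//
//	lock, err := resourcelock.New(resourcelock.LeasesResourceLock,
//		"kube-system", "my-controller", // namespace and lease name (assumed)
//		client.CoreV1(), client.CoordinationV1(),
//		resourcelock.ResourceLockConfig{Identity: id})
//	if err != nil {
//		// handle error
//	}
//	lec.Lock = lock // lec as sketched after LeaderElectionConfig above
//	leaderelection.RunOrDie(ctx, lec)
//
// When ctx is cancelled, RunOrDie returns after OnStoppedLeading has run.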

// GetLeader returns the identity of the last observed leader or returns the empty string if
// no leader has yet been observed.
// This function is for informational purposes. (e.g. monitoring, logs, etc.)
func (le *LeaderElector) GetLeader() string {
	return le.getObservedRecord().HolderIdentity
}

// IsLeader returns true if the last observed leader was this client else returns false.
func (le *LeaderElector) IsLeader() bool {
	return le.getObservedRecord().HolderIdentity == le.config.Lock.Identity()
}

// acquire loops calling tryAcquireOrRenew and returns true immediately when tryAcquireOrRenew succeeds.
// Returns false if ctx signals done.
func (le *LeaderElector) acquire(ctx context.Context) bool {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	succeeded := false
	desc := le.config.Lock.Describe()
	klog.Infof("attempting to acquire leader lease %v...", desc)
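	// Keep trying every RetryPeriod, with up to JitterFactor*RetryPeriod of
	// extra random delay per attempt, until acquisition succeeds (cancel is
	// called below) or ctx is done.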
	wait.JitterUntil(func() {
		succeeded = le.tryAcquireOrRenew(ctx)
		le.maybeReportTransition()
		if !succeeded {
			klog.V(4).Infof("failed to acquire lease %v", desc)
			return
		}
		le.config.Lock.RecordEvent("became leader")
		le.metrics.leaderOn(le.config.Name)
		klog.Infof("successfully acquired lease %v", desc)
		cancel()
	}, le.config.RetryPeriod, JitterFactor, true, ctx.Done())
	return succeeded
}

// renew loops calling tryAcquireOrRenew and returns immediately when tryAcquireOrRenew fails or ctx signals done.
func (le *LeaderElector) renew(ctx context.Context) {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
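	// Each renewal cycle retries tryAcquireOrRenew every RetryPeriod but must
	// succeed within RenewDeadline; if that deadline expires, leadership is
	// given up and the loop below is cancelled.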
	wait.Until(func() {
		timeoutCtx, timeoutCancel := context.WithTimeout(ctx, le.config.RenewDeadline)
		defer timeoutCancel()
		err := wait.PollImmediateUntil(le.config.RetryPeriod, func() (bool, error) {
			return le.tryAcquireOrRenew(timeoutCtx), nil
		}, timeoutCtx.Done())

		le.maybeReportTransition()
		desc := le.config.Lock.Describe()
		if err == nil {
			klog.V(5).Infof("successfully renewed lease %v", desc)
			return
		}
		le.config.Lock.RecordEvent("stopped leading")
		le.metrics.leaderOff(le.config.Name)
		klog.Infof("failed to renew lease %v: %v", desc, err)
		cancel()
	}, le.config.RetryPeriod, ctx.Done())

	// if we hold the lease, give it up
	if le.config.ReleaseOnCancel {
		le.release()
	}
}

// release attempts to release the leader lease if we have acquired it.
func (le *LeaderElector) release() bool {
	if !le.IsLeader() {
		return true
	}
	now := metav1.Now()
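	// Write back a record whose HolderIdentity is empty (the zero value) and
	// whose lease lasts only one second, so other candidates can acquire the
	// lock immediately instead of waiting out the old LeaseDuration.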
	leaderElectionRecord := rl.LeaderElectionRecord{
		LeaderTransitions:    le.observedRecord.LeaderTransitions,
		LeaseDurationSeconds: 1,
		RenewTime:            now,
		AcquireTime:          now,
	}
	if err := le.config.Lock.Update(context.TODO(), leaderElectionRecord); err != nil {
		klog.Errorf("Failed to release lock: %v", err)
		return false
	}

	le.setObservedRecord(&leaderElectionRecord)
	return true
}

// tryAcquireOrRenew tries to acquire a leader lease if it is not already acquired,
// else it tries to renew the lease if it has already been acquired. Returns true
// on success else returns false.
func (le *LeaderElector) tryAcquireOrRenew(ctx context.Context) bool {
	now := metav1.Now()
	leaderElectionRecord := rl.LeaderElectionRecord{
		HolderIdentity:       le.config.Lock.Identity(),
		LeaseDurationSeconds: int(le.config.LeaseDuration / time.Second),
		RenewTime:            now,
		AcquireTime:          now,
	}

	// 1. obtain or create the ElectionRecord
	oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get(ctx)
	if err != nil {
		if !errors.IsNotFound(err) {
			klog.Errorf("error retrieving resource lock %v: %v", le.config.Lock.Describe(), err)
			return false
		}
		if err = le.config.Lock.Create(ctx, leaderElectionRecord); err != nil {
			klog.Errorf("error initially creating leader election record: %v", err)
			return false
		}

		le.setObservedRecord(&leaderElectionRecord)

		return true
	}

	// 2. Record obtained, check the Identity & Time
	if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) {
		le.setObservedRecord(oldLeaderElectionRecord)

		le.observedRawRecord = oldLeaderElectionRawRecord
	}
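	// Per the package comment, expiry is judged against the locally observed
	// time of the last record change (observedTime), never against the
	// timestamps written into the record itself.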
	if len(oldLeaderElectionRecord.HolderIdentity) > 0 &&
		le.observedTime.Add(le.config.LeaseDuration).After(now.Time) &&
		!le.IsLeader() {
		klog.V(4).Infof("lock is held by %v and has not yet expired", oldLeaderElectionRecord.HolderIdentity)
		return false
	}

	// 3. We're going to try to update. The leaderElectionRecord is set to its default
	// here. Let's correct it before updating.
	if le.IsLeader() {
		leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime
		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions
	} else {
		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1
	}

	// update the lock itself
	if err = le.config.Lock.Update(ctx, leaderElectionRecord); err != nil {
		klog.Errorf("Failed to update lock: %v", err)
		return false
	}

	le.setObservedRecord(&leaderElectionRecord)
	return true
}

func (le *LeaderElector) maybeReportTransition() {
	if le.observedRecord.HolderIdentity == le.reportedLeader {
		return
	}
	le.reportedLeader = le.observedRecord.HolderIdentity
	if le.config.Callbacks.OnNewLeader != nil {
		go le.config.Callbacks.OnNewLeader(le.reportedLeader)
	}
}

// Check will determine if the current lease is expired by more than maxTolerableExpiredLease.
func (le *LeaderElector) Check(maxTolerableExpiredLease time.Duration) error {
	if !le.IsLeader() {
		// Currently not concerned with the case that we are hot standby
		return nil
	}
	// If more than maxTolerableExpiredLease has passed since the lease was due
	// for renewal, it is time to start reporting ourselves as unhealthy. We
	// should have died by now, but conditions like deadlock can prevent this.
	// (See #70819)
	if le.clock.Since(le.observedTime) > le.config.LeaseDuration+maxTolerableExpiredLease {
		return fmt.Errorf("failed to renew leadership on lease %s", le.config.Name)
	}

	return nil
}
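
// A sketch of wiring Check into a health endpoint via WatchDog (not part of
// the original file; NewLeaderHealthzAdaptor lives in this package, and the
// 20-second tolerance is illustrative):
//
//	wd := leaderelection.NewLeaderHealthzAdaptor(20 * time.Second)
//	lec.WatchDog = wd // RunOrDie will call wd.SetLeaderElection
//	// expose wd.Check(req) as part of the process's /healthz handler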

// setObservedRecord will set a new observedRecord and update observedTime to the current time.
// The critical section is protected by observedRecordLock.
func (le *LeaderElector) setObservedRecord(observedRecord *rl.LeaderElectionRecord) {
	le.observedRecordLock.Lock()
	defer le.observedRecordLock.Unlock()

	le.observedRecord = *observedRecord
	le.observedTime = le.clock.Now()
}

// getObservedRecord returns observedRecord.
// The critical section is protected by observedRecordLock.
func (le *LeaderElector) getObservedRecord() rl.LeaderElectionRecord {
	le.observedRecordLock.Lock()
	defer le.observedRecordLock.Unlock()

	return le.observedRecord
}