istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pilot/pkg/leaderelection/k8sleaderelection/leaderelection.go (about)

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // Package k8sleaderelection implements leader election of a set of endpoints.
    18  // It uses an annotation in the endpoints object to store the record of the
    19  // election state. This implementation does not guarantee that only one
    20  // client is acting as a leader (a.k.a. fencing).
    21  //
    22  // A client only acts on timestamps captured locally to infer the state of the
    23  // leader election. The client does not consider timestamps in the leader
    24  // election record to be accurate because these timestamps may not have been
    25  // produced by a local clock. The implementation does not depend on their
    26  // accuracy and only uses their change to indicate that another client has
    27  // renewed the leader lease. Thus the implementation is tolerant to arbitrary
    28  // clock skew, but is not tolerant to arbitrary clock skew rate.
    29  //
    30  // However the level of tolerance to skew rate can be configured by setting
    31  // RenewDeadline and LeaseDuration appropriately. The tolerance expressed as a
    32  // maximum tolerated ratio of time passed on the fastest node to time passed on
    33  // the slowest node can be approximately achieved with a configuration that sets
    34  // the same ratio of LeaseDuration to RenewDeadline. For example if a user wanted
    35  // to tolerate some nodes progressing forward in time twice as fast as other nodes,
    36  // the user could set LeaseDuration to 60 seconds and RenewDeadline to 30 seconds.
    37  //
    38  // While not required, some method of clock synchronization between nodes in the
    39  // cluster is highly recommended. It's important to keep in mind when configuring
    40  // this client that the tolerance to skew rate varies inversely to master
    41  // availability.
    42  //
    43  // Larger clusters often have a more lenient SLA for API latency. This should be
    44  // taken into account when configuring the client. The rate of leader transitions
    45  // should be monitored and RetryPeriod and LeaseDuration should be increased
    46  // until the rate is stable and acceptably low. It's important to keep in mind
    47  // when configuring this client that the tolerance to API latency varies inversely
    48  // to master availability.
    49  //
    50  // DISCLAIMER: this is an alpha API. This library will likely change significantly
    51  // or even be removed entirely in subsequent releases. Depend on this API at
    52  // your own risk.
    53  // nolint
    54  package k8sleaderelection
    55  
    56  import (
    57  	"bytes"
    58  	"context"
    59  	"fmt"
    60  	"sync"
    61  	"time"
    62  
    63  	"k8s.io/apimachinery/pkg/api/errors"
    64  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    65  	"k8s.io/apimachinery/pkg/util/runtime"
    66  	"k8s.io/apimachinery/pkg/util/wait"
    67  	"k8s.io/klog/v2"
    68  	"k8s.io/utils/clock"
    69  
    70  	"istio.io/istio/pilot/pkg/leaderelection/k8sleaderelection/k8sresourcelock"
    71  )
    72  
    73  const (
    74  	JitterFactor = 1.2
    75  )
    76  
    77  // NewLeaderElector creates a LeaderElector from a LeaderElectionConfig
    78  func NewLeaderElector(lec LeaderElectionConfig) (*LeaderElector, error) {
    79  	if lec.LeaseDuration <= lec.RenewDeadline {
    80  		return nil, fmt.Errorf("leaseDuration must be greater than renewDeadline")
    81  	}
    82  	if lec.RenewDeadline <= time.Duration(JitterFactor*float64(lec.RetryPeriod)) {
    83  		return nil, fmt.Errorf("renewDeadline must be greater than retryPeriod*JitterFactor")
    84  	}
    85  	if lec.LeaseDuration < 1 {
    86  		return nil, fmt.Errorf("leaseDuration must be greater than zero")
    87  	}
    88  	if lec.RenewDeadline < 1 {
    89  		return nil, fmt.Errorf("renewDeadline must be greater than zero")
    90  	}
    91  	if lec.RetryPeriod < 1 {
    92  		return nil, fmt.Errorf("retryPeriod must be greater than zero")
    93  	}
    94  	if lec.Callbacks.OnStartedLeading == nil {
    95  		return nil, fmt.Errorf("callback OnStartedLeading must not be nil")
    96  	}
    97  	if lec.Callbacks.OnStoppedLeading == nil {
    98  		return nil, fmt.Errorf("callback OnStoppedLeading must not be nil")
    99  	}
   100  
   101  	if lec.Lock == nil {
   102  		return nil, fmt.Errorf("lock must not be nil")
   103  	}
   104  	le := LeaderElector{
   105  		config:  lec,
   106  		clock:   clock.RealClock{},
   107  		metrics: globalMetricsFactory.newLeaderMetrics(),
   108  	}
   109  	le.metrics.leaderOff(le.config.Name)
   110  	return &le, nil
   111  }
   112  
   113  type KeyComparisonFunc func(existingKey string) bool
   114  
   115  type LeaderElectionConfig struct {
   116  	// Lock is the resource that will be used for locking
   117  	Lock k8sresourcelock.Interface
   118  
   119  	// LeaseDuration is the duration that non-leader candidates will
   120  	// wait to force acquire leadership. This is measured against time of
   121  	// last observed ack.
   122  	//
   123  	// A client needs to wait a full LeaseDuration without observing a change to
   124  	// the record before it can attempt to take over. When all clients are
   125  	// shutdown and a new set of clients are started with different names against
   126  	// the same leader record, they must wait the full LeaseDuration before
   127  	// attempting to acquire the lease. Thus LeaseDuration should be as short as
   128  	// possible (within your tolerance for clock skew rate) to avoid possible
   129  	// long waits in this scenario.
   130  	//
   131  	// Core clients default this value to 15 seconds.
   132  	LeaseDuration time.Duration
   133  	// RenewDeadline is the duration that the acting master will retry
   134  	// refreshing leadership before giving up.
   135  	//
   136  	// Core clients default this value to 10 seconds.
   137  	RenewDeadline time.Duration
   138  	// RetryPeriod is the duration the LeaderElector clients should wait
   139  	// between attempts of an action.
   140  	//
   141  	// Core clients default this value to 2 seconds.
   142  	RetryPeriod time.Duration
   143  
   144  	// KeyComparison defines a function to compare the existing leader's key to our own.
   145  	// If the function returns true, indicating our key has higher precedence, we will take over
   146  	// leadership even if there is another un-expired leader.
   147  	//
   148  	// This can be used to implement a prioritized leader election. For example, if multiple
   149  	// versions of the same application run simultaneously, we can ensure the newest version
   150  	// will become the leader (see the sketch after this struct).
   151  	//
   152  	// It is the responsibility of the caller to ensure that all KeyComparison functions are
   153  	// logically consistent between all clients participating in the leader election, to avoid multiple
   154  	// clients claiming to have higher precedence and constantly pre-empting the existing leader.
   155  	//
   156  	// KeyComparison functions should ensure they handle an empty existingKey, as "key" is not a required field.
   157  	//
   158  	// Warning: when a lock is stolen (from KeyComparison returning true), the old leader may not
   159  	// immediately be notified that it has lost the leader election.
   160  	KeyComparison KeyComparisonFunc
   161  
   162  	// Callbacks are callbacks that are triggered during certain lifecycle
   163  	// events of the LeaderElector
   164  	Callbacks LeaderCallbacks
   165  
   166  	// WatchDog is the associated health checker
   167  	// WatchDog may be nil if it's not needed/configured.
   168  	WatchDog *HealthzAdaptor
   169  
   170  	// ReleaseOnCancel should be set true if the lock should be released
   171  	// when the run context is canceled. If you set this to true, you must
   172  	// ensure all code guarded by this lease has successfully completed
   173  	// prior to canceling the context, or you may have two processes
   174  	// simultaneously acting on the critical path.
   175  	ReleaseOnCancel bool
   176  
   177  	// Name is the name of the resource lock for debugging
   178  	Name string
   179  }
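
// The function below is an illustrative sketch, not part of the upstream API: one possible
// KeyComparisonFunc for prioritized leader election, where each candidate's lock Key()
// encodes a revision string and the newest (lexicographically highest) revision pre-empts
// an older, un-expired leader. The helper name and the revision-as-key scheme are
// hypothetical; real deployments must keep the comparison consistent across all clients.
func newestRevisionWins(ourKey string) KeyComparisonFunc {
	return func(existingKey string) bool {
		// An empty existingKey means the current holder predates key support; take over.
		// Otherwise claim precedence only when our revision sorts strictly higher.
		return existingKey == "" || existingKey < ourKey
	}
}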
   180  
   181  // LeaderCallbacks are callbacks that are triggered during certain
   182  // lifecycle events of the LeaderElector. These are invoked asynchronously.
   183  //
   184  // possible future callbacks:
   185  //   - OnChallenge()
   186  type LeaderCallbacks struct {
   187  	// OnStartedLeading is called when a LeaderElector client starts leading
   188  	OnStartedLeading func(context.Context)
   189  	// OnStoppedLeading is called when a LeaderElector client stops leading
   190  	OnStoppedLeading func()
   191  	// OnNewLeader is called when the client observes a leader that is
   192  	// not the previously observed leader. This includes the first observed
   193  	// leader when the client starts.
   194  	OnNewLeader func(identity string)
   195  }
   196  
   197  // LeaderElector is a leader election client.
   198  type LeaderElector struct {
   199  	config LeaderElectionConfig
   200  	// internal bookkeeping
   201  	observedRecord    k8sresourcelock.LeaderElectionRecord
   202  	observedRawRecord []byte
   203  	observedTime      time.Time
   204  	// used to implement OnNewLeader(), may lag slightly from the
   205  	// value observedRecord.HolderIdentity if the transition has
   206  	// not yet been reported.
   207  	reportedLeader string
   208  
   209  	// clock is a wrapper around time to allow for less flaky testing
   210  	clock clock.Clock
   211  
   212  	// used to lock the observedRecord
   213  	observedRecordLock sync.Mutex
   214  
   215  	metrics leaderMetricsAdapter
   216  }
   217  
   218  // Run starts the leader election loop. Run will not return
   219  // before the leader election loop is stopped by ctx or it has
   220  // stopped holding the leader lease.
   221  func (le *LeaderElector) Run(ctx context.Context) {
   222  	defer runtime.HandleCrash()
   223  	defer func() {
   224  		le.config.Callbacks.OnStoppedLeading()
   225  	}()
   226  
   227  	if !le.acquire(ctx) {
   228  		return // ctx signaled done
   229  	}
   230  	ctx, cancel := context.WithCancel(ctx)
   231  	defer cancel()
   232  	go le.config.Callbacks.OnStartedLeading(ctx)
   233  	le.renew(ctx)
   234  }
   235  
   236  // RunOrDie starts a client with the provided config or panics if the config
   237  // fails to validate. RunOrDie blocks until the leader election loop is
   238  // stopped by ctx or it has stopped holding the leader lease (a usage sketch follows this function).
   239  func RunOrDie(ctx context.Context, lec LeaderElectionConfig) {
   240  	le, err := NewLeaderElector(lec)
   241  	if err != nil {
   242  		panic(err)
   243  	}
   244  	if lec.WatchDog != nil {
   245  		lec.WatchDog.SetLeaderElection(le)
   246  	}
   247  	le.Run(ctx)
   248  }
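
// The function below is an illustrative sketch, not part of the upstream API, showing a
// typical RunOrDie invocation. The lock argument is assumed to be whatever
// k8sresourcelock.Interface implementation the caller already has; the durations mirror the
// core-client defaults documented on LeaderElectionConfig. RunOrDie blocks until ctx is
// canceled or leadership is lost, and ReleaseOnCancel releases the lease promptly on shutdown.
func runOrDieSketch(ctx context.Context, lock k8sresourcelock.Interface) {
	RunOrDie(ctx, LeaderElectionConfig{
		Lock:            lock,
		LeaseDuration:   15 * time.Second,
		RenewDeadline:   10 * time.Second,
		RetryPeriod:     2 * time.Second,
		ReleaseOnCancel: true,
		Callbacks: LeaderCallbacks{
			OnStartedLeading: func(ctx context.Context) {
				// Do leader-only work here and return promptly once ctx is canceled.
				<-ctx.Done()
			},
			OnStoppedLeading: func() {
				klog.Info("no longer the leader")
			},
			OnNewLeader: func(identity string) {
				klog.Infof("observed new leader: %v", identity)
			},
		},
		Name: "example-lock",
	})
}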
   249  
   250  // GetLeader returns the identity of the last observed leader or returns the empty string if
   251  // no leader has yet been observed.
   252  // This function is for informational purposes. (e.g. monitoring, logs, etc.)
   253  func (le *LeaderElector) GetLeader() string {
   254  	return le.getObservedRecord().HolderIdentity
   255  }
   256  
   257  // IsLeader returns true if the last observed leader was this client else returns false.
   258  func (le *LeaderElector) IsLeader() bool {
   259  	return le.getObservedRecord().HolderIdentity == le.config.Lock.Identity()
   260  }
   261  
   262  // acquire loops calling tryAcquireOrRenew and returns true immediately when tryAcquireOrRenew succeeds.
   263  // Returns false if ctx signals done.
   264  func (le *LeaderElector) acquire(ctx context.Context) bool {
   265  	ctx, cancel := context.WithCancel(ctx)
   266  	defer cancel()
   267  	succeeded := false
   268  	desc := le.config.Lock.Describe()
   269  	klog.Infof("attempting to acquire leader lease %v...", desc)
   270  	wait.JitterUntil(func() {
   271  		succeeded = le.tryAcquireOrRenew(ctx)
   272  		le.maybeReportTransition()
   273  		if !succeeded {
   274  			klog.V(4).Infof("failed to acquire lease %v", desc)
   275  			return
   276  		}
   277  		le.config.Lock.RecordEvent("became leader")
   278  		le.metrics.leaderOn(le.config.Name)
   279  		klog.Infof("successfully acquired lease %v", desc)
   280  		cancel()
   281  	}, le.config.RetryPeriod, JitterFactor, true, ctx.Done())
   282  	return succeeded
   283  }
   284  
   285  // renew loops calling tryAcquireOrRenew and returns immediately when tryAcquireOrRenew fails or ctx signals done.
   286  func (le *LeaderElector) renew(ctx context.Context) {
   287  	ctx, cancel := context.WithCancel(ctx)
   288  	defer cancel()
   289  	wait.Until(func() {
   290  		timeoutCtx, timeoutCancel := context.WithTimeout(ctx, le.config.RenewDeadline)
   291  		defer timeoutCancel()
   292  		err := wait.PollImmediateUntil(le.config.RetryPeriod, func() (bool, error) {
   293  			return le.tryAcquireOrRenew(timeoutCtx), nil
   294  		}, timeoutCtx.Done())
   295  
   296  		le.maybeReportTransition()
   297  		desc := le.config.Lock.Describe()
   298  		if err == nil {
   299  			klog.V(5).Infof("successfully renewed lease %v", desc)
   300  			return
   301  		}
   302  		le.config.Lock.RecordEvent("stopped leading")
   303  		le.metrics.leaderOff(le.config.Name)
   304  		klog.Infof("failed to renew lease %v: %v", desc, err)
   305  		cancel()
   306  	}, le.config.RetryPeriod, ctx.Done())
   307  
   308  	// if we hold the lease, give it up
   309  	if le.config.ReleaseOnCancel {
   310  		le.release()
   311  	}
   312  }
   313  
   314  // release attempts to release the leader lease if we have acquired it.
   315  func (le *LeaderElector) release() bool {
   316  	if !le.IsLeader() {
   317  		return true
   318  	}
   319  	now := metav1.Now()
   320  	leaderElectionRecord := k8sresourcelock.LeaderElectionRecord{
   321  		LeaderTransitions:    le.observedRecord.LeaderTransitions,
   322  		LeaseDurationSeconds: 1,
   323  		RenewTime:            now,
   324  		AcquireTime:          now,
   325  	}
   326  	if err := le.config.Lock.Update(context.TODO(), leaderElectionRecord); err != nil {
   327  		klog.Errorf("Failed to release lock: %v", err)
   328  		return false
   329  	}
   330  
   331  	le.setObservedRecord(&leaderElectionRecord)
   332  	return true
   333  }
   334  
   335  // tryAcquireOrRenew tries to acquire a leader lease if it is not already acquired,
   336  // else it tries to renew the lease if it has already been acquired. Returns true
   337  // on success else returns false.
   338  func (le *LeaderElector) tryAcquireOrRenew(ctx context.Context) bool {
   339  	now := metav1.Now()
   340  	leaderElectionRecord := k8sresourcelock.LeaderElectionRecord{
   341  		HolderIdentity:       le.config.Lock.Identity(),
   342  		HolderKey:            le.config.Lock.Key(),
   343  		LeaseDurationSeconds: int(le.config.LeaseDuration / time.Second),
   344  		RenewTime:            now,
   345  		AcquireTime:          now,
   346  	}
   347  
   348  	// 1. obtain or create the ElectionRecord
   349  	oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get(ctx)
   350  	if err != nil {
   351  		if !errors.IsNotFound(err) {
   352  			klog.Errorf("error retrieving resource lock %v: %v", le.config.Lock.Describe(), err)
   353  			return false
   354  		}
   355  		if err = le.config.Lock.Create(ctx, leaderElectionRecord); err != nil {
   356  			klog.Errorf("error initially creating leader election record: %v", err)
   357  			return false
   358  		}
   359  
   360  		le.setObservedRecord(&leaderElectionRecord)
   361  
   362  		return true
   363  	}
   364  
   365  	// 2. Record obtained, check the Identity & Time
   366  	if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) {
   367  		le.setObservedRecord(oldLeaderElectionRecord)
   368  
   369  		le.observedRawRecord = oldLeaderElectionRawRecord
   370  	}
   371  	if len(oldLeaderElectionRecord.HolderIdentity) > 0 &&
   372  		le.observedTime.Add(le.config.LeaseDuration).After(now.Time) &&
   373  		!le.IsLeader() {
   374  		if le.config.KeyComparison != nil && le.config.KeyComparison(oldLeaderElectionRecord.HolderKey) {
   375  			// Lock is held and not expired, but our key is higher than the existing one.
   376  			// We will pre-empt the existing leader.
   377  			// nolint: lll
   378  			klog.V(4).Infof("lock is held by %v with key %v, but our key (%v) evicts it", oldLeaderElectionRecord.HolderIdentity, oldLeaderElectionRecord.HolderKey, le.config.Lock.Key())
   379  		} else {
   380  			klog.V(4).Infof("lock is held by %v and has not yet expired", oldLeaderElectionRecord.HolderIdentity)
   381  			return false
   382  		}
   383  	}
   384  
   385  	// 3. We're going to try to update. The leaderElectionRecord is set to its default
   386  	// here. Let's correct it before updating.
   387  	if le.IsLeader() {
   388  		leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime
   389  		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions
   390  	} else {
   391  		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1
   392  	}
   393  
   394  	// update the lock itself
   395  	if err = le.config.Lock.Update(ctx, leaderElectionRecord); err != nil {
   396  		klog.Errorf("Failed to update lock: %v", err)
   397  		return false
   398  	}
   399  
   400  	le.setObservedRecord(&leaderElectionRecord)
   401  	return true
   402  }
   403  
   404  func (le *LeaderElector) maybeReportTransition() {
   405  	if le.observedRecord.HolderIdentity == le.reportedLeader {
   406  		return
   407  	}
   408  	le.reportedLeader = le.observedRecord.HolderIdentity
   409  	if le.config.Callbacks.OnNewLeader != nil {
   410  		go le.config.Callbacks.OnNewLeader(le.reportedLeader)
   411  	}
   412  }
   413  
   414  // Check determines whether the current lease is expired by more than maxTolerableExpiredLease; a usage sketch follows this method.
   415  func (le *LeaderElector) Check(maxTolerableExpiredLease time.Duration) error {
   416  	if !le.IsLeader() {
   417  		// Currently not concerned with the case that we are a hot standby
   418  		return nil
   419  	}
   420  	// If the time since the last observed renewal exceeds LeaseDuration plus
   421  	// maxTolerableExpiredLease, the renewal is overdue; start reporting ourselves as unhealthy.
   422  	// We should have died by now, but conditions like deadlock can prevent this. (See #70819)
   423  	if le.clock.Since(le.observedTime) > le.config.LeaseDuration+maxTolerableExpiredLease {
   424  		return fmt.Errorf("failed election to renew leadership on lease %s", le.config.Name)
   425  	}
   426  
   427  	return nil
   428  }
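
// The function below is an illustrative sketch, not part of the upstream API. Callers
// normally reach Check through the HealthzAdaptor configured as LeaderElectionConfig.WatchDog,
// but it can also be wired into a liveness probe directly. The 10-second slack is an
// arbitrary example value.
func livenessSketch(le *LeaderElector) error {
	// Reports an error only if this client is the leader and its lease has gone
	// more than LeaseDuration+10s without an observed renewal.
	return le.Check(10 * time.Second)
}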
   429  
   430  // setObservedRecord will set a new observedRecord and update observedTime to the current time.
   431  // The critical section is protected by observedRecordLock.
   432  func (le *LeaderElector) setObservedRecord(observedRecord *k8sresourcelock.LeaderElectionRecord) {
   433  	le.observedRecordLock.Lock()
   434  	defer le.observedRecordLock.Unlock()
   435  
   436  	le.observedRecord = *observedRecord
   437  	le.observedTime = le.clock.Now()
   438  }
   439  
   440  // getObservedRecord returns the observedRecord.
   441  // The critical section is protected by observedRecordLock.
   442  func (le *LeaderElector) getObservedRecord() k8sresourcelock.LeaderElectionRecord {
   443  	le.observedRecordLock.Lock()
   444  	defer le.observedRecordLock.Unlock()
   445  
   446  	return le.observedRecord
   447  }