github.com/cilium/cilium@v1.16.2/pkg/kvstore/etcd_lease.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package kvstore
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"strings"
    10  	"sync"
    11  
    12  	"github.com/sirupsen/logrus"
    13  	v3rpcErrors "go.etcd.io/etcd/api/v3/v3rpc/rpctypes"
    14  	client "go.etcd.io/etcd/client/v3"
    15  	"go.etcd.io/etcd/client/v3/concurrency"
    16  
    17  	"github.com/cilium/cilium/pkg/lock"
    18  	"github.com/cilium/cilium/pkg/spanstat"
    19  	"github.com/cilium/cilium/pkg/time"
    20  )
    21  
    22  type leaseInfo struct {
    23  	count   uint32
    24  	session *concurrency.Session
    25  }
    26  
    27  // etcdLeaseManager manages the acquisition of the leases, and keeps track of
    28  // which lease is attached to which etcd key.
    29  type etcdLeaseManager struct {
    30  	client *client.Client
    31  	log    logrus.FieldLogger
    32  
    33  	ttl     time.Duration
    34  	limit   uint32
    35  	expired func(key string)
    36  
    37  	mu      lock.RWMutex
    38  	leases  map[client.LeaseID]*leaseInfo
    39  	keys    map[string]client.LeaseID
    40  	current client.LeaseID
    41  
    42  	acquiring chan struct{}
    43  	wg        sync.WaitGroup
    44  }
    45  
    46  // newEtcdLeaseManager builds and returns a new lease manager instance.
    47  func newEtcdLeaseManager(cl *client.Client, ttl time.Duration, limit uint32, expired func(key string), log logrus.FieldLogger) *etcdLeaseManager {
    48  	return &etcdLeaseManager{
    49  		client: cl,
    50  		log:    log,
    51  
    52  		ttl:     ttl,
    53  		limit:   limit,
    54  		expired: expired,
    55  
    56  		current: client.NoLease,
    57  		leases:  make(map[client.LeaseID]*leaseInfo),
    58  		keys:    make(map[string]client.LeaseID),
    59  	}
    60  }
    61  
    62  // GetLeaseID returns a lease ID, and associates it to the given key. It leverages
    63  // one of the already acquired leases if they are not already attached to too many
    64  // keys, otherwise a new one is acquired.
    65  //
    66  // There's a small possibility that the returned lease is already expired, or gets
    67  // expired immediately before use (due the time window between the lease expiration
    68  // on the etcd server and the subsequent client side detection and garbage collection).
    69  // As we cannot completely remove this uncertainty period, let's adopt the easiest
    70  // approach here, without explicitly checking if the lease is expired before returning
    71  // it (given that it would be a client-side check only). Instead, let's just rely on
    72  // the fact that the operation will fail (as the lease is no longer valid), triggering
    73  // a retry. At that point, a new (hopefully valid) lease will be retrieved again.
    74  func (elm *etcdLeaseManager) GetLeaseID(ctx context.Context, key string) (client.LeaseID, error) {
    75  	session, err := elm.GetSession(ctx, key)
    76  	if err != nil {
    77  		return client.NoLease, err
    78  	}
    79  
    80  	return session.Lease(), nil
    81  }
    82  
    83  // GetSession returns a session, and associates it to the given key. It leverages
    84  // one of the already acquired leases if they are not already attached to too many
    85  // keys, otherwise a new one is acquired.
    86  //
    87  // There's a small possibility that the returned session is already expired, or gets
    88  // expired immediately before use (due the time window between the lease expiration
    89  // on the etcd server and the subsequent client side detection and garbage collection).
    90  // As we cannot completely remove this uncertainty period, let's adopt the easiest
    91  // approach here, without explicitly checking if the session is expired before returning
    92  // it (given that it would be a client-side check only). Instead, let's just rely on
    93  // the fact that the operation will fail (as the lease is no longer valid), triggering
    94  // a retry. At that point, a new (hopefully valid) session will be retrieved again.
    95  func (elm *etcdLeaseManager) GetSession(ctx context.Context, key string) (*concurrency.Session, error) {
    96  	elm.mu.Lock()
    97  
    98  	// This key is already attached to a lease, hence just return it.
    99  	if leaseID := elm.keys[key]; leaseID != client.NoLease {
   100  		// The entry is guaranteed to exist if the lease is associated with a key
   101  		info := elm.leases[leaseID]
   102  		elm.mu.Unlock()
   103  		return info.session, nil
   104  	}
   105  
   106  	// Return the current lease if it has not been used more than limit times
   107  	if info := elm.leases[elm.current]; info != nil && info.count < elm.limit {
   108  		info.count++
   109  		elm.keys[key] = elm.current
   110  		elm.mu.Unlock()
   111  
   112  		return info.session, nil
   113  	}
   114  
   115  	// Otherwise, loop through the other known leases to see if any has been released
   116  	for lease, info := range elm.leases {
   117  		if info.count < elm.limit {
   118  			elm.current = lease
   119  			info.count++
   120  			elm.keys[key] = elm.current
   121  			elm.mu.Unlock()
   122  
   123  			return info.session, nil
   124  		}
   125  	}
   126  
   127  	// If none is found, we need to acquire a new lease. acquiring is a channel
   128  	// used to detect whether we are already in the process of acquiring a new
   129  	// lease, to prevent multiple acquisitions in parallel.
   130  	acquiring := elm.acquiring
   131  	if acquiring == nil {
   132  		elm.acquiring = make(chan struct{})
   133  	}
   134  
   135  	// Unlock, so that we don't block other paraller operations (e.g., releases)
   136  	// while acquiring a new lease, since it might be a slow operation.
   137  	elm.mu.Unlock()
   138  
   139  	// Someone else is already acquiring a new lease. Wait until
   140  	// it completes, and then retry again.
   141  	if acquiring != nil {
   142  		select {
   143  		case <-acquiring:
   144  			return elm.GetSession(ctx, key)
   145  		case <-ctx.Done():
   146  			return nil, ctx.Err()
   147  		case <-elm.client.Ctx().Done():
   148  			return nil, elm.client.Ctx().Err()
   149  		}
   150  	}
   151  
   152  	// Otherwise, we can proceed to acquire a new lease.
   153  	session, err := elm.newSession(ctx)
   154  
   155  	elm.mu.Lock()
   156  
   157  	// Signal that the acquisition process has completed.
   158  	close(elm.acquiring)
   159  	elm.acquiring = nil
   160  
   161  	if err != nil {
   162  		elm.mu.Unlock()
   163  		return nil, err
   164  	}
   165  
   166  	elm.current = session.Lease()
   167  	elm.leases[session.Lease()] = &leaseInfo{session: session}
   168  	elm.mu.Unlock()
   169  
   170  	return elm.GetSession(ctx, key)
   171  }
   172  
   173  // Release decrements the counter of the lease attached to the given key.
   174  func (elm *etcdLeaseManager) Release(key string) {
   175  	elm.mu.Lock()
   176  	defer elm.mu.Unlock()
   177  
   178  	elm.releaseUnlocked(key)
   179  }
   180  
   181  // ReleasePrefix decrements the counter of the leases attached to the keys
   182  // starting with the given prefix.
   183  func (elm *etcdLeaseManager) ReleasePrefix(prefix string) {
   184  	elm.mu.Lock()
   185  	defer elm.mu.Unlock()
   186  
   187  	for key, leaseID := range elm.keys {
   188  		if strings.HasPrefix(key, prefix) {
   189  			if info := elm.leases[leaseID]; info != nil && info.count > 0 {
   190  				info.count--
   191  			}
   192  			delete(elm.keys, key)
   193  		}
   194  	}
   195  }
   196  
   197  // KeyHasLease returns whether the given key is associated with the specified lease.
   198  func (elm *etcdLeaseManager) KeyHasLease(key string, leaseID client.LeaseID) bool {
   199  	elm.mu.RLock()
   200  	defer elm.mu.RUnlock()
   201  
   202  	return elm.keys[key] == leaseID
   203  }
   204  
   205  // CancelIfExpired verifies whether the error reports that the given lease has
   206  // expired, and in that case aborts the corresponding keepalive process.
   207  func (elm *etcdLeaseManager) CancelIfExpired(err error, leaseID client.LeaseID) {
   208  	if errors.Is(err, v3rpcErrors.ErrLeaseNotFound) {
   209  		elm.mu.Lock()
   210  		if info := elm.leases[leaseID]; info != nil {
   211  			info.session.Orphan()
   212  		}
   213  		elm.mu.Unlock()
   214  	}
   215  }
   216  
   217  // TotalLeases returns the number of managed leases.
   218  func (elm *etcdLeaseManager) TotalLeases() uint32 {
   219  	elm.mu.RLock()
   220  	defer elm.mu.RUnlock()
   221  
   222  	return uint32(len(elm.leases))
   223  }
   224  
   225  // Wait waits until all child goroutines terminated.
   226  func (elm *etcdLeaseManager) Wait() {
   227  	elm.wg.Wait()
   228  }
   229  
   230  func (elm *etcdLeaseManager) newSession(ctx context.Context) (session *concurrency.Session, err error) {
   231  	defer func(duration *spanstat.SpanStat) {
   232  		increaseMetric("lease", metricSet, "AcquireLease", duration.EndError(err).Total(), err)
   233  	}(spanstat.Start())
   234  	resp, err := elm.client.Grant(ctx, int64(elm.ttl.Seconds()))
   235  	if err != nil {
   236  		return nil, err
   237  	}
   238  	leaseID := resp.ID
   239  
   240  	// Construct the session specifying the lease just acquired. This allows to
   241  	// split the possibly blocking operation (i.e., lease acquisition), from the
   242  	// non-blocking one (i.e., the setup of the keepalive logic), so that we can use
   243  	// different contexts. We want the lease acquisition to be controlled by the
   244  	// context associated with the given request, while the keepalive process should
   245  	// continue until either the etcd client is closed or the session is orphaned.
   246  	session, err = concurrency.NewSession(elm.client,
   247  		concurrency.WithLease(leaseID),
   248  		concurrency.WithTTL(int(elm.ttl.Seconds())),
   249  	)
   250  	if err != nil {
   251  		return nil, err
   252  	}
   253  
   254  	elm.wg.Add(1)
   255  	go elm.waitForExpiration(session)
   256  
   257  	elm.log.WithFields(logrus.Fields{
   258  		"LeaseID": leaseID,
   259  		"TTL":     elm.ttl,
   260  	}).Info("New lease successfully acquired")
   261  	return session, nil
   262  }
   263  
   264  func (elm *etcdLeaseManager) waitForExpiration(session *concurrency.Session) {
   265  	defer elm.wg.Done()
   266  
   267  	// Block until the session gets orphaned, either because it fails to be
   268  	// renewed or the etcd client is closed.
   269  	<-session.Done()
   270  
   271  	select {
   272  	case <-elm.client.Ctx().Done():
   273  		// The context of the etcd client was closed
   274  		return
   275  	default:
   276  	}
   277  
   278  	elm.log.WithField("LeaseID", session.Lease()).Warning("Lease expired")
   279  
   280  	elm.mu.Lock()
   281  	delete(elm.leases, session.Lease())
   282  
   283  	var keys []string
   284  	for key, id := range elm.keys {
   285  		if id == session.Lease() {
   286  			keys = append(keys, key)
   287  			delete(elm.keys, key)
   288  		}
   289  	}
   290  	elm.mu.Unlock()
   291  
   292  	if elm.expired != nil {
   293  		for _, key := range keys {
   294  			elm.expired(key)
   295  		}
   296  	}
   297  }
   298  
   299  func (elm *etcdLeaseManager) releaseUnlocked(key string) {
   300  	leaseID := elm.keys[key]
   301  	if leaseID != client.NoLease {
   302  		if info := elm.leases[leaseID]; info != nil && info.count > 0 {
   303  			info.count--
   304  		}
   305  		delete(elm.keys, key)
   306  	}
   307  }