github.com/cilium/cilium@v1.16.2/pkg/kvstore/etcd_lease.go (about) 1 // SPDX-License-Identifier: Apache-2.0 2 // Copyright Authors of Cilium 3 4 package kvstore 5 6 import ( 7 "context" 8 "errors" 9 "strings" 10 "sync" 11 12 "github.com/sirupsen/logrus" 13 v3rpcErrors "go.etcd.io/etcd/api/v3/v3rpc/rpctypes" 14 client "go.etcd.io/etcd/client/v3" 15 "go.etcd.io/etcd/client/v3/concurrency" 16 17 "github.com/cilium/cilium/pkg/lock" 18 "github.com/cilium/cilium/pkg/spanstat" 19 "github.com/cilium/cilium/pkg/time" 20 ) 21 22 type leaseInfo struct { 23 count uint32 24 session *concurrency.Session 25 } 26 27 // etcdLeaseManager manages the acquisition of the leases, and keeps track of 28 // which lease is attached to which etcd key. 29 type etcdLeaseManager struct { 30 client *client.Client 31 log logrus.FieldLogger 32 33 ttl time.Duration 34 limit uint32 35 expired func(key string) 36 37 mu lock.RWMutex 38 leases map[client.LeaseID]*leaseInfo 39 keys map[string]client.LeaseID 40 current client.LeaseID 41 42 acquiring chan struct{} 43 wg sync.WaitGroup 44 } 45 46 // newEtcdLeaseManager builds and returns a new lease manager instance. 47 func newEtcdLeaseManager(cl *client.Client, ttl time.Duration, limit uint32, expired func(key string), log logrus.FieldLogger) *etcdLeaseManager { 48 return &etcdLeaseManager{ 49 client: cl, 50 log: log, 51 52 ttl: ttl, 53 limit: limit, 54 expired: expired, 55 56 current: client.NoLease, 57 leases: make(map[client.LeaseID]*leaseInfo), 58 keys: make(map[string]client.LeaseID), 59 } 60 } 61 62 // GetLeaseID returns a lease ID, and associates it to the given key. It leverages 63 // one of the already acquired leases if they are not already attached to too many 64 // keys, otherwise a new one is acquired. 65 // 66 // There's a small possibility that the returned lease is already expired, or gets 67 // expired immediately before use (due the time window between the lease expiration 68 // on the etcd server and the subsequent client side detection and garbage collection). 69 // As we cannot completely remove this uncertainty period, let's adopt the easiest 70 // approach here, without explicitly checking if the lease is expired before returning 71 // it (given that it would be a client-side check only). Instead, let's just rely on 72 // the fact that the operation will fail (as the lease is no longer valid), triggering 73 // a retry. At that point, a new (hopefully valid) lease will be retrieved again. 74 func (elm *etcdLeaseManager) GetLeaseID(ctx context.Context, key string) (client.LeaseID, error) { 75 session, err := elm.GetSession(ctx, key) 76 if err != nil { 77 return client.NoLease, err 78 } 79 80 return session.Lease(), nil 81 } 82 83 // GetSession returns a session, and associates it to the given key. It leverages 84 // one of the already acquired leases if they are not already attached to too many 85 // keys, otherwise a new one is acquired. 86 // 87 // There's a small possibility that the returned session is already expired, or gets 88 // expired immediately before use (due the time window between the lease expiration 89 // on the etcd server and the subsequent client side detection and garbage collection). 90 // As we cannot completely remove this uncertainty period, let's adopt the easiest 91 // approach here, without explicitly checking if the session is expired before returning 92 // it (given that it would be a client-side check only). Instead, let's just rely on 93 // the fact that the operation will fail (as the lease is no longer valid), triggering 94 // a retry. At that point, a new (hopefully valid) session will be retrieved again. 95 func (elm *etcdLeaseManager) GetSession(ctx context.Context, key string) (*concurrency.Session, error) { 96 elm.mu.Lock() 97 98 // This key is already attached to a lease, hence just return it. 99 if leaseID := elm.keys[key]; leaseID != client.NoLease { 100 // The entry is guaranteed to exist if the lease is associated with a key 101 info := elm.leases[leaseID] 102 elm.mu.Unlock() 103 return info.session, nil 104 } 105 106 // Return the current lease if it has not been used more than limit times 107 if info := elm.leases[elm.current]; info != nil && info.count < elm.limit { 108 info.count++ 109 elm.keys[key] = elm.current 110 elm.mu.Unlock() 111 112 return info.session, nil 113 } 114 115 // Otherwise, loop through the other known leases to see if any has been released 116 for lease, info := range elm.leases { 117 if info.count < elm.limit { 118 elm.current = lease 119 info.count++ 120 elm.keys[key] = elm.current 121 elm.mu.Unlock() 122 123 return info.session, nil 124 } 125 } 126 127 // If none is found, we need to acquire a new lease. acquiring is a channel 128 // used to detect whether we are already in the process of acquiring a new 129 // lease, to prevent multiple acquisitions in parallel. 130 acquiring := elm.acquiring 131 if acquiring == nil { 132 elm.acquiring = make(chan struct{}) 133 } 134 135 // Unlock, so that we don't block other paraller operations (e.g., releases) 136 // while acquiring a new lease, since it might be a slow operation. 137 elm.mu.Unlock() 138 139 // Someone else is already acquiring a new lease. Wait until 140 // it completes, and then retry again. 141 if acquiring != nil { 142 select { 143 case <-acquiring: 144 return elm.GetSession(ctx, key) 145 case <-ctx.Done(): 146 return nil, ctx.Err() 147 case <-elm.client.Ctx().Done(): 148 return nil, elm.client.Ctx().Err() 149 } 150 } 151 152 // Otherwise, we can proceed to acquire a new lease. 153 session, err := elm.newSession(ctx) 154 155 elm.mu.Lock() 156 157 // Signal that the acquisition process has completed. 158 close(elm.acquiring) 159 elm.acquiring = nil 160 161 if err != nil { 162 elm.mu.Unlock() 163 return nil, err 164 } 165 166 elm.current = session.Lease() 167 elm.leases[session.Lease()] = &leaseInfo{session: session} 168 elm.mu.Unlock() 169 170 return elm.GetSession(ctx, key) 171 } 172 173 // Release decrements the counter of the lease attached to the given key. 174 func (elm *etcdLeaseManager) Release(key string) { 175 elm.mu.Lock() 176 defer elm.mu.Unlock() 177 178 elm.releaseUnlocked(key) 179 } 180 181 // ReleasePrefix decrements the counter of the leases attached to the keys 182 // starting with the given prefix. 183 func (elm *etcdLeaseManager) ReleasePrefix(prefix string) { 184 elm.mu.Lock() 185 defer elm.mu.Unlock() 186 187 for key, leaseID := range elm.keys { 188 if strings.HasPrefix(key, prefix) { 189 if info := elm.leases[leaseID]; info != nil && info.count > 0 { 190 info.count-- 191 } 192 delete(elm.keys, key) 193 } 194 } 195 } 196 197 // KeyHasLease returns whether the given key is associated with the specified lease. 198 func (elm *etcdLeaseManager) KeyHasLease(key string, leaseID client.LeaseID) bool { 199 elm.mu.RLock() 200 defer elm.mu.RUnlock() 201 202 return elm.keys[key] == leaseID 203 } 204 205 // CancelIfExpired verifies whether the error reports that the given lease has 206 // expired, and in that case aborts the corresponding keepalive process. 207 func (elm *etcdLeaseManager) CancelIfExpired(err error, leaseID client.LeaseID) { 208 if errors.Is(err, v3rpcErrors.ErrLeaseNotFound) { 209 elm.mu.Lock() 210 if info := elm.leases[leaseID]; info != nil { 211 info.session.Orphan() 212 } 213 elm.mu.Unlock() 214 } 215 } 216 217 // TotalLeases returns the number of managed leases. 218 func (elm *etcdLeaseManager) TotalLeases() uint32 { 219 elm.mu.RLock() 220 defer elm.mu.RUnlock() 221 222 return uint32(len(elm.leases)) 223 } 224 225 // Wait waits until all child goroutines terminated. 226 func (elm *etcdLeaseManager) Wait() { 227 elm.wg.Wait() 228 } 229 230 func (elm *etcdLeaseManager) newSession(ctx context.Context) (session *concurrency.Session, err error) { 231 defer func(duration *spanstat.SpanStat) { 232 increaseMetric("lease", metricSet, "AcquireLease", duration.EndError(err).Total(), err) 233 }(spanstat.Start()) 234 resp, err := elm.client.Grant(ctx, int64(elm.ttl.Seconds())) 235 if err != nil { 236 return nil, err 237 } 238 leaseID := resp.ID 239 240 // Construct the session specifying the lease just acquired. This allows to 241 // split the possibly blocking operation (i.e., lease acquisition), from the 242 // non-blocking one (i.e., the setup of the keepalive logic), so that we can use 243 // different contexts. We want the lease acquisition to be controlled by the 244 // context associated with the given request, while the keepalive process should 245 // continue until either the etcd client is closed or the session is orphaned. 246 session, err = concurrency.NewSession(elm.client, 247 concurrency.WithLease(leaseID), 248 concurrency.WithTTL(int(elm.ttl.Seconds())), 249 ) 250 if err != nil { 251 return nil, err 252 } 253 254 elm.wg.Add(1) 255 go elm.waitForExpiration(session) 256 257 elm.log.WithFields(logrus.Fields{ 258 "LeaseID": leaseID, 259 "TTL": elm.ttl, 260 }).Info("New lease successfully acquired") 261 return session, nil 262 } 263 264 func (elm *etcdLeaseManager) waitForExpiration(session *concurrency.Session) { 265 defer elm.wg.Done() 266 267 // Block until the session gets orphaned, either because it fails to be 268 // renewed or the etcd client is closed. 269 <-session.Done() 270 271 select { 272 case <-elm.client.Ctx().Done(): 273 // The context of the etcd client was closed 274 return 275 default: 276 } 277 278 elm.log.WithField("LeaseID", session.Lease()).Warning("Lease expired") 279 280 elm.mu.Lock() 281 delete(elm.leases, session.Lease()) 282 283 var keys []string 284 for key, id := range elm.keys { 285 if id == session.Lease() { 286 keys = append(keys, key) 287 delete(elm.keys, key) 288 } 289 } 290 elm.mu.Unlock() 291 292 if elm.expired != nil { 293 for _, key := range keys { 294 elm.expired(key) 295 } 296 } 297 } 298 299 func (elm *etcdLeaseManager) releaseUnlocked(key string) { 300 leaseID := elm.keys[key] 301 if leaseID != client.NoLease { 302 if info := elm.leases[leaseID]; info != nil && info.count > 0 { 303 info.count-- 304 } 305 delete(elm.keys, key) 306 } 307 }