github.com/webmeshproj/webmesh-cni@v0.0.27/internal/ipam/lock.go (about)

     1  /*
     2  Copyright 2023 Avi Zimmerman <avi.zimmerman@gmail.com>.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package ipam
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"sync"
    24  	"sync/atomic"
    25  	"time"
    26  
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  	coordinationv1client "k8s.io/client-go/kubernetes/typed/coordination/v1"
    29  	corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
    30  	"k8s.io/client-go/rest"
    31  	"k8s.io/client-go/tools/leaderelection/resourcelock"
    32  	"sigs.k8s.io/controller-runtime/pkg/log"
    33  
    34  	"github.com/webmeshproj/webmesh-cni/internal/types"
    35  )
    36  
    37  // Locker is the interface for taking a distributed lock during IPv4 allocations.
    38  type Locker interface {
    39  	// Acquire attempts to acquire the lock. If a lock is already acquired, the
    40  	// lock count is incremented. When the lock is released, the lock count is
    41  	// decremented. When the lock count reaches 0, the lock is released.
    42  	Acquire(ctx context.Context) error
    43  	// Release releases the lock. This decrements the lock count. When the lock
    44  	// count reaches 0, the lock is released.
    45  	Release(ctx context.Context)
    46  }
    47  
    48  // LockConfig is the configuration for a lock.
    49  type LockConfig struct {
    50  	ID                 string
    51  	Namespace          string
    52  	LockDuration       time.Duration
    53  	LockAcquireTimeout time.Duration
    54  }
    55  
    56  // NewLock creates a new IPAM lock.
    57  func NewLock(cfg *rest.Config, config LockConfig) (Locker, error) {
    58  	corev1client, err := corev1client.NewForConfig(cfg)
    59  	if err != nil {
    60  		return nil, fmt.Errorf("create corev1 client: %w", err)
    61  	}
    62  	coordinationClient, err := coordinationv1client.NewForConfig(cfg)
    63  	if err != nil {
    64  		return nil, fmt.Errorf("create coordinationv1 client: %w", err)
    65  	}
    66  	rlock, err := resourcelock.New(
    67  		"leases",
    68  		config.Namespace,
    69  		types.IPAMLockID,
    70  		corev1client,
    71  		coordinationClient,
    72  		resourcelock.ResourceLockConfig{
    73  			Identity: config.ID,
    74  		},
    75  	)
    76  	if err != nil {
    77  		return nil, fmt.Errorf("create resource lock interface: %w", err)
    78  	}
    79  	ipamlock := &ipamLock{
    80  		rlock:  rlock,
    81  		config: config,
    82  	}
    83  	return ipamlock, nil
    84  }
    85  
    86  type ipamLock struct {
    87  	rlock     resourcelock.Interface
    88  	config    LockConfig
    89  	lockCount atomic.Int32
    90  	mu        sync.Mutex
    91  }
    92  
    93  // Acquire attempts to acquire the lock.
    94  func (l *ipamLock) Acquire(ctx context.Context) error {
    95  	l.mu.Lock()
    96  	defer l.mu.Unlock()
    97  	log := log.FromContext(ctx).WithName("ipam-lock")
    98  	if l.lockCount.Load() > 0 {
    99  		log.V(1).Info("Lock already held, attempting to renew and increment lock count")
   100  		// Try to update the lock with a renew time.
   101  		lock, _, err := l.rlock.Get(ctx)
   102  		if err == nil {
   103  			lock.RenewTime = metav1.NewTime(time.Now().UTC())
   104  			err = l.rlock.Update(ctx, *lock)
   105  			if err == nil {
   106  				l.lockCount.Add(1)
   107  				return nil
   108  			}
   109  			log.Error(err, "Failed to renew IPAM lock")
   110  			l.lockCount.Store(0)
   111  			return fmt.Errorf("failed to renew IPAM lock: %w", err)
   112  		}
   113  		log.Error(err, "Failed to get IPAM lock")
   114  		l.lockCount.Store(0)
   115  		return fmt.Errorf("failed to acquire IPAM lock: %w", err)
   116  	}
   117  	ctx, cancel := context.WithTimeout(ctx, l.config.LockAcquireTimeout)
   118  	defer cancel()
   119  	for {
   120  		// Check if the lock has already been created.
   121  		lock, _, err := l.rlock.Get(ctx)
   122  		if err == nil {
   123  			// Check if there is a holder for the lock.
   124  			if lock.HolderIdentity != "" {
   125  				// Check if the lock expired.
   126  				if !lock.RenewTime.IsZero() || !lock.AcquireTime.IsZero() {
   127  					var lockExpiry time.Time
   128  					if !lock.RenewTime.IsZero() {
   129  						lockExpiry = lock.RenewTime.Add(time.Duration(lock.LeaseDurationSeconds) * time.Second)
   130  					} else {
   131  						lockExpiry = lock.AcquireTime.Add(time.Duration(lock.LeaseDurationSeconds) * time.Second)
   132  					}
   133  					if lockExpiry.After(time.Now().UTC()) {
   134  						log.V(1).Info("Lock currently held, retrying...", "holder", lock.HolderIdentity)
   135  						goto Retry
   136  					}
   137  					// The lock has expired, try to acquire it.
   138  				}
   139  			}
   140  			// Try to update the lock.
   141  			lock.LeaseDurationSeconds = int(l.config.LockDuration.Seconds())
   142  			lock.HolderIdentity = l.config.ID
   143  			lock.AcquireTime = metav1.NewTime(time.Now().UTC())
   144  			lock.RenewTime = metav1.NewTime(time.Now().UTC())
   145  			err = l.rlock.Update(ctx, *lock)
   146  			if err == nil {
   147  				// We acquired the lock.
   148  				l.lockCount.Add(1)
   149  				return nil
   150  			}
   151  			log.Error(err, "Failed to acquire IPAM lock, retrying...")
   152  			goto Retry
   153  		}
   154  		// Try to create the lock.
   155  		err = l.rlock.Create(ctx, resourcelock.LeaderElectionRecord{
   156  			HolderIdentity:       l.config.ID,
   157  			LeaseDurationSeconds: int(l.config.LockDuration.Seconds()),
   158  		})
   159  		if err == nil {
   160  			// We acquired the lock.
   161  			l.lockCount.Add(1)
   162  			return nil
   163  		}
   164  		log.Error(err, "Failed to acquire IPAM lock, retrying...")
   165  	Retry:
   166  		select {
   167  		case <-ctx.Done():
   168  			return fmt.Errorf("failed to acquire IPAM lock: %w", ctx.Err())
   169  		default:
   170  			time.Sleep(time.Second)
   171  		}
   172  	}
   173  }
   174  
   175  func (l *ipamLock) Release(ctx context.Context) {
   176  	l.mu.Lock()
   177  	defer l.mu.Unlock()
   178  	log := log.FromContext(ctx).WithName("ipam-lock")
   179  	lockCount := l.lockCount.Load()
   180  	if lockCount <= 0 {
   181  		log.Error(errors.New("release unacquired lock"), "Lock count is already 0, cannot release lock")
   182  		return
   183  	}
   184  	lockCount--
   185  	l.lockCount.Store(lockCount)
   186  	if lockCount > 0 {
   187  		log.V(1).Info("Lock still held, not releasing")
   188  		return
   189  	}
   190  	log.V(1).Info("Releasing IPAM lock")
   191  	err := l.rlock.Update(ctx, resourcelock.LeaderElectionRecord{
   192  		HolderIdentity:       "",
   193  		LeaseDurationSeconds: int(l.config.LockDuration.Seconds()),
   194  	})
   195  	if err != nil {
   196  		log.Error(err, "Failed to release IPAM lock")
   197  	}
   198  }