github.com/hashicorp/nomad/api@v0.0.0-20240306165712-3193ac204f65/locks_test.go

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package api
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"sync"
    10  	"testing"
    11  	"time"
    12  
    13  	"github.com/shoenig/test/must"
    14  )
    15  
    16  var testLease = 10 * time.Millisecond
    17  
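        // mockLock is a minimal in-memory lock used to observe how a LockLeaser
        // interacts with its lock: it records how many times each caller tried to
        // acquire the lock and how many renewals the current holder performed.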
    18  type mockLock struct {
    19  	locked        bool
    20  	acquireCalls  map[string]int
    21  	renewsCounter int
    22  	mu            sync.Mutex
    23  
    24  	leaseStartTime time.Time
    25  }
    26  
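        // acquire records the attempt for callerID and grants the lock only when it
        // is free; if the lock is already held it returns an empty path and no error.
        // The special caller "hac-early-return" always fails with ErrLockConflict
        // before the attempt is recorded.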
    27  func (ml *mockLock) acquire(_ context.Context, callerID string) (string, error) {
    28  	ml.mu.Lock()
    29  	defer ml.mu.Unlock()
    30  
    31  	if callerID == "hac-early-return" {
    32  		return "", ErrLockConflict
    33  	}
    34  
    35  	ml.acquireCalls[callerID] += 1
    36  	if ml.locked {
    37  		return "", nil
    38  	}
    39  
    40  	ml.locked = true
    41  	ml.leaseStartTime = time.Now()
    42  	ml.renewsCounter = 0
    43  	return "lockPath", nil
    44  }
    45  
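        // lockHandler binds a fixed callerID and the test lease TTL to the shared
        // mockLock so it can be plugged in as a LockLeaser's locker.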
    46  type lockHandler struct {
    47  	*mockLock
    48  	callerID string
    49  }
    50  
    51  func (lh *lockHandler) LockTTL() time.Duration {
    52  	return testLease
    53  }
    54  
    55  func (lh *lockHandler) Acquire(ctx context.Context) (string, error) {
    56  	return lh.acquire(ctx, lh.callerID)
    57  }
    58  
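        // Release frees the lock and resets the renewal counter, returning an error
        // if the lock is not currently held.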
    59  func (ml *mockLock) Release(_ context.Context) error {
    60  	ml.mu.Lock()
    61  	defer ml.mu.Unlock()
    62  
    63  	if !ml.locked {
    64  		return errors.New("lock not locked")
    65  	}
    66  
    67  	ml.locked = false
    68  	ml.renewsCounter = 0
    69  	return nil
    70  }
    71  
    72  // Renew does not exactly replicate how the real lock renewal works;
    73  // it is only intended to exercise the behavior of multiple leaser
    74  // instances running against the same lock.
    75  func (ml *mockLock) Renew(_ context.Context) error {
    76  	ml.mu.Lock()
    77  	defer ml.mu.Unlock()
    78  
    79  	if !ml.locked {
    80  		return errors.New("error")
    81  	}
    82  
    83  	if time.Since(ml.leaseStartTime) > testLease {
    84  		ml.locked = false
    85  		return ErrLockConflict
    86  	}
    87  
    88  	ml.leaseStartTime = time.Now()
    89  	ml.renewsCounter += 1
    90  	return nil
    91  }
    92  
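        // getLockState returns a snapshot of the mock lock's state so tests can make
        // assertions without racing with the leaser goroutines.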
    93  func (ml *mockLock) getLockState() mockLock {
    94  	ml.mu.Lock()
    95  	defer ml.mu.Unlock()
    96  
    97  	return mockLock{
    98  		locked:        ml.locked,
    99  		acquireCalls:  copyMap(ml.acquireCalls),
   100  		renewsCounter: ml.renewsCounter,
   101  	}
   102  }
   103  
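        // mockService stands in for the work protected by the lock. It counts how
        // many times it was started and remembers which leaser started it.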
   104  type mockService struct {
   105  	mockLock
   106  
   107  	mu            sync.Mutex
   108  	startsCounter int
   109  	starterID     string
   110  }
   111  
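        // Run returns the protected function that is handed to LockLeaser.Start: it
        // records the starter, blocks until its context is canceled, and then clears
        // the starter ID. The context passed to Run itself is unused.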
   112  func (ms *mockService) Run(callerID string, _ context.Context) func(ctx context.Context) error {
   113  	return func(ctx context.Context) error {
   114  		ms.mu.Lock()
   115  		ms.startsCounter += 1
   116  		ms.starterID = callerID
   117  		ms.mu.Unlock()
   118  
   119  		<-ctx.Done()
   120  
   121  		ms.mu.Lock()
   122  		ms.starterID = ""
   123  		ms.mu.Unlock()
   124  
   125  		return nil
   126  	}
   127  }
   128  
   129  func (ms *mockService) getServiceState() mockService {
   130  	ms.mu.Lock()
   131  	defer ms.mu.Unlock()
   132  
   133  	return mockService{
   134  		startsCounter: ms.startsCounter,
   135  		starterID:     ms.starterID,
   136  	}
   137  }
   138  
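        // TestAcquireLock_MultipleInstances runs several LockLeaser instances against
        // the same mock lock and steps through a timeline of sleeps, asserting at each
        // point which instance holds the lock, how many acquire attempts and renewals
        // have happened, and that the service only runs under the current lock holder.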
   139  func TestAcquireLock_MultipleInstances(t *testing.T) {
   140  	l := mockLock{
   141  		acquireCalls: map[string]int{},
   142  	}
   143  
   144  	s := mockService{}
   145  
   146  	testCtx := context.Background()
   147  
   148  	// Set up an independent context for hac1 to test the switchover when one controller stops.
   149  	hac1Ctx, hac1Cancel := context.WithCancel(testCtx)
   150  	defer hac1Cancel()
   151  
   152  	// hac1 has no initial random delay, so it should always get the lock first.
   153  	hac1 := LockLeaser{
   154  		Name: "hac1",
   155  		locker: &lockHandler{
   156  			mockLock: &l,
   157  			callerID: "hac1",
   158  		},
   159  		renewalPeriod: time.Duration(float64(testLease) * lockLeaseRenewalFactor),
   160  		waitPeriod:    time.Duration(float64(testLease) * lockRetryBackoffFactor),
   161  		randomDelay:   0,
   162  	}
   163  
   164  	hac2 := LockLeaser{
   165  		Name: "hac2",
   166  		locker: &lockHandler{
   167  			mockLock: &l,
   168  			callerID: "hac2",
   169  		},
   170  		renewalPeriod: time.Duration(float64(testLease) * lockLeaseRenewalFactor),
   171  		waitPeriod:    time.Duration(float64(testLease) * lockRetryBackoffFactor),
   172  		randomDelay:   6 * time.Millisecond,
   173  	}
   174  
   175  	lock := l.getLockState()
   176  	must.False(t, lock.locked)
   177  
   178  	go func() {
   179  		err := hac1.Start(hac1Ctx, s.Run(hac1.Name, testCtx))
   180  		must.NoError(t, err)
   181  	}()
   182  
   183  	go func() {
   184  		err := hac2.Start(testCtx, s.Run(hac2.Name, testCtx))
   185  		must.NoError(t, err)
   186  	}()
   187  
   188  	time.Sleep(4 * time.Millisecond)
   189  	/*
   190  		After 4 ms (4 ms total):
   191  		* hac2 should not have tried to acquire the lock because it has an initial delay of 6ms.
   192  		* hac1 should have the lock and the service should be running.
   193  		* The first lease is not over yet, no calls to renew should have been made.
   194  	*/
   195  
   196  	lock = l.getLockState()
   197  	service := s.getServiceState()
   198  
   199  	must.True(t, lock.locked)
   200  	must.Eq(t, 1, lock.acquireCalls[hac1.Name])
   201  	must.Eq(t, 0, lock.acquireCalls[hac2.Name])
   202  
   203  	must.Eq(t, 0, lock.renewsCounter)
   204  
   205  	must.Eq(t, 1, service.startsCounter)
   206  	must.StrContains(t, hac1.Name, service.starterID)
   207  
   208  	time.Sleep(6 * time.Millisecond)
   209  	/*
   210  		After 6 ms more (10 ms total):
   211  		* hac2 should have tried to acquire the lock at least once, after the 6ms
   212  			initial delay has passed.
   213  		* hac1 should have renewed the lease once and should still hold the lock.
   214  	*/
   215  	lock = l.getLockState()
   216  	service = s.getServiceState()
   217  	must.True(t, lock.locked)
   218  	must.Eq(t, 1, lock.acquireCalls[hac1.Name])
   219  	must.Eq(t, 1, lock.acquireCalls[hac2.Name])
   220  
   221  	must.One(t, lock.renewsCounter)
   222  
   223  	must.One(t, service.startsCounter)
   224  	must.StrContains(t, hac1.Name, service.starterID)
   225  
   226  	time.Sleep(5 * time.Millisecond)
   227  
   228  	/*
   229  		After 5 ms more (15 ms total):
   230  		* hac2 should still have tried to acquire the lock just once:
   231  				initialDelay(6) + waitTime(11) = 18.
   232  		* hac1 should have renewed the lease 2 times and still hold the lock:
   233  				initialDelay(0) + renewals(2) * renewalPeriod(7) = 14.
   234  	*/
   235  
   236  	lock = l.getLockState()
   237  	service = s.getServiceState()
   238  	must.Eq(t, 1, lock.acquireCalls[hac1.Name])
   239  	must.Eq(t, 1, lock.acquireCalls[hac2.Name])
   240  
   241  	must.True(t, lock.locked)
   242  
   243  	must.Eq(t, 2, lock.renewsCounter)
   244  	must.Eq(t, 1, service.startsCounter)
   245  	must.StrContains(t, hac1.Name, service.starterID)
   246  
   247  	time.Sleep(15 * time.Millisecond)
   248  
   249  	/*
   250  		After 15 ms more (30 ms total):
   251  		* hac2 should have tried to acquire the lock 3 times:
   252  				initialDelay(6) + calls(2) * waitTime(11) = 28.
   253  		* hac1 should have renewed the lease 4 times and still hold the lock:
   254  				initialDelay(0) + renewals(4) * renewalPeriod(7) = 28.
   255  	*/
   256  
   257  	lock = l.getLockState()
   258  	service = s.getServiceState()
   259  	must.Eq(t, 1, lock.acquireCalls[hac1.Name])
   260  	must.Eq(t, 3, lock.acquireCalls[hac2.Name])
   261  
   262  	must.True(t, lock.locked)
   263  
   264  	must.Eq(t, 4, lock.renewsCounter)
   265  	must.Eq(t, 1, service.startsCounter)
   266  	must.StrContains(t, hac1.Name, service.starterID)
   267  
   268  	// Start a new lock leaser (hac3) while the others are still running, with an initial delay of 1ms.
   269  	hac3 := LockLeaser{
   270  		Name: "hac3",
   271  		locker: &lockHandler{
   272  			mockLock: &l,
   273  			callerID: "hac3",
   274  		},
   275  		renewalPeriod: time.Duration(float64(testLease) * lockLeaseRenewalFactor),
   276  		waitPeriod:    time.Duration(float64(testLease) * lockRetryBackoffFactor),
   277  		randomDelay:   1 * time.Millisecond,
   278  	}
   279  
   280  	go func() {
   281  		err := hac3.Start(testCtx, s.Run(hac3.Name, testCtx))
   282  		must.NoError(t, err)
   283  	}()
   284  
   285  	time.Sleep(15 * time.Millisecond)
   286  
   287  	/*
   288  		After 15 ms more (45 ms total):
   289  		* hac3 should have tried to acquire the lock twice, once on start and
   290  			once after waitTime(11).
   291  		* hac2 should have tried to acquire the lock 4 times:
   292  				initialDelay(6) + calls(3) * waitTime(11) = 39.
   293  		* hac1 should have renewed the lease 6 times and still hold the lock:
   294  				initialDelay(0) + renewals(6) * renewalPeriod(7) = 42.
   295  	*/
   296  
   297  	lock = l.getLockState()
   298  	service = s.getServiceState()
   299  	must.Eq(t, 1, lock.acquireCalls[hac1.Name])
   300  	must.Eq(t, 4, lock.acquireCalls[hac2.Name])
   301  	must.Eq(t, 2, lock.acquireCalls[hac3.Name])
   302  
   303  	must.True(t, lock.locked)
   304  
   305  	must.Eq(t, 6, lock.renewsCounter)
   306  	must.Eq(t, 1, service.startsCounter)
   307  	must.StrContains(t, hac1.Name, service.starterID)
   308  
   309  	// Stop hac1 and release the lock
   310  	hac1Cancel()
   311  
   312  	time.Sleep(10 * time.Millisecond)
   313  
   314  	/*
   315  		After 10 ms more (55 ms total):
   316  		* hac3 should have tried to acquire the lock 3 times.
   317  		* hac2 should have tried to acquire the lock 5 times and succeeded on the
   318  			fifth; it currently holds the lock and runs the service, with no renewals yet.
   319  		* hac1 is stopped.
   320  	*/
   321  
   322  	lock = l.getLockState()
   323  	service = s.getServiceState()
   324  	must.Eq(t, 1, lock.acquireCalls[hac1.Name])
   325  	must.Eq(t, 5, lock.acquireCalls[hac2.Name])
   326  	must.Eq(t, 3, lock.acquireCalls[hac3.Name])
   327  
   328  	must.True(t, lock.locked)
   329  
   330  	must.Eq(t, 0, lock.renewsCounter)
   331  	must.Eq(t, 2, service.startsCounter)
   332  	must.StrContains(t, hac2.Name, service.starterID)
   333  
   334  	time.Sleep(5 * time.Millisecond)
   335  
   336  	/*
   337  		After 5 ms more (60 ms total):
   338  		* hac3 should have tried to acquire the lock 3 times.
   339  		* hac2 should have renewed the lease once.
   340  		* hac1 is stopped.
   341  	*/
   342  
   343  	lock = l.getLockState()
   344  	service = s.getServiceState()
   345  	must.Eq(t, 1, lock.acquireCalls[hac1.Name])
   346  	must.Eq(t, 5, lock.acquireCalls[hac2.Name])
   347  	must.Eq(t, 3, lock.acquireCalls[hac3.Name])
   348  
   349  	must.True(t, lock.locked)
   350  
   351  	must.Eq(t, 1, lock.renewsCounter)
   352  	must.Eq(t, 2, service.startsCounter)
   353  	must.StrContains(t, hac2.Name, service.starterID)
   354  }
   355  
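        // TestFailedRenewal uses a renewal period longer than the lease (1.5x) so the
        // first renewal attempt fails, and verifies that the lock is released, the
        // service returns, and the leaser later re-acquires the lock and restarts it.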
   356  func TestFailedRenewal(t *testing.T) {
   357  	l := mockLock{
   358  		acquireCalls: map[string]int{},
   359  	}
   360  
   361  	s := mockService{}
   362  
   363  	testCtx, testCancel := context.WithCancel(context.Background())
   364  	defer testCancel()
   365  
   366  	// Set the renewal period to 1.5 * testLease (15 ms) to force an error.
   367  	hac := LockLeaser{
   368  		Name: "hac1",
   369  		locker: &lockHandler{
   370  			mockLock: &l,
   371  			callerID: "hac1",
   372  		},
   373  		renewalPeriod: time.Duration(float64(testLease) * 1.5),
   374  		waitPeriod:    time.Duration(float64(testLease) * lockRetryBackoffFactor),
   375  		randomDelay:   0,
   376  	}
   377  
   378  	lock := l.getLockState()
   379  	must.False(t, lock.locked)
   380  
   381  	go hac.Start(testCtx, s.Run(hac.Name, testCtx))
   382  
   383  	time.Sleep(5 * time.Millisecond)
   384  	/*
   385  		After 5ms, the service should be running and the lock held,
   386  		no renewals needed or performed yet.
   387  	*/
   388  
   389  	lock = l.getLockState()
   390  	service := s.getServiceState()
   391  	must.Eq(t, 1, lock.acquireCalls[hac.Name])
   392  	must.True(t, lock.locked)
   393  
   394  	must.Eq(t, 0, lock.renewsCounter)
   395  	must.Eq(t, 1, service.startsCounter)
   396  	must.StrContains(t, hac.Name, service.starterID)
   397  
   398  	time.Sleep(15 * time.Millisecond)
   399  
   400  	/*
   401  		After 15ms (20ms total) hac should have tried and failed to renew the
   402  		lock, causing the service to return; no new calls to acquire the lock
   403  		have been made yet.
   404  	*/
   405  
   406  	lock = l.getLockState()
   407  	service = s.getServiceState()
   408  	must.Eq(t, 1, lock.acquireCalls[hac.Name])
   409  	must.False(t, lock.locked)
   410  
   411  	must.Eq(t, 0, lock.renewsCounter)
   412  	must.Eq(t, 1, service.startsCounter)
   413  	must.Eq(t, "", service.starterID) // the service has returned, so no starter is recorded
   414  
   415  	time.Sleep(10 * time.Millisecond)
   416  
   417  	/*
   418  		After 10ms (30ms total) hac should have tried and succeeded at getting
   419  		the lock and the service should be running again.
   420  	*/
   421  
   422  	lock = l.getLockState()
   423  	service = s.getServiceState()
   424  	must.Eq(t, 2, lock.acquireCalls[hac.Name])
   425  	must.True(t, lock.locked)
   426  
   427  	must.Eq(t, 0, lock.renewsCounter)
   428  	must.Eq(t, 2, service.startsCounter)
   429  	must.StrContains(t, hac.Name, service.starterID)
   430  }
   431  
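        // TestStart_ProtectedFunctionError verifies that when the protected function
        // returns an error, Start also returns an error and the lock ends up released
        // with no renewals recorded.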
   432  func TestStart_ProtectedFunctionError(t *testing.T) {
   433  	l := mockLock{
   434  		acquireCalls: map[string]int{},
   435  	}
   436  
   437  	testCtx := context.Background()
   438  
   439  	hac := LockLeaser{
   440  		locker: &lockHandler{
   441  			mockLock: &l,
   442  			callerID: "hac",
   443  		},
   444  		renewalPeriod: time.Duration(float64(testLease) * lockLeaseRenewalFactor),
   445  		waitPeriod:    time.Duration(float64(testLease) * lockRetryBackoffFactor),
   446  	}
   447  
   448  	lock := l.getLockState()
   449  	must.False(t, lock.locked)
   450  
   451  	err := hac.Start(testCtx, func(ctx context.Context) error {
   452  		return errors.New("error")
   453  	})
   454  
   455  	must.Error(t, err)
   456  
   457  	lock = l.getLockState()
   458  	must.False(t, lock.locked)
   459  	must.Zero(t, lock.renewsCounter)
   460  }
   461  
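        // copyMap returns a shallow copy of the given map so the snapshots produced
        // by getLockState are independent of later updates.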
   462  func copyMap(originalMap map[string]int) map[string]int {
   463  	newMap := map[string]int{}
   464  	for k, v := range originalMap {
   465  		newMap[k] = v
   466  	}
   467  	return newMap
   468  }
   469  
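        // Test_EarlyReturn uses the "hac-early-return" caller, whose acquire attempts
        // always fail with ErrLockConflict, and checks that with earlyReturn set Start
        // returns without an error instead of blocking and retrying, leaving the lock
        // untouched.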
   470  func Test_EarlyReturn(t *testing.T) {
   471  	l := mockLock{
   472  		acquireCalls: map[string]int{},
   473  	}
   474  
   475  	testCtx := context.Background()
   476  
   477  	hac := LockLeaser{
   478  		locker: &lockHandler{
   479  			mockLock: &l,
   480  			callerID: "hac-early-return",
   481  		},
   482  		renewalPeriod: time.Duration(float64(testLease) * lockLeaseRenewalFactor),
   483  		waitPeriod:    time.Duration(float64(testLease) * lockRetryBackoffFactor),
   484  		earlyReturn:   true,
   485  	}
   486  
   487  	lock := l.getLockState()
   488  	must.False(t, lock.locked)
   489  
   490  	err := hac.Start(testCtx, func(ctx context.Context) error {
   491  		return errors.New("error")
   492  	})
   493  
   494  	must.NoError(t, err)
   495  
   496  	lock = l.getLockState()
   497  	must.False(t, lock.locked)
   498  	must.Zero(t, lock.renewsCounter)
   499  }