github.com/hashicorp/nomad/api@v0.0.0-20240306165712-3193ac204f65/locks_test.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package api 5 6 import ( 7 "context" 8 "errors" 9 "sync" 10 "testing" 11 "time" 12 13 "github.com/shoenig/test/must" 14 ) 15 16 var testLease = 10 * time.Millisecond 17 18 type mockLock struct { 19 locked bool 20 acquireCalls map[string]int 21 renewsCounter int 22 mu sync.Mutex 23 24 leaseStartTime time.Time 25 } 26 27 func (ml *mockLock) acquire(_ context.Context, callerID string) (string, error) { 28 ml.mu.Lock() 29 defer ml.mu.Unlock() 30 31 if callerID == "hac-early-return" { 32 return "", ErrLockConflict 33 } 34 35 ml.acquireCalls[callerID] += 1 36 if ml.locked { 37 return "", nil 38 } 39 40 ml.locked = true 41 ml.leaseStartTime = time.Now() 42 ml.renewsCounter = 0 43 return "lockPath", nil 44 } 45 46 type lockHandler struct { 47 *mockLock 48 callerID string 49 } 50 51 func (lh *lockHandler) LockTTL() time.Duration { 52 return testLease 53 } 54 55 func (lh *lockHandler) Acquire(ctx context.Context) (string, error) { 56 return lh.acquire(ctx, lh.callerID) 57 } 58 59 func (ml *mockLock) Release(_ context.Context) error { 60 ml.mu.Lock() 61 defer ml.mu.Unlock() 62 63 if !ml.locked { 64 return errors.New("lock not locked") 65 } 66 67 ml.locked = false 68 ml.renewsCounter = 0 69 return nil 70 } 71 72 // The behavior of renew is not an exact replication of 73 // the lock work, its intended to test the behavior of the 74 // multiple instances running. 75 func (ml *mockLock) Renew(_ context.Context) error { 76 ml.mu.Lock() 77 defer ml.mu.Unlock() 78 79 if !ml.locked { 80 return errors.New("error") 81 } 82 83 if time.Since(ml.leaseStartTime) > testLease { 84 ml.locked = false 85 return ErrLockConflict 86 } 87 88 ml.leaseStartTime = time.Now() 89 ml.renewsCounter += 1 90 return nil 91 } 92 93 func (ml *mockLock) getLockState() mockLock { 94 ml.mu.Lock() 95 defer ml.mu.Unlock() 96 97 return mockLock{ 98 locked: ml.locked, 99 acquireCalls: copyMap(ml.acquireCalls), 100 renewsCounter: ml.renewsCounter, 101 } 102 } 103 104 type mockService struct { 105 mockLock 106 107 mu sync.Mutex 108 startsCounter int 109 starterID string 110 } 111 112 func (ms *mockService) Run(callerID string, _ context.Context) func(ctx context.Context) error { 113 return func(ctx context.Context) error { 114 ms.mu.Lock() 115 ms.startsCounter += 1 116 ms.starterID = callerID 117 ms.mu.Unlock() 118 119 <-ctx.Done() 120 121 ms.mu.Lock() 122 ms.starterID = "" 123 ms.mu.Unlock() 124 125 return nil 126 } 127 } 128 129 func (ms *mockService) getServiceState() mockService { 130 ms.mu.Lock() 131 defer ms.mu.Unlock() 132 133 return mockService{ 134 startsCounter: ms.startsCounter, 135 starterID: ms.starterID, 136 } 137 } 138 139 func TestAcquireLock_MultipleInstances(t *testing.T) { 140 l := mockLock{ 141 acquireCalls: map[string]int{}, 142 } 143 144 s := mockService{} 145 146 testCtx := context.Background() 147 148 // Set up independent contexts to test the switch when one controller stops 149 hac1Ctx, hac1Cancel := context.WithCancel(testCtx) 150 defer hac1Cancel() 151 152 // Wait time on hac1 is 0, it should always get the lock. 153 hac1 := LockLeaser{ 154 Name: "hac1", 155 locker: &lockHandler{ 156 mockLock: &l, 157 callerID: "hac1", 158 }, 159 renewalPeriod: time.Duration(float64(testLease) * lockLeaseRenewalFactor), 160 waitPeriod: time.Duration(float64(testLease) * lockRetryBackoffFactor), 161 randomDelay: 0, 162 } 163 164 hac2 := LockLeaser{ 165 Name: "hac2", 166 locker: &lockHandler{ 167 mockLock: &l, 168 callerID: "hac2", 169 }, 170 renewalPeriod: time.Duration(float64(testLease) * lockLeaseRenewalFactor), 171 waitPeriod: time.Duration(float64(testLease) * lockRetryBackoffFactor), 172 randomDelay: 6 * time.Millisecond, 173 } 174 175 lock := l.getLockState() 176 must.False(t, lock.locked) 177 178 go func() { 179 err := hac1.Start(hac1Ctx, s.Run(hac1.Name, testCtx)) 180 must.NoError(t, err) 181 }() 182 183 go func() { 184 err := hac2.Start(testCtx, s.Run(hac2.Name, testCtx)) 185 must.NoError(t, err) 186 }() 187 188 time.Sleep(4 * time.Millisecond) 189 /* 190 After 4 ms more (4 ms total): 191 * hac2 should not have tried to acquire the lock because it has an initial delay of 6ms. 192 * hac1 should have the lock and the service should be running. 193 * The first lease is not over yet, no calls to renew should have been made. 194 */ 195 196 lock = l.getLockState() 197 service := s.getServiceState() 198 199 must.True(t, lock.locked) 200 must.Eq(t, 1, lock.acquireCalls[hac1.Name]) 201 must.Eq(t, 0, lock.acquireCalls[hac2.Name]) 202 203 must.Eq(t, 0, lock.renewsCounter) 204 205 must.Eq(t, 1, service.startsCounter) 206 must.StrContains(t, hac1.Name, service.starterID) 207 208 time.Sleep(6 * time.Millisecond) 209 /* 210 After 6 ms more (10 ms total): 211 * hac2 should have tried to acquire the lock at least once, after the 6ms 212 initial delay has passed. 213 * hc1 should have renewed once the lease and still hold the lock. 214 */ 215 lock = l.getLockState() 216 service = s.getServiceState() 217 must.True(t, lock.locked) 218 must.Eq(t, 1, lock.acquireCalls[hac1.Name]) 219 must.Eq(t, 1, lock.acquireCalls[hac2.Name]) 220 221 must.One(t, lock.renewsCounter) 222 223 must.One(t, service.startsCounter) 224 must.StrContains(t, hac1.Name, service.starterID) 225 226 time.Sleep(5 * time.Millisecond) 227 228 /* 229 After 5 ms more (15 ms total): 230 * hac2 should have tried to acquire the lock still just once: 231 initialDelay(6) + waitTime(11) = 18. 232 * hac1 should have renewed the lease 2 times and still hold the lock: 233 initialDelay(0) + renewals(2) * renewalPeriod(7) = 14. 234 */ 235 236 lock = l.getLockState() 237 service = s.getServiceState() 238 must.Eq(t, 1, lock.acquireCalls[hac1.Name]) 239 must.Eq(t, 1, lock.acquireCalls[hac2.Name]) 240 241 must.True(t, lock.locked) 242 243 must.Eq(t, 2, lock.renewsCounter) 244 must.Eq(t, 1, service.startsCounter) 245 must.StrContains(t, hac1.Name, service.starterID) 246 247 time.Sleep(15 * time.Millisecond) 248 249 /* 250 After 15 ms more (30 ms total): 251 * hac2 should have tried to acquire the lock 3 times: 252 initialDelay(6) + calls(2)* waitTime(11) = 28. 253 * hac1 should have renewed the lease 4 times and still hold the lock: 254 initialDelay(0) + renewals(4) * renewalPeriod(7) = 28. 255 */ 256 257 lock = l.getLockState() 258 service = s.getServiceState() 259 must.Eq(t, 1, lock.acquireCalls[hac1.Name]) 260 must.Eq(t, 3, lock.acquireCalls[hac2.Name]) 261 262 must.True(t, lock.locked) 263 264 must.Eq(t, 4, lock.renewsCounter) 265 must.Eq(t, 1, service.startsCounter) 266 must.StrContains(t, hac1.Name, service.starterID) 267 268 // Start a new instance of the service with ha running, initial delay of 1ms 269 hac3 := LockLeaser{ 270 Name: "hac3", 271 locker: &lockHandler{ 272 mockLock: &l, 273 callerID: "hac3", 274 }, 275 renewalPeriod: time.Duration(float64(testLease) * lockLeaseRenewalFactor), 276 waitPeriod: time.Duration(float64(testLease) * lockRetryBackoffFactor), 277 randomDelay: 1 * time.Millisecond, 278 } 279 280 go func() { 281 err := hac3.Start(testCtx, s.Run(hac3.Name, testCtx)) 282 must.NoError(t, err) 283 }() 284 285 time.Sleep(15 * time.Millisecond) 286 287 /* 288 After 15 ms more (45 ms total): 289 * hac3 should have tried to acquire the lock twice, once on start and 290 once after waitTime(11). 291 * hac2 should have tried to acquire the lock 4 times: 292 initialDelay(6) + calls(3) * waitTime(11) = 39. 293 * hac1 should have renewed the lease 4 times and still hold the lock: 294 initialDelay(0) + renewals(6) * renewalPeriod(7) = 42. 295 */ 296 297 lock = l.getLockState() 298 service = s.getServiceState() 299 must.Eq(t, 1, lock.acquireCalls[hac1.Name]) 300 must.Eq(t, 4, lock.acquireCalls[hac2.Name]) 301 must.Eq(t, 2, lock.acquireCalls[hac3.Name]) 302 303 must.True(t, lock.locked) 304 305 must.Eq(t, 6, lock.renewsCounter) 306 must.Eq(t, 1, service.startsCounter) 307 must.StrContains(t, hac1.Name, service.starterID) 308 309 // Stop hac1 and release the lock 310 hac1Cancel() 311 312 time.Sleep(10 * time.Millisecond) 313 314 /* 315 After 10 ms more (55 ms total): 316 * hac3 should have tried to acquire the lock 3 times. 317 * hac2 should have tried to acquire the lock 5 times and succeeded on the 318 the fifth, is currently holding the lock and Run the service, no renewals. 319 * hc1 is stopped. 320 */ 321 322 lock = l.getLockState() 323 service = s.getServiceState() 324 must.Eq(t, 1, lock.acquireCalls[hac1.Name]) 325 must.Eq(t, 5, lock.acquireCalls[hac2.Name]) 326 must.Eq(t, 3, lock.acquireCalls[hac3.Name]) 327 328 must.True(t, lock.locked) 329 330 must.Eq(t, 0, lock.renewsCounter) 331 must.Eq(t, 2, service.startsCounter) 332 must.StrContains(t, hac2.Name, service.starterID) 333 334 time.Sleep(5 * time.Millisecond) 335 336 /* 337 After 5 ms more (60 ms total): 338 * hac3 should have tried to acquire the lock 3 times. 339 * hac2 should have renewed the lock once. 340 * hc1 is stopped. 341 */ 342 343 lock = l.getLockState() 344 service = s.getServiceState() 345 must.Eq(t, 1, lock.acquireCalls[hac1.Name]) 346 must.Eq(t, 5, lock.acquireCalls[hac2.Name]) 347 must.Eq(t, 3, lock.acquireCalls[hac3.Name]) 348 349 must.True(t, lock.locked) 350 351 must.Eq(t, 1, lock.renewsCounter) 352 must.Eq(t, 2, service.startsCounter) 353 must.StrContains(t, hac2.Name, service.starterID) 354 } 355 356 func TestFailedRenewal(t *testing.T) { 357 l := mockLock{ 358 acquireCalls: map[string]int{}, 359 } 360 361 s := mockService{} 362 363 testCtx, testCancel := context.WithCancel(context.Background()) 364 defer testCancel() 365 366 // Set the renewal period to 1.5 * testLease (15 ms) to force and error. 367 hac := LockLeaser{ 368 Name: "hac1", 369 locker: &lockHandler{ 370 mockLock: &l, 371 callerID: "hac1", 372 }, 373 renewalPeriod: time.Duration(float64(testLease) * 1.5), 374 waitPeriod: time.Duration(float64(testLease) * lockRetryBackoffFactor), 375 randomDelay: 0, 376 } 377 378 lock := l.getLockState() 379 must.False(t, lock.locked) 380 381 go hac.Start(testCtx, s.Run(hac.Name, testCtx)) 382 383 time.Sleep(5 * time.Millisecond) 384 /* 385 After 5ms, the service should be running and the lock held, 386 no renewals needed or performed yet. 387 */ 388 389 lock = l.getLockState() 390 service := s.getServiceState() 391 must.Eq(t, 1, lock.acquireCalls[hac.Name]) 392 must.True(t, lock.locked) 393 394 must.Eq(t, 0, lock.renewsCounter) 395 must.Eq(t, 1, service.startsCounter) 396 must.StrContains(t, hac.Name, service.starterID) 397 398 time.Sleep(15 * time.Millisecond) 399 400 /* 401 After 15ms (20ms total) hac should have tried and failed at renewing the 402 lock, causing the service to return, no new calls to acquire the lock yet 403 either. 404 */ 405 406 lock = l.getLockState() 407 service = s.getServiceState() 408 must.Eq(t, 1, lock.acquireCalls[hac.Name]) 409 must.False(t, lock.locked) 410 411 must.Eq(t, 0, lock.renewsCounter) 412 must.Eq(t, 1, service.startsCounter) 413 must.StrContains(t, hac.Name, "") 414 415 time.Sleep(10 * time.Millisecond) 416 417 /* 418 After 10ms (30ms total) hac should have tried and succeeded at getting 419 the lock and the service should be running again. 420 */ 421 422 lock = l.getLockState() 423 service = s.getServiceState() 424 must.Eq(t, 2, lock.acquireCalls[hac.Name]) 425 must.True(t, lock.locked) 426 427 must.Eq(t, 0, lock.renewsCounter) 428 must.Eq(t, 2, service.startsCounter) 429 must.StrContains(t, hac.Name, service.starterID) 430 } 431 432 func TestStart_ProtectedFunctionError(t *testing.T) { 433 l := mockLock{ 434 acquireCalls: map[string]int{}, 435 } 436 437 testCtx := context.Background() 438 439 hac := LockLeaser{ 440 locker: &lockHandler{ 441 mockLock: &l, 442 callerID: "hac", 443 }, 444 renewalPeriod: time.Duration(float64(testLease) * lockLeaseRenewalFactor), 445 waitPeriod: time.Duration(float64(testLease) * lockRetryBackoffFactor), 446 } 447 448 lock := l.getLockState() 449 must.False(t, lock.locked) 450 451 err := hac.Start(testCtx, func(ctx context.Context) error { 452 return errors.New("error") 453 }) 454 455 must.Error(t, err) 456 457 lock = l.getLockState() 458 must.False(t, lock.locked) 459 must.Zero(t, lock.renewsCounter) 460 } 461 462 func copyMap(originalMap map[string]int) map[string]int { 463 newMap := map[string]int{} 464 for k, v := range originalMap { 465 newMap[k] = v 466 } 467 return newMap 468 } 469 470 func Test_EarlyReturn(t *testing.T) { 471 l := mockLock{ 472 acquireCalls: map[string]int{}, 473 } 474 475 testCtx := context.Background() 476 477 hac := LockLeaser{ 478 locker: &lockHandler{ 479 mockLock: &l, 480 callerID: "hac-early-return", 481 }, 482 renewalPeriod: time.Duration(float64(testLease) * lockLeaseRenewalFactor), 483 waitPeriod: time.Duration(float64(testLease) * lockRetryBackoffFactor), 484 earlyReturn: true, 485 } 486 487 lock := l.getLockState() 488 must.False(t, lock.locked) 489 490 err := hac.Start(testCtx, func(ctx context.Context) error { 491 return errors.New("error") 492 }) 493 494 must.NoError(t, err) 495 496 lock = l.getLockState() 497 must.False(t, lock.locked) 498 must.Zero(t, lock.renewsCounter) 499 }