github.com/minio/minio@v0.0.0-20240328213742-3f72439b8a27/internal/dsync/drwmutex.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package dsync

import (
	"context"
	"errors"
	"math/rand"
	"sort"
	"strconv"
	"sync"
	"time"

	xioutil "github.com/minio/minio/internal/ioutil"
	"github.com/minio/minio/internal/mcontext"
	"github.com/minio/pkg/v2/console"
	"github.com/minio/pkg/v2/env"
)

// Indicates whether logging is enabled.
var dsyncLog bool

// Retry unit interval
var lockRetryMinInterval time.Duration

var lockRetryBackOff func(*rand.Rand, uint) time.Duration

func init() {
	// Check for the _MINIO_DSYNC_TRACE env variable; if set, logging is enabled for failed REST operations.
	dsyncLog = env.Get("_MINIO_DSYNC_TRACE", "0") == "1"

	lockRetryMinInterval = 250 * time.Millisecond
	if lri := env.Get("_MINIO_LOCK_RETRY_INTERVAL", ""); lri != "" {
		v, err := strconv.Atoi(lri)
		if err != nil {
			panic(err)
		}
		lockRetryMinInterval = time.Duration(v) * time.Millisecond
	}

	lockRetryBackOff = backoffWait(
		lockRetryMinInterval,
		100*time.Millisecond,
		5*time.Second,
	)
}

func log(format string, data ...interface{}) {
	if dsyncLog {
		console.Printf(format, data...)
	}
}

const (
	// drwMutexAcquireTimeout - default tolerance limit to wait for lock acquisition before timing out.
	drwMutexAcquireTimeout = 1 * time.Second // 1 second.

	// drwMutexRefreshCallTimeout - default timeout for the refresh call
	drwMutexRefreshCallTimeout = 5 * time.Second

	// drwMutexUnlockCallTimeout - default timeout for the unlock call
	drwMutexUnlockCallTimeout = 30 * time.Second

	// drwMutexForceUnlockCallTimeout - default timeout for the force unlock call
	drwMutexForceUnlockCallTimeout = 30 * time.Second

	// drwMutexRefreshInterval - default interval between two refresh calls
	drwMutexRefreshInterval = 10 * time.Second

	drwMutexInfinite = 1<<63 - 1
)

// Timeouts are timeouts for specific operations.
type Timeouts struct {
	// Acquire - tolerance limit to wait for lock acquisition before timing out.
	Acquire time.Duration

	// RefreshCall - timeout for the refresh call
	RefreshCall time.Duration

	// UnlockCall - timeout for the unlock call
	UnlockCall time.Duration

	// ForceUnlockCall - timeout for the force unlock call
	ForceUnlockCall time.Duration
}

// DefaultTimeouts contains default timeouts.
var DefaultTimeouts = Timeouts{
	Acquire:         drwMutexAcquireTimeout,
	RefreshCall:     drwMutexRefreshCallTimeout,
	UnlockCall:      drwMutexUnlockCallTimeout,
	ForceUnlockCall: drwMutexForceUnlockCallTimeout,
}

// A DRWMutex is a distributed mutual exclusion lock.
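//
// A minimal usage sketch (illustrative, not part of the upstream file),
// assuming a *Dsync that has already been configured with its NetLocker
// clients; the resource name, id and source values are placeholders:
//
//	dm := NewDRWMutex(ds, "bucket/object")
//	dm.Lock("uid", "source") // or GetLock(...) for a bounded wait
//	// ... critical section ...
//	dm.Unlock(context.Background())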
type DRWMutex struct {
	Names                []string
	writeLocks           []string // Array of nodes that granted a write lock
	readLocks            []string // Array of nodes that granted reader locks
	rng                  *rand.Rand
	m                    sync.Mutex // Mutex to prevent multiple simultaneous locks from this node
	clnt                 *Dsync
	cancelRefresh        context.CancelFunc
	refreshInterval      time.Duration
	lockRetryMinInterval time.Duration
}

// Granted - represents a structure of a granted lock.
type Granted struct {
	index   int
	lockUID string // Locked if set with UID string, unlocked if empty
}

func (g *Granted) isLocked() bool {
	return isLocked(g.lockUID)
}

func isLocked(uid string) bool {
	return len(uid) > 0
}

// NewDRWMutex - initializes a new dsync RW mutex.
func NewDRWMutex(clnt *Dsync, names ...string) *DRWMutex {
	restClnts, _ := clnt.GetLockers()
	sort.Strings(names)
	return &DRWMutex{
		writeLocks:           make([]string, len(restClnts)),
		readLocks:            make([]string, len(restClnts)),
		Names:                names,
		clnt:                 clnt,
		rng:                  rand.New(&lockedRandSource{src: rand.NewSource(time.Now().UTC().UnixNano())}),
		refreshInterval:      drwMutexRefreshInterval,
		lockRetryMinInterval: lockRetryMinInterval,
	}
}

// Lock holds a write lock on dm.
//
// If the lock is already in use, the calling goroutine
// blocks until the mutex is available.
func (dm *DRWMutex) Lock(id, source string) {
	isReadLock := false
	dm.lockBlocking(context.Background(), nil, id, source, isReadLock, Options{
		Timeout: drwMutexInfinite,
	})
}

// Options lock options.
type Options struct {
	Timeout       time.Duration
	RetryInterval time.Duration
}

// GetLock tries to get a write lock on dm before the timeout elapses.
//
// If the lock is already in use, the calling goroutine blocks until
// either the mutex becomes available (returning true) or the timeout
// elapses (returning false).
func (dm *DRWMutex) GetLock(ctx context.Context, cancel context.CancelFunc, id, source string, opts Options) (locked bool) {
	isReadLock := false
	return dm.lockBlocking(ctx, cancel, id, source, isReadLock, opts)
}

// RLock holds a read lock on dm.
//
// If one or more read locks are already in use, it will grant another lock.
// Otherwise the calling goroutine blocks until the mutex is available.
func (dm *DRWMutex) RLock(id, source string) {
	isReadLock := true
	dm.lockBlocking(context.Background(), nil, id, source, isReadLock, Options{
		Timeout: drwMutexInfinite,
	})
}

// GetRLock tries to get a read lock on dm before the timeout elapses.
//
// If one or more read locks are already in use, it will grant another lock.
// Otherwise the calling goroutine blocks until either the mutex becomes
// available (returning true) or the timeout elapses (returning false).
func (dm *DRWMutex) GetRLock(ctx context.Context, cancel context.CancelFunc, id, source string, opts Options) (locked bool) {
	isReadLock := true
	return dm.lockBlocking(ctx, cancel, id, source, isReadLock, opts)
}

// lockBlocking will try to acquire either a read or a write lock.
//
// The function loops, using a built-in randomized back-off algorithm,
// until either the lock is acquired successfully or more time has
// elapsed than the timeout value.
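//
// Illustrative quorum arithmetic (an added example, derived from the code
// below, not part of the original comment): with 8 lockers, tolerance is
// 8/2 = 4 and quorum is 8-4 = 4; because quorum == tolerance, a write lock
// bumps quorum to 5 so that two competing writers can never both reach
// quorum. With 5 lockers, tolerance is 2 and quorum is 3 for both read and
// write locks.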
func (dm *DRWMutex) lockBlocking(ctx context.Context, lockLossCallback func(), id, source string, isReadLock bool, opts Options) (locked bool) {
	restClnts, _ := dm.clnt.GetLockers()

	// Create lock array to capture the successful lockers
	locks := make([]string, len(restClnts))

	// Add total timeout
	ctx, cancel := context.WithTimeout(ctx, opts.Timeout)
	defer cancel()

	// Tolerance is not configurable; it defaults to half of the locker clients.
	tolerance := len(restClnts) / 2

	// Quorum is effectively the total number of clients minus the tolerance limit.
	quorum := len(restClnts) - tolerance
	if !isReadLock {
		// For write locks, as a special case to avoid split brain,
		// we make sure to acquire quorum + 1 when tolerance is
		// exactly half of the total locker clients.
		if quorum == tolerance {
			quorum++
		}
	}

	log("lockBlocking %s/%s for %#v: lockType readLock(%t), additional opts: %#v, quorum: %d, tolerance: %d, lockClients: %d\n", id, source, dm.Names, isReadLock, opts, quorum, tolerance, len(restClnts))

	tolerance = len(restClnts) - quorum
	attempt := uint(0)

	for {
		select {
		case <-ctx.Done():
			return false
		default:
			// Try to acquire the lock.
			if locked = lock(ctx, dm.clnt, &locks, id, source, isReadLock, tolerance, quorum, dm.Names...); locked {
				dm.m.Lock()

				// If successful, copy the array into the object.
				if isReadLock {
					copy(dm.readLocks, locks)
				} else {
					copy(dm.writeLocks, locks)
				}

				dm.m.Unlock()
				log("lockBlocking %s/%s for %#v: granted\n", id, source, dm.Names)

				// Refresh the lock continuously and cancel if the lock loses quorum.
				dm.startContinuousLockRefresh(lockLossCallback, id, source, quorum)

				return locked
			}

			switch {
			case opts.RetryInterval < 0:
				return false
			case opts.RetryInterval > 0:
				time.Sleep(opts.RetryInterval)
			default:
				attempt++
				time.Sleep(lockRetryBackOff(dm.rng, attempt))
			}
		}
	}
}

func (dm *DRWMutex) startContinuousLockRefresh(lockLossCallback func(), id, source string, quorum int) {
	ctx, cancel := context.WithCancel(context.Background())

	dm.m.Lock()
	dm.cancelRefresh = cancel
	dm.m.Unlock()

	go func() {
		defer cancel()

		refreshTimer := time.NewTimer(dm.refreshInterval)
		defer refreshTimer.Stop()

		for {
			select {
			case <-ctx.Done():
				return
			case <-refreshTimer.C:
				noQuorum, err := refreshLock(ctx, dm.clnt, id, source, quorum)
				if err == nil && noQuorum {
					// Clean the lock locally and on remote nodes
					forceUnlock(ctx, dm.clnt, id)
					// Execute the caller's lock-loss callback
					if lockLossCallback != nil {
						lockLossCallback()
					}
					return
				}

				refreshTimer.Reset(dm.refreshInterval)
			}
		}
	}()
}

func forceUnlock(ctx context.Context, ds *Dsync, id string) {
	ctx, cancel := context.WithTimeout(ctx, ds.Timeouts.ForceUnlockCall)
	defer cancel()

	restClnts, _ := ds.GetLockers()

	args := LockArgs{
		UID: id,
	}

	var wg sync.WaitGroup
	for index, c := range restClnts {
		wg.Add(1)
		// Send force-unlock request to all nodes
		go func(index int, c NetLocker) {
			defer wg.Done()
			c.ForceUnlock(ctx, args)
		}(index, c)
	}
	wg.Wait()
}

type refreshResult struct {
	offline   bool
	refreshed bool
}

// refreshLock refreshes the given lock on all nodes and reports whether the
// lock no longer exists on enough nodes to maintain quorum.
func refreshLock(ctx context.Context, ds *Dsync, id, source string, quorum int) (bool, error) {
	restClnts, _ := ds.GetLockers()

	// Create buffered channel of size equal to total number of nodes.
	ch := make(chan refreshResult, len(restClnts))
	var wg sync.WaitGroup

	args := LockArgs{
		UID: id,
	}

	for index, c := range restClnts {
		wg.Add(1)
		// Send refresh request to all nodes
		go func(index int, c NetLocker) {
			defer wg.Done()

			if c == nil {
				ch <- refreshResult{offline: true}
				return
			}

			ctx, cancel := context.WithTimeout(ctx, ds.Timeouts.RefreshCall)
			defer cancel()

			refreshed, err := c.Refresh(ctx, args)
			if err != nil {
				ch <- refreshResult{offline: true}
				log("dsync: Refresh call failed with %s for %#v at %s\n", err, args, c)
			} else {
				ch <- refreshResult{refreshed: refreshed}
				if !refreshed {
					log("dsync: Refresh returned false for %#v at %s\n", args, c)
				}
			}
		}(index, c)
	}

	// Wait until we have either
	//
	// a) received all refresh responses
	// b) received enough responses that the quorum outcome is already decided
	// c) timed out
	//
	lockNotFound, lockRefreshed := 0, 0
	done := false

	for i := 0; i < len(restClnts); i++ {
		select {
		case refreshResult := <-ch:
			if refreshResult.offline {
				continue
			}
			if refreshResult.refreshed {
				lockRefreshed++
			} else {
				lockNotFound++
			}
			if lockRefreshed >= quorum || lockNotFound > len(restClnts)-quorum {
				done = true
			}
		case <-ctx.Done():
			// Refreshing is canceled
			return false, ctx.Err()
		}
		if done {
			break
		}
	}

	// We may have some unused results in ch, release them async.
	go func() {
		wg.Wait()
		xioutil.SafeClose(ch)
		for range ch {
		}
	}()

	noQuorum := lockNotFound > len(restClnts)-quorum
	return noQuorum, nil
}

// lock tries to acquire the distributed lock, reporting whether it succeeded.
func lock(ctx context.Context, ds *Dsync, locks *[]string, id, source string, isReadLock bool, tolerance, quorum int, names ...string) bool {
	for i := range *locks {
		(*locks)[i] = ""
	}

	restClnts, owner := ds.GetLockers()

	// Create buffered channel of size equal to total number of nodes.
	ch := make(chan Granted, len(restClnts))
	var wg sync.WaitGroup

	args := LockArgs{
		Owner:     owner,
		UID:       id,
		Resources: names,
		Source:    source,
		Quorum:    quorum,
	}

	// Combined timeout for the lock attempt.
	ctx, cancel := context.WithTimeout(ctx, ds.Timeouts.Acquire)
	defer cancel()

	// Special context for NetLockers - do not use timeouts.
	// Also, pass the trace context info, if found, for debugging.
	netLockCtx := context.Background()
	tc, ok := ctx.Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
	if ok {
		netLockCtx = context.WithValue(netLockCtx, mcontext.ContextTraceKey, tc)
	}

	for index, c := range restClnts {
		wg.Add(1)
		// Broadcast lock request to all nodes
		go func(index int, isReadLock bool, c NetLocker) {
			defer wg.Done()

			g := Granted{index: index}
			if c == nil {
				log("dsync: nil locker\n")
				ch <- g
				return
			}

			var locked bool
			var err error
			if isReadLock {
				if locked, err = c.RLock(netLockCtx, args); err != nil {
					log("dsync: RLock call failed with %s for %#v at %s\n", err, args, c)
				}
			} else {
				if locked, err = c.Lock(netLockCtx, args); err != nil {
					log("dsync: Lock call failed with %s for %#v at %s\n", err, args, c)
				}
			}
			if locked {
				g.lockUID = args.UID
			}
			ch <- g
		}(index, isReadLock, c)
	}

	// Wait until we have either
	//
	// a) received all lock responses
	// b) received too many failed locks for quorum to still be possible
	// c) timed out
	//
	i, locksFailed := 0, 0
	done := false

	for ; i < len(restClnts); i++ { // Loop until we have processed all responses
		select {
		case grant := <-ch:
			if grant.isLocked() {
				// Mark that this node has acquired the lock
				(*locks)[grant.index] = grant.lockUID
			} else {
				locksFailed++
				if locksFailed > tolerance {
					// We know that we are not going to get the lock anymore,
					// so exit out and release any locks that did get acquired
					done = true
				}
			}
		case <-ctx.Done():
			// Treat timed-out lockers as failed
			locksFailed++
			if locksFailed > tolerance {
				// We know that we are not going to get the lock anymore,
				// so exit out and release any locks that did get acquired
				done = true
			}
		}

		if done {
			break
		}
	}

	quorumLocked := checkQuorumLocked(locks, quorum) && locksFailed <= tolerance
	if !quorumLocked {
		log("dsync: Unable to acquire lock in quorum %#v\n", args)
		// Release all acquired locks without quorum.
		if !releaseAll(ctx, ds, tolerance, owner, locks, isReadLock, restClnts, names...) {
			log("Unable to release acquired locks, these locks will expire automatically %#v\n", args)
		}
	}

	// We may have some unused results in ch, release them async.
	go func() {
		wg.Wait()
		xioutil.SafeClose(ch)
		for grantToBeReleased := range ch {
			if grantToBeReleased.isLocked() {
				// Release abandoned lock
				log("Releasing abandoned lock\n")
				sendRelease(ctx, ds, restClnts[grantToBeReleased.index],
					owner, grantToBeReleased.lockUID, isReadLock, names...)
			}
		}
	}()

	return quorumLocked
}

// checkFailedUnlocks determines whether we have sufficiently unlocked all
// resources to ensure no deadlocks for future callers.
func checkFailedUnlocks(locks []string, tolerance int) bool {
	unlocksFailed := 0
	for lockID := range locks {
		if isLocked(locks[lockID]) {
			unlocksFailed++
		}
	}

	// If unlock failures exceed the tolerance limit for this unlocker
	// instance, we should let the caller know that the lock has not been
	// successfully released yet.
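	//
	// Illustrative example (added, derived from the checks below): with 4
	// lockers and tolerance 2, len(locks)-tolerance == tolerance, so even 2
	// failed unlocks already count as failure (>= tolerance); with 5 lockers
	// and tolerance 2 the ordinary '> tolerance' check applies instead.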
	if len(locks)-tolerance == tolerance {
		// In case of split-brain scenarios, where tolerance is exactly
		// half of len(locks), we need to make sure we have unlocked
		// up to tolerance+1 - especially for RUnlock - to ensure that
		// we don't end up with active read locks on the resource after
		// unlocking only half of the lockers.
		return unlocksFailed >= tolerance
	}
	return unlocksFailed > tolerance
}

// checkQuorumLocked determines whether we have locked the required quorum of underlying locks or not
func checkQuorumLocked(locks *[]string, quorum int) bool {
	count := 0
	for _, uid := range *locks {
		if isLocked(uid) {
			count++
		}
	}

	return count >= quorum
}

// releaseAll releases all locks that are marked as locked
func releaseAll(ctx context.Context, ds *Dsync, tolerance int, owner string, locks *[]string, isReadLock bool, restClnts []NetLocker, names ...string) bool {
	var wg sync.WaitGroup
	for lockID := range restClnts {
		wg.Add(1)
		go func(lockID int) {
			defer wg.Done()
			if sendRelease(ctx, ds, restClnts[lockID], owner, (*locks)[lockID], isReadLock, names...) {
				(*locks)[lockID] = ""
			}
		}(lockID)
	}
	wg.Wait()

	// Return true if releaseAll was successful, otherwise return 'false'
	// to indicate we haven't sufficiently unlocked lockers to avoid deadlocks.
	//
	// Caller may use this as an indication to call again.
	return !checkFailedUnlocks(*locks, tolerance)
}

// Unlock unlocks the write lock.
//
// It is a run-time error if dm is not locked on entry to Unlock.
func (dm *DRWMutex) Unlock(ctx context.Context) {
	dm.m.Lock()
	dm.cancelRefresh()
	dm.m.Unlock()

	restClnts, owner := dm.clnt.GetLockers()
	// Create temp array on stack.
	locks := make([]string, len(restClnts))

	{
		dm.m.Lock()
		defer dm.m.Unlock()

		// Check whether at least one write lock UID is set in the writeLocks array
		lockFound := false
		for _, uid := range dm.writeLocks {
			if isLocked(uid) {
				lockFound = true
				break
			}
		}
		if !lockFound {
			panic("Trying to Unlock() while no Lock() is active")
		}

		// Copy write locks to stack array
		copy(locks, dm.writeLocks)
	}

	// Tolerance is not configurable; it defaults to half of the locker clients.
	tolerance := len(restClnts) / 2

	isReadLock := false
	for !releaseAll(ctx, dm.clnt, tolerance, owner, &locks, isReadLock, restClnts, dm.Names...) {
		time.Sleep(time.Duration(dm.rng.Float64() * float64(dm.lockRetryMinInterval)))
	}
}

// RUnlock releases a read lock held on dm.
//
// It is a run-time error if dm is not locked on entry to RUnlock.
func (dm *DRWMutex) RUnlock(ctx context.Context) {
	dm.m.Lock()
	dm.cancelRefresh()
	dm.m.Unlock()

	restClnts, owner := dm.clnt.GetLockers()
	// Create temp array on stack.
	locks := make([]string, len(restClnts))

	{
		dm.m.Lock()
		defer dm.m.Unlock()

		// Check whether at least one read lock UID is set in the readLocks array
		lockFound := false
		for _, uid := range dm.readLocks {
			if isLocked(uid) {
				lockFound = true
				break
			}
		}
		if !lockFound {
			panic("Trying to RUnlock() while no RLock() is active")
		}

		// Copy read locks to stack array
		copy(locks, dm.readLocks)
	}

	// Tolerance is not configurable; it defaults to half of the locker clients.
	tolerance := len(restClnts) / 2

	isReadLock := true
	for !releaseAll(ctx, dm.clnt, tolerance, owner, &locks, isReadLock, restClnts, dm.Names...) {
		time.Sleep(time.Duration(dm.rng.Float64() * float64(dm.lockRetryMinInterval)))
	}
}

// sendRelease sends a release message to a node that previously granted a lock
func sendRelease(ctx context.Context, ds *Dsync, c NetLocker, owner string, uid string, isReadLock bool, names ...string) bool {
	if c == nil {
		log("dsync: Unable to send release: %s\n", errors.New("netLocker is offline"))
		return false
	}

	if len(uid) == 0 {
		return false
	}

	args := LockArgs{
		Owner:     owner,
		UID:       uid,
		Resources: names,
	}

	netLockCtx, cancel := context.WithTimeout(context.Background(), ds.Timeouts.UnlockCall)
	defer cancel()

	tc, ok := ctx.Value(mcontext.ContextTraceKey).(*mcontext.TraceCtxt)
	if ok {
		netLockCtx = context.WithValue(netLockCtx, mcontext.ContextTraceKey, tc)
	}

	if isReadLock {
		if _, err := c.RUnlock(netLockCtx, args); err != nil {
			log("dsync: RUnlock call failed with %s for %#v at %s\n", err, args, c)
			return false
		}
	} else {
		if _, err := c.Unlock(netLockCtx, args); err != nil {
			log("dsync: Unlock call failed with %s for %#v at %s\n", err, args, c)
			return false
		}
	}

	return true
}
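
// exampleLockWithLossNotification is an illustrative usage sketch, not part
// of the upstream file; the resource name, id and source values are
// placeholders. It shows how the CancelFunc handed to GetLock doubles as the
// lock-loss callback: if the background refresh loop loses quorum, the passed
// cancel function is invoked, canceling the caller's context mid critical
// section.
func exampleLockWithLossNotification(ds *Dsync, id, source string) {
	dm := NewDRWMutex(ds, "bucket/object")

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Try to acquire the write lock, giving up after 10 seconds.
	if !dm.GetLock(ctx, cancel, id, source, Options{Timeout: 10 * time.Second}) {
		return // could not reach quorum before the timeout
	}
	defer dm.Unlock(context.Background())

	// Long-running work should watch ctx: it is canceled if the lock can no
	// longer be refreshed on a quorum of lockers.
	select {
	case <-ctx.Done():
		// Lock was lost; abandon the critical section.
	case <-time.After(time.Second):
		// Work finished while still holding the lock.
	}
}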