github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/lease/manager.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package lease

import (
	"context"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime/pprof"
	"sync"
	"sync/atomic"
	"time"

	"github.com/juju/clock"
	"github.com/juju/errors"
	"github.com/prometheus/client_golang/prometheus"
	"gopkg.in/retry.v1"
	"gopkg.in/tomb.v2"

	"github.com/juju/juju/core/lease"
	"github.com/juju/juju/database/txn"
)

const (
	// maxRetries gives the maximum number of attempts we'll try if
	// there are timeouts.
	maxRetries = 10

	// initialRetryDelay is the starting delay - this will be
	// increased exponentially up to maxRetries.
	initialRetryDelay = 50 * time.Millisecond

	// retryBackoffFactor is how much longer we wait after a failing retry.
	// Retrying 10 times starting at 50ms and backing off 1.6x gives us a total
	// delay time of about 9s.
	retryBackoffFactor = 1.6

	// maxShutdownWait is the maximum time to wait for the async
	// claims and expires to complete before stopping the worker
	// anyway. Picked to be slightly quicker than the httpserver
	// shutdown timeout.
	maxShutdownWait = 55 * time.Second
)
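// The "about 9s" figure quoted for the retry constants above can be checked
// directly: ignoring jitter, the total delay is the sum of a geometric
// series. The helper below is an illustrative sketch only and is not used by
// the worker.
func nominalTotalRetryDelay() time.Duration {
	var total time.Duration
	delay := float64(initialRetryDelay)
	for i := 0; i < maxRetries; i++ {
		total += time.Duration(delay)
		delay *= retryBackoffFactor
	}
	// 50ms * (1 + 1.6 + 1.6^2 + ... + 1.6^9) comes to roughly 9 seconds.
	return total
}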
// errStopped is returned to clients when an operation cannot complete because
// the manager has started (and possibly finished) shutdown.
var errStopped = errors.New("lease manager stopped")

// NewManager returns a new *Manager configured as supplied. The caller takes
// responsibility for killing, and handling errors from, the returned Worker.
func NewManager(config ManagerConfig) (*Manager, error) {
	if err := config.Validate(); err != nil {
		return nil, errors.Trace(err)
	}
	logContext := config.EntityUUID
	if len(logContext) > 6 {
		logContext = logContext[:6]
	}
	manager := &Manager{
		config:     config,
		claims:     make(chan claim),
		revokes:    make(chan revoke),
		checks:     make(chan check),
		blocks:     make(chan block),
		expireDone: make(chan struct{}),
		pins:       make(chan pin),
		unpins:     make(chan pin),
		logContext: logContext,
	}
	manager.tomb.Go(manager.loop)
	return manager, nil
}

// Manager implements worker.Worker and can be bound to get
// lease.Checkers and lease.Claimers.
type Manager struct {
	tomb tomb.Tomb

	// config collects all external configuration and dependencies.
	config ManagerConfig

	// logContext is just a string that associates messages in the log.
	// It is seeded with the first six characters of config.EntityUUID,
	// if supplied.
	logContext string

	// nextTimeout is the next time that has a possible expiry that we would
	// care about, capped at the maximum time.
	nextTimeout time.Time

	// timer tracks when nextTimeout would expire and triggers when it does.
	timer clock.Timer

	// claims is used to deliver lease claim requests to the loop.
	claims chan claim

	// revokes is used to deliver lease revoke requests to the loop.
	revokes chan revoke

	// checks is used to deliver lease check requests to the loop.
	checks chan check

	// expireDone is sent an event when we successfully finish a call to expire().
	expireDone chan struct{}

	// blocks is used to deliver expiry block requests to the loop.
	blocks chan block

	// pins is used to deliver lease pin requests to the loop.
	pins chan pin

	// unpins is used to deliver lease unpin requests to the loop.
	unpins chan pin

	// wg is used to ensure that all child goroutines are finished
	// before we stop.
	wg sync.WaitGroup

	// outstandingClaims tracks how many unfinished claim goroutines
	// are running (for debugging purposes).
	outstandingClaims int64

	// outstandingRevokes tracks how many unfinished revoke goroutines
	// are running (for debugging purposes).
	outstandingRevokes int64
}

// Kill is part of the worker.Worker interface.
func (manager *Manager) Kill() {
	manager.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (manager *Manager) Wait() error {
	return manager.tomb.Wait()
}

// loop runs until the manager is stopped.
func (manager *Manager) loop() error {
	if collector, ok := manager.config.Store.(prometheus.Collector); ok && manager.config.PrometheusRegisterer != nil {
		// The store implements the prometheus.Collector interface, but the
		// lease.Store interface does not expose it.
		_ = manager.config.PrometheusRegisterer.Register(collector)
		defer manager.config.PrometheusRegisterer.Unregister(collector)
	}

	defer manager.waitForGoroutines()

	// This context is passed into all lease store operations.
	// Doing this ensures that no such operations can block worker shutdown.
	// Killing the tomb cancels the context.
	ctx := manager.tomb.Context(context.Background())

	leases, err := manager.config.Store.Leases(ctx)
	if err != nil {
		return errors.Trace(err)
	}
	manager.computeNextTimeout(leases)

	blocks := make(blocks)
	for {
		if err := manager.choose(ctx, blocks); err != nil {
			if errors.Is(err, tomb.ErrDying) {
				err = manager.tomb.Err()
			}
			manager.config.Logger.Tracef("[%s] exiting main loop with error: %v", manager.logContext, err)
			return errors.Trace(err)
		}
	}
}

func (manager *Manager) lookupLease(ctx context.Context, leaseKey lease.Key) (lease.Info, bool, error) {
	leases, err := manager.config.Store.Leases(ctx, leaseKey)
	if err != nil {
		return lease.Info{}, false, errors.Trace(err)
	}

	l, exists := leases[leaseKey]
	return l, exists, nil
}
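// NewManager, Kill and Wait above together follow the usual worker.Worker
// life cycle. As a minimal illustrative sketch (not taken from this
// repository, and assuming a ready-made ManagerConfig), a caller would run
// and stop the worker like so:
func exampleRunManager(config ManagerConfig) error {
	manager, err := NewManager(config)
	if err != nil {
		return errors.Trace(err)
	}
	// ... use manager.Claimer/Checker/etc. while it runs ...
	manager.Kill()        // ask the main loop to stop
	return manager.Wait() // block until it has, collecting the loop's error
}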
// choose breaks the select out of the loop to make the blocking logic clearer.
func (manager *Manager) choose(ctx context.Context, blocks blocks) error {
	select {
	case <-manager.tomb.Dying():
		return tomb.ErrDying

	case check := <-manager.checks:
		return manager.handleCheck(ctx, check)

	case now := <-manager.timer.Chan():
		if err := manager.tick(ctx, now, blocks); err != nil {
			return errors.Trace(err)
		}

	case <-manager.expireDone:
		if err := manager.checkBlocks(ctx, blocks); err != nil {
			return errors.Trace(err)
		}

	case claim := <-manager.claims:
		manager.startingClaim()
		go manager.retryingClaim(ctx, claim)

	case revoke := <-manager.revokes:
		manager.startingRevoke()
		go manager.retryingRevoke(ctx, revoke)

	case pin := <-manager.pins:
		manager.handlePin(ctx, pin)

	case unpin := <-manager.unpins:
		manager.handleUnpin(ctx, unpin)

	case block := <-manager.blocks:
		manager.config.Logger.Tracef("[%s] adding block for: %s", manager.logContext, block.leaseKey.Lease)
		blocks.add(block)
	}
	return nil
}

func (manager *Manager) bind(namespace, modelUUID string) (broker, error) {
	secretary, err := manager.config.Secretary(namespace)
	if err != nil {
		return nil, errors.Trace(err)
	}
	return &boundManager{
		manager:   manager,
		secretary: secretary,
		namespace: namespace,
		modelUUID: modelUUID,
	}, nil
}

// Checker returns a lease.Checker for the specified namespace and model.
func (manager *Manager) Checker(namespace, modelUUID string) (lease.Checker, error) {
	return manager.bind(namespace, modelUUID)
}

// Claimer returns a lease.Claimer for the specified namespace and model.
func (manager *Manager) Claimer(namespace, modelUUID string) (lease.Claimer, error) {
	return manager.bind(namespace, modelUUID)
}

// Revoker returns a lease.Revoker for the specified namespace and model.
func (manager *Manager) Revoker(namespace, modelUUID string) (lease.Revoker, error) {
	return manager.bind(namespace, modelUUID)
}

// Pinner returns a lease.Pinner for the specified namespace and model.
func (manager *Manager) Pinner(namespace, modelUUID string) (lease.Pinner, error) {
	return manager.bind(namespace, modelUUID)
}

// Reader returns a lease.Reader for the specified namespace and model.
func (manager *Manager) Reader(namespace, modelUUID string) (lease.Reader, error) {
	return manager.bind(namespace, modelUUID)
}
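// As an illustrative sketch of the bound API above (not taken from this
// repository; the namespace and names are placeholders, and it assumes the
// core/lease Claimer interface exposes a
// Claim(leaseName, holderName string, duration time.Duration) error method,
// which is not shown in this file), a client claims a lease roughly like this:
func exampleClaimLease(manager *Manager, modelUUID string) error {
	claimer, err := manager.Claimer("application-leadership", modelUUID)
	if err != nil {
		return errors.Trace(err)
	}
	// The claim is delivered to the main loop over the claims channel and
	// handled asynchronously by retryingClaim.
	return claimer.Claim("myapp", "myapp/0", time.Minute)
}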
// retryingClaim handles timeouts when claiming, and responds to the
// claiming party when it eventually succeeds or fails, or if it times
// out after a number of retries.
func (manager *Manager) retryingClaim(ctx context.Context, claim claim) {
	defer manager.finishedClaim()
	var (
		err     error
		success bool
	)

	for a := manager.startRetry(); a.Next(); {
		var act action
		act, success, err = manager.handleClaim(ctx, claim)
		if isFatalClaimRetryError(act, err, a.Count()) {
			break
		}

		if a.More() {
			switch {
			case lease.IsInvalid(err):
				manager.config.Logger.Tracef("[%s] request by %s for lease %s %v, retrying...",
					manager.logContext, claim.holderName, claim.leaseKey.Lease, err)

			default:
				manager.config.Logger.Tracef("[%s] timed out handling claim by %s for lease %s, retrying...",
					manager.logContext, claim.holderName, claim.leaseKey.Lease)
			}
		}
	}

	if err == nil {
		if !success {
			claim.respond(lease.ErrClaimDenied)
			return
		}
		claim.respond(nil)
	} else {
		switch {
		case lease.IsTimeout(err), txn.IsErrRetryable(err):
			manager.config.Logger.Warningf("[%s] retrying timed out while handling claim %q for %q",
				manager.logContext, claim.leaseKey, claim.holderName)
			claim.respond(lease.ErrTimeout)

		case lease.IsInvalid(err):
			// We want to see this, but it doesn't indicate something a user
			// can act on.
			manager.config.Logger.Infof("[%s] got %v after %d retries, denying claim %q for %q",
				manager.logContext, err, maxRetries, claim.leaseKey, claim.holderName)
			claim.respond(lease.ErrClaimDenied)

		case lease.IsHeld(err):
			// This can happen in HA if the original check for an extant lease
			// (against the local node) returned nothing, but the leader FSM
			// has this lease being held by another entity.
			manager.config.Logger.Tracef(
				"[%s] %s asked for lease %s, held by another entity",
				manager.logContext, claim.holderName, claim.leaseKey.Lease)
			claim.respond(lease.ErrClaimDenied)

		default:
			// Stop the main loop because we got an abnormal error.
			manager.tomb.Kill(errors.Trace(err))
		}
	}
}

type action string

const (
	claimAction  action = "claim"
	extendAction action = "extend"
)

func (a action) String() string {
	switch a {
	case claimAction:
		return "claiming"
	case extendAction:
		return "extending"
	}
	return "unknown"
}
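// The loop in retryingClaim above follows the gopkg.in/retry.v1 attempt
// pattern used throughout this worker. A condensed, illustrative sketch of
// that pattern (doWork is a hypothetical callback, not part of this file)
// looks like:
func exampleRetryPattern(manager *Manager, doWork func() error) error {
	var err error
	for a := manager.startRetry(); a.Next(); {
		if err = doWork(); err == nil {
			return nil
		}
		if isFatalRetryError(err) {
			break
		}
		// a.More() reports whether another attempt will follow, which the
		// worker uses to decide whether to log "retrying..." messages.
	}
	return err
}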
// handleClaim processes the supplied claim. It will only return
// unrecoverable errors or timeouts; mere failure to claim just
// indicates a bad request, and is returned as (false, nil).
func (manager *Manager) handleClaim(ctx context.Context, claim claim) (action, bool, error) {
	logger := manager.config.Logger
	var act action

	select {
	case <-manager.tomb.Dying():
		return "unknown", false, tomb.ErrDying
	default:
		info, found, err := manager.lookupLease(ctx, claim.leaseKey)
		if err != nil {
			return "unknown", false, errors.Trace(err)
		}

		request := lease.Request{Holder: claim.holderName, Duration: claim.duration}
		store := manager.config.Store

		switch {
		case !found:
			logger.Tracef("[%s] %s asked for lease %s (%s), no lease found, claiming for %s",
				manager.logContext, claim.holderName, claim.leaseKey.Lease, claim.leaseKey.Namespace, claim.duration)
			act = claimAction
			err = store.ClaimLease(ctx, claim.leaseKey, request)

		case info.Holder == claim.holderName:
			logger.Tracef("[%s] %s extending lease %s (%s) for %s",
				manager.logContext, claim.holderName, claim.leaseKey.Lease, claim.leaseKey.Namespace, claim.duration)
			act = extendAction
			err = store.ExtendLease(ctx, claim.leaseKey, request)

		default:
			// Note (jam 2017-10-31): we don't check here if the lease has
			// expired for the current holder. Should we?
			remaining := info.Expiry.Sub(manager.config.Clock.Now())
			logger.Tracef("[%s] %s asked for lease %s, held by %s for another %s, rejecting",
				manager.logContext, claim.holderName, claim.leaseKey.Lease, info.Holder, remaining)
			return "unknown", false, nil
		}

		if lease.IsAborted(err) {
			return act, false, tomb.ErrDying
		}
		if err != nil {
			return act, false, errors.Trace(err)
		}
	}

	logger.Tracef("[%s] %s %s lease %s for %s successful",
		manager.logContext, claim.holderName, act.String(), claim.leaseKey.Lease, claim.duration)
	return act, true, nil
}
// retryingRevoke handles timeouts when revoking, and responds to the
// revoking party when it eventually succeeds or fails, or if it times
// out after a number of retries.
func (manager *Manager) retryingRevoke(ctx context.Context, revoke revoke) {
	defer manager.finishedRevoke()
	var err error
	for a := manager.startRetry(); a.Next(); {
		err = manager.handleRevoke(ctx, revoke)
		if isFatalRetryError(err) {
			break
		}

		if a.More() {
			switch {
			case lease.IsInvalid(err):
				manager.config.Logger.Tracef("[%s] request by %s for revoking lease %s %v, retrying...",
					manager.logContext, revoke.holderName, revoke.leaseKey.Lease, err)

			default:
				manager.config.Logger.Tracef("[%s] timed out handling revoke by %s for lease %s, retrying...",
					manager.logContext, revoke.holderName, revoke.leaseKey.Lease)
			}
		}
	}

	if err == nil {
		revoke.respond(nil)
		// If we send back an error, then the main loop won't listen for expireDone.
		select {
		case <-manager.tomb.Dying():
			return
		case manager.expireDone <- struct{}{}:
		}
	} else {
		switch {
		case lease.IsTimeout(err), txn.IsErrRetryable(err):
			manager.config.Logger.Warningf("[%s] retrying timed out while handling revoke %q for %q",
				manager.logContext, revoke.leaseKey, revoke.holderName)
			revoke.respond(lease.ErrTimeout)

		case lease.IsInvalid(err):
			// We want to see this, but it doesn't indicate something a user
			// can act on.
			manager.config.Logger.Infof("[%s] got %v after %d retries, revoke %q for %q",
				manager.logContext, err, maxRetries, revoke.leaseKey, revoke.holderName)
			revoke.respond(err)

		case lease.IsNotHeld(err):
			// We want to see this, but it doesn't indicate something a user
			// can act on.
			manager.config.Logger.Infof("[%s] got %v after %d retries, revoke %q for %q",
				manager.logContext, err, maxRetries, revoke.leaseKey, revoke.holderName)
			revoke.respond(err)

		default:
			// Stop the main loop because we got an abnormal error.
			manager.tomb.Kill(errors.Trace(err))
		}
	}
}

// handleRevoke processes the supplied revocation. It will only return
// unrecoverable errors or timeouts.
func (manager *Manager) handleRevoke(ctx context.Context, revoke revoke) error {
	logger := manager.config.Logger

	select {
	case <-manager.tomb.Dying():
		return tomb.ErrDying
	default:
		info, found, err := manager.lookupLease(ctx, revoke.leaseKey)
		if err != nil {
			return errors.Trace(err)
		}

		switch {
		case !found:
			logger.Tracef("[%s] %s asked to revoke lease %s, no lease found",
				manager.logContext, revoke.holderName, revoke.leaseKey.Lease)
			return nil

		case info.Holder == revoke.holderName:
			logger.Tracef("[%s] %s revoking lease %s", manager.logContext, revoke.holderName, revoke.leaseKey.Lease)
			err = manager.config.Store.RevokeLease(ctx, revoke.leaseKey, revoke.holderName)

		default:
			logger.Tracef("[%s] %s revoking lease %s, held by %s, rejecting",
				manager.logContext, revoke.holderName, revoke.leaseKey.Lease, info.Holder)
			return lease.ErrNotHeld
		}

		if lease.IsAborted(err) {
			return tomb.ErrDying
		}
		if err != nil {
			return errors.Trace(err)
		}
	}

	logger.Tracef("[%s] %s revoked lease %s successful", manager.logContext, revoke.holderName, revoke.leaseKey.Lease)
	return nil
}
// handleCheck processes and responds to the supplied check. It will only return
// unrecoverable errors; mere untruth of the assertion just indicates a bad
// request, and is communicated back to the check's originator.
func (manager *Manager) handleCheck(ctx context.Context, check check) error {
	key := check.leaseKey

	manager.config.Logger.Tracef("[%s] handling Check for lease %s on behalf of %s",
		manager.logContext, key.Lease, check.holderName)

	info, found, err := manager.lookupLease(ctx, key)
	if err != nil {
		return errors.Trace(err)
	}

	var response error
	if !found || info.Holder != check.holderName {
		if found {
			manager.config.Logger.Tracef("[%s] handling Check for lease %s on behalf of %s, found held by %s",
				manager.logContext, key.Lease, check.holderName, info.Holder)
		} else {
			// Someone thought they were the lease-holder, otherwise they
			// wouldn't be confirming via the check. However, the lease has
			// expired, and they are out of sync. Schedule a block check.
			manager.setNextTimeout(manager.config.Clock.Now().Add(time.Second))

			manager.config.Logger.Tracef("[%s] handling Check for lease %s on behalf of %s, not found",
				manager.logContext, key.Lease, check.holderName)
		}

		response = lease.ErrNotHeld
	}
	check.respond(errors.Trace(response))
	return nil
}

// tick triggers when we think a lease might be expiring, so we check if there
// are leases to expire, unblock anything that is no longer blocked, and then
// compute the next time we should wake up.
func (manager *Manager) tick(ctx context.Context, now time.Time, blocks blocks) error {
	manager.config.Logger.Tracef("[%s] tick at %v, running expiry checks\n", manager.logContext, now)
	// Check for blocks that need to be notified.
	return errors.Trace(manager.checkBlocks(ctx, blocks))
}

func (manager *Manager) checkBlocks(ctx context.Context, blocks blocks) error {
	manager.config.Logger.Tracef("[%s] evaluating %d blocks", manager.logContext, len(blocks))

	leases, err := manager.config.Store.Leases(ctx)
	if err != nil {
		return errors.Trace(err)
	}

	for leaseName := range blocks {
		if _, found := leases[leaseName]; !found {
			manager.config.Logger.Tracef("[%s] unblocking: %s", manager.logContext, leaseName)
			blocks.unblock(leaseName)
		}
	}
	manager.computeNextTimeout(leases)
	return nil
}

// computeNextTimeout iterates the leases and works out the next time we want
// to wake up to expire leases and handle any unblocks that follow. That is
// the earliest lease expiry due in the future, capped at MaxSleep from now.
func (manager *Manager) computeNextTimeout(leases map[lease.Key]lease.Info) {
	now := manager.config.Clock.Now()
	nextTick := now.Add(manager.config.MaxSleep)
	for _, info := range leases {
		if info.Expiry.After(nextTick) {
			continue
		}
		nextTick = info.Expiry
	}

	// If we had leases set to expire in the past, then we assume that our FSM
	// is behind the leader and will soon indicate their expiration.
	// Check the blocks again soon.
	if !nextTick.After(now) {
		nextTick = now
	}

	// The lease clock ticks *at least* a second from now. Expirations only
	// occur when the global clock updater ticks the clock, so this avoids
	// checking too frequently with the potential of having no work to do.
	// The blanket addition of a second is no big deal.
	nextTick = nextTick.Add(time.Second)

	nextDuration := nextTick.Sub(now).Round(time.Millisecond)
	manager.config.Logger.Tracef("[%s] next expire in %v %v", manager.logContext, nextDuration, nextTick)
	manager.setNextTimeout(nextTick)
}

func (manager *Manager) setNextTimeout(t time.Time) {
	now := manager.config.Clock.Now()

	// Ensure we never walk the next check back without having performed a
	// scheduled check *unless* we think our last check was in the past.
	if !manager.nextTimeout.Before(now) && !t.Before(manager.nextTimeout) {
		manager.config.Logger.Tracef("[%s] not rescheduling check from %v to %v based on current time %v",
			manager.logContext, manager.nextTimeout, t, now)
		return
	}
	manager.nextTimeout = t

	d := t.Sub(now)
	if manager.timer == nil {
		manager.timer = manager.config.Clock.NewTimer(d)
	} else {
		// See the docs on Timer.Reset(): it isn't safe to call Reset on a
		// timer that hasn't been stopped, and if Stop reports that the timer
		// already fired, the channel may need to be drained. It isn't safe
		// to drain unconditionally in case another goroutine has already
		// received from it, but make an attempt.
		if !manager.timer.Stop() {
			select {
			case <-manager.timer.Chan():
			default:
			}
		}
		manager.timer.Reset(d)
	}
}
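// The stop/drain/reset sequence in setNextTimeout above is the standard Go
// idiom for reusing a timer. As a minimal standalone sketch of the same
// pattern with a standard library *time.Timer (the worker uses clock.Timer
// so tests can drive it with a fake clock; this helper is illustrative only):
func resetTimerSafely(t *time.Timer, d time.Duration) {
	if !t.Stop() {
		// The timer already fired; drain the channel only if the value is
		// still buffered, since another goroutine may have received it.
		select {
		case <-t.C:
		default:
		}
	}
	t.Reset(d)
}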
func (manager *Manager) startRetry() *retry.Attempt {
	return retry.StartWithCancel(
		retry.LimitCount(maxRetries, retry.Exponential{
			Initial: initialRetryDelay,
			Factor:  retryBackoffFactor,
			Jitter:  true,
		}),
		manager.config.Clock,
		manager.tomb.Dying(),
	)
}

func isFatalRetryError(err error) bool {
	switch {
	case txn.IsErrRetryable(err):
		return false
	case lease.IsTimeout(err):
		return false
	case lease.IsInvalid(err):
		return false
	}
	return true
}

func isFatalClaimRetryError(act action, err error, count int) bool {
	switch {
	case txn.IsErrRetryable(err):
		return false
	case lease.IsTimeout(err):
		return false
	case lease.IsInvalid(err):
		return false
	}
	return true
}

func (manager *Manager) handlePin(ctx context.Context, p pin) {
	p.respond(errors.Trace(manager.config.Store.PinLease(ctx, p.leaseKey, p.entity)))
}

func (manager *Manager) handleUnpin(ctx context.Context, p pin) {
	p.respond(errors.Trace(manager.config.Store.UnpinLease(ctx, p.leaseKey, p.entity)))
}

// pinned returns, for the input namespace/model, the names of pinned leases
// and the entities requiring their pinned behaviour.
func (manager *Manager) pinned(ctx context.Context, namespace, modelUUID string) (map[string][]string, error) {
	pinned, err := manager.config.Store.Pinned(ctx)
	if err != nil {
		return nil, errors.Trace(err)
	}

	result := make(map[string][]string)
	for key, entities := range pinned {
		if key.Namespace == namespace && key.ModelUUID == modelUUID {
			result[key.Lease] = entities
		}
	}
	return result, nil
}

func (manager *Manager) leases(ctx context.Context, namespace, modelUUID string) (map[string]string, error) {
	group, err := manager.config.Store.LeaseGroup(ctx, namespace, modelUUID)
	if err != nil {
		return nil, errors.Trace(err)
	}

	leases := make(map[string]string)
	for key, info := range group {
		leases[key.Lease] = info.Holder
	}
	return leases, nil
}

func (manager *Manager) startingClaim() {
	atomic.AddInt64(&manager.outstandingClaims, 1)
	manager.wg.Add(1)
}

func (manager *Manager) finishedClaim() {
	manager.wg.Done()
	atomic.AddInt64(&manager.outstandingClaims, -1)
}

func (manager *Manager) startingRevoke() {
	atomic.AddInt64(&manager.outstandingRevokes, 1)
	manager.wg.Add(1)
}

func (manager *Manager) finishedRevoke() {
	manager.wg.Done()
	atomic.AddInt64(&manager.outstandingRevokes, -1)
}

// Report is part of dependency.Reporter.
func (manager *Manager) Report() map[string]interface{} {
	out := make(map[string]interface{})
	out["entity-uuid"] = manager.config.EntityUUID
	out["outstanding-claims"] = atomic.LoadInt64(&manager.outstandingClaims)
	out["outstanding-revokes"] = atomic.LoadInt64(&manager.outstandingRevokes)
	return out
}

func (manager *Manager) waitForGoroutines() {
	// Wait for the waitgroup to finish, but only up to a point.
	groupDone := make(chan struct{})
	go func() {
		manager.wg.Wait()
		close(groupDone)
	}()

	select {
	case <-groupDone:
		return
	case <-manager.config.Clock.After(maxShutdownWait):
	}
	msg := "timeout waiting for lease manager shutdown"
	dumpFile, err := manager.dumpDebug()
	logger := manager.config.Logger
	if err == nil {
		logger.Warningf("%v\ndebug info written to %v", msg, dumpFile)
	} else {
		logger.Warningf("%v\nerror writing debug info: %v", msg, err)
	}
}

func (manager *Manager) dumpDebug() (string, error) {
	dumpFile, err := os.OpenFile(filepath.Join(manager.config.LogDir, "lease-manager-debug.log"), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		return "", errors.Trace(err)
	}
	defer func() { _ = dumpFile.Close() }()

	claims := atomic.LoadInt64(&manager.outstandingClaims)
	revokes := atomic.LoadInt64(&manager.outstandingRevokes)
	template := `
lease manager state dump %v
entity-uuid: %v
outstanding-claims: %v
outstanding-revokes: %v

`[1:]
	message := fmt.Sprintf(template,
		time.Now().Format(time.RFC3339),
		manager.config.EntityUUID,
		claims,
		revokes,
	)
	if _, err = io.WriteString(dumpFile, message); err != nil {
		return "", errors.Annotate(err, "writing state to debug log file")
	}
	// Include the goroutines because the httpserver won't dump them any more
	// if this worker stops cleanly.
	return dumpFile.Name(), pprof.Lookup("goroutine").WriteTo(dumpFile, 1)
}