github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/state/leadership/manager.go (about)

     1  // Copyright 2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package leadership
     5  
     6  import (
     7  	"sort"
     8  	"time"
     9  
    10  	"github.com/juju/errors"
    11  	"github.com/juju/loggo"
    12  	"github.com/juju/utils/clock"
    13  	"launchpad.net/tomb"
    14  
    15  	"github.com/juju/juju/leadership"
    16  	"github.com/juju/juju/state/lease"
    17  )
    18  
// logger is the package-level logger for juju's state/leadership code.
var logger = loggo.GetLogger("juju.state.leadership")
    20  
    21  // NewManager returns a Manager implementation, backed by a lease.Client,
    22  // which (in addition to its exposed Manager capabilities) will expire all
    23  // known leases as they run out. The caller takes responsibility for killing,
    24  // and handling errors from, the returned Worker.
    25  func NewManager(config ManagerConfig) (ManagerWorker, error) {
    26  	if err := config.Validate(); err != nil {
    27  		return nil, errors.Trace(err)
    28  	}
    29  	manager := &manager{
    30  		config: config,
    31  		claims: make(chan claim),
    32  		checks: make(chan check),
    33  		blocks: make(chan block),
    34  	}
    35  	go func() {
    36  		defer manager.tomb.Done()
    37  		// note: we don't directly tomb.Kill, because we may need to
    38  		// unwrap tomb.ErrDying in order to function correctly.
    39  		manager.kill(manager.loop())
    40  	}()
    41  	return manager, nil
    42  }
    43  
    44  // manager implements ManagerWorker.
// manager implements ManagerWorker.
type manager struct {
	// tomb tracks the lifetime of the goroutine started by NewManager,
	// and carries the error (if any) it stopped with.
	tomb tomb.Tomb

	// config collects all external configuration and dependencies.
	config ManagerConfig

	// claims is used to deliver leadership claim requests to the loop.
	claims chan claim

	// checks is used to deliver leadership check requests to the loop.
	checks chan check

	// blocks is used to deliver leaderlessness block requests to the loop.
	blocks chan block
}
    60  
    61  // Kill is part of the worker.Worker interface.
// Kill is part of the worker.Worker interface. It stops the manager's
// main loop without recording an error.
func (manager *manager) Kill() {
	manager.kill(nil)
}
    65  
    66  // kill unwraps tomb.ErrDying before killing the tomb, thus allowing the worker
    67  // to use errors.Trace liberally and still stop cleanly.
    68  func (manager *manager) kill(err error) {
    69  	if errors.Cause(err) == tomb.ErrDying {
    70  		err = tomb.ErrDying
    71  	} else if err != nil {
    72  		logger.Errorf("stopping leadership manager with error: %v", err)
    73  	}
    74  	manager.tomb.Kill(err)
    75  }
    76  
    77  // Wait is part of the worker.Worker interface.
// Wait is part of the worker.Worker interface. It blocks until the main
// loop has exited, and returns the error the manager stopped with.
func (manager *manager) Wait() error {
	return manager.tomb.Wait()
}
    81  
    82  // loop runs until the manager is stopped.
    83  func (manager *manager) loop() error {
    84  	blocks := make(blocks)
    85  	for {
    86  		if err := manager.choose(blocks); err != nil {
    87  			return errors.Trace(err)
    88  		}
    89  
    90  		leases := manager.config.Client.Leases()
    91  		for serviceName := range blocks {
    92  			if _, found := leases[serviceName]; !found {
    93  				blocks.unblock(serviceName)
    94  			}
    95  		}
    96  	}
    97  }
    98  
    99  // choose breaks the select out of loop to make the blocking logic clearer.
   100  func (manager *manager) choose(blocks blocks) error {
   101  	select {
   102  	case <-manager.tomb.Dying():
   103  		return tomb.ErrDying
   104  	case <-manager.nextExpiry():
   105  		return manager.expire()
   106  	case claim := <-manager.claims:
   107  		return manager.handleClaim(claim)
   108  	case check := <-manager.checks:
   109  		return manager.handleCheck(check)
   110  	case block := <-manager.blocks:
   111  		blocks.add(block)
   112  		return nil
   113  	}
   114  }
   115  
   116  // ClaimLeadership is part of the leadership.Claimer interface.
   117  func (manager *manager) ClaimLeadership(serviceName, unitName string, duration time.Duration) error {
   118  	return claim{
   119  		serviceName: serviceName,
   120  		unitName:    unitName,
   121  		duration:    duration,
   122  		response:    make(chan bool),
   123  		abort:       manager.tomb.Dying(),
   124  	}.invoke(manager.claims)
   125  }
   126  
   127  // handleClaim processes and responds to the supplied claim. It will only return
   128  // unrecoverable errors; mere failure to claim just indicates a bad request, and
   129  // is communicated back to the claim's originator.
   130  func (manager *manager) handleClaim(claim claim) error {
   131  	client := manager.config.Client
   132  	request := lease.Request{claim.unitName, claim.duration}
   133  	err := lease.ErrInvalid
   134  	for err == lease.ErrInvalid {
   135  		select {
   136  		case <-manager.tomb.Dying():
   137  			return tomb.ErrDying
   138  		default:
   139  			info, found := client.Leases()[claim.serviceName]
   140  			switch {
   141  			case !found:
   142  				err = client.ClaimLease(claim.serviceName, request)
   143  			case info.Holder == claim.unitName:
   144  				err = client.ExtendLease(claim.serviceName, request)
   145  			default:
   146  				claim.respond(false)
   147  				return nil
   148  			}
   149  		}
   150  	}
   151  	if err != nil {
   152  		return errors.Trace(err)
   153  	}
   154  	claim.respond(true)
   155  	return nil
   156  }
   157  
   158  // LeadershipCheck is part of the leadership.Checker interface.
   159  //
   160  // The token returned will accept a `*[]txn.Op` passed to Check, and will
   161  // populate it with transaction operations that will fail if the unit is
   162  // not leader of the service.
   163  func (manager *manager) LeadershipCheck(serviceName, unitName string) leadership.Token {
   164  	return token{
   165  		serviceName: serviceName,
   166  		unitName:    unitName,
   167  		checks:      manager.checks,
   168  		abort:       manager.tomb.Dying(),
   169  	}
   170  }
   171  
   172  // handleCheck processes and responds to the supplied check. It will only return
   173  // unrecoverable errors; mere untruth of the assertion just indicates a bad
   174  // request, and is communicated back to the check's originator.
   175  func (manager *manager) handleCheck(check check) error {
   176  	client := manager.config.Client
   177  	info, found := client.Leases()[check.serviceName]
   178  	if !found || info.Holder != check.unitName {
   179  		if err := client.Refresh(); err != nil {
   180  			return errors.Trace(err)
   181  		}
   182  		info, found = client.Leases()[check.serviceName]
   183  	}
   184  	if found && info.Holder == check.unitName {
   185  		check.succeed(info.AssertOp)
   186  	} else {
   187  		check.fail()
   188  	}
   189  	return nil
   190  }
   191  
   192  // BlockUntilLeadershipReleased is part of the leadership.Claimer interface.
   193  func (manager *manager) BlockUntilLeadershipReleased(serviceName string) error {
   194  	return block{
   195  		serviceName: serviceName,
   196  		unblock:     make(chan struct{}),
   197  		abort:       manager.tomb.Dying(),
   198  	}.invoke(manager.blocks)
   199  }
   200  
   201  // nextExpiry returns a channel that will send a value at some point when we
   202  // expect at least one lease to be ready to expire. If no leases are known,
   203  // it will return nil.
   204  func (manager *manager) nextExpiry() <-chan time.Time {
   205  	var nextExpiry *time.Time
   206  	for _, info := range manager.config.Client.Leases() {
   207  		if nextExpiry != nil {
   208  			if info.Expiry.After(*nextExpiry) {
   209  				continue
   210  			}
   211  		}
   212  		nextExpiry = &info.Expiry
   213  	}
   214  	if nextExpiry == nil {
   215  		logger.Tracef("no leases recorded; never waking for expiry")
   216  		return nil
   217  	}
   218  	logger.Tracef("waking to expire leases at %s", *nextExpiry)
   219  	return clock.Alarm(manager.config.Clock, *nextExpiry)
   220  }
   221  
   222  // expire will attempt to expire all leases that may have expired. There might
   223  // be none; they might have been extended or expired already by someone else; so
   224  // ErrInvalid is expected, and ignored, in the comfortable knowledge that the
   225  // client will have been updated and we'll see fresh info when we scan for new
   226  // expiries next time through the loop. It will return only unrecoverable errors.
   227  func (manager *manager) expire() error {
   228  	logger.Tracef("expiring leases...")
   229  	client := manager.config.Client
   230  	leases := client.Leases()
   231  
   232  	// Sort lease names so we expire in a predictable order for the tests.
   233  	names := make([]string, 0, len(leases))
   234  	for name := range leases {
   235  		names = append(names, name)
   236  	}
   237  	sort.Strings(names)
   238  	for _, name := range names {
   239  		now := manager.config.Clock.Now()
   240  		if leases[name].Expiry.After(now) {
   241  			continue
   242  		}
   243  		switch err := client.ExpireLease(name); err {
   244  		case nil, lease.ErrInvalid:
   245  		default:
   246  			return errors.Trace(err)
   247  		}
   248  	}
   249  	return nil
   250  }