github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/worker/uniter/modes.go (about)

     1  // Copyright 2012-2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package uniter
     5  
     6  import (
     7  	"fmt"
     8  	"time"
     9  
    10  	"github.com/juju/errors"
    11  	"gopkg.in/juju/charm.v5"
    12  	"gopkg.in/juju/charm.v5/hooks"
    13  	"launchpad.net/tomb"
    14  
    15  	"github.com/juju/juju/apiserver/params"
    16  	"github.com/juju/juju/state/watcher"
    17  	"github.com/juju/juju/worker"
    18  	"github.com/juju/juju/worker/uniter/hook"
    19  	"github.com/juju/juju/worker/uniter/operation"
    20  )
    21  
    22  // setAgentStatus sets the unit's status if it has changed since last time this method was called.
    23  func setAgentStatus(u *Uniter, status params.Status, info string, data map[string]interface{}) error {
    24  	u.setStatusMutex.Lock()
    25  	defer u.setStatusMutex.Unlock()
    26  	if u.lastReportedStatus == status && u.lastReportedMessage == info {
    27  		return nil
    28  	}
    29  	u.lastReportedStatus = status
    30  	u.lastReportedMessage = info
    31  	logger.Debugf("[AGENT-STATUS] %s: %s", status, info)
    32  	return u.unit.SetAgentStatus(status, info, data)
    33  }
    34  
    35  // reportAgentError reports if there was an error performing an agent operation.
    36  func reportAgentError(u *Uniter, userMessage string, err error) {
    37  	// If a non-nil error is reported (e.g. due to an operation failing),
    38  	// set the agent status to Failed.
    39  	if err == nil {
    40  		return
    41  	}
    42  	err2 := setAgentStatus(u, params.StatusFailed, userMessage, nil)
    43  	if err2 != nil {
    44  		logger.Errorf("updating agent status: %v", err2)
    45  	}
    46  }
    47  
// Mode defines the signature of the functions that implement the possible
// states of a running Uniter. A Mode is given the Uniter to act upon and
// returns the Mode that should follow it, or an error.
type Mode func(u *Uniter) (Mode, error)
    51  
    52  // ModeContinue determines what action to take based on persistent uniter state.
    53  func ModeContinue(u *Uniter) (next Mode, err error) {
    54  	defer modeContext("ModeContinue", &err)()
    55  	opState := u.operationState()
    56  
    57  	// Resume interrupted deployment operations.
    58  	if opState.Kind == operation.Install {
    59  		logger.Infof("resuming charm install")
    60  		return ModeInstalling(opState.CharmURL)
    61  	} else if opState.Kind == operation.Upgrade {
    62  		logger.Infof("resuming charm upgrade")
    63  		return ModeUpgrading(opState.CharmURL), nil
    64  	}
    65  
    66  	// If we got this far, we should have an installed charm,
    67  	// so initialize the metrics timers according to what's
    68  	// currently deployed.
    69  	if err := u.initializeMetricsTimers(); err != nil {
    70  		return nil, errors.Trace(err)
    71  	}
    72  
    73  	// Check for any leadership change, and enact it if possible.
    74  	logger.Infof("checking leadership status")
    75  	// If we've already accepted leadership, we don't need to do it again.
    76  	canAcceptLeader := !opState.Leader
    77  	select {
    78  	// If the unit's shutting down, we shouldn't accept it.
    79  	case <-u.f.UnitDying():
    80  		canAcceptLeader = false
    81  	default:
    82  		// If we're in an unexpected mode (eg pending hook) we shouldn't try either.
    83  		if opState.Kind != operation.Continue {
    84  			canAcceptLeader = false
    85  		}
    86  	}
    87  
    88  	// NOTE: the Wait() looks scary, but a ClaimLeadership ticket should always
    89  	// complete quickly; worst-case is API latency time, but it's designed that
    90  	// it should be vanishingly rare to hit that code path.
    91  	isLeader := u.leadershipTracker.ClaimLeader().Wait()
    92  	var creator creator
    93  	switch {
    94  	case isLeader && canAcceptLeader:
    95  		creator = newAcceptLeadershipOp()
    96  	case opState.Leader && !isLeader:
    97  		creator = newResignLeadershipOp()
    98  	}
    99  	if creator != nil {
   100  		return continueAfter(u, creator)
   101  	}
   102  	logger.Infof("leadership status is up-to-date")
   103  
   104  	switch opState.Kind {
   105  	case operation.RunAction:
   106  		// TODO(fwereade): we *should* handle interrupted actions, and make sure
   107  		// they're marked as failed, but that's not for now.
   108  		if opState.Hook != nil {
   109  			logger.Infof("found incomplete action %q; ignoring", opState.ActionId)
   110  			logger.Infof("recommitting prior %q hook", opState.Hook.Kind)
   111  			creator = newSkipHookOp(*opState.Hook)
   112  		} else {
   113  			logger.Infof("%q hook is nil", operation.RunAction)
   114  		}
   115  	case operation.RunHook:
   116  		switch opState.Step {
   117  		case operation.Pending:
   118  			logger.Infof("awaiting error resolution for %q hook", opState.Hook.Kind)
   119  			return ModeHookError, nil
   120  		case operation.Queued:
   121  			logger.Infof("found queued %q hook", opState.Hook.Kind)
   122  			// Ensure storage-attached hooks are run before install
   123  			// or upgrade hooks.
   124  			switch opState.Hook.Kind {
   125  			case hooks.UpgradeCharm:
   126  				// Force a refresh of all storage attachments,
   127  				// so we find out about new ones introduced
   128  				// by the charm upgrade.
   129  				if err := u.storage.Refresh(); err != nil {
   130  					return nil, errors.Trace(err)
   131  				}
   132  				fallthrough
   133  			case hooks.Install:
   134  				if err := waitStorage(u); err != nil {
   135  					return nil, errors.Trace(err)
   136  				}
   137  			}
   138  			creator = newRunHookOp(*opState.Hook)
   139  		case operation.Done:
   140  			logger.Infof("committing %q hook", opState.Hook.Kind)
   141  			creator = newSkipHookOp(*opState.Hook)
   142  		}
   143  	case operation.Continue:
   144  		if opState.Stopped {
   145  			logger.Infof("opState.Stopped == true; transition to ModeTerminating")
   146  			return ModeTerminating, nil
   147  		}
   148  		logger.Infof("no operations in progress; waiting for changes")
   149  		return ModeAbide, nil
   150  	default:
   151  		return nil, errors.Errorf("unknown operation kind %v", opState.Kind)
   152  	}
   153  	return continueAfter(u, creator)
   154  }
   155  
   156  // ModeInstalling is responsible for the initial charm deployment. If an install
   157  // operation were to set an appropriate status, it shouldn't be necessary; but see
   158  // ModeUpgrading for discussion relevant to both.
   159  func ModeInstalling(curl *charm.URL) (next Mode, err error) {
   160  	name := fmt.Sprintf("ModeInstalling %s", curl)
   161  	return func(u *Uniter) (next Mode, err error) {
   162  		defer modeContext(name, &err)()
   163  		return continueAfter(u, newInstallOp(curl))
   164  	}, nil
   165  }
   166  
   167  // ModeUpgrading is responsible for upgrading the charm. It shouldn't really
   168  // need to be a mode at all -- it's just running a single operation -- but
   169  // it's not safe to call it inside arbitrary other modes, because failing to
   170  // pass through ModeContinue on the way out could cause a queued hook to be
   171  // accidentally skipped.
   172  func ModeUpgrading(curl *charm.URL) Mode {
   173  	name := fmt.Sprintf("ModeUpgrading %s", curl)
   174  	return func(u *Uniter) (next Mode, err error) {
   175  		defer modeContext(name, &err)()
   176  		return continueAfter(u, newUpgradeOp(curl))
   177  	}
   178  }
   179  
   180  // ModeTerminating marks the unit dead and returns ErrTerminateAgent.
   181  func ModeTerminating(u *Uniter) (next Mode, err error) {
   182  	defer modeContext("ModeTerminating", &err)()
   183  	w, err := u.unit.Watch()
   184  	if err != nil {
   185  		return nil, errors.Trace(err)
   186  	}
   187  	defer watcher.Stop(w, &u.tomb)
   188  
   189  	// Upon unit termination we attempt to send any leftover metrics one last time. If we fail, there is nothing
   190  	// else we can do but log the error.
   191  	sendErr := u.runOperation(newSendMetricsOp())
   192  	if sendErr != nil {
   193  		logger.Warningf("failed to send metrics: %v", sendErr)
   194  	}
   195  
   196  	for {
   197  		select {
   198  		case <-u.tomb.Dying():
   199  			return nil, tomb.ErrDying
   200  		case actionId := <-u.f.ActionEvents():
   201  			creator := newActionOp(actionId)
   202  			if err := u.runOperation(creator); err != nil {
   203  				return nil, errors.Trace(err)
   204  			}
   205  		case _, ok := <-w.Changes():
   206  			if !ok {
   207  				return nil, watcher.EnsureErr(w)
   208  			}
   209  			if err := u.unit.Refresh(); err != nil {
   210  				return nil, errors.Trace(err)
   211  			}
   212  			if hasSubs, err := u.unit.HasSubordinates(); err != nil {
   213  				return nil, errors.Trace(err)
   214  			} else if hasSubs {
   215  				continue
   216  			}
   217  			// The unit is known to be Dying; so if it didn't have subordinates
   218  			// just above, it can't acquire new ones before this call.
   219  			if err := u.unit.EnsureDead(); err != nil {
   220  				return nil, errors.Trace(err)
   221  			}
   222  			return nil, worker.ErrTerminateAgent
   223  		}
   224  	}
   225  }
   226  
   227  // ModeAbide is the Uniter's usual steady state. It watches for and responds to:
   228  // * service configuration changes
   229  // * charm upgrade requests
   230  // * relation changes
   231  // * unit death
   232  // * acquisition or loss of service leadership
   233  func ModeAbide(u *Uniter) (next Mode, err error) {
   234  	defer modeContext("ModeAbide", &err)()
   235  	opState := u.operationState()
   236  	if opState.Kind != operation.Continue {
   237  		return nil, errors.Errorf("insane uniter state: %#v", opState)
   238  	}
   239  	if err := u.deployer.Fix(); err != nil {
   240  		return nil, errors.Trace(err)
   241  	}
   242  
   243  	if !opState.Leader && !u.ranLeaderSettingsChanged {
   244  		creator := newSimpleRunHookOp(hook.LeaderSettingsChanged)
   245  		if err := u.runOperation(creator); err != nil {
   246  			return nil, errors.Trace(err)
   247  		}
   248  	}
   249  
   250  	if !u.ranConfigChanged {
   251  		return continueAfter(u, newSimpleRunHookOp(hooks.ConfigChanged))
   252  	}
   253  	if !opState.Started {
   254  		return continueAfter(u, newSimpleRunHookOp(hooks.Start))
   255  	}
   256  	u.f.WantUpgradeEvent(false)
   257  	u.relations.StartHooks()
   258  	defer func() {
   259  		if e := u.relations.StopHooks(); e != nil {
   260  			if err == nil {
   261  				err = e
   262  			} else {
   263  				logger.Errorf("error while stopping hooks: %v", e)
   264  			}
   265  		}
   266  	}()
   267  
   268  	select {
   269  	case <-u.f.UnitDying():
   270  		return modeAbideDyingLoop(u)
   271  	default:
   272  	}
   273  	return modeAbideAliveLoop(u)
   274  }
   275  
// idleWaitTime is how long modeAbideAliveLoop waits without receiving any
// uniter event before it reports the agent status as idle.
var idleWaitTime = 2 * time.Second
   279  
// modeAbideAliveLoop handles all state changes for ModeAbide when the unit
// is in an Alive state.
//
// Each iteration waits for a single event and, for most events, runs the
// corresponding operation before looping again. It returns when:
//   - the uniter's tomb is dying (tomb.ErrDying);
//   - the unit starts dying (control passes to modeAbideDyingLoop);
//   - a charm upgrade is requested (ModeUpgrading is returned);
//   - leadership is gained (via continueAfter, i.e. back to ModeContinue);
//   - running an operation, or setting the idle status, fails.
func modeAbideAliveLoop(u *Uniter) (Mode, error) {
	// At most one of these is non-nil at a time; receiving from the nil one
	// blocks forever, which effectively disables its select case below.
	var leaderElected, leaderDeposed <-chan struct{}
	for {
		// We expect one or none of these vars to be non-nil; and if none
		// are, we set the one that should trigger when our leadership state
		// differs from what we have recorded locally.
		if leaderElected == nil && leaderDeposed == nil {
			if u.operationState().Leader {
				logger.Infof("waiting to lose leadership")
				leaderDeposed = u.leadershipTracker.WaitMinion().Ready()
			} else {
				logger.Infof("waiting to gain leadership")
				leaderElected = u.leadershipTracker.WaitLeader().Ready()
			}
		}

		// The timed signals are recomputed on every iteration from the
		// times recorded in the operation state.

		// collect-metrics hook
		lastCollectMetrics := time.Unix(u.operationState().CollectMetricsTime, 0)
		collectMetricsSignal := u.collectMetricsAt(
			time.Now(), lastCollectMetrics, metricsPollInterval,
		)

		// sending of collected metrics
		lastSentMetrics := time.Unix(u.operationState().SendMetricsTime, 0)
		sendMetricsSignal := u.sendMetricsAt(
			time.Now(), lastSentMetrics, metricsSendInterval,
		)

		// update-status hook
		lastUpdateStatus := time.Unix(u.operationState().UpdateStatusTime, 0)
		updateStatusSignal := u.updateStatusAt(
			time.Now(), lastUpdateStatus, statusPollInterval,
		)

		var creator creator
		select {
		case <-time.After(idleWaitTime):
			// Nothing happened for idleWaitTime: report the agent as idle
			// and go back to waiting, without running any operation.
			if err := setAgentStatus(u, params.StatusIdle, "", nil); err != nil {
				return nil, errors.Trace(err)
			}
			continue
		case <-u.tomb.Dying():
			return nil, tomb.ErrDying
		case <-u.f.UnitDying():
			return modeAbideDyingLoop(u)
		case curl := <-u.f.UpgradeEvents():
			return ModeUpgrading(curl), nil
		case ids := <-u.f.RelationsEvents():
			creator = newUpdateRelationsOp(ids)
		case actionId := <-u.f.ActionEvents():
			creator = newActionOp(actionId)
		case tags := <-u.f.StorageEvents():
			creator = newUpdateStorageOp(tags)
		case <-u.f.ConfigEvents():
			creator = newSimpleRunHookOp(hooks.ConfigChanged)
		case <-u.f.MeterStatusEvents():
			creator = newSimpleRunHookOp(hooks.MeterStatusChanged)
		case <-collectMetricsSignal:
			creator = newSimpleRunHookOp(hooks.CollectMetrics)
		case <-sendMetricsSignal:
			creator = newSendMetricsOp()
		case <-updateStatusSignal:
			creator = newSimpleRunHookOp(hooks.UpdateStatus)
		case hookInfo := <-u.relations.Hooks():
			creator = newRunHookOp(hookInfo)
		case hookInfo := <-u.storage.Hooks():
			creator = newRunHookOp(hookInfo)
		case <-leaderElected:
			// This operation queues a hook, better to let ModeContinue pick up
			// after it than to duplicate queued-hook handling here.
			return continueAfter(u, newAcceptLeadershipOp())
		case <-leaderDeposed:
			// Clear the channel so that the next iteration picks the
			// appropriate leadership signal to wait on afresh.
			leaderDeposed = nil
			creator = newResignLeadershipOp()
		case <-u.f.LeaderSettingsEvents():
			creator = newSimpleRunHookOp(hook.LeaderSettingsChanged)
		}
		// Every case that reaches this point has set creator.
		if err := u.runOperation(creator); err != nil {
			return nil, errors.Trace(err)
		}
	}
}
   363  
   364  // modeAbideDyingLoop handles the proper termination of all relations in
   365  // response to a Dying unit.
   366  func modeAbideDyingLoop(u *Uniter) (next Mode, err error) {
   367  	if err := u.unit.Refresh(); err != nil {
   368  		return nil, errors.Trace(err)
   369  	}
   370  	if err = u.unit.DestroyAllSubordinates(); err != nil {
   371  		return nil, errors.Trace(err)
   372  	}
   373  	if err := u.relations.SetDying(); err != nil {
   374  		return nil, errors.Trace(err)
   375  	}
   376  	if u.operationState().Leader {
   377  		if err := u.runOperation(newResignLeadershipOp()); err != nil {
   378  			return nil, errors.Trace(err)
   379  		}
   380  		// TODO(fwereade): we ought to inform the tracker that we're shutting down
   381  		// (and no longer wish to continue renewing our lease) so that the tracker
   382  		// can then report minionhood at all times, and thus prevent the is-leader
   383  		// and leader-set hook tools from acting in a correct but misleading way
   384  		// (ie continuing to act as though leader after leader-deposed has run).
   385  	}
   386  	if err := u.storage.SetDying(); err != nil {
   387  		return nil, errors.Trace(err)
   388  	}
   389  	for {
   390  		if len(u.relations.GetInfo()) == 0 && u.storage.Empty() {
   391  			return continueAfter(u, newSimpleRunHookOp(hooks.Stop))
   392  		}
   393  		var creator creator
   394  		select {
   395  		case <-u.tomb.Dying():
   396  			return nil, tomb.ErrDying
   397  		case actionId := <-u.f.ActionEvents():
   398  			creator = newActionOp(actionId)
   399  		case <-u.f.ConfigEvents():
   400  			creator = newSimpleRunHookOp(hooks.ConfigChanged)
   401  		case <-u.f.LeaderSettingsEvents():
   402  			creator = newSimpleRunHookOp(hook.LeaderSettingsChanged)
   403  		case hookInfo := <-u.relations.Hooks():
   404  			creator = newRunHookOp(hookInfo)
   405  		case hookInfo := <-u.storage.Hooks():
   406  			creator = newRunHookOp(hookInfo)
   407  		}
   408  		if err := u.runOperation(creator); err != nil {
   409  			return nil, errors.Trace(err)
   410  		}
   411  	}
   412  }
   413  
   414  // waitStorage waits until all storage attachments are provisioned
   415  // and their hooks processed.
   416  func waitStorage(u *Uniter) error {
   417  	if u.storage.Pending() == 0 {
   418  		return nil
   419  	}
   420  	logger.Infof("waiting for storage attachments")
   421  	for u.storage.Pending() > 0 {
   422  		var creator creator
   423  		select {
   424  		case <-u.tomb.Dying():
   425  			return tomb.ErrDying
   426  		case <-u.f.UnitDying():
   427  			// Unit is shutting down; no need to handle any
   428  			// more storage-attached hooks. We will process
   429  			// required storage-detaching hooks in ModeAbideDying.
   430  			return nil
   431  		case tags := <-u.f.StorageEvents():
   432  			creator = newUpdateStorageOp(tags)
   433  		case hookInfo := <-u.storage.Hooks():
   434  			creator = newRunHookOp(hookInfo)
   435  		}
   436  		if err := u.runOperation(creator); err != nil {
   437  			return errors.Trace(err)
   438  		}
   439  	}
   440  	logger.Infof("storage attachments ready")
   441  	return nil
   442  }
   443  
   444  // ModeHookError is responsible for watching and responding to:
   445  // * user resolution of hook errors
   446  // * forced charm upgrade requests
   447  // * loss of service leadership
   448  func ModeHookError(u *Uniter) (next Mode, err error) {
   449  	defer modeContext("ModeHookError", &err)()
   450  	opState := u.operationState()
   451  	if opState.Kind != operation.RunHook || opState.Step != operation.Pending {
   452  		return nil, errors.Errorf("insane uniter state: %#v", u.operationState())
   453  	}
   454  
   455  	// Create error information for status.
   456  	hookInfo := *opState.Hook
   457  	hookName := string(hookInfo.Kind)
   458  	statusData := map[string]interface{}{}
   459  	if hookInfo.Kind.IsRelation() {
   460  		statusData["relation-id"] = hookInfo.RelationId
   461  		if hookInfo.RemoteUnit != "" {
   462  			statusData["remote-unit"] = hookInfo.RemoteUnit
   463  		}
   464  		relationName, err := u.relations.Name(hookInfo.RelationId)
   465  		if err != nil {
   466  			return nil, errors.Trace(err)
   467  		}
   468  		hookName = fmt.Sprintf("%s-%s", relationName, hookInfo.Kind)
   469  	}
   470  	statusData["hook"] = hookName
   471  	statusMessage := fmt.Sprintf("hook failed: %q", hookName)
   472  
   473  	// Run the select loop.
   474  	u.f.WantResolvedEvent()
   475  	u.f.WantUpgradeEvent(true)
   476  	var leaderDeposed <-chan struct{}
   477  	if opState.Leader {
   478  		leaderDeposed = u.leadershipTracker.WaitMinion().Ready()
   479  	}
   480  	for {
   481  		// The spec says we should set the workload status to Error, but that's crazy talk.
   482  		// It's the agent itself that should be in Error state. So we'll ensure the model is
   483  		// correct and translate before the user sees the data.
   484  		// ie a charm hook error results in agent error status, but is presented as a workload error.
   485  		if err = setAgentStatus(u, params.StatusError, statusMessage, statusData); err != nil {
   486  			return nil, errors.Trace(err)
   487  		}
   488  		select {
   489  		case <-u.tomb.Dying():
   490  			return nil, tomb.ErrDying
   491  		case curl := <-u.f.UpgradeEvents():
   492  			return ModeUpgrading(curl), nil
   493  		case rm := <-u.f.ResolvedEvents():
   494  			var creator creator
   495  			switch rm {
   496  			case params.ResolvedRetryHooks:
   497  				creator = newRetryHookOp(hookInfo)
   498  			case params.ResolvedNoHooks:
   499  				creator = newSkipHookOp(hookInfo)
   500  			default:
   501  				return nil, errors.Errorf("unknown resolved mode %q", rm)
   502  			}
   503  			err := u.runOperation(creator)
   504  			if errors.Cause(err) == operation.ErrHookFailed {
   505  				continue
   506  			} else if err != nil {
   507  				return nil, errors.Trace(err)
   508  			}
   509  			return ModeContinue, nil
   510  		case actionId := <-u.f.ActionEvents():
   511  			if err := u.runOperation(newActionOp(actionId)); err != nil {
   512  				return nil, errors.Trace(err)
   513  			}
   514  		case <-leaderDeposed:
   515  			// This should trigger at most once -- we can't reaccept leadership while
   516  			// in an error state.
   517  			leaderDeposed = nil
   518  			if err := u.runOperation(newResignLeadershipOp()); err != nil {
   519  				return nil, errors.Trace(err)
   520  			}
   521  		}
   522  	}
   523  }
   524  
   525  // ModeConflicted is responsible for watching and responding to:
   526  // * user resolution of charm upgrade conflicts
   527  // * forced charm upgrade requests
   528  func ModeConflicted(curl *charm.URL) Mode {
   529  	return func(u *Uniter) (next Mode, err error) {
   530  		defer modeContext("ModeConflicted", &err)()
   531  		// TODO(mue) Add helpful data here too in later CL.
   532  		// The spec says we should set the workload status to Error, but that's crazy talk.
   533  		// It's the agent itself that should be in Error state. So we'll ensure the model is
   534  		// correct and translate before the user sees the data.
   535  		// ie a charm upgrade error results in agent error status, but is presented as a workload error.
   536  		if err := setAgentStatus(u, params.StatusError, "upgrade failed", nil); err != nil {
   537  			return nil, errors.Trace(err)
   538  		}
   539  		u.f.WantResolvedEvent()
   540  		u.f.WantUpgradeEvent(true)
   541  		var creator creator
   542  		select {
   543  		case <-u.tomb.Dying():
   544  			return nil, tomb.ErrDying
   545  		case curl = <-u.f.UpgradeEvents():
   546  			creator = newRevertUpgradeOp(curl)
   547  		case <-u.f.ResolvedEvents():
   548  			creator = newResolvedUpgradeOp(curl)
   549  		}
   550  		return continueAfter(u, creator)
   551  	}
   552  }
   553  
   554  // modeContext returns a function that implements logging and common error
   555  // manipulation for Mode funcs.
   556  func modeContext(name string, err *error) func() {
   557  	logger.Infof("%s starting", name)
   558  	return func() {
   559  		logger.Infof("%s exiting", name)
   560  		*err = errors.Annotatef(*err, name)
   561  	}
   562  }
   563  
   564  // continueAfter is commonly used at the end of a Mode func to execute the
   565  // operation returned by creator and return ModeContinue (or any error).
   566  func continueAfter(u *Uniter, creator creator) (Mode, error) {
   567  	if err := u.runOperation(creator); err != nil {
   568  		return nil, errors.Trace(err)
   569  	}
   570  	return ModeContinue, nil
   571  }