github.com/axw/juju@v0.0.0-20161005053422-4bd6544d08d4/worker/upgradesteps/worker.go

     1  // Copyright 2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package upgradesteps
     5  
     6  import (
     7  	"fmt"
     8  	"time"
     9  
    10  	"github.com/juju/errors"
    11  	"github.com/juju/loggo"
    12  	"github.com/juju/utils"
    13  	"gopkg.in/juju/names.v2"
    14  	"gopkg.in/tomb.v1"
    15  
    16  	"github.com/juju/juju/agent"
    17  	"github.com/juju/juju/api"
    18  	cmdutil "github.com/juju/juju/cmd/jujud/util"
    19  	"github.com/juju/juju/mongo"
    20  	"github.com/juju/juju/state"
    21  	"github.com/juju/juju/state/multiwatcher"
    22  	"github.com/juju/juju/status"
    23  	"github.com/juju/juju/upgrades"
    24  	jujuversion "github.com/juju/juju/version"
    25  	"github.com/juju/juju/worker"
    26  	"github.com/juju/juju/worker/gate"
    27  	"github.com/juju/juju/wrench"
    28  	"github.com/juju/version"
    29  )
    30  
    31  var logger = loggo.GetLogger("juju.worker.upgradesteps")
    32  
    33  var (
    34  	PerformUpgrade = upgrades.PerformUpgrade // Allow patching
    35  
    36  	// The maximum time a master controller will wait for other
    37  	// controllers to come up and indicate they are ready to begin
    38  	// running upgrade steps.
    39  	UpgradeStartTimeoutMaster = time.Minute * 15
    40  
    41  	// The maximum time a secondary controller will wait for other
    42  	// controllers to come up and indicate they are ready to begin
    43  	// running upgrade steps. This is effectively "forever" because we
    44  	// don't really want secondaries to ever give up once they've
    45  	// indicated that they're ready to upgrade. It's up to the master
    46  	// to abort the upgrade if required.
    47  	//
    48  	// This should be reduced when/if master re-elections are
    49  	// introduced to handle the case of a master failing to come
    50  	// up for upgrade.
    51  	UpgradeStartTimeoutSecondary = time.Hour * 4
    52  )
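
// PerformUpgrade is a package variable so that tests can patch it (see
// the comment above); the two timeouts and the other package-level
// function variables further down can be patched the same way. A
// minimal sketch of the pattern, using plain assignment (illustrative
// only, not code from this package):
//
//	saved := UpgradeStartTimeoutMaster
//	UpgradeStartTimeoutMaster = time.Second // make an abort test fast
//	defer func() { UpgradeStartTimeoutMaster = saved }()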
    53  
    54  // NewLock creates a gate.Lock to be used to synchronise workers which
    55  // need to start after upgrades have completed. If no upgrade steps
    56  // are required the Lock is unlocked and the version in the agent's
    57  // configuration is updated to the currently running version.
    58  //
    59  // The returned Lock should be passed to NewWorker.
    60  func NewLock(a agent.Agent) (gate.Lock, error) {
    61  	lock := gate.NewLock()
    62  
    63  	if wrench.IsActive("machine-agent", "always-try-upgrade") {
    64  		// Always enter upgrade mode. This allows testing of upgrades
    65  		// even when there are actually no upgrade steps to run.
    66  		return lock, nil
    67  	}
    68  
    69  	err := a.ChangeConfig(func(agentConfig agent.ConfigSetter) error {
    70  		if !upgrades.AreUpgradesDefined(agentConfig.UpgradedToVersion()) {
    71  			logger.Infof("no upgrade steps required or upgrade steps for %v "+
    72  				"have already been run.", jujuversion.Current)
    73  			lock.Unlock()
    74  
    75  			// Even if no upgrade is required the version number in
    76  			// the agent's config still needs to be bumped.
    77  			agentConfig.SetUpgradedToVersion(jujuversion.Current)
    78  		}
    79  		return nil
    80  	})
    81  	if err != nil {
    82  		return nil, err
    83  	}
    84  	return lock, nil
    85  }
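
// A minimal usage sketch for NewLock; machineAgent is a hypothetical
// agent.Agent supplied by the machine agent's startup code:
//
//	lock, err := NewLock(machineAgent)
//	if err != nil {
//		return errors.Trace(err)
//	}
//	if lock.IsUnlocked() {
//		// No upgrade steps are pending; workers gated on the lock
//		// may start straight away.
//	}
//	// Otherwise pass the lock to NewWorker, and to any workers that
//	// must wait until the upgrade steps have completed.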
    86  
    87  // StatusSetter defines the single method required to set an agent's
    88  // status.
    89  type StatusSetter interface {
    90  	SetStatus(setableStatus status.Status, info string, data map[string]interface{}) error
    91  }
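
// A fake StatusSetter is often enough for tests; a minimal sketch (not
// part of this package):
//
//	type recordingStatusSetter struct {
//		statuses []status.Status
//	}
//
//	func (r *recordingStatusSetter) SetStatus(s status.Status, info string, data map[string]interface{}) error {
//		r.statuses = append(r.statuses, s)
//		return nil
//	}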
    92  
    93  // NewWorker returns a new instance of the upgradesteps worker. It
    94  // will run any required steps to upgrade to the currently running
    95  // Juju version.
    96  func NewWorker(
    97  	upgradeComplete gate.Lock,
    98  	agent agent.Agent,
    99  	apiConn api.Connection,
   100  	jobs []multiwatcher.MachineJob,
   101  	openState func() (*state.State, error),
   102  	preUpgradeSteps func(st *state.State, agentConf agent.Config, isController, isMasterServer bool) error,
   103  	machine StatusSetter,
   104  ) (worker.Worker, error) {
   105  	tag, ok := agent.CurrentConfig().Tag().(names.MachineTag)
   106  	if !ok {
   107  		return nil, errors.New("machine agent's tag is not a MachineTag")
   108  	}
   109  	w := &upgradesteps{
   110  		upgradeComplete: upgradeComplete,
   111  		agent:           agent,
   112  		apiConn:         apiConn,
   113  		jobs:            jobs,
   114  		openState:       openState,
   115  		preUpgradeSteps: preUpgradeSteps,
   116  		machine:         machine,
   117  		tag:             tag,
   118  	}
   119  	go func() {
   120  		defer w.tomb.Done()
   121  		w.tomb.Kill(w.run())
   122  	}()
   123  	return w, nil
   124  }
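
// A rough wiring sketch for NewWorker; every name on the right-hand
// side is a placeholder for a value the machine agent supplies:
//
//	w, err := NewWorker(
//		lock,                // gate.Lock returned by NewLock
//		machineAgent,        // agent.Agent
//		apiConn,             // api.Connection to the controller
//		jobs,                // this machine's multiwatcher.MachineJobs
//		openStateForUpgrade, // func() (*state.State, error)
//		preUpgradeStepsFunc, // pre-upgrade checks
//		machineStatusSetter, // StatusSetter for the machine entity
//	)
//	if err != nil {
//		return errors.Trace(err)
//	}
//	// The worker makes a single pass and then stops; upgradeComplete
//	// is unlocked only if the upgrade steps succeeded.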
   125  
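// upgradesteps is the worker which runs the required upgrade steps for
// the machine agent, reporting progress through the machine's status
// and signalling completion by unlocking upgradeComplete.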
   126  type upgradesteps struct {
   127  	tomb            tomb.Tomb
   128  	upgradeComplete gate.Lock
   129  	agent           agent.Agent
   130  	apiConn         api.Connection
   131  	jobs            []multiwatcher.MachineJob
   132  	openState       func() (*state.State, error)
   133  	preUpgradeSteps func(st *state.State, agentConf agent.Config, isController, isMaster bool) error
   134  	machine         StatusSetter
   135  
   136  	fromVersion  version.Number
   137  	toVersion    version.Number
   138  	tag          names.MachineTag
   139  	isMaster     bool
   140  	isController bool
   141  	st           *state.State
   142  }
   143  
   144  // Kill is part of the worker.Worker interface.
   145  func (w *upgradesteps) Kill() {
   146  	w.tomb.Kill(nil)
   147  }
   148  
   149  // Wait is part of the worker.Worker interface.
   150  func (w *upgradesteps) Wait() error {
   151  	return w.tomb.Wait()
   152  }
   153  
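// apiLostDuringUpgrade wraps an upgrade step failure caused by the API
// connection going away. It is the one kind of failure the worker
// returns, so that the agent restarts and retries the upgrade.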
   154  type apiLostDuringUpgrade struct {
   155  	err error
   156  }
   157  
   158  func (e *apiLostDuringUpgrade) Error() string {
   159  	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
   160  }
   161  
   162  func isAPILostDuringUpgrade(err error) bool {
   163  	_, ok := err.(*apiLostDuringUpgrade)
   164  	return ok
   165  }
   166  
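// run performs a single upgrade pass: it determines the from/to
// versions, opens state if this machine is a controller, runs the
// upgrade steps, and unlocks upgradeComplete on success. Most failures
// are reported via the machine's status rather than returned, so the
// agent stays up waiting for operator intervention.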
   167  func (w *upgradesteps) run() error {
   168  	if wrench.IsActive("machine-agent", "fail-upgrade-start") {
   169  		return nil // Make the worker stop
   170  	}
   171  
   172  	if w.upgradeComplete.IsUnlocked() {
   173  		// Our work is already done (we're probably being restarted
   174  		// because the API connection has gone down), so do nothing.
   175  		return nil
   176  	}
   177  
   178  	w.fromVersion = w.agent.CurrentConfig().UpgradedToVersion()
   179  	w.toVersion = jujuversion.Current
   180  	if w.fromVersion == w.toVersion {
   181  		logger.Infof("upgrade to %v already completed.", w.toVersion)
   182  		w.upgradeComplete.Unlock()
   183  		return nil
   184  	}
   185  
   186  	// If the machine agent is a controller, flag that state
   187  	// needs to be opened before running upgrade steps.
   188  	for _, job := range w.jobs {
   189  		if job == multiwatcher.JobManageModel {
   190  			w.isController = true
   191  		}
   192  	}
   193  
   194  	// We need a *state.State for upgrades. We open it independently
   195  	// of StateWorker, because we have no guarantees about when
   196  	// and how often StateWorker might run.
   197  	if w.isController {
   198  		var err error
   199  		if w.st, err = w.openState(); err != nil {
   200  			return err
   201  		}
   202  		defer w.st.Close()
   203  
   204  		if w.isMaster, err = IsMachineMaster(w.st, w.tag.Id()); err != nil {
   205  			return errors.Trace(err)
   206  		}
   207  	}
   208  
   209  	if err := w.runUpgrades(); err != nil {
   210  		// Only return an error from the worker if the API connection
   211  		// went away (possibly due to a mongo master change). Returning
   212  		// an error when the connection is lost will cause the agent
   213  		// to restart.
   214  		//
   215  		// For other errors, the error is not returned because we want
   216  		// the machine agent to stay running in an error state waiting
   217  		// for user intervention.
   218  		if isAPILostDuringUpgrade(err) {
   219  			return err
   220  		}
   221  		w.reportUpgradeFailure(err, false)
   222  
   223  	} else {
   224  		// Upgrade succeeded - signal that the upgrade is complete.
   225  		logger.Infof("upgrade to %v completed successfully.", w.toVersion)
   226  		w.machine.SetStatus(status.Started, "", nil)
   227  		w.upgradeComplete.Unlock()
   228  	}
   229  	return nil
   230  }
   231  
   232  // runUpgrades runs the upgrade operations for each job type and
   233  // updates the agent's UpgradedToVersion on success.
   234  func (w *upgradesteps) runUpgrades() error {
   235  	upgradeInfo, err := w.prepareForUpgrade()
   236  	if err != nil {
   237  		return err
   238  	}
   239  
   240  	if wrench.IsActive("machine-agent", "fail-upgrade") {
   241  		return errors.New("wrench")
   242  	}
   243  
   244  	if err := w.agent.ChangeConfig(w.runUpgradeSteps); err != nil {
   245  		return err
   246  	}
   247  
   248  	if err := w.finaliseUpgrade(upgradeInfo); err != nil {
   249  		return err
   250  	}
   251  	return nil
   252  }
   253  
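// prepareForUpgrade runs the pre-upgrade checks and, on controllers,
// registers this machine as ready in the shared UpgradeInfo document
// and waits for the other controllers. It returns a nil UpgradeInfo
// for machines which are not controllers.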
   254  func (w *upgradesteps) prepareForUpgrade() (*state.UpgradeInfo, error) {
   255  	logger.Infof("checking that upgrade can proceed")
   256  	if err := w.preUpgradeSteps(w.st, w.agent.CurrentConfig(), w.st != nil, w.isMaster); err != nil {
   257  		return nil, errors.Annotatef(err, "%s cannot be upgraded", names.ReadableString(w.tag))
   258  	}
   259  
   260  	if !w.isController {
   261  		return nil, nil
   262  	}
   263  
   264  	logger.Infof("signalling that this controller is ready for upgrade")
   265  	info, err := w.st.EnsureUpgradeInfo(w.tag.Id(), w.fromVersion, w.toVersion)
   266  	if err != nil {
   267  		return nil, errors.Trace(err)
   268  	}
   269  
   270  	// Controllers need to wait for other controllers to be ready
   271  	// to run the upgrade steps.
   272  	logger.Infof("waiting for other controllers to be ready for upgrade")
   273  	if err := w.waitForOtherControllers(info); err != nil {
   274  		if err == tomb.ErrDying {
   275  			logger.Warningf(`stopped waiting for other controllers: %v`, err)
   276  			return nil, err
   277  		}
   278  		logger.Errorf(`aborted wait for other controllers: %v`, err)
   279  		// If master, trigger a rollback to the previous agent version.
   280  		if w.isMaster {
   281  			logger.Errorf("downgrading model agent version to %v due to aborted upgrade",
   282  				w.fromVersion)
   283  			if rollbackErr := w.st.SetModelAgentVersion(w.fromVersion); rollbackErr != nil {
   284  				logger.Errorf("rollback failed: %v", rollbackErr)
   285  				return nil, errors.Annotate(rollbackErr, "failed to roll back desired agent version")
   286  			}
   287  		}
   288  		return nil, errors.Annotate(err, "aborted wait for other controllers")
   289  	}
   290  	if w.isMaster {
   291  		logger.Infof("finished waiting - all controllers are ready to run upgrade steps")
   292  	} else {
   293  		logger.Infof("finished waiting - the master has completed its upgrade steps")
   294  	}
   295  	return info, nil
   296  }
   297  
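// waitForOtherControllers watches the shared UpgradeInfo until either
// all provisioned controllers are ready (when this machine is the
// master) or the master has finished its upgrade steps (when it is a
// secondary), subject to the timeouts defined at the top of this file.
// A master which times out aborts the upgrade.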
   298  func (w *upgradesteps) waitForOtherControllers(info *state.UpgradeInfo) error {
   299  	watcher := info.Watch()
   300  	defer watcher.Stop()
   301  
   302  	maxWait := getUpgradeStartTimeout(w.isMaster)
   303  	timeout := time.After(maxWait)
   304  	for {
   305  		select {
   306  		case <-watcher.Changes():
   307  			if err := info.Refresh(); err != nil {
   308  				return errors.Trace(err)
   309  			}
   310  			if w.isMaster {
   311  				if ready, err := info.AllProvisionedControllersReady(); err != nil {
   312  					return errors.Trace(err)
   313  				} else if ready {
   314  					// All controllers ready to start upgrade
   315  					err := info.SetStatus(state.UpgradeRunning)
   316  					return errors.Trace(err)
   317  				}
   318  			} else {
   319  				if info.Status() == state.UpgradeFinishing {
   320  					// Master is done, ok to proceed
   321  					return nil
   322  				}
   323  			}
   324  		case <-timeout:
   325  			if w.isMaster {
   326  				if err := info.Abort(); err != nil {
   327  					return errors.Annotate(err, "unable to abort upgrade")
   328  				}
   329  			}
   330  			return errors.Errorf("timed out after %s", maxWait)
   331  		case <-w.tomb.Dying():
   332  			return tomb.ErrDying
   333  		}
   334  
   335  	}
   336  }
   337  
   338  // runUpgradeSteps runs the required upgrade steps for the machine
   339  // agent, retrying on failure. The agent's UpgradedToVersion is set
   340  // once the upgrade is complete.
   341  //
   342  // This function conforms to the agent.ConfigMutator type and is
   343  // designed to be called via a machine agent's ChangeConfig method.
   344  func (w *upgradesteps) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
   345  	var upgradeErr error
   346  	w.machine.SetStatus(status.Started, fmt.Sprintf("upgrading to %v", w.toVersion), nil)
   347  
   348  	context := upgrades.NewContext(agentConfig, w.apiConn, w.st)
   349  	logger.Infof("starting upgrade from %v to %v for %q", w.fromVersion, w.toVersion, w.tag)
   350  
   351  	targets := jobsToTargets(w.jobs, w.isMaster)
   352  	attempts := getUpgradeRetryStrategy()
   353  	for attempt := attempts.Start(); attempt.Next(); {
   354  		upgradeErr = PerformUpgrade(w.fromVersion, targets, context)
   355  		if upgradeErr == nil {
   356  			break
   357  		}
   358  		if cmdutil.ConnectionIsDead(logger, w.apiConn) {
   359  			// API connection has gone away - abort!
   360  			return &apiLostDuringUpgrade{upgradeErr}
   361  		}
   362  		if attempt.HasNext() {
   363  			w.reportUpgradeFailure(upgradeErr, true)
   364  		}
   365  	}
   366  	if upgradeErr != nil {
   367  		return upgradeErr
   368  	}
   369  	agentConfig.SetUpgradedToVersion(w.toVersion)
   370  	return nil
   371  }
   372  
   373  func (w *upgradesteps) reportUpgradeFailure(err error, willRetry bool) {
   374  	retryText := "will retry"
   375  	if !willRetry {
   376  		retryText = "giving up"
   377  	}
   378  	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
   379  		w.fromVersion, w.toVersion, w.tag, retryText, err)
   380  	w.machine.SetStatus(status.Error,
   381  		fmt.Sprintf("upgrade to %v failed (%s): %v", w.toVersion, retryText, err), nil)
   382  }
   383  
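// finaliseUpgrade records this controller's completion in the shared
// UpgradeInfo. The master additionally moves the status to
// UpgradeFinishing so that the secondaries stop waiting and run their
// own steps. It is a no-op for machines which are not controllers.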
   384  func (w *upgradesteps) finaliseUpgrade(info *state.UpgradeInfo) error {
   385  	if !w.isController {
   386  		return nil
   387  	}
   388  
   389  	if w.isMaster {
   390  		// Tell other controllers that the master has completed its
   391  		// upgrade steps.
   392  		if err := info.SetStatus(state.UpgradeFinishing); err != nil {
   393  			return errors.Annotate(err, "upgrade done but failed to set status to finishing")
   394  		}
   395  	}
   396  
   397  	if err := info.SetControllerDone(w.tag.Id()); err != nil {
   398  		return errors.Annotate(err, "upgrade done but failed to synchronise")
   399  	}
   400  
   401  	return nil
   402  }
   403  
   404  func getUpgradeStartTimeout(isMaster bool) time.Duration {
   405  	if wrench.IsActive("machine-agent", "short-upgrade-timeout") {
   406  		// This duration is fairly arbitrary. During manual testing it
   407  		// avoids the normal long wait but still provides a small
   408  		// window to check the environment status and logs before the
   409  		// timeout is triggered.
   410  		return time.Minute
   411  	}
   412  
   413  	if isMaster {
   414  		return UpgradeStartTimeoutMaster
   415  	}
   416  	return UpgradeStartTimeoutSecondary
   417  }
   418  
   419  var IsMachineMaster = func(st *state.State, machineId string) (bool, error) {
   420  	if st == nil {
   421  		// If there is no state, we aren't a master.
   422  		return false, nil
   423  	}
   424  	// Not calling the agent's openState method as it does other checks
   425  	// we really don't care about here. All we need here is the machine
   426  	// so we can determine if we are the master or not.
   427  	machine, err := st.Machine(machineId)
   428  	if err != nil {
   429  		// This shouldn't happen, and if it does, the state worker will have
   430  		// found out before us, and already errored, or is likely to error out
   431  		// very shortly.  All we do here is return the error. The state worker
   432  		// returns an error that will cause the agent to be terminated.
   433  		return false, errors.Trace(err)
   434  	}
   435  	isMaster, err := mongo.IsMaster(st.MongoSession(), machine)
   436  	if err != nil {
   437  		return false, errors.Trace(err)
   438  	}
   439  	return isMaster, nil
   440  }
   441  
   442  // TODO(katco): 2016-08-09: lp:1611427
   443  var getUpgradeRetryStrategy = func() utils.AttemptStrategy {
   444  	return utils.AttemptStrategy{
   445  		Delay: 2 * time.Minute,
   446  		Min:   5,
   447  	}
   448  }
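
// With the strategy above runUpgradeSteps attempts the upgrade steps
// at least five times, pausing two minutes between attempts, before
// giving up: roughly eight minutes of retries for a persistently
// failing step (assuming the usual utils.AttemptStrategy semantics,
// where Min is the minimum number of attempts and Delay the pause
// between them).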
   449  
   450  // jobsToTargets determines the upgrade targets corresponding to the
   451  // jobs assigned to a machine agent. The targets determine which
   452  // upgrade steps will run during an upgrade.
   453  func jobsToTargets(jobs []multiwatcher.MachineJob, isMaster bool) (targets []upgrades.Target) {
   454  	for _, job := range jobs {
   455  		switch job {
   456  		case multiwatcher.JobManageModel:
   457  			targets = append(targets, upgrades.Controller)
   458  			if isMaster {
   459  				targets = append(targets, upgrades.DatabaseMaster)
   460  			}
   461  		case multiwatcher.JobHostUnits:
   462  			targets = append(targets, upgrades.HostMachine)
   463  		}
   464  	}
   465  	return
   466  }
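
// For example, a controller machine which also hosts units maps to
// the following targets (illustrative call, not executed anywhere in
// this package):
//
//	jobsToTargets([]multiwatcher.MachineJob{
//		multiwatcher.JobManageModel,
//		multiwatcher.JobHostUnits,
//	}, true)
//	// => [upgrades.Controller, upgrades.DatabaseMaster, upgrades.HostMachine]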