github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/worker/upgradesteps/worker.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package upgradesteps

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/utils"
	"github.com/juju/version"
	"gopkg.in/juju/names.v2"
	"gopkg.in/juju/worker.v1"
	"gopkg.in/tomb.v2"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/upgrades"
	jujuversion "github.com/juju/juju/version"
	"github.com/juju/juju/worker/gate"
	"github.com/juju/juju/wrench"
)

var logger = loggo.GetLogger("juju.worker.upgradesteps")

var (
	PerformUpgrade = upgrades.PerformUpgrade // Allow patching

	// The maximum time a master controller will wait for other
	// controllers to come up and indicate they are ready to begin
	// running upgrade steps.
	UpgradeStartTimeoutMaster = time.Minute * 15

	// The maximum time a secondary controller will wait for other
	// controllers to come up and indicate they are ready to begin
	// running upgrade steps. This is effectively "forever" because we
	// don't really want secondaries to ever give up once they've
	// indicated that they're ready to upgrade. It's up to the master
	// to abort the upgrade if required.
	//
	// This should get reduced when/if master re-elections are
	// introduced to handle the case of a master failing to come up
	// for upgrade.
	UpgradeStartTimeoutSecondary = time.Hour * 4
)

// NewLock creates a gate.Lock to be used to synchronise workers which
// need to start after upgrades have completed. The returned Lock should
// be passed to NewWorker. If the agent has already upgraded to the
// current version, then the lock will be returned in the released state.
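//
// Note that build numbers are ignored when deciding whether the agent
// is already at the current version: for example (hypothetical version
// numbers), an agent recorded as upgraded to 2.5.1 that is running a
// 2.5.1.1 binary gets an already-unlocked gate, whereas an agent at
// 2.5.0 running a 2.5.1 binary keeps the gate locked until the upgrade
// steps have run.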
func NewLock(agentConfig agent.Config) gate.Lock {
	lock := gate.NewLock()

	if wrench.IsActive(wrenchKey(agentConfig), "always-try-upgrade") {
		// Always enter upgrade mode. This allows testing of upgrades
		// even when there are actually no upgrade steps to run.
		return lock
	}

	// Build numbers are irrelevant to upgrade steps.
	upgradedToVersion := agentConfig.UpgradedToVersion()
	upgradedToVersion.Build = 0
	currentVersion := jujuversion.Current
	currentVersion.Build = 0
	if upgradedToVersion == currentVersion {
		logger.Infof(
			"upgrade steps for %v have already been run.",
			jujuversion.Current,
		)
		lock.Unlock()
	}

	return lock
}

// StatusSetter defines the single method required to set an agent's
// status.
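//
// Any value with a matching SetStatus method satisfies the interface.
// As an illustrative sketch (discardStatusSetter is hypothetical and
// not part of this package), a test double could simply discard the
// values it is given:
//
//	type discardStatusSetter struct{}
//
//	func (discardStatusSetter) SetStatus(status.Status, string, map[string]interface{}) error {
//		return nil
//	}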
type StatusSetter interface {
	SetStatus(setableStatus status.Status, info string, data map[string]interface{}) error
}

// NewWorker returns a new instance of the upgradesteps worker. It
// will run any required steps to upgrade to the currently running
// Juju version.
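//
// A rough, illustrative wiring sketch (all of the values below - a,
// apiConn, jobs, openStatePool, preUpgradeSteps, statusSetter and
// newEnviron - are hypothetical placeholders supplied by the calling
// agent code, not names defined in this package):
//
//	lock := NewLock(a.CurrentConfig())
//	w, err := NewWorker(lock, a, apiConn, jobs, openStatePool,
//		preUpgradeSteps, statusSetter, newEnviron)
//	if err != nil {
//		return errors.Trace(err)
//	}
//	// Workers that must wait for the upgrade can now gate on the
//	// lock (e.g. via lock.IsUnlocked()).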
func NewWorker(
	upgradeComplete gate.Lock,
	agent agent.Agent,
	apiConn api.Connection,
	jobs []multiwatcher.MachineJob,
	openState func() (*state.StatePool, error),
	preUpgradeSteps func(st *state.StatePool, agentConf agent.Config, isController, isMasterServer bool) error,
	machine StatusSetter,
	newEnvironFunc environs.NewEnvironFunc,
) (worker.Worker, error) {
	w := &upgradesteps{
		upgradeComplete: upgradeComplete,
		agent:           agent,
		apiConn:         apiConn,
		jobs:            jobs,
		openState:       openState,
		preUpgradeSteps: preUpgradeSteps,
		machine:         machine,
		tag:             agent.CurrentConfig().Tag(),
	}
	w.tomb.Go(w.run)
	return w, nil
}

type upgradesteps struct {
	tomb            tomb.Tomb
	upgradeComplete gate.Lock
	agent           agent.Agent
	apiConn         api.Connection
	jobs            []multiwatcher.MachineJob
	openState       func() (*state.StatePool, error)
	preUpgradeSteps func(st *state.StatePool, agentConf agent.Config, isController, isMaster bool) error
	machine         StatusSetter

	fromVersion  version.Number
	toVersion    version.Number
	tag          names.Tag
	isMaster     bool
	isController bool
	pool         *state.StatePool
}

// Kill is part of the worker.Worker interface.
func (w *upgradesteps) Kill() {
	w.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *upgradesteps) Wait() error {
	return w.tomb.Wait()
}

type apiLostDuringUpgrade struct {
	err error
}

func (e *apiLostDuringUpgrade) Error() string {
	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
}

func isAPILostDuringUpgrade(err error) bool {
	_, ok := err.(*apiLostDuringUpgrade)
	return ok
}

func (w *upgradesteps) wrenchKey() string {
	return wrenchKey(w.agent.CurrentConfig())
}

func wrenchKey(agentConfig agent.Config) string {
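	// For example, a machine agent (tag "machine-0") yields the key
	// "machine-agent", so a single wrench file can affect every agent
	// of that kind.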
	return agentConfig.Tag().Kind() + "-agent"
}

func (w *upgradesteps) run() error {
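	// Overall flow: decide whether any upgrade steps are needed, open
	// state if this agent is a controller, run the steps, and release
	// the upgrade gate on success. Most failures are reported via the
	// machine status so the agent keeps running and waits for operator
	// intervention; only a lost API connection is returned as an
	// error, which causes the agent to restart.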
	if wrench.IsActive(w.wrenchKey(), "fail-upgrade-start") {
		return nil // Make the worker stop
	}

	if w.upgradeComplete.IsUnlocked() {
		// Our work is already done (we're probably being restarted
		// because the API connection has gone down), so do nothing.
		return nil
	}

	w.fromVersion = w.agent.CurrentConfig().UpgradedToVersion()
	w.toVersion = jujuversion.Current
	if w.fromVersion == w.toVersion {
		logger.Infof("upgrade to %v already completed.", w.toVersion)
		w.upgradeComplete.Unlock()
		return nil
	}

	// If the agent is a machine agent for a controller, flag that state
	// needs to be opened before running upgrade steps.
	for _, job := range w.jobs {
		if job == multiwatcher.JobManageModel {
			w.isController = true
		}
	}

	// We need a *state.State for upgrades. We open it independently
	// of StateWorker, because we have no guarantees about when
	// and how often StateWorker might run.
	if w.isController {
		var err error
		if w.pool, err = w.openState(); err != nil {
			return err
		}
		defer w.pool.Close()

		if w.isMaster, err = IsMachineMaster(w.pool, w.tag.Id()); err != nil {
			return errors.Trace(err)
		}
	}

	if err := w.runUpgrades(); err != nil {
		// Only return an error from the worker if the connection to
		// state went away (possible mongo master change). Returning
		// an error when the connection is lost will cause the agent
		// to restart.
		//
		// For other errors, the error is not returned because we want
		// the agent to stay running in an error state waiting
		// for user intervention.
		if isAPILostDuringUpgrade(err) {
			return err
		}
		w.reportUpgradeFailure(err, false)

	} else {
		// Upgrade succeeded - signal that the upgrade is complete.
		logger.Infof("upgrade to %v completed successfully.", w.toVersion)
		w.machine.SetStatus(status.Started, "", nil)
		w.upgradeComplete.Unlock()
	}
	return nil
}

// runUpgrades runs the upgrade operations for each job type and
// updates the agent's UpgradedToVersion on success.
func (w *upgradesteps) runUpgrades() error {
	upgradeInfo, err := w.prepareForUpgrade()
	if err != nil {
		return err
	}

	if wrench.IsActive(w.wrenchKey(), "fail-upgrade") {
		return errors.New("wrench")
	}

	if err := w.agent.ChangeConfig(w.runUpgradeSteps); err != nil {
		return err
	}

	if err := w.finaliseUpgrade(upgradeInfo); err != nil {
		return err
	}
	return nil
}

func (w *upgradesteps) prepareForUpgrade() (*state.UpgradeInfo, error) {
	logger.Infof("checking that upgrade can proceed")
	if err := w.preUpgradeSteps(w.pool, w.agent.CurrentConfig(), w.pool != nil, w.isMaster); err != nil {
		return nil, errors.Annotatef(err, "%s cannot be upgraded", names.ReadableString(w.tag))
	}

	if w.isController {
		return w.prepareControllerForUpgrade()
	}
	return nil, nil
}

func (w *upgradesteps) prepareControllerForUpgrade() (*state.UpgradeInfo, error) {
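	// Record this controller as ready to upgrade, then wait for the
	// other controllers (or, on a secondary, for the master to finish
	// its steps) before the upgrade steps are run. If the wait is
	// aborted, the master rolls the desired model agent version back.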
	logger.Infof("signalling that this controller is ready for upgrade")
	st := w.pool.SystemState()
	info, err := st.EnsureUpgradeInfo(w.tag.Id(), w.fromVersion, w.toVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// controllers need to wait for other controllers to be ready
	// to run the upgrade steps.
	logger.Infof("waiting for other controllers to be ready for upgrade")
	if err := w.waitForOtherControllers(info); err != nil {
		if err == tomb.ErrDying {
			logger.Warningf(`stopped waiting for other controllers: %v`, err)
			return nil, err
		}
		logger.Errorf(`aborted wait for other controllers: %v`, err)
		// If master, trigger a rollback to the previous agent version.
		if w.isMaster {
			logger.Errorf("downgrading model agent version to %v due to aborted upgrade",
				w.fromVersion)
			if rollbackErr := st.SetModelAgentVersion(w.fromVersion, true); rollbackErr != nil {
				logger.Errorf("rollback failed: %v", rollbackErr)
				return nil, errors.Annotate(rollbackErr, "failed to roll back desired agent version")
			}
		}
		return nil, errors.Annotate(err, "aborted wait for other controllers")
	}
	if w.isMaster {
		logger.Infof("finished waiting - all controllers are ready to run upgrade steps")
	} else {
		logger.Infof("finished waiting - the master has completed its upgrade steps")
	}
	return info, nil
}

func (w *upgradesteps) waitForOtherControllers(info *state.UpgradeInfo) error {
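	// Watch the shared UpgradeInfo document: the master waits until
	// every provisioned controller has reported ready and then marks
	// the upgrade as running, while a secondary waits for the master
	// to move the upgrade to the "finishing" state. The wait is
	// bounded by getUpgradeStartTimeout, and a timed-out master aborts
	// the upgrade.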
	watcher := info.Watch()
	defer watcher.Stop()

	maxWait := w.getUpgradeStartTimeout()
	timeout := time.After(maxWait)
	for {
		select {
		case <-watcher.Changes():
			if err := info.Refresh(); err != nil {
				return errors.Trace(err)
			}
			if w.isMaster {
				if ready, err := info.AllProvisionedControllersReady(); err != nil {
					return errors.Trace(err)
				} else if ready {
					// All controllers ready to start upgrade
					err := info.SetStatus(state.UpgradeRunning)
					return errors.Trace(err)
				}
			} else {
				if info.Status() == state.UpgradeFinishing {
					// Master is done, ok to proceed
					return nil
				}
			}
		case <-timeout:
			if w.isMaster {
				if err := info.Abort(); err != nil {
					return errors.Annotate(err, "unable to abort upgrade")
				}
			}
			return errors.Errorf("timed out after %s", maxWait)
		case <-w.tomb.Dying():
			return tomb.ErrDying
		}
	}
}

// runUpgradeSteps runs the required upgrade steps for the agent,
// retrying on failure. The agent's UpgradedToVersion is set
// once the upgrade is complete.
//
// This function conforms to the agent.ConfigMutator type and is
// designed to be called via an agent's ChangeConfig method.
func (w *upgradesteps) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	var upgradeErr error
	w.machine.SetStatus(status.Started, fmt.Sprintf("upgrading to %v", w.toVersion), nil)

	stBackend := upgrades.NewStateBackend(w.pool)
	context := upgrades.NewContext(agentConfig, w.apiConn, stBackend)
	logger.Infof("starting upgrade from %v to %v for %q", w.fromVersion, w.toVersion, w.tag)

	targets := jobsToTargets(w.jobs, w.isMaster)
	attempts := getUpgradeRetryStrategy()
	for attempt := attempts.Start(); attempt.Next(); {
		upgradeErr = PerformUpgrade(w.fromVersion, targets, context)
		if upgradeErr == nil {
			break
		}
		if cmdutil.ConnectionIsDead(logger, w.apiConn) {
			// API connection has gone away - abort!
			return &apiLostDuringUpgrade{upgradeErr}
		}
		if attempt.HasNext() {
			w.reportUpgradeFailure(upgradeErr, true)
		}
	}
	if upgradeErr != nil {
		return upgradeErr
	}
	agentConfig.SetUpgradedToVersion(w.toVersion)
	return nil
}

func (w *upgradesteps) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
		w.fromVersion, w.toVersion, w.tag, retryText, err)
	w.machine.SetStatus(status.Error,
		fmt.Sprintf("upgrade to %v failed (%s): %v", w.toVersion, retryText, err), nil)
}

func (w *upgradesteps) finaliseUpgrade(info *state.UpgradeInfo) error {
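	// Record this controller's upgrade steps as complete. The master
	// additionally moves the shared upgrade status to UpgradeFinishing
	// first, which lets the secondary controllers proceed.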
	if !w.isController {
		return nil
	}

	if w.isMaster {
		// Tell other controllers that the master has completed its
		// upgrade steps.
		if err := info.SetStatus(state.UpgradeFinishing); err != nil {
			return errors.Annotate(err, "upgrade done but failed to set status")
		}
	}

	if err := info.SetControllerDone(w.tag.Id()); err != nil {
		return errors.Annotate(err, "upgrade done but failed to synchronise")
	}

	return nil
}

func (w *upgradesteps) getUpgradeStartTimeout() time.Duration {
	if wrench.IsActive(w.wrenchKey(), "short-upgrade-timeout") {
		// This duration is fairly arbitrary. During manual testing it
		// avoids the normal long wait but still provides a small
		// window to check the environment status and logs before the
		// timeout is triggered.
		return time.Minute
	}

	if w.isMaster {
		return UpgradeStartTimeoutMaster
	}
	return UpgradeStartTimeoutSecondary
}

var IsMachineMaster = func(pool *state.StatePool, machineId string) (bool, error) {
	if pool == nil {
		// If there is no state pool, we aren't a master.
		return false, nil
	}
	// Not calling the agent openState method as it does other checks
	// we really don't care about here.  All we need here is the machine
	// so we can determine if we are the master or not.
	st := pool.SystemState()
	machine, err := st.Machine(machineId)
	if err != nil {
		// This shouldn't happen, and if it does, the state worker will have
		// found out before us, and already errored, or is likely to error out
		// very shortly.  All we do here is return the error. The state worker
		// returns an error that will cause the agent to be terminated.
		return false, errors.Trace(err)
	}
	isMaster, err := mongo.IsMaster(st.MongoSession(), machine)
	if err != nil {
		return false, errors.Trace(err)
	}
	return isMaster, nil
}

// TODO(katco): 2016-08-09: lp:1611427
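// The strategy below allows at least five attempts, with a two-minute
// pause between retries, so persistently failing upgrade steps are
// retried for roughly eight minutes before runUpgradeSteps gives up.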
var getUpgradeRetryStrategy = func() utils.AttemptStrategy {
	return utils.AttemptStrategy{
		Delay: 2 * time.Minute,
		Min:   5,
	}
}

// jobsToTargets determines the upgrade targets corresponding to the
// jobs assigned to an agent. This determines the upgrade steps
// which will run during an upgrade.
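//
// For example (illustrative only): a master controller machine holding
// both JobManageModel and JobHostUnits maps to the targets
// [Controller, DatabaseMaster, HostMachine], while a machine with only
// JobHostUnits maps to just [HostMachine].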
func jobsToTargets(jobs []multiwatcher.MachineJob, isMaster bool) (targets []upgrades.Target) {
	for _, job := range jobs {
		switch job {
		case multiwatcher.JobManageModel:
			targets = append(targets, upgrades.Controller)
			if isMaster {
				targets = append(targets, upgrades.DatabaseMaster)
			}
		case multiwatcher.JobHostUnits:
			targets = append(targets, upgrades.HostMachine)
		}
	}
	return
}