github.com/altoros/juju-vmware@v0.0.0-20150312064031-f19ae857ccca/cmd/jujud/agent/upgrade.go

package agent

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/names"
	"github.com/juju/utils"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	"github.com/juju/juju/apiserver/params"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/state/storage"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/wrench"
)

type upgradingMachineAgent interface {
	ensureMongoServer(agent.Config) error
	setMachineStatus(*api.State, params.Status, string) error
	CurrentConfig() agent.Config
	ChangeConfig(AgentConfigMutator) error
	Dying() <-chan struct{}
}

var (
	upgradesPerformUpgrade = upgrades.PerformUpgrade // Allow patching

	// The maximum time a master state server will wait for other
	// state servers to come up and indicate they are ready to begin
	// running upgrade steps.
	upgradeStartTimeoutMaster = time.Minute * 15

	// The maximum time a secondary state server will wait for other
	// state servers to come up and indicate they are ready to begin
	// running upgrade steps. This is effectively "forever" because we
	// don't really want secondaries to ever give up once they've
	// indicated that they're ready to upgrade. It's up to the master
	// to abort the upgrade if required.
	//
	// This should be reduced when/if master re-elections are
	// introduced to handle the case of a master failing to come up
	// for upgrade.
	upgradeStartTimeoutSecondary = time.Hour * 4
)

func NewUpgradeWorkerContext() *upgradeWorkerContext {
	return &upgradeWorkerContext{
		UpgradeComplete: make(chan struct{}),
	}
}

type upgradeWorkerContext struct {
	UpgradeComplete chan struct{}
	fromVersion     version.Number
	toVersion       version.Number
	agent           upgradingMachineAgent
	tag             names.MachineTag
	machineId       string
	isMaster        bool
	apiState        *api.State
	jobs            []multiwatcher.MachineJob
	agentConfig     agent.Config
	isStateServer   bool
	st              *state.State
}

// InitializeUsingAgent sets up an upgradeWorkerContext from a machine agent instance.
// It may update the agent's configuration.
func (c *upgradeWorkerContext) InitializeUsingAgent(a upgradingMachineAgent) error {
	if wrench.IsActive("machine-agent", "always-try-upgrade") {
		// Always enter upgrade mode. This allows testing of upgrades
		// even when there are actually no upgrade steps to run.
		return nil
	}
	return a.ChangeConfig(func(agentConfig agent.ConfigSetter) error {
		if !upgrades.AreUpgradesDefined(agentConfig.UpgradedToVersion()) {
			logger.Infof("no upgrade steps required or upgrade steps for %v "+
				"have already been run.", version.Current.Number)
			close(c.UpgradeComplete)

			// Even if no upgrade is required the version number in
			// the agent's config still needs to be bumped.
			agentConfig.SetUpgradedToVersion(version.Current.Number)
		}
		return nil
	})
}
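
// A minimal usage sketch (assumed agent wiring, not part of the original
// file): the machine agent would typically create the context once, let it
// inspect the current config, and later hand the returned worker to its
// runner. The names a, runner, apiState and jobs below are placeholders.
//
//	upgradeCtx := NewUpgradeWorkerContext()
//	if err := upgradeCtx.InitializeUsingAgent(a); err != nil {
//		return err
//	}
//	runner.StartWorker("upgrade-steps", func() (worker.Worker, error) {
//		return upgradeCtx.Worker(a, apiState, jobs), nil
//	})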

func (c *upgradeWorkerContext) Worker(
	agent upgradingMachineAgent,
	apiState *api.State,
	jobs []multiwatcher.MachineJob,
) worker.Worker {
	c.agent = agent
	c.apiState = apiState
	c.jobs = jobs
	return worker.NewSimpleWorker(c.run)
}

func (c *upgradeWorkerContext) IsUpgradeRunning() bool {
	select {
	case <-c.UpgradeComplete:
		return false
	default:
		return true
	}
}

type apiLostDuringUpgrade struct {
	err error
}

func (e *apiLostDuringUpgrade) Error() string {
	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
}

func isAPILostDuringUpgrade(err error) bool {
	_, ok := err.(*apiLostDuringUpgrade)
	return ok
}

func (c *upgradeWorkerContext) run(stop <-chan struct{}) error {
	if wrench.IsActive("machine-agent", "fail-upgrade-start") {
		return nil // Make the worker stop
	}

	select {
	case <-c.UpgradeComplete:
		// Our work is already done (we're probably being restarted
		// because the API connection has gone down), so do nothing.
		return nil
	default:
	}

	c.agentConfig = c.agent.CurrentConfig()

	c.fromVersion = c.agentConfig.UpgradedToVersion()
	c.toVersion = version.Current.Number
	if c.fromVersion == c.toVersion {
		logger.Infof("upgrade to %v already completed.", c.toVersion)
		close(c.UpgradeComplete)
		return nil
	}

	if err := c.initTag(c.agentConfig.Tag()); err != nil {
		return errors.Trace(err)
	}

	// If the machine agent is a state server, flag that state
	// needs to be opened before running upgrade steps.
	for _, job := range c.jobs {
		if job == multiwatcher.JobManageEnviron {
			c.isStateServer = true
		}
	}

	// We need a *state.State for upgrades. We open it independently
	// of StateWorker, because we have no guarantees about when
	// and how often StateWorker might run.
	if c.isStateServer {
		var err error
		if c.st, err = openStateForUpgrade(c.agent, c.agentConfig); err != nil {
			return err
		}
		defer c.st.Close()

		if c.isMaster, err = isMachineMaster(c.st, c.machineId); err != nil {
			return errors.Trace(err)
		}

		stor := storage.NewStorage(c.st.EnvironUUID(), c.st.MongoSession())
		registerSimplestreamsDataSource(stor)
	}
	if err := c.runUpgrades(); err != nil {
		// Only return an error from the worker if the connection to
		// state went away (possible mongo master change). Returning
		// an error when the connection is lost will cause the agent
		// to restart.
		//
		// For other errors, the error is not returned because we want
		// the machine agent to stay running in an error state waiting
		// for user intervention.
		if isAPILostDuringUpgrade(err) {
			return err
		}
		c.reportUpgradeFailure(err, false)
	} else {
		// Upgrade succeeded - signal that the upgrade is complete.
		logger.Infof("upgrade to %v completed successfully.", c.toVersion)
		c.agent.setMachineStatus(c.apiState, params.StatusStarted, "")
		close(c.UpgradeComplete)
	}
	return nil
}
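
// Other workers in the agent can gate on UpgradeComplete to avoid doing real
// work while upgrade steps are still running; the channel is closed exactly
// once, whether the steps ran or were skipped. A sketch of such a gate
// (assumed caller code, not part of the original file):
//
//	select {
//	case <-upgradeCtx.UpgradeComplete:
//		// upgrade steps finished (or were not needed); safe to proceed
//	case <-stop:
//		return nil
//	}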

func (c *upgradeWorkerContext) initTag(tag names.Tag) error {
	var ok bool
	if c.tag, ok = tag.(names.MachineTag); !ok {
		return errors.New("machine agent's tag is not a MachineTag")
	}
	c.machineId = c.tag.Id()
	return nil
}

var agentTerminating = errors.New("machine agent is terminating")

// runUpgrades runs the upgrade operations for each job type and
// updates the agent's UpgradedToVersion on success.
func (c *upgradeWorkerContext) runUpgrades() error {
	upgradeInfo, err := c.prepareForUpgrade()
	if err != nil {
		return err
	}

	if wrench.IsActive("machine-agent", "fail-upgrade") {
		return errors.New("wrench")
	}

	if err := c.agent.ChangeConfig(c.runUpgradeSteps); err != nil {
		return err
	}

	if err := c.finaliseUpgrade(upgradeInfo); err != nil {
		return err
	}
	return nil
}

func (c *upgradeWorkerContext) prepareForUpgrade() (*state.UpgradeInfo, error) {
	if !c.isStateServer {
		return nil, nil
	}

	logger.Infof("signalling that this state server is ready for upgrade")
	info, err := c.st.EnsureUpgradeInfo(c.machineId, c.fromVersion, c.toVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// State servers need to wait for other state servers to be ready
	// to run the upgrade steps.
	logger.Infof("waiting for other state servers to be ready for upgrade")
	if err := c.waitForOtherStateServers(info); err != nil {
		if err == agentTerminating {
			logger.Warningf(`stopped waiting for other state servers: %v`, err)
		} else {
			logger.Errorf(`aborted wait for other state servers: %v`, err)
			// If master, trigger a rollback to the previous agent version.
			if c.isMaster {
				logger.Errorf("downgrading environment agent version to %v due to aborted upgrade",
					c.fromVersion)
				if rollbackErr := c.st.SetEnvironAgentVersion(c.fromVersion); rollbackErr != nil {
					logger.Errorf("rollback failed: %v", rollbackErr)
					return nil, errors.Annotate(rollbackErr, "failed to roll back desired agent version")
				}
			}
		}
		return nil, errors.Annotate(err, "aborted wait for other state servers")
	}
	if c.isMaster {
		logger.Infof("finished waiting - all state servers are ready to run upgrade steps")
	} else {
		logger.Infof("finished waiting - the master has completed its upgrade steps")
	}
	return info, nil
}
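
// To summarise the coordination above and below (inferred from this file):
// every state server registers itself via EnsureUpgradeInfo; the master waits
// until AllProvisionedStateServersReady reports true and then moves the shared
// UpgradeInfo document to UpgradeRunning; secondaries instead wait until the
// master sets UpgradeFinishing, which it only does after its own upgrade steps
// have completed (see finaliseUpgrade below).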

func (c *upgradeWorkerContext) waitForOtherStateServers(info *state.UpgradeInfo) error {
	watcher := info.Watch()

	maxWait := getUpgradeStartTimeout(c.isMaster)
	timeout := time.After(maxWait)
	for {
		select {
		case <-watcher.Changes():
			if err := info.Refresh(); err != nil {
				return errors.Trace(err)
			}
			if c.isMaster {
				if ready, err := info.AllProvisionedStateServersReady(); err != nil {
					return errors.Trace(err)
				} else if ready {
					// All state servers ready to start upgrade
					err := info.SetStatus(state.UpgradeRunning)
					return errors.Trace(err)
				}
			} else {
				if info.Status() == state.UpgradeFinishing {
					// Master is done, ok to proceed
					return nil
				}
			}
		case <-timeout:
			if c.isMaster {
				if err := info.Abort(); err != nil {
					return errors.Annotate(err, "unable to abort upgrade")
				}
			}
			return errors.Errorf("timed out after %s", maxWait)
		case <-c.agent.Dying():
			return agentTerminating
		}
	}
}

// runUpgradeSteps runs the required upgrade steps for the machine
// agent, retrying on failure. The agent's UpgradedToVersion is set
// once the upgrade is complete.
//
// This function conforms to the AgentConfigMutator type and is
// designed to be called via a machine agent's ChangeConfig method.
func (c *upgradeWorkerContext) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	var upgradeErr error
	a := c.agent
	a.setMachineStatus(c.apiState, params.StatusStarted, fmt.Sprintf("upgrading to %v", c.toVersion))

	context := upgrades.NewContext(agentConfig, c.apiState, c.st)
	logger.Infof("starting upgrade from %v to %v for %q", c.fromVersion, c.toVersion, c.tag)

	targets := jobsToTargets(c.jobs, c.isMaster)
	attempts := getUpgradeRetryStrategy()
	for attempt := attempts.Start(); attempt.Next(); {
		upgradeErr = upgradesPerformUpgrade(c.fromVersion, targets, context)
		if upgradeErr == nil {
			break
		}
		if cmdutil.ConnectionIsDead(logger, c.apiState) {
			// API connection has gone away - abort!
			return &apiLostDuringUpgrade{upgradeErr}
		}
		if attempt.HasNext() {
			c.reportUpgradeFailure(upgradeErr, true)
		}
	}
	if upgradeErr != nil {
		return upgradeErr
	}
	agentConfig.SetUpgradedToVersion(c.toVersion)
	return nil
}
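
// upgradesPerformUpgrade is a package-level variable so the call above can be
// patched out in tests. A hypothetical stub (test scaffolding assumed, not
// part of the original file; the signature is inferred from the call site and
// the assignment to upgrades.PerformUpgrade) might look like:
//
//	upgradesPerformUpgrade = func(_ version.Number, _ []upgrades.Target, _ upgrades.Context) error {
//		return nil // pretend all upgrade steps succeeded
//	}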

func (c *upgradeWorkerContext) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
		c.fromVersion, c.toVersion, c.tag, retryText, err)
	c.agent.setMachineStatus(c.apiState, params.StatusError,
		fmt.Sprintf("upgrade to %v failed (%s): %v", c.toVersion, retryText, err))
}

func (c *upgradeWorkerContext) finaliseUpgrade(info *state.UpgradeInfo) error {
	if !c.isStateServer {
		return nil
	}

	if c.isMaster {
		// Tell other state servers that the master has completed its
		// upgrade steps.
		if err := info.SetStatus(state.UpgradeFinishing); err != nil {
			return errors.Annotate(err, "upgrade done but")
		}
	}

	if err := info.SetStateServerDone(c.machineId); err != nil {
		return errors.Annotate(err, "upgrade done but failed to synchronise")
	}

	return nil
}

func getUpgradeStartTimeout(isMaster bool) time.Duration {
	if wrench.IsActive("machine-agent", "short-upgrade-timeout") {
		// This duration is fairly arbitrary. During manual testing it
		// avoids the normal long wait but still provides a small
		// window to check the environment status and logs before the
		// timeout is triggered.
		return time.Minute
	}

	if isMaster {
		return upgradeStartTimeoutMaster
	}
	return upgradeStartTimeoutSecondary
}

var openStateForUpgrade = func(
	agent upgradingMachineAgent,
	agentConfig agent.Config,
) (*state.State, error) {
	if err := agent.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}
	info, ok := agentConfig.MongoInfo()
	if !ok {
		return nil, fmt.Errorf("no state info available")
	}
	st, err := state.Open(info, mongo.DefaultDialOpts(), environs.NewStatePolicy())
	if err != nil {
		return nil, err
	}
	return st, nil
}

var isMachineMaster = func(st *state.State, machineId string) (bool, error) {
	if st == nil {
		// If there is no state, we aren't a master.
		return false, nil
	}
	// Not calling the agent openState method as it does other checks
	// we really don't care about here. All we need here is the machine
	// so we can determine if we are the master or not.
	machine, err := st.Machine(machineId)
	if err != nil {
		// This shouldn't happen, and if it does, the state worker will have
		// found out before us, and already errored, or is likely to error out
		// very shortly. All we do here is return the error. The state worker
		// returns an error that will cause the agent to be terminated.
		return false, errors.Trace(err)
	}
	isMaster, err := mongo.IsMaster(st.MongoSession(), machine)
	if err != nil {
		return false, errors.Trace(err)
	}
	return isMaster, nil
}

var getUpgradeRetryStrategy = func() utils.AttemptStrategy {
	return utils.AttemptStrategy{
		Delay: 2 * time.Minute,
		Min:   5,
	}
}
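
// With the strategy above, and assuming the usual utils.AttemptStrategy
// behaviour (at least Min attempts with Delay between them), the upgrade
// steps in runUpgradeSteps are tried at least five times with a two minute
// pause between attempts, so a persistent failure only becomes final
// (reportUpgradeFailure with willRetry=false, called from run) after roughly
// eight minutes of pauses plus the time the steps themselves take.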

// jobsToTargets determines the upgrade targets corresponding to the
// jobs assigned to a machine agent. This determines the upgrade steps
// which will run during an upgrade.
func jobsToTargets(jobs []multiwatcher.MachineJob, isMaster bool) (targets []upgrades.Target) {
	for _, job := range jobs {
		switch job {
		case multiwatcher.JobManageEnviron:
			targets = append(targets, upgrades.StateServer)
			if isMaster {
				targets = append(targets, upgrades.DatabaseMaster)
			}
		case multiwatcher.JobHostUnits:
			targets = append(targets, upgrades.HostMachine)
		}
	}
	return
}
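
// For example, a machine holding both JobManageEnviron and JobHostUnits that
// is also the mongo master yields the targets
// [StateServer, DatabaseMaster, HostMachine], while a plain unit-hosting
// machine yields just [HostMachine].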