github.com/cloud-green/juju@v0.0.0-20151002100041-a00291338d3d/cmd/jujud/agent/upgrade.go

package agent

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/names"
	"github.com/juju/utils"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	"github.com/juju/juju/apiserver/params"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/state/storage"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/wrench"
)

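// upgradingMachineAgent describes the machine agent functionality that the
// upgrade worker needs: access to agent configuration, machine status
// reporting, mongo management and a termination signal.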
type upgradingMachineAgent interface {
	ensureMongoServer(agent.Config) error
	setMachineStatus(api.Connection, params.Status, string) error
	CurrentConfig() agent.Config
	ChangeConfig(agent.ConfigMutator) error
	Dying() <-chan struct{}
}

var (
	upgradesPerformUpgrade = upgrades.PerformUpgrade // Allow patching

	// The maximum time a master state server will wait for other
	// state servers to come up and indicate they are ready to begin
	// running upgrade steps.
	upgradeStartTimeoutMaster = time.Minute * 15

	// The maximum time a secondary state server will wait for other
	// state servers to come up and indicate they are ready to begin
	// running upgrade steps. This is effectively "forever" because we
	// don't really want secondaries to ever give up once they've
	// indicated that they're ready to upgrade. It's up to the master
	// to abort the upgrade if required.
	//
	// This should be reduced when/if master re-elections are
	// introduced to handle the case of a master failing to come up
	// for upgrade.
	upgradeStartTimeoutSecondary = time.Hour * 4
)

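// NewUpgradeWorkerContext returns a new upgradeWorkerContext with its
// UpgradeComplete channel ready to be closed once upgrades are done.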
func NewUpgradeWorkerContext() *upgradeWorkerContext {
	return &upgradeWorkerContext{
		UpgradeComplete: make(chan struct{}),
	}
}

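// upgradeWorkerContext holds the state shared across the phases of a
// machine agent upgrade. UpgradeComplete is closed once the upgrade steps
// have run (or were found to be unnecessary), which is what
// IsUpgradeRunning inspects.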
type upgradeWorkerContext struct {
	UpgradeComplete chan struct{}
	fromVersion     version.Number
	toVersion       version.Number
	agent           upgradingMachineAgent
	tag             names.MachineTag
	machineId       string
	isMaster        bool
	apiState        api.Connection
	jobs            []multiwatcher.MachineJob
	agentConfig     agent.Config
	isStateServer   bool
	st              *state.State
}

// InitializeUsingAgent sets up an upgradeWorkerContext from a machine agent instance.
// It may update the agent's configuration.
func (c *upgradeWorkerContext) InitializeUsingAgent(a upgradingMachineAgent) error {
	if wrench.IsActive("machine-agent", "always-try-upgrade") {
		// Always enter upgrade mode. This allows testing of upgrades
		// even when there are actually no upgrade steps to run.
		return nil
	}
	return a.ChangeConfig(func(agentConfig agent.ConfigSetter) error {
		if !upgrades.AreUpgradesDefined(agentConfig.UpgradedToVersion()) {
			logger.Infof("no upgrade steps required or upgrade steps for %v "+
				"have already been run.", version.Current.Number)
			close(c.UpgradeComplete)

			// Even if no upgrade is required, the version number in
			// the agent's config still needs to be bumped.
			agentConfig.SetUpgradedToVersion(version.Current.Number)
		}
		return nil
	})
}

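// Worker returns a worker that runs any required upgrade steps using the
// supplied API connection and machine jobs, then exits.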
func (c *upgradeWorkerContext) Worker(
	agent upgradingMachineAgent,
	apiState api.Connection,
	jobs []multiwatcher.MachineJob,
) worker.Worker {
	c.agent = agent
	c.apiState = apiState
	c.jobs = jobs
	return worker.NewSimpleWorker(c.run)
}
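
// The sketch below is illustrative only and is not part of the original
// source: it shows how a machine agent might wire the pieces above
// together, assuming it already holds an API connection and its job list.
// The function name and signature are hypothetical.
func exampleUpgradeWorkerWiring(
	a upgradingMachineAgent,
	apiState api.Connection,
	jobs []multiwatcher.MachineJob,
) (worker.Worker, error) {
	ctx := NewUpgradeWorkerContext()
	// May bump the agent's UpgradedToVersion when no steps are needed.
	if err := ctx.InitializeUsingAgent(a); err != nil {
		return nil, errors.Trace(err)
	}
	// The returned worker runs the upgrade steps and closes
	// ctx.UpgradeComplete once they have completed (or were unnecessary).
	return ctx.Worker(a, apiState, jobs), nil
}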

func (c *upgradeWorkerContext) IsUpgradeRunning() bool {
	select {
	case <-c.UpgradeComplete:
		return false
	default:
		return true
	}
}

type apiLostDuringUpgrade struct {
	err error
}

func (e *apiLostDuringUpgrade) Error() string {
	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
}

func isAPILostDuringUpgrade(err error) bool {
	_, ok := err.(*apiLostDuringUpgrade)
	return ok
}

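// run is the body of the upgrade worker. It decides whether upgrade steps
// are needed, opens state first on state servers, runs the steps, and
// closes UpgradeComplete on success. Failures during the upgrade steps are
// reported via the machine's status rather than returned, so that the
// agent keeps running while waiting for operator intervention; a lost API
// connection is the exception and is returned so the agent restarts.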
func (c *upgradeWorkerContext) run(stop <-chan struct{}) error {
	if wrench.IsActive("machine-agent", "fail-upgrade-start") {
		return nil // Make the worker stop
	}

	select {
	case <-c.UpgradeComplete:
		// Our work is already done (we're probably being restarted
		// because the API connection has gone down), so do nothing.
		return nil
	default:
	}

	c.agentConfig = c.agent.CurrentConfig()

	c.fromVersion = c.agentConfig.UpgradedToVersion()
	c.toVersion = version.Current.Number
	if c.fromVersion == c.toVersion {
		logger.Infof("upgrade to %v already completed.", c.toVersion)
		close(c.UpgradeComplete)
		return nil
	}

	if err := c.initTag(c.agentConfig.Tag()); err != nil {
		return errors.Trace(err)
	}

	// If the machine agent is a state server, flag that state
	// needs to be opened before running upgrade steps.
	for _, job := range c.jobs {
		if job == multiwatcher.JobManageEnviron {
			c.isStateServer = true
		}
	}

	// We need a *state.State for upgrades. We open it independently
	// of StateWorker, because we have no guarantees about when
	// and how often StateWorker might run.
	if c.isStateServer {
		var err error
		if c.st, err = openStateForUpgrade(c.agent, c.agentConfig); err != nil {
			return err
		}
		defer c.st.Close()

		if c.isMaster, err = isMachineMaster(c.st, c.machineId); err != nil {
			return errors.Trace(err)
		}

		stor := storage.NewStorage(c.st.EnvironUUID(), c.st.MongoSession())
		registerSimplestreamsDataSource(stor)

		// This state-dependent data source will be useless once state
		// is closed by the earlier defer, so unregister it on the way out.
		defer unregisterSimplestreamsDataSource()
	}
	if err := c.runUpgrades(); err != nil {
		// Only return an error from the worker if the connection to
		// state went away (possible mongo master change). Returning
		// an error when the connection is lost will cause the agent
		// to restart.
		//
		// For other errors, the error is not returned because we want
		// the machine agent to stay running in an error state waiting
		// for user intervention.
		if isAPILostDuringUpgrade(err) {
			return err
		}
		c.reportUpgradeFailure(err, false)
	} else {
		// Upgrade succeeded - signal that the upgrade is complete.
		logger.Infof("upgrade to %v completed successfully.", c.toVersion)
		c.agent.setMachineStatus(c.apiState, params.StatusStarted, "")
		close(c.UpgradeComplete)
	}
	return nil
}

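// initTag records the agent's machine tag and machine ID, failing if the
// agent is not a machine agent.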
func (c *upgradeWorkerContext) initTag(tag names.Tag) error {
	var ok bool
	if c.tag, ok = tag.(names.MachineTag); !ok {
		return errors.New("machine agent's tag is not a MachineTag")
	}
	c.machineId = c.tag.Id()
	return nil
}

var agentTerminating = errors.New("machine agent is terminating")

// runUpgrades runs the upgrade operations for each job type and
// updates the agent's UpgradedToVersion on success.
func (c *upgradeWorkerContext) runUpgrades() error {
	upgradeInfo, err := c.prepareForUpgrade()
	if err != nil {
		return err
	}

	if wrench.IsActive("machine-agent", "fail-upgrade") {
		return errors.New("wrench")
	}

	if err := c.agent.ChangeConfig(c.runUpgradeSteps); err != nil {
		return err
	}

	if err := c.finaliseUpgrade(upgradeInfo); err != nil {
		return err
	}
	return nil
}

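// prepareForUpgrade synchronises state servers before any upgrade steps
// run. It is a no-op for machines that are not state servers. On a state
// server it records that this machine is ready to upgrade and then waits
// for the other state servers; if that wait is aborted, the master also
// rolls the desired agent version back to the previous version.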
func (c *upgradeWorkerContext) prepareForUpgrade() (*state.UpgradeInfo, error) {
	if !c.isStateServer {
		return nil, nil
	}

	logger.Infof("signalling that this state server is ready for upgrade")
	info, err := c.st.EnsureUpgradeInfo(c.machineId, c.fromVersion, c.toVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// State servers need to wait for other state servers to be ready
	// to run the upgrade steps.
	logger.Infof("waiting for other state servers to be ready for upgrade")
	if err := c.waitForOtherStateServers(info); err != nil {
		if err == agentTerminating {
			logger.Warningf(`stopped waiting for other state servers: %v`, err)
		} else {
			logger.Errorf(`aborted wait for other state servers: %v`, err)
			// If master, trigger a rollback to the previous agent version.
			if c.isMaster {
				logger.Errorf("downgrading environment agent version to %v due to aborted upgrade",
					c.fromVersion)
				if rollbackErr := c.st.SetEnvironAgentVersion(c.fromVersion); rollbackErr != nil {
					logger.Errorf("rollback failed: %v", rollbackErr)
					return nil, errors.Annotate(rollbackErr, "failed to roll back desired agent version")
				}
			}
		}
		return nil, errors.Annotate(err, "aborted wait for other state servers")
	}
	if c.isMaster {
		logger.Infof("finished waiting - all state servers are ready to run upgrade steps")
	} else {
		logger.Infof("finished waiting - the master has completed its upgrade steps")
	}
	return info, nil
}

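// waitForOtherStateServers watches the shared upgrade document until the
// upgrade can proceed, the start timeout expires, or the agent starts
// dying. The master waits for all provisioned state servers to report
// ready and then marks the upgrade as running; secondaries wait for the
// master to finish its upgrade steps.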
func (c *upgradeWorkerContext) waitForOtherStateServers(info *state.UpgradeInfo) error {
	watcher := info.Watch()
	defer watcher.Stop()

	maxWait := getUpgradeStartTimeout(c.isMaster)
	timeout := time.After(maxWait)
	for {
		select {
		case <-watcher.Changes():
			if err := info.Refresh(); err != nil {
				return errors.Trace(err)
			}
			if c.isMaster {
				if ready, err := info.AllProvisionedStateServersReady(); err != nil {
					return errors.Trace(err)
				} else if ready {
					// All state servers ready to start upgrade
					err := info.SetStatus(state.UpgradeRunning)
					return errors.Trace(err)
				}
			} else {
				if info.Status() == state.UpgradeFinishing {
					// Master is done, ok to proceed
					return nil
				}
			}
		case <-timeout:
			if c.isMaster {
				if err := info.Abort(); err != nil {
					return errors.Annotate(err, "unable to abort upgrade")
				}
			}
			return errors.Errorf("timed out after %s", maxWait)
		case <-c.agent.Dying():
			return agentTerminating
		}
	}
}

// runUpgradeSteps runs the required upgrade steps for the machine
// agent, retrying on failure. The agent's UpgradedToVersion is set
// once the upgrade is complete.
//
// This function conforms to the agent.ConfigMutator type and is
// designed to be called via a machine agent's ChangeConfig method.
func (c *upgradeWorkerContext) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	var upgradeErr error
	a := c.agent
	a.setMachineStatus(c.apiState, params.StatusStarted, fmt.Sprintf("upgrading to %v", c.toVersion))

	context := upgrades.NewContext(agentConfig, c.apiState, c.st)
	logger.Infof("starting upgrade from %v to %v for %q", c.fromVersion, c.toVersion, c.tag)

	targets := jobsToTargets(c.jobs, c.isMaster)
	attempts := getUpgradeRetryStrategy()
	for attempt := attempts.Start(); attempt.Next(); {
		upgradeErr = upgradesPerformUpgrade(c.fromVersion, targets, context)
		if upgradeErr == nil {
			break
		}
		if cmdutil.ConnectionIsDead(logger, c.apiState) {
			// API connection has gone away - abort!
			return &apiLostDuringUpgrade{upgradeErr}
		}
		if attempt.HasNext() {
			c.reportUpgradeFailure(upgradeErr, true)
		}
	}
	if upgradeErr != nil {
		return upgradeErr
	}
	agentConfig.SetUpgradedToVersion(c.toVersion)
	return nil
}

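// reportUpgradeFailure logs an upgrade failure and records it in the
// machine's status, noting whether another attempt will be made.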
func (c *upgradeWorkerContext) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
		c.fromVersion, c.toVersion, c.tag, retryText, err)
	c.agent.setMachineStatus(c.apiState, params.StatusError,
		fmt.Sprintf("upgrade to %v failed (%s): %v", c.toVersion, retryText, err))
}

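// finaliseUpgrade records that this state server has completed its upgrade
// steps. The master additionally flags the upgrade as finishing so that
// the secondaries may run their own steps.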
func (c *upgradeWorkerContext) finaliseUpgrade(info *state.UpgradeInfo) error {
	if !c.isStateServer {
		return nil
	}

	if c.isMaster {
		// Tell other state servers that the master has completed its
		// upgrade steps.
		if err := info.SetStatus(state.UpgradeFinishing); err != nil {
			return errors.Annotate(err, "upgrade done but failed to set upgrade status")
		}
	}

	if err := info.SetStateServerDone(c.machineId); err != nil {
		return errors.Annotate(err, "upgrade done but failed to synchronise")
	}

	return nil
}

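// getUpgradeStartTimeout returns how long a state server should wait for
// its peers before giving up on starting the upgrade. A wrench can shorten
// the wait for manual testing.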
func getUpgradeStartTimeout(isMaster bool) time.Duration {
	if wrench.IsActive("machine-agent", "short-upgrade-timeout") {
		// This duration is fairly arbitrary. During manual testing it
		// avoids the normal long wait but still provides a small
		// window to check the environment status and logs before the
		// timeout is triggered.
		return time.Minute
	}

	if isMaster {
		return upgradeStartTimeoutMaster
	}
	return upgradeStartTimeoutSecondary
}

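// openStateForUpgrade opens state for a state server that is about to run
// upgrade steps, making sure the local mongo server is up first. It is
// declared as a variable to allow patching (as with upgradesPerformUpgrade
// above).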
var openStateForUpgrade = func(
	agent upgradingMachineAgent,
	agentConfig agent.Config,
) (*state.State, error) {
	if err := agent.ensureMongoServer(agentConfig); err != nil {
		return nil, err
	}
	info, ok := agentConfig.MongoInfo()
	if !ok {
		return nil, fmt.Errorf("no state info available")
	}
	st, err := state.Open(agentConfig.Environment(), info, mongo.DefaultDialOpts(), environs.NewStatePolicy())
	if err != nil {
		return nil, err
	}
	return st, nil
}

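// isMachineMaster reports whether the given machine is currently the mongo
// master. It is declared as a variable to allow patching.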
var isMachineMaster = func(st *state.State, machineId string) (bool, error) {
	if st == nil {
		// If there is no state, we aren't a master.
		return false, nil
	}
	// Not calling the agent openState method as it does other checks
	// we really don't care about here. All we need here is the machine
	// so we can determine if we are the master or not.
	machine, err := st.Machine(machineId)
	if err != nil {
		// This shouldn't happen, and if it does, the state worker will have
		// found out before us, and already errored, or is likely to error out
		// very shortly. All we do here is return the error. The state worker
		// returns an error that will cause the agent to be terminated.
		return false, errors.Trace(err)
	}
	isMaster, err := mongo.IsMaster(st.MongoSession(), machine)
	if err != nil {
		return false, errors.Trace(err)
	}
	return isMaster, nil
}

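// getUpgradeRetryStrategy returns the retry strategy used when running
// upgrade steps: a minimum of five attempts, two minutes apart. It is
// declared as a variable to allow patching.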
var getUpgradeRetryStrategy = func() utils.AttemptStrategy {
	return utils.AttemptStrategy{
		Delay: 2 * time.Minute,
		Min:   5,
	}
}

// jobsToTargets determines the upgrade targets corresponding to the
// jobs assigned to a machine agent. This determines the upgrade steps
// which will run during an upgrade.
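//
// For illustration (an assumed example, not part of the original source):
// a master state server that also hosts units maps to all three targets:
//
//	jobsToTargets([]multiwatcher.MachineJob{
//		multiwatcher.JobManageEnviron,
//		multiwatcher.JobHostUnits,
//	}, true)
//	// => [upgrades.StateServer, upgrades.DatabaseMaster, upgrades.HostMachine]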
func jobsToTargets(jobs []multiwatcher.MachineJob, isMaster bool) (targets []upgrades.Target) {
	for _, job := range jobs {
		switch job {
		case multiwatcher.JobManageEnviron:
			targets = append(targets, upgrades.StateServer)
			if isMaster {
				targets = append(targets, upgrades.DatabaseMaster)
			}
		case multiwatcher.JobHostUnits:
			targets = append(targets, upgrades.HostMachine)
		}
	}
	return
}