github.com/altoros/juju-vmware@v0.0.0-20150312064031-f19ae857ccca/cmd/jujud/agent/upgrade_test.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package agent

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"sync"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	jc "github.com/juju/testing/checkers"
	"github.com/juju/utils"
	"github.com/juju/utils/apt"
	gc "gopkg.in/check.v1"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	"github.com/juju/juju/apiserver/params"
	cmdutil "github.com/juju/juju/cmd/jujud/util"
	"github.com/juju/juju/constraints"
	"github.com/juju/juju/environs"
	"github.com/juju/juju/environs/config"
	envtesting "github.com/juju/juju/environs/testing"
	"github.com/juju/juju/mongo"
	"github.com/juju/juju/state"
	"github.com/juju/juju/state/multiwatcher"
	"github.com/juju/juju/state/watcher"
	coretesting "github.com/juju/juju/testing"
	"github.com/juju/juju/upgrades"
	"github.com/juju/juju/version"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/upgrader"
)

type UpgradeSuite struct {
	commonMachineSuite

	aptCmds         []*exec.Cmd
	oldVersion      version.Binary
	logWriter       loggo.TestWriter
	connectionDead  bool
	machineIsMaster bool
	aptMutex        sync.Mutex
}

var _ = gc.Suite(&UpgradeSuite{})

type exposedAPI bool

var (
	FullAPIExposed       exposedAPI = true
	RestrictedAPIExposed exposedAPI = false
)

const fails = true
const succeeds = false

// setAptCmds appends cmd to the slice of captured apt commands, or
// resets the slice when cmd is nil. It is safe for concurrent use.
func (s *UpgradeSuite) setAptCmds(cmd *exec.Cmd) []*exec.Cmd {
	s.aptMutex.Lock()
	defer s.aptMutex.Unlock()
	if cmd == nil {
		s.aptCmds = nil
	} else {
		s.aptCmds = append(s.aptCmds, cmd)
	}
	return s.aptCmds
}

// getAptCmds returns the apt commands captured so far. It is safe for
// concurrent use.
func (s *UpgradeSuite) getAptCmds() []*exec.Cmd {
	s.aptMutex.Lock()
	defer s.aptMutex.Unlock()
	return s.aptCmds
}

func (s *UpgradeSuite) SetUpTest(c *gc.C) {
	s.commonMachineSuite.SetUpTest(c)

	// Capture all apt commands.
	s.aptCmds = nil
	aptCmds := s.AgentSuite.HookCommandOutput(&apt.CommandOutput, nil, nil)
	go func() {
		for cmd := range aptCmds {
			s.setAptCmds(cmd)
		}
	}()

	s.oldVersion = version.Current
	s.oldVersion.Major = 1
	s.oldVersion.Minor = 16

	// Don't wait so long in tests.
	s.PatchValue(&upgradeStartTimeoutMaster, 50*time.Millisecond)
	s.PatchValue(&upgradeStartTimeoutSecondary, 60*time.Millisecond)

	// Allow tests to make the API connection appear to be dead.
	s.connectionDead = false
	s.PatchValue(&cmdutil.ConnectionIsDead, func(loggo.Logger, cmdutil.Pinger) bool {
		return s.connectionDead
	})

	fakeOpenStateForUpgrade := func(upgradingMachineAgent, agent.Config) (*state.State, error) {
		mongoInfo := s.State.MongoConnectionInfo()
		st, err := state.Open(mongoInfo, mongo.DefaultDialOpts(), environs.NewStatePolicy())
		c.Assert(err, jc.ErrorIsNil)
		return st, nil
	}
	s.PatchValue(&openStateForUpgrade, fakeOpenStateForUpgrade)

	s.machineIsMaster = true
	fakeIsMachineMaster := func(*state.State, string) (bool, error) {
		return s.machineIsMaster, nil
	}
	s.PatchValue(&isMachineMaster, fakeIsMachineMaster)
}

// captureLogs registers a log writer so that tests can assert on the
// messages emitted during an upgrade; it is removed again at teardown.
func (s *UpgradeSuite) captureLogs(c *gc.C) {
	c.Assert(loggo.RegisterWriter("upgrade-tests", &s.logWriter, loggo.INFO), gc.IsNil)
	s.AddCleanup(func(*gc.C) {
		loggo.RemoveWriter("upgrade-tests")
		s.logWriter.Clear()
	})
}

// countUpgradeAttempts patches the upgrade logic to return upgradeErr
// and returns a pointer to a counter recording how many times the
// upgrade steps were attempted.
func (s *UpgradeSuite) countUpgradeAttempts(upgradeErr error) *int {
	count := 0
	s.PatchValue(&upgradesPerformUpgrade, func(version.Number, []upgrades.Target, upgrades.Context) error {
		count++
		return upgradeErr
	})
	return &count
}

func (s *UpgradeSuite) TestContextInitializeWhenNoUpgradeRequired(c *gc.C) {
	// Set the agent's initial upgradedToVersion to almost the same as
	// the current version. We want it to be different to
	// version.Current (so that we can see it change) but not to
	// trigger upgrade steps.
	config := NewFakeConfigSetter(names.NewMachineTag("0"), makeBumpedCurrentVersion().Number)
	agent := NewFakeUpgradingMachineAgent(config)

	context := NewUpgradeWorkerContext()
	context.InitializeUsingAgent(agent)

	select {
	case <-context.UpgradeComplete:
		// Success
	default:
		c.Fatal("UpgradeComplete channel should be closed because no upgrade is required")
	}
	// The agent's version should have been updated.
	c.Assert(config.Version, gc.Equals, version.Current.Number)
}

func (s *UpgradeSuite) TestContextInitializeWhenUpgradeRequired(c *gc.C) {
	// Set the agent's upgradedToVersion so that upgrade steps are required.
	initialVersion := version.MustParse("1.16.0")
	config := NewFakeConfigSetter(names.NewMachineTag("0"), initialVersion)
	agent := NewFakeUpgradingMachineAgent(config)

	context := NewUpgradeWorkerContext()
	context.InitializeUsingAgent(agent)

	select {
	case <-context.UpgradeComplete:
		c.Fatal("UpgradeComplete channel shouldn't be closed because upgrade is required")
	default:
		// Success
	}
	// The agent's version should NOT have been updated.
	c.Assert(config.Version, gc.Equals, initialVersion)
}

func (s *UpgradeSuite) TestRetryStrategy(c *gc.C) {
	retries := getUpgradeRetryStrategy()
	c.Assert(retries.Delay, gc.Equals, 2*time.Minute)
	c.Assert(retries.Min, gc.Equals, 5)
}

func (s *UpgradeSuite) TestIsUpgradeRunning(c *gc.C) {
	context := NewUpgradeWorkerContext()
	c.Assert(context.IsUpgradeRunning(), jc.IsTrue)

	close(context.UpgradeComplete)
	c.Assert(context.IsUpgradeRunning(), jc.IsFalse)
}

func (s *UpgradeSuite) TestNoUpgradeNecessary(c *gc.C) {
	attemptsP := s.countUpgradeAttempts(nil)
	s.captureLogs(c)
	s.oldVersion = version.Current // nothing to do

	workerErr, config, _, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, version.Current.Number)
	assertUpgradeComplete(c, context)
}

func (s *UpgradeSuite) TestUpgradeStepsFailure(c *gc.C) {
	// This test checks what happens when every upgrade attempt fails.
	// A number of retries should be observed and the agent should end
	// up in a state where it is still running but is reporting an
	// error and the upgrade is not flagged as having completed (which
	// prevents most of the agent's workers from running and keeps the
	// API in restricted mode).

	attemptsP := s.countUpgradeAttempts(errors.New("boom"))
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	// The worker shouldn't return an error so that the worker and
	// agent keep running.
	c.Check(workerErr, gc.IsNil)

	c.Check(*attemptsP, gc.Equals, maxUpgradeRetries)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't finish
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals,
		s.makeExpectedStatusCalls(maxUpgradeRetries-1, fails, "boom"))
	c.Assert(s.logWriter.Log(), jc.LogMatches,
		s.makeExpectedUpgradeLogs(maxUpgradeRetries-1, "hostMachine", fails, "boom"))
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestUpgradeStepsRetries(c *gc.C) {
	// This test checks what happens when the first upgrade attempt
	// fails but the following one succeeds. The final state should be
	// the same as for a successful upgrade that worked on the first
	// attempt.
	attempts := 0
	fail := true
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		attempts++
		if fail {
			fail = false
			return errors.New("boom")
		}
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.IsNil)
	c.Check(attempts, gc.Equals, 2)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade finished
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, s.makeExpectedStatusCalls(1, succeeds, "boom"))
	c.Assert(s.logWriter.Log(), jc.LogMatches, s.makeExpectedUpgradeLogs(1, "hostMachine", succeeds, "boom"))
	assertUpgradeComplete(c, context)
}

func (s *UpgradeSuite) TestOtherUpgradeRunFailure(c *gc.C) {
	// This test checks what happens when something other than the
	// upgrade steps themselves fails, ensuring that the failure is
	// logged and the agent status is updated.

	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Delete UpgradeInfo for the upgrade so that finaliseUpgrade() will fail.
		s.State.ClearUpgradeInfo()
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)
	s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade almost finished
	failReason := `upgrade done but: cannot set upgrade status to "finishing": ` +
		`Another status change may have occurred concurrently`
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals,
		s.makeExpectedStatusCalls(0, fails, failReason))
	c.Assert(s.logWriter.Log(), jc.LogMatches,
		s.makeExpectedUpgradeLogs(0, "databaseMaster", fails, failReason))
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestApiConnectionFailure(c *gc.C) {
	// This test checks what happens when an upgrade fails because the
	// connection to mongo has gone away. This will happen when the
	// mongo master changes. In this case we want the upgrade worker
	// to return immediately without further retries. The error should
	// be returned by the worker so that the agent will restart.

	attemptsP := s.countUpgradeAttempts(errors.New("boom"))
	s.connectionDead = true // Make the connection to state appear to be dead
	s.captureLogs(c)

	workerErr, config, _, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	c.Check(workerErr, gc.ErrorMatches, "API connection lost during upgrade: boom")
	c.Check(*attemptsP, gc.Equals, 1)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't finish
	assertUpgradeNotComplete(c, context)
}

func (s *UpgradeSuite) TestAbortWhenOtherStateServerDoesntStartUpgrade(c *gc.C) {
	// This test checks what happens when a state server is upgrading
	// and one of the other state servers doesn't signal that it is
	// ready in time.

	// The master state server in this scenario is functionally tested
	// elsewhere in this suite.
	s.machineIsMaster = false

	s.createUpgradingStateServers(c)
	s.captureLogs(c)
	attemptsP := s.countUpgradeAttempts(nil)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't happen
	assertUpgradeNotComplete(c, context)

	// The environment agent-version should still be the new version.
	// It's up to the master to trigger the rollback.
	s.assertEnvironAgentVersion(c, version.Current.Number)

	causeMsg := " timed out after 60ms"
	c.Assert(s.logWriter.Log(), jc.LogMatches, []jc.SimpleMessage{
		{loggo.INFO, "waiting for other state servers to be ready for upgrade"},
		{loggo.ERROR, "aborted wait for other state servers:" + causeMsg},
		{loggo.ERROR, `upgrade from .+ to .+ for "machine-0" failed \(giving up\): ` +
			"aborted wait for other state servers:" + causeMsg},
	})
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, []MachineStatusCall{{
		params.StatusError,
		fmt.Sprintf(
			"upgrade to %s failed (giving up): aborted wait for other state servers:"+causeMsg,
			version.Current.Number),
	}})
}

func (s *UpgradeSuite) TestWorkerAbortsIfAgentDies(c *gc.C) {
	s.machineIsMaster = false
	s.captureLogs(c)
	attemptsP := s.countUpgradeAttempts(nil)

	s.primeAgent(c, s.oldVersion, state.JobManageEnviron)

	config := s.makeFakeConfig()
	agent := NewFakeUpgradingMachineAgent(config)
	close(agent.DyingCh)
	workerErr, context := s.runUpgradeWorkerUsingAgent(c, agent, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't happen
	assertUpgradeNotComplete(c, context)
	c.Assert(s.logWriter.Log(), jc.LogMatches, []jc.SimpleMessage{
		{loggo.WARNING, "stopped waiting for other state servers: machine agent is terminating"},
	})
}

func (s *UpgradeSuite) TestSuccessMaster(c *gc.C) {
	// This test checks what happens when an upgrade works on the
	// first attempt on a master state server.
	s.machineIsMaster = true
	info := s.checkSuccess(c, "databaseMaster", func(*state.UpgradeInfo) {})
	c.Assert(info.Status(), gc.Equals, state.UpgradeFinishing)
}

func (s *UpgradeSuite) TestSuccessSecondary(c *gc.C) {
	// This test checks what happens when an upgrade works on the
	// first attempt on a secondary state server.
	s.machineIsMaster = false
	mungeInfo := func(info *state.UpgradeInfo) {
		// Indicate that the master is done
		err := info.SetStatus(state.UpgradeRunning)
		c.Assert(err, jc.ErrorIsNil)
		err = info.SetStatus(state.UpgradeFinishing)
		c.Assert(err, jc.ErrorIsNil)
	}
	s.checkSuccess(c, "stateServer", mungeInfo)
}

func (s *UpgradeSuite) checkSuccess(c *gc.C, target string, mungeInfo func(*state.UpgradeInfo)) *state.UpgradeInfo {
	_, machineIdB, machineIdC := s.createUpgradingStateServers(c)

	// Indicate that machines B and C are ready to upgrade.
	vPrevious := s.oldVersion.Number
	vNext := version.Current.Number
	info, err := s.State.EnsureUpgradeInfo(machineIdB, vPrevious, vNext)
	c.Assert(err, jc.ErrorIsNil)
	_, err = s.State.EnsureUpgradeInfo(machineIdC, vPrevious, vNext)
	c.Assert(err, jc.ErrorIsNil)

	mungeInfo(info)

	attemptsP := s.countUpgradeAttempts(nil)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 1)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade finished
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, s.makeExpectedStatusCalls(0, succeeds, ""))
	c.Assert(s.logWriter.Log(), jc.LogMatches, s.makeExpectedUpgradeLogs(0, target, succeeds, ""))
	assertUpgradeComplete(c, context)

	err = info.Refresh()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(info.StateServersDone(), jc.DeepEquals, []string{"0"})
	return info
}

func (s *UpgradeSuite) TestJobsToTargets(c *gc.C) {
	check := func(jobs []multiwatcher.MachineJob, isMaster bool, expectedTargets ...upgrades.Target) {
		c.Assert(jobsToTargets(jobs, isMaster), jc.SameContents, expectedTargets)
	}

	check([]multiwatcher.MachineJob{multiwatcher.JobHostUnits}, false, upgrades.HostMachine)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron}, false, upgrades.StateServer)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron}, true,
		upgrades.StateServer, upgrades.DatabaseMaster)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron, multiwatcher.JobHostUnits}, false,
		upgrades.StateServer, upgrades.HostMachine)
	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron, multiwatcher.JobHostUnits}, true,
		upgrades.StateServer, upgrades.DatabaseMaster, upgrades.HostMachine)
}

func (s *UpgradeSuite) TestUpgradeStepsStateServer(c *gc.C) {
	s.setInstantRetryStrategy(c)
	// Upload tools to provider storage, so they can be migrated to environment storage.
	stor, err := environs.LegacyStorage(s.State)
	if !errors.IsNotSupported(err) {
		c.Assert(err, jc.ErrorIsNil)
		envtesting.AssertUploadFakeToolsVersions(
			c, stor, "releases", s.Environ.Config().AgentStream(), s.oldVersion)
	}

	s.assertUpgradeSteps(c, state.JobManageEnviron)
	s.assertStateServerUpgrades(c)
}

func (s *UpgradeSuite) TestUpgradeStepsHostMachine(c *gc.C) {
	s.setInstantRetryStrategy(c)
	// We need to first start up a state server that thinks it has already been upgraded.
	ss, _, _ := s.primeAgent(c, version.Current, state.JobManageEnviron)
	a := s.newAgent(c, ss)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	defer func() { c.Check(a.Stop(), gc.IsNil) }()
	// Now run the test.
	s.assertUpgradeSteps(c, state.JobHostUnits)
	s.assertHostUpgrades(c)
}

func (s *UpgradeSuite) TestLoginsDuringUpgrade(c *gc.C) {
	// Create the machine agent to upgrade.
	machine, machine0Conf, _ := s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	a := s.newAgent(c, machine)

	// Mock out upgrade logic, using a channel so that the test knows
	// when upgrades have started and can control when upgrades
	// should finish.
	upgradeCh := make(chan bool)
	abort := make(chan bool)
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Signal that the upgrade has started.
		select {
		case upgradeCh <- true:
		case <-abort:
			return nil
		}

		// Wait for the signal that upgrades should finish.
		select {
		case <-upgradeCh:
		case <-abort:
			return nil
		}
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)

	// Start the API server and upgrade-steps workers just as the agent would.
	runner := worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant)
	defer func() {
		close(abort)
		runner.Kill()
		runner.Wait()
	}()
	certChangedChan := make(chan params.StateServingInfo)
	runner.StartWorker("apiserver", a.apiserverWorkerStarter(s.State, certChangedChan))
	runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(
		s.APIState,
		[]multiwatcher.MachineJob{multiwatcher.JobManageEnviron},
	))

	// Set up a second machine to log in as. API logins are tested
	// manually so there's no need to actually start this machine.
	_, machine1Conf, _ := s.primeAgent(c, version.Current, state.JobHostUnits)

	c.Assert(waitForUpgradeToStart(upgradeCh), jc.IsTrue)

	// Only user and local logins are allowed during upgrade. Users get a restricted API.
	s.checkLoginToAPIAsUser(c, machine0Conf, RestrictedAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsFalse)

	close(upgradeCh) // Allow the upgrade to complete.

	waitForUpgradeToFinish(c, machine0Conf)

	// All logins are allowed after upgrade.
	s.checkLoginToAPIAsUser(c, machine0Conf, FullAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsTrue)
}

func (s *UpgradeSuite) TestUpgradeSkippedIfNoUpgradeRequired(c *gc.C) {
	attempts := 0
	upgradeCh := make(chan bool)
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Note: this shouldn't run.
		attempts++
		// If execution ends up here, wait so it can be detected (by
		// checking for the restricted API).
		<-upgradeCh
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)

	// Set up a machine agent running the current version.
	//
	// Set the agent's initial upgradedToVersion to be almost the same
	// as version.Current but not quite. We want it to be different to
	// version.Current (so that we can see it change) but not to
	// trigger upgrade steps.
	initialVersion := makeBumpedCurrentVersion()
	machine, agentConf, _ := s.primeAgent(c, initialVersion, state.JobManageEnviron)
	a := s.newAgent(c, machine)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	defer func() {
		close(upgradeCh)
		c.Check(a.Stop(), gc.IsNil)
	}()

	// Test that unrestricted API logins are possible (i.e. no
	// "upgrade mode" in force).
	s.checkLoginToAPIAsUser(c, agentConf, FullAPIExposed)
	c.Assert(attempts, gc.Equals, 0) // There should have been no attempt to upgrade.

	// Even though no upgrade was done, upgradedToVersion should have been updated.
	c.Assert(a.CurrentConfig().UpgradedToVersion(), gc.Equals, version.Current.Number)
}

func (s *UpgradeSuite) TestDowngradeOnMasterWhenOtherStateServerDoesntStartUpgrade(c *gc.C) {
	// This test checks that the master triggers a downgrade if one of
	// the other state servers fails to signal that it is ready for
	// upgrade.
	//
	// This test is functional, ensuring that the upgrader worker
	// terminates the machine agent with the UpgradeReadyError which
	// makes the downgrade happen.

	// Speed up the watcher frequency to make the test much faster.
	s.PatchValue(&watcher.Period, 200*time.Millisecond)

	// Provide (fake) tools so that the upgrader has something to downgrade to.
	envtesting.AssertUploadFakeToolsVersions(
		c, s.DefaultToolsStorage, s.Environ.Config().AgentStream(), s.Environ.Config().AgentStream(), s.oldVersion)

	// Create the master and two other state servers. Only one of the
	// other state servers will be ready for upgrade.
	machineIdA, machineIdB, _ := s.createUpgradingStateServers(c)

	// Machine B signals that it is ready for upgrade; machine C never does.
	info, err := s.State.EnsureUpgradeInfo(machineIdB, s.oldVersion.Number, version.Current.Number)
	c.Assert(err, jc.ErrorIsNil)

	agent := s.newAgentFromMachineId(c, machineIdA)
	defer agent.Stop()

	s.machineIsMaster = true

	var agentErr error
	agentDone := make(chan bool)
	go func() {
		agentErr = agent.Run(nil)
		close(agentDone)
	}()

	select {
	case <-agentDone:
		upgradeReadyErr, ok := agentErr.(*upgrader.UpgradeReadyError)
		if !ok {
			c.Fatalf("didn't see UpgradeReadyError, instead got: %v", agentErr)
		}
		// Confirm that the downgrade is back to the previous version.
		c.Assert(upgradeReadyErr.OldTools, gc.Equals, version.Current)
		c.Assert(upgradeReadyErr.NewTools, gc.Equals, s.oldVersion)

	case <-time.After(coretesting.LongWait):
		c.Fatal("machine agent did not exit as expected")
	}

	// The UpgradeInfo doc should now be archived.
	err = info.Refresh()
	c.Assert(err, gc.ErrorMatches, "current upgrade info not found")
}

// Run just the upgrade-steps worker with a fake machine agent and
// fake agent config.
func (s *UpgradeSuite) runUpgradeWorker(c *gc.C, jobs ...multiwatcher.MachineJob) (
	error, *fakeConfigSetter, *fakeUpgradingMachineAgent, *upgradeWorkerContext,
) {
	config := s.makeFakeConfig()
	agent := NewFakeUpgradingMachineAgent(config)
	err, context := s.runUpgradeWorkerUsingAgent(c, agent, jobs...)
	return err, config, agent, context
}

// Run just the upgrade-steps worker with the fake machine agent
// provided.
func (s *UpgradeSuite) runUpgradeWorkerUsingAgent(
	c *gc.C,
	agent *fakeUpgradingMachineAgent,
	jobs ...multiwatcher.MachineJob,
) (error, *upgradeWorkerContext) {
	s.setInstantRetryStrategy(c)
	context := NewUpgradeWorkerContext()
	worker := context.Worker(agent, nil, jobs)
	return worker.Wait(), context
}

func (s *UpgradeSuite) makeFakeConfig() *fakeConfigSetter {
	return NewFakeConfigSetter(names.NewMachineTag("0"), s.oldVersion.Number)
}

// Create 3 configured state servers that appear to be running tools
// with version s.oldVersion and return their ids.
func (s *UpgradeSuite) createUpgradingStateServers(c *gc.C) (machineIdA, machineIdB, machineIdC string) {
	machine0, _, _ := s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	machineIdA = machine0.Id()

	changes, err := s.State.EnsureAvailability(3, constraints.Value{}, "quantal", nil)
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(len(changes.Added), gc.Equals, 2)
	machineIdB = changes.Added[0]
	s.configureMachine(c, machineIdB, s.oldVersion)
	machineIdC = changes.Added[1]
	s.configureMachine(c, machineIdC, s.oldVersion)

	return
}

func (s *UpgradeSuite) newAgentFromMachineId(c *gc.C, machineId string) *MachineAgent {
	machine, err := s.State.Machine(machineId)
	c.Assert(err, jc.ErrorIsNil)
	return s.newAgent(c, machine)
}

// Return a version the same as the current software version, but with
// the build number bumped.
//
// The version Tag is also cleared so that upgrades.PerformUpgrade
// doesn't think it needs to run upgrade steps unnecessarily.
func makeBumpedCurrentVersion() version.Binary {
	v := version.Current
	v.Build++
	v.Tag = ""
	return v
}
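
// The helper below is an illustrative sketch added for clarity and is not
// exercised by the suite. It spells out the property that the tests rely on:
// the bumped version differs from version.Current (so a version change is
// observable) while matching it on Major/Minor/Patch, so no upgrade steps
// apply. It assumes version.Binary embeds version.Number, as the field
// accesses elsewhere in this file suggest.
func bumpedVersionPropertyHolds() bool {
	bumped := makeBumpedCurrentVersion()
	return bumped.Number != version.Current.Number &&
		bumped.Major == version.Current.Major &&
		bumped.Minor == version.Current.Minor &&
		bumped.Patch == version.Current.Patch
}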

func waitForUpgradeToStart(upgradeCh chan bool) bool {
	select {
	case <-upgradeCh:
		return true
	case <-time.After(coretesting.LongWait):
		return false
	}
}

const maxUpgradeRetries = 3

func (s *UpgradeSuite) setInstantRetryStrategy(c *gc.C) {
	s.PatchValue(&getUpgradeRetryStrategy, func() utils.AttemptStrategy {
		c.Logf("setting instant retry strategy for upgrade: retries=%d", maxUpgradeRetries)
		return utils.AttemptStrategy{
			Delay: 0,
			Min:   maxUpgradeRetries,
		}
	})
}
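
// Illustrative sketch (not called by the tests): how an AttemptStrategy is
// consumed, mirroring the loop in waitForUpgradeToFinish below. With a zero
// Delay and Min set to maxUpgradeRetries, a loop like this runs exactly Min
// times without sleeping, which is what makes the retry tests above instant.
func retryWithStrategy(strategy utils.AttemptStrategy, try func() error) error {
	var err error
	for attempt := strategy.Start(); attempt.Next(); {
		if err = try(); err == nil {
			return nil
		}
	}
	// Give up, returning the error from the last attempt.
	return err
}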

func (s *UpgradeSuite) makeExpectedStatusCalls(retryCount int, expectFail bool, failReason string) []MachineStatusCall {
	calls := []MachineStatusCall{{
		params.StatusStarted,
		fmt.Sprintf("upgrading to %s", version.Current.Number),
	}}
	for i := 0; i < retryCount; i++ {
		calls = append(calls, MachineStatusCall{
			params.StatusError,
			fmt.Sprintf("upgrade to %s failed (will retry): %s", version.Current.Number, failReason),
		})
	}
	if expectFail {
		calls = append(calls, MachineStatusCall{
			params.StatusError,
			fmt.Sprintf("upgrade to %s failed (giving up): %s", version.Current.Number, failReason),
		})
	} else {
		calls = append(calls, MachineStatusCall{params.StatusStarted, ""})
	}
	return calls
}

func (s *UpgradeSuite) makeExpectedUpgradeLogs(
	retryCount int,
	target string,
	expectFail bool,
	failReason string,
) []jc.SimpleMessage {
	outLogs := []jc.SimpleMessage{}

	if target == "databaseMaster" || target == "stateServer" {
		outLogs = append(outLogs, jc.SimpleMessage{
			loggo.INFO, "waiting for other state servers to be ready for upgrade",
		})
		var waitMsg string
		switch target {
		case "databaseMaster":
			waitMsg = "all state servers are ready to run upgrade steps"
		case "stateServer":
			waitMsg = "the master has completed its upgrade steps"
		}
		outLogs = append(outLogs, jc.SimpleMessage{loggo.INFO, "finished waiting - " + waitMsg})
	}

	outLogs = append(outLogs, jc.SimpleMessage{
		loggo.INFO, fmt.Sprintf(
			`starting upgrade from %s to %s for "machine-0"`,
			s.oldVersion.Number, version.Current.Number),
	})

	failMessage := fmt.Sprintf(
		`upgrade from %s to %s for "machine-0" failed \(%%s\): %s`,
		s.oldVersion.Number, version.Current.Number, failReason)

	for i := 0; i < retryCount; i++ {
		outLogs = append(outLogs, jc.SimpleMessage{loggo.ERROR, fmt.Sprintf(failMessage, "will retry")})
	}
	if expectFail {
		outLogs = append(outLogs, jc.SimpleMessage{loggo.ERROR, fmt.Sprintf(failMessage, "giving up")})
	} else {
		outLogs = append(outLogs, jc.SimpleMessage{loggo.INFO,
			fmt.Sprintf(`upgrade to %s completed successfully.`, version.Current.Number)})
	}
	return outLogs
}
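
// A small sketch of the two-stage formatting used above (illustrative only,
// with hypothetical version strings): the first Sprintf builds a regexp
// template, escaping the literal parens and turning %%s into an %s
// placeholder for the second pass; the second Sprintf then fills in the
// retry outcome ("will retry" or "giving up").
func exampleFailurePattern(outcome string) string {
	template := fmt.Sprintf(`upgrade from %s to %s failed \(%%s\)`, "1.16.0", "1.18.0")
	return fmt.Sprintf(template, outcome)
}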

func (s *UpgradeSuite) assertUpgradeSteps(c *gc.C, job state.MachineJob) {
	agent, stopFunc := s.createAgentAndStartUpgrade(c, job)
	defer stopFunc()
	waitForUpgradeToFinish(c, agent.CurrentConfig())
}

func (s *UpgradeSuite) keyFile() string {
	return filepath.Join(s.DataDir(), "system-identity")
}

func (s *UpgradeSuite) assertCommonUpgrades(c *gc.C) {
	// rsyslog-gnutls should have been installed.
	cmds := s.getAptCmds()
	c.Assert(cmds, gc.HasLen, 1)
	args := cmds[0].Args
	c.Assert(len(args), jc.GreaterThan, 1)
	c.Assert(args[0], gc.Equals, "apt-get")
	c.Assert(args[len(args)-1], gc.Equals, "rsyslog-gnutls")
}

func (s *UpgradeSuite) assertStateServerUpgrades(c *gc.C) {
	s.assertCommonUpgrades(c)
	// System SSH key
	c.Assert(s.keyFile(), jc.IsNonEmptyFile)
	// Syslog port should have been updated
	cfg, err := s.State.EnvironConfig()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(cfg.SyslogPort(), gc.Equals, config.DefaultSyslogPort)
	// Deprecated attributes should have been deleted - just test a couple.
	allAttrs := cfg.AllAttrs()
	_, ok := allAttrs["public-bucket"]
	c.Assert(ok, jc.IsFalse)
	_, ok = allAttrs["public-bucket-region"]
	c.Assert(ok, jc.IsFalse)
}

func (s *UpgradeSuite) assertHostUpgrades(c *gc.C) {
	s.assertCommonUpgrades(c)
	// Lock directory
	lockdir := filepath.Join(s.DataDir(), "locks")
	c.Assert(lockdir, jc.IsDirectory)
	// SSH key file should not be generated for hosts.
	_, err := os.Stat(s.keyFile())
	c.Assert(err, jc.Satisfies, os.IsNotExist)
	// Syslog port should not have been updated
	cfg, err := s.State.EnvironConfig()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(cfg.SyslogPort(), gc.Not(gc.Equals), config.DefaultSyslogPort)
	// Add other checks as needed...
}

func (s *UpgradeSuite) createAgentAndStartUpgrade(c *gc.C, job state.MachineJob) (*MachineAgent, func()) {
	machine, _, _ := s.primeAgent(c, s.oldVersion, job)
	a := s.newAgent(c, machine)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	return a, func() { c.Check(a.Stop(), gc.IsNil) }
}

func (s *UpgradeSuite) assertEnvironAgentVersion(c *gc.C, expected version.Number) {
	envConfig, err := s.State.EnvironConfig()
	c.Assert(err, jc.ErrorIsNil)
	agentVersion, ok := envConfig.AgentVersion()
	c.Assert(ok, jc.IsTrue)
	c.Assert(agentVersion, gc.Equals, expected)
}

func waitForUpgradeToFinish(c *gc.C, conf agent.Config) {
	success := false
	for attempt := coretesting.LongAttempt.Start(); attempt.Next(); {
		diskConf := readConfigFromDisk(c, conf.DataDir(), conf.Tag())
		success = diskConf.UpgradedToVersion() == version.Current.Number
		if success {
			break
		}
	}
	c.Assert(success, jc.IsTrue)
}

func readConfigFromDisk(c *gc.C, dir string, tag names.Tag) agent.Config {
	conf, err := agent.ReadConfig(agent.ConfigPath(dir, tag))
	c.Assert(err, jc.ErrorIsNil)
	return conf
}

func (s *UpgradeSuite) checkLoginToAPIAsUser(c *gc.C, conf agent.Config, expectFullApi exposedAPI) {
	info := conf.APIInfo()
	info.Tag = s.AdminUserTag(c)
	info.Password = "dummy-secret"
	info.Nonce = ""

	apiState, err := api.Open(info, upgradeTestDialOpts)
	c.Assert(err, jc.ErrorIsNil)
	defer apiState.Close()

	// This call should always work.
	var result api.Status
	err = apiState.APICall("Client", 0, "", "FullStatus", nil, &result)
	c.Assert(err, jc.ErrorIsNil)

	// This call should only work if the API is not restricted.
	err = apiState.APICall("Client", 0, "", "DestroyEnvironment", nil, nil)
	if expectFullApi {
		c.Assert(err, jc.ErrorIsNil)
	} else {
		c.Assert(err, gc.ErrorMatches, "upgrade in progress .+")
	}
}

func canLoginToAPIAsMachine(c *gc.C, fromConf, toConf agent.Config) bool {
	info := fromConf.APIInfo()
	info.Addrs = toConf.APIInfo().Addrs
	apiState, err := api.Open(info, upgradeTestDialOpts)
	if apiState != nil {
		apiState.Close()
	}
	return apiState != nil && err == nil
}

var upgradeTestDialOpts = api.DialOpts{
	Timeout:             2 * time.Minute,
	RetryDelay:          250 * time.Millisecond,
	DialAddressInterval: 50 * time.Millisecond,
}

func assertUpgradeComplete(c *gc.C, context *upgradeWorkerContext) {
	select {
	case <-context.UpgradeComplete:
	default:
		c.Error("UpgradeComplete channel is open but shouldn't be")
	}
}

func assertUpgradeNotComplete(c *gc.C, context *upgradeWorkerContext) {
	select {
	case <-context.UpgradeComplete:
		c.Error("UpgradeComplete channel is closed but shouldn't be")
	default:
	}
}

// NewFakeConfigSetter returns a fakeConfigSetter which implements
// just enough of the agent.ConfigSetter interface to keep the upgrade
// steps worker happy.
func NewFakeConfigSetter(agentTag names.Tag, initialVersion version.Number) *fakeConfigSetter {
	return &fakeConfigSetter{
		AgentTag: agentTag,
		Version:  initialVersion,
	}
}

type fakeConfigSetter struct {
	agent.ConfigSetter
	AgentTag names.Tag
	Version  version.Number
}

func (s *fakeConfigSetter) Tag() names.Tag {
	return s.AgentTag
}

func (s *fakeConfigSetter) UpgradedToVersion() version.Number {
	return s.Version
}

func (s *fakeConfigSetter) SetUpgradedToVersion(newVersion version.Number) {
	s.Version = newVersion
}

// NewFakeUpgradingMachineAgent returns a fakeUpgradingMachineAgent which
// implements the upgradingMachineAgent interface. This provides enough
// MachineAgent functionality to support upgrades.
func NewFakeUpgradingMachineAgent(confSetter agent.ConfigSetter) *fakeUpgradingMachineAgent {
	return &fakeUpgradingMachineAgent{
		config:  confSetter,
		DyingCh: make(chan struct{}),
	}
}

type fakeUpgradingMachineAgent struct {
	config             agent.ConfigSetter
	DyingCh            chan struct{}
	MachineStatusCalls []MachineStatusCall
}

type MachineStatusCall struct {
	Status params.Status
	Info   string
}

func (a *fakeUpgradingMachineAgent) setMachineStatus(_ *api.State, status params.Status, info string) error {
	// Record setMachineStatus calls for later inspection.
	a.MachineStatusCalls = append(a.MachineStatusCalls, MachineStatusCall{status, info})
	return nil
}

func (a *fakeUpgradingMachineAgent) ensureMongoServer(agent.Config) error {
	return nil
}

func (a *fakeUpgradingMachineAgent) CurrentConfig() agent.Config {
	return a.config
}

func (a *fakeUpgradingMachineAgent) ChangeConfig(mutate AgentConfigMutator) error {
	return mutate(a.config)
}

func (a *fakeUpgradingMachineAgent) Dying() <-chan struct{} {
	return a.DyingCh
}
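
// A final illustrative sketch (hypothetical helper, not part of the suite)
// showing how the two fakes cooperate. ChangeConfig simply applies the
// mutator to the fake config, which is how the upgrade-steps worker records
// the new upgradedToVersion that the tests assert on via config.Version.
// This assumes AgentConfigMutator is func(agent.ConfigSetter) error, as the
// ChangeConfig implementation above suggests.
func exampleFakeUpgradeRecording() version.Number {
	conf := NewFakeConfigSetter(names.NewMachineTag("0"), version.MustParse("1.16.0"))
	fakeAgent := NewFakeUpgradingMachineAgent(conf)
	_ = fakeAgent.ChangeConfig(func(setter agent.ConfigSetter) error {
		setter.SetUpgradedToVersion(version.Current.Number)
		return nil
	})
	return conf.Version // now version.Current.Number
}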