github.com/Pankov404/juju@v0.0.0-20150703034450-be266991dceb/cmd/jujud/agent/upgrade_test.go (about)

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package agent
     5  
     6  import (
     7  	"fmt"
     8  	"os"
     9  	"os/exec"
    10  	"path/filepath"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/juju/errors"
    16  	"github.com/juju/loggo"
    17  	"github.com/juju/names"
    18  	jc "github.com/juju/testing/checkers"
    19  	"github.com/juju/utils"
    20  	pacman "github.com/juju/utils/packaging/manager"
    21  	gc "gopkg.in/check.v1"
    22  
    23  	"github.com/juju/juju/agent"
    24  	"github.com/juju/juju/api"
    25  	"github.com/juju/juju/apiserver/params"
    26  	cmdutil "github.com/juju/juju/cmd/jujud/util"
    27  	"github.com/juju/juju/constraints"
    28  	"github.com/juju/juju/environs"
    29  	"github.com/juju/juju/environs/config"
    30  	envtesting "github.com/juju/juju/environs/testing"
    31  	"github.com/juju/juju/mongo"
    32  	"github.com/juju/juju/state"
    33  	"github.com/juju/juju/state/multiwatcher"
    34  	"github.com/juju/juju/state/watcher"
    35  	coretesting "github.com/juju/juju/testing"
    36  	"github.com/juju/juju/upgrades"
    37  	"github.com/juju/juju/version"
    38  	"github.com/juju/juju/worker"
    39  	"github.com/juju/juju/worker/upgrader"
    40  )
    41  
// UpgradeSuite exercises the upgrade-steps worker and the machine
// agent's behaviour around running upgrade steps.
type UpgradeSuite struct {
	commonMachineSuite

	// aptCmds records the apt commands captured during a test.
	// Guarded by aptMutex; use setAptCmds/getAptCmds.
	aptCmds         []*exec.Cmd
	// oldVersion is the version agents appear to be upgrading from.
	oldVersion      version.Binary
	logWriter       loggo.TestWriter
	// connectionDead makes the patched ConnectionIsDead report true.
	connectionDead  bool
	// machineIsMaster is what the patched isMachineMaster reports.
	machineIsMaster bool
	aptMutex        sync.Mutex
}
    52  
var _ = gc.Suite(&UpgradeSuite{})

// exposedAPI indicates whether a login is expected to see the full or
// the restricted (upgrade-mode) API.
type exposedAPI bool

var (
	FullAPIExposed       exposedAPI = true
	RestrictedAPIExposed exposedAPI = false
)

// Readability aliases for the expectFail arguments used below.
const fails = true
const succeeds = false
    64  
    65  func (s *UpgradeSuite) setAptCmds(cmd *exec.Cmd) {
    66  	s.aptMutex.Lock()
    67  	defer s.aptMutex.Unlock()
    68  	if cmd == nil {
    69  		s.aptCmds = nil
    70  	} else {
    71  		s.aptCmds = append(s.aptCmds, cmd)
    72  	}
    73  }
    74  
    75  func (s *UpgradeSuite) getAptCmds() []*exec.Cmd {
    76  	s.aptMutex.Lock()
    77  	defer s.aptMutex.Unlock()
    78  	return s.aptCmds
    79  }
    80  
// SetUpTest patches timeouts, state access, connection liveness and
// master detection so that upgrade worker behaviour can be driven
// deterministically by the individual tests.
func (s *UpgradeSuite) SetUpTest(c *gc.C) {
	s.commonMachineSuite.SetUpTest(c)

	// clear s.aptCmds
	s.setAptCmds(nil)

	// Capture all apt commands.
	aptCmds := s.AgentSuite.HookCommandOutput(&pacman.CommandOutput, nil, nil)
	go func() {
		for cmd := range aptCmds {
			s.setAptCmds(cmd)
		}
	}()

	// Agents in these tests appear to be upgrading from a 1.16 binary.
	s.oldVersion = version.Current
	s.oldVersion.Major = 1
	s.oldVersion.Minor = 16

	// Don't wait so long in tests.
	s.PatchValue(&upgradeStartTimeoutMaster, time.Duration(time.Millisecond*50))
	s.PatchValue(&upgradeStartTimeoutSecondary, time.Duration(time.Millisecond*60))

	// Allow tests to make the API connection appear to be dead.
	s.connectionDead = false
	s.PatchValue(&cmdutil.ConnectionIsDead, func(loggo.Logger, cmdutil.Pinger) bool {
		return s.connectionDead
	})

	// The upgrade worker opens State directly; route that through the
	// suite's MongoDB instance.
	var fakeOpenStateForUpgrade = func(upgradingMachineAgent, agent.Config) (*state.State, error) {
		mongoInfo := s.State.MongoConnectionInfo()
		st, err := state.Open(mongoInfo, mongo.DefaultDialOpts(), environs.NewStatePolicy())
		c.Assert(err, jc.ErrorIsNil)
		return st, nil
	}
	s.PatchValue(&openStateForUpgrade, fakeOpenStateForUpgrade)

	// Tests flip machineIsMaster to simulate master/secondary roles.
	s.machineIsMaster = true
	fakeIsMachineMaster := func(*state.State, string) (bool, error) {
		return s.machineIsMaster, nil
	}
	s.PatchValue(&isMachineMaster, fakeIsMachineMaster)
	// Most of these tests normally finish sub-second on a fast machine.
	// If any given test hits a minute, we have almost certainly become
	// wedged, so dump the logs.
	coretesting.DumpTestLogsAfter(time.Minute, c, s)
}
   127  
   128  func (s *UpgradeSuite) captureLogs(c *gc.C) {
   129  	c.Assert(loggo.RegisterWriter("upgrade-tests", &s.logWriter, loggo.INFO), gc.IsNil)
   130  	s.AddCleanup(func(*gc.C) {
   131  		loggo.RemoveWriter("upgrade-tests")
   132  		s.logWriter.Clear()
   133  	})
   134  }
   135  
   136  func (s *UpgradeSuite) countUpgradeAttempts(upgradeErr error) *int {
   137  	count := 0
   138  	s.PatchValue(&upgradesPerformUpgrade, func(version.Number, []upgrades.Target, upgrades.Context) error {
   139  		count++
   140  		return upgradeErr
   141  	})
   142  	return &count
   143  }
   144  
   145  func (s *UpgradeSuite) TestContextInitializeWhenNoUpgradeRequired(c *gc.C) {
   146  	// Set the agent's initial upgradedToVersion to almost the same as
   147  	// the current version. We want it to be different to
   148  	// version.Current (so that we can see it change) but not to
   149  	// trigger upgrade steps.
   150  	config := NewFakeConfigSetter(names.NewMachineTag("0"), makeBumpedCurrentVersion().Number)
   151  	agent := NewFakeUpgradingMachineAgent(config)
   152  
   153  	context := NewUpgradeWorkerContext()
   154  	context.InitializeUsingAgent(agent)
   155  
   156  	select {
   157  	case <-context.UpgradeComplete:
   158  		// Success
   159  	default:
   160  		c.Fatal("UpgradeComplete channel should be closed because no upgrade is required")
   161  	}
   162  	// The agent's version should have been updated.
   163  	c.Assert(config.Version, gc.Equals, version.Current.Number)
   164  
   165  }
   166  
   167  func (s *UpgradeSuite) TestContextInitializeWhenUpgradeRequired(c *gc.C) {
   168  	// Set the agent's upgradedToVersion so that upgrade steps are required.
   169  	initialVersion := version.MustParse("1.16.0")
   170  	config := NewFakeConfigSetter(names.NewMachineTag("0"), initialVersion)
   171  	agent := NewFakeUpgradingMachineAgent(config)
   172  
   173  	context := NewUpgradeWorkerContext()
   174  	context.InitializeUsingAgent(agent)
   175  
   176  	select {
   177  	case <-context.UpgradeComplete:
   178  		c.Fatal("UpgradeComplete channel shouldn't be closed because upgrade is required")
   179  	default:
   180  		// Success
   181  	}
   182  	// The agent's version should NOT have been updated.
   183  	c.Assert(config.Version, gc.Equals, initialVersion)
   184  }
   185  
   186  func (s *UpgradeSuite) TestRetryStrategy(c *gc.C) {
   187  	retries := getUpgradeRetryStrategy()
   188  	c.Assert(retries.Delay, gc.Equals, 2*time.Minute)
   189  	c.Assert(retries.Min, gc.Equals, 5)
   190  }
   191  
   192  func (s *UpgradeSuite) TestIsUpgradeRunning(c *gc.C) {
   193  	context := NewUpgradeWorkerContext()
   194  	c.Assert(context.IsUpgradeRunning(), jc.IsTrue)
   195  
   196  	close(context.UpgradeComplete)
   197  	c.Assert(context.IsUpgradeRunning(), jc.IsFalse)
   198  }
   199  
   200  func (s *UpgradeSuite) TestNoUpgradeNecessary(c *gc.C) {
   201  	attemptsP := s.countUpgradeAttempts(nil)
   202  	s.captureLogs(c)
   203  	s.oldVersion = version.Current // nothing to do
   204  
   205  	workerErr, config, _, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)
   206  
   207  	c.Check(workerErr, gc.IsNil)
   208  	c.Check(*attemptsP, gc.Equals, 0)
   209  	c.Check(config.Version, gc.Equals, version.Current.Number)
   210  	assertUpgradeComplete(c, context)
   211  }
   212  
func (s *UpgradeSuite) TestUpgradeStepsFailure(c *gc.C) {
	// This test checks what happens when every upgrade attempt fails.
	// A number of retries should be observed and the agent should end
	// up in a state where it is still running but is reporting an
	// error and the upgrade is not flagged as having completed (which
	// prevents most of the agent's workers from running and keeps the
	// API in restricted mode).

	attemptsP := s.countUpgradeAttempts(errors.New("boom"))
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)

	// The worker shouldn't return an error so that the worker and
	// agent keep running.
	c.Check(workerErr, gc.IsNil)

	// Every attempt failed, so the full retry budget was consumed.
	c.Check(*attemptsP, gc.Equals, maxUpgradeRetries)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't finish
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals,
		s.makeExpectedStatusCalls(maxUpgradeRetries-1, fails, "boom"))
	c.Assert(s.logWriter.Log(), jc.LogMatches,
		s.makeExpectedUpgradeLogs(maxUpgradeRetries-1, "hostMachine", fails, "boom"))
	assertUpgradeNotComplete(c, context)
}
   238  
   239  func (s *UpgradeSuite) TestUpgradeStepsRetries(c *gc.C) {
   240  	// This test checks what happens when the first upgrade attempt
   241  	// fails but the following on succeeds. The final state should be
   242  	// the same as a successful upgrade which worked first go.
   243  	attempts := 0
   244  	fail := true
   245  	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
   246  		attempts++
   247  		if fail {
   248  			fail = false
   249  			return errors.New("boom")
   250  		} else {
   251  			return nil
   252  		}
   253  	}
   254  	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)
   255  	s.captureLogs(c)
   256  
   257  	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)
   258  
   259  	c.Check(workerErr, gc.IsNil)
   260  	c.Check(attempts, gc.Equals, 2)
   261  	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade finished
   262  	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, s.makeExpectedStatusCalls(1, succeeds, "boom"))
   263  	c.Assert(s.logWriter.Log(), jc.LogMatches, s.makeExpectedUpgradeLogs(1, "hostMachine", succeeds, "boom"))
   264  	assertUpgradeComplete(c, context)
   265  }
   266  
func (s *UpgradeSuite) TestOtherUpgradeRunFailure(c *gc.C) {
	// This test checks what happens when something other than the
	// upgrade steps themselves fails, ensuring the failure is logged
	// and the agent status is updated.

	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Delete UpgradeInfo for the upgrade so that finaliseUpgrade() will fail
		s.State.ClearUpgradeInfo()
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)
	s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade almost finished
	failReason := `upgrade done but: cannot set upgrade status to "finishing": ` +
		`Another status change may have occurred concurrently`
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals,
		s.makeExpectedStatusCalls(0, fails, failReason))
	c.Assert(s.logWriter.Log(), jc.LogMatches,
		s.makeExpectedUpgradeLogs(0, "databaseMaster", fails, failReason))
	assertUpgradeNotComplete(c, context)
}
   293  
   294  func (s *UpgradeSuite) TestApiConnectionFailure(c *gc.C) {
   295  	// This test checks what happens when an upgrade fails because the
   296  	// connection to mongo has gone away. This will happen when the
   297  	// mongo master changes. In this case we want the upgrade worker
   298  	// to return immediately without further retries. The error should
   299  	// be returned by the worker so that the agent will restart.
   300  
   301  	attemptsP := s.countUpgradeAttempts(errors.New("boom"))
   302  	s.connectionDead = true // Make the connection to state appear to be dead
   303  	s.captureLogs(c)
   304  
   305  	workerErr, config, _, context := s.runUpgradeWorker(c, multiwatcher.JobHostUnits)
   306  
   307  	c.Check(workerErr, gc.ErrorMatches, "API connection lost during upgrade: boom")
   308  	c.Check(*attemptsP, gc.Equals, 1)
   309  	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't finish
   310  	assertUpgradeNotComplete(c, context)
   311  }
   312  
func (s *UpgradeSuite) TestAbortWhenOtherStateServerDoesntStartUpgrade(c *gc.C) {
	// This test checks what happens when a state server is upgrading
	// and one of the other state servers doesn't signal it is ready
	// in time.

	// The master state server in this scenario is functionally tested
	// elsewhere in this suite.
	s.machineIsMaster = false

	s.createUpgradingStateServers(c)
	s.captureLogs(c)
	attemptsP := s.countUpgradeAttempts(nil)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't happen
	assertUpgradeNotComplete(c, context)

	// The environment agent-version should still be the new version.
	// It's up to the master to trigger the rollback.
	s.assertEnvironAgentVersion(c, version.Current.Number)

	// 60ms is the patched upgradeStartTimeoutSecondary from SetUpTest.
	causeMsg := " timed out after 60ms"
	c.Assert(s.logWriter.Log(), jc.LogMatches, []jc.SimpleMessage{
		{loggo.INFO, "waiting for other state servers to be ready for upgrade"},
		{loggo.ERROR, "aborted wait for other state servers: timed out after 60ms"},
		{loggo.ERROR, `upgrade from .+ to .+ for "machine-0" failed \(giving up\): ` +
			"aborted wait for other state servers:" + causeMsg},
	})
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, []MachineStatusCall{{
		params.StatusError,
		fmt.Sprintf(
			"upgrade to %s failed (giving up): aborted wait for other state servers:"+causeMsg,
			version.Current.Number),
	}})
}
   350  
// TestWorkerAbortsIfAgentDies checks that a secondary state server
// waiting for the upgrade to start gives up cleanly (no upgrade
// attempts, version unchanged) when the machine agent is dying.
func (s *UpgradeSuite) TestWorkerAbortsIfAgentDies(c *gc.C) {
	s.machineIsMaster = false
	s.captureLogs(c)
	attemptsP := s.countUpgradeAttempts(nil)

	s.primeAgent(c, s.oldVersion, state.JobManageEnviron)

	config := s.makeFakeConfig()
	agent := NewFakeUpgradingMachineAgent(config)
	// Mark the fake agent as already terminating before the worker runs.
	close(agent.DyingCh)
	workerErr, context := s.runUpgradeWorkerUsingAgent(c, agent, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 0)
	c.Check(config.Version, gc.Equals, s.oldVersion.Number) // Upgrade didn't happen
	assertUpgradeNotComplete(c, context)
	c.Assert(s.logWriter.Log(), jc.LogMatches, []jc.SimpleMessage{
		{loggo.WARNING, "stopped waiting for other state servers: machine agent is terminating"},
	})
}
   371  
   372  func (s *UpgradeSuite) TestSuccessMaster(c *gc.C) {
   373  	// This test checks what happens when an upgrade works on the
   374  	// first attempt on a master state server.
   375  	s.machineIsMaster = true
   376  	info := s.checkSuccess(c, "databaseMaster", func(*state.UpgradeInfo) {})
   377  	c.Assert(info.Status(), gc.Equals, state.UpgradeFinishing)
   378  }
   379  
   380  func (s *UpgradeSuite) TestSuccessSecondary(c *gc.C) {
   381  	// This test checks what happens when an upgrade works on the
   382  	// first attempt on a secondary state server.
   383  	s.machineIsMaster = false
   384  	mungeInfo := func(info *state.UpgradeInfo) {
   385  		// Indicate that the master is done
   386  		err := info.SetStatus(state.UpgradeRunning)
   387  		c.Assert(err, jc.ErrorIsNil)
   388  		err = info.SetStatus(state.UpgradeFinishing)
   389  		c.Assert(err, jc.ErrorIsNil)
   390  	}
   391  	s.checkSuccess(c, "stateServer", mungeInfo)
   392  }
   393  
// checkSuccess runs the upgrade worker against an environment of
// three state servers where the other two have signalled readiness,
// asserts a single successful upgrade attempt for the given target,
// and returns the refreshed UpgradeInfo document. mungeInfo lets the
// caller adjust the upgrade info first (e.g. mark the master done).
func (s *UpgradeSuite) checkSuccess(c *gc.C, target string, mungeInfo func(*state.UpgradeInfo)) *state.UpgradeInfo {
	_, machineIdB, machineIdC := s.createUpgradingStateServers(c)

	// Indicate that machine B and C are ready to upgrade
	vPrevious := s.oldVersion.Number
	vNext := version.Current.Number
	info, err := s.State.EnsureUpgradeInfo(machineIdB, vPrevious, vNext)
	c.Assert(err, jc.ErrorIsNil)
	_, err = s.State.EnsureUpgradeInfo(machineIdC, vPrevious, vNext)
	c.Assert(err, jc.ErrorIsNil)

	mungeInfo(info)

	attemptsP := s.countUpgradeAttempts(nil)
	s.captureLogs(c)

	workerErr, config, agent, context := s.runUpgradeWorker(c, multiwatcher.JobManageEnviron)

	c.Check(workerErr, gc.IsNil)
	c.Check(*attemptsP, gc.Equals, 1)
	c.Check(config.Version, gc.Equals, version.Current.Number) // Upgrade finished
	c.Assert(agent.MachineStatusCalls, jc.DeepEquals, s.makeExpectedStatusCalls(0, succeeds, ""))
	c.Assert(s.logWriter.Log(), jc.LogMatches, s.makeExpectedUpgradeLogs(0, target, succeeds, ""))
	assertUpgradeComplete(c, context)

	// Machine 0 should now be recorded as done in the upgrade doc.
	err = info.Refresh()
	c.Assert(err, jc.ErrorIsNil)
	c.Assert(info.StateServersDone(), jc.DeepEquals, []string{"0"})
	return info
}
   424  
   425  func (s *UpgradeSuite) TestJobsToTargets(c *gc.C) {
   426  	check := func(jobs []multiwatcher.MachineJob, isMaster bool, expectedTargets ...upgrades.Target) {
   427  		c.Assert(jobsToTargets(jobs, isMaster), jc.SameContents, expectedTargets)
   428  	}
   429  
   430  	check([]multiwatcher.MachineJob{multiwatcher.JobHostUnits}, false, upgrades.HostMachine)
   431  	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron}, false, upgrades.StateServer)
   432  	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron}, true,
   433  		upgrades.StateServer, upgrades.DatabaseMaster)
   434  	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron, multiwatcher.JobHostUnits}, false,
   435  		upgrades.StateServer, upgrades.HostMachine)
   436  	check([]multiwatcher.MachineJob{multiwatcher.JobManageEnviron, multiwatcher.JobHostUnits}, true,
   437  		upgrades.StateServer, upgrades.DatabaseMaster, upgrades.HostMachine)
   438  }
   439  
// TestUpgradeStepsStateServer runs the full upgrade-steps machinery
// for a state server machine and asserts the state-server-specific
// upgrade effects.
func (s *UpgradeSuite) TestUpgradeStepsStateServer(c *gc.C) {
	coretesting.SkipIfI386(c, "lp:1444576")
	coretesting.SkipIfPPC64EL(c, "lp:1444576")
	coretesting.SkipIfWindowsBug(c, "lp:1446885")
	s.setInstantRetryStrategy(c)
	// Upload tools to provider storage, so they can be migrated to environment storage.
	stor, err := environs.LegacyStorage(s.State)
	if !errors.IsNotSupported(err) {
		c.Assert(err, jc.ErrorIsNil)
		envtesting.AssertUploadFakeToolsVersions(
			c, stor, "releases", s.Environ.Config().AgentStream(), s.oldVersion)
	}

	s.assertUpgradeSteps(c, state.JobManageEnviron)
	s.assertStateServerUpgrades(c)
}
   456  
// TestUpgradeStepsHostMachine runs the full upgrade-steps machinery
// for a plain host machine, alongside an already-upgraded state server.
func (s *UpgradeSuite) TestUpgradeStepsHostMachine(c *gc.C) {
	coretesting.SkipIfPPC64EL(c, "lp:1444576")
	coretesting.SkipIfWindowsBug(c, "lp:1446885")
	s.setInstantRetryStrategy(c)
	// We need to first start up a state server that thinks it has already been upgraded.
	ss, _, _ := s.primeAgent(c, version.Current, state.JobManageEnviron)
	a := s.newAgent(c, ss)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	defer func() { c.Check(a.Stop(), gc.IsNil) }()
	// Now run the test.
	s.assertUpgradeSteps(c, state.JobHostUnits)
	s.assertHostUpgrades(c)
}
   470  
// TestLoginsDuringUpgrade checks which API logins are permitted while
// upgrade steps are running and afterwards: during upgrade only users
// (restricted API) and the local machine may log in; full access is
// restored only once the agent upgrade check has also completed.
func (s *UpgradeSuite) TestLoginsDuringUpgrade(c *gc.C) {
	// Create machine agent to upgrade
	machine, machine0Conf, _ := s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
	a := s.newAgent(c, machine)

	// Mock out upgrade logic, using a channel so that the test knows
	// when upgrades have started and can control when upgrades
	// should finish.
	upgradeCh := make(chan bool)
	abort := make(chan bool)
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Signal that upgrade has started.
		select {
		case upgradeCh <- true:
		case <-abort:
			return nil
		}

		// Wait for signal that upgrades should finish.
		select {
		case <-upgradeCh:
		case <-abort:
			return nil
		}
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)

	// Start the API server and upgrade-steps workers just as the agent would.
	runner := worker.NewRunner(cmdutil.IsFatal, cmdutil.MoreImportant)
	defer func() {
		close(abort)
		runner.Kill()
		runner.Wait()
	}()
	certChangedChan := make(chan params.StateServingInfo)
	runner.StartWorker("apiserver", a.apiserverWorkerStarter(s.State, certChangedChan))
	runner.StartWorker("upgrade-steps", a.upgradeStepsWorkerStarter(
		s.APIState,
		[]multiwatcher.MachineJob{multiwatcher.JobManageEnviron},
	))

	// Set up a second machine to log in as.
	// API logins are tested manually so there's no need to actually
	// start this machine.
	var machine1Conf agent.Config
	_, machine1Conf, _ = s.primeAgent(c, version.Current, state.JobHostUnits)

	c.Assert(waitForUpgradeToStart(upgradeCh), jc.IsTrue)

	// Only user and local logins are allowed during upgrade. Users get a restricted API.
	s.checkLoginToAPIAsUser(c, machine0Conf, RestrictedAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsFalse)

	close(upgradeCh) // Allow upgrade to complete

	waitForUpgradeToFinish(c, machine0Conf)

	// Only user and local logins are allowed even after upgrade steps because
	// agent upgrade not finished yet.
	s.checkLoginToAPIAsUser(c, machine0Conf, RestrictedAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsFalse)

	machineAPI := s.OpenAPIAsMachine(c, machine.Tag(), initialMachinePassword, agent.BootstrapNonce)
	runner.StartWorker("upgrader", a.agentUpgraderWorkerStarter(machineAPI.Upgrader(), machine0Conf))
	// Wait for agent upgrade worker to determine that no
	// agent upgrades are required.
	select {
	case <-a.initialAgentUpgradeCheckComplete:
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timeout waiting for upgrade check")
	}

	// All logins are allowed after upgrade
	s.checkLoginToAPIAsUser(c, machine0Conf, FullAPIExposed)
	c.Assert(canLoginToAPIAsMachine(c, machine0Conf, machine0Conf), jc.IsTrue)
	c.Assert(canLoginToAPIAsMachine(c, machine1Conf, machine0Conf), jc.IsTrue)
}
   551  
// TestUpgradeSkippedIfNoUpgradeRequired checks that an agent whose
// upgradedToVersion differs from version.Current only by build number
// skips the upgrade steps entirely but still records the new version.
func (s *UpgradeSuite) TestUpgradeSkippedIfNoUpgradeRequired(c *gc.C) {
	attempts := 0
	upgradeCh := make(chan bool)
	fakePerformUpgrade := func(version.Number, []upgrades.Target, upgrades.Context) error {
		// Note: this shouldn't run.
		attempts++
		// If execution ends up here, wait so it can be detected (by
		// checking for restricted API).
		<-upgradeCh
		return nil
	}
	s.PatchValue(&upgradesPerformUpgrade, fakePerformUpgrade)

	// Set up machine agent running the current version.
	//
	// Set the agent's initial upgradedToVersion to be almost the same
	// as version.Current but not quite. We want it to be different to
	// version.Current (so that we can see it change) but not to
	// trigger upgrade steps.
	initialVersion := makeBumpedCurrentVersion()
	machine, agentConf, _ := s.primeAgent(c, initialVersion, state.JobManageEnviron)
	a := s.newAgent(c, machine)
	go func() { c.Check(a.Run(nil), gc.IsNil) }()
	defer func() {
		close(upgradeCh)
		c.Check(a.Stop(), gc.IsNil)
	}()

	// Test that unrestricted API logins are possible (i.e. no
	// "upgrade mode" in force)
	s.checkLoginToAPIAsUser(c, agentConf, FullAPIExposed)
	c.Assert(attempts, gc.Equals, 0) // There should have been no attempt to upgrade.

	// Even though no upgrade was done upgradedToVersion should have been updated.
	c.Assert(a.CurrentConfig().UpgradedToVersion(), gc.Equals, version.Current.Number)
}
   588  
func (s *UpgradeSuite) TestDowngradeOnMasterWhenOtherStateServerDoesntStartUpgrade(c *gc.C) {
	coretesting.SkipIfWindowsBug(c, "lp:1446885")
	// This test checks that the master triggers a downgrade if one of
	// the other state servers fails to signal it is ready for upgrade.
	//
	// This test is functional, ensuring that the upgrader worker
	// terminates the machine agent with the UpgradeReadyError which
	// makes the downgrade happen.

	// Speed up the watcher frequency to make the test much faster.
	s.PatchValue(&watcher.Period, 200*time.Millisecond)

	// Provide (fake) tools so that the upgrader has something to downgrade to.
	envtesting.AssertUploadFakeToolsVersions(
		c, s.DefaultToolsStorage, s.Environ.Config().AgentStream(), s.Environ.Config().AgentStream(), s.oldVersion)

	// Only the first machine is going to be ready for upgrade.
	machineIdA, machineIdB, _ := s.createUpgradingStateServers(c)

	// One of the other state servers is ready for upgrade (but machine C doesn't).
	info, err := s.State.EnsureUpgradeInfo(machineIdB, s.oldVersion.Number, version.Current.Number)
	c.Assert(err, jc.ErrorIsNil)

	agent := s.newAgentFromMachineId(c, machineIdA)
	defer agent.Stop()

	s.machineIsMaster = true

	// Run the agent and wait for it to exit.
	var agentErr error
	agentDone := make(chan bool)
	go func() {
		agentErr = agent.Run(nil)
		close(agentDone)
	}()

	select {
	case <-agentDone:
		upgradeReadyErr, ok := agentErr.(*upgrader.UpgradeReadyError)
		if !ok {
			c.Fatalf("didn't see UpgradeReadyError, instead got: %v", agentErr)
		}
		// Confirm that the downgrade is back to the previous version.
		c.Assert(upgradeReadyErr.OldTools, gc.Equals, version.Current)
		c.Assert(upgradeReadyErr.NewTools, gc.Equals, s.oldVersion)

	case <-time.After(coretesting.LongWait):
		c.Fatal("machine agent did not exit as expected")
	}

	// UpgradeInfo doc should now be archived.
	err = info.Refresh()
	c.Assert(err, gc.ErrorMatches, "current upgrade info not found")
}
   642  
   643  // Run just the upgrade-steps worker with a fake machine agent and
   644  // fake agent config.
   645  func (s *UpgradeSuite) runUpgradeWorker(c *gc.C, jobs ...multiwatcher.MachineJob) (
   646  	error, *fakeConfigSetter, *fakeUpgradingMachineAgent, *upgradeWorkerContext,
   647  ) {
   648  	config := s.makeFakeConfig()
   649  	agent := NewFakeUpgradingMachineAgent(config)
   650  	err, context := s.runUpgradeWorkerUsingAgent(c, agent, jobs...)
   651  	return err, config, agent, context
   652  }
   653  
   654  // Run just the upgrade-steps worker with the fake machine agent
   655  // provided.
   656  func (s *UpgradeSuite) runUpgradeWorkerUsingAgent(
   657  	c *gc.C,
   658  	agent *fakeUpgradingMachineAgent,
   659  	jobs ...multiwatcher.MachineJob,
   660  ) (error, *upgradeWorkerContext) {
   661  	s.setInstantRetryStrategy(c)
   662  	context := NewUpgradeWorkerContext()
   663  	worker := context.Worker(agent, nil, jobs)
   664  	return worker.Wait(), context
   665  }
   666  
   667  func (s *UpgradeSuite) makeFakeConfig() *fakeConfigSetter {
   668  	return NewFakeConfigSetter(names.NewMachineTag("0"), s.oldVersion.Number)
   669  }
   670  
   671  // Create 3 configured state servers that appear to be running tools
   672  // with version s.oldVersion and return their ids.
   673  func (s *UpgradeSuite) createUpgradingStateServers(c *gc.C) (machineIdA, machineIdB, machineIdC string) {
   674  	machine0, _, _ := s.primeAgent(c, s.oldVersion, state.JobManageEnviron)
   675  	machineIdA = machine0.Id()
   676  
   677  	changes, err := s.State.EnsureAvailability(3, constraints.Value{}, "quantal", nil)
   678  	c.Assert(err, jc.ErrorIsNil)
   679  	c.Assert(len(changes.Added), gc.Equals, 2)
   680  	machineIdB = changes.Added[0]
   681  	s.configureMachine(c, machineIdB, s.oldVersion)
   682  	machineIdC = changes.Added[1]
   683  	s.configureMachine(c, machineIdC, s.oldVersion)
   684  
   685  	return
   686  }
   687  
   688  func (s *UpgradeSuite) newAgentFromMachineId(c *gc.C, machineId string) *MachineAgent {
   689  	machine, err := s.State.Machine(machineId)
   690  	c.Assert(err, jc.ErrorIsNil)
   691  	return s.newAgent(c, machine)
   692  }
   693  
   694  // Return a version the same as the current software version, but with
   695  // the build number bumped.
   696  //
   697  // The version Tag is also cleared so that upgrades.PerformUpgrade
   698  // doesn't think it needs to run upgrade steps unnecessarily.
   699  func makeBumpedCurrentVersion() version.Binary {
   700  	v := version.Current
   701  	v.Build++
   702  	v.Tag = ""
   703  	return v
   704  }
   705  
   706  func waitForUpgradeToStart(upgradeCh chan bool) bool {
   707  	select {
   708  	case <-upgradeCh:
   709  		return true
   710  	case <-time.After(coretesting.LongWait):
   711  		return false
   712  	}
   713  }
   714  
   715  const maxUpgradeRetries = 3
   716  
   717  func (s *UpgradeSuite) setInstantRetryStrategy(c *gc.C) {
   718  	s.PatchValue(&getUpgradeRetryStrategy, func() utils.AttemptStrategy {
   719  		c.Logf("setting instant retry strategy for upgrade: retries=%d", maxUpgradeRetries)
   720  		return utils.AttemptStrategy{
   721  			Delay: 0,
   722  			Min:   maxUpgradeRetries,
   723  		}
   724  	})
   725  }
   726  
   727  func (s *UpgradeSuite) makeExpectedStatusCalls(retryCount int, expectFail bool, failReason string) []MachineStatusCall {
   728  	calls := []MachineStatusCall{{
   729  		params.StatusStarted,
   730  		fmt.Sprintf("upgrading to %s", version.Current.Number),
   731  	}}
   732  	for i := 0; i < retryCount; i++ {
   733  		calls = append(calls, MachineStatusCall{
   734  			params.StatusError,
   735  			fmt.Sprintf("upgrade to %s failed (will retry): %s", version.Current.Number, failReason),
   736  		})
   737  	}
   738  	if expectFail {
   739  		calls = append(calls, MachineStatusCall{
   740  			params.StatusError,
   741  			fmt.Sprintf("upgrade to %s failed (giving up): %s", version.Current.Number, failReason),
   742  		})
   743  	} else {
   744  		calls = append(calls, MachineStatusCall{params.StatusStarted, ""})
   745  	}
   746  	return calls
   747  }
   748  
   749  func (s *UpgradeSuite) makeExpectedUpgradeLogs(
   750  	retryCount int,
   751  	target string,
   752  	expectFail bool,
   753  	failReason string,
   754  ) []jc.SimpleMessage {
   755  	outLogs := []jc.SimpleMessage{}
   756  
   757  	if target == "databaseMaster" || target == "stateServer" {
   758  		outLogs = append(outLogs, jc.SimpleMessage{
   759  			loggo.INFO, "waiting for other state servers to be ready for upgrade",
   760  		})
   761  		var waitMsg string
   762  		switch target {
   763  		case "databaseMaster":
   764  			waitMsg = "all state servers are ready to run upgrade steps"
   765  		case "stateServer":
   766  			waitMsg = "the master has completed its upgrade steps"
   767  		}
   768  		outLogs = append(outLogs, jc.SimpleMessage{loggo.INFO, "finished waiting - " + waitMsg})
   769  	}
   770  
   771  	outLogs = append(outLogs, jc.SimpleMessage{
   772  		loggo.INFO, fmt.Sprintf(
   773  			`starting upgrade from %s to %s for "machine-0"`,
   774  			s.oldVersion.Number, version.Current.Number),
   775  	})
   776  
   777  	failMessage := fmt.Sprintf(
   778  		`upgrade from %s to %s for "machine-0" failed \(%%s\): %s`,
   779  		s.oldVersion.Number, version.Current.Number, failReason)
   780  
   781  	for i := 0; i < retryCount; i++ {
   782  		outLogs = append(outLogs, jc.SimpleMessage{loggo.ERROR, fmt.Sprintf(failMessage, "will retry")})
   783  	}
   784  	if expectFail {
   785  		outLogs = append(outLogs, jc.SimpleMessage{loggo.ERROR, fmt.Sprintf(failMessage, "giving up")})
   786  	} else {
   787  		outLogs = append(outLogs, jc.SimpleMessage{loggo.INFO,
   788  			fmt.Sprintf(`upgrade to %s completed successfully.`, version.Current.Number)})
   789  	}
   790  	return outLogs
   791  }
   792  
   793  func (s *UpgradeSuite) assertUpgradeSteps(c *gc.C, job state.MachineJob) {
   794  	agent, stopFunc := s.createAgentAndStartUpgrade(c, job)
   795  	defer stopFunc()
   796  	waitForUpgradeToFinish(c, agent.CurrentConfig())
   797  }
   798  
// keyFile returns the path of the system SSH identity file under the
// agent's data directory; state-server upgrades are expected to
// create it (see assertStateServerUpgrades/assertHostUpgrades).
func (s *UpgradeSuite) keyFile() string {
	return filepath.Join(s.DataDir(), "system-identity")
}
   802  
   803  func (s *UpgradeSuite) assertCommonUpgrades(c *gc.C) {
   804  	// rsyslog-gnutls should have been installed.
   805  	cmds := s.getAptCmds()
   806  	c.Assert(cmds, gc.HasLen, 1)
   807  	args := cmds[0].Args
   808  	c.Assert(len(args), jc.GreaterThan, 1)
   809  	c.Assert(args[0], gc.Equals, "apt-get")
   810  	c.Assert(args[len(args)-1], gc.Equals, "rsyslog-gnutls")
   811  }
   812  
   813  func (s *UpgradeSuite) assertStateServerUpgrades(c *gc.C) {
   814  	s.assertCommonUpgrades(c)
   815  	// System SSH key
   816  	c.Assert(s.keyFile(), jc.IsNonEmptyFile)
   817  	// Syslog port should have been updated
   818  	cfg, err := s.State.EnvironConfig()
   819  	c.Assert(err, jc.ErrorIsNil)
   820  	c.Assert(cfg.SyslogPort(), gc.Equals, config.DefaultSyslogPort)
   821  	// Deprecated attributes should have been deleted - just test a couple.
   822  	allAttrs := cfg.AllAttrs()
   823  	_, ok := allAttrs["public-bucket"]
   824  	c.Assert(ok, jc.IsFalse)
   825  	_, ok = allAttrs["public-bucket-region"]
   826  	c.Assert(ok, jc.IsFalse)
   827  }
   828  
   829  func (s *UpgradeSuite) assertHostUpgrades(c *gc.C) {
   830  	s.assertCommonUpgrades(c)
   831  	// Lock directory
   832  	// TODO(bogdanteleaga): Fix this on windows. Currently a bash script is
   833  	// used to create the directory which partially works on windows 8 but
   834  	// doesn't work on windows server.
   835  	lockdir := filepath.Join(s.DataDir(), "locks")
   836  	c.Assert(lockdir, jc.IsDirectory)
   837  	// SSH key file should not be generated for hosts.
   838  	_, err := os.Stat(s.keyFile())
   839  	c.Assert(err, jc.Satisfies, os.IsNotExist)
   840  	// Syslog port should not have been updated
   841  	cfg, err := s.State.EnvironConfig()
   842  	c.Assert(err, jc.ErrorIsNil)
   843  	c.Assert(cfg.SyslogPort(), gc.Not(gc.Equals), config.DefaultSyslogPort)
   844  	// Add other checks as needed...
   845  }
   846  
   847  func (s *UpgradeSuite) createAgentAndStartUpgrade(c *gc.C, job state.MachineJob) (*MachineAgent, func()) {
   848  	machine, _, _ := s.primeAgent(c, s.oldVersion, job)
   849  	a := s.newAgent(c, machine)
   850  	go func() { c.Check(a.Run(nil), gc.IsNil) }()
   851  	return a, func() { c.Check(a.Stop(), gc.IsNil) }
   852  }
   853  
   854  func (s *UpgradeSuite) assertEnvironAgentVersion(c *gc.C, expected version.Number) {
   855  	envConfig, err := s.State.EnvironConfig()
   856  	c.Assert(err, jc.ErrorIsNil)
   857  	agentVersion, ok := envConfig.AgentVersion()
   858  	c.Assert(ok, jc.IsTrue)
   859  	c.Assert(agentVersion, gc.Equals, expected)
   860  }
   861  
   862  func waitForUpgradeToFinish(c *gc.C, conf agent.Config) {
   863  	success := false
   864  	for attempt := coretesting.LongAttempt.Start(); attempt.Next(); {
   865  		diskConf := readConfigFromDisk(c, conf.DataDir(), conf.Tag())
   866  		success = diskConf.UpgradedToVersion() == version.Current.Number
   867  		if success {
   868  			break
   869  		}
   870  	}
   871  	c.Assert(success, jc.IsTrue)
   872  }
   873  
   874  func readConfigFromDisk(c *gc.C, dir string, tag names.Tag) agent.Config {
   875  	conf, err := agent.ReadConfig(agent.ConfigPath(dir, tag))
   876  	c.Assert(err, jc.ErrorIsNil)
   877  	return conf
   878  }
   879  
   880  func (s *UpgradeSuite) checkLoginToAPIAsUser(c *gc.C, conf agent.Config, expectFullApi exposedAPI) {
   881  	var err error
   882  	// Multiple attempts may be necessary because there is a small gap
   883  	// between the post-upgrade version being written to the agent's
   884  	// config (as observed by waitForUpgradeToFinish) and the end of
   885  	// "upgrade mode" (i.e. when the agent's UpgradeComplete channel
   886  	// is closed). Without this tests that call checkLoginToAPIAsUser
   887  	// can occasionally fail.
   888  	for a := coretesting.LongAttempt.Start(); a.Next(); {
   889  		err = s.attemptRestrictedAPIAsUser(c, conf)
   890  		switch expectFullApi {
   891  		case FullAPIExposed:
   892  			if err == nil {
   893  				return
   894  			}
   895  		case RestrictedAPIExposed:
   896  			if err != nil && strings.HasPrefix(err.Error(), "upgrade in progress") {
   897  				return
   898  			}
   899  		}
   900  	}
   901  	c.Fatalf("timed out waiting for expected API behaviour. last error was: %v", err)
   902  }
   903  
   904  func (s *UpgradeSuite) attemptRestrictedAPIAsUser(c *gc.C, conf agent.Config) error {
   905  	info := conf.APIInfo()
   906  	info.Tag = s.AdminUserTag(c)
   907  	info.Password = "dummy-secret"
   908  	info.Nonce = ""
   909  
   910  	apiState, err := api.Open(info, upgradeTestDialOpts)
   911  	c.Assert(err, jc.ErrorIsNil)
   912  	defer apiState.Close()
   913  
   914  	// this call should always work
   915  	var result api.Status
   916  	err = apiState.APICall("Client", 0, "", "FullStatus", nil, &result)
   917  	c.Assert(err, jc.ErrorIsNil)
   918  
   919  	// this call should only work if API is not restricted
   920  	return apiState.APICall("Client", 0, "", "WatchAll", nil, nil)
   921  }
   922  
   923  func canLoginToAPIAsMachine(c *gc.C, fromConf, toConf agent.Config) bool {
   924  	info := fromConf.APIInfo()
   925  	info.Addrs = toConf.APIInfo().Addrs
   926  	apiState, err := api.Open(info, upgradeTestDialOpts)
   927  	if apiState != nil {
   928  		apiState.Close()
   929  	}
   930  	return apiState != nil && err == nil
   931  }
   932  
// upgradeTestDialOpts are the API dial options used throughout these
// tests: a generous overall timeout with short retry and address
// intervals so connection attempts converge quickly.
var upgradeTestDialOpts = api.DialOpts{
	Timeout:             2 * time.Minute,
	RetryDelay:          250 * time.Millisecond,
	DialAddressInterval: 50 * time.Millisecond,
}
   938  
   939  func assertUpgradeComplete(c *gc.C, context *upgradeWorkerContext) {
   940  	select {
   941  	case <-context.UpgradeComplete:
   942  	default:
   943  		c.Error("UpgradeComplete channel is open but shouldn't be")
   944  	}
   945  }
   946  
   947  func assertUpgradeNotComplete(c *gc.C, context *upgradeWorkerContext) {
   948  	select {
   949  	case <-context.UpgradeComplete:
   950  		c.Error("UpgradeComplete channel is closed but shouldn't be")
   951  	default:
   952  	}
   953  }
   954  
   955  // NewFakeConfigSetter returns a fakeConfigSetter which implements
   956  // just enough of the agent.ConfigSetter interface to keep the upgrade
   957  // steps worker happy.
   958  func NewFakeConfigSetter(agentTag names.Tag, initialVersion version.Number) *fakeConfigSetter {
   959  	return &fakeConfigSetter{
   960  		AgentTag: agentTag,
   961  		Version:  initialVersion,
   962  	}
   963  }
   964  
// fakeConfigSetter is a minimal agent.ConfigSetter stand-in: the
// embedded interface satisfies the type, while the two fields back
// the Tag and UpgradedToVersion/SetUpgradedToVersion methods below.
type fakeConfigSetter struct {
	agent.ConfigSetter
	AgentTag names.Tag
	Version  version.Number
}
   970  
// Tag returns the tag the fake config was created with.
func (s *fakeConfigSetter) Tag() names.Tag {
	return s.AgentTag
}
   974  
// UpgradedToVersion returns the version currently stored in the fake.
func (s *fakeConfigSetter) UpgradedToVersion() version.Number {
	return s.Version
}
   978  
// SetUpgradedToVersion records newVersion in the fake so that a
// subsequent UpgradedToVersion call returns it.
func (s *fakeConfigSetter) SetUpgradedToVersion(newVersion version.Number) {
	s.Version = newVersion
}
   982  
   983  // NewFakeUpgradingMachineAgent returns a fakeUpgradingMachineAgent which implements
   984  // the upgradingMachineAgent interface. This provides enough
   985  // MachineAgent functionality to support upgrades.
   986  func NewFakeUpgradingMachineAgent(confSetter agent.ConfigSetter) *fakeUpgradingMachineAgent {
   987  	return &fakeUpgradingMachineAgent{
   988  		config:  confSetter,
   989  		DyingCh: make(chan struct{}),
   990  	}
   991  }
   992  
// fakeUpgradingMachineAgent is a test double for the machine agent as
// seen by the upgrade code: it holds the agent config, a channel that
// models agent shutdown, and a record of every status update made via
// setMachineStatus.
type fakeUpgradingMachineAgent struct {
	config             agent.ConfigSetter
	DyingCh            chan struct{}
	MachineStatusCalls []MachineStatusCall
}
   998  
// MachineStatusCall records a single status/info pair passed to
// fakeUpgradingMachineAgent.setMachineStatus, for later comparison
// against makeExpectedStatusCalls.
type MachineStatusCall struct {
	Status params.Status
	Info   string
}
  1003  
// setMachineStatus records the status update instead of contacting
// the API; the api.State argument is ignored and nil is always
// returned.
func (a *fakeUpgradingMachineAgent) setMachineStatus(_ *api.State, status params.Status, info string) error {
	// Record setMachineStatus calls for later inspection.
	a.MachineStatusCalls = append(a.MachineStatusCalls, MachineStatusCall{status, info})
	return nil
}
  1009  
// ensureMongoServer is a no-op stub satisfying the
// upgradingMachineAgent interface; these tests don't need mongo.
func (a *fakeUpgradingMachineAgent) ensureMongoServer(agent.Config) error {
	return nil
}
  1013  
// CurrentConfig returns the config setter the fake was built with.
func (a *fakeUpgradingMachineAgent) CurrentConfig() agent.Config {
	return a.config
}
  1017  
// ChangeConfig applies mutate directly to the in-memory config
// setter and returns its error.
func (a *fakeUpgradingMachineAgent) ChangeConfig(mutate agent.ConfigMutator) error {
	return mutate(a.config)
}
  1021  
// Dying exposes DyingCh; tests may close that channel to simulate
// the agent shutting down.
func (a *fakeUpgradingMachineAgent) Dying() <-chan struct{} {
	return a.DyingCh
}