github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/upgradesteps/worker.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package upgradesteps

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names/v5"
	"github.com/juju/retry"
	"github.com/juju/version/v2"
	"github.com/juju/worker/v3"
	"gopkg.in/tomb.v2"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	agenterrors "github.com/juju/juju/cmd/jujud/agent/errors"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/state"
	"github.com/juju/juju/upgrades"
	jujuversion "github.com/juju/juju/version"
	"github.com/juju/juju/worker/gate"
	"github.com/juju/juju/wrench"
)

var logger = loggo.GetLogger("juju.worker.upgradesteps")

// TODO (manadart 2021-05-18): These are exported for tests and, in the case
// of the timeout, for feature tests. The timeout especially should be a
// dependency of the worker.
var (
	PerformUpgrade = upgrades.PerformUpgrade

	// UpgradeStartTimeoutController is the maximum time a controller will
	// wait for other controllers to come up and indicate they are ready
	// to begin running upgrade steps.
	UpgradeStartTimeoutController = time.Minute * 15
)

// NewLock creates a gate.Lock to be used to synchronise workers which
// need to start after upgrades have completed. The returned Lock should
// be passed to NewWorker. If the agent has already upgraded to the
// current version, then the lock will be returned in the released state.
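//
// A minimal wiring sketch (illustrative only; the argument names below are
// placeholders, and in the Juju agents this wiring is done by the dependency
// engine rather than by hand):
//
//	lock := upgradesteps.NewLock(agentConfig)
//	w, err := upgradesteps.NewWorker(
//		lock, a, apiConn, isController, openState,
//		preUpgradeSteps, retryStrategy, statusSetter, isCaas,
//	)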
func NewLock(agentConfig agent.Config) gate.Lock {
	lock := gate.NewLock()

	if wrench.IsActive(wrenchKey(agentConfig), "always-try-upgrade") {
		// Always enter upgrade mode. This allows testing of upgrades
		// even when there are actually no upgrade steps to run.
		return lock
	}

	// Build numbers are irrelevant to upgrade steps.
	upgradedToVersion := agentConfig.UpgradedToVersion().ToPatch()
	currentVersion := jujuversion.Current.ToPatch()
	if upgradedToVersion == currentVersion {
		logger.Infof(
			"upgrade steps for %v have already been run.",
			jujuversion.Current,
		)
		lock.Unlock()
	}

	return lock
}

// StatusSetter defines the single method required to set an agent's
// status.
type StatusSetter interface {
	SetStatus(setableStatus status.Status, info string, data map[string]interface{}) error
}

// NewWorker returns a new instance of the upgradeSteps worker. It
// will run any required steps to upgrade to the currently running
// Juju version.
func NewWorker(
	upgradeComplete gate.Lock,
	agent agent.Agent,
	apiConn api.Connection,
	isController bool,
	openState func() (*state.StatePool, error),
	preUpgradeSteps upgrades.PreUpgradeStepsFunc,
	retryStrategy retry.CallArgs,
	entity StatusSetter,
	isCaas bool,
) (worker.Worker, error) {
	w := &upgradeSteps{
		upgradeComplete: upgradeComplete,
		agent:           agent,
		apiConn:         apiConn,
		openState:       openState,
		preUpgradeSteps: preUpgradeSteps,
		retryStrategy:   retryStrategy,
		entity:          entity,
		tag:             agent.CurrentConfig().Tag(),
		isController:    isController,
		isCaas:          isCaas,
	}
	w.tomb.Go(w.run)
	return w, nil
}

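// upgradeSteps is a worker that runs any required upgrade steps for the
// agent when it starts running a new Juju version, and unlocks the
// upgradeComplete gate once they have finished successfully.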
type upgradeSteps struct {
	tomb            tomb.Tomb
	upgradeComplete gate.Lock
	agent           agent.Agent
	apiConn         api.Connection
	openState       func() (*state.StatePool, error)
	preUpgradeSteps upgrades.PreUpgradeStepsFunc
	entity          StatusSetter
	retryStrategy   retry.CallArgs

	fromVersion version.Number
	toVersion   version.Number
	tag         names.Tag
	// If the agent is a machine agent for a controller, this flags that
	// state needs to be opened before running upgrade steps.
	isController bool
	isCaas       bool
	pool         *state.StatePool
}

// Kill is part of the worker.Worker interface.
func (w *upgradeSteps) Kill() {
	w.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *upgradeSteps) Wait() error {
	return w.tomb.Wait()
}

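// apiLostDuringUpgrade wraps an error indicating that the API connection
// was lost while upgrade steps were running.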
type apiLostDuringUpgrade struct {
	err error
}

func (e *apiLostDuringUpgrade) Error() string {
	return fmt.Sprintf("API connection lost during upgrade: %v", e.err)
}

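// isAPILostDuringUpgrade reports whether err is an *apiLostDuringUpgrade.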
func isAPILostDuringUpgrade(err error) bool {
	_, ok := err.(*apiLostDuringUpgrade)
	return ok
}

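// wrenchKey returns the wrench category for this agent, derived from its
// current configuration.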
func (w *upgradeSteps) wrenchKey() string {
	return wrenchKey(w.agent.CurrentConfig())
}

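// wrenchKey derives the wrench category from the agent's tag kind,
// e.g. "machine-agent" for a machine agent.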
func wrenchKey(agentConfig agent.Config) string {
	return agentConfig.Tag().Kind() + "-agent"
}

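// run is the worker's main loop. It does nothing if the upgrade is already
// complete; otherwise it opens state when running on a controller, runs the
// required upgrade steps and unlocks the upgradeComplete gate on success.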
func (w *upgradeSteps) run() error {
	if wrench.IsActive(w.wrenchKey(), "fail-upgrade-start") {
		return nil // Make the worker stop
	}

	if w.upgradeComplete.IsUnlocked() {
		// Our work is already done (we're probably being restarted
		// because the API connection has gone down), so do nothing.
		return nil
	}

	w.fromVersion = w.agent.CurrentConfig().UpgradedToVersion()
	w.toVersion = jujuversion.Current
	if w.fromVersion == w.toVersion {
		logger.Infof("upgrade to %v already completed.", w.toVersion)
		w.upgradeComplete.Unlock()
		return nil
	}

	// We need a *state.State for upgrades. We open it independently
	// of StateWorker, because we have no guarantees about when
	// and how often StateWorker might run.
	if w.isController {
		var err error
		if w.pool, err = w.openState(); err != nil {
			return err
		}
		defer func() { _ = w.pool.Close() }()

		st, err := w.pool.SystemState()
		if err != nil {
			return errors.Trace(err)
		}
		model, err := st.Model()
		if err != nil {
			return errors.Trace(err)
		}
		w.isCaas = model.Type() == state.ModelTypeCAAS
	}

	if err := w.runUpgrades(); err != nil {
		// Only return an error from the worker if the connection to
		// state went away (possible mongo primary change). Returning
		// an error when the connection is lost will cause the agent
		// to restart.
		//
		// For other errors, the error is not returned because we want
		// the agent to stay running in an error state waiting
		// for user intervention.
		if isAPILostDuringUpgrade(err) {
			return err
		}
		w.reportUpgradeFailure(err, false)
	} else {
		// Upgrade succeeded - signal that the upgrade is complete.
		logger.Infof("upgrade to %v completed successfully.", w.toVersion)
		_ = w.entity.SetStatus(status.Started, "", nil)
		w.upgradeComplete.Unlock()
	}
	return nil
}

// runUpgrades runs the upgrade operations for each job type and
// updates the agent's UpgradedToVersion on success.
func (w *upgradeSteps) runUpgrades() error {
	upgradeInfo, err := w.prepareForUpgrade()
	if err != nil {
		return err
	}

	if wrench.IsActive(w.wrenchKey(), "fail-upgrade") {
		return errors.New("wrench")
	}

	if err := w.agent.ChangeConfig(w.runUpgradeSteps); err != nil {
		return err
	}

	if err := w.finaliseUpgrade(upgradeInfo); err != nil {
		return err
	}
	return nil
}

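// prepareForUpgrade runs the pre-upgrade checks and, on controllers,
// coordinates with the other controllers before upgrade steps are run.
// The returned UpgradeInfo is nil for non-controller agents.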
func (w *upgradeSteps) prepareForUpgrade() (*state.UpgradeInfo, error) {
	logger.Infof("checking that upgrade can proceed")
	if err := w.preUpgradeSteps(w.pool, w.agent.CurrentConfig(), w.pool != nil, w.isCaas); err != nil {
		return nil, errors.Annotatef(err, "%s cannot be upgraded", names.ReadableString(w.tag))
	}

	if w.isController {
		return w.prepareControllerForUpgrade()
	}
	return nil, nil
}

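// prepareControllerForUpgrade records that this controller is ready to
// upgrade and waits for all other provisioned controllers to report the
// same before upgrade steps are run.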
func (w *upgradeSteps) prepareControllerForUpgrade() (*state.UpgradeInfo, error) {
	logger.Infof("signalling that this controller is ready for upgrade")
	st, err := w.pool.SystemState()
	if err != nil {
		return nil, errors.Trace(err)
	}
	info, err := st.EnsureUpgradeInfo(w.tag.Id(), w.fromVersion, w.toVersion)
	if err != nil {
		return nil, errors.Trace(err)
	}

	// Controllers need to wait for other controllers to be ready
	// to run the upgrade steps.
	logger.Infof("waiting for other controllers to be ready for upgrade")
	if err := w.waitForOtherControllers(info); err != nil {
		if err == tomb.ErrDying {
			logger.Warningf("stopped waiting for other controllers: %v", err)
			return nil, err
		}
		logger.Errorf("aborted wait for other controllers: %v", err)
		return nil, errors.Annotate(err, "aborted wait for other controllers")
	}

	logger.Infof("finished waiting - all controllers are ready to run upgrade steps")
	return info, nil
}

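// waitForOtherControllers watches the shared upgrade document until all
// provisioned controllers are ready, at which point the upgrade is marked
// as running. The upgrade is aborted if the wait times out.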
func (w *upgradeSteps) waitForOtherControllers(info *state.UpgradeInfo) error {
	watcher := info.Watch()
	defer func() { _ = watcher.Stop() }()

	maxWait := w.getUpgradeStartTimeout()
	timeout := time.After(maxWait)
	for {
		select {
		case <-watcher.Changes():
			if err := info.Refresh(); err != nil {
				return errors.Trace(err)
			}

			allReady, err := info.AllProvisionedControllersReady()
			if err != nil {
				return errors.Trace(err)
			}
			if allReady {
				return errors.Trace(info.SetStatus(state.UpgradeRunning))
			}
		case <-timeout:
			if err := info.Abort(); err != nil {
				return errors.Annotate(err, "unable to abort upgrade")
			}
			return errors.Errorf("timed out after %s", maxWait)
		case <-w.tomb.Dying():
			return tomb.ErrDying
		}
	}
}

// runUpgradeSteps runs the required upgrade steps for the agent,
// retrying on failure. The agent's UpgradedToVersion is set
// once the upgrade is complete.
//
// This function conforms to the agent.ConfigMutator type and is
// designed to be called via an agent's ChangeConfig method.
func (w *upgradeSteps) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	if err := w.entity.SetStatus(status.Started, fmt.Sprintf("upgrading to %v", w.toVersion), nil); err != nil {
		return errors.Trace(err)
	}

	stBackend := upgrades.NewStateBackend(w.pool)
	context := upgrades.NewContext(agentConfig, w.apiConn, stBackend)
	logger.Infof("starting upgrade from %v to %v for %q", w.fromVersion, w.toVersion, w.tag)

	targets := upgradeTargets(w.isController)

	retryStrategy := w.retryStrategy
	retryStrategy.IsFatalError = func(err error) bool {
		// Abort if the API connection has gone away!
		return agenterrors.ConnectionIsDead(logger, w.apiConn)
	}
	retryStrategy.NotifyFunc = func(lastErr error, attempt int) {
		if retryStrategy.Attempts != 0 && attempt != retryStrategy.Attempts {
			w.reportUpgradeFailure(lastErr, true)
		}
	}
	retryStrategy.Func = func() error {
		err := PerformUpgrade(w.fromVersion, targets, context)
		// w.entity.SetStatus(status.Error, fmt.Sprintf("TEST inner %v", err), nil)
		return err
	}

	err := retry.Call(retryStrategy)
	// w.entity.SetStatus(status.Error, fmt.Sprintf("TEST outer %v", err), nil)
	if retry.IsAttemptsExceeded(err) || retry.IsDurationExceeded(err) {
		err = retry.LastError(err)
		return err
	}
	if err != nil {
		return &apiLostDuringUpgrade{err}
	}

	agentConfig.SetUpgradedToVersion(w.toVersion)
	return nil
}

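// reportUpgradeFailure logs the upgrade failure and records it in the
// agent's status, indicating whether another attempt will be made.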
func (w *upgradeSteps) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}
	logger.Errorf("upgrade from %v to %v for %q failed (%s): %v",
		w.fromVersion, w.toVersion, w.tag, retryText, err)
	_ = w.entity.SetStatus(status.Error,
		fmt.Sprintf("upgrade to %v failed (%s): %v", w.toVersion, retryText, err), nil)
}

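// finaliseUpgrade marks this controller as having completed its upgrade
// steps. It is a no-op for non-controller agents.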
func (w *upgradeSteps) finaliseUpgrade(info *state.UpgradeInfo) error {
	if !w.isController {
		return nil
	}

	if err := info.SetControllerDone(w.tag.Id()); err != nil {
		return errors.Annotate(err, "upgrade done but failed to synchronise")
	}

	return nil
}

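// getUpgradeStartTimeout returns how long to wait for the other controllers
// to report that they are ready to run upgrade steps. The
// short-upgrade-timeout wrench shortens the wait for manual testing.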
func (w *upgradeSteps) getUpgradeStartTimeout() time.Duration {
	if wrench.IsActive(w.wrenchKey(), "short-upgrade-timeout") {
		// This duration is fairly arbitrary. During manual testing it
		// avoids the normal long wait but still provides a small
		// window to check the environment status and logs before the
		// timeout is triggered.
		return time.Minute
	}
	return UpgradeStartTimeoutController
}

// upgradeTargets determines the upgrade targets corresponding to the
// role of an agent. This determines the upgrade steps
// which will run during an upgrade.
func upgradeTargets(isController bool) []upgrades.Target {
	var targets []upgrades.Target
	if isController {
		targets = []upgrades.Target{upgrades.Controller}
	}
	return append(targets, upgrades.HostMachine)
}