github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/upgradedatabase/worker.go

// Copyright 2019 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package upgradedatabase

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/names/v5"
	"github.com/juju/retry"
	"github.com/juju/version/v2"
	"github.com/juju/worker/v3"
	"gopkg.in/tomb.v2"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/state"
	"github.com/juju/juju/upgrades"
	jujuversion "github.com/juju/juju/version"
	"github.com/juju/juju/worker/gate"
	"github.com/juju/juju/wrench"
)

// NewLock creates a gate.Lock to be used to synchronise workers
// that need to start after database upgrades have completed.
// The returned Lock should be passed to NewWorker.
// If the agent has already upgraded to the current version,
// then the lock will be returned in the released state.
func NewLock(agentConfig agent.Config) gate.Lock {
	lock := gate.NewLock()

	// Build numbers are irrelevant to upgrade steps.
	upgradedToVersion := agentConfig.UpgradedToVersion().ToPatch()
	currentVersion := jujuversion.Current.ToPatch()

	if upgradedToVersion == currentVersion {
		lock.Unlock()
	}

	return lock
}
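
// For example, if jujuversion.Current were 3.1.99 and the agent had last
// recorded UpgradedToVersion 3.1.99.1, ToPatch strips the build number from
// both sides, the versions compare equal, and the lock comes back released:
//
//	lock := NewLock(agentConfig)
//	unlocked := lock.IsUnlocked() // true; no database upgrade is pending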

// Config is the configuration needed to construct an upgradeDB worker.
type Config struct {
	// UpgradeComplete is a lock used to synchronise workers that must start
	// after database upgrades are verified as completed.
	UpgradeComplete gate.Lock

	// Tag is the current machine tag.
	Tag names.Tag

	// Agent is the running machine agent.
	Agent agent.Agent

	// Logger is the logger for this worker.
	Logger Logger

	// OpenState is a function pointer for returning a state pool indirection.
	OpenState func() (Pool, error)

	// PerformUpgrade is a function pointer for executing the DB upgrade steps.
	// Context retrieval is lazy because it requires a real state.StatePool
	// that we cast our Pool indirection back to.
	// We need the concrete type, because we are unable to indirect all the
	// state methods that upgrade steps might require.
	// This is OK for in-theatre operation, but is not suitable for testing.
	PerformUpgrade func(version.Number, []upgrades.Target, func() upgrades.Context) error

	// RetryStrategy is the strategy to use for re-attempting failed upgrades.
	RetryStrategy retry.CallArgs

	// Clock is used to enforce time-out logic for controllers waiting for the
	// master MongoDB upgrades to execute.
	Clock Clock
}

// Validate returns an error if the worker config is not valid.
func (cfg Config) Validate() error {
	if cfg.UpgradeComplete == nil {
		return errors.NotValidf("nil UpgradeComplete lock")
	}
	if cfg.Tag == nil {
		return errors.NotValidf("nil machine tag")
	}
	k := cfg.Tag.Kind()
	if k != names.MachineTagKind && k != names.ControllerAgentTagKind {
		return errors.NotValidf("%q tag kind", k)
	}
	if cfg.Agent == nil {
		return errors.NotValidf("nil Agent")
	}
	if cfg.Logger == nil {
		return errors.NotValidf("nil Logger")
	}
	if cfg.OpenState == nil {
		return errors.NotValidf("nil OpenState function")
	}
	if cfg.PerformUpgrade == nil {
		return errors.NotValidf("nil PerformUpgrade function")
	}
	if cfg.RetryStrategy.Clock == nil {
		return errors.NotValidf("nil RetryStrategy Clock")
	}
	if cfg.RetryStrategy.Delay == 0 {
		return errors.NotValidf("zero value for RetryStrategy Delay")
	}
	if cfg.RetryStrategy.Attempts == 0 && cfg.RetryStrategy.MaxDuration == 0 {
		return errors.NotValidf("zero value for RetryStrategy Attempts and MaxDuration")
	}
	if cfg.Clock == nil {
		return errors.NotValidf("nil Clock")
	}
	return nil
}
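
// A minimal construction sketch; the values below are placeholders and in
// production the worker's manifold assembles this configuration:
//
//	cfg := Config{
//		UpgradeComplete: NewLock(agentConfig),
//		Tag:             names.NewMachineTag("0"),
//		Agent:           machineAgent,
//		Logger:          logger,
//		OpenState:       openStatePool,
//		PerformUpgrade:  performStateUpgrade, // e.g. wrapping upgrades.PerformStateUpgrade
//		RetryStrategy: retry.CallArgs{
//			Clock:    clock,
//			Delay:    2 * time.Minute,
//			Attempts: 5,
//		},
//		Clock: clock,
//	}
//	if err := cfg.Validate(); err != nil {
//		return errors.Trace(err)
//	}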

// upgradeDB is a worker that will run on a controller machine.
// It is responsible for running upgrade steps of type `DatabaseMaster` on the
// primary MongoDB instance.
type upgradeDB struct {
	tomb            tomb.Tomb
	upgradeComplete gate.Lock

	tag            names.Tag
	agent          agent.Agent
	logger         Logger
	pool           Pool
	performUpgrade func(version.Number, []upgrades.Target, func() upgrades.Context) error
	upgradeInfo    UpgradeInfo
	retryStrategy  retry.CallArgs
	clock          Clock

	fromVersion version.Number
	toVersion   version.Number
}

// NewWorker validates the input configuration, then uses it to create,
// start and return an upgradeDB worker.
func NewWorker(cfg Config) (worker.Worker, error) {
	var err error

	if err = cfg.Validate(); err != nil {
		return nil, errors.Trace(err)
	}

	w := &upgradeDB{
		upgradeComplete: cfg.UpgradeComplete,
		tag:             cfg.Tag,
		agent:           cfg.Agent,
		logger:          cfg.Logger,
		performUpgrade:  cfg.PerformUpgrade,
		retryStrategy:   cfg.RetryStrategy,
		clock:           cfg.Clock,
	}
	if w.pool, err = cfg.OpenState(); err != nil {
		return nil, err
	}

	w.tomb.Go(w.run)
	return w, nil
}

func (w *upgradeDB) run() error {
	defer func() {
		if err := w.pool.Close(); err != nil {
			w.logger.Errorf("failed closing state pool: %v", err)
		}
	}()

	if w.upgradeDone() {
		return nil
	}

	isPrimary, err := w.pool.IsPrimary(w.tag.Id())
	if err != nil {
		return errors.Trace(err)
	}

	// Ensure that an upgrade document exists in order to monitor this upgrade.
	// This is the same document that will be used by the `upgradesteps` worker
	// that will execute subsequently.
	// In this worker we use it as a distributed lock - once the status reports
	// `UpgradeDBComplete`, our member `upgradeComplete` unlocks on each
	// controller running this worker.
	if w.upgradeInfo, err = w.pool.EnsureUpgradeInfo(w.tag.Id(), w.fromVersion, w.toVersion); err != nil {
		return errors.Annotate(err, "retrieving upgrade info")
	}

	// If we are the primary we need to run the upgrade steps.
	// Otherwise we watch state and unlock once the primary has run the steps.
	if isPrimary {
		err = w.runUpgrade()
	} else {
		err = w.watchUpgrade()
	}
	return errors.Trace(err)
}
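
// Taken together, run reduces the coordination to a small state machine over
// the shared upgrade document: the primary executes the DatabaseMaster steps
// and moves the document to UpgradeDBComplete, while each secondary that
// observes that status (or the later UpgradeRunning) unlocks its own
// UpgradeComplete gate.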

// upgradeDone returns true if this worker
// does not need to run any upgrade logic.
func (w *upgradeDB) upgradeDone() bool {
	// If we are already unlocked, there is nothing to do.
	if w.upgradeComplete.IsUnlocked() {
		return true
	}

	// If we are already on the current version, there is nothing to do.
	w.fromVersion = w.agent.CurrentConfig().UpgradedToVersion()
	w.toVersion = jujuversion.Current
	if w.fromVersion == w.toVersion {
		w.logger.Infof("database upgrade for %v already completed", w.toVersion)
		w.upgradeComplete.Unlock()
		return true
	}

	return false
}

func (w *upgradeDB) runUpgrade() error {
	w.logger.Infof("running database upgrade for %v on mongodb primary", w.toVersion)
	w.setStatus(status.Started, fmt.Sprintf("upgrading database for %v", w.toVersion))

	err := w.agent.ChangeConfig(w.runUpgradeSteps)
	if err != nil {
		w.setFailStatus()
		return errors.Trace(err)
	}
	// Update the upgrade status document to unlock the other controllers.
	err = w.upgradeInfo.SetStatus(state.UpgradeDBComplete)
	if err != nil {
		w.setFailStatus()
		return errors.Trace(err)
	}
	w.logger.Infof("database upgrade for %v completed successfully", w.toVersion)
	w.setStatus(status.Started, fmt.Sprintf("database upgrade for %v completed", w.toVersion))
	w.upgradeComplete.Unlock()
	return nil
}

// runUpgradeSteps runs the required database upgrade steps for the agent,
// retrying on failure.
func (w *upgradeDB) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	contextGetter := w.contextGetter(agentConfig)

	retryStrategy := w.retryStrategy
	retryStrategy.Func = func() error {
		return w.performUpgrade(w.fromVersion, []upgrades.Target{upgrades.DatabaseMaster}, contextGetter)
	}
	retryStrategy.NotifyFunc = func(lastError error, attempt int) {
		w.reportUpgradeFailure(lastError, attempt != retryStrategy.Attempts)
	}
	err := retry.Call(retryStrategy)
	if retry.IsAttemptsExceeded(err) || retry.IsDurationExceeded(err) {
		err = retry.LastError(err)
	}
	return errors.Trace(err)
}
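
// For illustration: with a RetryStrategy of Delay: time.Minute and
// Attempts: 3, a persistently failing PerformUpgrade runs three times
// roughly a minute apart, NotifyFunc reports each failure, and the
// attempts-exceeded error is unwrapped above so the caller sees the last
// underlying upgrade error rather than the retry wrapper.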

// contextGetter returns a function that creates an upgrade context.
// Note that the performUpgrade method passed by the manifold calls
// upgrades.PerformStateUpgrade, which only uses the StateContext from this
// context. We can set the API connection to nil - it should never be used.
func (w *upgradeDB) contextGetter(agentConfig agent.ConfigSetter) func() upgrades.Context {
	return func() upgrades.Context {
		return upgrades.NewContext(agentConfig, nil, upgrades.NewStateBackend(w.pool.(*pool).StatePool))
	}
}
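
// Note that the w.pool.(*pool) assertion above requires the concrete pool
// wrapper returned by the manifold's OpenState; as the Config.PerformUpgrade
// comment notes, this is fine in production but unsuitable for test doubles.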

func (w *upgradeDB) watchUpgrade() error {
	w.logger.Infof("waiting for database upgrade on mongodb primary")
	w.setStatus(status.Started, fmt.Sprintf("waiting on primary database upgrade for %v", w.toVersion))

	if wrench.IsActive("upgrade-database", "watch-upgrade") {
		// Simulate an error causing the upgrade to fail.
		w.setFailStatus()
		return errors.New("unable to upgrade - wrench in works")
	}

	timeout := w.clock.After(10 * time.Minute)
	watcher := w.upgradeInfo.Watch()
	defer func() { _ = watcher.Stop() }()

	// Re-read the upgrade document after starting the watcher so that we are
	// operating on the latest information; otherwise there is a potential
	// race in which we would not notice a change.
	if err := w.upgradeInfo.Refresh(); err != nil {
		w.logger.Errorf("unable to refresh upgrade info: %v", err)
		w.setFailStatus()
		return err
	}

	// To be here, this node previously returned false for isPrimary.
	// Sometimes the primary changes, or is reported as false when queried too
	// early. If the node's state changes while we are watching, escalate an
	// error, which will result in the worker being restarted.
	stateChanged := make(chan struct{})
	done := make(chan struct{})
	defer close(done)
	go func() {
		for {
			select {
			case <-done:
				return
			case <-w.clock.After(5 * time.Second):
				isPrimary, err := w.pool.IsPrimary(w.tag.Id())
				if isPrimary || err != nil {
					if err != nil {
						w.logger.Errorf("failed to check whether this node is primary: %v", err)
					}
					close(stateChanged)
					return
				}
			}
		}
	}()

	for {
		// If the primary has already run the database steps then the status
		// will be "db-complete". However, it may have progressed further on
		// to the upgrade steps, so we check for that status too.
		switch w.upgradeInfo.Status() {
		case state.UpgradeDBComplete, state.UpgradeRunning:
			w.logger.Infof("finished waiting - database upgrade steps completed on mongodb primary")
			w.setStatus(status.Started, fmt.Sprintf("confirmed primary database upgrade for %v", w.toVersion))
			w.upgradeComplete.Unlock()
			return nil
		default:
			// Continue waiting for another change.
		}

		select {
		case <-watcher.Changes():
			if err := w.upgradeInfo.Refresh(); err != nil {
				w.setFailStatus()
				return errors.Trace(err)
			}
		case <-stateChanged:
			w.logger.Infof("primary changed mid-upgrade to this watching host; restarting upgrade")
			return errors.New("mongo primary state changed")
		case <-timeout:
			w.setFailStatus()
			return errors.New("timed out waiting for primary database upgrade")
		case <-w.tomb.Dying():
			return tomb.ErrDying
		}
	}
}

func (w *upgradeDB) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}

	w.logger.Errorf("database upgrade from %v to %v for %q failed (%s): %v",
		w.fromVersion, w.toVersion, w.tag, retryText, err)
	w.setFailStatus()
}

func (w *upgradeDB) setFailStatus() {
	w.setStatus(status.Error, fmt.Sprintf("upgrading database for %v", w.toVersion))
}

func (w *upgradeDB) setStatus(sts status.Status, msg string) {
	if err := w.pool.SetStatus(w.tag.Id(), sts, msg); err != nil {
		w.logger.Errorf("setting agent status: %v", err)
	}
}

// Kill is part of the worker.Worker interface.
func (w *upgradeDB) Kill() {
	w.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *upgradeDB) Wait() error {
	return w.tomb.Wait()
}