github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/worker/upgradeseries/worker.go

// Copyright 2018 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package upgradeseries

import (
	"strings"
	"sync"

	"github.com/juju/errors"
	"github.com/juju/names/v5"
	"github.com/juju/worker/v3"
	"github.com/juju/worker/v3/catacomb"

	"github.com/juju/juju/core/model"
	"github.com/juju/juju/core/os"
	"github.com/juju/juju/rpc/params"
)

//go:generate go run go.uber.org/mock/mockgen -package mocks -destination mocks/package_mock.go github.com/juju/juju/worker/upgradeseries Facade,UnitDiscovery,Upgrader

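// hostBase indirects os.HostBase so that the lookup of the local host's base
// can be substituted, for example in tests.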
var hostBase = os.HostBase

// Logger represents the methods required to emit log messages.
type Logger interface {
	Debugf(message string, args ...interface{})
	Infof(message string, args ...interface{})
	Warningf(message string, args ...interface{})
	Errorf(message string, args ...interface{})
}

// UnitDiscovery represents how the worker determines which units need
// to check in.
type UnitDiscovery interface {
	Units() ([]names.UnitTag, error)
}

// Config is the configuration needed to construct an UpgradeSeries worker.
type Config struct {
	// Facade is used to access back-end state.
	Facade Facade

	// Logger is the logger for this worker.
	Logger Logger

	// UnitDiscovery determines how the worker knows which units should
	// be running on the machine.
	UnitDiscovery UnitDiscovery

	// UpgraderFactory is a factory method that will return an upgrader capable
	// of handling service and agent binary manipulation for a
	// runtime-determined current and target OS series.
	UpgraderFactory func() (Upgrader, error)
}

// Validate validates the upgrade-series worker configuration.
func (config Config) Validate() error {
	if config.Logger == nil {
		return errors.NotValidf("nil Logger")
	}
	if config.Facade == nil {
		return errors.NotValidf("nil Facade")
	}
	if config.UnitDiscovery == nil {
		return errors.NotValidf("nil UnitDiscovery")
	}
	if config.UpgraderFactory == nil {
		return errors.NotValidf("nil UpgraderFactory")
	}
	return nil
}

// upgradeSeriesWorker is responsible for machine and unit agent requirements
// during upgrade-series:
//
//	copying the agent binary directory and renaming;
//	rewriting the machine and unit(s) systemd files if necessary;
//	ensuring unit agents are started post-upgrade;
//	moving the status of the upgrade-series steps along.
type upgradeSeriesWorker struct {
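	// Facade is embedded so that its API methods, such as MachineStatus and
	// SetInstanceStatus, are promoted directly onto the worker.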
	Facade

	catacomb        catacomb.Catacomb
	logger          Logger
	unitDiscovery   UnitDiscovery
	upgraderFactory func() (Upgrader, error)

	// Some local state retained for reporting purposes.
	mu             sync.Mutex
	machineStatus  model.UpgradeSeriesStatus
	units          names.Set
	preparedUnits  []names.UnitTag
	completedUnits []names.UnitTag

	// Ensure that leaders are pinned only once if possible, on the first
	// transition to UpgradeSeriesPrepareStarted. Repeated pin calls are not a
	// serious concern, however, as the pin operations are idempotent.
	leadersPinned bool
}

// NewWorker creates, starts and returns a new upgrade-series worker based on
// the input configuration.
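//
// A minimal construction sketch; facade, logger, unitDiscovery and newUpgrader
// below are illustrative placeholders rather than identifiers from this
// package:
//
//	w, err := NewWorker(Config{
//		Facade:          facade,
//		Logger:          logger,
//		UnitDiscovery:   unitDiscovery,
//		UpgraderFactory: func() (Upgrader, error) { return newUpgrader() },
//	})
//	if err != nil {
//		return errors.Trace(err)
//	}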
func NewWorker(config Config) (worker.Worker, error) {
	if err := config.Validate(); err != nil {
		return nil, errors.Trace(err)
	}

	w := &upgradeSeriesWorker{
		Facade:          config.Facade,
		logger:          config.Logger,
		unitDiscovery:   config.UnitDiscovery,
		upgraderFactory: config.UpgraderFactory,
		machineStatus:   model.UpgradeSeriesNotStarted,
		leadersPinned:   false,
	}

	if err := catacomb.Invoke(catacomb.Plan{
		Site: &w.catacomb,
		Work: w.loop,
	}); err != nil {
		return nil, errors.Trace(err)
	}

	return w, nil
}

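// loop watches for upgrade-series notifications from the facade and handles
// each change until the worker is killed or an error occurs.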
func (w *upgradeSeriesWorker) loop() error {
	uw, err := w.WatchUpgradeSeriesNotifications()
	if err != nil {
		return errors.Trace(err)
	}
	err = w.catacomb.Add(uw)
	if err != nil {
		return errors.Trace(err)
	}
	for {
		select {
		case <-w.catacomb.Dying():
			return w.catacomb.ErrDying()
		case <-uw.Changes():
			if err := w.handleUpgradeSeriesChange(); err != nil {
				return errors.Trace(err)
			}
		}
	}
}

// handleUpgradeSeriesChange retrieves the current upgrade-series status for
// this machine and, based on that status, calls the methods that progress the
// workflow accordingly.
func (w *upgradeSeriesWorker) handleUpgradeSeriesChange() error {
	w.mu.Lock()
	defer w.mu.Unlock()

	var err error
	if w.machineStatus, err = w.MachineStatus(); err != nil {
		if errors.IsNotFound(err) {
			// No upgrade-series lock. This can happen when:
			// - The first watch call is made.
			// - The lock is removed after a completed upgrade.
			w.logger.Infof("no series upgrade lock present")
			w.machineStatus = model.UpgradeSeriesNotStarted
			w.preparedUnits = nil
			w.completedUnits = nil
			return nil
		}
		return errors.Trace(err)
	}
	w.logger.Infof("machine series upgrade status is %q", w.machineStatus)

	// Determine the set of units that are on the machine.
	if w.units == nil {
		units, err := w.unitDiscovery.Units()
		if err != nil {
			return errors.Annotate(err, "unit discovery")
		}
		w.units = names.NewSet(asGenericTags(units)...)
	}

	switch w.machineStatus {
	case model.UpgradeSeriesValidate:
		err = w.handleValidate()
	case model.UpgradeSeriesPrepareStarted:
		err = w.handlePrepareStarted()
	case model.UpgradeSeriesCompleteStarted:
		err = w.handleCompleteStarted()
	case model.UpgradeSeriesCompleted:
		err = w.handleCompleted()
	}

	if err != nil {
		if err := w.SetInstanceStatus(model.UpgradeSeriesError, err.Error()); err != nil {
			w.logger.Errorf("failed to set series upgrade error status: %s", err.Error())
		}
	}
	return errors.Trace(err)
}

// handleValidate handles the workflow for the machine by validating the
// given set of machine applications and charms.
func (w *upgradeSeriesWorker) handleValidate() error {
	if err := w.SetInstanceStatus(model.UpgradeSeriesValidate, "validating units"); err != nil {
		return errors.Trace(err)
	}
	return nil
}

// handlePrepareStarted handles the workflow for the machine with an
// upgrade-series lock status of "UpgradeSeriesPrepareStarted".
func (w *upgradeSeriesWorker) handlePrepareStarted() error {
	var err error
	if err = w.SetInstanceStatus(model.UpgradeSeriesPrepareStarted, "preparing units"); err != nil {
		return errors.Trace(err)
	}

	if !w.leadersPinned {
		if err = w.pinLeaders(); err != nil {
			return errors.Trace(err)
		}
	}

	if w.preparedUnits, err = w.UnitsPrepared(); err != nil {
		return errors.Trace(err)
	}

	// If not all the units have checked in, we are still preparing.
	prepared := names.NewSet(asGenericTags(w.preparedUnits)...)
	if remaining := w.units.Difference(prepared); remaining.Size() > 0 {
		// Not done yet.
		var names []string
		for _, tag := range remaining.SortedValues() {
			names = append(names, tag.Id())
		}
		w.logger.Debugf("waiting for units: %s", strings.Join(names, ","))
		return nil
	}

	return errors.Trace(w.transitionPrepareComplete())
}

// transitionPrepareComplete rewrites service unit files for unit agents running
// on this machine so that they are compatible with the init system of the
// series upgrade target.
func (w *upgradeSeriesWorker) transitionPrepareComplete() error {
	if err := w.SetInstanceStatus(model.UpgradeSeriesPrepareStarted, "completing preparation"); err != nil {
		return errors.Trace(err)
	}

	w.logger.Infof("preparing service units for series upgrade")
	upgrader, err := w.upgraderFactory()
	if err != nil {
		return errors.Trace(err)
	}
	if err := upgrader.PerformUpgrade(); err != nil {
		return errors.Trace(err)
	}

	if err := w.SetMachineStatus(model.UpgradeSeriesPrepareCompleted, "binaries and service files written"); err != nil {
		return errors.Trace(err)
	}

	return errors.Trace(w.SetInstanceStatus(model.UpgradeSeriesPrepareCompleted, "waiting for completion command"))
}

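// handleCompleteStarted handles the workflow for the machine with an
// upgrade-series lock status of "UpgradeSeriesCompleteStarted". It tells units
// to start their completion once all are prepared, and marks the machine as
// completed once all units have finished.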
func (w *upgradeSeriesWorker) handleCompleteStarted() error {
	if err := w.SetInstanceStatus(model.UpgradeSeriesCompleteStarted, "waiting for units"); err != nil {
		return errors.Trace(err)
	}

	var err error
	if w.preparedUnits, err = w.UnitsPrepared(); err != nil {
		return errors.Trace(err)
	}

	// If all the units are prepared, tell them to start.
	prepared := names.NewSet(asGenericTags(w.preparedUnits)...)
	if remaining := w.units.Difference(prepared); remaining.Size() == 0 && len(w.units) > 0 {
		return errors.Trace(w.StartUnitCompletion("start units after series upgrade"))
	}

	// If the units have all completed their workflow, then we are done.
	// Make the final update to the lock to say the machine is completed.
	if w.completedUnits, err = w.UnitsCompleted(); err != nil {
		return errors.Trace(err)
	}

	// If not all of the units have completed, we are still waiting.
	completed := names.NewSet(asGenericTags(w.completedUnits)...)
	if remaining := w.units.Difference(completed); remaining.Size() > 0 {
		// Not done yet.
		var names []string
		for _, tag := range remaining.SortedValues() {
			names = append(names, tag.Id())
		}
		w.logger.Debugf("waiting for units: %s", strings.Join(names, ","))
		return nil
	}

	w.logger.Infof("series upgrade complete")
	return errors.Trace(w.SetMachineStatus(model.UpgradeSeriesCompleted, "series upgrade complete"))
}

// handleCompleted notifies the server that this machine has completed the
// upgrade workflow, then unpins leadership for applications running on it.
func (w *upgradeSeriesWorker) handleCompleted() error {
	if err := w.SetInstanceStatus(model.UpgradeSeriesCompleted, "finalising upgrade"); err != nil {
		return errors.Trace(err)
	}

	b, err := hostBase()
	if err != nil {
		return errors.Trace(err)
	}
	if err = w.FinishUpgradeSeries(b); err != nil {
		return errors.Trace(err)
	}
	if err = w.unpinLeaders(); err != nil {
		return errors.Trace(err)
	}

	return errors.Trace(w.SetInstanceStatus(model.UpgradeSeriesCompleted, "success"))
}

// pinLeaders pins leadership for applications
// represented by units running on this machine.
func (w *upgradeSeriesWorker) pinLeaders() (err error) {
	// If we encounter an error, attempt to ensure that no application leaders
	// remain pinned.
	defer func() {
		if err != nil {
			if unpinErr := w.unpinLeaders(); unpinErr != nil {
				err = errors.Wrap(err, unpinErr)
			}
		}
	}()

	results, err := w.PinMachineApplications()
	if err != nil {
		// If PinMachineApplications returns not-implemented because the
		// controller is using the legacy lease store, log the condition and
		// return early. Unpinning leaders remains safe, as it is treated as
		// a no-op.
		if params.IsCodeNotImplemented(err) {
			w.logger.Infof("failed to pin machine applications; leadership pinning is not implemented with the legacy lease manager")
			return nil
		}
		return errors.Trace(err)
	}

	var lastErr error
	for app, err := range results {
		if err == nil {
			w.logger.Infof("pinned leader for application %q", app)
			continue
		}
		w.logger.Errorf("failed to pin leader for application %q: %s", app, err.Error())
		lastErr = err
	}

	if lastErr == nil {
		w.leadersPinned = true
		return nil
	}
	return errors.Trace(lastErr)
}

// unpinLeaders unpins leadership for applications
// represented by units running on this machine.
func (w *upgradeSeriesWorker) unpinLeaders() error {
	results, err := w.UnpinMachineApplications()
	if err != nil {
		return errors.Trace(err)
	}

	var lastErr error
	for app, err := range results {
		if err == nil {
			w.logger.Infof("unpinned leader for application %q", app)
			continue
		}
		w.logger.Errorf("failed to unpin leader for application %q: %s", app, err.Error())
		lastErr = err
	}

	if lastErr == nil {
		w.leadersPinned = false
		return nil
	}
	return errors.Trace(lastErr)
}

// Report (worker.Reporter) generates a report for the Juju engine.
func (w *upgradeSeriesWorker) Report() map[string]interface{} {
	w.mu.Lock()
	defer w.mu.Unlock()

	report := map[string]interface{}{"machine status": w.machineStatus}

	if len(w.preparedUnits) > 0 {
		units := make([]string, len(w.preparedUnits))
		for i, u := range w.preparedUnits {
			units[i] = u.Id()
		}
		report["prepared units"] = units
	}

	if len(w.completedUnits) > 0 {
		units := make([]string, len(w.completedUnits))
		for i, u := range w.completedUnits {
			units[i] = u.Id()
		}
		report["completed units"] = units
	}

	return report
}

// Kill implements worker.Worker.Kill.
func (w *upgradeSeriesWorker) Kill() {
	w.catacomb.Kill(nil)
}

// Wait implements worker.Worker.Wait.
func (w *upgradeSeriesWorker) Wait() error {
	return w.catacomb.Wait()
}

// Stop stops the upgrade-series worker and returns any
// error it encountered when running.
func (w *upgradeSeriesWorker) Stop() error {
	w.Kill()
	return w.Wait()
}

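// asGenericTags converts a slice of unit tags into a slice of generic tags,
// suitable for constructing a names.Set.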
func asGenericTags(units []names.UnitTag) []names.Tag {
	result := make([]names.Tag, len(units))
	for i, tag := range units {
		result[i] = tag
	}
	return result
}