github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/upgradeseries/worker.go (about)

     1  // Copyright 2018 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package upgradeseries
     5  
     6  import (
     7  	"strings"
     8  	"sync"
     9  
    10  	"github.com/juju/errors"
    11  	"github.com/juju/names/v5"
    12  	"github.com/juju/os/v2/series"
    13  	"github.com/juju/worker/v3"
    14  	"github.com/juju/worker/v3/catacomb"
    15  
    16  	"github.com/juju/juju/core/model"
    17  	"github.com/juju/juju/rpc/params"
    18  )
    19  
    20  //go:generate go run go.uber.org/mock/mockgen -package mocks -destination mocks/package_mock.go github.com/juju/juju/worker/upgradeseries Facade,UnitDiscovery,Upgrader
    21  
// hostSeries is a package-level alias for series.HostSeries. handleCompleted
// calls it to obtain the series that is passed to FinishUpgradeSeries.
// NOTE(review): presumably held in a variable so tests can substitute it — confirm.
var hostSeries = series.HostSeries
    23  
// Logger represents the methods required to emit log messages.
// Every method takes a format string followed by the values to format.
type Logger interface {
	// Debugf is used by the worker to report which units it is still
	// waiting on.
	Debugf(message string, args ...interface{})
	// Infof is used by the worker for status transitions and for
	// successful leadership pin/unpin operations.
	Infof(message string, args ...interface{})
	// Warningf is part of the required method set; it is not called from
	// this file.
	Warningf(message string, args ...interface{})
	// Errorf is used by the worker for failures it logs, such as a failed
	// leadership pin/unpin or a failed error-status update.
	Errorf(message string, args ...interface{})
}
    31  
// UnitDiscovery represents how the worker determines which units need
// to check in.
type UnitDiscovery interface {
	// Units returns the tags of the units the worker must wait for, or an
	// error. The worker calls it while handling a change when it has no
	// unit set cached yet, and stores the result as a set.
	Units() ([]names.UnitTag, error)
}
    37  
// Config is the configuration needed to construct an UpgradeSeries worker.
// Every field is required; Validate rejects a nil value for any of them.
type Config struct {
	// Facade is used to access back-end state.
	Facade Facade

	// Logger is the logger for this worker.
	Logger Logger

	// UnitDiscovery determines how the worker knows which units should
	// be running on the machine.
	UnitDiscovery UnitDiscovery

	// UpgraderFactory is a factory method that will return an upgrader capable
	// of handling service and agent binary manipulation for a
	// runtime-determined current and target OS series.
	// The worker calls it with the current series as the first argument
	// and the target series as the second.
	UpgraderFactory func(string, string) (Upgrader, error)
}
    55  
    56  // Validate validates the upgrade-series worker configuration.
    57  func (config Config) Validate() error {
    58  	if config.Logger == nil {
    59  		return errors.NotValidf("nil Logger")
    60  	}
    61  	if config.Facade == nil {
    62  		return errors.NotValidf("nil Facade")
    63  	}
    64  	if config.UnitDiscovery == nil {
    65  		return errors.NotValidf("nil UnitDiscovery")
    66  	}
    67  	if config.UpgraderFactory == nil {
    68  		return errors.NotValidf("nil UpgraderFactory")
    69  	}
    70  	return nil
    71  }
    72  
// upgradeSeriesWorker is responsible for machine and unit agent requirements
// during upgrade-series:
//
//	copying the agent binary directory and renaming;
//	rewriting the machine and unit(s) systemd files if necessary;
//	ensuring unit agents are started post-upgrade;
//	moving the status of the upgrade-series steps along.
//
// The Facade is embedded, so back-end calls such as MachineStatus or
// SetInstanceStatus are made directly on the worker.
type upgradeSeriesWorker struct {
	Facade

	// The catacomb runs loop and owns the notification watcher; the other
	// fields in this group are copied from Config by NewWorker.
	catacomb        catacomb.Catacomb
	logger          Logger
	unitDiscovery   UnitDiscovery
	upgraderFactory func(string, string) (Upgrader, error)

	// Some local state retained for reporting purposes.
	// mu is held by handleUpgradeSeriesChange (and therefore by the
	// handlers it calls) and by Report. units is filled from unitDiscovery
	// the first time a change is handled with a lock present;
	// preparedUnits and completedUnits hold the latest results of
	// UnitsPrepared and UnitsCompleted.
	mu             sync.Mutex
	machineStatus  model.UpgradeSeriesStatus
	units          names.Set
	preparedUnits  []names.UnitTag
	completedUnits []names.UnitTag

	// Ensure that leaders are pinned only once if possible,
	// on the first transition to UpgradeSeriesPrepareStarted.
	// However repeated pin calls are not of too much concern,
	// as the pin operations are idempotent.
	leadersPinned bool
}
   101  
   102  // NewWorker creates, starts and returns a new upgrade-series worker based on
   103  // the input configuration.
   104  func NewWorker(config Config) (worker.Worker, error) {
   105  	if err := config.Validate(); err != nil {
   106  		return nil, errors.Trace(err)
   107  	}
   108  
   109  	w := &upgradeSeriesWorker{
   110  		Facade:          config.Facade,
   111  		logger:          config.Logger,
   112  		unitDiscovery:   config.UnitDiscovery,
   113  		upgraderFactory: config.UpgraderFactory,
   114  		machineStatus:   model.UpgradeSeriesNotStarted,
   115  		leadersPinned:   false,
   116  	}
   117  
   118  	if err := catacomb.Invoke(catacomb.Plan{
   119  		Site: &w.catacomb,
   120  		Work: w.loop,
   121  	}); err != nil {
   122  		return nil, errors.Trace(err)
   123  	}
   124  
   125  	return w, nil
   126  }
   127  
   128  func (w *upgradeSeriesWorker) loop() error {
   129  	uw, err := w.WatchUpgradeSeriesNotifications()
   130  	if err != nil {
   131  		return errors.Trace(err)
   132  	}
   133  	err = w.catacomb.Add(uw)
   134  	if err != nil {
   135  		return errors.Trace(err)
   136  	}
   137  	for {
   138  		select {
   139  		case <-w.catacomb.Dying():
   140  			return w.catacomb.ErrDying()
   141  		case <-uw.Changes():
   142  			if err := w.handleUpgradeSeriesChange(); err != nil {
   143  				return errors.Trace(err)
   144  			}
   145  		}
   146  	}
   147  }
   148  
   149  // handleUpgradeSeriesChange retrieves the current upgrade-series status for
   150  // this machine and based on the status, calls methods that will progress
   151  // the workflow accordingly.
   152  func (w *upgradeSeriesWorker) handleUpgradeSeriesChange() error {
   153  	w.mu.Lock()
   154  	defer w.mu.Unlock()
   155  
   156  	var err error
   157  	if w.machineStatus, err = w.MachineStatus(); err != nil {
   158  		if errors.IsNotFound(err) {
   159  			// No upgrade-series lock. This can happen when:
   160  			// - The first watch call is made.
   161  			// - The lock is removed after a completed upgrade.
   162  			w.logger.Infof("no series upgrade lock present")
   163  			w.machineStatus = model.UpgradeSeriesNotStarted
   164  			w.preparedUnits = nil
   165  			w.completedUnits = nil
   166  			return nil
   167  		}
   168  		return errors.Trace(err)
   169  	}
   170  	w.logger.Infof("machine series upgrade status is %q", w.machineStatus)
   171  
   172  	// Determine the set of units that are on the machine.
   173  	if w.units == nil {
   174  		units, err := w.unitDiscovery.Units()
   175  		if err != nil {
   176  			return errors.Annotate(err, "unit discovery")
   177  		}
   178  		w.units = names.NewSet(asGenericTags(units)...)
   179  	}
   180  
   181  	switch w.machineStatus {
   182  	case model.UpgradeSeriesValidate:
   183  		err = w.handleValidate()
   184  	case model.UpgradeSeriesPrepareStarted:
   185  		err = w.handlePrepareStarted()
   186  	case model.UpgradeSeriesCompleteStarted:
   187  		err = w.handleCompleteStarted()
   188  	case model.UpgradeSeriesCompleted:
   189  		err = w.handleCompleted()
   190  	}
   191  
   192  	if err != nil {
   193  		if err := w.SetInstanceStatus(model.UpgradeSeriesError, err.Error()); err != nil {
   194  			w.logger.Errorf("failed to set series upgrade error status: %s", err.Error())
   195  		}
   196  	}
   197  	return errors.Trace(err)
   198  }
   199  
   200  // handleValidate handles the workflow for the machine with validating the
   201  // given set of machine applications and charms.
   202  func (w *upgradeSeriesWorker) handleValidate() error {
   203  	if err := w.SetInstanceStatus(model.UpgradeSeriesValidate, "validating units"); err != nil {
   204  		return errors.Trace(err)
   205  	}
   206  	return nil
   207  }
   208  
   209  // handlePrepareStarted handles workflow for the machine with an upgrade-series
   210  // lock status of "UpgradeSeriesPrepareStarted"
   211  func (w *upgradeSeriesWorker) handlePrepareStarted() error {
   212  	var err error
   213  	if err = w.SetInstanceStatus(model.UpgradeSeriesPrepareStarted, "preparing units"); err != nil {
   214  		return errors.Trace(err)
   215  	}
   216  
   217  	if !w.leadersPinned {
   218  		if err = w.pinLeaders(); err != nil {
   219  			return errors.Trace(err)
   220  		}
   221  	}
   222  
   223  	if w.preparedUnits, err = w.UnitsPrepared(); err != nil {
   224  		return errors.Trace(err)
   225  	}
   226  
   227  	// If not all the units have checked in, we are still preparing.
   228  	prepared := names.NewSet(asGenericTags(w.preparedUnits)...)
   229  	if remaining := w.units.Difference(prepared); remaining.Size() > 0 {
   230  		// Not done yet.
   231  		var names []string
   232  		for _, tag := range remaining.SortedValues() {
   233  			names = append(names, tag.Id())
   234  		}
   235  		w.logger.Debugf("waiting for units: %s", strings.Join(names, ","))
   236  		return nil
   237  	}
   238  
   239  	return errors.Trace(w.transitionPrepareComplete())
   240  }
   241  
   242  // transitionPrepareComplete rewrites service unit files for unit agents running
   243  // on this machine so that they are compatible with the init system of the
   244  // series upgrade target.
   245  func (w *upgradeSeriesWorker) transitionPrepareComplete() error {
   246  	if err := w.SetInstanceStatus(model.UpgradeSeriesPrepareStarted, "completing preparation"); err != nil {
   247  		return errors.Trace(err)
   248  	}
   249  
   250  	w.logger.Infof("preparing service units for series upgrade")
   251  	currentSeries, err := w.CurrentSeries()
   252  	if err != nil {
   253  		return errors.Trace(err)
   254  	}
   255  
   256  	toSeries, err := w.TargetSeries()
   257  	if err != nil {
   258  		return errors.Trace(err)
   259  	}
   260  
   261  	upgrader, err := w.upgraderFactory(currentSeries, toSeries)
   262  	if err != nil {
   263  		return errors.Trace(err)
   264  	}
   265  	if err := upgrader.PerformUpgrade(); err != nil {
   266  		return errors.Trace(err)
   267  	}
   268  
   269  	if err := w.SetMachineStatus(model.UpgradeSeriesPrepareCompleted, "binaries and service files written"); err != nil {
   270  		return errors.Trace(err)
   271  	}
   272  
   273  	return errors.Trace(w.SetInstanceStatus(model.UpgradeSeriesPrepareCompleted, "waiting for completion command"))
   274  }
   275  
   276  func (w *upgradeSeriesWorker) handleCompleteStarted() error {
   277  	if err := w.SetInstanceStatus(model.UpgradeSeriesCompleteStarted, "waiting for units"); err != nil {
   278  		return errors.Trace(err)
   279  	}
   280  
   281  	var err error
   282  	if w.preparedUnits, err = w.UnitsPrepared(); err != nil {
   283  		return errors.Trace(err)
   284  	}
   285  
   286  	// If all the units are prepared, tell them to start.
   287  	prepared := names.NewSet(asGenericTags(w.preparedUnits)...)
   288  	if remaining := w.units.Difference(prepared); remaining.Size() == 0 && len(w.units) > 0 {
   289  		return errors.Trace(w.StartUnitCompletion("start units after series upgrade"))
   290  	}
   291  
   292  	// If the units have all completed their workflow, then we are done.
   293  	// Make the final update to the lock to say the machine is completed.
   294  	if w.completedUnits, err = w.UnitsCompleted(); err != nil {
   295  		return errors.Trace(err)
   296  	}
   297  
   298  	// If not all the units have checked in, we are still preparing.
   299  	completed := names.NewSet(asGenericTags(w.completedUnits)...)
   300  	if remaining := w.units.Difference(completed); remaining.Size() > 0 {
   301  		// Not done yet.
   302  		var names []string
   303  		for _, tag := range remaining.SortedValues() {
   304  			names = append(names, tag.Id())
   305  		}
   306  		w.logger.Debugf("waiting for units: %s", strings.Join(names, ","))
   307  		return nil
   308  	}
   309  
   310  	w.logger.Infof("series upgrade complete")
   311  	return errors.Trace(w.SetMachineStatus(model.UpgradeSeriesCompleted, "series upgrade complete"))
   312  }
   313  
   314  // handleCompleted notifies the server that it has completed the upgrade
   315  // workflow, then unpins leadership for applications running on the machine.
   316  func (w *upgradeSeriesWorker) handleCompleted() error {
   317  	if err := w.SetInstanceStatus(model.UpgradeSeriesCompleted, "finalising upgrade"); err != nil {
   318  		return errors.Trace(err)
   319  	}
   320  
   321  	s, err := hostSeries()
   322  	if err != nil {
   323  		return errors.Trace(err)
   324  	}
   325  	if err = w.FinishUpgradeSeries(s); err != nil {
   326  		return errors.Trace(err)
   327  	}
   328  	if err = w.unpinLeaders(); err != nil {
   329  		return errors.Trace(err)
   330  	}
   331  
   332  	return errors.Trace(w.SetInstanceStatus(model.UpgradeSeriesCompleted, "success"))
   333  }
   334  
   335  // pinLeaders pins leadership for applications
   336  // represented by units running on this machine.
   337  func (w *upgradeSeriesWorker) pinLeaders() (err error) {
   338  	// if we encounter an error,
   339  	// attempt to ensure that no application leaders remain pinned.
   340  	defer func() {
   341  		if err != nil {
   342  			if unpinErr := w.unpinLeaders(); unpinErr != nil {
   343  				err = errors.Wrap(err, unpinErr)
   344  			}
   345  		}
   346  	}()
   347  
   348  	results, err := w.PinMachineApplications()
   349  	if err != nil {
   350  		// If pin machine applications method return not implemented because it's
   351  		// utilising the legacy leases store, then we should display the warning
   352  		// in the log and return out. Unpinning leaders should be safe as that
   353  		// should be considered a no-op
   354  		if params.IsCodeNotImplemented(err) {
   355  			w.logger.Infof("failed to pin machine applications, with legacy lease manager leadership pinning is not implemented")
   356  			return nil
   357  		}
   358  		return errors.Trace(err)
   359  	}
   360  
   361  	var lastErr error
   362  	for app, err := range results {
   363  		if err == nil {
   364  			w.logger.Infof("unpin leader for application %q", app)
   365  			continue
   366  		}
   367  		w.logger.Errorf("failed to pin leader for application %q: %s", app, err.Error())
   368  		lastErr = err
   369  	}
   370  
   371  	if lastErr == nil {
   372  		w.leadersPinned = true
   373  		return nil
   374  	}
   375  	return errors.Trace(lastErr)
   376  }
   377  
   378  // unpinLeaders unpins leadership for applications
   379  // represented by units running on this machine.
   380  func (w *upgradeSeriesWorker) unpinLeaders() error {
   381  	results, err := w.UnpinMachineApplications()
   382  	if err != nil {
   383  		return errors.Trace(err)
   384  	}
   385  
   386  	var lastErr error
   387  	for app, err := range results {
   388  		if err == nil {
   389  			w.logger.Infof("unpinned leader for application %q", app)
   390  			continue
   391  		}
   392  		w.logger.Errorf("failed to unpin leader for application %q: %s", app, err.Error())
   393  		lastErr = err
   394  	}
   395  
   396  	if lastErr == nil {
   397  		w.leadersPinned = false
   398  		return nil
   399  	}
   400  	return errors.Trace(lastErr)
   401  }
   402  
   403  // Report (worker.Reporter) generates a report for the Juju engine.
   404  func (w *upgradeSeriesWorker) Report() map[string]interface{} {
   405  	w.mu.Lock()
   406  	defer w.mu.Unlock()
   407  
   408  	report := map[string]interface{}{"machine status": w.machineStatus}
   409  
   410  	if len(w.preparedUnits) > 0 {
   411  		units := make([]string, len(w.preparedUnits))
   412  		for i, u := range w.preparedUnits {
   413  			units[i] = u.Id()
   414  		}
   415  		report["prepared units"] = units
   416  	}
   417  
   418  	if len(w.completedUnits) > 0 {
   419  		units := make([]string, len(w.completedUnits))
   420  		for i, u := range w.completedUnits {
   421  			units[i] = u.Id()
   422  		}
   423  		report["completed units"] = units
   424  	}
   425  
   426  	return report
   427  }
   428  
// Kill implements worker.Worker.Kill.
// It kills the worker's catacomb with a nil error and does not wait for
// the worker to finish; use Wait for that.
func (w *upgradeSeriesWorker) Kill() {
	w.catacomb.Kill(nil)
}
   433  
// Wait implements worker.Worker.Wait.
// It returns whatever the worker's catacomb reports from its own Wait.
func (w *upgradeSeriesWorker) Wait() error {
	return w.catacomb.Wait()
}
   438  
   439  // Stop stops the upgrade-series worker and returns any
   440  // error it encountered when running.
   441  func (w *upgradeSeriesWorker) Stop() error {
   442  	w.Kill()
   443  	return w.Wait()
   444  }
   445  
   446  func asGenericTags(units []names.UnitTag) []names.Tag {
   447  	result := make([]names.Tag, len(units))
   448  	for i, tag := range units {
   449  		result[i] = tag
   450  	}
   451  	return result
   452  }