github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/instancepoller/worker.go (about)

     1  // Copyright 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package instancepoller
     5  
     6  import (
     7  	stdcontext "context"
     8  	"time"
     9  
    10  	"github.com/juju/clock"
    11  	"github.com/juju/errors"
    12  	"github.com/juju/names/v5"
    13  	"github.com/juju/worker/v3"
    14  	"github.com/juju/worker/v3/catacomb"
    15  
    16  	"github.com/juju/juju/core/instance"
    17  	"github.com/juju/juju/core/life"
    18  	"github.com/juju/juju/core/network"
    19  	"github.com/juju/juju/core/status"
    20  	"github.com/juju/juju/core/watcher"
    21  	"github.com/juju/juju/environs"
    22  	"github.com/juju/juju/environs/context"
    23  	"github.com/juju/juju/environs/instances"
    24  	"github.com/juju/juju/rpc/params"
    25  	"github.com/juju/juju/worker/common"
    26  )
    27  
// ShortPoll and LongPoll hold the polling intervals for the instance
// updater. When a machine has no address or is not started, it will be
// polled at ShortPoll intervals until it does, exponentially backing off
// with an exponent of ShortPollBackoff until a maximum of ShortPollCap is
// reached.
//
// When a machine has an address and is started LongPoll will be used to
// check that the instance address or status has not changed.
//
// ShortPoll is also the period of the worker's short poll timer: members of
// the short poll group are examined every ShortPoll, but each one is only
// polled once its own (possibly backed-off) deadline has passed.
var (
	ShortPoll        = 3 * time.Second
	ShortPollBackoff = 2.0
	ShortPollCap     = 1 * time.Minute
	LongPoll         = 15 * time.Minute
)
    42  
// Environ specifies the provider-specific methods needed by the instance
// poller.
type Environ interface {
	// Instances looks up the provider instances for the given IDs. The
	// worker treats the result as positional (the i-th result belongs to
	// the i-th ID), treats a nil element as an instance that could not be
	// found, and handles environs.ErrPartialInstances and
	// environs.ErrNoInstances specially instead of failing.
	Instances(ctx context.ProviderCallContext, ids []instance.Id) ([]instances.Instance, error)
	// NetworkInterfaces looks up the network interfaces for the given
	// instance IDs. The worker indexes the result positionally as well
	// and tolerates a nil result.
	NetworkInterfaces(ctx context.ProviderCallContext, ids []instance.Id) ([]network.InterfaceInfos, error)
}
    49  
// Machine specifies an interface for machine instances processed by the
// instance poller.
type Machine interface {
	// Id is used by the worker to identify the machine in log and error
	// messages.
	Id() string
	// InstanceId is resolved lazily by the worker; an error carrying the
	// "not provisioned" code makes the worker retry later.
	InstanceId() (instance.Id, error)
	// SetProviderNetworkConfig receives the interfaces reported by the
	// provider. The worker uses the returned addresses for their count
	// and logs them when the returned flag is true.
	SetProviderNetworkConfig(network.InterfaceInfos) (network.ProviderAddresses, bool, error)
	// InstanceStatus is compared against the status reported by the
	// provider to decide whether SetInstanceStatus must be called.
	InstanceStatus() (params.StatusResult, error)
	SetInstanceStatus(status.Status, string, map[string]interface{}) error
	String() string
	// Refresh and Life are used together to detect machines that are
	// dead or no longer found so they can be dropped from polling.
	Refresh() error
	// Status is the machine (as opposed to instance) status; the worker
	// checks it for status.Started when choosing a poll group.
	Status() (params.StatusResult, error)
	Life() life.Value
	// IsManual reports whether the machine was manually provisioned;
	// such machines are never polled by the worker.
	IsManual() (bool, error)
}
    64  
// FacadeAPI specifies the api-server methods needed by the instance
// poller.
type FacadeAPI interface {
	// WatchModelMachines returns a watcher whose change values are
	// interpreted by the worker as machine IDs.
	WatchModelMachines() (watcher.StringsWatcher, error)
	// Machine returns the Machine for the given tag. The worker calls it
	// the first time it sees a change for a machine it is not tracking.
	Machine(tag names.MachineTag) (Machine, error)
}
    71  
// Config encapsulates the configuration options for instantiating a new
// instance poller worker.
//
// All fields are required; see Validate. Clock drives the poll timers and
// poll deadlines, Facade provides the machine watcher and machine lookups,
// Environ is queried for instance and network information, and Logger
// receives the worker's log output.
type Config struct {
	Clock   clock.Clock
	Facade  FacadeAPI
	Environ Environ
	Logger  Logger

	// CredentialAPI is handed to common.NewCloudCallContextFunc to build
	// the call contexts used for provider calls.
	CredentialAPI common.CredentialAPI
}
    82  
    83  // Validate checks whether the worker configuration settings are valid.
    84  func (config Config) Validate() error {
    85  	if config.Clock == nil {
    86  		return errors.NotValidf("nil clock.Clock")
    87  	}
    88  	if config.Facade == nil {
    89  		return errors.NotValidf("nil Facade")
    90  	}
    91  	if config.Environ == nil {
    92  		return errors.NotValidf("nil Environ")
    93  	}
    94  	if config.Logger == nil {
    95  		return errors.NotValidf("nil Logger")
    96  	}
    97  	if config.CredentialAPI == nil {
    98  		return errors.NotValidf("nil CredentialAPI")
    99  	}
   100  	return nil
   101  }
   102  
// pollGroupType identifies one of the worker's poll groups. The short and
// long values double as indices into updaterWorker.pollGroup.
type pollGroupType uint8

const (
	// shortPollGroup holds machines polled on the short timer, subject to
	// their individual backed-off deadlines.
	shortPollGroup pollGroupType = iota
	// longPollGroup holds machines polled on the long timer.
	longPollGroup
	// invalidPollGroup is returned by lookupPolledMachine when the machine
	// is not a member of any poll group; it must not be used as an index.
	invalidPollGroup
)
   110  
// pollGroupEntry tracks the polling state of a single machine.
type pollGroupEntry struct {
	// The machine being polled, its tag, and its instance ID. The
	// instance ID stays empty until resolveInstanceID succeeds.
	m          Machine
	tag        names.MachineTag
	instanceID instance.Id

	// The current short poll interval and the earliest time at which the
	// entry should next be polled while in the short poll group.
	shortPollInterval time.Duration
	shortPollAt       time.Time
}
   119  
   120  func (e *pollGroupEntry) resetShortPollInterval(clk clock.Clock) {
   121  	e.shortPollInterval = ShortPoll
   122  	e.shortPollAt = clk.Now().Add(e.shortPollInterval)
   123  }
   124  
   125  func (e *pollGroupEntry) bumpShortPollInterval(clk clock.Clock) {
   126  	e.shortPollInterval = time.Duration(float64(e.shortPollInterval) * ShortPollBackoff)
   127  	if e.shortPollInterval > ShortPollCap {
   128  		e.shortPollInterval = ShortPollCap
   129  	}
   130  	e.shortPollAt = clk.Now().Add(e.shortPollInterval)
   131  }
   132  
// updaterWorker is the worker returned by NewWorker. Its poll state is
// manipulated from the loop method and the helpers it calls.
type updaterWorker struct {
	config   Config
	catacomb catacomb.Catacomb

	// pollGroup is indexed by shortPollGroup/longPollGroup and maps machine
	// tags to their entries. instanceIDToGroupEntry indexes the same
	// entries by instance ID once that ID has been resolved.
	// callContextFunc builds the contexts passed to provider calls.
	pollGroup              [2]map[names.MachineTag]*pollGroupEntry
	instanceIDToGroupEntry map[instance.Id]*pollGroupEntry
	callContextFunc        common.CloudCallContextFunc

	// Hook function which tests can use to be notified when the worker
	// has processed a full loop iteration.
	loopCompletedHook func()
}
   145  
   146  // NewWorker returns a worker that keeps track of
   147  // the machines in the state and polls their instance
   148  // addresses and status periodically to keep them up to date.
   149  func NewWorker(config Config) (worker.Worker, error) {
   150  	if err := config.Validate(); err != nil {
   151  		return nil, errors.Trace(err)
   152  	}
   153  	u := &updaterWorker{
   154  		config: config,
   155  		pollGroup: [2]map[names.MachineTag]*pollGroupEntry{
   156  			make(map[names.MachineTag]*pollGroupEntry),
   157  			make(map[names.MachineTag]*pollGroupEntry),
   158  		},
   159  		instanceIDToGroupEntry: make(map[instance.Id]*pollGroupEntry),
   160  		callContextFunc:        common.NewCloudCallContextFunc(config.CredentialAPI),
   161  	}
   162  	err := catacomb.Invoke(catacomb.Plan{
   163  		Site: &u.catacomb,
   164  		Work: u.loop,
   165  	})
   166  	if err != nil {
   167  		return nil, errors.Trace(err)
   168  	}
   169  	return u, nil
   170  }
   171  
// Kill is part of the worker.Worker interface. It kills the worker's
// catacomb with a nil error.
func (u *updaterWorker) Kill() {
	u.catacomb.Kill(nil)
}
   176  
// Wait is part of the worker.Worker interface. It returns whatever the
// worker's catacomb reports from its own Wait.
func (u *updaterWorker) Wait() error {
	return u.catacomb.Wait()
}
   181  
   182  func (u *updaterWorker) loop() error {
   183  	watch, err := u.config.Facade.WatchModelMachines()
   184  	if err != nil {
   185  		return errors.Trace(err)
   186  	}
   187  	if err := u.catacomb.Add(watch); err != nil {
   188  		return errors.Trace(err)
   189  	}
   190  
   191  	shortPollTimer := u.config.Clock.NewTimer(ShortPoll)
   192  	longPollTimer := u.config.Clock.NewTimer(LongPoll)
   193  	defer func() {
   194  		_ = shortPollTimer.Stop()
   195  		_ = longPollTimer.Stop()
   196  	}()
   197  
   198  	for {
   199  		select {
   200  		case <-u.catacomb.Dying():
   201  			return u.catacomb.ErrDying()
   202  		case ids, ok := <-watch.Changes():
   203  			if !ok {
   204  				return errors.New("machines watcher closed")
   205  			}
   206  
   207  			for i := range ids {
   208  				tag := names.NewMachineTag(ids[i])
   209  				if err := u.queueMachineForPolling(tag); err != nil {
   210  					return err
   211  				}
   212  			}
   213  		case <-shortPollTimer.Chan():
   214  			if err := u.pollGroupMembers(shortPollGroup); err != nil {
   215  				return err
   216  			}
   217  			shortPollTimer.Reset(ShortPoll)
   218  		case <-longPollTimer.Chan():
   219  			if err := u.pollGroupMembers(longPollGroup); err != nil {
   220  				return err
   221  			}
   222  			longPollTimer.Reset(LongPoll)
   223  		}
   224  
   225  		if u.loopCompletedHook != nil {
   226  			u.loopCompletedHook()
   227  		}
   228  	}
   229  }
   230  
   231  func (u *updaterWorker) queueMachineForPolling(tag names.MachineTag) error {
   232  	// If we are already polling this machine, check whether it is still alive
   233  	// and remove it from its poll group if it is now dead.
   234  	if entry, groupType := u.lookupPolledMachine(tag); entry != nil {
   235  		var isDead bool
   236  		if err := entry.m.Refresh(); err != nil {
   237  			// If the machine is not found, this probably means
   238  			// that it is dead and has been removed from the DB.
   239  			if !errors.IsNotFound(err) {
   240  				return errors.Trace(err)
   241  			}
   242  			isDead = true
   243  		} else if entry.m.Life() == life.Dead {
   244  			isDead = true
   245  		}
   246  
   247  		if isDead {
   248  			u.config.Logger.Debugf("removing dead machine %q (instance ID %q)", entry.m, entry.instanceID)
   249  			delete(u.pollGroup[groupType], tag)
   250  			delete(u.instanceIDToGroupEntry, entry.instanceID)
   251  			return nil
   252  		}
   253  
   254  		// Something has changed with the machine state. Reset short
   255  		// poll interval for the machine and move it to the short poll
   256  		// group (if not already there) so we immediately poll its
   257  		// status at the next interval.
   258  		u.moveEntryToPollGroup(shortPollGroup, entry)
   259  		if groupType == longPollGroup {
   260  			u.config.Logger.Debugf("moving machine %q (instance ID %q) to short poll group", entry.m, entry.instanceID)
   261  		}
   262  		return nil
   263  	}
   264  
   265  	// Get information about the machine
   266  	m, err := u.config.Facade.Machine(tag)
   267  	if err != nil {
   268  		return errors.Trace(err)
   269  	}
   270  
   271  	// We don't poll manual machines, instead we're setting the status to 'running'
   272  	// as we don't have any better information from the provider, see lp:1678981
   273  	isManual, err := m.IsManual()
   274  	if err != nil {
   275  		return errors.Trace(err)
   276  	}
   277  
   278  	if isManual {
   279  		machineStatus, err := m.InstanceStatus()
   280  		if err != nil {
   281  			return errors.Trace(err)
   282  		}
   283  		if status.Status(machineStatus.Status) != status.Running {
   284  			if err = m.SetInstanceStatus(status.Running, "Manually provisioned machine", nil); err != nil {
   285  				u.config.Logger.Errorf("cannot set instance status on %q: %v", m, err)
   286  				return err
   287  			}
   288  		}
   289  		return nil
   290  	}
   291  
   292  	// Add all new machines to the short poll group and arrange for them to
   293  	// be polled as soon as possible.
   294  	u.appendToShortPollGroup(tag, m)
   295  	return nil
   296  }
   297  
   298  func (u *updaterWorker) appendToShortPollGroup(tag names.MachineTag, m Machine) {
   299  	entry := &pollGroupEntry{
   300  		tag: tag,
   301  		m:   m,
   302  	}
   303  	entry.resetShortPollInterval(u.config.Clock)
   304  	u.pollGroup[shortPollGroup][tag] = entry
   305  }
   306  
   307  func (u *updaterWorker) moveEntryToPollGroup(toGroup pollGroupType, entry *pollGroupEntry) {
   308  	// Ensure that the entry is not present in the other group
   309  	delete(u.pollGroup[1-toGroup], entry.tag)
   310  	u.pollGroup[toGroup][entry.tag] = entry
   311  
   312  	// If moving to the short poll group reset the poll interval
   313  	if toGroup == shortPollGroup {
   314  		entry.resetShortPollInterval(u.config.Clock)
   315  	}
   316  }
   317  
   318  func (u *updaterWorker) lookupPolledMachine(tag names.MachineTag) (*pollGroupEntry, pollGroupType) {
   319  	for groupType, members := range u.pollGroup {
   320  		if found := members[tag]; found != nil {
   321  			return found, pollGroupType(groupType)
   322  		}
   323  	}
   324  	return nil, invalidPollGroup
   325  }
   326  
   327  func (u *updaterWorker) pollGroupMembers(groupType pollGroupType) error {
   328  	// Build a list of instance IDs to pass as a query to the provider.
   329  	var instList []instance.Id
   330  	now := u.config.Clock.Now()
   331  	for _, entry := range u.pollGroup[groupType] {
   332  		if groupType == shortPollGroup && now.Before(entry.shortPollAt) {
   333  			continue // we shouldn't poll this entry yet
   334  		}
   335  
   336  		if err := u.resolveInstanceID(entry); err != nil {
   337  			if params.IsCodeNotProvisioned(err) {
   338  				// machine not provisioned yet; bump its poll
   339  				// interval and re-try later (or as soon as we
   340  				// get a change for the machine)
   341  				entry.bumpShortPollInterval(u.config.Clock)
   342  				continue
   343  			}
   344  			return errors.Trace(err)
   345  		}
   346  
   347  		instList = append(instList, entry.instanceID)
   348  	}
   349  
   350  	if len(instList) == 0 {
   351  		return nil
   352  	}
   353  
   354  	ctx := stdcontext.Background()
   355  	infoList, err := u.config.Environ.Instances(u.callContextFunc(ctx), instList)
   356  	if err != nil {
   357  		switch errors.Cause(err) {
   358  		case environs.ErrPartialInstances:
   359  			// Proceed and process the ones we've found.
   360  		case environs.ErrNoInstances:
   361  			// If there were no instances recognised by the provider, we do not
   362  			// retrieve the network configuration, and will therefore have
   363  			// nothing to update.
   364  			// This can happen when machines do have instance IDs, but the
   365  			// instances themselves are shut down, such as we have seen for
   366  			// dying models.
   367  			// If we're in the short poll group bump all the poll intervals for
   368  			// entries with an instance ID. Any without an instance ID will
   369  			// already have had their intervals bumped above.
   370  			if groupType == shortPollGroup {
   371  				for _, id := range instList {
   372  					u.instanceIDToGroupEntry[id].bumpShortPollInterval(u.config.Clock)
   373  				}
   374  			}
   375  
   376  			return nil
   377  		default:
   378  			return errors.Trace(err)
   379  		}
   380  	}
   381  
   382  	netList, err := u.config.Environ.NetworkInterfaces(u.callContextFunc(ctx), instList)
   383  	if err != nil && !isPartialOrNoInstancesError(err) {
   384  		// NOTE(achilleasa): 2022-01-24: all existing providers (with the
   385  		// exception of "manual" which we don't care about in this context)
   386  		// implement the NetworkInterfaces method.
   387  		//
   388  		// This error is meant as a hint to folks working on new providers
   389  		// in the future to ensure that they implement this method.
   390  		if errors.IsNotSupported(errors.Cause(err)) {
   391  			return errors.Errorf("BUG: substrate does not implement required NetworkInterfaces method")
   392  		}
   393  
   394  		return errors.Annotate(err, "enumerating network interface list for instances")
   395  	}
   396  
   397  	for idx, info := range infoList {
   398  		var nics network.InterfaceInfos
   399  		if netList != nil {
   400  			nics = netList[idx]
   401  		}
   402  
   403  		if err := u.processOneInstance(instList[idx], info, nics, groupType); err != nil {
   404  			return errors.Trace(err)
   405  		}
   406  	}
   407  
   408  	return nil
   409  }
   410  
   411  func (u *updaterWorker) processOneInstance(
   412  	id instance.Id, info instances.Instance, nics network.InterfaceInfos, groupType pollGroupType,
   413  ) error {
   414  	entry := u.instanceIDToGroupEntry[id]
   415  
   416  	// If we received ErrPartialInstances, and this ID is one of those not found,
   417  	// and we're in the short poll group, back off the poll interval.
   418  	// This will ensure that instances that have gone away do not cause excessive
   419  	// provider call volumes.
   420  	if info == nil {
   421  		u.config.Logger.Warningf("unable to retrieve instance information for instance: %q", id)
   422  
   423  		if groupType == shortPollGroup {
   424  			entry.bumpShortPollInterval(u.config.Clock)
   425  		}
   426  		return nil
   427  	}
   428  
   429  	providerStatus, providerAddrCount, err := u.processProviderInfo(entry, info, nics)
   430  	if err != nil {
   431  		return errors.Trace(err)
   432  	}
   433  
   434  	machineStatus, err := entry.m.Status()
   435  	if err != nil {
   436  		return errors.Trace(err)
   437  	}
   438  
   439  	u.maybeSwitchPollGroup(groupType, entry, providerStatus, status.Status(machineStatus.Status), providerAddrCount)
   440  	return nil
   441  }
   442  
   443  func (u *updaterWorker) resolveInstanceID(entry *pollGroupEntry) error {
   444  	if entry.instanceID != "" {
   445  		return nil // already resolved
   446  	}
   447  
   448  	instID, err := entry.m.InstanceId()
   449  	if err != nil {
   450  		return errors.Annotatef(err, "retrieving instance ID for machine %q", entry.m.Id())
   451  	}
   452  
   453  	entry.instanceID = instID
   454  	u.instanceIDToGroupEntry[instID] = entry
   455  	return nil
   456  }
   457  
   458  // processProviderInfo updates an entry's machine status and set of provider
   459  // addresses based on the information collected from the provider. It returns
   460  // the *instance* status and the number of provider addresses currently
   461  // known for the machine.
   462  func (u *updaterWorker) processProviderInfo(
   463  	entry *pollGroupEntry, info instances.Instance, providerInterfaces network.InterfaceInfos,
   464  ) (status.Status, int, error) {
   465  	curStatus, err := entry.m.InstanceStatus()
   466  	if err != nil {
   467  		// This should never occur since the machine is provisioned. If
   468  		// it does occur, report an unknown status to move the machine to
   469  		// the short poll group.
   470  		u.config.Logger.Warningf("cannot get current instance status for machine %v (instance ID %q): %v",
   471  			entry.m.Id(), entry.instanceID, err)
   472  
   473  		return status.Unknown, -1, nil
   474  	}
   475  
   476  	// Check for status changes
   477  	providerStatus := info.Status(u.callContextFunc(stdcontext.Background()))
   478  	curInstStatus := instance.Status{
   479  		Status:  status.Status(curStatus.Status),
   480  		Message: curStatus.Info,
   481  	}
   482  
   483  	if providerStatus != curInstStatus {
   484  		u.config.Logger.Infof("machine %q (instance ID %q) instance status changed from %q to %q",
   485  			entry.m.Id(), entry.instanceID, curInstStatus, providerStatus)
   486  
   487  		if err = entry.m.SetInstanceStatus(providerStatus.Status, providerStatus.Message, nil); err != nil {
   488  			u.config.Logger.Errorf("cannot set instance status on %q: %v", entry.m, err)
   489  			return status.Unknown, -1, errors.Trace(err)
   490  		}
   491  
   492  		// If the instance is now running, we should reset the poll
   493  		// interval to make sure we can capture machine status changes
   494  		// as early as possible.
   495  		if providerStatus.Status == status.Running {
   496  			entry.resetShortPollInterval(u.config.Clock)
   497  		}
   498  	}
   499  
   500  	// We don't care about dead machines; they will be cleaned up when we
   501  	// process the following machine watcher events.
   502  	if entry.m.Life() == life.Dead {
   503  		return status.Unknown, -1, nil
   504  	}
   505  
   506  	// Check whether the provider addresses for this machine need to be
   507  	// updated.
   508  	addrCount, err := u.syncProviderAddresses(entry, providerInterfaces)
   509  	if err != nil {
   510  		return status.Unknown, -1, err
   511  	}
   512  
   513  	return providerStatus.Status, addrCount, nil
   514  }
   515  
   516  // syncProviderAddresses updates the provider addresses for this entry's machine
   517  // using either the provider sourced interface list.
   518  //
   519  // The call returns the count of provider addresses for the machine.
   520  func (u *updaterWorker) syncProviderAddresses(
   521  	entry *pollGroupEntry, providerIfaceList network.InterfaceInfos,
   522  ) (int, error) {
   523  	addrs, modified, err := entry.m.SetProviderNetworkConfig(providerIfaceList)
   524  	if err != nil {
   525  		return -1, errors.Trace(err)
   526  	} else if modified {
   527  		u.config.Logger.Infof("machine %q (instance ID %q) has new addresses: %v",
   528  			entry.m.Id(), entry.instanceID, addrs)
   529  	}
   530  
   531  	return len(addrs), nil
   532  }
   533  
   534  func (u *updaterWorker) maybeSwitchPollGroup(
   535  	curGroup pollGroupType,
   536  	entry *pollGroupEntry,
   537  	curProviderStatus,
   538  	curMachineStatus status.Status,
   539  	providerAddrCount int,
   540  ) {
   541  	if curProviderStatus == status.Allocating || curProviderStatus == status.Pending {
   542  		// Keep the machine in the short poll group until it settles.
   543  		entry.bumpShortPollInterval(u.config.Clock)
   544  		return
   545  	}
   546  
   547  	// If the machine is currently in the long poll group and it has an
   548  	// unknown status or suddenly has no network addresses, move it back to
   549  	// the short poll group.
   550  	if curGroup == longPollGroup && (curProviderStatus == status.Unknown || providerAddrCount == 0) {
   551  		u.moveEntryToPollGroup(shortPollGroup, entry)
   552  		u.config.Logger.Debugf("moving machine %q (instance ID %q) back to short poll group", entry.m, entry.instanceID)
   553  		return
   554  	}
   555  
   556  	// The machine has started and we have at least one address; move to
   557  	// the long poll group
   558  	if providerAddrCount > 0 && curMachineStatus == status.Started {
   559  		u.moveEntryToPollGroup(longPollGroup, entry)
   560  		if curGroup != longPollGroup {
   561  			u.config.Logger.Debugf("moving machine %q (instance ID %q) to long poll group", entry.m, entry.instanceID)
   562  		}
   563  		return
   564  	}
   565  
   566  	// If we are in the short poll group apply exponential backoff to the
   567  	// poll frequency allow time for the machine to boot up.
   568  	if curGroup == shortPollGroup {
   569  		entry.bumpShortPollInterval(u.config.Clock)
   570  	}
   571  }
   572  
   573  func isPartialOrNoInstancesError(err error) bool {
   574  	cause := errors.Cause(err)
   575  	return cause == environs.ErrPartialInstances || cause == environs.ErrNoInstances
   576  }