github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/worker/instancepoller/updater.go (about)

     1  // Copyright 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package instancepoller
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/clock"
    10  	"github.com/juju/errors"
    11  	"github.com/juju/loggo"
    12  	"gopkg.in/juju/names.v2"
    13  
    14  	"github.com/juju/juju/apiserver/params"
    15  	"github.com/juju/juju/core/instance"
    16  	"github.com/juju/juju/core/status"
    17  	"github.com/juju/juju/core/watcher"
    18  	"github.com/juju/juju/network"
    19  )
    20  
    21  var logger = loggo.GetLogger("juju.worker.instancepoller")
    22  
    23  // ShortPoll and LongPoll hold the polling intervals for the instance
    24  // updater. When a machine has no address or is not started, it will be
    25  // polled at ShortPoll intervals until it does, exponentially backing off
    26  // with an exponent of ShortPollBackoff until a maximum(ish) of LongPoll.
    27  //
    28  // When a machine has an address and is started LongPoll will be used to
    29  // check that the instance address or status has not changed.
    30  var (
    31  	ShortPoll        = 1 * time.Second
    32  	ShortPollBackoff = 2.0
    33  	LongPoll         = 15 * time.Minute
    34  )
    35  
    36  type machine interface {
    37  	Id() string
    38  	Tag() names.MachineTag
    39  	InstanceId() (instance.Id, error)
    40  	ProviderAddresses() ([]network.Address, error)
    41  	SetProviderAddresses(...network.Address) error
    42  	InstanceStatus() (params.StatusResult, error)
    43  	SetInstanceStatus(status.Status, string, map[string]interface{}) error
    44  	String() string
    45  	Refresh() error
    46  	Life() params.Life
    47  	Status() (params.StatusResult, error)
    48  	IsManual() (bool, error)
    49  }
    50  
    51  type instanceInfo struct {
    52  	addresses []network.Address
    53  	status    instance.Status
    54  }
    55  
    56  // lifetimeContext was extracted to allow the various context clients to get
    57  // the benefits of the catacomb encapsulating everything that should happen
    58  // here. A clean implementation would almost certainly not need this.
    59  type lifetimeContext interface {
    60  	kill(error)
    61  	dying() <-chan struct{}
    62  	errDying() error
    63  }
    64  
    65  type machineContext interface {
    66  	lifetimeContext
    67  	instanceInfo(id instance.Id) (instanceInfo, error)
    68  }
    69  
    70  type updaterContext interface {
    71  	lifetimeContext
    72  	newMachineContext() machineContext
    73  	getMachine(tag names.MachineTag) (machine, error)
    74  }
    75  
    76  type updater struct {
    77  	context     updaterContext
    78  	machines    map[names.MachineTag]chan struct{}
    79  	machineDead chan machine
    80  }
    81  
    82  // watchMachinesLoop watches for changes provided by the given
    83  // machinesWatcher and starts machine goroutines to deal with them,
    84  // using the provided newMachineContext function to create the
    85  // appropriate context for each new machine tag.
    86  func watchMachinesLoop(context updaterContext, machinesWatcher watcher.StringsWatcher) (err error) {
    87  	p := &updater{
    88  		context:     context,
    89  		machines:    make(map[names.MachineTag]chan struct{}),
    90  		machineDead: make(chan machine),
    91  	}
    92  	defer func() {
    93  		// TODO(fwereade): is this a home-grown sync.WaitGroup or something?
    94  		// strongly suspect these machine goroutines could be managed rather
    95  		// less opaquely if we made them all workers.
    96  		for len(p.machines) > 0 {
    97  			delete(p.machines, (<-p.machineDead).Tag())
    98  		}
    99  	}()
   100  	for {
   101  		select {
   102  		case <-p.context.dying():
   103  			return p.context.errDying()
   104  		case ids, ok := <-machinesWatcher.Changes():
   105  			if !ok {
   106  				return errors.New("machines watcher closed")
   107  			}
   108  			tags := make([]names.MachineTag, len(ids))
   109  			for i := range ids {
   110  				tags[i] = names.NewMachineTag(ids[i])
   111  			}
   112  			if err := p.startMachines(tags); err != nil {
   113  				return err
   114  			}
   115  		case m := <-p.machineDead:
   116  			delete(p.machines, m.Tag())
   117  		}
   118  	}
   119  }
   120  
   121  func (p *updater) startMachines(tags []names.MachineTag) error {
   122  	for _, tag := range tags {
   123  		if c := p.machines[tag]; c == nil {
   124  			// We don't know about the machine - start
   125  			// a goroutine to deal with it.
   126  			m, err := p.context.getMachine(tag)
   127  			if err != nil {
   128  				return errors.Trace(err)
   129  			}
   130  			// We don't poll manual machines, instead we're setting the status to 'running'
   131  			// as we don't have any better information from the provider, see lp:1678981
   132  			isManual, err := m.IsManual()
   133  			if err != nil {
   134  				return errors.Trace(err)
   135  			}
   136  			if isManual {
   137  				statusInfo, err := m.Status()
   138  				if err != nil {
   139  					return errors.Trace(err)
   140  				}
   141  				machineStatus := status.Status(statusInfo.Status)
   142  				if machineStatus != status.Running {
   143  					if err = m.SetInstanceStatus(status.Running, "Manually provisioned machine", nil); err != nil {
   144  						logger.Errorf("cannot set instance status on %q: %v", m, err)
   145  					}
   146  				}
   147  				continue
   148  			}
   149  			c = make(chan struct{})
   150  			p.machines[tag] = c
   151  			// TODO(fwereade): 2016-03-17 lp:1558657
   152  			go runMachine(p.context.newMachineContext(), m, c, p.machineDead, clock.WallClock)
   153  		} else {
   154  			select {
   155  			case <-p.context.dying():
   156  				return p.context.errDying()
   157  			case c <- struct{}{}:
   158  			}
   159  		}
   160  	}
   161  	return nil
   162  }
   163  
   164  // runMachine processes the address and status publishing for a given machine.
   165  // We assume that the machine is alive when this is first called.
   166  func runMachine(context machineContext, m machine, changed <-chan struct{}, died chan<- machine, clock clock.Clock) {
   167  	defer func() {
   168  		// We can't just send on the died channel because the
   169  		// central loop might be trying to write to us on the
   170  		// changed channel.
   171  		for {
   172  			select {
   173  			case died <- m:
   174  				return
   175  			case <-changed:
   176  			}
   177  		}
   178  	}()
   179  	if err := machineLoop(context, m, changed, clock); err != nil {
   180  		context.kill(err)
   181  	}
   182  }
   183  
   184  func machineLoop(context machineContext, m machine, lifeChanged <-chan struct{}, clock clock.Clock) error {
   185  	// Use a short poll interval when initially waiting for
   186  	// a machine's address and machine agent to start, and a long one when it already
   187  	// has an address and the machine agent is started.
   188  	pollInterval := ShortPoll
   189  	pollInstance := func() error {
   190  		instInfo, err := pollInstanceInfo(context, m)
   191  		if err != nil {
   192  			return err
   193  		}
   194  
   195  		machineStatus := status.Pending
   196  		if err == nil {
   197  			if statusInfo, err := m.Status(); err != nil {
   198  				logger.Warningf("cannot get current machine status for machine %v: %v", m.Id(), err)
   199  			} else {
   200  				// TODO(perrito666) add status validation.
   201  				machineStatus = status.Status(statusInfo.Status)
   202  			}
   203  		}
   204  
   205  		// the extra condition below (checking allocating/pending) is here to improve user experience
   206  		// without it the instance status will say "pending" for +10 minutes after the agent comes up to "started"
   207  		if instInfo.status.Status != status.Allocating && instInfo.status.Status != status.Pending {
   208  			if len(instInfo.addresses) > 0 && machineStatus == status.Started {
   209  				// We've got at least one address and a status and instance is started, so poll infrequently.
   210  				pollInterval = LongPoll
   211  			} else if pollInterval < LongPoll {
   212  				// We have no addresses or not started - poll increasingly rarely
   213  				// until we do.
   214  				pollInterval = time.Duration(float64(pollInterval) * ShortPollBackoff)
   215  				if pollInterval > LongPoll {
   216  					pollInterval = LongPoll
   217  				}
   218  			}
   219  		}
   220  		return nil
   221  	}
   222  
   223  	shouldPollInstance := true
   224  	for {
   225  		if shouldPollInstance {
   226  			if err := pollInstance(); err != nil {
   227  				if !params.IsCodeNotProvisioned(err) {
   228  					return errors.Trace(err)
   229  				}
   230  			}
   231  			shouldPollInstance = false
   232  		}
   233  		select {
   234  		case <-context.dying():
   235  			return context.errDying()
   236  		case <-clock.After(pollInterval):
   237  			shouldPollInstance = true
   238  		case <-lifeChanged:
   239  			if err := m.Refresh(); err != nil {
   240  				return err
   241  			}
   242  			if m.Life() == params.Dead {
   243  				return nil
   244  			}
   245  		}
   246  	}
   247  }
   248  
   249  // pollInstanceInfo checks the current provider addresses and status
   250  // for the given machine's instance, and sets them on the machine if they've changed.
   251  func pollInstanceInfo(context machineContext, m machine) (instInfo instanceInfo, err error) {
   252  	instInfo = instanceInfo{}
   253  	instId, err := m.InstanceId()
   254  	// We can't ask the machine for its addresses if it isn't provisioned yet.
   255  	if params.IsCodeNotProvisioned(err) {
   256  		return instanceInfo{}, err
   257  	}
   258  	if err != nil {
   259  		return instanceInfo{}, errors.Annotate(err, "cannot get machine's instance id")
   260  	}
   261  	instInfo, err = context.instanceInfo(instId)
   262  	if err != nil {
   263  		// TODO (anastasiamac 2016-02-01) This does not look like it needs to be removed now.
   264  		if params.IsCodeNotImplemented(err) {
   265  			return instanceInfo{}, err
   266  		}
   267  		logger.Warningf("cannot get instance info for instance %q: %v", instId, err)
   268  		return instInfo, nil
   269  	}
   270  	if instStat, err := m.InstanceStatus(); err != nil {
   271  		// This should never occur since the machine is provisioned.
   272  		// But just in case, we reset polled status so we try again next time.
   273  		logger.Warningf("cannot get current instance status for machine %v: %v", m.Id(), err)
   274  		instInfo.status = instance.Status{status.Unknown, ""}
   275  	} else {
   276  		// TODO(perrito666) add status validation.
   277  		currentInstStatus := instance.Status{
   278  			Status:  status.Status(instStat.Status),
   279  			Message: instStat.Info,
   280  		}
   281  		if instInfo.status != currentInstStatus {
   282  			logger.Infof("machine %q instance status changed from %q to %q", m.Id(), currentInstStatus, instInfo.status)
   283  			if err = m.SetInstanceStatus(instInfo.status.Status, instInfo.status.Message, nil); err != nil {
   284  				logger.Errorf("cannot set instance status on %q: %v", m, err)
   285  				return instanceInfo{}, err
   286  			}
   287  		}
   288  
   289  	}
   290  	if m.Life() != params.Dead {
   291  		providerAddresses, err := m.ProviderAddresses()
   292  		if err != nil {
   293  			return instanceInfo{}, err
   294  		}
   295  		if !addressesEqual(providerAddresses, instInfo.addresses) {
   296  			logger.Infof("machine %q has new addresses: %v", m.Id(), instInfo.addresses)
   297  			if err := m.SetProviderAddresses(instInfo.addresses...); err != nil {
   298  				logger.Errorf("cannot set addresses on %q: %v", m, err)
   299  				return instanceInfo{}, err
   300  			}
   301  		}
   302  	}
   303  	return instInfo, nil
   304  }
   305  
   306  // addressesEqual compares the addresses of the machine and the instance information.
   307  func addressesEqual(a0, a1 []network.Address) bool {
   308  	if len(a0) != len(a1) {
   309  		logger.Tracef("address lists have different lengths %d != %d for %v != %v",
   310  			len(a0), len(a1), a0, a1)
   311  		return false
   312  	}
   313  
   314  	ca0 := make([]network.Address, len(a0))
   315  	copy(ca0, a0)
   316  	network.SortAddresses(ca0)
   317  	ca1 := make([]network.Address, len(a1))
   318  	copy(ca1, a1)
   319  	network.SortAddresses(ca1)
   320  
   321  	for i := range ca0 {
   322  		if ca0[i] != ca1[i] {
   323  			logger.Tracef("address entry at offset %d has a different value for %v != %v",
   324  				i, ca0, ca1)
   325  			return false
   326  		}
   327  	}
   328  	return true
   329  }