github.com/axw/juju@v0.0.0-20161005053422-4bd6544d08d4/worker/instancepoller/updater.go (about) 1 // Copyright 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package instancepoller 5 6 import ( 7 "time" 8 9 "github.com/juju/errors" 10 "github.com/juju/loggo" 11 "github.com/juju/utils/clock" 12 "gopkg.in/juju/names.v2" 13 14 "github.com/juju/juju/apiserver/params" 15 "github.com/juju/juju/instance" 16 "github.com/juju/juju/network" 17 "github.com/juju/juju/status" 18 "github.com/juju/juju/watcher" 19 ) 20 21 var logger = loggo.GetLogger("juju.worker.instancepoller") 22 23 // ShortPoll and LongPoll hold the polling intervals for the instance 24 // updater. When a machine has no address or is not started, it will be 25 // polled at ShortPoll intervals until it does, exponentially backing off 26 // with an exponent of ShortPollBackoff until a maximum(ish) of LongPoll. 27 // 28 // When a machine has an address and is started LongPoll will be used to 29 // check that the instance address or status has not changed. 30 var ( 31 ShortPoll = 1 * time.Second 32 ShortPollBackoff = 2.0 33 LongPoll = 15 * time.Minute 34 ) 35 36 type machine interface { 37 Id() string 38 Tag() names.MachineTag 39 InstanceId() (instance.Id, error) 40 ProviderAddresses() ([]network.Address, error) 41 SetProviderAddresses(...network.Address) error 42 InstanceStatus() (params.StatusResult, error) 43 SetInstanceStatus(status.Status, string, map[string]interface{}) error 44 String() string 45 Refresh() error 46 Life() params.Life 47 Status() (params.StatusResult, error) 48 IsManual() (bool, error) 49 } 50 51 type instanceInfo struct { 52 addresses []network.Address 53 status instance.InstanceStatus 54 } 55 56 // lifetimeContext was extracted to allow the various context clients to get 57 // the benefits of the catacomb encapsulating everything that should happen 58 // here. A clean implementation would almost certainly not need this. 59 type lifetimeContext interface { 60 kill(error) 61 dying() <-chan struct{} 62 errDying() error 63 } 64 65 type machineContext interface { 66 lifetimeContext 67 instanceInfo(id instance.Id) (instanceInfo, error) 68 } 69 70 type updaterContext interface { 71 lifetimeContext 72 newMachineContext() machineContext 73 getMachine(tag names.MachineTag) (machine, error) 74 } 75 76 type updater struct { 77 context updaterContext 78 machines map[names.MachineTag]chan struct{} 79 machineDead chan machine 80 } 81 82 // watchMachinesLoop watches for changes provided by the given 83 // machinesWatcher and starts machine goroutines to deal with them, 84 // using the provided newMachineContext function to create the 85 // appropriate context for each new machine tag. 86 func watchMachinesLoop(context updaterContext, machinesWatcher watcher.StringsWatcher) (err error) { 87 p := &updater{ 88 context: context, 89 machines: make(map[names.MachineTag]chan struct{}), 90 machineDead: make(chan machine), 91 } 92 defer func() { 93 // TODO(fwereade): is this a home-grown sync.WaitGroup or something? 94 // strongly suspect these machine goroutines could be managed rather 95 // less opaquely if we made them all workers. 96 for len(p.machines) > 0 { 97 delete(p.machines, (<-p.machineDead).Tag()) 98 } 99 }() 100 for { 101 select { 102 case <-p.context.dying(): 103 return p.context.errDying() 104 case ids, ok := <-machinesWatcher.Changes(): 105 if !ok { 106 return errors.New("machines watcher closed") 107 } 108 tags := make([]names.MachineTag, len(ids)) 109 for i := range ids { 110 tags[i] = names.NewMachineTag(ids[i]) 111 } 112 if err := p.startMachines(tags); err != nil { 113 return err 114 } 115 case m := <-p.machineDead: 116 delete(p.machines, m.Tag()) 117 } 118 } 119 } 120 121 func (p *updater) startMachines(tags []names.MachineTag) error { 122 for _, tag := range tags { 123 if c := p.machines[tag]; c == nil { 124 // We don't know about the machine - start 125 // a goroutine to deal with it. 126 m, err := p.context.getMachine(tag) 127 if err != nil { 128 return errors.Trace(err) 129 } 130 // We don't poll manual machines. 131 isManual, err := m.IsManual() 132 if err != nil { 133 return errors.Trace(err) 134 } 135 if isManual { 136 continue 137 } 138 c = make(chan struct{}) 139 p.machines[tag] = c 140 // TODO(fwereade): 2016-03-17 lp:1558657 141 go runMachine(p.context.newMachineContext(), m, c, p.machineDead, clock.WallClock) 142 } else { 143 select { 144 case <-p.context.dying(): 145 return p.context.errDying() 146 case c <- struct{}{}: 147 } 148 } 149 } 150 return nil 151 } 152 153 // runMachine processes the address and status publishing for a given machine. 154 // We assume that the machine is alive when this is first called. 155 func runMachine(context machineContext, m machine, changed <-chan struct{}, died chan<- machine, clock clock.Clock) { 156 defer func() { 157 // We can't just send on the died channel because the 158 // central loop might be trying to write to us on the 159 // changed channel. 160 for { 161 select { 162 case died <- m: 163 return 164 case <-changed: 165 } 166 } 167 }() 168 if err := machineLoop(context, m, changed, clock); err != nil { 169 context.kill(err) 170 } 171 } 172 173 func machineLoop(context machineContext, m machine, lifeChanged <-chan struct{}, clock clock.Clock) error { 174 // Use a short poll interval when initially waiting for 175 // a machine's address and machine agent to start, and a long one when it already 176 // has an address and the machine agent is started. 177 pollInterval := ShortPoll 178 pollInstance := func() error { 179 instInfo, err := pollInstanceInfo(context, m) 180 if err != nil { 181 return err 182 } 183 184 machineStatus := status.Pending 185 if err == nil { 186 if statusInfo, err := m.Status(); err != nil { 187 logger.Warningf("cannot get current machine status for machine %v: %v", m.Id(), err) 188 } else { 189 // TODO(perrito666) add status validation. 190 machineStatus = status.Status(statusInfo.Status) 191 } 192 } 193 194 // the extra condition below (checking allocating/pending) is here to improve user experience 195 // without it the instance status will say "pending" for +10 minutes after the agent comes up to "started" 196 if instInfo.status.Status != status.Allocating && instInfo.status.Status != status.Pending { 197 if len(instInfo.addresses) > 0 && machineStatus == status.Started { 198 // We've got at least one address and a status and instance is started, so poll infrequently. 199 pollInterval = LongPoll 200 } else if pollInterval < LongPoll { 201 // We have no addresses or not started - poll increasingly rarely 202 // until we do. 203 pollInterval = time.Duration(float64(pollInterval) * ShortPollBackoff) 204 } 205 } 206 return nil 207 } 208 209 shouldPollInstance := true 210 for { 211 if shouldPollInstance { 212 if err := pollInstance(); err != nil { 213 if !params.IsCodeNotProvisioned(err) { 214 return errors.Trace(err) 215 } 216 } 217 shouldPollInstance = false 218 } 219 select { 220 case <-context.dying(): 221 return context.errDying() 222 case <-clock.After(pollInterval): 223 shouldPollInstance = true 224 case <-lifeChanged: 225 if err := m.Refresh(); err != nil { 226 return err 227 } 228 if m.Life() == params.Dead { 229 return nil 230 } 231 } 232 } 233 } 234 235 // pollInstanceInfo checks the current provider addresses and status 236 // for the given machine's instance, and sets them on the machine if they've changed. 237 func pollInstanceInfo(context machineContext, m machine) (instInfo instanceInfo, err error) { 238 instInfo = instanceInfo{} 239 instId, err := m.InstanceId() 240 // We can't ask the machine for its addresses if it isn't provisioned yet. 241 if params.IsCodeNotProvisioned(err) { 242 return instanceInfo{}, err 243 } 244 if err != nil { 245 return instanceInfo{}, errors.Annotate(err, "cannot get machine's instance id") 246 } 247 instInfo, err = context.instanceInfo(instId) 248 if err != nil { 249 // TODO (anastasiamac 2016-02-01) This does not look like it needs to be removed now. 250 if params.IsCodeNotImplemented(err) { 251 return instanceInfo{}, err 252 } 253 logger.Warningf("cannot get instance info for instance %q: %v", instId, err) 254 return instInfo, nil 255 } 256 if instStat, err := m.InstanceStatus(); err != nil { 257 // This should never occur since the machine is provisioned. 258 // But just in case, we reset polled status so we try again next time. 259 logger.Warningf("cannot get current instance status for machine %v: %v", m.Id(), err) 260 instInfo.status = instance.InstanceStatus{status.Unknown, ""} 261 } else { 262 // TODO(perrito666) add status validation. 263 currentInstStatus := instance.InstanceStatus{ 264 Status: status.Status(instStat.Status), 265 Message: instStat.Info, 266 } 267 if instInfo.status != currentInstStatus { 268 logger.Infof("machine %q instance status changed from %q to %q", m.Id(), currentInstStatus, instInfo.status) 269 if err = m.SetInstanceStatus(instInfo.status.Status, instInfo.status.Message, nil); err != nil { 270 logger.Errorf("cannot set instance status on %q: %v", m, err) 271 return instanceInfo{}, err 272 } 273 } 274 } 275 if m.Life() != params.Dead { 276 providerAddresses, err := m.ProviderAddresses() 277 if err != nil { 278 return instanceInfo{}, err 279 } 280 if !addressesEqual(providerAddresses, instInfo.addresses) { 281 logger.Infof("machine %q has new addresses: %v", m.Id(), instInfo.addresses) 282 if err := m.SetProviderAddresses(instInfo.addresses...); err != nil { 283 logger.Errorf("cannot set addresses on %q: %v", m, err) 284 return instanceInfo{}, err 285 } 286 } 287 } 288 return instInfo, nil 289 } 290 291 // addressesEqual compares the addresses of the machine and the instance information. 292 func addressesEqual(a0, a1 []network.Address) bool { 293 if len(a0) != len(a1) { 294 logger.Tracef("address lists have different lengths %d != %d for %v != %v", 295 len(a0), len(a1), a0, a1) 296 return false 297 } 298 299 ca0 := make([]network.Address, len(a0)) 300 copy(ca0, a0) 301 network.SortAddresses(ca0) 302 ca1 := make([]network.Address, len(a1)) 303 copy(ca1, a1) 304 network.SortAddresses(ca1) 305 306 for i := range ca0 { 307 if ca0[i] != ca1[i] { 308 logger.Tracef("address entry at offset %d has a different value for %v != %v", 309 i, ca0, ca1) 310 return false 311 } 312 } 313 return true 314 }