github.com/makyo/juju@v0.0.0-20160425123129-2608902037e9/worker/instancepoller/updater.go

// Copyright 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package instancepoller

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"

	"github.com/juju/juju/apiserver/params"
	"github.com/juju/juju/instance"
	"github.com/juju/juju/network"
	"github.com/juju/juju/status"
	"github.com/juju/juju/watcher"
)

var logger = loggo.GetLogger("juju.worker.instancepoller")

// ShortPoll and LongPoll hold the polling intervals for the instance
// updater. When a machine has no address or is not started, it is
// polled at ShortPoll intervals until it has an address and is started,
// backing off exponentially by a factor of ShortPollBackoff up to a
// maximum(ish) of LongPoll.
//
// When a machine has an address and is started, LongPoll is used to
// check that the instance address or status has not changed.
var (
	ShortPoll        = 1 * time.Second
	ShortPollBackoff = 2.0
	LongPoll         = 15 * time.Minute
)
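
// nextPollInterval is not part of the original worker; it is a minimal
// sketch of the backoff rule machineLoop applies below. While the
// current interval is still below LongPoll it is multiplied by
// ShortPollBackoff, so with the defaults the sequence runs 1s, 2s, 4s,
// ... 512s, and the final doubling can land just above LongPoll
// (1024s, roughly 17 minutes), which is why the comment above says
// "maximum(ish)".
func nextPollInterval(current time.Duration) time.Duration {
	if current < LongPoll {
		return time.Duration(float64(current) * ShortPollBackoff)
	}
	return current
}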

type machine interface {
	Id() string
	Tag() names.MachineTag
	InstanceId() (instance.Id, error)
	ProviderAddresses() ([]network.Address, error)
	SetProviderAddresses(...network.Address) error
	InstanceStatus() (params.StatusResult, error)
	SetInstanceStatus(status.Status, string, map[string]interface{}) error
	String() string
	Refresh() error
	Life() params.Life
	Status() (params.StatusResult, error)
	IsManual() (bool, error)
}

type instanceInfo struct {
	addresses []network.Address
	status    instance.InstanceStatus
}

// lifetimeContext was extracted to allow the various context clients to get
// the benefits of the catacomb encapsulating everything that should happen
// here. A clean implementation would almost certainly not need this.
type lifetimeContext interface {
	kill(error)
	dying() <-chan struct{}
	errDying() error
}

type machineContext interface {
	lifetimeContext
	instanceInfo(id instance.Id) (instanceInfo, error)
}

type updaterContext interface {
	lifetimeContext
	newMachineContext() machineContext
	getMachine(tag names.MachineTag) (machine, error)
}

type updater struct {
	context     updaterContext
	machines    map[names.MachineTag]chan struct{}
	machineDead chan machine
}

// watchMachinesLoop watches for changes provided by the given
// machinesWatcher and starts machine goroutines to deal with them,
// using the provided newMachineContext function to create the
// appropriate context for each new machine tag.
func watchMachinesLoop(context updaterContext, machinesWatcher watcher.StringsWatcher) (err error) {
	p := &updater{
		context:     context,
		machines:    make(map[names.MachineTag]chan struct{}),
		machineDead: make(chan machine),
	}
	defer func() {
		// TODO(fwereade): is this a home-grown sync.WaitGroup or something?
		// strongly suspect these machine goroutines could be managed rather
		// less opaquely if we made them all workers.
		for len(p.machines) > 0 {
			delete(p.machines, (<-p.machineDead).Tag())
		}
	}()
	for {
		select {
		case <-p.context.dying():
			return p.context.errDying()
		case ids, ok := <-machinesWatcher.Changes():
			if !ok {
				return errors.New("machines watcher closed")
			}
			tags := make([]names.MachineTag, len(ids))
			for i := range ids {
				tags[i] = names.NewMachineTag(ids[i])
			}
			if err := p.startMachines(tags); err != nil {
				return err
			}
		case m := <-p.machineDead:
			delete(p.machines, m.Tag())
		}
	}
}

func (p *updater) startMachines(tags []names.MachineTag) error {
	for _, tag := range tags {
		if c := p.machines[tag]; c == nil {
			// We don't know about the machine - start
			// a goroutine to deal with it.
			m, err := p.context.getMachine(tag)
			if params.IsCodeNotFound(err) {
				logger.Warningf("watcher gave notification of non-existent machine %q", tag.Id())
				continue
			}
			if err != nil {
				return err
			}
			// We don't poll manual machines.
			isManual, err := m.IsManual()
			if err != nil {
				return err
			}
			if isManual {
				continue
			}
			c = make(chan struct{})
			p.machines[tag] = c
			go runMachine(p.context.newMachineContext(), m, c, p.machineDead)
		} else {
			select {
			case <-p.context.dying():
				return p.context.errDying()
			case c <- struct{}{}:
			}
		}
	}
	return nil
}

// runMachine processes the address and status publishing for a given machine.
// We assume that the machine is alive when this is first called.
func runMachine(context machineContext, m machine, changed <-chan struct{}, died chan<- machine) {
	defer func() {
		// We can't just send on the died channel because the
		// central loop might be trying to write to us on the
		// changed channel.
		for {
			select {
			case died <- m:
				return
			case <-changed:
			}
		}
	}()
	if err := machineLoop(context, m, changed); err != nil {
		context.kill(err)
	}
}
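
// deadRendezvous is not part of the original worker; it is a minimal
// sketch of the shutdown handshake implemented by runMachine's deferred
// loop above. The central watchMachinesLoop goroutine may be blocked
// sending a change notification on this machine's unbuffered changed
// channel at the very moment the machine goroutine tries to announce
// its death on the equally unbuffered died channel, so a bare send on
// died could deadlock. Draining changed while offering m on died lets
// whichever send the central loop is attempting complete first.
func deadRendezvous(m machine, changed <-chan struct{}, died chan<- machine) {
	for {
		select {
		case died <- m: // the central loop accepted our death notice
			return
		case <-changed: // absorb a pending change notification instead
		}
	}
}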

func machineLoop(context machineContext, m machine, changed <-chan struct{}) error {
	// Use a short poll interval when initially waiting for a machine's
	// address and machine agent to start, and a long one when it
	// already has an address and the machine agent is started.
	pollInterval := ShortPoll
	pollInstance := true
	for {
		if pollInstance {
			instInfo, err := pollInstanceInfo(context, m)
			if err != nil && !params.IsCodeNotProvisioned(err) {
				return err
			}
			machineStatus := status.StatusPending
			if err == nil {
				if statusInfo, err := m.Status(); err != nil {
					logger.Warningf("cannot get current machine status for machine %v: %v", m.Id(), err)
				} else {
					machineStatus = statusInfo.Status
				}
			}
			// The extra condition below (checking for allocating/pending)
			// improves the user experience: without it the instance status
			// would stay "pending" for more than ten minutes after the
			// agent comes up as "started".
			if instInfo.status.Status != status.StatusAllocating && instInfo.status.Status != status.StatusPending {
				if len(instInfo.addresses) > 0 && machineStatus == status.StatusStarted {
					// We've got at least one address, a status, and the
					// instance is started, so poll infrequently.
					pollInterval = LongPoll
				} else if pollInterval < LongPoll {
					// We have no addresses or the machine is not started -
					// poll increasingly rarely until we do.
					pollInterval = time.Duration(float64(pollInterval) * ShortPollBackoff)
				}
			}
			pollInstance = false
		}
		select {
		case <-context.dying():
			return context.errDying()
		case <-time.After(pollInterval):
			// TODO(fwereade): 2016-03-17 lp:1558657
			pollInstance = true
		case <-changed:
			if err := m.Refresh(); err != nil {
				return err
			}
			if m.Life() == params.Dead {
				return nil
			}
		}
	}
}

// pollInstanceInfo checks the current provider addresses and status
// for the given machine's instance, and sets them on the machine if
// they have changed.
func pollInstanceInfo(context machineContext, m machine) (instInfo instanceInfo, err error) {
	instInfo = instanceInfo{}
	instId, err := m.InstanceId()
	// We can't ask the machine for its addresses if it isn't provisioned yet.
	if params.IsCodeNotProvisioned(err) {
		return instInfo, err
	}
	if err != nil {
		return instInfo, fmt.Errorf("cannot get machine's instance id: %v", err)
	}
	instInfo, err = context.instanceInfo(instId)
	if err != nil {
		// TODO (anastasiamac 2016-02-01) This does not look like it needs to be removed now.
		if params.IsCodeNotImplemented(err) {
			return instInfo, err
		}
		logger.Warningf("cannot get instance info for instance %q: %v", instId, err)
		return instInfo, nil
	}
	instStat, err := m.InstanceStatus()
	if err != nil {
		// This should never occur since the machine is provisioned.
		// But just in case, we reset polled status so we try again next time.
		logger.Warningf("cannot get current instance status for machine %v: %v", m.Id(), err)
		instInfo.status = instance.InstanceStatus{Status: status.StatusUnknown, Message: ""}
	} else {
		currentInstStatus := instance.InstanceStatus{
			Status:  instStat.Status,
			Message: instStat.Info,
		}
		if instInfo.status != currentInstStatus {
			logger.Infof("machine %q instance status changed from %q to %q", m.Id(), currentInstStatus, instInfo.status)
			if err = m.SetInstanceStatus(instInfo.status.Status, instInfo.status.Message, nil); err != nil {
				logger.Errorf("cannot set instance status on %q: %v", m, err)
			}
		}
	}
	providerAddresses, err := m.ProviderAddresses()
	if err != nil {
		return instInfo, err
	}
	if !addressesEqual(providerAddresses, instInfo.addresses) {
		logger.Infof("machine %q has new addresses: %v", m.Id(), instInfo.addresses)
		if err = m.SetProviderAddresses(instInfo.addresses...); err != nil {
			logger.Errorf("cannot set addresses on %q: %v", m, err)
		}
	}
	return instInfo, err
}

// addressesEqual compares the addresses of the machine and the
// instance information.
func addressesEqual(a0, a1 []network.Address) bool {
	if len(a0) != len(a1) {
		logger.Tracef("address lists have different lengths %d != %d for %v != %v",
			len(a0), len(a1), a0, a1)
		return false
	}

	ca0 := make([]network.Address, len(a0))
	copy(ca0, a0)
	network.SortAddresses(ca0, true)
	ca1 := make([]network.Address, len(a1))
	copy(ca1, a1)
	network.SortAddresses(ca1, true)

	for i := range ca0 {
		if ca0[i] != ca1[i] {
			logger.Tracef("address entry at offset %d has a different value for %v != %v",
				i, ca0, ca1)
			return false
		}
	}
	return true
}
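
// exampleAddressesEqual is not part of the original worker; it is a
// small sketch showing that addressesEqual treats ordering as
// irrelevant: both slices are copied and sorted before comparison, so
// a provider that reports the same addresses in a different order does
// not trigger a spurious SetProviderAddresses call. It assumes
// network.NewAddresses, juju's helper for building Address values from
// raw strings.
func exampleAddressesEqual() bool {
	a := network.NewAddresses("10.0.0.1", "2001:db8::1")
	b := network.NewAddresses("2001:db8::1", "10.0.0.1")
	return addressesEqual(a, b) // true despite the differing order
}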