// Source: github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/worker/instancepoller/updater.go

// Copyright 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package instancepoller

import (
	"time"

	"github.com/juju/clock"
	"github.com/juju/errors"
	"github.com/juju/loggo"
	"gopkg.in/juju/names.v2"

	"github.com/juju/juju/apiserver/params"
	"github.com/juju/juju/core/instance"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/core/watcher"
	"github.com/juju/juju/network"
)

// logger is the package-wide logger for the instance poller worker.
var logger = loggo.GetLogger("juju.worker.instancepoller")

// ShortPoll and LongPoll hold the polling intervals for the instance
// updater. When a machine has no address or is not started, it will be
// polled at ShortPoll intervals until it does, exponentially backing off
// with an exponent of ShortPollBackoff until a maximum(ish) of LongPoll.
//
// When a machine has an address and is started LongPoll will be used to
// check that the instance address or status has not changed.
var (
	ShortPoll        = 1 * time.Second
	ShortPollBackoff = 2.0
	LongPoll         = 15 * time.Minute
)

// machine is the set of machine operations the poller relies on: reading
// and writing the provider addresses and instance status, refreshing the
// machine's life, and finding out whether it was manually provisioned.
type machine interface {
	Id() string
	Tag() names.MachineTag
	InstanceId() (instance.Id, error)
	ProviderAddresses() ([]network.Address, error)
	SetProviderAddresses(...network.Address) error
	InstanceStatus() (params.StatusResult, error)
	SetInstanceStatus(status.Status, string, map[string]interface{}) error
	String() string
	Refresh() error
	Life() params.Life
	Status() (params.StatusResult, error)
	IsManual() (bool, error)
}

// instanceInfo holds the addresses and status obtained for an instance
// through machineContext.instanceInfo.
type instanceInfo struct {
	addresses []network.Address
	status    instance.Status
}

// lifetimeContext was extracted to allow the various context clients to get
// the benefits of the catacomb encapsulating everything that should happen
// here. A clean implementation would almost certainly not need this.
59 type lifetimeContext interface { 60 kill(error) 61 dying() <-chan struct{} 62 errDying() error 63 } 64 65 type machineContext interface { 66 lifetimeContext 67 instanceInfo(id instance.Id) (instanceInfo, error) 68 } 69 70 type updaterContext interface { 71 lifetimeContext 72 newMachineContext() machineContext 73 getMachine(tag names.MachineTag) (machine, error) 74 } 75 76 type updater struct { 77 context updaterContext 78 machines map[names.MachineTag]chan struct{} 79 machineDead chan machine 80 } 81 82 // watchMachinesLoop watches for changes provided by the given 83 // machinesWatcher and starts machine goroutines to deal with them, 84 // using the provided newMachineContext function to create the 85 // appropriate context for each new machine tag. 86 func watchMachinesLoop(context updaterContext, machinesWatcher watcher.StringsWatcher) (err error) { 87 p := &updater{ 88 context: context, 89 machines: make(map[names.MachineTag]chan struct{}), 90 machineDead: make(chan machine), 91 } 92 defer func() { 93 // TODO(fwereade): is this a home-grown sync.WaitGroup or something? 94 // strongly suspect these machine goroutines could be managed rather 95 // less opaquely if we made them all workers. 
96 for len(p.machines) > 0 { 97 delete(p.machines, (<-p.machineDead).Tag()) 98 } 99 }() 100 for { 101 select { 102 case <-p.context.dying(): 103 return p.context.errDying() 104 case ids, ok := <-machinesWatcher.Changes(): 105 if !ok { 106 return errors.New("machines watcher closed") 107 } 108 tags := make([]names.MachineTag, len(ids)) 109 for i := range ids { 110 tags[i] = names.NewMachineTag(ids[i]) 111 } 112 if err := p.startMachines(tags); err != nil { 113 return err 114 } 115 case m := <-p.machineDead: 116 delete(p.machines, m.Tag()) 117 } 118 } 119 } 120 121 func (p *updater) startMachines(tags []names.MachineTag) error { 122 for _, tag := range tags { 123 if c := p.machines[tag]; c == nil { 124 // We don't know about the machine - start 125 // a goroutine to deal with it. 126 m, err := p.context.getMachine(tag) 127 if err != nil { 128 return errors.Trace(err) 129 } 130 // We don't poll manual machines, instead we're setting the status to 'running' 131 // as we don't have any better information from the provider, see lp:1678981 132 isManual, err := m.IsManual() 133 if err != nil { 134 return errors.Trace(err) 135 } 136 if isManual { 137 statusInfo, err := m.Status() 138 if err != nil { 139 return errors.Trace(err) 140 } 141 machineStatus := status.Status(statusInfo.Status) 142 if machineStatus != status.Running { 143 if err = m.SetInstanceStatus(status.Running, "Manually provisioned machine", nil); err != nil { 144 logger.Errorf("cannot set instance status on %q: %v", m, err) 145 } 146 } 147 continue 148 } 149 c = make(chan struct{}) 150 p.machines[tag] = c 151 // TODO(fwereade): 2016-03-17 lp:1558657 152 go runMachine(p.context.newMachineContext(), m, c, p.machineDead, clock.WallClock) 153 } else { 154 select { 155 case <-p.context.dying(): 156 return p.context.errDying() 157 case c <- struct{}{}: 158 } 159 } 160 } 161 return nil 162 } 163 164 // runMachine processes the address and status publishing for a given machine. 
165 // We assume that the machine is alive when this is first called. 166 func runMachine(context machineContext, m machine, changed <-chan struct{}, died chan<- machine, clock clock.Clock) { 167 defer func() { 168 // We can't just send on the died channel because the 169 // central loop might be trying to write to us on the 170 // changed channel. 171 for { 172 select { 173 case died <- m: 174 return 175 case <-changed: 176 } 177 } 178 }() 179 if err := machineLoop(context, m, changed, clock); err != nil { 180 context.kill(err) 181 } 182 } 183 184 func machineLoop(context machineContext, m machine, lifeChanged <-chan struct{}, clock clock.Clock) error { 185 // Use a short poll interval when initially waiting for 186 // a machine's address and machine agent to start, and a long one when it already 187 // has an address and the machine agent is started. 188 pollInterval := ShortPoll 189 pollInstance := func() error { 190 instInfo, err := pollInstanceInfo(context, m) 191 if err != nil { 192 return err 193 } 194 195 machineStatus := status.Pending 196 if err == nil { 197 if statusInfo, err := m.Status(); err != nil { 198 logger.Warningf("cannot get current machine status for machine %v: %v", m.Id(), err) 199 } else { 200 // TODO(perrito666) add status validation. 201 machineStatus = status.Status(statusInfo.Status) 202 } 203 } 204 205 // the extra condition below (checking allocating/pending) is here to improve user experience 206 // without it the instance status will say "pending" for +10 minutes after the agent comes up to "started" 207 if instInfo.status.Status != status.Allocating && instInfo.status.Status != status.Pending { 208 if len(instInfo.addresses) > 0 && machineStatus == status.Started { 209 // We've got at least one address and a status and instance is started, so poll infrequently. 210 pollInterval = LongPoll 211 } else if pollInterval < LongPoll { 212 // We have no addresses or not started - poll increasingly rarely 213 // until we do. 
214 pollInterval = time.Duration(float64(pollInterval) * ShortPollBackoff) 215 if pollInterval > LongPoll { 216 pollInterval = LongPoll 217 } 218 } 219 } 220 return nil 221 } 222 223 shouldPollInstance := true 224 for { 225 if shouldPollInstance { 226 if err := pollInstance(); err != nil { 227 if !params.IsCodeNotProvisioned(err) { 228 return errors.Trace(err) 229 } 230 } 231 shouldPollInstance = false 232 } 233 select { 234 case <-context.dying(): 235 return context.errDying() 236 case <-clock.After(pollInterval): 237 shouldPollInstance = true 238 case <-lifeChanged: 239 if err := m.Refresh(); err != nil { 240 return err 241 } 242 if m.Life() == params.Dead { 243 return nil 244 } 245 } 246 } 247 } 248 249 // pollInstanceInfo checks the current provider addresses and status 250 // for the given machine's instance, and sets them on the machine if they've changed. 251 func pollInstanceInfo(context machineContext, m machine) (instInfo instanceInfo, err error) { 252 instInfo = instanceInfo{} 253 instId, err := m.InstanceId() 254 // We can't ask the machine for its addresses if it isn't provisioned yet. 255 if params.IsCodeNotProvisioned(err) { 256 return instanceInfo{}, err 257 } 258 if err != nil { 259 return instanceInfo{}, errors.Annotate(err, "cannot get machine's instance id") 260 } 261 instInfo, err = context.instanceInfo(instId) 262 if err != nil { 263 // TODO (anastasiamac 2016-02-01) This does not look like it needs to be removed now. 264 if params.IsCodeNotImplemented(err) { 265 return instanceInfo{}, err 266 } 267 logger.Warningf("cannot get instance info for instance %q: %v", instId, err) 268 return instInfo, nil 269 } 270 if instStat, err := m.InstanceStatus(); err != nil { 271 // This should never occur since the machine is provisioned. 272 // But just in case, we reset polled status so we try again next time. 
273 logger.Warningf("cannot get current instance status for machine %v: %v", m.Id(), err) 274 instInfo.status = instance.Status{status.Unknown, ""} 275 } else { 276 // TODO(perrito666) add status validation. 277 currentInstStatus := instance.Status{ 278 Status: status.Status(instStat.Status), 279 Message: instStat.Info, 280 } 281 if instInfo.status != currentInstStatus { 282 logger.Infof("machine %q instance status changed from %q to %q", m.Id(), currentInstStatus, instInfo.status) 283 if err = m.SetInstanceStatus(instInfo.status.Status, instInfo.status.Message, nil); err != nil { 284 logger.Errorf("cannot set instance status on %q: %v", m, err) 285 return instanceInfo{}, err 286 } 287 } 288 289 } 290 if m.Life() != params.Dead { 291 providerAddresses, err := m.ProviderAddresses() 292 if err != nil { 293 return instanceInfo{}, err 294 } 295 if !addressesEqual(providerAddresses, instInfo.addresses) { 296 logger.Infof("machine %q has new addresses: %v", m.Id(), instInfo.addresses) 297 if err := m.SetProviderAddresses(instInfo.addresses...); err != nil { 298 logger.Errorf("cannot set addresses on %q: %v", m, err) 299 return instanceInfo{}, err 300 } 301 } 302 } 303 return instInfo, nil 304 } 305 306 // addressesEqual compares the addresses of the machine and the instance information. 307 func addressesEqual(a0, a1 []network.Address) bool { 308 if len(a0) != len(a1) { 309 logger.Tracef("address lists have different lengths %d != %d for %v != %v", 310 len(a0), len(a1), a0, a1) 311 return false 312 } 313 314 ca0 := make([]network.Address, len(a0)) 315 copy(ca0, a0) 316 network.SortAddresses(ca0) 317 ca1 := make([]network.Address, len(a1)) 318 copy(ca1, a1) 319 network.SortAddresses(ca1) 320 321 for i := range ca0 { 322 if ca0[i] != ca1[i] { 323 logger.Tracef("address entry at offset %d has a different value for %v != %v", 324 i, ca0, ca1) 325 return false 326 } 327 } 328 return true 329 }