github.com/wallyworld/juju@v0.0.0-20161013125918-6cf1bc9d917a/worker/leadership/tracker.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package leadership

import (
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/utils/clock"
	"gopkg.in/juju/names.v2"
	"gopkg.in/tomb.v1"

	"github.com/juju/juju/core/leadership"
)

var logger = loggo.GetLogger("juju.worker.leadership")

type Tracker struct {
	tomb            tomb.Tomb
	claimer         leadership.Claimer
	unitName        string
	applicationName string
	clock           clock.Clock
	duration        time.Duration
	isMinion        bool

	claimLease        chan struct{}
	renewLease        <-chan time.Time
	claimTickets      chan chan bool
	waitLeaderTickets chan chan bool
	waitMinionTickets chan chan bool
	waitingLeader     []chan bool
	waitingMinion     []chan bool
}

// NewTracker returns a *Tracker that attempts to claim and retain service
// leadership for the supplied unit. It will claim leadership for twice the
// supplied duration, and once it's leader it will renew leadership every
// time the duration elapses.
// Thus, successful leadership claims on the resulting Tracker will guarantee
// leadership for the duration supplied here without generating additional
// calls to the supplied manager (which may very well be on the other side of
// a network connection).
func NewTracker(tag names.UnitTag, claimer leadership.Claimer, clock clock.Clock, duration time.Duration) *Tracker {
	unitName := tag.Id()
	serviceName, _ := names.UnitApplication(unitName)
	t := &Tracker{
		unitName:          unitName,
		applicationName:   serviceName,
		claimer:           claimer,
		clock:             clock,
		duration:          duration,
		claimTickets:      make(chan chan bool),
		waitLeaderTickets: make(chan chan bool),
		waitMinionTickets: make(chan chan bool),
	}
	go func() {
		defer t.tomb.Done()
		defer func() {
			for _, ticketCh := range t.waitingLeader {
				close(ticketCh)
			}
			for _, ticketCh := range t.waitingMinion {
				close(ticketCh)
			}
		}()
		err := t.loop()
		// TODO: jam 2015-04-02 is this the most elegant way to make
		// sure we shutdown cleanly? Essentially the lowest level sees
		// that we are dying, and propagates an ErrDying up to us so
		// that we shut down, which we then are passing back into
		// Tomb.Kill().
		// Tomb.Kill() special cases the exact object ErrDying, and has
		// no idea about errors.Cause and the general errors.Trace
		// mechanisms that we use.
		// So we explicitly unwrap before calling tomb.Kill() else
		// tomb.Stop() thinks that we have a genuine error.
		switch cause := errors.Cause(err); cause {
		case tomb.ErrDying:
			err = cause
		}
		t.tomb.Kill(err)
	}()
	return t
}

// Kill is part of the worker.Worker interface.
func (t *Tracker) Kill() {
	t.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (t *Tracker) Wait() error {
	return t.tomb.Wait()
}

// ApplicationName is part of the leadership.Tracker interface.
func (t *Tracker) ApplicationName() string {
	return t.applicationName
}

// ClaimDuration is part of the leadership.Tracker interface.
func (t *Tracker) ClaimDuration() time.Duration {
	return t.duration
}

// ClaimLeader is part of the leadership.Tracker interface.
func (t *Tracker) ClaimLeader() leadership.Ticket {
	return t.submit(t.claimTickets)
}

// WaitLeader is part of the leadership.Tracker interface.
func (t *Tracker) WaitLeader() leadership.Ticket {
	return t.submit(t.waitLeaderTickets)
}

// WaitMinion is part of the leadership.Tracker interface.
func (t *Tracker) WaitMinion() leadership.Ticket {
	return t.submit(t.waitMinionTickets)
}

func (t *Tracker) loop() error {
	logger.Debugf("%s making initial claim for %s leadership", t.unitName, t.applicationName)
	if err := t.refresh(); err != nil {
		return errors.Trace(err)
	}
	for {
		select {
		case <-t.tomb.Dying():
			return tomb.ErrDying
		case <-t.claimLease:
			logger.Debugf("%s claiming lease for %s leadership", t.unitName, t.applicationName)
			t.claimLease = nil
			if err := t.refresh(); err != nil {
				return errors.Trace(err)
			}
		case <-t.renewLease:
			logger.Debugf("%s renewing lease for %s leadership", t.unitName, t.applicationName)
			t.renewLease = nil
			if err := t.refresh(); err != nil {
				return errors.Trace(err)
			}
		case ticketCh := <-t.claimTickets:
			logger.Debugf("%s got claim request for %s leadership", t.unitName, t.applicationName)
			if err := t.resolveClaim(ticketCh); err != nil {
				return errors.Trace(err)
			}
		case ticketCh := <-t.waitLeaderTickets:
			logger.Debugf("%s got wait request for %s leadership", t.unitName, t.applicationName)
			if err := t.resolveWaitLeader(ticketCh); err != nil {
				return errors.Trace(err)
			}
		case ticketCh := <-t.waitMinionTickets:
			logger.Debugf("%s got wait request for %s leadership loss", t.unitName, t.applicationName)
			if err := t.resolveWaitMinion(ticketCh); err != nil {
				return errors.Trace(err)
			}
		}
	}
}

// refresh makes a leadership request, and updates Tracker state to conform to
// latest known reality.
func (t *Tracker) refresh() error {
	logger.Debugf("checking %s for %s leadership", t.unitName, t.applicationName)
	leaseDuration := 2 * t.duration
	untilTime := t.clock.Now().Add(leaseDuration)
	err := t.claimer.ClaimLeadership(t.applicationName, t.unitName, leaseDuration)
	switch {
	case err == nil:
		return t.setLeader(untilTime)
	case errors.Cause(err) == leadership.ErrClaimDenied:
		return t.setMinion()
	}
	return errors.Annotatef(err, "leadership failure")
}

// setLeader arranges for lease renewal.
func (t *Tracker) setLeader(untilTime time.Time) error {
	logger.Debugf("%s confirmed for %s leadership until %s", t.unitName, t.applicationName, untilTime)
	renewTime := untilTime.Add(-t.duration)
	logger.Infof("%s will renew %s leadership at %s", t.unitName, t.applicationName, renewTime)
	t.isMinion = false
	t.claimLease = nil
	t.renewLease = t.clock.After(renewTime.Sub(t.clock.Now()))

	for len(t.waitingLeader) > 0 {
		logger.Debugf("notifying %s ticket of impending %s leadership", t.unitName, t.applicationName)
		var ticketCh chan bool
		ticketCh, t.waitingLeader = t.waitingLeader[0], t.waitingLeader[1:]
		defer close(ticketCh)
		if err := t.sendTrue(ticketCh); err != nil {
			return errors.Trace(err)
		}
	}
	return nil
}

// setMinion arranges for lease acquisition when there's an opportunity.
func (t *Tracker) setMinion() error {
	logger.Infof("%s leadership for %s denied", t.applicationName, t.unitName)
	t.isMinion = true
	t.renewLease = nil
	if t.claimLease == nil {
		t.claimLease = make(chan struct{})
		go func() {
			defer close(t.claimLease)
			logger.Debugf("%s waiting for %s leadership release", t.unitName, t.applicationName)
			err := t.claimer.BlockUntilLeadershipReleased(t.applicationName)
			if err != nil {
				logger.Debugf("error while %s waiting for %s leadership release: %v", t.unitName, t.applicationName, err)
			}
			// We don't need to do anything else with the error, because we just
			// close the claimLease channel and trigger a leadership claim on the
			// main loop; if anything's gone seriously wrong we'll find out right
			// away and shut down anyway. (And if this goroutine outlives the
			// Tracker, it keeps it around as a zombie, but I don't see a way
			// around that...)
		}()
	}

	for len(t.waitingMinion) > 0 {
		logger.Debugf("notifying %s ticket of impending loss of %s leadership", t.unitName, t.applicationName)
		var ticketCh chan bool
		ticketCh, t.waitingMinion = t.waitingMinion[0], t.waitingMinion[1:]
		defer close(ticketCh)
		if err := t.sendTrue(ticketCh); err != nil {
			return errors.Trace(err)
		}
	}
	return nil
}

// isLeader returns true if leadership is guaranteed for the Tracker's duration.
func (t *Tracker) isLeader() (bool, error) {
	if !t.isMinion {
		// Last time we looked, we were leader.
		select {
		case <-t.tomb.Dying():
			return false, errors.Trace(tomb.ErrDying)
		case <-t.renewLease:
			logger.Debugf("%s renewing lease for %s leadership", t.unitName, t.applicationName)
			t.renewLease = nil
			if err := t.refresh(); err != nil {
				return false, errors.Trace(err)
			}
		default:
			logger.Debugf("%s still has %s leadership", t.unitName, t.applicationName)
		}
	}
	return !t.isMinion, nil
}

// resolveClaim will send true on the supplied channel if leadership can be
// successfully verified, and will always close it whether or not it sent.
func (t *Tracker) resolveClaim(ticketCh chan bool) error {
	logger.Debugf("resolving %s leadership ticket for %s...", t.applicationName, t.unitName)
	defer close(ticketCh)
	if leader, err := t.isLeader(); err != nil {
		return errors.Trace(err)
	} else if !leader {
		logger.Debugf("%s is not %s leader", t.unitName, t.applicationName)
		return nil
	}
	logger.Debugf("confirming %s leadership for %s", t.applicationName, t.unitName)
	return t.sendTrue(ticketCh)
}

// resolveWaitLeader will send true on the supplied channel if leadership can be
// guaranteed for the Tracker's duration. It will then close the channel. If
// leadership cannot be guaranteed, the channel is left untouched until either
// the termination of the Tracker or the next invocation of setLeader; at which
// point true is sent if applicable, and the channel is closed.
func (t *Tracker) resolveWaitLeader(ticketCh chan bool) error {
	var dontClose bool
	defer func() {
		if !dontClose {
			close(ticketCh)
		}
	}()

	if leader, err := t.isLeader(); err != nil {
		return errors.Trace(err)
	} else if leader {
		logger.Debugf("reporting %s leadership for %s", t.applicationName, t.unitName)
		return t.sendTrue(ticketCh)
	}

	logger.Debugf("waiting for %s to attain %s leadership", t.unitName, t.applicationName)
	t.waitingLeader = append(t.waitingLeader, ticketCh)
	dontClose = true
	return nil
}

// resolveWaitMinion will close the supplied channel as soon as leadership cannot
// be guaranteed beyond the Tracker's duration.
func (t *Tracker) resolveWaitMinion(ticketCh chan bool) error {
	var dontClose bool
	defer func() {
		if !dontClose {
			close(ticketCh)
		}
	}()

	if leader, err := t.isLeader(); err != nil {
		return errors.Trace(err)
	} else if leader {
		logger.Debugf("waiting for %s to lose %s leadership", t.unitName, t.applicationName)
		t.waitingMinion = append(t.waitingMinion, ticketCh)
		dontClose = true
	} else {
		logger.Debugf("reporting %s leadership loss for %s", t.applicationName, t.unitName)
	}
	return nil
}

func (t *Tracker) sendTrue(ticketCh chan bool) error {
	select {
	case <-t.tomb.Dying():
		return tomb.ErrDying
	case ticketCh <- true:
		return nil
	}
}

func (t *Tracker) submit(tickets chan chan bool) leadership.Ticket {
	ticketCh := make(chan bool, 1)
	select {
	case <-t.tomb.Dying():
		close(ticketCh)
	case tickets <- ticketCh:
	}
	ticket := &ticket{
		ch:    ticketCh,
		ready: make(chan struct{}),
	}
	go ticket.run()
	return ticket
}

// ticket is used by Tracker to communicate leadership status back to a client.
type ticket struct {
	ch      chan bool
	ready   chan struct{}
	success bool
}

func (t *ticket) run() {
	defer close(t.ready)
	// This is only safe/sane because the Tracker promises to close all pending
	// ticket channels when it shuts down.
	if <-t.ch {
		t.success = true
	}
}

// Ready is part of the leadership.Ticket interface.
func (t *ticket) Ready() <-chan struct{} {
	return t.ready
}

// Wait is part of the leadership.Ticket interface.
func (t *ticket) Wait() bool {
	<-t.ready
	return t.success
}
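
// exampleTrackerUsage is a minimal usage sketch, not part of the package's
// exported API: it shows how a caller might construct a Tracker and use the
// resulting leadership.Ticket values to gate leader-only work. The claimer
// argument stands in for whatever leadership.Claimer implementation the
// caller has; clock.WallClock, the "mysql/0" tag, and the 30-second duration
// are illustrative choices only.
func exampleTrackerUsage(claimer leadership.Claimer) {
	// With a 30-second duration, NewTracker claims the underlying lease for
	// 60 seconds and renews it roughly 30 seconds after each successful claim.
	tracker := NewTracker(names.NewUnitTag("mysql/0"), claimer, clock.WallClock, 30*time.Second)
	defer func() {
		tracker.Kill()
		tracker.Wait()
	}()

	// ClaimLeader resolves against the Tracker's current state: the ticket
	// reports whether this unit holds leadership guaranteed for ClaimDuration.
	claim := tracker.ClaimLeader()
	<-claim.Ready()
	if claim.Wait() {
		// Safe to run leader-only operations here.
		return
	}

	// WaitLeader's ticket stays open until leadership is attained or the
	// Tracker shuts down; Wait reports which of the two happened.
	wait := tracker.WaitLeader()
	<-wait.Ready()
	if wait.Wait() {
		// This unit has become leader.
	}
}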