// Source: launchpad.net/~rogpeppe/juju-core/500-errgo-fix@v0.0.0-20140213181702-000000002356/state/presence/presence.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

// The presence package implements an interface for observing liveness
// of arbitrary keys (agents, processes, etc) on top of MongoDB.
// The design works by periodically updating the database so that
// watchers can tell an arbitrary key is alive.
package presence

import (
	"fmt"
	"strconv"
	"sync"
	"time"

	"labix.org/v2/mgo"
	"labix.org/v2/mgo/bson"
	"launchpad.net/errgo/errors"
	"launchpad.net/tomb"

	"launchpad.net/juju-core/log"
)

// mask is a package-local shorthand for errors.Mask; it is applied to
// most errors returned from this package.
var mask = errors.Mask

// Debug specifies whether the package will log debug
// messages.
// TODO(rog) allow debug level setting in the log package.
var Debug = false

// The implementation works by assigning a unique sequence number to each
// pinger that is alive, and the pinger is then responsible for
// periodically updating the current time slot document with its
// sequence number so that watchers can tell it is alive.
//
// The internal implementation of the time slot document is as follows:
//
//   {
//     "_id": <time slot>,
//     "alive": { hex(<pinger seq> / 63) : (1 << (<pinger seq> % 63) | <others>) },
//     "dead": { hex(<pinger seq> / 63) : (1 << (<pinger seq> % 63) | <others>) },
//   }
//
// All pingers that have their sequence number under "alive" and not
// under "dead" are currently alive. This design enables implementing
// a ping with a single update operation, a kill with another operation,
// and obtaining liveness data with a single query that returns two
// documents (the last two time slots).
//
// A new pinger sequence is obtained every time a pinger starts by
// atomically incrementing a counter in a globally used document in a
// helper collection.
// That sequence number is then inserted into the
// beings collection to establish the mapping between pinger sequence
// and key.

// BUG(gn): The pings and beings collection currently grow without bound.

// A Watcher can watch any number of pinger keys for liveness changes.
type Watcher struct {
	tomb   tomb.Tomb
	base   *mgo.Collection
	pings  *mgo.Collection
	beings *mgo.Collection

	// delta is an approximate clock skew between the local system
	// clock and the database clock.
	delta time.Duration

	// beingKey and beingSeq are the pinger seq <=> key mappings.
	// Entries in these maps are considered alive.
	beingKey map[int64]string
	beingSeq map[string]int64

	// watches has the per-key observer channels from Watch/Unwatch.
	watches map[string][]chan<- Change

	// pending contains all the events to be dispatched to the watcher
	// channels. They're queued during processing and flushed at the
	// end to simplify the algorithm.
	pending []event

	// request is used to deliver requests from the public API into
	// the goroutine loop.
	request chan interface{}

	// syncDone contains pending done channels from sync requests.
	syncDone []chan bool

	// next will dispatch when it's time to sync the database
	// knowledge. It's maintained here so that a sync request
	// (StartSync or Sync) can reset it to force a sync sooner.
	next <-chan time.Time
}

// event is a queued notification: the liveness status of key that is
// waiting to be delivered on ch. A nil ch marks an event that has been
// cancelled and must be skipped.
type event struct {
	ch    chan<- Change
	key   string
	alive bool
}

// Change holds a liveness change notification.
type Change struct {
	// Key is the pinger key the notification refers to.
	Key string
	// Alive reports whether the key is now considered alive.
	Alive bool
}

// NewWatcher returns a new Watcher.
108 func NewWatcher(base *mgo.Collection) *Watcher { 109 w := &Watcher{ 110 base: base, 111 pings: pingsC(base), 112 beings: beingsC(base), 113 beingKey: make(map[int64]string), 114 beingSeq: make(map[string]int64), 115 watches: make(map[string][]chan<- Change), 116 request: make(chan interface{}), 117 } 118 go func() { 119 w.tomb.Kill(w.loop()) 120 w.tomb.Done() 121 }() 122 return w 123 } 124 125 // Stop stops all the watcher activities. 126 func (w *Watcher) Stop() error { 127 w.tomb.Kill(nil) 128 return w.tomb.Wait() 129 } 130 131 // Dead returns a channel that is closed when the watcher has stopped. 132 func (w *Watcher) Dead() <-chan struct{} { 133 return w.tomb.Dead() 134 } 135 136 // Err returns the error with which the watcher stopped. 137 // It returns nil if the watcher stopped cleanly, tomb.ErrStillAlive 138 // if the watcher is still running properly, or the respective error 139 // if the watcher is terminating or has terminated with an error. 140 func (w *Watcher) Err() error { 141 return w.tomb.Err() 142 } 143 144 type reqWatch struct { 145 key string 146 ch chan<- Change 147 } 148 149 type reqUnwatch struct { 150 key string 151 ch chan<- Change 152 } 153 154 type reqSync struct { 155 done chan bool 156 } 157 158 type reqAlive struct { 159 key string 160 result chan bool 161 } 162 163 func (w *Watcher) sendReq(req interface{}) { 164 select { 165 case w.request <- req: 166 case <-w.tomb.Dying(): 167 } 168 } 169 170 // Watch starts watching the liveness of key. An event will 171 // be sent onto ch to report the initial status for the key, and 172 // from then on a new event will be sent whenever a change is 173 // detected. Change values sent to the channel must be consumed, 174 // or the whole watcher will blocked. 175 func (w *Watcher) Watch(key string, ch chan<- Change) { 176 w.sendReq(reqWatch{key, ch}) 177 } 178 179 // Unwatch stops watching the liveness of key via ch. 
180 func (w *Watcher) Unwatch(key string, ch chan<- Change) { 181 w.sendReq(reqUnwatch{key, ch}) 182 } 183 184 // StartSync forces the watcher to load new events from the database. 185 func (w *Watcher) StartSync() { 186 w.sendReq(reqSync{nil}) 187 } 188 189 // Sync forces the watcher to load new events from the database and blocks 190 // until all events have been dispatched. 191 func (w *Watcher) Sync() { 192 done := make(chan bool) 193 w.sendReq(reqSync{done}) 194 select { 195 case <-done: 196 case <-w.tomb.Dying(): 197 } 198 } 199 200 // Alive returns whether the key is currently considered alive by w, 201 // or an error in case the watcher is dying. 202 func (w *Watcher) Alive(key string) (bool, error) { 203 result := make(chan bool, 1) 204 w.sendReq(reqAlive{key, result}) 205 var alive bool 206 select { 207 case alive = <-result: 208 case <-w.tomb.Dying(): 209 return false, errors.Newf("cannot check liveness: watcher is dying") 210 } 211 return alive, nil 212 } 213 214 // period is the length of each time slot in seconds. 215 // It's not a time.Duration because the code is more convenient like 216 // this and also because sub-second timings don't work as the slot 217 // identifier is an int64 in seconds. 218 var period int64 = 30 219 220 // loop implements the main watcher loop. 221 func (w *Watcher) loop() error { 222 var err error 223 if w.delta, err = clockDelta(w.base); err != nil { 224 return mask(err) 225 } 226 w.next = time.After(0) 227 for { 228 select { 229 case <-w.tomb.Dying(): 230 return tomb.ErrDying 231 case <-w.next: 232 w.next = time.After(time.Duration(period) * time.Second) 233 syncDone := w.syncDone 234 w.syncDone = nil 235 if err := w.sync(); err != nil { 236 return mask(err) 237 } 238 w.flush() 239 for _, done := range syncDone { 240 close(done) 241 } 242 case req := <-w.request: 243 w.handle(req) 244 w.flush() 245 } 246 } 247 return nil 248 } 249 250 // flush sends all pending events to their respective channels. 
251 func (w *Watcher) flush() { 252 // w.pending may get new requests as we handle other requests. 253 for i := 0; i < len(w.pending); i++ { 254 e := &w.pending[i] 255 for e.ch != nil { 256 select { 257 case <-w.tomb.Dying(): 258 return 259 case req := <-w.request: 260 w.handle(req) 261 continue 262 case e.ch <- Change{e.key, e.alive}: 263 } 264 break 265 } 266 } 267 w.pending = w.pending[:0] 268 } 269 270 // handle deals with requests delivered by the public API 271 // onto the background watcher goroutine. 272 func (w *Watcher) handle(req interface{}) { 273 debugf("state/presence: got request: %#v", req) 274 switch r := req.(type) { 275 case reqSync: 276 w.next = time.After(0) 277 if r.done != nil { 278 w.syncDone = append(w.syncDone, r.done) 279 } 280 case reqWatch: 281 for _, ch := range w.watches[r.key] { 282 if ch == r.ch { 283 panic("adding channel twice for same key") 284 } 285 } 286 w.watches[r.key] = append(w.watches[r.key], r.ch) 287 _, alive := w.beingSeq[r.key] 288 w.pending = append(w.pending, event{r.ch, r.key, alive}) 289 case reqUnwatch: 290 watches := w.watches[r.key] 291 for i, ch := range watches { 292 if ch == r.ch { 293 watches[i] = watches[len(watches)-1] 294 w.watches[r.key] = watches[:len(watches)-1] 295 break 296 } 297 } 298 for i := range w.pending { 299 e := &w.pending[i] 300 if e.key == r.key && e.ch == r.ch { 301 e.ch = nil 302 } 303 } 304 case reqAlive: 305 _, alive := w.beingSeq[r.key] 306 r.result <- alive 307 default: 308 panic(errors.Newf("unknown request: %T", req)) 309 } 310 } 311 312 type beingInfo struct { 313 Seq int64 "_id,omitempty" 314 Key string "key,omitempty" 315 } 316 317 type pingInfo struct { 318 Slot int64 "_id" 319 Alive map[string]int64 ",omitempty" 320 Dead map[string]int64 ",omitempty" 321 } 322 323 func (w *Watcher) findAllBeings() (map[int64]beingInfo, error) { 324 beings := make([]beingInfo, 0) 325 err := w.beings.Find(bson.D{{}}).All(&beings) 326 if err != nil { 327 return nil, mask(err) 328 } 329 beingInfos 
:= make(map[int64]beingInfo, len(beings)) 330 for _, being := range beings { 331 beingInfos[being.Seq] = being 332 } 333 return beingInfos, nil 334 } 335 336 // sync updates the watcher knowledge from the database, and 337 // queues events to observing channels. It fetches the last two time 338 // slots and compares the union of both to the in-memory state. 339 func (w *Watcher) sync() error { 340 var allBeings map[int64]beingInfo 341 if len(w.beingKey) == 0 { 342 // The very first time we sync, we grab all ever-known beings, 343 // so we don't have to look them up one-by-one 344 var err error 345 if allBeings, err = w.findAllBeings(); err != nil { 346 return mask(err) 347 } 348 } 349 slot := timeSlot(time.Now(), w.delta) 350 var ping []pingInfo 351 err := w.pings.Find(bson.D{{"$or", []pingInfo{{Slot: slot}, {Slot: slot - period}}}}).All(&ping) 352 if err != nil && err == mgo.ErrNotFound { 353 return err 354 } 355 356 // Learn about all enforced deaths. 357 dead := make(map[int64]bool) 358 for i := range ping { 359 for key, value := range ping[i].Dead { 360 k, err := strconv.ParseInt(key, 16, 64) 361 if err != nil { 362 panic(errors.Newf("presence cannot parse dead key: %q", key)) 363 } 364 k *= 63 365 for i := int64(0); i < 63 && value > 0; i++ { 366 on := value&1 == 1 367 value >>= 1 368 if !on { 369 continue 370 } 371 seq := k + i 372 dead[seq] = true 373 debugf("state/presence: found seq=%d dead", seq) 374 } 375 } 376 } 377 378 // Learn about all the pingers that reported and queue 379 // events for those that weren't known to be alive and 380 // are not reportedly dead either. 
381 alive := make(map[int64]bool) 382 being := beingInfo{} 383 for i := range ping { 384 for key, value := range ping[i].Alive { 385 k, err := strconv.ParseInt(key, 16, 64) 386 if err != nil { 387 panic(errors.Newf("presence cannot parse alive key: %q", key)) 388 } 389 k *= 63 390 for i := int64(0); i < 63 && value > 0; i++ { 391 on := value&1 == 1 392 value >>= 1 393 if !on { 394 continue 395 } 396 seq := k + i 397 alive[seq] = true 398 if _, ok := w.beingKey[seq]; ok { 399 continue 400 } 401 // Check if the being exists in the 'all' map, 402 // otherwise do a single lookup in mongo 403 var ok bool 404 if being, ok = allBeings[seq]; !ok { 405 err := w.beings.Find(bson.D{{"_id", seq}}).One(&being) 406 if errors.Cause(err) == mgo.ErrNotFound { 407 debugf("state/presence: found seq=%d unowned", seq) 408 continue 409 } else if err != nil { 410 return mask(err) 411 } 412 } 413 cur := w.beingSeq[being.Key] 414 if cur < seq { 415 delete(w.beingKey, cur) 416 } else { 417 // Current sequence is more recent. 418 continue 419 } 420 w.beingKey[seq] = being.Key 421 w.beingSeq[being.Key] = seq 422 if cur > 0 || dead[seq] { 423 continue 424 } 425 debugf("state/presence: found seq=%d alive with key %q", seq, being.Key) 426 for _, ch := range w.watches[being.Key] { 427 w.pending = append(w.pending, event{ch, being.Key, true}) 428 } 429 } 430 } 431 } 432 433 // Pingers that were known to be alive and haven't reported 434 // in the last two slots are now considered dead. Dispatch 435 // the respective events and forget their sequences. 436 for seq, key := range w.beingKey { 437 if dead[seq] || !alive[seq] { 438 delete(w.beingKey, seq) 439 delete(w.beingSeq, key) 440 for _, ch := range w.watches[key] { 441 w.pending = append(w.pending, event{ch, key, false}) 442 } 443 } 444 } 445 return nil 446 } 447 448 // Pinger periodically reports that a specific key is alive, so that 449 // watchers interested on that fact can react appropriately. 
450 type Pinger struct { 451 mu sync.Mutex 452 tomb tomb.Tomb 453 base *mgo.Collection 454 pings *mgo.Collection 455 started bool 456 beingKey string 457 beingSeq int64 458 fieldKey string // hex(beingKey / 63) 459 fieldBit uint64 // 1 << (beingKey%63) 460 lastSlot int64 461 delta time.Duration 462 } 463 464 // NewPinger returns a new Pinger to report that key is alive. 465 // It starts reporting after Start is called. 466 func NewPinger(base *mgo.Collection, key string) *Pinger { 467 return &Pinger{base: base, pings: pingsC(base), beingKey: key} 468 } 469 470 // Start starts periodically reporting that p's key is alive. 471 func (p *Pinger) Start() error { 472 p.mu.Lock() 473 defer p.mu.Unlock() 474 if p.started { 475 return errors.Newf("pinger already started") 476 } 477 p.tomb = tomb.Tomb{} 478 if err := p.prepare(); err != nil { 479 return mask(err) 480 } 481 debugf("state/presence: starting pinger for %q with seq=%d", p.beingKey, p.beingSeq) 482 if err := p.ping(); err != nil { 483 return mask(err) 484 } 485 p.started = true 486 go func() { 487 p.tomb.Kill(p.loop()) 488 p.tomb.Done() 489 }() 490 return nil 491 } 492 493 // Stop stops p's periodical ping. 494 // Watchers will not notice p has stopped pinging until the 495 // previous ping times out. 496 func (p *Pinger) Stop() error { 497 p.mu.Lock() 498 defer p.mu.Unlock() 499 if p.started { 500 debugf("state/presence: stopping pinger for %q with seq=%d", p.beingKey, p.beingSeq) 501 } 502 p.tomb.Kill(nil) 503 err := p.tomb.Wait() 504 // TODO ping one more time to guarantee a late timeout. 505 p.started = false 506 return err 507 508 } 509 510 // Stop stops p's periodical ping and immediately report that it is dead. 
511 func (p *Pinger) Kill() error { 512 p.mu.Lock() 513 defer p.mu.Unlock() 514 if p.started { 515 debugf("state/presence: killing pinger for %q (was started)", p.beingKey) 516 return p.killStarted() 517 } 518 debugf("state/presence: killing pinger for %q (was stopped)", p.beingKey) 519 return p.killStopped() 520 } 521 522 // killStarted kills the pinger while it is running, by first 523 // stopping it and then recording in the last pinged slot that 524 // the pinger was killed. 525 func (p *Pinger) killStarted() error { 526 p.tomb.Kill(nil) 527 killErr := p.tomb.Wait() 528 p.started = false 529 530 slot := p.lastSlot 531 udoc := bson.D{{"$inc", bson.D{{"dead." + p.fieldKey, p.fieldBit}}}} 532 if _, err := p.pings.UpsertId(slot, udoc); err != nil { 533 return mask(err) 534 } 535 return killErr 536 } 537 538 // killStopped kills the pinger while it is not running, by 539 // first allocating a new sequence, and then atomically recording 540 // the new sequence both as alive and dead at once. 541 func (p *Pinger) killStopped() error { 542 if err := p.prepare(); err != nil { 543 return mask(err) 544 } 545 slot := timeSlot(time.Now(), p.delta) 546 udoc := bson.D{{"$inc", bson.D{ 547 {"dead." + p.fieldKey, p.fieldBit}, 548 {"alive." + p.fieldKey, p.fieldBit}, 549 }}} 550 _, err := p.pings.UpsertId(slot, udoc) 551 return err 552 } 553 554 // loop is the main pinger loop that runs while it is 555 // in started state. 556 func (p *Pinger) loop() error { 557 for { 558 select { 559 case <-p.tomb.Dying(): 560 return tomb.ErrDying 561 case <-time.After(time.Duration(float64(period+1)*0.75) * time.Second): 562 if err := p.ping(); err != nil { 563 return mask(err) 564 } 565 } 566 } 567 } 568 569 // prepare allocates a new unique sequence for the 570 // pinger key and prepares the pinger to use it. 
571 func (p *Pinger) prepare() error { 572 change := mgo.Change{ 573 Update: bson.D{{"$inc", bson.D{{"seq", int64(1)}}}}, 574 Upsert: true, 575 ReturnNew: true, 576 } 577 seqs := seqsC(p.base) 578 var seq struct{ Seq int64 } 579 if _, err := seqs.FindId("beings").Apply(change, &seq); err != nil { 580 return mask(err) 581 } 582 p.beingSeq = seq.Seq 583 p.fieldKey = fmt.Sprintf("%x", p.beingSeq/63) 584 p.fieldBit = 1 << uint64(p.beingSeq%63) 585 p.lastSlot = 0 586 beings := beingsC(p.base) 587 return beings.Insert(beingInfo{p.beingSeq, p.beingKey}) 588 } 589 590 // ping records updates the current time slot with the 591 // sequence in use by the pinger. 592 func (p *Pinger) ping() error { 593 debugf("state/presence: pinging %q with seq=%d", p.beingKey, p.beingSeq) 594 if p.delta == 0 { 595 delta, err := clockDelta(p.base) 596 if err != nil { 597 return mask(err) 598 } 599 p.delta = delta 600 } 601 slot := timeSlot(time.Now(), p.delta) 602 if slot == p.lastSlot { 603 // Never, ever, ping the same slot twice. 604 // The increment below would corrupt the slot. 605 return nil 606 } 607 p.lastSlot = slot 608 if _, err := p.pings.UpsertId(slot, bson.D{{"$inc", bson.D{{"alive." + p.fieldKey, p.fieldBit}}}}); err != nil { 609 return mask(err) 610 } 611 return nil 612 } 613 614 // clockDelta returns the approximate skew between 615 // the local clock and the database clock. 616 func clockDelta(c *mgo.Collection) (time.Duration, error) { 617 var server struct { 618 time.Time "retval" 619 } 620 var isMaster struct { 621 LocalTime time.Time "localTime" 622 } 623 var after time.Time 624 var before time.Time 625 var serverDelay time.Duration 626 supportsMasterLocalTime := true 627 for i := 0; i < 10; i++ { 628 if supportsMasterLocalTime { 629 // Try isMaster.localTime, which is present since MongoDB 2.2 630 // and does not require admin privileges. 
631 before = time.Now() 632 err := c.Database.Run("isMaster", &isMaster) 633 after = time.Now() 634 if err != nil { 635 return 0, mask(err) 636 } 637 if isMaster.LocalTime.IsZero() { 638 supportsMasterLocalTime = false 639 continue 640 } else { 641 serverDelay = isMaster.LocalTime.Sub(before) 642 } 643 } else { 644 // If MongoDB doesn't have localTime as part of 645 // isMaster result, it means that the server is likely 646 // a MongoDB older than 2.2. 647 // 648 // Fallback to 'eval' works fine on versions older than 649 // 2.4 where it does not require admin privileges. 650 // 651 // NOTE: 'eval' takes a global write lock unless you 652 // specify 'nolock' (which we are not doing below, for 653 // no apparent reason), so it is quite likely that the 654 // eval could take a relatively long time to acquire 655 // the lock and thus cause a retry on the callDelay 656 // check below on a busy server. 657 before = time.Now() 658 err := c.Database.Run(bson.D{{"$eval", "function() { return new Date(); }"}}, &server) 659 after = time.Now() 660 if err != nil { 661 return 0, mask(err) 662 } 663 serverDelay = server.Sub(before) 664 } 665 // If the call to the server takes longer than a few seconds we 666 // retry it a couple more times before giving up. It is unclear 667 // why the retry would help at all here. 668 // 669 // If the server takes longer than the specified amount of time 670 // on every single try, then we simply give up. 671 callDelay := after.Sub(before) 672 if callDelay > 5*time.Second { 673 continue 674 } 675 return serverDelay, nil 676 } 677 return 0, errors.Newf("cannot synchronize clock with database server") 678 } 679 680 // timeSlot returns the current time slot, in seconds since the 681 // epoch, for the provided now time. The delta skew is applied 682 // to the now time to improve the synchronization with a 683 // centrally agreed time. 
684 // 685 // The result of this method may be manipulated for test purposes 686 // by fakeTimeSlot and realTimeSlot. 687 func timeSlot(now time.Time, delta time.Duration) int64 { 688 fakeMutex.Lock() 689 fake := !fakeNow.IsZero() 690 if fake { 691 now = fakeNow 692 } 693 slot := now.Add(delta).Unix() 694 slot -= slot % period 695 if fake { 696 slot += int64(fakeOffset) * period 697 } 698 fakeMutex.Unlock() 699 return slot 700 } 701 702 var ( 703 fakeMutex sync.Mutex // protects fakeOffset, fakeNow 704 fakeNow time.Time 705 fakeOffset int 706 ) 707 708 // fakeTimeSlot hardcodes the slot time returned by the timeSlot 709 // function for testing purposes. The offset parameter is the slot 710 // position to return: offsets +1 and -1 are +period and -period 711 // seconds from slot 0, respectively. 712 func fakeTimeSlot(offset int) { 713 fakeMutex.Lock() 714 if fakeNow.IsZero() { 715 fakeNow = time.Now() 716 } 717 fakeOffset = offset 718 fakeMutex.Unlock() 719 log.Infof("state/presence: Faking presence to time slot %d", offset) 720 } 721 722 // realTimeSlot disables the hardcoding introduced by fakeTimeSlot. 723 func realTimeSlot() { 724 fakeMutex.Lock() 725 fakeNow = time.Time{} 726 fakeOffset = 0 727 fakeMutex.Unlock() 728 log.Infof("state/presence: Not faking presence time. Real time slot in use.") 729 } 730 731 func seqsC(base *mgo.Collection) *mgo.Collection { 732 return base.Database.C(base.Name + ".seqs") 733 } 734 735 func beingsC(base *mgo.Collection) *mgo.Collection { 736 return base.Database.C(base.Name + ".beings") 737 } 738 739 func pingsC(base *mgo.Collection) *mgo.Collection { 740 return base.Database.C(base.Name + ".pings") 741 } 742 743 func debugf(f string, a ...interface{}) { 744 if Debug { 745 log.Debugf(f, a...) 746 } 747 }