github.com/mattyw/juju@v0.0.0-20140610034352-732aecd63861/state/presence/presence.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

// The presence package implements an interface for observing liveness
// of arbitrary keys (agents, processes, etc.) on top of MongoDB.
// The design works by periodically updating the database so that
// watchers can tell an arbitrary key is alive.
package presence

import (
	"fmt"
	"strconv"
	"sync"
	"time"

	"github.com/juju/loggo"
	"labix.org/v2/mgo"
	"labix.org/v2/mgo/bson"
	"launchpad.net/tomb"
)

var logger = loggo.GetLogger("juju.state.presence")

// The implementation works by assigning a unique sequence number to each
// pinger that is alive, and the pinger is then responsible for
// periodically updating the current time slot document with its
// sequence number so that watchers can tell it is alive.
//
// The internal implementation of the time slot document is as follows:
//
//	{
//		"_id":   <time slot>,
//		"alive": { hex(<pinger seq> / 63) : (1 << (<pinger seq> % 63) | <others>) },
//		"dead":  { hex(<pinger seq> / 63) : (1 << (<pinger seq> % 63) | <others>) },
//	}
//
// All pingers that have their sequence number under "alive" and not
// under "dead" are currently alive. This design enables implementing
// a ping with a single update operation, a kill with another operation,
// and obtaining liveness data with a single query that returns two
// documents (the last two time slots).
//
// A new pinger sequence is obtained every time a pinger starts by
// atomically incrementing a counter in a globally used document in a
// helper collection. That sequence number is then inserted into the
// beings collection to establish the mapping between pinger sequence
// and key.

// BUG(gn): The pings and beings collections currently grow without bound.

// A Watcher can watch any number of pinger keys for liveness changes.
type Watcher struct {
	tomb   tomb.Tomb
	base   *mgo.Collection
	pings  *mgo.Collection
	beings *mgo.Collection

	// delta is the approximate clock skew between the local system
	// clock and the database clock.
	delta time.Duration

	// beingKey and beingSeq hold the pinger seq <=> key mappings.
	// Entries in these maps are considered alive.
	beingKey map[int64]string
	beingSeq map[string]int64

	// watches holds the per-key observer channels from Watch/Unwatch.
	watches map[string][]chan<- Change

	// pending contains all the events to be dispatched to the watcher
	// channels. They're queued during processing and flushed at the
	// end to simplify the algorithm.
	pending []event

	// request is used to deliver requests from the public API into
	// the goroutine loop.
	request chan interface{}

	// syncDone contains pending done channels from sync requests.
	syncDone []chan bool

	// next will dispatch when it's time to sync the database
	// knowledge. It's maintained here so that StartSync
	// can manipulate it to force a sync sooner.
	next <-chan time.Time
}

type event struct {
	ch    chan<- Change
	key   string
	alive bool
}

// Change holds a liveness change notification.
type Change struct {
	Key   string
	Alive bool
}
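// A minimal usage sketch tying the watcher types together (the dial
// address, database, and collection names here are hypothetical):
//
//	session, err := mgo.Dial("localhost:27017")
//	if err != nil {
//		panic(err)
//	}
//	defer session.Close()
//	base := session.DB("presence").C("presence")
//
//	w := NewWatcher(base)
//	defer w.Stop()
//	ch := make(chan Change)
//	w.Watch("agent-a", ch) // first event reports the initial status
//	go func() {
//		for change := range ch {
//			fmt.Printf("%s alive=%v\n", change.Key, change.Alive)
//		}
//	}()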
// NewWatcher returns a new Watcher.
func NewWatcher(base *mgo.Collection) *Watcher {
	w := &Watcher{
		base:     base,
		pings:    pingsC(base),
		beings:   beingsC(base),
		beingKey: make(map[int64]string),
		beingSeq: make(map[string]int64),
		watches:  make(map[string][]chan<- Change),
		request:  make(chan interface{}),
	}
	go func() {
		w.tomb.Kill(w.loop())
		w.tomb.Done()
	}()
	return w
}

// Stop stops all the watcher activities.
func (w *Watcher) Stop() error {
	w.tomb.Kill(nil)
	return w.tomb.Wait()
}

// Dead returns a channel that is closed when the watcher has stopped.
func (w *Watcher) Dead() <-chan struct{} {
	return w.tomb.Dead()
}

// Err returns the error with which the watcher stopped.
// It returns nil if the watcher stopped cleanly, tomb.ErrStillAlive
// if the watcher is still running properly, or the respective error
// if the watcher is terminating or has terminated with an error.
func (w *Watcher) Err() error {
	return w.tomb.Err()
}

type reqWatch struct {
	key string
	ch  chan<- Change
}

type reqUnwatch struct {
	key string
	ch  chan<- Change
}

type reqSync struct {
	done chan bool
}

type reqAlive struct {
	key    string
	result chan bool
}

func (w *Watcher) sendReq(req interface{}) {
	select {
	case w.request <- req:
	case <-w.tomb.Dying():
	}
}

// Watch starts watching the liveness of key. An event will
// be sent onto ch to report the initial status for the key, and
// from then on a new event will be sent whenever a change is
// detected. Change values sent to the channel must be consumed,
// or the whole watcher will block.
func (w *Watcher) Watch(key string, ch chan<- Change) {
	w.sendReq(reqWatch{key, ch})
}

// Unwatch stops watching the liveness of key via ch.
func (w *Watcher) Unwatch(key string, ch chan<- Change) {
	w.sendReq(reqUnwatch{key, ch})
}

// StartSync forces the watcher to load new events from the database.
func (w *Watcher) StartSync() {
	w.sendReq(reqSync{nil})
}

// Sync forces the watcher to load new events from the database and blocks
// until all events have been dispatched.
func (w *Watcher) Sync() {
	done := make(chan bool)
	w.sendReq(reqSync{done})
	select {
	case <-done:
	case <-w.tomb.Dying():
	}
}

// Alive returns whether the key is currently considered alive by w,
// or an error in case the watcher is dying.
func (w *Watcher) Alive(key string) (bool, error) {
	result := make(chan bool, 1)
	w.sendReq(reqAlive{key, result})
	var alive bool
	select {
	case alive = <-result:
	case <-w.tomb.Dying():
		return false, fmt.Errorf("cannot check liveness: watcher is dying")
	}
	return alive, nil
}

// period is the length of each time slot in seconds.
// It's not a time.Duration because the code is more convenient like
// this and also because sub-second timings don't work as the slot
// identifier is an int64 in seconds.
var period int64 = 30
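// Sync and StartSync exist largely so tests and callers need not wait a
// full period for the watcher to catch up. A brief sketch (the key name
// is hypothetical):
//
//	w.StartSync() // schedule an early sync and return immediately
//	w.Sync()      // force a sync and block until events are dispatched
//	alive, err := w.Alive("agent-a")
//	if err == nil && alive {
//		// "agent-a" pinged during one of the last two time slots
//	}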
// loop implements the main watcher loop.
func (w *Watcher) loop() error {
	var err error
	if w.delta, err = clockDelta(w.base); err != nil {
		return err
	}
	w.next = time.After(0)
	for {
		select {
		case <-w.tomb.Dying():
			return tomb.ErrDying
		case <-w.next:
			w.next = time.After(time.Duration(period) * time.Second)
			syncDone := w.syncDone
			w.syncDone = nil
			if err := w.sync(); err != nil {
				return err
			}
			w.flush()
			for _, done := range syncDone {
				close(done)
			}
		case req := <-w.request:
			w.handle(req)
			w.flush()
		}
	}
}

// flush sends all pending events to their respective channels.
func (w *Watcher) flush() {
	// w.pending may get new requests as we handle other requests.
	for i := 0; i < len(w.pending); i++ {
		e := &w.pending[i]
		for e.ch != nil {
			select {
			case <-w.tomb.Dying():
				return
			case req := <-w.request:
				w.handle(req)
				continue
			case e.ch <- Change{e.key, e.alive}:
			}
			break
		}
	}
	w.pending = w.pending[:0]
}

// handle deals with requests delivered by the public API
// onto the background watcher goroutine.
func (w *Watcher) handle(req interface{}) {
	logger.Tracef("got request: %#v", req)
	switch r := req.(type) {
	case reqSync:
		w.next = time.After(0)
		if r.done != nil {
			w.syncDone = append(w.syncDone, r.done)
		}
	case reqWatch:
		for _, ch := range w.watches[r.key] {
			if ch == r.ch {
				panic("adding channel twice for same key")
			}
		}
		w.watches[r.key] = append(w.watches[r.key], r.ch)
		_, alive := w.beingSeq[r.key]
		w.pending = append(w.pending, event{r.ch, r.key, alive})
	case reqUnwatch:
		watches := w.watches[r.key]
		for i, ch := range watches {
			if ch == r.ch {
				watches[i] = watches[len(watches)-1]
				w.watches[r.key] = watches[:len(watches)-1]
				break
			}
		}
		for i := range w.pending {
			e := &w.pending[i]
			if e.key == r.key && e.ch == r.ch {
				e.ch = nil
			}
		}
	case reqAlive:
		_, alive := w.beingSeq[r.key]
		r.result <- alive
	default:
		panic(fmt.Errorf("unknown request: %T", req))
	}
}

type beingInfo struct {
	Seq int64  "_id,omitempty"
	Key string "key,omitempty"
}

type pingInfo struct {
	Slot  int64            "_id"
	Alive map[string]int64 ",omitempty"
	Dead  map[string]int64 ",omitempty"
}

func (w *Watcher) findAllBeings() (map[int64]beingInfo, error) {
	beings := make([]beingInfo, 0)
	err := w.beings.Find(bson.D{{}}).All(&beings)
	if err != nil {
		return nil, err
	}
	beingInfos := make(map[int64]beingInfo, len(beings))
	for _, being := range beings {
		beingInfos[being.Seq] = being
	}
	return beingInfos, nil
}
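// decodeSlot is an illustrative sketch of the bit-packing scheme read by
// sync below: each field name in the packed "alive"/"dead" maps is
// hex(seq/63), and bit i of its value marks sequence 63*fieldKey + i.
// The helper is hypothetical and not used by the watcher itself; for a
// map {"1": 8} it returns [66].
func decodeSlot(packed map[string]int64) []int64 {
	var seqs []int64
	for key, value := range packed {
		k, err := strconv.ParseInt(key, 16, 64)
		if err != nil {
			continue // sync panics here instead; skipping keeps the sketch simple
		}
		k *= 63
		for i := int64(0); i < 63 && value > 0; i++ {
			if value&1 == 1 {
				seqs = append(seqs, k+i)
			}
			value >>= 1
		}
	}
	return seqs
}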
// sync updates the watcher knowledge from the database, and
// queues events to observing channels. It fetches the last two time
// slots and compares the union of both to the in-memory state.
func (w *Watcher) sync() error {
	var allBeings map[int64]beingInfo
	if len(w.beingKey) == 0 {
		// The very first time we sync, grab all ever-known beings
		// so we don't have to look them up one by one.
		var err error
		if allBeings, err = w.findAllBeings(); err != nil {
			return err
		}
	}
	slot := timeSlot(time.Now(), w.delta)
	var ping []pingInfo
	err := w.pings.Find(bson.D{{"$or", []pingInfo{{Slot: slot}, {Slot: slot - period}}}}).All(&ping)
	if err != nil && err != mgo.ErrNotFound {
		return err
	}

	// Learn about all enforced deaths.
	dead := make(map[int64]bool)
	for i := range ping {
		for key, value := range ping[i].Dead {
			k, err := strconv.ParseInt(key, 16, 64)
			if err != nil {
				panic(fmt.Errorf("presence cannot parse dead key: %q", key))
			}
			k *= 63
			for i := int64(0); i < 63 && value > 0; i++ {
				on := value&1 == 1
				value >>= 1
				if !on {
					continue
				}
				seq := k + i
				dead[seq] = true
				logger.Tracef("found seq=%d dead", seq)
			}
		}
	}

	// Learn about all the pingers that reported and queue
	// events for those that weren't known to be alive and
	// are not reportedly dead either.
	alive := make(map[int64]bool)
	being := beingInfo{}
	for i := range ping {
		for key, value := range ping[i].Alive {
			k, err := strconv.ParseInt(key, 16, 64)
			if err != nil {
				panic(fmt.Errorf("presence cannot parse alive key: %q", key))
			}
			k *= 63
			for i := int64(0); i < 63 && value > 0; i++ {
				on := value&1 == 1
				value >>= 1
				if !on {
					continue
				}
				seq := k + i
				alive[seq] = true
				if _, ok := w.beingKey[seq]; ok {
					continue
				}
				// Check if the being exists in the 'all' map;
				// otherwise do a single lookup in mongo.
				var ok bool
				if being, ok = allBeings[seq]; !ok {
					err := w.beings.Find(bson.D{{"_id", seq}}).One(&being)
					if err == mgo.ErrNotFound {
						logger.Tracef("found seq=%d unowned", seq)
						continue
					} else if err != nil {
						return err
					}
				}
				cur := w.beingSeq[being.Key]
				if cur < seq {
					delete(w.beingKey, cur)
				} else {
					// Current sequence is more recent.
					continue
				}
				w.beingKey[seq] = being.Key
				w.beingSeq[being.Key] = seq
				if cur > 0 || dead[seq] {
					continue
				}
				logger.Tracef("found seq=%d alive with key %q", seq, being.Key)
				for _, ch := range w.watches[being.Key] {
					w.pending = append(w.pending, event{ch, being.Key, true})
				}
			}
		}
	}

	// Pingers that were known to be alive and haven't reported
	// in the last two slots are now considered dead. Dispatch
	// the respective events and forget their sequences.
	for seq, key := range w.beingKey {
		if dead[seq] || !alive[seq] {
			delete(w.beingKey, seq)
			delete(w.beingSeq, key)
			for _, ch := range w.watches[key] {
				w.pending = append(w.pending, event{ch, key, false})
			}
		}
	}
	return nil
}

// Pinger periodically reports that a specific key is alive, so that
// watchers interested in that fact can react appropriately.
type Pinger struct {
	mu       sync.Mutex
	tomb     tomb.Tomb
	base     *mgo.Collection
	pings    *mgo.Collection
	started  bool
	beingKey string
	beingSeq int64
	fieldKey string // hex(beingSeq / 63)
	fieldBit uint64 // 1 << (beingSeq % 63)
	lastSlot int64
	delta    time.Duration
}
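// A sketch of the pinger lifecycle around the methods below (the key
// name is hypothetical):
//
//	p := NewPinger(base, "agent-a")
//	if err := p.Start(); err != nil {
//		panic(err)
//	}
//	// ... "agent-a" is now reported alive once per slot ...
//	p.Stop() // stop quietly; watchers notice once the last slot expires
//	// or, to have watchers notice right away:
//	p.Kill() // record the sequence as dead immediately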
// NewPinger returns a new Pinger to report that key is alive.
// It starts reporting after Start is called.
func NewPinger(base *mgo.Collection, key string) *Pinger {
	return &Pinger{base: base, pings: pingsC(base), beingKey: key}
}

// Start starts periodically reporting that p's key is alive.
func (p *Pinger) Start() error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.started {
		return fmt.Errorf("pinger already started")
	}
	p.tomb = tomb.Tomb{}
	if err := p.prepare(); err != nil {
		return err
	}
	logger.Tracef("starting pinger for %q with seq=%d", p.beingKey, p.beingSeq)
	if err := p.ping(); err != nil {
		return err
	}
	p.started = true
	go func() {
		p.tomb.Kill(p.loop())
		p.tomb.Done()
	}()
	return nil
}

// Stop stops p's periodic pings.
// Watchers will not notice p has stopped pinging until the
// previous ping times out.
func (p *Pinger) Stop() error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.started {
		logger.Tracef("stopping pinger for %q with seq=%d", p.beingKey, p.beingSeq)
	}
	p.tomb.Kill(nil)
	err := p.tomb.Wait()
	// TODO ping one more time to guarantee a late timeout.
	p.started = false
	return err
}

// Kill stops p's periodic pings and immediately reports that it is dead.
func (p *Pinger) Kill() error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.started {
		logger.Tracef("killing pinger for %q (was started)", p.beingKey)
		return p.killStarted()
	}
	logger.Tracef("killing pinger for %q (was stopped)", p.beingKey)
	return p.killStopped()
}

// killStarted kills the pinger while it is running, by first
// stopping it and then recording in the last pinged slot that
// the pinger was killed.
func (p *Pinger) killStarted() error {
	p.tomb.Kill(nil)
	killErr := p.tomb.Wait()
	p.started = false

	slot := p.lastSlot
	udoc := bson.D{{"$inc", bson.D{{"dead." + p.fieldKey, p.fieldBit}}}}
	if _, err := p.pings.UpsertId(slot, udoc); err != nil {
		return err
	}
	return killErr
}

// killStopped kills the pinger while it is not running, by
// first allocating a new sequence, and then atomically recording
// the new sequence both as alive and dead at once.
func (p *Pinger) killStopped() error {
	if err := p.prepare(); err != nil {
		return err
	}
	slot := timeSlot(time.Now(), p.delta)
	udoc := bson.D{{"$inc", bson.D{
		{"dead." + p.fieldKey, p.fieldBit},
		{"alive." + p.fieldKey, p.fieldBit},
	}}}
	_, err := p.pings.UpsertId(slot, udoc)
	return err
}

// loop is the main pinger loop that runs while the pinger
// is in the started state.
func (p *Pinger) loop() error {
	for {
		select {
		case <-p.tomb.Dying():
			return tomb.ErrDying
		case <-time.After(time.Duration(float64(period+1)*0.75) * time.Second):
			if err := p.ping(); err != nil {
				return err
			}
		}
	}
}
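// As a concrete illustration of the documents the methods below write: a
// pinger allocated beingSeq=66 computes fieldKey="1" (hex(66/63)) and
// fieldBit=8 (1<<(66%63)), so after one ping the slot document reads
//
//	{ "_id": <slot>, "alive": { "1": 8 } }
//
// and killStopped would set the same bit under "dead" as well.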
// prepare allocates a new unique sequence for the
// pinger key and prepares the pinger to use it.
func (p *Pinger) prepare() error {
	change := mgo.Change{
		Update:    bson.D{{"$inc", bson.D{{"seq", int64(1)}}}},
		Upsert:    true,
		ReturnNew: true,
	}
	seqs := seqsC(p.base)
	var seq struct{ Seq int64 }
	if _, err := seqs.FindId("beings").Apply(change, &seq); err != nil {
		return err
	}
	p.beingSeq = seq.Seq
	p.fieldKey = fmt.Sprintf("%x", p.beingSeq/63)
	p.fieldBit = 1 << uint64(p.beingSeq%63)
	p.lastSlot = 0
	beings := beingsC(p.base)
	return beings.Insert(beingInfo{p.beingSeq, p.beingKey})
}

// ping updates the current time slot document with the
// sequence in use by the pinger.
func (p *Pinger) ping() error {
	logger.Tracef("pinging %q with seq=%d", p.beingKey, p.beingSeq)
	if p.delta == 0 {
		delta, err := clockDelta(p.base)
		if err != nil {
			return err
		}
		p.delta = delta
	}
	slot := timeSlot(time.Now(), p.delta)
	if slot == p.lastSlot {
		// Never, ever, ping the same slot twice.
		// The increment below would corrupt the slot.
		return nil
	}
	p.lastSlot = slot
	if _, err := p.pings.UpsertId(slot, bson.D{{"$inc", bson.D{{"alive." + p.fieldKey, p.fieldBit}}}}); err != nil {
		return err
	}
	return nil
}

// clockDelta returns the approximate skew between
// the local clock and the database clock.
func clockDelta(c *mgo.Collection) (time.Duration, error) {
	var server struct {
		time.Time "retval"
	}
	var isMaster struct {
		LocalTime time.Time "localTime"
	}
	var after time.Time
	var before time.Time
	var serverDelay time.Duration
	supportsMasterLocalTime := true
	for i := 0; i < 10; i++ {
		if supportsMasterLocalTime {
			// Try isMaster.localTime, which is present since MongoDB 2.2
			// and does not require admin privileges.
			before = time.Now()
			err := c.Database.Run("isMaster", &isMaster)
			after = time.Now()
			if err != nil {
				return 0, err
			}
			if isMaster.LocalTime.IsZero() {
				supportsMasterLocalTime = false
				continue
			} else {
				serverDelay = isMaster.LocalTime.Sub(before)
			}
		} else {
			// If MongoDB doesn't include localTime in the isMaster
			// result, the server is likely older than 2.2.
			//
			// Falling back to 'eval' works fine on versions older
			// than 2.4, where it does not require admin privileges.
			//
			// NOTE: 'eval' takes a global write lock unless you
			// specify 'nolock' (which we are not doing below, for
			// no apparent reason), so it is quite likely that the
			// eval could take a relatively long time to acquire
			// the lock and thus cause a retry on the callDelay
			// check below on a busy server.
			before = time.Now()
			err := c.Database.Run(bson.D{{"$eval", "function() { return new Date(); }"}}, &server)
			after = time.Now()
			if err != nil {
				return 0, err
			}
			serverDelay = server.Sub(before)
		}
		// If the call to the server takes longer than a few seconds we
		// retry it a couple more times before giving up. It is unclear
		// why the retry would help at all here.
		//
		// If the server takes longer than the specified amount of time
		// on every single try, then we simply give up.
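		//
		// As a worked example (times are hypothetical): if the local
		// clock reads 10:00:00.0 at "before", the server reports
		// 10:00:05.2 as its local time, and the call returns at
		// 10:00:00.4 ("after"), then serverDelay is 5.2s and callDelay
		// is 0.4s, so the skew is accepted; a round trip above 5s would
		// instead trigger a retry.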
		callDelay := after.Sub(before)
		if callDelay > 5*time.Second {
			continue
		}
		return serverDelay, nil
	}
	return 0, fmt.Errorf("cannot synchronize clock with database server")
}

// timeSlot returns the current time slot, in seconds since the
// epoch, for the provided now time. The delta skew is applied
// to the now time to improve the synchronization with a
// centrally agreed time.
//
// The result of this method may be manipulated for test purposes
// by fakeTimeSlot and realTimeSlot.
func timeSlot(now time.Time, delta time.Duration) int64 {
	fakeMutex.Lock()
	fake := !fakeNow.IsZero()
	if fake {
		now = fakeNow
	}
	slot := now.Add(delta).Unix()
	slot -= slot % period
	if fake {
		slot += int64(fakeOffset) * period
	}
	fakeMutex.Unlock()
	return slot
}

var (
	fakeMutex  sync.Mutex // protects fakeOffset, fakeNow
	fakeNow    time.Time
	fakeOffset int
)

// fakeTimeSlot hardcodes the slot time returned by the timeSlot
// function for testing purposes. The offset parameter is the slot
// position to return: offsets +1 and -1 are +period and -period
// seconds from slot 0, respectively.
func fakeTimeSlot(offset int) {
	fakeMutex.Lock()
	if fakeNow.IsZero() {
		fakeNow = time.Now()
	}
	fakeOffset = offset
	fakeMutex.Unlock()
	logger.Infof("faking presence to time slot %d", offset)
}

// realTimeSlot disables the hardcoding introduced by fakeTimeSlot.
func realTimeSlot() {
	fakeMutex.Lock()
	fakeNow = time.Time{}
	fakeOffset = 0
	fakeMutex.Unlock()
	logger.Infof("not faking presence time. Real time slot in use.")
}

func seqsC(base *mgo.Collection) *mgo.Collection {
	return base.Database.C(base.Name + ".seqs")
}

func beingsC(base *mgo.Collection) *mgo.Collection {
	return base.Database.C(base.Name + ".beings")
}

func pingsC(base *mgo.Collection) *mgo.Collection {
	return base.Database.C(base.Name + ".pings")
}
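// A worked example of the slot arithmetic above (values hypothetical):
// with period=30 and delta=0, a now of Unix time 1400000007 yields
// 1400000007 - (1400000007 % 30) = 1399999980, and every time within the
// same 30s window maps to that same slot identifier. For a base
// collection named "presence", the helper collections above resolve to
// "presence.seqs", "presence.beings", and "presence.pings".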