github.com/cloudbase/juju-core@v0.0.0-20140504232958-a7271ac7912f/state/presence/presence.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

// The presence package implements an interface for observing liveness
// of arbitrary keys (agents, processes, etc) on top of MongoDB.
// The design works by periodically updating the database so that
// watchers can tell an arbitrary key is alive.
package presence

import (
	"fmt"
	"strconv"
	"sync"
	"time"

	"labix.org/v2/mgo"
	"labix.org/v2/mgo/bson"
	"launchpad.net/tomb"

	"launchpad.net/juju-core/log"
)

// Debug specifies whether the package will log debug
// messages.
// TODO(rog) allow debug level setting in the log package.
var Debug = false

// The implementation works by assigning a unique sequence number to each
// pinger that is alive, and the pinger is then responsible for
// periodically updating the current time slot document with its
// sequence number so that watchers can tell it is alive.
//
// The internal implementation of the time slot document is as follows:
//
//	{
//	    "_id":   <time slot>,
//	    "alive": { hex(<pinger seq> / 63) : (1 << (<pinger seq> % 63) | <others>) },
//	    "dead":  { hex(<pinger seq> / 63) : (1 << (<pinger seq> % 63) | <others>) },
//	}
//
// For example, a pinger with sequence 130 is recorded under the field
// "2" (hex(130/63)) with the bit 1<<4 (130%63 == 4) set.
//
// All pingers that have their sequence number under "alive" and not
// under "dead" are currently alive. This design enables implementing
// a ping with a single update operation, a kill with another operation,
// and obtaining liveness data with a single query that returns two
// documents (the last two time slots).
//
// A new pinger sequence is obtained every time a pinger starts by
// atomically incrementing a counter in a globally used document in a
// helper collection. That sequence number is then inserted into the
// beings collection to establish the mapping between pinger sequence
// and key.

// BUG(gn): The pings and beings collections currently grow without bound.

// A Watcher can watch any number of pinger keys for liveness changes.
type Watcher struct {
	tomb   tomb.Tomb
	base   *mgo.Collection
	pings  *mgo.Collection
	beings *mgo.Collection

	// delta is an approximate clock skew between the local system
	// clock and the database clock.
	delta time.Duration

	// beingKey and beingSeq are the pinger seq <=> key mappings.
	// Entries in these maps are considered alive.
	beingKey map[int64]string
	beingSeq map[string]int64

	// watches has the per-key observer channels from Watch/Unwatch.
	watches map[string][]chan<- Change

	// pending contains all the events to be dispatched to the watcher
	// channels. They're queued during processing and flushed at the
	// end to simplify the algorithm.
	pending []event

	// request is used to deliver requests from the public API into
	// the goroutine loop.
	request chan interface{}

	// syncDone contains pending done channels from sync requests.
	syncDone []chan bool

	// next will dispatch when it's time to sync the database
	// knowledge. It's maintained here so that ForceRefresh
	// can manipulate it to force a sync sooner.
	next <-chan time.Time
}

type event struct {
	ch    chan<- Change
	key   string
	alive bool
}

// Change holds a liveness change notification.
type Change struct {
	Key   string
	Alive bool
}

// NewWatcher returns a new Watcher.
func NewWatcher(base *mgo.Collection) *Watcher {
	w := &Watcher{
		base:     base,
		pings:    pingsC(base),
		beings:   beingsC(base),
		beingKey: make(map[int64]string),
		beingSeq: make(map[string]int64),
		watches:  make(map[string][]chan<- Change),
		request:  make(chan interface{}),
	}
	go func() {
		w.tomb.Kill(w.loop())
		w.tomb.Done()
	}()
	return w
}

// Stop stops all the watcher activities.
func (w *Watcher) Stop() error {
	w.tomb.Kill(nil)
	return w.tomb.Wait()
}

// Dead returns a channel that is closed when the watcher has stopped.
func (w *Watcher) Dead() <-chan struct{} {
	return w.tomb.Dead()
}

// Err returns the error with which the watcher stopped.
// It returns nil if the watcher stopped cleanly, tomb.ErrStillAlive
// if the watcher is still running properly, or the respective error
// if the watcher is terminating or has terminated with an error.
func (w *Watcher) Err() error {
	return w.tomb.Err()
}

type reqWatch struct {
	key string
	ch  chan<- Change
}

type reqUnwatch struct {
	key string
	ch  chan<- Change
}

type reqSync struct {
	done chan bool
}

type reqAlive struct {
	key    string
	result chan bool
}

func (w *Watcher) sendReq(req interface{}) {
	select {
	case w.request <- req:
	case <-w.tomb.Dying():
	}
}

// Watch starts watching the liveness of key. An event will
// be sent onto ch to report the initial status for the key, and
// from then on a new event will be sent whenever a change is
// detected. Change values sent to the channel must be consumed,
// or the whole watcher will block.
func (w *Watcher) Watch(key string, ch chan<- Change) {
	w.sendReq(reqWatch{key, ch})
}

// Unwatch stops watching the liveness of key via ch.
func (w *Watcher) Unwatch(key string, ch chan<- Change) {
	w.sendReq(reqUnwatch{key, ch})
}

// StartSync forces the watcher to load new events from the database.
func (w *Watcher) StartSync() {
	w.sendReq(reqSync{nil})
}

// Sync forces the watcher to load new events from the database and blocks
// until all events have been dispatched.
func (w *Watcher) Sync() {
	done := make(chan bool)
	w.sendReq(reqSync{done})
	select {
	case <-done:
	case <-w.tomb.Dying():
	}
}

// Alive returns whether the key is currently considered alive by w,
// or an error in case the watcher is dying.
func (w *Watcher) Alive(key string) (bool, error) {
	result := make(chan bool, 1)
	w.sendReq(reqAlive{key, result})
	var alive bool
	select {
	case alive = <-result:
	case <-w.tomb.Dying():
		return false, fmt.Errorf("cannot check liveness: watcher is dying")
	}
	return alive, nil
}

// period is the length of each time slot in seconds.
// It's not a time.Duration because the code is more convenient like
// this and also because sub-second timings don't work as the slot
// identifier is an int64 in seconds.
var period int64 = 30
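// The following sketch is illustrative only and not part of the original
// file: it shows how a caller might consume liveness changes from a
// Watcher. The key name "unit-wordpress-0" is purely hypothetical. As the
// Watch documentation above notes, Change values must be consumed
// promptly or the whole watcher blocks while delivering them.
func exampleWatchKey(w *Watcher) {
	ch := make(chan Change)
	w.Watch("unit-wordpress-0", ch)
	defer w.Unwatch("unit-wordpress-0", ch)
	for {
		select {
		case change := <-ch:
			// The first event reports the initial status; later
			// events report transitions.
			fmt.Printf("%q alive: %v\n", change.Key, change.Alive)
		case <-w.Dead():
			return
		}
	}
}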
// loop implements the main watcher loop.
func (w *Watcher) loop() error {
	var err error
	if w.delta, err = clockDelta(w.base); err != nil {
		return err
	}
	w.next = time.After(0)
	for {
		select {
		case <-w.tomb.Dying():
			return tomb.ErrDying
		case <-w.next:
			w.next = time.After(time.Duration(period) * time.Second)
			syncDone := w.syncDone
			w.syncDone = nil
			if err := w.sync(); err != nil {
				return err
			}
			w.flush()
			for _, done := range syncDone {
				close(done)
			}
		case req := <-w.request:
			w.handle(req)
			w.flush()
		}
	}
}

// flush sends all pending events to their respective channels.
func (w *Watcher) flush() {
	// w.pending may get new requests as we handle other requests.
	for i := 0; i < len(w.pending); i++ {
		e := &w.pending[i]
		for e.ch != nil {
			select {
			case <-w.tomb.Dying():
				return
			case req := <-w.request:
				w.handle(req)
				continue
			case e.ch <- Change{e.key, e.alive}:
			}
			break
		}
	}
	w.pending = w.pending[:0]
}

// handle deals with requests delivered by the public API
// onto the background watcher goroutine.
func (w *Watcher) handle(req interface{}) {
	debugf("state/presence: got request: %#v", req)
	switch r := req.(type) {
	case reqSync:
		w.next = time.After(0)
		if r.done != nil {
			w.syncDone = append(w.syncDone, r.done)
		}
	case reqWatch:
		for _, ch := range w.watches[r.key] {
			if ch == r.ch {
				panic("adding channel twice for same key")
			}
		}
		w.watches[r.key] = append(w.watches[r.key], r.ch)
		_, alive := w.beingSeq[r.key]
		w.pending = append(w.pending, event{r.ch, r.key, alive})
	case reqUnwatch:
		watches := w.watches[r.key]
		for i, ch := range watches {
			if ch == r.ch {
				watches[i] = watches[len(watches)-1]
				w.watches[r.key] = watches[:len(watches)-1]
				break
			}
		}
		for i := range w.pending {
			e := &w.pending[i]
			if e.key == r.key && e.ch == r.ch {
				e.ch = nil
			}
		}
	case reqAlive:
		_, alive := w.beingSeq[r.key]
		r.result <- alive
	default:
		panic(fmt.Errorf("unknown request: %T", req))
	}
}

type beingInfo struct {
	Seq int64  "_id,omitempty"
	Key string "key,omitempty"
}

type pingInfo struct {
	Slot  int64            "_id"
	Alive map[string]int64 ",omitempty"
	Dead  map[string]int64 ",omitempty"
}

func (w *Watcher) findAllBeings() (map[int64]beingInfo, error) {
	beings := make([]beingInfo, 0)
	err := w.beings.Find(bson.D{{}}).All(&beings)
	if err != nil {
		return nil, err
	}
	beingInfos := make(map[int64]beingInfo, len(beings))
	for _, being := range beings {
		beingInfos[being.Seq] = being
	}
	return beingInfos, nil
}
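// decodeAliveSeqs is an illustrative sketch, not part of the original
// file: it shows the decoding that sync, below, applies to each field of
// a ping document. The field name is hex(seq/63) and each set bit i in
// the value corresponds to pinger sequence base*63 + i. For example, the
// entry {"2": 16} decodes to the single sequence 2*63 + 4 = 130.
func decodeAliveSeqs(fieldKey string, value int64) []int64 {
	base, err := strconv.ParseInt(fieldKey, 16, 64)
	if err != nil {
		panic(fmt.Errorf("cannot parse field key: %q", fieldKey))
	}
	base *= 63
	var seqs []int64
	for i := int64(0); i < 63 && value > 0; i++ {
		if value&1 == 1 {
			seqs = append(seqs, base+i)
		}
		value >>= 1
	}
	return seqs
}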
// sync updates the watcher knowledge from the database, and
// queues events to observing channels. It fetches the last two time
// slots and compares the union of both to the in-memory state.
func (w *Watcher) sync() error {
	var allBeings map[int64]beingInfo
	if len(w.beingKey) == 0 {
		// The very first time we sync, we grab all ever-known beings,
		// so we don't have to look them up one-by-one.
		var err error
		if allBeings, err = w.findAllBeings(); err != nil {
			return err
		}
	}
	slot := timeSlot(time.Now(), w.delta)
	var ping []pingInfo
	err := w.pings.Find(bson.D{{"$or", []pingInfo{{Slot: slot}, {Slot: slot - period}}}}).All(&ping)
	if err != nil && err != mgo.ErrNotFound {
		return err
	}

	// Learn about all enforced deaths.
	dead := make(map[int64]bool)
	for i := range ping {
		for key, value := range ping[i].Dead {
			k, err := strconv.ParseInt(key, 16, 64)
			if err != nil {
				panic(fmt.Errorf("presence cannot parse dead key: %q", key))
			}
			k *= 63
			for i := int64(0); i < 63 && value > 0; i++ {
				on := value&1 == 1
				value >>= 1
				if !on {
					continue
				}
				seq := k + i
				dead[seq] = true
				debugf("state/presence: found seq=%d dead", seq)
			}
		}
	}

	// Learn about all the pingers that reported and queue
	// events for those that weren't known to be alive and
	// are not reportedly dead either.
	alive := make(map[int64]bool)
	being := beingInfo{}
	for i := range ping {
		for key, value := range ping[i].Alive {
			k, err := strconv.ParseInt(key, 16, 64)
			if err != nil {
				panic(fmt.Errorf("presence cannot parse alive key: %q", key))
			}
			k *= 63
			for i := int64(0); i < 63 && value > 0; i++ {
				on := value&1 == 1
				value >>= 1
				if !on {
					continue
				}
				seq := k + i
				alive[seq] = true
				if _, ok := w.beingKey[seq]; ok {
					continue
				}
				// Check if the being exists in the 'all' map,
				// otherwise do a single lookup in mongo.
				var ok bool
				if being, ok = allBeings[seq]; !ok {
					err := w.beings.Find(bson.D{{"_id", seq}}).One(&being)
					if err == mgo.ErrNotFound {
						debugf("state/presence: found seq=%d unowned", seq)
						continue
					} else if err != nil {
						return err
					}
				}
				cur := w.beingSeq[being.Key]
				if cur < seq {
					delete(w.beingKey, cur)
				} else {
					// Current sequence is more recent.
					continue
				}
				w.beingKey[seq] = being.Key
				w.beingSeq[being.Key] = seq
				if cur > 0 || dead[seq] {
					continue
				}
				debugf("state/presence: found seq=%d alive with key %q", seq, being.Key)
				for _, ch := range w.watches[being.Key] {
					w.pending = append(w.pending, event{ch, being.Key, true})
				}
			}
		}
	}

	// Pingers that were known to be alive and haven't reported
	// in the last two slots are now considered dead. Dispatch
	// the respective events and forget their sequences.
	for seq, key := range w.beingKey {
		if dead[seq] || !alive[seq] {
			delete(w.beingKey, seq)
			delete(w.beingSeq, key)
			for _, ch := range w.watches[key] {
				w.pending = append(w.pending, event{ch, key, false})
			}
		}
	}
	return nil
}

// Pinger periodically reports that a specific key is alive, so that
// watchers interested in that fact can react appropriately.
type Pinger struct {
	mu       sync.Mutex
	tomb     tomb.Tomb
	base     *mgo.Collection
	pings    *mgo.Collection
	started  bool
	beingKey string
	beingSeq int64
	fieldKey string // hex(beingSeq / 63)
	fieldBit uint64 // 1 << (beingSeq % 63)
	lastSlot int64
	delta    time.Duration
}

// NewPinger returns a new Pinger to report that key is alive.
// It starts reporting after Start is called.
func NewPinger(base *mgo.Collection, key string) *Pinger {
	return &Pinger{base: base, pings: pingsC(base), beingKey: key}
}

// Start starts periodically reporting that p's key is alive.
func (p *Pinger) Start() error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.started {
		return fmt.Errorf("pinger already started")
	}
	p.tomb = tomb.Tomb{}
	if err := p.prepare(); err != nil {
		return err
	}
	debugf("state/presence: starting pinger for %q with seq=%d", p.beingKey, p.beingSeq)
	if err := p.ping(); err != nil {
		return err
	}
	p.started = true
	go func() {
		p.tomb.Kill(p.loop())
		p.tomb.Done()
	}()
	return nil
}

// Stop stops p's periodical ping.
// Watchers will not notice p has stopped pinging until the
// previous ping times out.
func (p *Pinger) Stop() error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.started {
		debugf("state/presence: stopping pinger for %q with seq=%d", p.beingKey, p.beingSeq)
	}
	p.tomb.Kill(nil)
	err := p.tomb.Wait()
	// TODO ping one more time to guarantee a late timeout.
	p.started = false
	return err
}

// Kill stops p's periodical ping and immediately reports that it is dead.
func (p *Pinger) Kill() error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.started {
		debugf("state/presence: killing pinger for %q (was started)", p.beingKey)
		return p.killStarted()
	}
	debugf("state/presence: killing pinger for %q (was stopped)", p.beingKey)
	return p.killStopped()
}

// killStarted kills the pinger while it is running, by first
// stopping it and then recording in the last pinged slot that
// the pinger was killed.
func (p *Pinger) killStarted() error {
	p.tomb.Kill(nil)
	killErr := p.tomb.Wait()
	p.started = false

	slot := p.lastSlot
	udoc := bson.D{{"$inc", bson.D{{"dead." + p.fieldKey, p.fieldBit}}}}
	if _, err := p.pings.UpsertId(slot, udoc); err != nil {
		return err
	}
	return killErr
}

// killStopped kills the pinger while it is not running, by
// first allocating a new sequence, and then atomically recording
// the new sequence both as alive and dead at once.
func (p *Pinger) killStopped() error {
	if err := p.prepare(); err != nil {
		return err
	}
	slot := timeSlot(time.Now(), p.delta)
	udoc := bson.D{{"$inc", bson.D{
		{"dead." + p.fieldKey, p.fieldBit},
		{"alive." + p.fieldKey, p.fieldBit},
	}}}
	_, err := p.pings.UpsertId(slot, udoc)
	return err
}

// loop is the main pinger loop that runs while it is
// in started state.
func (p *Pinger) loop() error {
	for {
		select {
		case <-p.tomb.Dying():
			return tomb.ErrDying
		case <-time.After(time.Duration(float64(period+1)*0.75) * time.Second):
			if err := p.ping(); err != nil {
				return err
			}
		}
	}
}
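// examplePingerLifecycle is an illustrative sketch, not part of the
// original file: it shows the Pinger lifecycle described above. The base
// collection and key are supplied by the caller and are purely
// hypothetical here. Stop merely stops pinging, so watchers only notice
// once the last pinged slots age out, while Kill additionally records the
// sequence as dead so watchers react on their next sync.
func examplePingerLifecycle(base *mgo.Collection, key string) error {
	p := NewPinger(base, key)
	if err := p.Start(); err != nil {
		return err
	}
	// The key is now reported alive in every time slot.
	// Later, to report the key as dead immediately:
	if err := p.Kill(); err != nil { // or p.Stop() for a lazy timeout
		return err
	}
	return nil
}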
// prepare allocates a new unique sequence for the
// pinger key and prepares the pinger to use it.
func (p *Pinger) prepare() error {
	change := mgo.Change{
		Update:    bson.D{{"$inc", bson.D{{"seq", int64(1)}}}},
		Upsert:    true,
		ReturnNew: true,
	}
	seqs := seqsC(p.base)
	var seq struct{ Seq int64 }
	if _, err := seqs.FindId("beings").Apply(change, &seq); err != nil {
		return err
	}
	p.beingSeq = seq.Seq
	p.fieldKey = fmt.Sprintf("%x", p.beingSeq/63)
	p.fieldBit = 1 << uint64(p.beingSeq%63)
	p.lastSlot = 0
	beings := beingsC(p.base)
	return beings.Insert(beingInfo{p.beingSeq, p.beingKey})
}

// ping updates the current time slot with the
// sequence in use by the pinger.
func (p *Pinger) ping() error {
	debugf("state/presence: pinging %q with seq=%d", p.beingKey, p.beingSeq)
	if p.delta == 0 {
		delta, err := clockDelta(p.base)
		if err != nil {
			return err
		}
		p.delta = delta
	}
	slot := timeSlot(time.Now(), p.delta)
	if slot == p.lastSlot {
		// Never, ever, ping the same slot twice.
		// The increment below would corrupt the slot.
		return nil
	}
	p.lastSlot = slot
	if _, err := p.pings.UpsertId(slot, bson.D{{"$inc", bson.D{{"alive." + p.fieldKey, p.fieldBit}}}}); err != nil {
		return err
	}
	return nil
}

// clockDelta returns the approximate skew between
// the local clock and the database clock.
func clockDelta(c *mgo.Collection) (time.Duration, error) {
	var server struct {
		time.Time "retval"
	}
	var isMaster struct {
		LocalTime time.Time "localTime"
	}
	var after time.Time
	var before time.Time
	var serverDelay time.Duration
	supportsMasterLocalTime := true
	for i := 0; i < 10; i++ {
		if supportsMasterLocalTime {
			// Try isMaster.localTime, which is present since MongoDB 2.2
			// and does not require admin privileges.
			before = time.Now()
			err := c.Database.Run("isMaster", &isMaster)
			after = time.Now()
			if err != nil {
				return 0, err
			}
			if isMaster.LocalTime.IsZero() {
				supportsMasterLocalTime = false
				continue
			} else {
				serverDelay = isMaster.LocalTime.Sub(before)
			}
		} else {
			// If MongoDB doesn't have localTime as part of the
			// isMaster result, it means that the server is likely
			// a MongoDB older than 2.2.
			//
			// Falling back to 'eval' works fine on versions older
			// than 2.4, where it does not require admin privileges.
			//
			// NOTE: 'eval' takes a global write lock unless you
			// specify 'nolock' (which we are not doing below, for
			// no apparent reason), so it is quite likely that the
			// eval could take a relatively long time to acquire
			// the lock and thus cause a retry on the callDelay
			// check below on a busy server.
			before = time.Now()
			err := c.Database.Run(bson.D{{"$eval", "function() { return new Date(); }"}}, &server)
			after = time.Now()
			if err != nil {
				return 0, err
			}
			serverDelay = server.Sub(before)
		}
		// If the call to the server takes longer than a few seconds we
		// retry it a couple more times before giving up. It is unclear
		// why the retry would help at all here.
		//
		// If the server takes longer than the specified amount of time
		// on every single try, then we simply give up.
		callDelay := after.Sub(before)
		if callDelay > 5*time.Second {
			continue
		}
		return serverDelay, nil
	}
	return 0, fmt.Errorf("cannot synchronize clock with database server")
}

// timeSlot returns the current time slot, in seconds since the
// epoch, for the provided now time. The delta skew is applied
// to the now time to improve the synchronization with a
// centrally agreed time.
//
// The result of this method may be manipulated for test purposes
// by fakeTimeSlot and realTimeSlot.
func timeSlot(now time.Time, delta time.Duration) int64 {
	fakeMutex.Lock()
	fake := !fakeNow.IsZero()
	if fake {
		now = fakeNow
	}
	slot := now.Add(delta).Unix()
	slot -= slot % period
	if fake {
		slot += int64(fakeOffset) * period
	}
	fakeMutex.Unlock()
	return slot
}

var (
	fakeMutex  sync.Mutex // protects fakeOffset, fakeNow
	fakeNow    time.Time
	fakeOffset int
)

// fakeTimeSlot hardcodes the slot time returned by the timeSlot
// function for testing purposes. The offset parameter is the slot
// position to return: offsets +1 and -1 are +period and -period
// seconds from slot 0, respectively.
func fakeTimeSlot(offset int) {
	fakeMutex.Lock()
	if fakeNow.IsZero() {
		fakeNow = time.Now()
	}
	fakeOffset = offset
	fakeMutex.Unlock()
	log.Infof("state/presence: Faking presence to time slot %d", offset)
}

// realTimeSlot disables the hardcoding introduced by fakeTimeSlot.
func realTimeSlot() {
	fakeMutex.Lock()
	fakeNow = time.Time{}
	fakeOffset = 0
	fakeMutex.Unlock()
	log.Infof("state/presence: Not faking presence time. Real time slot in use.")
}

func seqsC(base *mgo.Collection) *mgo.Collection {
	return base.Database.C(base.Name + ".seqs")
}

func beingsC(base *mgo.Collection) *mgo.Collection {
	return base.Database.C(base.Name + ".beings")
}

func pingsC(base *mgo.Collection) *mgo.Collection {
	return base.Database.C(base.Name + ".pings")
}

func debugf(f string, a ...interface{}) {
	if Debug {
		log.Debugf(f, a...)
	}
}
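// exampleTimeSlot is an illustrative sketch, not part of the original
// file: it shows the slot arithmetic performed by timeSlot. With
// period = 30 and no clock delta, a time of 1401000037 seconds since the
// epoch falls into slot 1401000030 (the timestamp rounded down to the
// nearest multiple of period). The timestamp chosen here is arbitrary.
func exampleTimeSlot() {
	now := time.Unix(1401000037, 0)
	slot := timeSlot(now, 0)
	fmt.Printf("slot for %d is %d\n", now.Unix(), slot) // prints 1401000030
}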