github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/state/presence/presence.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

// Package presence implements an interface for observing the liveness
// of arbitrary keys (agents, processes, etc.) on top of MongoDB.
// The design works by periodically updating the database so that
// watchers can tell that an arbitrary key is alive.
package presence

import (
	"fmt"
	"math/rand"
	"strconv"
	"sync"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"gopkg.in/juju/names.v2"
	"gopkg.in/juju/worker.v1"
	"gopkg.in/mgo.v2"
	"gopkg.in/mgo.v2/bson"
	"gopkg.in/tomb.v2"
)

var logger = loggo.GetLogger("juju.state.presence")

// lookupBatchSize is how many Sequence => Being keys we'll look up at one
// time. In testing, we could do 50,000 entries in a single request without
// errors, but this keeps us well clear of blowing out a single request.
// Going from 10 to 100 increased the throughput 2x; going to 1k was ~1.1x
// more, and going to 10k another 1.1x. 1000 seems a reasonable size.
const lookupBatchSize = 1000

// Agent shouldn't really live here -- it's not used in this package,
// and is implemented by a couple of state types for the convenience of
// the apiserver -- but one of the methods returns a concrete *Pinger,
// and that ties it down here quite effectively (until we want to take
// on the task of cleaning it up and promoting it to core, which might
// well never happen).
type Agent interface {
	AgentPresence() (bool, error)
	SetAgentPresence() (*Pinger, error)
	WaitAgentPresence(time.Duration) error
}

// docIDInt64 generates a globally unique id value
// where the model UUID is prefixed to the
// given int64 localID.
func docIDInt64(modelUUID string, localID int64) string {
	return modelUUID + ":" + strconv.FormatInt(localID, 10)
}

// docIDStr generates a globally unique id value
// where the model UUID is prefixed to the
// given string localID.
func docIDStr(modelUUID string, localID string) string {
	return modelUUID + ":" + localID
}

// The implementation works by assigning a unique sequence number to each
// pinger that is alive, and the pinger is then responsible for
// periodically updating the current time slot document with its
// sequence number so that watchers can tell it is alive.
//
// There is only one time slot document per time slot, per model. The
// internal implementation of the time slot document is as follows:
//
// {
//   "_id":   <model UUID>:<time slot>,
//   "slot": <slot>,
//   "alive": { hex(<pinger seq> / 63) : (1 << (<pinger seq> % 63) | <others>) },
//   "dead":  { hex(<pinger seq> / 63) : (1 << (<pinger seq> % 63) | <others>) },
// }
//
// All pingers that have their sequence number under "alive" and not
// under "dead" are currently alive. This design enables implementing
// a ping with a single update operation, a kill with another operation,
// and obtaining liveness data with a single query that returns two
// documents (the last two time slots).
//
// A new pinger sequence is obtained every time a pinger starts by atomically
// incrementing a counter in a document in a helper collection. There is only
// one such document per model. That sequence number is then inserted
// into the beings collection to establish the mapping between pinger sequence
// and key.
// pseudoRandomFactor defines an increasing chance that we will trigger an effect.
// Inspired by: http://dota2.gamepedia.com/Random_distribution
// The idea is that 'on average' we will trigger 5% of the time. However, a
// fixed probability leaves a low but non-zero chance that we will *never*
// trigger, and a surprisingly high chance that we will trigger twice in a row.
// Pseudo-random distribution instead increases the chance to trigger every
// time it does not trigger, ultimately making a trigger mandatory, while
// keeping the desirable 5% average and adding enough slop that machines
// won't fall into sync and trigger at the same time.
// A pseudoRandomFactor of 0.00380 represents a 5% average chance to trigger.
const pseudoRandomFactor = 0.00380
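
// examplePruneChance is a purely illustrative sketch (not called anywhere in
// this package) that mirrors the accumulation rule used by checkShouldPrune
// below: each sync that does not prune raises the chance that the next one
// will.
func examplePruneChance() {
	syncs := 0
	for {
		syncs++
		chance := float64(syncs) * pseudoRandomFactor
		// With a factor of 0.00380 the chance passes 1.0 after 264 syncs,
		// so a prune is eventually guaranteed even if rand never cooperates.
		if chance >= 1.0 || rand.Float64() <= chance {
			break
		}
	}
	fmt.Printf("pruned after %d syncs\n", syncs)
}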

// A Watcher can watch any number of pinger keys for liveness changes.
type Watcher struct {
	modelUUID string
	tomb      tomb.Tomb
	base      *mgo.Collection
	pings     *mgo.Collection
	beings    *mgo.Collection

	// delta is an approximate clock skew between the local system
	// clock and the database clock.
	delta time.Duration

	// beingKey and beingSeq are the pinger seq <=> key mappings.
	// Entries in these maps are considered alive.
	beingKey map[int64]string
	beingSeq map[string]int64

	// ignoredSeqs are sequences that are active pingers, but are
	// superseded by another pinger. We know who they represent, but
	// there is a newer being that applies. This happens because some
	// agents (like the controller agents) maintain multiple active
	// connections.
	ignoredSeqs map[int64]string

	// watches has the per-key observer channels from Watch/Unwatch.
	watches map[string][]chan<- Change

	// pending contains all the events to be dispatched to the watcher
	// channels. They're queued during processing and flushed at the
	// end to simplify the algorithm.
	pending []event

	// request is used to deliver requests from the public API into
	// the goroutine loop.
	request chan interface{}

	// syncDone contains pending done channels from sync requests.
	syncDone []chan bool

	// next will dispatch when it's time to sync the database
	// knowledge. It's maintained here so that ForceRefresh
	// can manipulate it to force a sync sooner.
	next <-chan time.Time

	// syncsSinceLastPrune is a counter that tracks how long it has been
	// since we've run a prune on the Beings and Pings collections.
	syncsSinceLastPrune int

	// beingLoads tracks the number of beings that we've had to refresh
	// from the database.
	beingLoads uint64
}

func (w *Watcher) String() string {
	return fmt.Sprintf("presence.Watcher(%s)", w.modelUUID)
}

type event struct {
	ch    chan<- Change
	key   string
	alive bool
}

// Change holds a liveness change notification.
type Change struct {
	Key   string
	Alive bool
}

// NewDeadWatcher returns a new watcher that is already dead
// and always returns the given error from its Err method.
func NewDeadWatcher(err error) *Watcher {
	var w Watcher
	w.tomb.Kill(errors.Trace(err))
	return &w
}

// NewWatcher returns a new Watcher.
func NewWatcher(base *mgo.Collection, modelTag names.ModelTag) *Watcher {
	w := &Watcher{
		modelUUID:   modelTag.Id(),
		base:        base,
		pings:       pingsC(base),
		beings:      beingsC(base),
		beingKey:    make(map[int64]string),
		beingSeq:    make(map[string]int64),
		watches:     make(map[string][]chan<- Change),
		request:     make(chan interface{}),
		ignoredSeqs: make(map[int64]string),
	}
	w.tomb.Go(func() error {
		err := w.loop()
		cause := errors.Cause(err)
		// tomb expects ErrDying or ErrStillAlive as
		// exact values, so we need to log and unwrap
		// the error first.
		if err != nil && cause != tomb.ErrDying {
			logger.Infof("watcher loop failed: %v", err)
		}
		return cause
	})
	return w
}

// Kill is part of the worker.Worker interface.
func (w *Watcher) Kill() {
	w.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *Watcher) Wait() error {
	return w.tomb.Wait()
}

// Stop stops all the watcher activities.
func (w *Watcher) Stop() error {
	return worker.Stop(w)
}

// Dead returns a channel that is closed when the watcher has stopped.
func (w *Watcher) Dead() <-chan struct{} {
	return w.tomb.Dead()
}

// Err returns the error with which the watcher stopped.
// It returns nil if the watcher stopped cleanly, tomb.ErrStillAlive
// if the watcher is still running properly, or the respective error
// if the watcher is terminating or has terminated with an error.
func (w *Watcher) Err() error {
	return w.tomb.Err()
}

type reqWatch struct {
	key string
	ch  chan<- Change
}

type reqUnwatch struct {
	key string
	ch  chan<- Change
}

type reqSync struct {
	done chan bool
}

type reqAlive struct {
	key    string
	result chan bool
}

func (w *Watcher) sendReq(req interface{}) {
	select {
	case w.request <- req:
	case <-w.tomb.Dying():
	}
}

// Watch starts watching the liveness of key. An event will
// be sent onto ch to report the initial status for the key, and
// from then on a new event will be sent whenever a change is
// detected. Change values sent to the channel must be consumed,
// or the whole watcher will block.
func (w *Watcher) Watch(key string, ch chan<- Change) {
	w.sendReq(reqWatch{key, ch})
}

// Unwatch stops watching the liveness of key via ch.
func (w *Watcher) Unwatch(key string, ch chan<- Change) {
	w.sendReq(reqUnwatch{key, ch})
}
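
// exampleWatchUsage is a minimal sketch of the Watch contract documented
// above (it is not used by the package): the channel must be drained
// promptly, since an unconsumed Change blocks the whole watcher. The key
// and the fixed number of reads are purely illustrative.
func exampleWatchUsage(w *Watcher, key string) {
	ch := make(chan Change)
	w.Watch(key, ch)
	// The first event reports the initial liveness; later events report
	// changes.
	for i := 0; i < 3; i++ {
		change := <-ch
		fmt.Printf("key %q alive: %v\n", change.Key, change.Alive)
	}
	w.Unwatch(key, ch)
}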

// StartSync forces the watcher to load new events from the database.
func (w *Watcher) StartSync() {
	w.sendReq(reqSync{nil})
}

// Sync forces the watcher to load new events from the database and blocks
// until all events have been dispatched.
func (w *Watcher) Sync() {
	done := make(chan bool)
	w.sendReq(reqSync{done})
	select {
	case <-done:
	case <-w.tomb.Dying():
	}
}

// Alive returns whether the key is currently considered alive by w,
// or an error in case the watcher is dying.
func (w *Watcher) Alive(key string) (bool, error) {
	result := make(chan bool, 1)
	w.sendReq(reqAlive{key, result})
	var alive bool
	select {
	case alive = <-result:
	case <-w.tomb.Dying():
		return false, errors.Errorf("cannot check liveness: watcher is dying")
	}
	logger.Tracef("[%s] Alive(%q) -> %v", w.modelUUID[:6], key, alive)
	return alive, nil
}

// period is the length of each time slot in seconds.
// It's not a time.Duration because the code is more convenient like
// this and also because sub-second timings don't work as the slot
// identifier is an int64 in seconds.
var period int64 = 30

// loop implements the main watcher loop.
func (w *Watcher) loop() error {
	var err error
	if w.delta, err = clockDelta(w.base); err != nil {
		return errors.Trace(err)
	}
	// Always sync before handling requests.
	if err := w.sync(); err != nil {
		return errors.Trace(err)
	}
	w.next = time.After(time.Duration(period) * time.Second)
	for {
		select {
		case <-w.tomb.Dying():
			return errors.Trace(tomb.ErrDying)
		case <-w.next:
			w.next = time.After(time.Duration(period) * time.Second)
			syncDone := w.syncDone
			w.syncDone = nil
			if err := w.sync(); err != nil {
				return errors.Trace(err)
			}
			w.flush()
			for _, done := range syncDone {
				close(done)
			}
			w.syncsSinceLastPrune++
			w.checkShouldPrune()
		case req := <-w.request:
			w.handle(req)
			w.flush()
		}
	}
}

// flush sends all pending events to their respective channels.
func (w *Watcher) flush() {
	// w.pending may get new requests as we handle other requests.
	for i := 0; i < len(w.pending); i++ {
		e := &w.pending[i]
		// Allow the handling of requests while waiting for e.ch
		// to be ready to receive.
		for e.ch != nil {
			select {
			case <-w.tomb.Dying():
				return
			case req := <-w.request:
				w.handle(req)
				// handle may append to the w.pending array, and/or it may
				// unwatch something that was previously pending, thus
				// setting e.ch to nil while we are waiting to send. We need
				// to make sure we are using the current 'e' object.
				// See TestRobustness, which fails if this line doesn't exist.
				e = &w.pending[i]
				continue
			case e.ch <- Change{e.key, e.alive}:
			}
			break
		}
	}
	w.pending = w.pending[:0]
}

// checkShouldPrune decides whether we should run a prune step this time.
func (w *Watcher) checkShouldPrune() {
	chanceToPrune := float64(w.syncsSinceLastPrune) * pseudoRandomFactor
	if chanceToPrune < 1.0 && rand.Float64() > chanceToPrune {
		return
	}
	// When we decide to prune, we also drop our old cached beings.
	logger.Debugf("watcher %q decided to prune %q and %q", w.modelUUID, w.beings.Name, w.pings.Name)
	w.syncsSinceLastPrune = 0
	pruner := NewPruner(w.modelUUID, w.beings, w.pings, w.delta)
	err := pruner.Prune(w.ignoredSeqs)
	if err != nil {
		logger.Warningf("error while pruning %q for %q: %v", w.beings.Name, w.modelUUID, err)
	}
}

// handle deals with requests delivered by the public API
// onto the background watcher goroutine.
func (w *Watcher) handle(req interface{}) {
	logger.Tracef("[%s] got request: %#v", w.modelUUID[:6], req)
	switch r := req.(type) {
	case reqSync:
		w.next = time.After(0)
		if r.done != nil {
			w.syncDone = append(w.syncDone, r.done)
		}
	case reqWatch:
		for _, ch := range w.watches[r.key] {
			if ch == r.ch {
				panic("adding channel twice for same key")
			}
		}
		w.watches[r.key] = append(w.watches[r.key], r.ch)
		_, alive := w.beingSeq[r.key]
		w.pending = append(w.pending, event{r.ch, r.key, alive})
	case reqUnwatch:
		watches := w.watches[r.key]
		for i, ch := range watches {
			if ch == r.ch {
				watches[i] = watches[len(watches)-1]
				w.watches[r.key] = watches[:len(watches)-1]
				break
			}
		}
		for i := range w.pending {
			e := &w.pending[i]
			if e.key == r.key && e.ch == r.ch {
				e.ch = nil
			}
		}
	case reqAlive:
		_, alive := w.beingSeq[r.key]
		r.result <- alive
	default:
		panic(fmt.Errorf("unknown request: %T", req))
	}
}

type beingInfo struct {
	DocID string `bson:"_id"`
	Seq   int64  `bson:"seq,omitempty"`
	Key   string `bson:"key,omitempty"`
}

type pingInfo struct {
	DocID string           `bson:"_id"`
	Slot  int64            `bson:"slot,omitempty"`
	Alive map[string]int64 `bson:",omitempty"`
	Dead  map[string]int64 `bson:",omitempty"`
}
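
// examplePing is a purely illustrative pingInfo value (not used by the
// package), showing how a single pinger with sequence 5 appears in a slot
// document: field key hex(5/63) = "0", bit 1<<(5%63) = 32. The model UUID
// prefix "deadbeef" and the slot value are hypothetical.
var examplePing = pingInfo{
	DocID: "deadbeef:1508491200",
	Slot:  1508491200,
	Alive: map[string]int64{"0": 32},
}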

func (w *Watcher) lookupPings(session *mgo.Session) ([]pingInfo, error) {
	pings := w.pings.With(session)
	return lookupPings(pings, w.modelUUID, time.Now(), w.delta)
}

func lookupPings(pings *mgo.Collection, modelUUID string, ts time.Time, delta time.Duration) ([]pingInfo, error) {
	// TODO(perrito666) 2016-05-02 lp:1558657
	s := timeSlot(ts, delta)
	slot := docIDInt64(modelUUID, s)
	previousSlot := docIDInt64(modelUUID, s-period)
	var ping []pingInfo
	q := bson.D{{"$or", []pingInfo{{DocID: slot}, {DocID: previousSlot}}}}
	err := pings.Find(q).All(&ping)
	if err != nil && err != mgo.ErrNotFound {
		return nil, errors.Trace(err)
	}
	return ping, nil
}

func (w *Watcher) lookForDead(pings []pingInfo) (map[int64]bool, error) {
	// Learn about all enforced deaths.
	// TODO(ericsnow) Remove this once KillForTesting() goes away.
	dead := make(map[int64]bool)
	deadSeqs, err := deadSeqs(pings)
	if err != nil {
		return nil, errors.Trace(err)
	}
	for _, seq := range deadSeqs {
		dead[seq] = true
		logger.Tracef("[%s] found seq=%d dead", w.modelUUID[:6], seq)
	}
	return dead, nil
}

// decompressPings looks at maps like 'Alive' and turns them into an array
// of the sequences that were seen in those maps.
func decompressPings(maps []map[string]int64) ([]int64, error) {
	if len(maps) == 0 {
		return nil, nil
	}
	// First step: merge all the value structures together. Every ping
	// stores a bit field in an int64, and bitwise-or preserves everything
	// that was ever alive without having to decode each map separately.
	baseToBits := make(map[string]int64, len(maps[0]))
	for i := range maps {
		for hexbase, bits := range maps[i] {
			baseToBits[hexbase] |= bits
		}
	}
	sequences := make([]int64, 0, len(baseToBits)*30)
	for hexbase, bits := range baseToBits {
		base, err := strconv.ParseInt(hexbase, 16, 64)
		if err != nil {
			return nil, errors.Annotatef(err, "presence cannot parse alive key: %q", hexbase)
		}
		base *= 63
		for i := int64(0); i < 63 && bits > 0; i++ {
			on := (bits&1 == 1)
			bits >>= 1
			if !on {
				continue
			}
			seq := base + i
			sequences = append(sequences, seq)
		}
	}
	return sequences, nil
}
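
// exampleDecompress is an illustrative use of decompressPings (not called
// anywhere in this package): two slot maps are merged by bitwise-or, and
// the set bits are expanded back into sequence numbers.
func exampleDecompress() {
	seqs, err := decompressPings([]map[string]int64{
		{"0": 0x5}, // bits 0 and 2 -> sequences 0 and 2
		{"1": 0x2}, // bit 1 on base 1*63 -> sequence 64
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(seqs) // e.g. [0 2 64] (map iteration order may vary)
}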

func aliveSeqs(pings []pingInfo) ([]int64, error) {
	maps := make([]map[string]int64, len(pings))
	for i := range pings {
		maps[i] = pings[i].Alive
	}
	return decompressPings(maps)
}

func deadSeqs(pings []pingInfo) ([]int64, error) {
	maps := make([]map[string]int64, len(pings))
	for i := range pings {
		maps[i] = pings[i].Dead
	}
	return decompressPings(maps)
}

func (w *Watcher) handleAlive(pings []pingInfo) (map[int64]bool, []int64, error) {
	// Learn about all the pingers that reported and queue
	// events for those that weren't known to be alive and
	// are not reportedly dead either.
	alive := make(map[int64]bool)
	unknownSeqs := make([]int64, 0)
	aliveSeq, err := aliveSeqs(pings)
	if err != nil {
		return nil, nil, errors.Trace(err)
	}
	for _, seq := range aliveSeq {
		alive[seq] = true
		if _, ok := w.beingKey[seq]; ok {
			// Entries in beingKey are ones we consider alive.
			// Since we already have this sequence, we consider
			// this being alive and this the active sequence for
			// that being, so we don't need to do any more work
			// for this sequence.
			continue
		}
		if key, ok := w.ignoredSeqs[seq]; ok {
			// This is a known sequence that has been superseded.
			if _, ok := w.beingSeq[key]; ok {
				// The sequence that supersedes this one is still alive.
				continue
			} else {
				// What was a living pinger has now died, and this one
				// might be promoted to primary pinger. Treat it as
				// unknown.
				delete(w.ignoredSeqs, seq)
			}
		}
		unknownSeqs = append(unknownSeqs, seq)
	}
	return alive, unknownSeqs, nil
}

// lookupUnknownSeqs handles finding new sequences that we weren't already
// tracking. Keys that we find are now alive will have a 'found alive'
// event queued.
func (w *Watcher) lookupUnknownSeqs(unknownSeqs []int64, dead map[int64]bool, session *mgo.Session) error {
	if len(unknownSeqs) == 0 {
		// Nothing to do, with nothing unknown.
		return nil
	}
	// We do cache *all* beingInfos, but they're reasonably small.
	seqToBeing := make(map[int64]beingInfo, len(unknownSeqs))
	startTime := time.Now()
	beingsC := w.beings.With(session)
	remaining := unknownSeqs
	for len(remaining) > 0 {
		// Batch this into reasonable lengths. Testing shows that it
		// works just fine at 50,000 ids, but be a bit more conservative.
		batch := remaining
		if len(remaining) > lookupBatchSize {
			batch = remaining[:lookupBatchSize]
			remaining = remaining[lookupBatchSize:]
		} else {
			remaining = nil
		}
		docIds := make([]string, len(batch))
		for i, seq := range batch {
			docIds[i] = docIDInt64(w.modelUUID, seq)
		}
		query := beingsC.Find(bson.M{"_id": bson.M{"$in": docIds}})
		// We don't need the _id returned, as it's just a way to look up
		// the seq, and _id is quite large.
		query = query.Select(bson.M{"_id": false, "key": true, "seq": true})
		query.Batch(lookupBatchSize)
		beingIter := query.Iter()
		being := beingInfo{}
		for beingIter.Next(&being) {
			seqToBeing[being.Seq] = being
			w.beingLoads++
		}
		if err := beingIter.Close(); err != nil {
			if err != mgo.ErrNotFound {
				// ErrNotFound may just mean an old sequence, which is
				// not considered fatal; anything else is.
				return err
			}
		}
	}
	rate := ""
	elapsed := time.Since(startTime)
	if len(unknownSeqs) > 0 {
		seqPerMS := float64(len(unknownSeqs)) / (elapsed.Seconds() * 1000.0)
		rate = fmt.Sprintf(" (%.1f seq/ms)", seqPerMS)
	}
	unownedCount := 0
	for _, seq := range unknownSeqs {
		being, ok := seqToBeing[seq]
		if !ok {
			// Not found.
			unownedCount++
			logger.Tracef("[%s] found seq=%d unowned", w.modelUUID[:6], seq)
			continue
		}
		cur := w.beingSeq[being.Key]
		if cur < seq {
			delete(w.beingKey, cur)
			if cur > 0 {
				w.ignoredSeqs[cur] = being.Key
			}
		} else {
			// We already have a sequence for this key, and it is
			// newer than the one we just saw.
			w.ignoredSeqs[seq] = being.Key
			continue
		}
		// Start tracking the new sequence for this key.
		w.beingKey[seq] = being.Key
		w.beingSeq[being.Key] = seq
		if cur > 0 || dead[seq] {
			// If cur > 0, then we already think this being is alive,
			// so there is no need to queue another event. If dead[seq],
			// we still wouldn't queue an alive event, because an
			// 'is dead' event is about to be queued instead.
			continue
		}
		logger.Tracef("[%s] found seq=%d alive with key %q", w.modelUUID[:6], seq, being.Key)
		for _, ch := range w.watches[being.Key] {
			w.pending = append(w.pending, event{ch, being.Key, true})
		}
	}
	logger.Tracef("looked up %d unknown sequences for %q (%d unowned) in %v%s from %q",
		len(unknownSeqs), w.modelUUID, unownedCount, elapsed, rate, beingsC.Name)
	return nil
}

// sync updates the watcher knowledge from the database, and
// queues events to observing channels. It fetches the last two time
// slots and compares the union of both to the in-memory state.
func (w *Watcher) sync() error {
	session := w.pings.Database.Session.Copy()
	defer session.Close()
	pings, err := w.lookupPings(session)
	if err != nil {
		return err
	}
	dead, err := w.lookForDead(pings)
	if err != nil {
		return err
	}
	alive, unknownSeqs, err := w.handleAlive(pings)
	if err != nil {
		return err
	}
	err = w.lookupUnknownSeqs(unknownSeqs, dead, session)
	if err != nil {
		return err
	}

	// Pingers that were known to be alive and haven't reported
	// in the last two slots are now considered dead. Dispatch
	// the respective events and forget their sequences.
	for seq, key := range w.beingKey {
		if dead[seq] || !alive[seq] {
			logger.Tracef("[%s] removing seq=%d with key %q", w.modelUUID[:6], seq, key)
			delete(w.beingKey, seq)
			delete(w.beingSeq, key)
			for _, ch := range w.watches[key] {
				w.pending = append(w.pending, event{ch, key, false})
			}
		}
	}
	return nil
}

// Pinger periodically reports that a specific key is alive, so that
// watchers interested in that fact can react appropriately.
type Pinger struct {
	modelUUID    string
	mu           sync.Mutex
	tomb         tomb.Tomb
	base         *mgo.Collection
	pings        *mgo.Collection
	started      bool
	beingKey     string
	beingSeq     int64
	fieldKey     string // hex(beingSeq / 63)
	fieldBit     uint64 // 1 << (beingSeq % 63)
	lastSlot     int64
	delta        time.Duration
	recorderFunc func() PingRecorder
}

// PingRecorder records that a given pinger sequence is alive
// in a given time slot.
type PingRecorder interface {
	Ping(modelUUID string, slot int64, fieldKey string, fieldBit uint64) error
}
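
// directRecorder is a hypothetical PingRecorder implementation, not part of
// this package's API, sketching the direct-to-collection write a recorder
// could perform; the update shape mirrors the one used by killStopped below.
type directRecorder struct {
	pings *mgo.Collection
}

// Ping sets the slot and ors this pinger's bit into the slot's "alive" map.
func (d *directRecorder) Ping(modelUUID string, slot int64, fieldKey string, fieldBit uint64) error {
	udoc := bson.D{
		{"$set", bson.D{{"slot", slot}}},
		{"$inc", bson.D{{"alive." + fieldKey, fieldBit}}}}
	_, err := d.pings.UpsertId(docIDInt64(modelUUID, slot), udoc)
	return errors.Trace(err)
}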

// NewPinger returns a new Pinger to report that key is alive.
// It starts reporting after Start is called.
func NewPinger(base *mgo.Collection, modelTag names.ModelTag, key string, recorderFunc func() PingRecorder) *Pinger {
	return &Pinger{
		base:         base,
		pings:        pingsC(base),
		beingKey:     key,
		modelUUID:    modelTag.Id(),
		recorderFunc: recorderFunc,
	}
}

// Start starts periodically reporting that p's key is alive.
func (p *Pinger) Start() error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.started {
		return errors.Errorf("pinger already started")
	}
	p.tomb = tomb.Tomb{}
	if err := p.prepare(); err != nil {
		return errors.Trace(err)
	}
	logger.Tracef("[%s] starting pinger for %q with seq=%d", p.modelUUID[:6], p.beingKey, p.beingSeq)
	if err := p.ping(); err != nil {
		return errors.Trace(err)
	}
	p.started = true
	p.tomb.Go(func() error {
		err := p.loop()
		cause := errors.Cause(err)
		// tomb expects ErrDying or ErrStillAlive as
		// exact values, so we need to log and unwrap
		// the error first.
		if err != nil && cause != tomb.ErrDying {
			logger.Infof("pinger loop failed: %v", err)
		}
		return cause
	})
	return nil
}

// Kill is part of the worker.Worker interface.
func (p *Pinger) Kill() {
	p.tomb.Kill(nil)
}

// Wait returns when the Pinger has stopped, and returns the first error
// it encountered.
func (p *Pinger) Wait() error {
	return p.tomb.Wait()
}

// Stop stops p's periodical ping.
// Watchers will not notice p has stopped pinging until the
// previous ping times out.
func (p *Pinger) Stop() error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.started {
		logger.Tracef("[%s] stopping pinger for %q with seq=%d", p.modelUUID[:6], p.beingKey, p.beingSeq)
	}
	p.tomb.Kill(nil)
	err := p.tomb.Wait()
	// TODO ping one more time to guarantee a late timeout.
	p.started = false
	return errors.Trace(err)
}

// KillForTesting stops p's periodical ping and immediately reports that it is dead.
// TODO(ericsnow) We should be able to drop this and the two kill* methods.
func (p *Pinger) KillForTesting() error {
	p.mu.Lock()
	defer p.mu.Unlock()
	if p.started {
		logger.Tracef("killing pinger for %q (was started)", p.beingKey)
		return p.killStarted()
	}
	logger.Tracef("killing pinger for %q (was stopped)", p.beingKey)
	return p.killStopped()
}

// killStarted kills the pinger while it is running, by first
// stopping it and then recording in the last pinged slot that
// the pinger was killed.
func (p *Pinger) killStarted() error {
	p.tomb.Kill(nil)
	killErr := p.tomb.Wait()
	p.started = false

	slot := p.lastSlot
	udoc := bson.D{
		{"$set", bson.D{{"slot", slot}}},
		{"$inc", bson.D{{"dead." + p.fieldKey, p.fieldBit}}}}
	session := p.pings.Database.Session.Copy()
	defer session.Close()
	pings := p.pings.With(session)
	if _, err := pings.UpsertId(docIDInt64(p.modelUUID, slot), udoc); err != nil {
		return errors.Trace(err)
	}
	return errors.Trace(killErr)
}

// killStopped kills the pinger while it is not running, by
// first allocating a new sequence, and then atomically recording
// the new sequence both as alive and dead at once.
func (p *Pinger) killStopped() error {
	if err := p.prepare(); err != nil {
		return err
	}
	// TODO(perrito666) 2016-05-02 lp:1558657
	slot := timeSlot(time.Now(), p.delta)
	udoc := bson.D{
		{"$set", bson.D{{"slot", slot}}},
		{"$inc", bson.D{
			{"dead." + p.fieldKey, p.fieldBit},
			{"alive." + p.fieldKey, p.fieldBit},
		}}}
	session := p.pings.Database.Session.Copy()
	defer session.Close()
	pings := p.pings.With(session)
	_, err := pings.UpsertId(docIDInt64(p.modelUUID, slot), udoc)
	return errors.Trace(err)
}

// loop is the main pinger loop that runs while it is
// in the started state.
func (p *Pinger) loop() error {
	for {
		select {
		case <-p.tomb.Dying():
			return errors.Trace(tomb.ErrDying)
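		// With period = 30 this fires every 23s ((30+1)*0.75, truncated
		// by the Duration conversion), so each 30s slot sees at least
		// one ping from a healthy pinger.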
		case <-time.After(time.Duration(float64(period+1)*0.75) * time.Second):
			if err := p.ping(); err != nil {
				return errors.Trace(err)
			}
		}
	}
}

// prepare allocates a new unique sequence for the
// pinger key and prepares the pinger to use it.
func (p *Pinger) prepare() error {
	change := mgo.Change{
		Update:    bson.D{{"$inc", bson.D{{"seq", int64(1)}}}},
		Upsert:    true,
		ReturnNew: true,
	}
	session := p.base.Database.Session.Copy()
	defer session.Close()
	base := p.base.With(session)
	seqs := seqsC(base)
	var seq struct{ Seq int64 }
	seqID := docIDStr(p.modelUUID, "beings")
	if _, err := seqs.FindId(seqID).Apply(change, &seq); err != nil {
		return errors.Trace(err)
	}
	p.beingSeq = seq.Seq
	p.fieldKey = fmt.Sprintf("%x", p.beingSeq/63)
	p.fieldBit = 1 << uint64(p.beingSeq%63)
	p.lastSlot = 0
	beings := beingsC(base)
	return errors.Trace(beings.Insert(
		beingInfo{
			DocID: docIDInt64(p.modelUUID, p.beingSeq),
			Seq:   p.beingSeq,
			Key:   p.beingKey,
		},
	))
}

// ping updates the current time slot document with the
// sequence in use by the pinger.
func (p *Pinger) ping() (err error) {
	logger.Tracef("[%s] pinging %q with seq=%d", p.modelUUID[:6], p.beingKey, p.beingSeq)
	defer func() {
		// If the session is killed from underneath us, it panics when we
		// try to copy it, so deal with that here.
		if v := recover(); v != nil {
			err = fmt.Errorf("%v", v)
		}
	}()
	if p.delta == 0 {
		session := p.pings.Database.Session.Copy()
		defer session.Close()
		base := p.base.With(session)
		delta, err := clockDelta(base)
		if err != nil {
			return errors.Trace(err)
		}
		p.delta = delta
	}
	// TODO(perrito666) 2016-05-02 lp:1558657
	slot := timeSlot(time.Now(), p.delta)
	if slot == p.lastSlot {
		// Never, ever, ping the same slot twice.
		// The increment below would corrupt the slot.
		return nil
	}
	p.lastSlot = slot
	err = p.recorderFunc().Ping(p.modelUUID, slot, p.fieldKey, p.fieldBit)
	return errors.Trace(err)
}

// collapsedBeingsInfo tracks the result of aggregating all of the items in
// the beings collection by their key.
type collapsedBeingsInfo struct {
	Key  string  `bson:"_id"`
	Seqs []int64 `bson:"seqs"`
}
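
// exampleCollapseBeings is a hypothetical aggregation, not used in this file
// (the pruner lives elsewhere), that would produce documents of the
// collapsedBeingsInfo shape by grouping every sequence allocated for a key.
func exampleCollapseBeings(beings *mgo.Collection) ([]collapsedBeingsInfo, error) {
	var result []collapsedBeingsInfo
	pipe := beings.Pipe([]bson.M{
		{"$group": bson.M{
			"_id":  "$key",
			"seqs": bson.M{"$push": "$seq"},
		}},
	})
	err := pipe.All(&result)
	return result, errors.Trace(err)
}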

// clockDelta returns the approximate skew between
// the local clock and the database clock.
func clockDelta(c *mgo.Collection) (time.Duration, error) {
	var server struct {
		time.Time `bson:"retval"`
	}
	var isMaster struct {
		LocalTime time.Time `bson:"localTime"`
	}
	var after time.Time
	var before time.Time
	var serverDelay time.Duration
	supportsMasterLocalTime := true
	session := c.Database.Session.Copy()
	defer session.Close()
	db := c.Database.With(session)
	for i := 0; i < 10; i++ {
		if supportsMasterLocalTime {
			// Try isMaster.localTime, which is present since MongoDB 2.2
			// and does not require admin privileges.
			// TODO(perrito666) 2016-05-02 lp:1558657
			before = time.Now()
			err := db.Run("isMaster", &isMaster)
			// TODO(perrito666) 2016-05-02 lp:1558657
			after = time.Now()
			if err != nil {
				return 0, errors.Trace(err)
			}
			if isMaster.LocalTime.IsZero() {
				supportsMasterLocalTime = false
				continue
			} else {
				serverDelay = isMaster.LocalTime.Sub(before)
			}
		} else {
			// If MongoDB doesn't have localTime as part of the
			// isMaster result, the server is likely a MongoDB
			// older than 2.2.
			//
			// Falling back to 'eval' works fine on versions older
			// than 2.4, where it does not require admin privileges.
			//
			// NOTE: 'eval' takes a global write lock unless you
			// specify 'nolock' (which we are not doing below, for
			// no apparent reason), so it is quite likely that the
			// eval could take a relatively long time to acquire
			// the lock and thus cause a retry on the callDelay
			// check below on a busy server.
			// TODO(perrito666) 2016-05-02 lp:1558657
			before = time.Now()
			err := db.Run(bson.D{{"$eval", "function() { return new Date(); }"}}, &server)
			// TODO(perrito666) 2016-05-02 lp:1558657
			after = time.Now()
			if err != nil {
				return 0, errors.Trace(err)
			}
			serverDelay = server.Sub(before)
		}
		// If the call to the server takes longer than a few seconds we
		// retry it (up to the 10 tries of this loop) before giving up.
		// It is unclear why the retry would help at all here.
		//
		// If the server takes longer than the specified amount of time
		// on every single try, then we simply give up.
		callDelay := after.Sub(before)
		if callDelay > 5*time.Second {
			continue
		}
		return serverDelay, nil
	}
	return 0, errors.Errorf("cannot synchronize clock with database server")
}

// timeSlot returns the current time slot, in seconds since the
// epoch, for the provided now time. The delta skew is applied
// to the now time to improve the synchronization with a
// centrally agreed time.
//
// The result of this method may be manipulated for test purposes
// by fakeTimeSlot and realTimeSlot.
func timeSlot(now time.Time, delta time.Duration) int64 {
	fakeMutex.Lock()
	fake := !fakeNow.IsZero()
	if fake {
		now = fakeNow
	}
	slot := now.Add(delta).Unix()
	slot -= slot % period
	if fake {
		slot += int64(fakeOffset) * period
	}
	fakeMutex.Unlock()
	return slot
}
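
// exampleTimeSlots is a small illustration (not called anywhere in this
// package) of the slot arithmetic above: with period = 30, any instant
// within the same 30-second window maps to the same slot. The chosen
// timestamp is arbitrary.
func exampleTimeSlots() {
	t := time.Date(2019, 2, 15, 12, 0, 1, 0, time.UTC)
	a := timeSlot(t, 0)
	b := timeSlot(t.Add(28*time.Second), 0) // still 12:00:29, same slot
	fmt.Println(a == b)                     // true
}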

var (
	fakeMutex  sync.Mutex // protects fakeOffset, fakeNow
	fakeNow    time.Time
	fakeOffset int
)

// fakeTimeSlot hardcodes the slot time returned by the timeSlot
// function for testing purposes. The offset parameter is the slot
// position to return: offsets +1 and -1 are +period and -period
// seconds from slot 0, respectively.
func fakeTimeSlot(offset int) {
	fakeMutex.Lock()
	if fakeNow.IsZero() {
		fakeNow = time.Now()
	}
	fakeOffset = offset
	fakeMutex.Unlock()
	logger.Infof("faking presence to time slot %d", offset)
}

// realTimeSlot disables the hardcoding introduced by fakeTimeSlot.
func realTimeSlot() {
	fakeMutex.Lock()
	fakeNow = time.Time{}
	fakeOffset = 0
	fakeMutex.Unlock()
	logger.Infof("not faking presence time. Real time slot in use.")
}

func seqsC(base *mgo.Collection) *mgo.Collection {
	return base.Database.C(base.Name + ".seqs")
}

func beingsC(base *mgo.Collection) *mgo.Collection {
	return base.Database.C(base.Name + ".beings")
}

func pingsC(base *mgo.Collection) *mgo.Collection {
	return base.Database.C(base.Name + ".pings")
}
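
// For example, if the base collection were named "presence", the helpers
// above would derive "presence.seqs", "presence.beings", and
// "presence.pings" in the same database. (The base name "presence" here is
// just an illustration.)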

func removeModelFromCollection(coll *mgo.Collection, modelUUID string) error {
	modelIDMatch := bson.M{"_id": bson.RegEx{"^" + modelUUID + ":", ""}}
	changed, err := coll.RemoveAll(modelIDMatch)
	if err != nil {
		if err == mgo.ErrNotFound {
			// Not an error worth reporting.
			return nil
		}
		return errors.Trace(err)
	}
	logger.Debugf("removed %d entries from %q for model %q",
		changed.Removed, coll.FullName, modelUUID)
	return nil
}

// RemovePresenceForModel removes all of the records of entities for a given
// model across all of the collections.
func RemovePresenceForModel(base *mgo.Collection, modelTag names.ModelTag) error {
	errs := make([]error, 0)
	for _, f := range []func(*mgo.Collection) *mgo.Collection{pingsC, beingsC, seqsC} {
		err := removeModelFromCollection(f(base), modelTag.Id())
		if err != nil {
			errs = append(errs, err)
		}
	}
	if len(errs) != 0 {
		return errors.Errorf("errors removing presence for model %q: %v", modelTag.Id(), errs)
	}
	return nil
}