github.com/makyo/juju@v0.0.0-20160425123129-2608902037e9/state/presence/presence.go

     1  // Copyright 2012, 2013 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  // Package presence implements an interface for observing the liveness
     5  // of arbitrary keys (agents, processes, etc.) on top of MongoDB.
     6  // The design works by periodically updating the database so that
     7  // watchers can tell whether an arbitrary key is alive.
     8  package presence
     9  
    10  import (
    11  	"fmt"
    12  	"strconv"
    13  	"sync"
    14  	"time"
    15  
    16  	"github.com/juju/errors"
    17  	"github.com/juju/loggo"
    18  	"github.com/juju/names"
    19  	"gopkg.in/mgo.v2"
    20  	"gopkg.in/mgo.v2/bson"
    21  	"launchpad.net/tomb"
    22  )
    23  
    24  var logger = loggo.GetLogger("juju.state.presence")
    25  
    26  // Agent shouldn't really live here -- it's not used in this package,
    27  // and is implemented by a couple of state types for the convenience of
    28  // the apiserver -- but one of the methods returns a concrete *Pinger,
    29  // and that ties it down here quite effectively (until we want to take
    30  // on the task of cleaning it up and promoting it to core, which might
    31  // well never happen).
    32  type Agent interface {
    33  	AgentPresence() (bool, error)
    34  	SetAgentPresence() (*Pinger, error)
    35  	WaitAgentPresence(time.Duration) error
    36  }
    37  
    38  // docIDInt64 generates a globally unique id value
    39  // where the model uuid is prefixed to the
    40  // given int64 localID.
    41  func docIDInt64(modelUUID string, localID int64) string {
    42  	return modelUUID + ":" + strconv.FormatInt(localID, 10)
    43  }
    44  
    45  // docIDStr generates a globally unique id value
    46  // where the model uuid is prefixed to the
    47  // given string localID.
    48  func docIDStr(modelUUID string, localID string) string {
    49  	return modelUUID + ":" + localID
    50  }
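
        // For illustration only, the resulting document IDs look like this
        // (the model UUID below is hypothetical):
        //
        //	docIDInt64("9f484882-2f18-4fd2-967d-db9663db7bea", 1468800)
        //	// -> "9f484882-2f18-4fd2-967d-db9663db7bea:1468800"
        //	docIDStr("9f484882-2f18-4fd2-967d-db9663db7bea", "beings")
        //	// -> "9f484882-2f18-4fd2-967d-db9663db7bea:beings"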
    51  
    52  // The implementation works by assigning a unique sequence number to each
    53  // pinger that is alive, and the pinger is then responsible for
    54  // periodically updating the current time slot document with its
    55  // sequence number so that watchers can tell it is alive.
    56  //
    57  // There is only one time slot document per time slot, per model. The
    58  // internal implementation of the time slot document is as follows:
    59  //
    60  // {
    61  //   "_id":   <model UUID>:<time slot>,
    62  //   "slot": <slot>,
    63  //   "model-uuid": <model UUID>,
    64  //   "alive": { hex(<pinger seq> / 63) : (1 << (<pinger seq> % 63) | <others>) },
    65  //   "dead":  { hex(<pinger seq> / 63) : (1 << (<pinger seq> % 63) | <others>) },
    66  // }
    67  //
    68  // All pingers that have their sequence number under "alive" and not
    69  // under "dead" are currently alive. This design enables implementing
    70  // a ping with a single update operation, a kill with another operation,
    71  // and obtaining liveness data with a single query that returns two
    72  // documents (the last two time slots).
    73  //
    74  // A new pinger sequence is obtained every time a pinger starts by atomically
    75  // incrementing a counter in a document in a helper collection. There is only
    76  // one such document per model. That sequence number is then inserted
    77  // into the beings collection to establish the mapping between pinger sequence
    78  // and key.
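        //
        // As an illustrative sketch (not part of the package; seqToField is a
        // hypothetical helper), a pinger sequence maps onto the "alive"/"dead"
        // maps of the slot document as follows, mirroring Pinger.prepare and
        // Pinger.ping below:
        //
        //	func seqToField(seq int64) (fieldKey string, fieldBit uint64) {
        //		fieldKey = fmt.Sprintf("%x", seq/63) // sub-key inside "alive"/"dead"
        //		fieldBit = 1 << uint64(seq%63)       // value $inc-ed under that sub-key
        //		return fieldKey, fieldBit
        //	}
        //
        // For example, seq=130 yields fieldKey "2" and fieldBit 1<<4, so a ping
        // adds 1<<4 to the value stored under "alive.2" in the current slot
        // document, effectively setting bit 4 (a pinger touches each slot at
        // most once, so the $inc cannot carry into other bits).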
    79  
    80  // BUG(gn): The pings and beings collections currently grow without bound.
    81  
    82  // A Watcher can watch any number of pinger keys for liveness changes.
    83  type Watcher struct {
    84  	modelUUID string
    85  	tomb      tomb.Tomb
    86  	base      *mgo.Collection
    87  	pings     *mgo.Collection
    88  	beings    *mgo.Collection
    89  
    90  	// delta is an approximate clock skew between the local system
    91  	// clock and the database clock.
    92  	delta time.Duration
    93  
    94  	// beingKey and beingSeq are the pinger seq <=> key mappings.
    95  	// Entries in these maps are considered alive.
    96  	beingKey map[int64]string
    97  	beingSeq map[string]int64
    98  
    99  	// watches holds the per-key observer channels from Watch/Unwatch.
   100  	watches map[string][]chan<- Change
   101  
   102  	// pending contains all the events to be dispatched to the watcher
   103  	// channels. They're queued during processing and flushed at the
   104  	// end to simplify the algorithm.
   105  	pending []event
   106  
   107  	// request is used to deliver requests from the public API into
   108  	// the goroutine loop.
   109  	request chan interface{}
   110  
   111  	// syncDone contains pending done channels from sync requests.
   112  	syncDone []chan bool
   113  
   114  	// next will dispatch when it's time to sync the database
   115  	// knowledge. It's maintained here so that sync requests
   116  	// (StartSync and Sync) can manipulate it to force a sync sooner.
   117  	next <-chan time.Time
   118  }
   119  
   120  type event struct {
   121  	ch    chan<- Change
   122  	key   string
   123  	alive bool
   124  }
   125  
   126  // Change holds a liveness change notification.
   127  type Change struct {
   128  	Key   string
   129  	Alive bool
   130  }
   131  
   132  // NewWatcher returns a new Watcher.
   133  func NewWatcher(base *mgo.Collection, modelTag names.ModelTag) *Watcher {
   134  	w := &Watcher{
   135  		modelUUID: modelTag.Id(),
   136  		base:      base,
   137  		pings:     pingsC(base),
   138  		beings:    beingsC(base),
   139  		beingKey:  make(map[int64]string),
   140  		beingSeq:  make(map[string]int64),
   141  		watches:   make(map[string][]chan<- Change),
   142  		request:   make(chan interface{}),
   143  	}
   144  	go func() {
   145  		err := w.loop()
   146  		cause := errors.Cause(err)
   147  		// tomb expects ErrDying or ErrStillAlive as
   148  		// exact values, so we need to log and unwrap
   149  		// the error first.
   150  		if err != nil && cause != tomb.ErrDying {
   151  			logger.Infof("watcher loop failed: %v", err)
   152  		}
   153  		w.tomb.Kill(cause)
   154  		w.tomb.Done()
   155  	}()
   156  	return w
   157  }
   158  
   159  // Stop stops all the watcher activities.
   160  func (w *Watcher) Stop() error {
   161  	w.tomb.Kill(nil)
   162  	return errors.Trace(w.tomb.Wait())
   163  }
   164  
   165  // Dead returns a channel that is closed when the watcher has stopped.
   166  func (w *Watcher) Dead() <-chan struct{} {
   167  	return w.tomb.Dead()
   168  }
   169  
   170  // Err returns the error with which the watcher stopped.
   171  // It returns nil if the watcher stopped cleanly, tomb.ErrStillAlive
   172  // if the watcher is still running properly, or the respective error
   173  // if the watcher is terminating or has terminated with an error.
   174  func (w *Watcher) Err() error {
   175  	return w.tomb.Err()
   176  }
   177  
   178  type reqWatch struct {
   179  	key string
   180  	ch  chan<- Change
   181  }
   182  
   183  type reqUnwatch struct {
   184  	key string
   185  	ch  chan<- Change
   186  }
   187  
   188  type reqSync struct {
   189  	done chan bool
   190  }
   191  
   192  type reqAlive struct {
   193  	key    string
   194  	result chan bool
   195  }
   196  
   197  func (w *Watcher) sendReq(req interface{}) {
   198  	select {
   199  	case w.request <- req:
   200  	case <-w.tomb.Dying():
   201  	}
   202  }
   203  
   204  // Watch starts watching the liveness of key. An event will
   205  // be sent onto ch to report the initial status for the key, and
   206  // from then on a new event will be sent whenever a change is
   207  // detected. Change values sent to the channel must be consumed,
   208  // or the whole watcher will block.
   209  func (w *Watcher) Watch(key string, ch chan<- Change) {
   210  	w.sendReq(reqWatch{key, ch})
   211  }
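
        // A minimal usage sketch (hypothetical caller code; presenceColl and
        // modelTag are assumed to exist and are not defined by this package):
        //
        //	w := NewWatcher(presenceColl, modelTag)
        //	defer w.Stop()
        //	ch := make(chan Change)
        //	w.Watch("a-key", ch)
        //	go func() {
        //		for change := range ch {
        //			// Consume promptly: unread changes block the watcher.
        //			fmt.Printf("%s alive=%v\n", change.Key, change.Alive)
        //		}
        //	}()
        //	w.StartSync() // request an early refresh instead of waiting a full period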
   212  
   213  // Unwatch stops watching the liveness of key via ch.
   214  func (w *Watcher) Unwatch(key string, ch chan<- Change) {
   215  	w.sendReq(reqUnwatch{key, ch})
   216  }
   217  
   218  // StartSync forces the watcher to load new events from the database.
   219  func (w *Watcher) StartSync() {
   220  	w.sendReq(reqSync{nil})
   221  }
   222  
   223  // Sync forces the watcher to load new events from the database and blocks
   224  // until all events have been dispatched.
   225  func (w *Watcher) Sync() {
   226  	done := make(chan bool)
   227  	w.sendReq(reqSync{done})
   228  	select {
   229  	case <-done:
   230  	case <-w.tomb.Dying():
   231  	}
   232  }
   233  
   234  // Alive returns whether the key is currently considered alive by w,
   235  // or an error in case the watcher is dying.
   236  func (w *Watcher) Alive(key string) (bool, error) {
   237  	result := make(chan bool, 1)
   238  	w.sendReq(reqAlive{key, result})
   239  	var alive bool
   240  	select {
   241  	case alive = <-result:
   242  	case <-w.tomb.Dying():
   243  		return false, errors.Errorf("cannot check liveness: watcher is dying")
   244  	}
   245  	return alive, nil
   246  }
   247  
   248  // period is the length of each time slot in seconds.
   249  // It's not a time.Duration because the code is more convenient this
   250  // way, and because sub-second timings wouldn't work anyway: the slot
   251  // identifier is an int64 in seconds.
   252  var period int64 = 30
   253  
   254  // loop implements the main watcher loop.
   255  func (w *Watcher) loop() error {
   256  	var err error
   257  	if w.delta, err = clockDelta(w.base); err != nil {
   258  		return errors.Trace(err)
   259  	}
   260  	w.next = time.After(0)
   261  	for {
   262  		select {
   263  		case <-w.tomb.Dying():
   264  			return errors.Trace(tomb.ErrDying)
   265  		case <-w.next:
   266  			w.next = time.After(time.Duration(period) * time.Second)
   267  			syncDone := w.syncDone
   268  			w.syncDone = nil
   269  			if err := w.sync(); err != nil {
   270  				return errors.Trace(err)
   271  			}
   272  			w.flush()
   273  			for _, done := range syncDone {
   274  				close(done)
   275  			}
   276  		case req := <-w.request:
   277  			w.handle(req)
   278  			w.flush()
   279  		}
   280  	}
   281  }
   282  
   283  // flush sends all pending events to their respective channels.
   284  func (w *Watcher) flush() {
   285  	// w.pending may get new requests as we handle other requests.
   286  	for i := 0; i < len(w.pending); i++ {
   287  		e := &w.pending[i]
   288  		for e.ch != nil {
   289  			select {
   290  			case <-w.tomb.Dying():
   291  				return
   292  			case req := <-w.request:
   293  				w.handle(req)
   294  				continue
   295  			case e.ch <- Change{e.key, e.alive}:
   296  			}
   297  			break
   298  		}
   299  	}
   300  	w.pending = w.pending[:0]
   301  }
   302  
   303  // handle deals with requests delivered by the public API
   304  // to the background watcher goroutine.
   305  func (w *Watcher) handle(req interface{}) {
   306  	logger.Tracef("got request: %#v", req)
   307  	switch r := req.(type) {
   308  	case reqSync:
   309  		w.next = time.After(0)
   310  		if r.done != nil {
   311  			w.syncDone = append(w.syncDone, r.done)
   312  		}
   313  	case reqWatch:
   314  		for _, ch := range w.watches[r.key] {
   315  			if ch == r.ch {
   316  				panic("adding channel twice for same key")
   317  			}
   318  		}
   319  		w.watches[r.key] = append(w.watches[r.key], r.ch)
   320  		_, alive := w.beingSeq[r.key]
   321  		w.pending = append(w.pending, event{r.ch, r.key, alive})
   322  	case reqUnwatch:
   323  		watches := w.watches[r.key]
   324  		for i, ch := range watches {
   325  			if ch == r.ch {
   326  				watches[i] = watches[len(watches)-1]
   327  				w.watches[r.key] = watches[:len(watches)-1]
   328  				break
   329  			}
   330  		}
   331  		for i := range w.pending {
   332  			e := &w.pending[i]
   333  			if e.key == r.key && e.ch == r.ch {
   334  				e.ch = nil
   335  			}
   336  		}
   337  	case reqAlive:
   338  		_, alive := w.beingSeq[r.key]
   339  		r.result <- alive
   340  	default:
   341  		panic(fmt.Errorf("unknown request: %T", req))
   342  	}
   343  }
   344  
   345  type beingInfo struct {
   346  	DocID     string `bson:"_id,omitempty"`
   347  	Seq       int64  `bson:"seq,omitempty"`
   348  	ModelUUID string `bson:"model-uuid,omitempty"`
   349  	Key       string `bson:"key,omitempty"`
   350  }
   351  
   352  type pingInfo struct {
   353  	DocID string           `bson:"_id,omitempty"`
   354  	Slot  int64            `bson:"slot,omitempty"`
   355  	Alive map[string]int64 `bson:",omitempty"`
   356  	Dead  map[string]int64 `bson:",omitempty"`
   357  }
   358  
   359  func (w *Watcher) findAllBeings() (map[int64]beingInfo, error) {
   360  	beings := make([]beingInfo, 0)
   361  	session := w.beings.Database.Session.Copy()
   362  	defer session.Close()
   363  	beingsC := w.beings.With(session)
   364  
   365  	err := beingsC.Find(bson.D{{"model-uuid", w.modelUUID}}).All(&beings)
   366  	if err != nil {
   367  		return nil, err
   368  	}
   369  	beingInfos := make(map[int64]beingInfo, len(beings))
   370  	for _, being := range beings {
   371  		beingInfos[being.Seq] = being
   372  	}
   373  	return beingInfos, nil
   374  }
   375  
   376  // sync updates the watcher's knowledge from the database, and
   377  // queues events to observing channels. It fetches the last two time
   378  // slots and compares the union of both to the in-memory state.
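        //
        // For example, with period = 30 and hypothetical slot values, a pinger
        // that last pinged during slot 999960 is still reported alive while slot
        // 999990 is current, and only drops out once slot 1000020 becomes current.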
   379  func (w *Watcher) sync() error {
   380  	var allBeings map[int64]beingInfo
   381  	if len(w.beingKey) == 0 {
   382  		// The very first time we sync, we grab all ever-known beings,
   383  		// so we don't have to look them up one by one.
   384  		var err error
   385  		if allBeings, err = w.findAllBeings(); err != nil {
   386  			return errors.Trace(err)
   387  		}
   388  	}
   389  	s := timeSlot(time.Now(), w.delta)
   390  	slot := docIDInt64(w.modelUUID, s)
   391  	previousSlot := docIDInt64(w.modelUUID, s-period)
   392  	session := w.pings.Database.Session.Copy()
   393  	defer session.Close()
   394  	pings := w.pings.With(session)
   395  	var ping []pingInfo
   396  	q := bson.D{{"$or", []pingInfo{{DocID: slot}, {DocID: previousSlot}}}}
   397  	err := pings.Find(q).All(&ping)
   398  	if err != nil && err != mgo.ErrNotFound {
   399  		return errors.Trace(err)
   400  	}
   401  
   402  	// Learn about all enforced deaths.
   403  	// TODO(ericsnow) Remove this once KillForTesting() goes away.
   404  	dead := make(map[int64]bool)
   405  	for i := range ping {
   406  		for key, value := range ping[i].Dead {
   407  			k, err := strconv.ParseInt(key, 16, 64)
   408  			if err != nil {
   409  				err = errors.Annotatef(err, "presence cannot parse dead key: %q", key)
   410  				panic(err)
   411  			}
   412  			k *= 63
   413  			for i := int64(0); i < 63 && value > 0; i++ {
   414  				on := value&1 == 1
   415  				value >>= 1
   416  				if !on {
   417  					continue
   418  				}
   419  				seq := k + i
   420  				dead[seq] = true
   421  				logger.Tracef("found seq=%d dead", seq)
   422  			}
   423  		}
   424  	}
   425  
   426  	// Learn about all the pingers that reported and queue
   427  	// events for those that weren't known to be alive and
   428  	// are not reportedly dead either.
   429  	beingsC := w.beings.With(session)
   430  	alive := make(map[int64]bool)
   431  	being := beingInfo{}
   432  	for i := range ping {
   433  		for key, value := range ping[i].Alive {
   434  			k, err := strconv.ParseInt(key, 16, 64)
   435  			if err != nil {
   436  				err = errors.Annotatef(err, "presence cannot parse alive key: %q", key)
   437  				panic(err)
   438  			}
   439  			k *= 63
   440  			for i := int64(0); i < 63 && value > 0; i++ {
   441  				on := value&1 == 1
   442  				value >>= 1
   443  				if !on {
   444  					continue
   445  				}
   446  				seq := k + i
   447  				alive[seq] = true
   448  				if _, ok := w.beingKey[seq]; ok {
   449  					continue
   450  				}
   451  				// Check if the being exists in the 'all' map,
   452  				// otherwise do a single lookup in mongo
   453  				var ok bool
   454  				if being, ok = allBeings[seq]; !ok {
   455  					err := beingsC.Find(bson.D{{"_id", docIDInt64(w.modelUUID, seq)}}).One(&being)
   456  					if err == mgo.ErrNotFound {
   457  						logger.Tracef("found seq=%d unowned", seq)
   458  						continue
   459  					} else if err != nil {
   460  						return errors.Trace(err)
   461  					}
   462  				}
   463  				cur := w.beingSeq[being.Key]
   464  				if cur < seq {
   465  					delete(w.beingKey, cur)
   466  				} else {
   467  					// Current sequence is more recent.
   468  					continue
   469  				}
   470  				w.beingKey[seq] = being.Key
   471  				w.beingSeq[being.Key] = seq
   472  				if cur > 0 || dead[seq] {
   473  					continue
   474  				}
   475  				logger.Tracef("found seq=%d alive with key %q", seq, being.Key)
   476  				for _, ch := range w.watches[being.Key] {
   477  					w.pending = append(w.pending, event{ch, being.Key, true})
   478  				}
   479  			}
   480  		}
   481  	}
   482  
   483  	// Pingers that were known to be alive and haven't reported
   484  	// in the last two slots are now considered dead. Dispatch
   485  	// the respective events and forget their sequences.
   486  	for seq, key := range w.beingKey {
   487  		if dead[seq] || !alive[seq] {
   488  			delete(w.beingKey, seq)
   489  			delete(w.beingSeq, key)
   490  			for _, ch := range w.watches[key] {
   491  				w.pending = append(w.pending, event{ch, key, false})
   492  			}
   493  		}
   494  	}
   495  	return nil
   496  }
   497  
   498  // Pinger periodically reports that a specific key is alive, so that
   499  // watchers interested in that fact can react appropriately.
   500  type Pinger struct {
   501  	modelUUID string
   502  	mu        sync.Mutex
   503  	tomb      tomb.Tomb
   504  	base      *mgo.Collection
   505  	pings     *mgo.Collection
   506  	started   bool
   507  	beingKey  string
   508  	beingSeq  int64
   509  	fieldKey  string // hex(beingSeq / 63)
   510  	fieldBit  uint64 // 1 << (beingSeq % 63)
   511  	lastSlot  int64
   512  	delta     time.Duration
   513  }
   514  
   515  // NewPinger returns a new Pinger to report that key is alive.
   516  // It starts reporting after Start is called.
   517  func NewPinger(base *mgo.Collection, modelTag names.ModelTag, key string) *Pinger {
   518  	return &Pinger{
   519  		base:      base,
   520  		pings:     pingsC(base),
   521  		beingKey:  key,
   522  		modelUUID: modelTag.Id(),
   523  	}
   524  }
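
        // A minimal usage sketch (hypothetical caller code; presenceColl and
        // modelTag are assumed to exist):
        //
        //	p := NewPinger(presenceColl, modelTag, "a-key")
        //	if err := p.Start(); err != nil {
        //		return errors.Trace(err)
        //	}
        //	defer p.Stop() // watchers only notice once the current slot ages out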
   525  
   526  // Start starts periodically reporting that p's key is alive.
   527  func (p *Pinger) Start() error {
   528  	p.mu.Lock()
   529  	defer p.mu.Unlock()
   530  	if p.started {
   531  		return errors.Errorf("pinger already started")
   532  	}
   533  	p.tomb = tomb.Tomb{}
   534  	if err := p.prepare(); err != nil {
   535  		return errors.Trace(err)
   536  	}
   537  	logger.Tracef("starting pinger for %q with seq=%d", p.beingKey, p.beingSeq)
   538  	if err := p.ping(); err != nil {
   539  		return errors.Trace(err)
   540  	}
   541  	p.started = true
   542  	go func() {
   543  		err := p.loop()
   544  		cause := errors.Cause(err)
   545  		// tomb expects ErrDying or ErrStillAlive as
   546  		// exact values, so we need to log and unwrap
   547  		// the error first.
   548  		if err != nil && cause != tomb.ErrDying {
   549  			logger.Infof("pinger loop failed: %v", err)
   550  		}
   551  		p.tomb.Kill(cause)
   552  		p.tomb.Done()
   553  	}()
   554  	return nil
   555  }
   556  
   557  // Wait returns when the Pinger has stopped, and returns the first error
   558  // it encountered.
   559  func (p *Pinger) Wait() error {
   560  	return p.tomb.Wait()
   561  }
   562  
   563  // Stop stops p's periodical ping.
   564  // Watchers will not notice p has stopped pinging until the
   565  // previous ping times out.
   566  func (p *Pinger) Stop() error {
   567  	p.mu.Lock()
   568  	defer p.mu.Unlock()
   569  	if p.started {
   570  		logger.Tracef("stopping pinger for %q with seq=%d", p.beingKey, p.beingSeq)
   571  	}
   572  	p.tomb.Kill(nil)
   573  	err := p.tomb.Wait()
   574  	// TODO ping one more time to guarantee a late timeout.
   575  	p.started = false
   576  	return errors.Trace(err)
   577  
   578  }
   579  
   580  // KillForTesting stops p's periodical ping and immediately reports that it is dead.
   581  // TODO(ericsnow) We should be able to drop this and the two kill* methods.
   582  func (p *Pinger) KillForTesting() error {
   583  	p.mu.Lock()
   584  	defer p.mu.Unlock()
   585  	if p.started {
   586  		logger.Tracef("killing pinger for %q (was started)", p.beingKey)
   587  		return p.killStarted()
   588  	}
   589  	logger.Tracef("killing pinger for %q (was stopped)", p.beingKey)
   590  	return p.killStopped()
   591  }
   592  
   593  // killStarted kills the pinger while it is running, by first
   594  // stopping it and then recording in the last pinged slot that
   595  // the pinger was killed.
   596  func (p *Pinger) killStarted() error {
   597  	p.tomb.Kill(nil)
   598  	killErr := p.tomb.Wait()
   599  	p.started = false
   600  
   601  	slot := p.lastSlot
   602  	udoc := bson.D{
   603  		{"$set", bson.D{{"slot", slot}}},
   604  		{"$inc", bson.D{{"dead." + p.fieldKey, p.fieldBit}}}}
   605  	session := p.pings.Database.Session.Copy()
   606  	defer session.Close()
   607  	pings := p.pings.With(session)
   608  	if _, err := pings.UpsertId(docIDInt64(p.modelUUID, slot), udoc); err != nil {
   609  		return errors.Trace(err)
   610  	}
   611  	return errors.Trace(killErr)
   612  }
   613  
   614  // killStopped kills the pinger while it is not running, by
   615  // first allocating a new sequence and then atomically recording
   616  // the new sequence as both alive and dead in a single operation.
   617  func (p *Pinger) killStopped() error {
   618  	if err := p.prepare(); err != nil {
   619  		return err
   620  	}
   621  	slot := timeSlot(time.Now(), p.delta)
   622  	udoc := bson.D{
   623  		{"$set", bson.D{{"slot", slot}}},
   624  		{"$inc", bson.D{
   625  			{"dead." + p.fieldKey, p.fieldBit},
   626  			{"alive." + p.fieldKey, p.fieldBit},
   627  		}}}
   628  	session := p.pings.Database.Session.Copy()
   629  	defer session.Close()
   630  	pings := p.pings.With(session)
   631  	_, err := pings.UpsertId(docIDInt64(p.modelUUID, slot), udoc)
   632  	return errors.Trace(err)
   633  }
   634  
   635  // loop is the main pinger loop; it runs while the pinger
   636  // is in the started state.
   637  func (p *Pinger) loop() error {
   638  	for {
   639  		select {
   640  		case <-p.tomb.Dying():
   641  			return errors.Trace(tomb.ErrDying)
   642  		case <-time.After(time.Duration(float64(period+1)*0.75) * time.Second):
   643  			if err := p.ping(); err != nil {
   644  				return errors.Trace(err)
   645  			}
   646  		}
   647  	}
   648  }
   649  
   650  // prepare allocates a new unique sequence for the
   651  // pinger key and prepares the pinger to use it.
   652  func (p *Pinger) prepare() error {
   653  	change := mgo.Change{
   654  		Update:    bson.D{{"$inc", bson.D{{"seq", int64(1)}}}},
   655  		Upsert:    true,
   656  		ReturnNew: true,
   657  	}
   658  	session := p.base.Database.Session.Copy()
   659  	defer session.Close()
   660  	base := p.base.With(session)
   661  	seqs := seqsC(base)
   662  	var seq struct{ Seq int64 }
   663  	seqID := docIDStr(p.modelUUID, "beings")
   664  	if _, err := seqs.FindId(seqID).Apply(change, &seq); err != nil {
   665  		return errors.Trace(err)
   666  	}
   667  	p.beingSeq = seq.Seq
   668  	p.fieldKey = fmt.Sprintf("%x", p.beingSeq/63)
   669  	p.fieldBit = 1 << uint64(p.beingSeq%63)
   670  	p.lastSlot = 0
   671  	beings := beingsC(base)
   672  	return errors.Trace(beings.Insert(
   673  		beingInfo{
   674  			DocID:     docIDInt64(p.modelUUID, p.beingSeq),
   675  			Seq:       p.beingSeq,
   676  			ModelUUID: p.modelUUID,
   677  			Key:       p.beingKey,
   678  		},
   679  	))
   680  }
   681  
   682  // ping updates the current time slot document with the
   683  // sequence in use by the pinger.
   684  func (p *Pinger) ping() (err error) {
   685  	logger.Tracef("pinging %q with seq=%d", p.beingKey, p.beingSeq)
   686  	defer func() {
   687  		// If the session is killed from underneath us, it panics when we
   688  		// try to copy it, so deal with that here.
   689  		if v := recover(); v != nil {
   690  			err = fmt.Errorf("%v", v)
   691  		}
   692  	}()
   693  	session := p.pings.Database.Session.Copy()
   694  	defer session.Close()
   695  	if p.delta == 0 {
   696  		base := p.base.With(session)
   697  		delta, err := clockDelta(base)
   698  		if err != nil {
   699  			return errors.Trace(err)
   700  		}
   701  		p.delta = delta
   702  	}
   703  	slot := timeSlot(time.Now(), p.delta)
   704  	if slot == p.lastSlot {
   705  		// Never, ever, ping the same slot twice.
   706  		// The increment below would corrupt the slot.
   707  		return nil
   708  	}
   709  	p.lastSlot = slot
   710  	pings := p.pings.With(session)
   711  	_, err = pings.UpsertId(
   712  		docIDInt64(p.modelUUID, slot),
   713  		bson.D{
   714  			{"$set", bson.D{{"slot", slot}}},
   715  			{"$inc", bson.D{{"alive." + p.fieldKey, p.fieldBit}}},
   716  		})
   717  	return errors.Trace(err)
   718  }
   719  
   720  // clockDelta returns the approximate skew between
   721  // the local clock and the database clock.
   722  func clockDelta(c *mgo.Collection) (time.Duration, error) {
   723  	var server struct {
   724  		time.Time `bson:"retval"`
   725  	}
   726  	var isMaster struct {
   727  		LocalTime time.Time `bson:"localTime"`
   728  	}
   729  	var after time.Time
   730  	var before time.Time
   731  	var serverDelay time.Duration
   732  	supportsMasterLocalTime := true
   733  	session := c.Database.Session.Copy()
   734  	defer session.Close()
   735  	db := c.Database.With(session)
   736  	for i := 0; i < 10; i++ {
   737  		if supportsMasterLocalTime {
   738  			// Try isMaster.localTime, which is present since MongoDB 2.2
   739  			// and does not require admin privileges.
   740  			before = time.Now()
   741  			err := db.Run("isMaster", &isMaster)
   742  			after = time.Now()
   743  			if err != nil {
   744  				return 0, errors.Trace(err)
   745  			}
   746  			if isMaster.LocalTime.IsZero() {
   747  				supportsMasterLocalTime = false
   748  				continue
   749  			} else {
   750  				serverDelay = isMaster.LocalTime.Sub(before)
   751  			}
   752  		} else {
   753  			// If MongoDB doesn't have localTime as part of
   754  			// isMaster result, it means that the server is likely
   755  			// a MongoDB older than 2.2.
   756  			//
   757  			// Falling back to 'eval' works fine on versions older than
   758  			// 2.4, where it does not require admin privileges.
   759  			//
   760  			// NOTE: 'eval' takes a global write lock unless you
   761  			// specify 'nolock' (which we are not doing below, for
   762  			// no apparent reason), so it is quite likely that the
   763  			// eval could take a relatively long time to acquire
   764  			// the lock and thus cause a retry on the callDelay
   765  			// check below on a busy server.
   766  			before = time.Now()
   767  			err := db.Run(bson.D{{"$eval", "function() { return new Date(); }"}}, &server)
   768  			after = time.Now()
   769  			if err != nil {
   770  				return 0, errors.Trace(err)
   771  			}
   772  			serverDelay = server.Sub(before)
   773  		}
   774  		// If the call to the server takes longer than a few seconds we
   775  		// retry it a couple more times before giving up. It is unclear
   776  		// why the retry would help at all here.
   777  		//
   778  		// If the server takes longer than the specified amount of time
   779  		// on every single try, then we simply give up.
   780  		callDelay := after.Sub(before)
   781  		if callDelay > 5*time.Second {
   782  			continue
   783  		}
   784  		return serverDelay, nil
   785  	}
   786  	return 0, errors.Errorf("cannot synchronize clock with database server")
   787  }
   788  
   789  // timeSlot returns the current time slot, in seconds since the
   790  // epoch, for the provided now time. The delta skew is applied
   791  // to the now time to improve the synchronization with a
   792  // centrally agreed time.
   793  //
   794  // The result of this method may be manipulated for test purposes
   795  // by fakeTimeSlot and realTimeSlot.
   796  func timeSlot(now time.Time, delta time.Duration) int64 {
   797  	fakeMutex.Lock()
   798  	fake := !fakeNow.IsZero()
   799  	if fake {
   800  		now = fakeNow
   801  	}
   802  	slot := now.Add(delta).Unix()
   803  	slot -= slot % period
   804  	if fake {
   805  		slot += int64(fakeOffset) * period
   806  	}
   807  	fakeMutex.Unlock()
   808  	return slot
   809  }
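
        // As a worked example with hypothetical values: with period = 30 and
        // delta = 2 * time.Second, a now whose Unix time is 1000000 yields
        //
        //	adjusted := int64(1000000) + 2     // now.Add(delta).Unix()
        //	slot := adjusted - adjusted%period // 1000002 - 12 = 999990
        //
        // so every pinger reporting between Unix times 999990 and 1000019
        // (inclusive) lands in the same slot document.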
   810  
   811  var (
   812  	fakeMutex  sync.Mutex // protects fakeOffset, fakeNow
   813  	fakeNow    time.Time
   814  	fakeOffset int
   815  )
   816  
   817  // fakeTimeSlot hardcodes the slot time returned by the timeSlot
   818  // function for testing purposes. The offset parameter is the slot
   819  // position to return: offsets +1 and -1 are +period and -period
   820  // seconds from slot 0, respectively.
   821  func fakeTimeSlot(offset int) {
   822  	fakeMutex.Lock()
   823  	if fakeNow.IsZero() {
   824  		fakeNow = time.Now()
   825  	}
   826  	fakeOffset = offset
   827  	fakeMutex.Unlock()
   828  	logger.Infof("faking presence to time slot %d", offset)
   829  }
   830  
   831  // realTimeSlot disables the hardcoding introduced by fakeTimeSlot.
   832  func realTimeSlot() {
   833  	fakeMutex.Lock()
   834  	fakeNow = time.Time{}
   835  	fakeOffset = 0
   836  	fakeMutex.Unlock()
   837  	logger.Infof("not faking presence time. Real time slot in use.")
   838  }
   839  
   840  func seqsC(base *mgo.Collection) *mgo.Collection {
   841  	return base.Database.C(base.Name + ".seqs")
   842  }
   843  
   844  func beingsC(base *mgo.Collection) *mgo.Collection {
   845  	return base.Database.C(base.Name + ".beings")
   846  }
   847  
   848  func pingsC(base *mgo.Collection) *mgo.Collection {
   849  	return base.Database.C(base.Name + ".pings")
   850  }