github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/state/watcher/txnwatcher.go

// Copyright 2012, 2013 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package watcher

import (
	"time"

	"github.com/juju/errors"
	"gopkg.in/juju/worker.v1"
	"gopkg.in/mgo.v2"
	"gopkg.in/mgo.v2/bson"
	"gopkg.in/retry.v1"
	"gopkg.in/tomb.v2"

	"github.com/juju/juju/mongo"
)

// Hub represents a pubsub hub. The TxnWatcher only ever publishes
// events to the hub.
type Hub interface {
	Publish(topic string, data interface{}) <-chan struct{}
}
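
// A minimal no-op Hub satisfying the interface above, useful for tests or
// illustration. This is a sketch, not part of the original source; in
// production juju wires in a real pubsub hub.
//
//	type nullHub struct{}
//
//	func (nullHub) Publish(topic string, data interface{}) <-chan struct{} {
//		done := make(chan struct{})
//		close(done) // nothing subscribes, so report completion immediately
//		return done
//	}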

// Clock represents the time methods used.
type Clock interface {
	Now() time.Time
	After(time.Duration) <-chan time.Time
}
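
// A wall-clock Clock built on the standard library, as an illustrative
// sketch (not part of the original source; juju normally injects its own
// clock implementation so tests can control time):
//
//	type wallClock struct{}
//
//	func (wallClock) Now() time.Time                         { return time.Now() }
//	func (wallClock) After(d time.Duration) <-chan time.Time { return time.After(d) }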

const (
	txnWatcherStarting   = "starting"
	txnWatcherSyncErr    = "sync err"
	txnWatcherCollection = "collection"

	txnWatcherShortWait = 10 * time.Millisecond
)

var (
	// PollStrategy is used to determine how long
	// to delay between poll intervals. A new timer
	// is created each time some watcher event is
	// fired or if the old timer completes.
	//
	// It must not be changed when any watchers are active.
	PollStrategy retry.Strategy = retry.Exponential{
		Initial:  txnWatcherShortWait,
		Factor:   1.5,
		MaxDelay: 5 * time.Second,
	}

	// TxnPollNotifyFunc allows tests to be able to specify
	// callbacks each time the database has been polled and processed.
	TxnPollNotifyFunc func()
)
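
// For a sense of the poll cadence implied by PollStrategy: assuming no
// jitter, an idle watcher sleeps roughly 10ms, 15ms, 22.5ms, 33.75ms, ...
// between polls, each delay 1.5x the previous one, until capped at the 5s
// MaxDelay. This note is an illustration, not part of the original source.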

type txnChange struct {
	collection string
	docID      interface{}
	revID      int64
}

// A TxnWatcher watches the txns.log collection and publishes all change events
// to the hub.
type TxnWatcher struct {
	hub    Hub
	clock  Clock
	logger Logger

	tomb         tomb.Tomb
	iteratorFunc func() mongo.Iterator
	log          *mgo.Collection

	// notifySync is copied from the package variable when the watcher
	// is created.
	notifySync func()

	reportRequest chan chan map[string]interface{}

	// syncEvents contains the events to be dispatched to the watcher
	// channels. They're queued during processing and flushed at the end
	// to simplify the algorithm. Because sync iterates the changelog
	// newest first, the queue holds events in reverse chronological
	// order, and flush walks it backwards to publish them oldest first.
	syncEvents []Change

	// iteratorStepCount tracks how many documents we've read from the database
	iteratorStepCount uint64

	// changesCount tracks all sync events that we have processed
	changesCount uint64

	// syncEventsLastLen was the length of syncEvents when we did our last flush()
	syncEventsLastLen int

	// averageSyncLen tracks a filtered average of how long the sync event queue gets before we flush
	averageSyncLen float64

	// lastId is the most recent transaction id observed by a sync.
	lastId interface{}
}

// TxnWatcherConfig contains the configuration parameters required
// by NewTxnWatcher.
type TxnWatcherConfig struct {
	// ChangeLog is usually the txns.log collection.
	ChangeLog *mgo.Collection
	// Hub is where the changes are published to.
	Hub Hub
	// Clock allows tests to control the advancing of time.
	Clock Clock
	// Logger is used to control where the log messages for this watcher go.
	Logger Logger
	// IteratorFunc can be overridden in tests to control what values the
	// watcher sees.
	IteratorFunc func() mongo.Iterator
}

// Validate ensures that all the values that have to be set are set.
func (config TxnWatcherConfig) Validate() error {
	if config.ChangeLog == nil {
		return errors.NotValidf("missing ChangeLog")
	}
	if config.Hub == nil {
		return errors.NotValidf("missing Hub")
	}
	if config.Clock == nil {
		return errors.NotValidf("missing Clock")
	}
	return nil
}

// NewTxnWatcher returns a new TxnWatcher observing the changelog collection,
// which must be a capped collection maintained by mgo/txn.
func NewTxnWatcher(config TxnWatcherConfig) (*TxnWatcher, error) {
	if err := config.Validate(); err != nil {
		return nil, errors.Annotate(err, "new TxnWatcher invalid config")
	}

	w := &TxnWatcher{
		hub:           config.Hub,
		clock:         config.Clock,
		logger:        config.Logger,
		log:           config.ChangeLog,
		iteratorFunc:  config.IteratorFunc,
		notifySync:    TxnPollNotifyFunc,
		reportRequest: make(chan chan map[string]interface{}),
	}
	if w.iteratorFunc == nil {
		w.iteratorFunc = w.iter
	}
	if w.logger == nil {
		w.logger = noOpLogger{}
	}
	w.tomb.Go(func() error {
		err := w.loop()
		cause := errors.Cause(err)
		// tomb expects ErrDying or ErrStillAlive as
		// exact values, so we need to log and unwrap
		// the error first.
		if err != nil && cause != tomb.ErrDying {
			w.logger.Infof("watcher loop failed: %v", err)
		}
		return cause
	})
	return w, nil
}
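
// Putting it together, construction looks roughly like the following sketch
// (not part of the original source; session, hub, and clk stand in for an
// established mgo session, a Hub implementation, and a Clock, and the
// database and collection names are illustrative):
//
//	w, err := NewTxnWatcher(TxnWatcherConfig{
//		ChangeLog: session.DB("juju").C("txns.log"),
//		Hub:       hub,
//		Clock:     clk,
//	})
//	if err != nil {
//		return errors.Trace(err)
//	}
//	defer func() { _ = w.Stop() }()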

// Kill is part of the worker.Worker interface.
func (w *TxnWatcher) Kill() {
	w.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *TxnWatcher) Wait() error {
	return w.tomb.Wait()
}

// Stop stops all the watcher activities.
func (w *TxnWatcher) Stop() error {
	return worker.Stop(w)
}

// Dead returns a channel that is closed when the watcher has stopped.
func (w *TxnWatcher) Dead() <-chan struct{} {
	return w.tomb.Dead()
}

// Err returns the error with which the watcher stopped.
// It returns nil if the watcher stopped cleanly, tomb.ErrStillAlive
// if the watcher is still running properly, or the respective error
// if the watcher is terminating or has terminated with an error.
func (w *TxnWatcher) Err() error {
	return w.tomb.Err()
}

// Report is part of the watcher/runner Reporting interface, to expose runtime details of the watcher.
func (w *TxnWatcher) Report() map[string]interface{} {
	// TODO: (jam) do we need to synchronize with the loop?
	resCh := make(chan map[string]interface{})
	select {
	case <-w.tomb.Dying():
		return nil
	case w.reportRequest <- resCh:
		break
	}
	select {
	case <-w.tomb.Dying():
		return nil
	case res := <-resCh:
		return res
	}
}
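
// A Report result looks like the following sketch (keys come from the loop
// below; the values here are invented for illustration):
//
//	map[string]interface{}{
//		"sync-events-last-len": 2,
//		"sync-events-avg":      1,
//		"sync-events-len":      0,
//		"sync-events-cap":      8,
//		"total-changes":        uint64(42),
//		"iterator-step-count":  uint64(57),
//	}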

// loop implements the main watcher loop. The delay between syncs is
// controlled by PollStrategy, and is reset whenever a sync finds changes.
func (w *TxnWatcher) loop() error {
	w.logger.Tracef("loop started")
	defer w.logger.Tracef("loop finished")
	// Make sure we have read the last ID before telling people
	// we have started.
	if err := w.initLastId(); err != nil {
		return errors.Trace(err)
	}
	// Also make sure we have prepared the timer before
	// we tell people we've started.
	now := w.clock.Now()
	backoff := PollStrategy.NewTimer(now)
	d, _ := backoff.NextSleep(now)
	next := w.clock.After(d)
	w.hub.Publish(txnWatcherStarting, nil)
	for {
		select {
		case <-w.tomb.Dying():
			return errors.Trace(tomb.ErrDying)
		case <-next:
			d, ok := backoff.NextSleep(w.clock.Now())
			if !ok {
				// This shouldn't happen, but be defensive.
				backoff = PollStrategy.NewTimer(w.clock.Now())
			}
			next = w.clock.After(d)
		case resCh := <-w.reportRequest:
			report := map[string]interface{}{
				// How long was sync-events in our last flush
				"sync-events-last-len": w.syncEventsLastLen,
				// How long is sync-events on average
				"sync-events-avg": int(w.averageSyncLen + 0.5),
				// How long is the queue right now? (probably should always be 0 if we are at this point in the loop)
				"sync-events-len": len(w.syncEvents),
				// How big is our buffer
				"sync-events-cap": cap(w.syncEvents),
				// How many events have we actually generated
				"total-changes": w.changesCount,
				// How many database records have we read. note: because we have to iterate until we get to lastId,
				// this is often a bit bigger than total-sync-events
				"iterator-step-count": w.iteratorStepCount,
			}
			select {
			case <-w.tomb.Dying():
				return errors.Trace(tomb.ErrDying)
			case resCh <- report:
			}
			// This doesn't indicate we need to perform a sync
			continue
		}

		added, err := w.sync()
		if err != nil {
			w.hub.Publish(txnWatcherSyncErr, nil)
			return errors.Trace(err)
		}
		w.flush()
		if added {
			// Something's happened, so reset the exponential backoff
			// so we'll retry again quickly.
			backoff = PollStrategy.NewTimer(w.clock.Now())
			next = w.clock.After(txnWatcherShortWait)
		} else if w.notifySync != nil {
			w.notifySync()
		}
	}
}

// flush sends all pending events to their respective channels.
func (w *TxnWatcher) flush() {
	// syncEvents are stored newest first.
	for i := len(w.syncEvents) - 1; i >= 0; i-- {
		e := w.syncEvents[i]
		w.hub.Publish(txnWatcherCollection, e)
	}
	w.averageSyncLen = (filterFactor * float64(len(w.syncEvents))) + ((1.0 - filterFactor) * w.averageSyncLen)
	w.syncEventsLastLen = len(w.syncEvents)
	w.syncEvents = w.syncEvents[:0]
	// TODO(jam): 2018-11-07 Consider if averageSyncLen << cap(syncEvents) we should reallocate the buffer, so that it
	// doesn't grow to the size of the largest-ever change and never shrink
}
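
// The averageSyncLen update above is an exponential moving average:
//
//	avg' = filterFactor*len(syncEvents) + (1-filterFactor)*avg
//
// filterFactor is defined elsewhere in this package. As an illustration
// (assuming a filterFactor of 0.01), a sustained queue length of 100 events
// moves the average about 1% of the remaining distance toward 100 per flush.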

// initLastId reads the most recent changelog document and initializes
// lastId with it. This causes all history that precedes the creation
// of the watcher to be ignored.
func (w *TxnWatcher) initLastId() error {
	var entry struct {
		Id interface{} `bson:"_id"`
	}
	err := w.log.Find(nil).Sort("-$natural").One(&entry)
	if err != nil && err != mgo.ErrNotFound {
		return errors.Trace(err)
	}
	w.lastId = entry.Id
	return nil
}

func (w *TxnWatcher) iter() mongo.Iterator {
	return w.log.Find(nil).Batch(10).Sort("-$natural").Iter()
}

// sync updates the watcher knowledge from the database, and
// queues events to observing channels.
func (w *TxnWatcher) sync() (bool, error) {
	w.logger.Tracef("txn watcher %p starting sync", w)
	added := false
	// Iterate through log events in reverse insertion order (newest first).
	iter := w.iteratorFunc()
	seen := make(map[watchKey]bool)
	first := true
	lastId := w.lastId
	var entry bson.D
	for iter.Next(&entry) {
		w.iteratorStepCount++
		if len(entry) == 0 {
			// Skip the document entirely: indexing entry[0] below
			// would panic on an empty bson.D.
			w.logger.Tracef("got empty changelog document")
			continue
		}
		id := entry[0]
		if id.Name != "_id" {
			w.logger.Warningf("watcher: _id field isn't first entry")
			continue
		}
		if first {
			w.lastId = id.Value
			first = false
		}
		if id.Value == lastId {
			break
		}
		w.logger.Tracef("%p step %d got changelog document: %#v", w, w.iteratorStepCount, entry)
		for _, c := range entry[1:] {
			// See txn's Runner.ChangeLog for the structure of log entries.
			var d, r []interface{}
			dr, _ := c.Value.(bson.D)
			for _, item := range dr {
				switch item.Name {
				case "d":
					d, _ = item.Value.([]interface{})
				case "r":
					r, _ = item.Value.([]interface{})
				}
			}
			if len(d) == 0 || len(d) != len(r) {
				w.logger.Warningf("changelog has invalid collection document: %#v", c)
				continue
			}
			for i := len(d) - 1; i >= 0; i-- {
				key := watchKey{c.Name, d[i]}
				if seen[key] {
					continue
				}
				seen[key] = true
				revno, ok := r[i].(int64)
				if !ok {
					w.logger.Warningf("changelog has revno with type %T: %#v", r[i], r[i])
					continue
				}
				if revno < 0 {
					revno = -1
				}
				w.syncEvents = append(w.syncEvents, Change{
					C:     c.Name,
					Id:    d[i],
					Revno: revno,
				})
				w.changesCount++
				added = true
			}
		}
	}
	if err := iter.Close(); err != nil {
		return false, errors.Annotate(err, "watcher iteration error")
	}
	return added, nil
}
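
// For reference, sync expects changelog documents shaped roughly like the
// sketch below, following mgo/txn's Runner.ChangeLog (collection names and
// ids here are invented for illustration):
//
//	{
//		"_id":      <txn id>,
//		"machines": {"d": ["0", "1"], "r": [int64(2), int64(5)]},
//		"units":    {"d": ["wp/0"], "r": [int64(3)]},
//	}
//
// Each collection element pairs changed document ids ("d") with their new
// revnos ("r"); a negative revno marks a removed document and is published
// as -1.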