github.com/wallyworld/juju@v0.0.0-20161013125918-6cf1bc9d917a/mongo/oplog.go (about)

     1  // Copyright 2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package mongo
     5  
     6  import (
     7  	"reflect"
     8  	"time"
     9  
    10  	"github.com/juju/errors"
    11  	"gopkg.in/mgo.v2"
    12  	"gopkg.in/mgo.v2/bson"
    13  	"gopkg.in/tomb.v1"
    14  )
    15  
// OplogDoc represents a document in the oplog.rs collection.
// See: http://www.kchodorow.com/blog/2010/10/12/replication-internals/
//
// Fields are mapped from the oplog document keys given in their bson
// tags: "ts" (Timestamp), "h" (OperationId), "v" (MongoVersion), "op"
// (Operation), "ns" (Namespace), "o" (Object) and "o2" (UpdateObject).
// OplogTailer uses Timestamp and OperationId to avoid re-reporting
// entries when it has to restart its iterator.
//
// The Object and UpdateObject fields are returned raw to allow
// unmarshalling into arbitrary types. Use the UnmarshalObject and
// UnmarshalUpdate methods to unmarshal these fields.
type OplogDoc struct {
	Timestamp    bson.MongoTimestamp `bson:"ts"`
	OperationId  int64               `bson:"h"`
	MongoVersion int                 `bson:"v"`
	Operation    string              `bson:"op"` // "i" - insert, "u" - update, "d" - delete
	Namespace    string              `bson:"ns"`
	Object       *bson.Raw           `bson:"o"`
	UpdateObject *bson.Raw           `bson:"o2"`
}
    31  
// UnmarshalObject unmarshals the Object field (oplog key "o") into
// out. The out argument should be a pointer or a suitable map.
//
// If the Object field is not set, nothing is decoded: a pointer has
// its target reset to the zero value, a map is emptied, and any other
// kind of argument results in an error.
func (d *OplogDoc) UnmarshalObject(out interface{}) error {
	return d.unmarshal(d.Object, out)
}
    37  
// UnmarshalUpdate unmarshals the UpdateObject field (oplog key "o2")
// into out. The out argument should be a pointer or a suitable map.
//
// If the UpdateObject field is not set, nothing is decoded: a pointer
// has its target reset to the zero value, a map is emptied, and any
// other kind of argument results in an error.
func (d *OplogDoc) UnmarshalUpdate(out interface{}) error {
	return d.unmarshal(d.UpdateObject, out)
}
    43  
    44  func (d *OplogDoc) unmarshal(raw *bson.Raw, out interface{}) error {
    45  	if raw == nil {
    46  		// If the field is not set, set out to the zero value for its type.
    47  		v := reflect.ValueOf(out)
    48  		switch v.Kind() {
    49  		case reflect.Ptr:
    50  			v = v.Elem()
    51  			v.Set(reflect.Zero(v.Type()))
    52  		case reflect.Map:
    53  			// Empty the map.
    54  			for _, k := range v.MapKeys() {
    55  				v.SetMapIndex(k, reflect.Value{})
    56  			}
    57  		default:
    58  			return errors.New("output must be a pointer or map")
    59  		}
    60  		return nil
    61  	}
    62  	return raw.Unmarshal(out)
    63  }
    64  
    65  // NewMongoTimestamp returns a bson.MongoTimestamp repesentation for
    66  // the time.Time given. Note that these timestamps are not the same
    67  // the usual MongoDB time fields. These are an internal format used
    68  // only in a few places such as the replication oplog.
    69  //
    70  // See: http://docs.mongodb.org/manual/reference/bson-types/#timestamps
    71  func NewMongoTimestamp(t time.Time) bson.MongoTimestamp {
    72  	unixTime := t.Unix()
    73  	if unixTime < 0 {
    74  		unixTime = 0
    75  	}
    76  	return bson.MongoTimestamp(unixTime << 32)
    77  }
    78  
    79  // GetOplog returns the the oplog collection in the local database.
    80  func GetOplog(session *mgo.Session) *mgo.Collection {
    81  	return session.DB("local").C("oplog.rs")
    82  }
    83  
    84  func isRealOplog(c *mgo.Collection) bool {
    85  	return c.Database.Name == "local" && c.Name == "oplog.rs"
    86  }
    87  
// OplogIterator defines the parts of the mgo.Iter that we use - this
// interface allows us to switch out the querying for testing.
//
// OplogTailer calls Next with a pointer to an OplogDoc and treats a
// true result as "a document was read". After a false result it
// consults Err and then Timeout to decide whether to fail, keep using
// the iterator, or ask the session for a new one.
type OplogIterator interface {
	Next(interface{}) bool
	Err() error
	Timeout() bool
}
    95  
// OplogSession represents a connection to the oplog store, used
// to create an iterator to get oplog documents (and recreate it if it
// gets killed or times out).
//
// NewIter takes the timestamp to resume from and the operation ids
// that have already been reported for that timestamp, so that a
// recreated iterator does not report them again. Close releases the
// session; OplogTailer calls it when its goroutine exits.
type OplogSession interface {
	NewIter(bson.MongoTimestamp, []int64) OplogIterator
	Close()
}
   103  
// oplogSession is the mgo-backed value returned by NewOplogSession;
// its NewIter and Close methods match the OplogSession interface.
//
// session is the copied mgo session that Close shuts down, collection
// is the oplog collection bound to that session, and query is the base
// selector that NewIter extends with timestamp and operation id
// filters.
type oplogSession struct {
	session    *mgo.Session
	collection *mgo.Collection
	query      bson.D
}
   109  
   110  // NewOplogSession defines a new OplogSession.
   111  //
   112  // Arguments:
   113  // - "collection" is the collection to use for the oplog. Typically this
   114  //   would be the result of GetOpLog.
   115  // - "query" can be used to limit the returned oplog entries. A
   116  //    typical filter would limit based on ns ("<database>.<collection>")
   117  //    and o (object).
   118  //
   119  // The returned session should be `Close`d when it's no longer needed.
   120  func NewOplogSession(collection *mgo.Collection, query bson.D) *oplogSession {
   121  	// Use a fresh session for the tailer.
   122  	session := collection.Database.Session.Copy()
   123  	return &oplogSession{
   124  		session:    session,
   125  		collection: collection.With(session),
   126  		query:      query,
   127  	}
   128  }
   129  
// oplogTailTimeout is the timeout handed to Tail when NewIter creates
// an iterator. Timing the tail call out regularly lets the tailer
// notice requests to stop.
const oplogTailTimeout = time.Second
   131  
   132  func (s *oplogSession) NewIter(fromTimestamp bson.MongoTimestamp, excludeIds []int64) OplogIterator {
   133  	// When recreating the iterator (required when the cursor
   134  	// is invalidated) avoid reporting oplog entries that have
   135  	// already been reported.
   136  	sel := append(s.query,
   137  		bson.DocElem{"ts", bson.D{{"$gte", fromTimestamp}}},
   138  		bson.DocElem{"h", bson.D{{"$nin", excludeIds}}},
   139  	)
   140  
   141  	query := s.collection.Find(sel)
   142  	if isRealOplog(s.collection) {
   143  		// Apply an optimisation that is only supported with
   144  		// the real oplog.
   145  		query = query.LogReplay()
   146  	}
   147  
   148  	// Time the tail call out every second so that requests to
   149  	// stop can be honoured.
   150  	return query.Tail(oplogTailTimeout)
   151  }
   152  
// Close closes the mgo session that was copied for this oplogSession
// by NewOplogSession.
func (s *oplogSession) Close() {
	s.session.Close()
}
   156  
   157  // NewOplogTailer returns a new OplogTailer.
   158  //
   159  // Arguments:
   160  // - "session" determines the collection and filtering on records that
   161  //    should be returned.
   162  // - "initialTs" sets the operation timestamp to start returning
   163  //    results from. This can be used to avoid an expensive initial search
   164  //    through the oplog when the tailer first starts.
   165  //
   166  // Remember to call Stop on the returned OplogTailer when it is no
   167  // longer needed.
   168  func NewOplogTailer(
   169  	session OplogSession,
   170  	initialTs time.Time,
   171  ) *OplogTailer {
   172  	t := &OplogTailer{
   173  		session:   session,
   174  		initialTs: NewMongoTimestamp(initialTs),
   175  		outCh:     make(chan *OplogDoc),
   176  	}
   177  	go func() {
   178  		defer func() {
   179  			close(t.outCh)
   180  			t.tomb.Done()
   181  			session.Close()
   182  		}()
   183  		t.tomb.Kill(t.loop())
   184  	}()
   185  	return t
   186  }
   187  
// OplogTailer tails MongoDB's replication oplog.
//
// tomb tracks the lifecycle of the tailing goroutine, session creates
// the iterators used to read the oplog, initialTs is the timestamp the
// first iterator starts from, and outCh is the unbuffered channel on
// which matching entries are delivered (exposed via Out).
type OplogTailer struct {
	tomb      tomb.Tomb
	session   OplogSession
	initialTs bson.MongoTimestamp
	outCh     chan *OplogDoc
}
   195  
// Out returns a channel that reports the oplog entries produced by the
// OplogSession passed to NewOplogTailer as they appear. The channel is
// closed when the tailer's goroutine exits.
func (t *OplogTailer) Out() <-chan *OplogDoc {
	return t.outCh
}
   201  
// Dying returns a channel that will be closed when the OplogTailer is
// shutting down.
func (t *OplogTailer) Dying() <-chan struct{} {
	return t.tomb.Dying()
}
   207  
// Stop shuts down the OplogTailer. It asks the tailing goroutine to
// stop by killing the tomb with a nil reason and then blocks in
// tomb.Wait until shutdown is complete, returning whatever error the
// tomb reports.
func (t *OplogTailer) Stop() error {
	t.tomb.Kill(nil)
	return t.tomb.Wait()
}
   214  
// Err returns the error that caused the OplogTailer to stop, as
// recorded by its tomb. If it finished normally then nil will be
// returned.
//
// NOTE(review): tomb.v1 documents Tomb.Err as returning
// tomb.ErrStillAlive while the goroutine is still alive, so this
// probably does not return nil for a tailer that hasn't stopped -
// confirm against the tomb package.
func (t *OplogTailer) Err() error {
	return t.tomb.Err()
}
   220  
   221  func (t *OplogTailer) loop() error {
   222  	var iter OplogIterator
   223  
   224  	// lastTimestamp tracks the most recent oplog timestamp reported.
   225  	lastTimestamp := t.initialTs
   226  
   227  	// idsForLastTimestamp records the unique operation ids that have
   228  	// been reported for the most recently reported oplog
   229  	// timestamp. This is used to avoid re-reporting oplog entries
   230  	// when the iterator is restarted. These timestamps are unique for
   231  	// a given mongod but when there's multiple replicaset members
   232  	// it's possible for there to be multiple oplog entries for a
   233  	// given timestamp.
   234  	//
   235  	// See: http://docs.mongodb.org/v2.4/reference/bson-types/#timestamps
   236  	var idsForLastTimestamp []int64
   237  
   238  	for {
   239  		if t.dying() {
   240  			return tomb.ErrDying
   241  		}
   242  
   243  		if iter == nil {
   244  			iter = t.session.NewIter(lastTimestamp, idsForLastTimestamp)
   245  		}
   246  
   247  		var doc OplogDoc
   248  		if iter.Next(&doc) {
   249  			select {
   250  			case <-t.tomb.Dying():
   251  				return tomb.ErrDying
   252  			case t.outCh <- &doc:
   253  			}
   254  
   255  			if doc.Timestamp > lastTimestamp {
   256  				lastTimestamp = doc.Timestamp
   257  				idsForLastTimestamp = nil
   258  			}
   259  			idsForLastTimestamp = append(idsForLastTimestamp, doc.OperationId)
   260  		} else {
   261  			if err := iter.Err(); err != nil && err != mgo.ErrCursor {
   262  				return err
   263  			}
   264  			if iter.Timeout() {
   265  				continue
   266  			}
   267  			// Either there's no error or the error is an expired
   268  			// cursor. Force recreating the iterator next loop by
   269  			// marking it as nil.
   270  			iter = nil
   271  		}
   272  	}
   273  }
   274  
   275  func (t *OplogTailer) dying() bool {
   276  	select {
   277  	case <-t.tomb.Dying():
   278  		return true
   279  	default:
   280  		return false
   281  	}
   282  }