github.com/mattyw/juju@v0.0.0-20140610034352-732aecd63861/worker/singular/mongo_test.go (about)

     1  // Copyright 2014 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package singular_test
     5  
     6  import (
     7  	"flag"
     8  	"fmt"
     9  	"strings"
    10  	"time"
    11  
    12  	"github.com/juju/loggo"
    13  	"github.com/juju/utils"
    14  	"labix.org/v2/mgo"
    15  	gc "launchpad.net/gocheck"
    16  
    17  	"github.com/juju/juju/replicaset"
    18  	"github.com/juju/juju/testing"
    19  	"github.com/juju/juju/worker"
    20  	"github.com/juju/juju/worker/singular"
    21  )
    22  
    23  var logger = loggo.GetLogger("juju.singular-test")
    24  
    25  type mongoSuite struct {
    26  	testing.BaseSuite
    27  }
    28  
    29  var enableUnreliableTests = flag.Bool("juju.unreliabletests", false, "enable unreliable and slow tests")
    30  
    31  var _ = gc.Suite(&mongoSuite{})
    32  
    33  func (*mongoSuite) SetUpSuite(c *gc.C) {
    34  	if !*enableUnreliableTests {
    35  		c.Skip("skipping unreliable tests")
    36  	}
    37  }
    38  
    39  // start replica set with three mongods
    40  // start singular worker on each one.
    41  // change worker priorities so the master changes.
    42  // check that
    43  // a) there is never more than one running at a time
    44  // b) the running worker changes when the master changes.
    45  
    46  func (*mongoSuite) TestMongoMastership(c *gc.C) {
    47  	insts, err := startReplicaSet(3)
    48  	c.Assert(err, gc.IsNil)
    49  	for _, inst := range insts {
    50  		defer inst.Destroy()
    51  	}
    52  	notifyCh := make(chan event, 100)
    53  	globalState := newGlobalAgentState(len(insts), notifyCh)
    54  
    55  	agents := startAgents(c, notifyCh, insts)
    56  
    57  	assertAgentsConnect(c, globalState)
    58  
    59  	// Wait for one of the agents to start.
    60  	for globalState.activeId == -1 {
    61  		globalState.waitEvent(c)
    62  	}
    63  	c.Logf("agent %d started; waiting for servers to sync", globalState.activeId)
    64  	time.Sleep(1 * time.Minute)
    65  
    66  	// Try to choose a different agent than the primary to
    67  	// make master (note we can't just do (activeId+1)%len(insts)
    68  	// because ids start at 1 not 0)
    69  	nextId := ((globalState.activeId+1)-1)%len(insts) + 1
    70  
    71  	c.Logf("giving agent %d priority to become master", nextId)
    72  	changeVotes(c, insts, nextId)
    73  
    74  	// Wait for the first agent to stop and another agent
    75  	// to start. Note that because of mongo's vagaries, we
    76  	// cannot be sure which agent will actually start, even
    77  	// though we've set our priorities to hope that a
    78  	// particular mongo instance (nextId) becomes master.
    79  	oldId := globalState.activeId
    80  	oldHasStopped := false
    81  	for {
    82  		if oldHasStopped && globalState.activeId != -1 {
    83  			break
    84  		}
    85  		got := globalState.waitEvent(c)
    86  		if got.kind == "stop" && got.id == oldId {
    87  			oldHasStopped = true
    88  		}
    89  	}
    90  
    91  	// Kill all the agents and wait for them to quit.
    92  	for _, a := range agents {
    93  		if a.Runner == nil {
    94  			panic("runner is nil")
    95  		}
    96  		a.Kill()
    97  	}
    98  
    99  	assertAgentsQuit(c, globalState)
   100  }
   101  
   102  func startAgents(c *gc.C, notifyCh chan<- event, insts []*testing.MgoInstance) []*agent {
   103  	agents := make([]*agent, len(insts))
   104  	for i, inst := range insts {
   105  		a := &agent{
   106  			// Note: we use ids starting from 1 to match
   107  			// the replica set ids.
   108  			notify: &notifier{
   109  				id: i + 1,
   110  				ch: notifyCh,
   111  			},
   112  			Runner:   newRunner(),
   113  			hostPort: inst.Addr(),
   114  		}
   115  		go func() {
   116  			err := a.run()
   117  			a.notify.agentQuit(err)
   118  		}()
   119  		agents[i] = a
   120  	}
   121  	return agents
   122  }
   123  
   124  // assertAgentsConnect waits for all the agents to connect.
   125  func assertAgentsConnect(c *gc.C, globalState *globalAgentState) {
   126  	allConnected := func() bool {
   127  		for _, connected := range globalState.connected {
   128  			if !connected {
   129  				return false
   130  			}
   131  		}
   132  		return true
   133  	}
   134  	for !allConnected() {
   135  		globalState.waitEvent(c)
   136  	}
   137  }
   138  
   139  func assertAgentsQuit(c *gc.C, globalState *globalAgentState) {
   140  	allQuit := func() bool {
   141  		for _, quit := range globalState.quit {
   142  			if !quit {
   143  				return false
   144  			}
   145  		}
   146  		return true
   147  	}
   148  	for !allQuit() {
   149  		globalState.waitEvent(c)
   150  	}
   151  }
   152  
   153  type agent struct {
   154  	notify *notifier
   155  	worker.Runner
   156  	hostPort string
   157  }
   158  
   159  func (a *agent) run() error {
   160  	a.Runner.StartWorker(fmt.Sprint("mongo-", a.notify.id), a.mongoWorker)
   161  	return a.Runner.Wait()
   162  }
   163  
   164  func (a *agent) mongoWorker() (worker.Worker, error) {
   165  	dialInfo := testing.MgoDialInfo(a.hostPort)
   166  	session, err := mgo.DialWithInfo(dialInfo)
   167  	if err != nil {
   168  		return nil, err
   169  	}
   170  	mc := &mongoConn{
   171  		localHostPort: a.hostPort,
   172  		session:       session,
   173  	}
   174  	runner := worker.NewRunner(
   175  		connectionIsFatal(mc),
   176  		func(err0, err1 error) bool { return true },
   177  	)
   178  	singularRunner, err := singular.New(runner, mc)
   179  	if err != nil {
   180  		return nil, fmt.Errorf("cannot start singular runner: %v", err)
   181  	}
   182  	a.notify.workerConnected()
   183  	singularRunner.StartWorker(fmt.Sprint("worker-", a.notify.id), func() (worker.Worker, error) {
   184  		return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
   185  			return a.worker(session, stop)
   186  		}), nil
   187  	})
   188  	return runner, nil
   189  }
   190  
   191  func (a *agent) worker(session *mgo.Session, stop <-chan struct{}) error {
   192  	a.notify.workerStarted()
   193  	defer a.notify.workerStopped()
   194  	coll := session.DB("foo").C("bar")
   195  	for {
   196  		select {
   197  		case <-stop:
   198  			return nil
   199  		case <-time.After(250 * time.Millisecond):
   200  		}
   201  		if err := coll.Insert(struct{}{}); err != nil {
   202  			return fmt.Errorf("insert error: %v", err)
   203  		}
   204  		a.notify.operation()
   205  	}
   206  }
   207  
   208  // globalAgentState keeps track of the global state
   209  // of all the running "agents". The state is
   210  // updated by the waitEvent method.
   211  // The slices (connected, started and quit) hold an entry for each
   212  // agent - the entry for the agent with id x is held at index x-1.
   213  type globalAgentState struct {
   214  	numAgents int
   215  	notifyCh  <-chan event
   216  
   217  	// connected reports which agents have ever connected.
   218  	connected []bool
   219  
   220  	// started reports which agents have started.
   221  	started []bool
   222  
   223  	// quit reports which agents have quit.
   224  	quit []bool
   225  
   226  	// activeId holds the id of the agent that is
   227  	// currently performing operations.
   228  	activeId int
   229  }
   230  
   231  // newGlobalAgentState returns a globalAgentState instance that keeps track
   232  // of the given number of agents which all send events on notifyCh.
   233  func newGlobalAgentState(numAgents int, notifyCh <-chan event) *globalAgentState {
   234  	return &globalAgentState{
   235  		notifyCh:  notifyCh,
   236  		numAgents: numAgents,
   237  		connected: make([]bool, numAgents),
   238  
   239  		started: make([]bool, numAgents),
   240  
   241  		quit:     make([]bool, numAgents),
   242  		activeId: -1,
   243  	}
   244  }
   245  
   246  func (g *globalAgentState) String() string {
   247  	return fmt.Sprintf("{active %d; connected %s; started %s; quit %s}",
   248  		g.activeId,
   249  		boolsToStr(g.connected),
   250  		boolsToStr(g.started),
   251  		boolsToStr(g.quit),
   252  	)
   253  }
   254  
   255  func boolsToStr(b []bool) string {
   256  	d := make([]byte, len(b))
   257  	for i, ok := range b {
   258  		if ok {
   259  			d[i] = '1'
   260  		} else {
   261  			d[i] = '0'
   262  		}
   263  	}
   264  	return string(d)
   265  }
   266  
   267  // waitEvent waits for any event to happen and updates g
   268  // accordingly. It ensures that expected invariants are
   269  // maintained - if an invariant is violated, a fatal error
   270  // will be generated using c.
   271  func (g *globalAgentState) waitEvent(c *gc.C) event {
   272  	c.Logf("awaiting event; current state %s", g)
   273  
   274  	possible := g.possibleEvents()
   275  	c.Logf("possible: %q", possible)
   276  
   277  	got := expectNotification(c, g.notifyCh, possible)
   278  	index := got.id - 1
   279  	switch got.kind {
   280  	case "connect":
   281  		g.connected[index] = true
   282  	case "start":
   283  		g.started[index] = true
   284  	case "operation":
   285  		if g.activeId != -1 && g.activeId != got.id {
   286  			c.Fatalf("mixed operations from different agents")
   287  		}
   288  		g.activeId = got.id
   289  	case "stop":
   290  		g.activeId = -1
   291  		g.started[index] = false
   292  	case "quit":
   293  		g.quit[index] = true
   294  		c.Assert(got.info, gc.IsNil)
   295  	default:
   296  		c.Fatalf("unexpected event %q", got)
   297  	}
   298  	return got
   299  }
   300  
   301  func (g *globalAgentState) possibleEvents() []event {
   302  	var possible []event
   303  	for i := 0; i < g.numAgents; i++ {
   304  		isConnected, isStarted, hasQuit := g.connected[i], g.started[i], g.quit[i]
   305  		id := i + 1
   306  		addPossible := func(kind string) {
   307  			possible = append(possible, event{kind: kind, id: id})
   308  		}
   309  		if !isConnected {
   310  			addPossible("connect")
   311  			continue
   312  		}
   313  		if isStarted {
   314  			if g.activeId == -1 || id == g.activeId {
   315  				// If there's no active worker, then we allow
   316  				// any worker to run an operation, but
   317  				// once a worker has successfully run an
   318  				// operation, it will be an error if any
   319  				// other worker runs an operation before
   320  				// the first worker has stopped.
   321  				addPossible("operation")
   322  			}
   323  			// It's always ok for a started worker to stop.
   324  			addPossible("stop")
   325  		} else {
   326  			// connect followed by connect is possible for a worker
   327  			// that's not master.
   328  			addPossible("connect")
   329  
   330  			// We allow any number of workers to start - it's
   331  			// ok as long as none of the extra workers actually
   332  			// manage to complete an operation successfully.
   333  			addPossible("start")
   334  
   335  			if !hasQuit {
   336  				addPossible("quit")
   337  			}
   338  		}
   339  	}
   340  	return possible
   341  }
   342  
   343  func mkEvent(s string) event {
   344  	var e event
   345  	if n, _ := fmt.Sscanf(s, "%s %d", &e.kind, &e.id); n != 2 {
   346  		panic("invalid event " + s)
   347  	}
   348  	return e
   349  }
   350  
   351  func mkEvents(ss ...string) []event {
   352  	events := make([]event, len(ss))
   353  	for i, s := range ss {
   354  		events[i] = mkEvent(s)
   355  	}
   356  	return events
   357  }
   358  
   359  type event struct {
   360  	kind string
   361  	id   int
   362  	info interface{}
   363  }
   364  
   365  func (e event) String() string {
   366  	if e.info != nil {
   367  		return fmt.Sprintf("%s %d %v", e.kind, e.id, e.info)
   368  	} else {
   369  		return fmt.Sprintf("%s %d", e.kind, e.id)
   370  	}
   371  }
   372  
   373  func oneOf(possible ...string) string {
   374  	return strings.Join(possible, "|")
   375  }
   376  
   377  func expectNotification(c *gc.C, notifyCh <-chan event, possible []event) event {
   378  	select {
   379  	case e := <-notifyCh:
   380  		c.Logf("received notification %q", e)
   381  		for _, p := range possible {
   382  			if e.kind == p.kind && e.id == p.id {
   383  				return e
   384  			}
   385  		}
   386  		c.Fatalf("event %q does not match any of %q", e, possible)
   387  		return e
   388  	case <-time.After(testing.LongWait):
   389  		c.Fatalf("timed out waiting for %q", possible)
   390  	}
   391  	panic("unreachable")
   392  }
   393  
   394  func changeVotes(c *gc.C, insts []*testing.MgoInstance, voteId int) {
   395  	c.Logf("changing voting id to %v", voteId)
   396  
   397  	addrs := make([]string, len(insts))
   398  	for i, inst := range insts {
   399  		addrs[i] = inst.Addr()
   400  	}
   401  	dialInfo := testing.MgoDialInfo(addrs...)
   402  
   403  	session, err := mgo.DialWithInfo(dialInfo)
   404  	c.Assert(err, gc.IsNil)
   405  	defer session.Close()
   406  
   407  	members, err := replicaset.CurrentMembers(session)
   408  	c.Assert(err, gc.IsNil)
   409  	c.Assert(members, gc.HasLen, len(insts))
   410  	for i := range members {
   411  		member := &members[i]
   412  		if member.Id == voteId {
   413  			member.Priority = nil
   414  		} else {
   415  			member.Priority = newFloat64(0.1)
   416  		}
   417  	}
   418  	c.Logf("new member set: %#v", members)
   419  	err = replicaset.Set(session, members)
   420  	c.Assert(err, gc.IsNil)
   421  
   422  	c.Logf("successfully changed replica set members")
   423  }
   424  
   425  type notifier struct {
   426  	id int
   427  	ch chan<- event
   428  }
   429  
   430  func (n *notifier) sendEvent(kind string, info interface{}) {
   431  	n.ch <- event{
   432  		id:   n.id,
   433  		kind: kind,
   434  		info: info,
   435  	}
   436  }
   437  
   438  func (n *notifier) workerConnected() {
   439  	n.sendEvent("connect", nil)
   440  }
   441  
   442  func (n *notifier) workerStarted() {
   443  	n.sendEvent("start", nil)
   444  }
   445  
   446  func (n *notifier) workerStopped() {
   447  	n.sendEvent("stop", nil)
   448  }
   449  
   450  func (n *notifier) operation() {
   451  	n.sendEvent("operation", nil)
   452  }
   453  
   454  func (n *notifier) agentQuit(err error) {
   455  	n.sendEvent("quit", err)
   456  }
   457  
   458  type mongoConn struct {
   459  	localHostPort string
   460  	session       *mgo.Session
   461  }
   462  
   463  func (c *mongoConn) Ping() error {
   464  	return c.session.Ping()
   465  }
   466  
   467  func (c *mongoConn) IsMaster() (bool, error) {
   468  	hostPort, err := replicaset.MasterHostPort(c.session)
   469  	if err != nil {
   470  		logger.Errorf("replicaset.MasterHostPort returned error: %v", err)
   471  		return false, err
   472  	}
   473  	logger.Errorf("replicaset.MasterHostPort(%s) returned %s", c.localHostPort, hostPort)
   474  	logger.Errorf("-> %s IsMaster: %v", c.localHostPort, hostPort == c.localHostPort)
   475  	return hostPort == c.localHostPort, nil
   476  }
   477  
   478  const replicaSetName = "juju"
   479  
   480  // startReplicaSet starts up a replica set with n mongo instances.
   481  func startReplicaSet(n int) (_ []*testing.MgoInstance, err error) {
   482  	insts := make([]*testing.MgoInstance, 0, n)
   483  	root, err := newMongoInstance()
   484  	if err != nil {
   485  		return nil, err
   486  	}
   487  	insts = append(insts, root)
   488  	defer func() {
   489  		if err == nil {
   490  			return
   491  		}
   492  		for _, inst := range insts {
   493  			inst.Destroy()
   494  		}
   495  	}()
   496  
   497  	dialInfo := root.DialInfo()
   498  	dialInfo.Direct = true
   499  	dialInfo.Timeout = 60 * time.Second
   500  
   501  	session, err := root.DialDirect()
   502  	if err != nil {
   503  		return nil, fmt.Errorf("cannot dial root instance: %v", err)
   504  	}
   505  	defer session.Close()
   506  
   507  	logger.Infof("dialled root instance")
   508  
   509  	if err := replicaset.Initiate(session, root.Addr(), replicaSetName, nil); err != nil {
   510  		return nil, fmt.Errorf("cannot initiate replica set: %v", err)
   511  	}
   512  	var members []replicaset.Member
   513  	for i := 1; i < n; i++ {
   514  		inst, err := newMongoInstance()
   515  		if err != nil {
   516  			return nil, err
   517  		}
   518  		insts = append(insts, inst)
   519  		members = append(members, replicaset.Member{
   520  			Address:  inst.Addr(),
   521  			Priority: newFloat64(0.1),
   522  			Id:       i + 1,
   523  		})
   524  	}
   525  	attempt := utils.AttemptStrategy{
   526  		Total: 60 * time.Second,
   527  		Delay: 1 * time.Second,
   528  	}
   529  	for a := attempt.Start(); a.Next(); {
   530  		err := replicaset.Add(session, members...)
   531  		if err == nil {
   532  			break
   533  		}
   534  		logger.Errorf("cannot add members: %v", err)
   535  		if !a.HasNext() {
   536  			return nil, fmt.Errorf("timed out trying to add members")
   537  		}
   538  		logger.Errorf("retrying")
   539  	}
   540  	return insts, err
   541  }
   542  
   543  func newMongoInstance() (*testing.MgoInstance, error) {
   544  	inst := &testing.MgoInstance{Params: []string{"--replSet", replicaSetName}}
   545  	if err := inst.Start(true); err != nil {
   546  		return nil, fmt.Errorf("cannot start mongo server: %s", err.Error())
   547  	}
   548  	return inst, nil
   549  }
   550  
   551  func newFloat64(f float64) *float64 {
   552  	return &f
   553  }
   554  
   555  // connectionIsFatal returns a function suitable for passing
   556  // as the isFatal argument to worker.NewRunner,
   557  // that diagnoses an error as fatal if the connection
   558  // has failed or if the error is otherwise fatal.
   559  // Copied from jujud.
   560  func connectionIsFatal(conn singular.Conn) func(err error) bool {
   561  	return func(err error) bool {
   562  		if err := conn.Ping(); err != nil {
   563  			logger.Infof("error pinging %T: %v", conn, err)
   564  			return true
   565  		}
   566  		logger.Infof("error %q is not fatal", err)
   567  		return false
   568  	}
   569  }