github.com/wallyworld/juju@v0.0.0-20161013125918-6cf1bc9d917a/worker/singular/mongo_test.go (about)

     1  // Copyright 2014 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package singular_test
     5  
     6  import (
     7  	"flag"
     8  	"fmt"
     9  	"strings"
    10  	"time"
    11  
    12  	"github.com/juju/loggo"
    13  	"github.com/juju/replicaset"
    14  	gitjujutesting "github.com/juju/testing"
    15  	jc "github.com/juju/testing/checkers"
    16  	"github.com/juju/utils"
    17  	gc "gopkg.in/check.v1"
    18  	"gopkg.in/mgo.v2"
    19  
    20  	"github.com/juju/juju/testing"
    21  	coretesting "github.com/juju/juju/testing"
    22  	"github.com/juju/juju/worker"
    23  	"github.com/juju/juju/worker/singular"
    24  )
    25  
// logger is the package-level logger used by the helpers in this file.
// It is obtained from loggo under the name "juju.singular-test".
var logger = loggo.GetLogger("juju.singular-test")
    27  
// mongoSuite is the gocheck suite holding the tests in this file.
// It embeds testing.BaseSuite.
type mongoSuite struct {
	testing.BaseSuite
}

// Register mongoSuite with gocheck.
var _ = gc.Suite(&mongoSuite{})
    33  
// enableUnreliableTests is bound to the -juju.unreliabletests command-line
// flag (default false). SetUpSuite skips the suite unless it is set.
var enableUnreliableTests = flag.Bool("juju.unreliabletests", false, "enable unreliable and slow tests")
    35  
    36  func (*mongoSuite) SetUpSuite(c *gc.C) {
    37  	if !*enableUnreliableTests {
    38  		c.Skip("skipping unreliable tests")
    39  	}
    40  }
    41  
// TestMongoMastership exercises singular workers against a real replica set:
//
//   - start a replica set with three mongods;
//   - start a singular worker on each one;
//   - change the member priorities so the master changes;
//   - check that
//     a) there is never more than one worker running at a time, and
//     b) the running worker changes when the master changes.
func (*mongoSuite) TestMongoMastership(c *gc.C) {
	insts, err := startReplicaSet(3)
	c.Assert(err, jc.ErrorIsNil)
	// These defers run when the test function returns, so every
	// instance stays up for the whole test.
	for _, inst := range insts {
		defer inst.Destroy()
	}
	// All agents report their events on this single buffered channel;
	// globalState consumes them and tracks the invariants.
	notifyCh := make(chan event, 100)
	globalState := newGlobalAgentState(len(insts), notifyCh)

	agents := startAgents(c, notifyCh, insts)

	assertAgentsConnect(c, globalState)

	// Wait for one of the agents to start. activeId is only set by an
	// "operation" event, so this loop ends once some agent has actually
	// completed an operation.
	for globalState.activeId == -1 {
		globalState.waitEvent(c)
	}
	c.Logf("agent %d started; waiting for servers to sync", globalState.activeId)
	time.Sleep(1 * time.Minute)

	// Try to choose a different agent than the primary to
	// make master (note we can't just do (activeId+1)%len(insts)
	// because ids start at 1 not 0). The expression below reduces to
	// activeId%len(insts) + 1, i.e. the next id in 1..len(insts),
	// wrapping around to 1.
	nextId := ((globalState.activeId+1)-1)%len(insts) + 1

	c.Logf("giving agent %d priority to become master", nextId)
	changeVotes(c, insts, nextId)

	// Wait for the first agent to stop and another agent
	// to start. Note that because of mongo's vagaries, we
	// cannot be sure which agent will actually start, even
	// though we've set our priorities to hope that a
	// particular mongo instance (nextId) becomes master.
	oldId := globalState.activeId
	oldHasStopped := false
	for {
		// Done once the old agent has stopped and some agent (possibly
		// the same one again) has performed an operation since.
		if oldHasStopped && globalState.activeId != -1 {
			break
		}
		got := globalState.waitEvent(c)
		if got.kind == "stop" && got.id == oldId {
			oldHasStopped = true
		}
	}

	// Kill all the agents and wait for them to quit.
	for _, a := range agents {
		if a.Runner == nil {
			panic("runner is nil")
		}
		a.Kill()
	}

	assertAgentsQuit(c, globalState)
}
   104  
   105  func startAgents(c *gc.C, notifyCh chan<- event, insts []*gitjujutesting.MgoInstance) []*testAgent {
   106  	agents := make([]*testAgent, len(insts))
   107  	for i, inst := range insts {
   108  		a := &testAgent{
   109  			// Note: we use ids starting from 1 to match
   110  			// the replica set ids.
   111  			notify: &notifier{
   112  				id: i + 1,
   113  				ch: notifyCh,
   114  			},
   115  			Runner:   newRunner(),
   116  			hostPort: inst.Addr(),
   117  		}
   118  		go func() {
   119  			err := a.run()
   120  			a.notify.agentQuit(err)
   121  		}()
   122  		agents[i] = a
   123  	}
   124  	return agents
   125  }
   126  
   127  // assertAgentsConnect waits for all the agents to connect.
   128  func assertAgentsConnect(c *gc.C, globalState *globalAgentState) {
   129  	allConnected := func() bool {
   130  		for _, connected := range globalState.connected {
   131  			if !connected {
   132  				return false
   133  			}
   134  		}
   135  		return true
   136  	}
   137  	for !allConnected() {
   138  		globalState.waitEvent(c)
   139  	}
   140  }
   141  
   142  func assertAgentsQuit(c *gc.C, globalState *globalAgentState) {
   143  	allQuit := func() bool {
   144  		for _, quit := range globalState.quit {
   145  			if !quit {
   146  				return false
   147  			}
   148  		}
   149  		return true
   150  	}
   151  	for !allQuit() {
   152  		globalState.waitEvent(c)
   153  	}
   154  }
   155  
// testAgent is one simulated agent in the test. It embeds a worker.Runner,
// so the Runner's methods (e.g. Kill) can be called on the agent directly.
type testAgent struct {
	// notify reports this agent's events (connect, start, operation,
	// stop, quit) tagged with the agent's id.
	notify *notifier
	worker.Runner
	// hostPort is the address of the mongo instance this agent dials.
	hostPort string
}
   161  
   162  func (a *testAgent) run() error {
   163  	a.Runner.StartWorker(fmt.Sprint("mongo-", a.notify.id), a.mongoWorker)
   164  	return a.Runner.Wait()
   165  }
   166  
   167  func (a *testAgent) mongoWorker() (worker.Worker, error) {
   168  	dialInfo := gitjujutesting.MgoDialInfo(coretesting.Certs, a.hostPort)
   169  	session, err := mgo.DialWithInfo(dialInfo)
   170  	if err != nil {
   171  		return nil, err
   172  	}
   173  	mc := &mongoConn{
   174  		localHostPort: a.hostPort,
   175  		session:       session,
   176  	}
   177  
   178  	fn := func(err0, err1 error) bool { return true }
   179  	runner := worker.NewRunner(connectionIsFatal(mc), fn, worker.RestartDelay)
   180  	singularRunner, err := singular.New(runner, mc)
   181  	if err != nil {
   182  		return nil, fmt.Errorf("cannot start singular runner: %v", err)
   183  	}
   184  	a.notify.workerConnected()
   185  	singularRunner.StartWorker(fmt.Sprint("worker-", a.notify.id), func() (worker.Worker, error) {
   186  		return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
   187  			return a.worker(session, stop)
   188  		}), nil
   189  	})
   190  	return runner, nil
   191  }
   192  
   193  func (a *testAgent) worker(session *mgo.Session, stop <-chan struct{}) error {
   194  	a.notify.workerStarted()
   195  	defer a.notify.workerStopped()
   196  	coll := session.DB("foo").C("bar")
   197  	for {
   198  		select {
   199  		case <-stop:
   200  			return nil
   201  		case <-time.After(250 * time.Millisecond):
   202  		}
   203  		if err := coll.Insert(struct{}{}); err != nil {
   204  			return fmt.Errorf("insert error: %v", err)
   205  		}
   206  		a.notify.operation()
   207  	}
   208  }
   209  
// globalAgentState keeps track of the global state
// of all the running "agents". The state is
// updated by the waitEvent method.
// The slices (connected, started and quit) hold an entry for each
// agent - the entry for the agent with id x is held at index x-1.
type globalAgentState struct {
	// numAgents is the number of agents being tracked.
	numAgents int
	// notifyCh is the channel on which agents' events are received.
	notifyCh <-chan event

	// connected reports which agents have ever connected.
	connected []bool

	// started reports which agents have started. An entry is set by
	// a "start" event and cleared again by a "stop" event.
	started []bool

	// quit reports which agents have quit.
	quit []bool

	// activeId holds the id of the agent that is
	// currently performing operations, or -1 if there is none.
	activeId int
}
   232  
   233  // newGlobalAgentState returns a globalAgentState instance that keeps track
   234  // of the given number of agents which all send events on notifyCh.
   235  func newGlobalAgentState(numAgents int, notifyCh <-chan event) *globalAgentState {
   236  	return &globalAgentState{
   237  		notifyCh:  notifyCh,
   238  		numAgents: numAgents,
   239  		connected: make([]bool, numAgents),
   240  
   241  		started: make([]bool, numAgents),
   242  
   243  		quit:     make([]bool, numAgents),
   244  		activeId: -1,
   245  	}
   246  }
   247  
   248  func (g *globalAgentState) String() string {
   249  	return fmt.Sprintf("{active %d; connected %s; started %s; quit %s}",
   250  		g.activeId,
   251  		boolsToStr(g.connected),
   252  		boolsToStr(g.started),
   253  		boolsToStr(g.quit),
   254  	)
   255  }
   256  
   257  func boolsToStr(b []bool) string {
   258  	d := make([]byte, len(b))
   259  	for i, ok := range b {
   260  		if ok {
   261  			d[i] = '1'
   262  		} else {
   263  			d[i] = '0'
   264  		}
   265  	}
   266  	return string(d)
   267  }
   268  
   269  // waitEvent waits for any event to happen and updates g
   270  // accordingly. It ensures that expected invariants are
   271  // maintained - if an invariant is violated, a fatal error
   272  // will be generated using c.
   273  func (g *globalAgentState) waitEvent(c *gc.C) event {
   274  	c.Logf("awaiting event; current state %s", g)
   275  
   276  	possible := g.possibleEvents()
   277  	c.Logf("possible: %q", possible)
   278  
   279  	got := expectNotification(c, g.notifyCh, possible)
   280  	index := got.id - 1
   281  	switch got.kind {
   282  	case "connect":
   283  		g.connected[index] = true
   284  	case "start":
   285  		g.started[index] = true
   286  	case "operation":
   287  		if g.activeId != -1 && g.activeId != got.id {
   288  			c.Fatalf("mixed operations from different agents")
   289  		}
   290  		g.activeId = got.id
   291  	case "stop":
   292  		g.activeId = -1
   293  		g.started[index] = false
   294  	case "quit":
   295  		g.quit[index] = true
   296  		c.Assert(got.info, gc.IsNil)
   297  	default:
   298  		c.Fatalf("unexpected event %q", got)
   299  	}
   300  	return got
   301  }
   302  
   303  func (g *globalAgentState) possibleEvents() []event {
   304  	var possible []event
   305  	for i := 0; i < g.numAgents; i++ {
   306  		isConnected, isStarted, hasQuit := g.connected[i], g.started[i], g.quit[i]
   307  		id := i + 1
   308  		addPossible := func(kind string) {
   309  			possible = append(possible, event{kind: kind, id: id})
   310  		}
   311  		if !isConnected {
   312  			addPossible("connect")
   313  			continue
   314  		}
   315  		if isStarted {
   316  			if g.activeId == -1 || id == g.activeId {
   317  				// If there's no active worker, then we allow
   318  				// any worker to run an operation, but
   319  				// once a worker has successfully run an
   320  				// operation, it will be an error if any
   321  				// other worker runs an operation before
   322  				// the first worker has stopped.
   323  				addPossible("operation")
   324  			}
   325  			// It's always ok for a started worker to stop.
   326  			addPossible("stop")
   327  		} else {
   328  			// connect followed by connect is possible for a worker
   329  			// that's not master.
   330  			addPossible("connect")
   331  
   332  			// We allow any number of workers to start - it's
   333  			// ok as long as none of the extra workers actually
   334  			// manage to complete an operation successfully.
   335  			addPossible("start")
   336  
   337  			if !hasQuit {
   338  				addPossible("quit")
   339  			}
   340  		}
   341  	}
   342  	return possible
   343  }
   344  
   345  func mkEvent(s string) event {
   346  	var e event
   347  	if n, _ := fmt.Sscanf(s, "%s %d", &e.kind, &e.id); n != 2 {
   348  		panic("invalid event " + s)
   349  	}
   350  	return e
   351  }
   352  
   353  func mkEvents(ss ...string) []event {
   354  	events := make([]event, len(ss))
   355  	for i, s := range ss {
   356  		events[i] = mkEvent(s)
   357  	}
   358  	return events
   359  }
   360  
   361  type event struct {
   362  	kind string
   363  	id   int
   364  	info interface{}
   365  }
   366  
   367  func (e event) String() string {
   368  	if e.info != nil {
   369  		return fmt.Sprintf("%s %d %v", e.kind, e.id, e.info)
   370  	} else {
   371  		return fmt.Sprintf("%s %d", e.kind, e.id)
   372  	}
   373  }
   374  
   375  func oneOf(possible ...string) string {
   376  	return strings.Join(possible, "|")
   377  }
   378  
   379  func expectNotification(c *gc.C, notifyCh <-chan event, possible []event) event {
   380  	select {
   381  	case e := <-notifyCh:
   382  		c.Logf("received notification %q", e)
   383  		for _, p := range possible {
   384  			if e.kind == p.kind && e.id == p.id {
   385  				return e
   386  			}
   387  		}
   388  		c.Fatalf("event %q does not match any of %q", e, possible)
   389  		return e
   390  	case <-time.After(testing.LongWait):
   391  		c.Fatalf("timed out waiting for %q", possible)
   392  	}
   393  	panic("unreachable")
   394  }
   395  
   396  func changeVotes(c *gc.C, insts []*gitjujutesting.MgoInstance, voteId int) {
   397  	c.Logf("changing voting id to %v", voteId)
   398  
   399  	addrs := make([]string, len(insts))
   400  	for i, inst := range insts {
   401  		addrs[i] = inst.Addr()
   402  	}
   403  	dialInfo := gitjujutesting.MgoDialInfo(coretesting.Certs, addrs...)
   404  
   405  	session, err := mgo.DialWithInfo(dialInfo)
   406  	c.Assert(err, jc.ErrorIsNil)
   407  	defer session.Close()
   408  
   409  	members, err := replicaset.CurrentMembers(session)
   410  	c.Assert(err, jc.ErrorIsNil)
   411  	c.Assert(members, gc.HasLen, len(insts))
   412  	for i := range members {
   413  		member := &members[i]
   414  		if member.Id == voteId {
   415  			member.Priority = nil
   416  		} else {
   417  			member.Priority = newFloat64(0.1)
   418  		}
   419  	}
   420  	c.Logf("new member set: %#v", members)
   421  	err = replicaset.Set(session, members)
   422  	c.Assert(err, jc.ErrorIsNil)
   423  
   424  	c.Logf("successfully changed replica set members")
   425  }
   426  
   427  type notifier struct {
   428  	id int
   429  	ch chan<- event
   430  }
   431  
   432  func (n *notifier) sendEvent(kind string, info interface{}) {
   433  	n.ch <- event{
   434  		id:   n.id,
   435  		kind: kind,
   436  		info: info,
   437  	}
   438  }
   439  
   440  func (n *notifier) workerConnected() {
   441  	n.sendEvent("connect", nil)
   442  }
   443  
   444  func (n *notifier) workerStarted() {
   445  	n.sendEvent("start", nil)
   446  }
   447  
   448  func (n *notifier) workerStopped() {
   449  	n.sendEvent("stop", nil)
   450  }
   451  
   452  func (n *notifier) operation() {
   453  	n.sendEvent("operation", nil)
   454  }
   455  
   456  func (n *notifier) agentQuit(err error) {
   457  	n.sendEvent("quit", err)
   458  }
   459  
   460  type mongoConn struct {
   461  	localHostPort string
   462  	session       *mgo.Session
   463  }
   464  
   465  func (c *mongoConn) Ping() error {
   466  	return c.session.Ping()
   467  }
   468  
   469  func (c *mongoConn) IsMaster() (bool, error) {
   470  	hostPort, err := replicaset.MasterHostPort(c.session)
   471  	if err != nil {
   472  		logger.Errorf("replicaset.MasterHostPort returned error: %v", err)
   473  		return false, err
   474  	}
   475  	logger.Errorf("replicaset.MasterHostPort(%s) returned %s", c.localHostPort, hostPort)
   476  	logger.Errorf("-> %s IsMaster: %v", c.localHostPort, hostPort == c.localHostPort)
   477  	return hostPort == c.localHostPort, nil
   478  }
   479  
// replicaSetName is the replica set name passed to each mongod via
// --replSet and to replicaset.Initiate.
const replicaSetName = "juju"
   481  
// startReplicaSet starts up a replica set with n mongo instances.
//
// It starts a root instance and initiates the replica set on it, then
// starts n-1 further instances and adds them as members with priority
// 0.1 and ids 2..n, retrying the add until it succeeds or the attempt
// strategy is exhausted. The returned slice holds the root instance
// first, followed by the others in start order.
//
// If an error is returned, every instance started so far has been
// destroyed.
func startReplicaSet(n int) (_ []*gitjujutesting.MgoInstance, err error) {
	insts := make([]*gitjujutesting.MgoInstance, 0, n)
	root, err := newMongoInstance()
	if err != nil {
		return nil, err
	}
	insts = append(insts, root)
	// Clean up on failure. err here is the named result, so this sees
	// the error set by any return statement below, including returns
	// made from inner scopes where err is shadowed.
	defer func() {
		if err == nil {
			return
		}
		for _, inst := range insts {
			inst.Destroy()
		}
	}()

	// NOTE(review): dialInfo is configured here but never used below -
	// the session comes from root.DialDirect() instead. Confirm whether
	// the 60s timeout was meant to apply to that dial.
	dialInfo := root.DialInfo()
	dialInfo.Direct = true
	dialInfo.Timeout = 60 * time.Second

	session, err := root.DialDirect()
	if err != nil {
		return nil, fmt.Errorf("cannot dial root instance: %v", err)
	}
	defer session.Close()

	logger.Infof("dialled root instance")

	if err := replicaset.Initiate(session, root.Addr(), replicaSetName, nil); err != nil {
		return nil, fmt.Errorf("cannot initiate replica set: %v", err)
	}
	// Start the remaining n-1 instances and describe them as members.
	// Their ids start at 2 because i starts at 1.
	var members []replicaset.Member
	for i := 1; i < n; i++ {
		inst, err := newMongoInstance()
		if err != nil {
			return nil, err
		}
		insts = append(insts, inst)
		members = append(members, replicaset.Member{
			Address:  inst.Addr(),
			Priority: newFloat64(0.1),
			Id:       i + 1,
		})
	}
	// TODO(katco): 2016-08-09: lp:1611427
	attempt := utils.AttemptStrategy{
		Total: 60 * time.Second,
		Delay: 1 * time.Second,
	}
	// Keep trying to add the members until it works or the attempt
	// strategy has no tries left. The err declared inside the loop
	// shadows the named result.
	for a := attempt.Start(); a.Next(); {
		err := replicaset.Add(session, members...)
		if err == nil {
			break
		}
		logger.Errorf("cannot add members: %v", err)
		if !a.HasNext() {
			return nil, fmt.Errorf("timed out trying to add members")
		}
		logger.Errorf("retrying")
	}
	// err is the named result, last assigned by root.DialDirect above,
	// so it is nil at this point.
	return insts, err
}
   545  
   546  func newMongoInstance() (*gitjujutesting.MgoInstance, error) {
   547  	inst := &gitjujutesting.MgoInstance{Params: []string{"--replSet", replicaSetName}}
   548  	if err := inst.Start(testing.Certs); err != nil {
   549  		return nil, fmt.Errorf("cannot start mongo server: %s", err.Error())
   550  	}
   551  	return inst, nil
   552  }
   553  
   554  func newFloat64(f float64) *float64 {
   555  	return &f
   556  }
   557  
   558  // connectionIsFatal returns a function suitable for passing
   559  // as the isFatal argument to worker.NewRunner,
   560  // that diagnoses an error as fatal if the connection
   561  // has failed or if the error is otherwise fatal.
   562  // Copied from jujud.
   563  func connectionIsFatal(conn singular.Conn) func(err error) bool {
   564  	return func(err error) bool {
   565  		if err := conn.Ping(); err != nil {
   566  			logger.Infof("error pinging %T: %v", conn, err)
   567  			return true
   568  		}
   569  		logger.Infof("error %q is not fatal", err)
   570  		return false
   571  	}
   572  }