github.com/cloud-green/juju@v0.0.0-20151002100041-a00291338d3d/worker/singular/mongo_test.go (about)

     1  // Copyright 2014 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package singular_test
     5  
     6  import (
     7  	"flag"
     8  	"fmt"
     9  	"strings"
    10  	"time"
    11  
    12  	"github.com/juju/loggo"
    13  	"github.com/juju/replicaset"
    14  	gitjujutesting "github.com/juju/testing"
    15  	jc "github.com/juju/testing/checkers"
    16  	"github.com/juju/utils"
    17  	gc "gopkg.in/check.v1"
    18  	"gopkg.in/mgo.v2"
    19  
    20  	"github.com/juju/juju/testing"
    21  	coretesting "github.com/juju/juju/testing"
    22  	"github.com/juju/juju/worker"
    23  	"github.com/juju/juju/worker/singular"
    24  )
    25  
// logger is the loggo logger used by the helpers in this file.
var logger = loggo.GetLogger("juju.singular-test")

// mongoSuite holds the tests that exercise the singular runner
// against real mongo instances. It embeds testing.BaseSuite.
type mongoSuite struct {
	testing.BaseSuite
}

// enableUnreliableTests is set by the -juju.unreliabletests command line
// flag (default false). SetUpSuite skips the whole suite unless it is set.
var enableUnreliableTests = flag.Bool("juju.unreliabletests", false, "enable unreliable and slow tests")

// Hand an instance of the suite to gocheck.
var _ = gc.Suite(&mongoSuite{})
    35  
    36  func (*mongoSuite) SetUpSuite(c *gc.C) {
    37  	if !*enableUnreliableTests {
    38  		c.Skip("skipping unreliable tests")
    39  	}
    40  }
    41  
    42  // start replica set with three mongods
    43  // start singular worker on each one.
    44  // change worker priorities so the master changes.
    45  // check that
    46  // a) there is never more than one running at a time
    47  // b) the running worker changes when the master changes.
    48  
    49  func (*mongoSuite) TestMongoMastership(c *gc.C) {
    50  	insts, err := startReplicaSet(3)
    51  	c.Assert(err, jc.ErrorIsNil)
    52  	for _, inst := range insts {
    53  		defer inst.Destroy()
    54  	}
    55  	notifyCh := make(chan event, 100)
    56  	globalState := newGlobalAgentState(len(insts), notifyCh)
    57  
    58  	agents := startAgents(c, notifyCh, insts)
    59  
    60  	assertAgentsConnect(c, globalState)
    61  
    62  	// Wait for one of the agents to start.
    63  	for globalState.activeId == -1 {
    64  		globalState.waitEvent(c)
    65  	}
    66  	c.Logf("agent %d started; waiting for servers to sync", globalState.activeId)
    67  	time.Sleep(1 * time.Minute)
    68  
    69  	// Try to choose a different agent than the primary to
    70  	// make master (note we can't just do (activeId+1)%len(insts)
    71  	// because ids start at 1 not 0)
    72  	nextId := ((globalState.activeId+1)-1)%len(insts) + 1
    73  
    74  	c.Logf("giving agent %d priority to become master", nextId)
    75  	changeVotes(c, insts, nextId)
    76  
    77  	// Wait for the first agent to stop and another agent
    78  	// to start. Note that because of mongo's vagaries, we
    79  	// cannot be sure which agent will actually start, even
    80  	// though we've set our priorities to hope that a
    81  	// particular mongo instance (nextId) becomes master.
    82  	oldId := globalState.activeId
    83  	oldHasStopped := false
    84  	for {
    85  		if oldHasStopped && globalState.activeId != -1 {
    86  			break
    87  		}
    88  		got := globalState.waitEvent(c)
    89  		if got.kind == "stop" && got.id == oldId {
    90  			oldHasStopped = true
    91  		}
    92  	}
    93  
    94  	// Kill all the agents and wait for them to quit.
    95  	for _, a := range agents {
    96  		if a.Runner == nil {
    97  			panic("runner is nil")
    98  		}
    99  		a.Kill()
   100  	}
   101  
   102  	assertAgentsQuit(c, globalState)
   103  }
   104  
   105  func startAgents(c *gc.C, notifyCh chan<- event, insts []*gitjujutesting.MgoInstance) []*agent {
   106  	agents := make([]*agent, len(insts))
   107  	for i, inst := range insts {
   108  		a := &agent{
   109  			// Note: we use ids starting from 1 to match
   110  			// the replica set ids.
   111  			notify: &notifier{
   112  				id: i + 1,
   113  				ch: notifyCh,
   114  			},
   115  			Runner:   newRunner(),
   116  			hostPort: inst.Addr(),
   117  		}
   118  		go func() {
   119  			err := a.run()
   120  			a.notify.agentQuit(err)
   121  		}()
   122  		agents[i] = a
   123  	}
   124  	return agents
   125  }
   126  
   127  // assertAgentsConnect waits for all the agents to connect.
   128  func assertAgentsConnect(c *gc.C, globalState *globalAgentState) {
   129  	allConnected := func() bool {
   130  		for _, connected := range globalState.connected {
   131  			if !connected {
   132  				return false
   133  			}
   134  		}
   135  		return true
   136  	}
   137  	for !allConnected() {
   138  		globalState.waitEvent(c)
   139  	}
   140  }
   141  
   142  func assertAgentsQuit(c *gc.C, globalState *globalAgentState) {
   143  	allQuit := func() bool {
   144  		for _, quit := range globalState.quit {
   145  			if !quit {
   146  				return false
   147  			}
   148  		}
   149  		return true
   150  	}
   151  	for !allQuit() {
   152  		globalState.waitEvent(c)
   153  	}
   154  }
   155  
// agent is a stand-in for a juju agent: it owns a top-level runner
// and runs a singular worker against one mongo instance.
type agent struct {
	// notify reports this agent's events on the shared event channel.
	notify *notifier
	// Runner is the top-level runner that hosts the mongo worker.
	worker.Runner
	// hostPort is the address of the mongo instance this agent dials.
	hostPort string
}
   161  
   162  func (a *agent) run() error {
   163  	a.Runner.StartWorker(fmt.Sprint("mongo-", a.notify.id), a.mongoWorker)
   164  	return a.Runner.Wait()
   165  }
   166  
   167  func (a *agent) mongoWorker() (worker.Worker, error) {
   168  	dialInfo := gitjujutesting.MgoDialInfo(coretesting.Certs, a.hostPort)
   169  	session, err := mgo.DialWithInfo(dialInfo)
   170  	if err != nil {
   171  		return nil, err
   172  	}
   173  	mc := &mongoConn{
   174  		localHostPort: a.hostPort,
   175  		session:       session,
   176  	}
   177  	runner := worker.NewRunner(
   178  		connectionIsFatal(mc),
   179  		func(err0, err1 error) bool { return true },
   180  	)
   181  	singularRunner, err := singular.New(runner, mc)
   182  	if err != nil {
   183  		return nil, fmt.Errorf("cannot start singular runner: %v", err)
   184  	}
   185  	a.notify.workerConnected()
   186  	singularRunner.StartWorker(fmt.Sprint("worker-", a.notify.id), func() (worker.Worker, error) {
   187  		return worker.NewSimpleWorker(func(stop <-chan struct{}) error {
   188  			return a.worker(session, stop)
   189  		}), nil
   190  	})
   191  	return runner, nil
   192  }
   193  
   194  func (a *agent) worker(session *mgo.Session, stop <-chan struct{}) error {
   195  	a.notify.workerStarted()
   196  	defer a.notify.workerStopped()
   197  	coll := session.DB("foo").C("bar")
   198  	for {
   199  		select {
   200  		case <-stop:
   201  			return nil
   202  		case <-time.After(250 * time.Millisecond):
   203  		}
   204  		if err := coll.Insert(struct{}{}); err != nil {
   205  			return fmt.Errorf("insert error: %v", err)
   206  		}
   207  		a.notify.operation()
   208  	}
   209  }
   210  
// globalAgentState keeps track of the global state
// of all the running "agents". The state is
// updated by the waitEvent method.
// The slices (connected, started and quit) hold an entry for each
// agent - the entry for the agent with id x is held at index x-1.
type globalAgentState struct {
	// numAgents holds the number of agents being tracked.
	numAgents int
	// notifyCh is the channel on which the agents' events arrive.
	notifyCh <-chan event

	// connected reports which agents have ever connected.
	connected []bool

	// started reports which agents have started
	// and not yet stopped.
	started []bool

	// quit reports which agents have quit.
	quit []bool

	// activeId holds the id of the agent that is
	// currently performing operations, or -1 if
	// there is none.
	activeId int
}
   233  
   234  // newGlobalAgentState returns a globalAgentState instance that keeps track
   235  // of the given number of agents which all send events on notifyCh.
   236  func newGlobalAgentState(numAgents int, notifyCh <-chan event) *globalAgentState {
   237  	return &globalAgentState{
   238  		notifyCh:  notifyCh,
   239  		numAgents: numAgents,
   240  		connected: make([]bool, numAgents),
   241  
   242  		started: make([]bool, numAgents),
   243  
   244  		quit:     make([]bool, numAgents),
   245  		activeId: -1,
   246  	}
   247  }
   248  
   249  func (g *globalAgentState) String() string {
   250  	return fmt.Sprintf("{active %d; connected %s; started %s; quit %s}",
   251  		g.activeId,
   252  		boolsToStr(g.connected),
   253  		boolsToStr(g.started),
   254  		boolsToStr(g.quit),
   255  	)
   256  }
   257  
   258  func boolsToStr(b []bool) string {
   259  	d := make([]byte, len(b))
   260  	for i, ok := range b {
   261  		if ok {
   262  			d[i] = '1'
   263  		} else {
   264  			d[i] = '0'
   265  		}
   266  	}
   267  	return string(d)
   268  }
   269  
   270  // waitEvent waits for any event to happen and updates g
   271  // accordingly. It ensures that expected invariants are
   272  // maintained - if an invariant is violated, a fatal error
   273  // will be generated using c.
   274  func (g *globalAgentState) waitEvent(c *gc.C) event {
   275  	c.Logf("awaiting event; current state %s", g)
   276  
   277  	possible := g.possibleEvents()
   278  	c.Logf("possible: %q", possible)
   279  
   280  	got := expectNotification(c, g.notifyCh, possible)
   281  	index := got.id - 1
   282  	switch got.kind {
   283  	case "connect":
   284  		g.connected[index] = true
   285  	case "start":
   286  		g.started[index] = true
   287  	case "operation":
   288  		if g.activeId != -1 && g.activeId != got.id {
   289  			c.Fatalf("mixed operations from different agents")
   290  		}
   291  		g.activeId = got.id
   292  	case "stop":
   293  		g.activeId = -1
   294  		g.started[index] = false
   295  	case "quit":
   296  		g.quit[index] = true
   297  		c.Assert(got.info, gc.IsNil)
   298  	default:
   299  		c.Fatalf("unexpected event %q", got)
   300  	}
   301  	return got
   302  }
   303  
   304  func (g *globalAgentState) possibleEvents() []event {
   305  	var possible []event
   306  	for i := 0; i < g.numAgents; i++ {
   307  		isConnected, isStarted, hasQuit := g.connected[i], g.started[i], g.quit[i]
   308  		id := i + 1
   309  		addPossible := func(kind string) {
   310  			possible = append(possible, event{kind: kind, id: id})
   311  		}
   312  		if !isConnected {
   313  			addPossible("connect")
   314  			continue
   315  		}
   316  		if isStarted {
   317  			if g.activeId == -1 || id == g.activeId {
   318  				// If there's no active worker, then we allow
   319  				// any worker to run an operation, but
   320  				// once a worker has successfully run an
   321  				// operation, it will be an error if any
   322  				// other worker runs an operation before
   323  				// the first worker has stopped.
   324  				addPossible("operation")
   325  			}
   326  			// It's always ok for a started worker to stop.
   327  			addPossible("stop")
   328  		} else {
   329  			// connect followed by connect is possible for a worker
   330  			// that's not master.
   331  			addPossible("connect")
   332  
   333  			// We allow any number of workers to start - it's
   334  			// ok as long as none of the extra workers actually
   335  			// manage to complete an operation successfully.
   336  			addPossible("start")
   337  
   338  			if !hasQuit {
   339  				addPossible("quit")
   340  			}
   341  		}
   342  	}
   343  	return possible
   344  }
   345  
   346  func mkEvent(s string) event {
   347  	var e event
   348  	if n, _ := fmt.Sscanf(s, "%s %d", &e.kind, &e.id); n != 2 {
   349  		panic("invalid event " + s)
   350  	}
   351  	return e
   352  }
   353  
   354  func mkEvents(ss ...string) []event {
   355  	events := make([]event, len(ss))
   356  	for i, s := range ss {
   357  		events[i] = mkEvent(s)
   358  	}
   359  	return events
   360  }
   361  
   362  type event struct {
   363  	kind string
   364  	id   int
   365  	info interface{}
   366  }
   367  
   368  func (e event) String() string {
   369  	if e.info != nil {
   370  		return fmt.Sprintf("%s %d %v", e.kind, e.id, e.info)
   371  	} else {
   372  		return fmt.Sprintf("%s %d", e.kind, e.id)
   373  	}
   374  }
   375  
   376  func oneOf(possible ...string) string {
   377  	return strings.Join(possible, "|")
   378  }
   379  
   380  func expectNotification(c *gc.C, notifyCh <-chan event, possible []event) event {
   381  	select {
   382  	case e := <-notifyCh:
   383  		c.Logf("received notification %q", e)
   384  		for _, p := range possible {
   385  			if e.kind == p.kind && e.id == p.id {
   386  				return e
   387  			}
   388  		}
   389  		c.Fatalf("event %q does not match any of %q", e, possible)
   390  		return e
   391  	case <-time.After(testing.LongWait):
   392  		c.Fatalf("timed out waiting for %q", possible)
   393  	}
   394  	panic("unreachable")
   395  }
   396  
   397  func changeVotes(c *gc.C, insts []*gitjujutesting.MgoInstance, voteId int) {
   398  	c.Logf("changing voting id to %v", voteId)
   399  
   400  	addrs := make([]string, len(insts))
   401  	for i, inst := range insts {
   402  		addrs[i] = inst.Addr()
   403  	}
   404  	dialInfo := gitjujutesting.MgoDialInfo(coretesting.Certs, addrs...)
   405  
   406  	session, err := mgo.DialWithInfo(dialInfo)
   407  	c.Assert(err, jc.ErrorIsNil)
   408  	defer session.Close()
   409  
   410  	members, err := replicaset.CurrentMembers(session)
   411  	c.Assert(err, jc.ErrorIsNil)
   412  	c.Assert(members, gc.HasLen, len(insts))
   413  	for i := range members {
   414  		member := &members[i]
   415  		if member.Id == voteId {
   416  			member.Priority = nil
   417  		} else {
   418  			member.Priority = newFloat64(0.1)
   419  		}
   420  	}
   421  	c.Logf("new member set: %#v", members)
   422  	err = replicaset.Set(session, members)
   423  	c.Assert(err, jc.ErrorIsNil)
   424  
   425  	c.Logf("successfully changed replica set members")
   426  }
   427  
   428  type notifier struct {
   429  	id int
   430  	ch chan<- event
   431  }
   432  
   433  func (n *notifier) sendEvent(kind string, info interface{}) {
   434  	n.ch <- event{
   435  		id:   n.id,
   436  		kind: kind,
   437  		info: info,
   438  	}
   439  }
   440  
   441  func (n *notifier) workerConnected() {
   442  	n.sendEvent("connect", nil)
   443  }
   444  
   445  func (n *notifier) workerStarted() {
   446  	n.sendEvent("start", nil)
   447  }
   448  
   449  func (n *notifier) workerStopped() {
   450  	n.sendEvent("stop", nil)
   451  }
   452  
   453  func (n *notifier) operation() {
   454  	n.sendEvent("operation", nil)
   455  }
   456  
   457  func (n *notifier) agentQuit(err error) {
   458  	n.sendEvent("quit", err)
   459  }
   460  
   461  type mongoConn struct {
   462  	localHostPort string
   463  	session       *mgo.Session
   464  }
   465  
   466  func (c *mongoConn) Ping() error {
   467  	return c.session.Ping()
   468  }
   469  
   470  func (c *mongoConn) IsMaster() (bool, error) {
   471  	hostPort, err := replicaset.MasterHostPort(c.session)
   472  	if err != nil {
   473  		logger.Errorf("replicaset.MasterHostPort returned error: %v", err)
   474  		return false, err
   475  	}
   476  	logger.Errorf("replicaset.MasterHostPort(%s) returned %s", c.localHostPort, hostPort)
   477  	logger.Errorf("-> %s IsMaster: %v", c.localHostPort, hostPort == c.localHostPort)
   478  	return hostPort == c.localHostPort, nil
   479  }
   480  
// replicaSetName is the replica set name passed to each mongo
// instance via --replSet and used when initiating the replica set.
const replicaSetName = "juju"
   482  
   483  // startReplicaSet starts up a replica set with n mongo instances.
   484  func startReplicaSet(n int) (_ []*gitjujutesting.MgoInstance, err error) {
   485  	insts := make([]*gitjujutesting.MgoInstance, 0, n)
   486  	root, err := newMongoInstance()
   487  	if err != nil {
   488  		return nil, err
   489  	}
   490  	insts = append(insts, root)
   491  	defer func() {
   492  		if err == nil {
   493  			return
   494  		}
   495  		for _, inst := range insts {
   496  			inst.Destroy()
   497  		}
   498  	}()
   499  
   500  	dialInfo := root.DialInfo()
   501  	dialInfo.Direct = true
   502  	dialInfo.Timeout = 60 * time.Second
   503  
   504  	session, err := root.DialDirect()
   505  	if err != nil {
   506  		return nil, fmt.Errorf("cannot dial root instance: %v", err)
   507  	}
   508  	defer session.Close()
   509  
   510  	logger.Infof("dialled root instance")
   511  
   512  	if err := replicaset.Initiate(session, root.Addr(), replicaSetName, nil); err != nil {
   513  		return nil, fmt.Errorf("cannot initiate replica set: %v", err)
   514  	}
   515  	var members []replicaset.Member
   516  	for i := 1; i < n; i++ {
   517  		inst, err := newMongoInstance()
   518  		if err != nil {
   519  			return nil, err
   520  		}
   521  		insts = append(insts, inst)
   522  		members = append(members, replicaset.Member{
   523  			Address:  inst.Addr(),
   524  			Priority: newFloat64(0.1),
   525  			Id:       i + 1,
   526  		})
   527  	}
   528  	attempt := utils.AttemptStrategy{
   529  		Total: 60 * time.Second,
   530  		Delay: 1 * time.Second,
   531  	}
   532  	for a := attempt.Start(); a.Next(); {
   533  		err := replicaset.Add(session, members...)
   534  		if err == nil {
   535  			break
   536  		}
   537  		logger.Errorf("cannot add members: %v", err)
   538  		if !a.HasNext() {
   539  			return nil, fmt.Errorf("timed out trying to add members")
   540  		}
   541  		logger.Errorf("retrying")
   542  	}
   543  	return insts, err
   544  }
   545  
   546  func newMongoInstance() (*gitjujutesting.MgoInstance, error) {
   547  	inst := &gitjujutesting.MgoInstance{Params: []string{"--replSet", replicaSetName}}
   548  	if err := inst.Start(testing.Certs); err != nil {
   549  		return nil, fmt.Errorf("cannot start mongo server: %s", err.Error())
   550  	}
   551  	return inst, nil
   552  }
   553  
   554  func newFloat64(f float64) *float64 {
   555  	return &f
   556  }
   557  
   558  // connectionIsFatal returns a function suitable for passing
   559  // as the isFatal argument to worker.NewRunner,
   560  // that diagnoses an error as fatal if the connection
   561  // has failed or if the error is otherwise fatal.
   562  // Copied from jujud.
   563  func connectionIsFatal(conn singular.Conn) func(err error) bool {
   564  	return func(err error) bool {
   565  		if err := conn.Ping(); err != nil {
   566  			logger.Infof("error pinging %T: %v", conn, err)
   567  			return true
   568  		}
   569  		logger.Infof("error %q is not fatal", err)
   570  		return false
   571  	}
   572  }