github.com/rogpeppe/juju@v0.0.0-20140613142852-6337964b789e/worker/peergrouper/worker_test.go (about)

     1  // Copyright 2014 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package peergrouper
     5  
     6  import (
     7  	"errors"
     8  	"fmt"
     9  	"time"
    10  
    11  	jc "github.com/juju/testing/checkers"
    12  	"github.com/juju/utils/voyeur"
    13  	gc "launchpad.net/gocheck"
    14  
    15  	"github.com/juju/juju/instance"
    16  	"github.com/juju/juju/juju/testing"
    17  	"github.com/juju/juju/network"
    18  	statetesting "github.com/juju/juju/state/testing"
    19  	coretesting "github.com/juju/juju/testing"
    20  	"github.com/juju/juju/worker"
    21  )
    22  
    23  type workerJujuConnSuite struct {
    24  	testing.JujuConnSuite
    25  }
    26  
    27  var _ = gc.Suite(&workerJujuConnSuite{})
    28  
    29  func (s *workerJujuConnSuite) TestStartStop(c *gc.C) {
    30  	w, err := New(s.State)
    31  	c.Assert(err, gc.IsNil)
    32  	err = worker.Stop(w)
    33  	c.Assert(err, gc.IsNil)
    34  }
    35  
    36  func (s *workerJujuConnSuite) TestPublisherSetsAPIHostPorts(c *gc.C) {
    37  	st := newFakeState()
    38  	initState(c, st, 3)
    39  
    40  	watcher := s.State.WatchAPIHostPorts()
    41  	cwatch := statetesting.NewNotifyWatcherC(c, s.State, watcher)
    42  	cwatch.AssertOneChange()
    43  
    44  	statePublish := newPublisher(s.State)
    45  
    46  	// Wrap the publisher so that we can call StartSync immediately
    47  	// after the publishAPIServers method is called.
    48  	publish := func(apiServers [][]network.HostPort, instanceIds []instance.Id) error {
    49  		err := statePublish.publishAPIServers(apiServers, instanceIds)
    50  		s.State.StartSync()
    51  		return err
    52  	}
    53  
    54  	w := newWorker(st, publisherFunc(publish))
    55  	defer func() {
    56  		c.Check(worker.Stop(w), gc.IsNil)
    57  	}()
    58  
    59  	cwatch.AssertOneChange()
    60  	hps, err := s.State.APIHostPorts()
    61  	c.Assert(err, gc.IsNil)
    62  	assertAPIHostPorts(c, hps, expectedAPIHostPorts(3))
    63  }
    64  
    65  type workerSuite struct {
    66  	coretesting.BaseSuite
    67  }
    68  
    69  var _ = gc.Suite(&workerSuite{})
    70  
    71  func (s *workerSuite) SetUpTest(c *gc.C) {
    72  	s.BaseSuite.SetUpTest(c)
    73  	resetErrors()
    74  }
    75  
    76  // initState initializes the fake state with a single
    77  // replicaset member and numMachines machines
    78  // primed to vote.
    79  func initState(c *gc.C, st *fakeState, numMachines int) {
    80  	var ids []string
    81  	for i := 10; i < 10+numMachines; i++ {
    82  		id := fmt.Sprint(i)
    83  		m := st.addMachine(id, true)
    84  		m.setInstanceId(instance.Id("id-" + id))
    85  		m.setStateHostPort(fmt.Sprintf("0.1.2.%d:%d", i, mongoPort))
    86  		ids = append(ids, id)
    87  		c.Assert(m.MongoHostPorts(), gc.HasLen, 1)
    88  
    89  		m.setAPIHostPorts(addressesWithPort(apiPort, fmt.Sprintf("0.1.2.%d", i)))
    90  	}
    91  	st.machine("10").SetHasVote(true)
    92  	st.setStateServers(ids...)
    93  	st.session.Set(mkMembers("0v"))
    94  	st.session.setStatus(mkStatuses("0p"))
    95  	st.check = checkInvariants
    96  }
    97  
    98  // expectedAPIHostPorts returns the expected addresses
    99  // of the machines as created by initState.
   100  func expectedAPIHostPorts(n int) [][]network.HostPort {
   101  	servers := make([][]network.HostPort, n)
   102  	for i := range servers {
   103  		servers[i] = []network.HostPort{{
   104  			Address: network.NewAddress(fmt.Sprintf("0.1.2.%d", i+10), network.ScopeUnknown),
   105  			Port:    apiPort,
   106  		}}
   107  	}
   108  	return servers
   109  }
   110  
   111  func addressesWithPort(port int, addrs ...string) []network.HostPort {
   112  	return network.AddressesWithPort(network.NewAddresses(addrs...), port)
   113  }
   114  
   115  func (s *workerSuite) TestSetsAndUpdatesMembers(c *gc.C) {
   116  	s.PatchValue(&pollInterval, 5*time.Millisecond)
   117  
   118  	st := newFakeState()
   119  	initState(c, st, 3)
   120  
   121  	memberWatcher := st.session.members.Watch()
   122  	mustNext(c, memberWatcher)
   123  	assertMembers(c, memberWatcher.Value(), mkMembers("0v"))
   124  
   125  	logger.Infof("starting worker")
   126  	w := newWorker(st, noPublisher{})
   127  	defer func() {
   128  		c.Check(worker.Stop(w), gc.IsNil)
   129  	}()
   130  
   131  	// Wait for the worker to set the initial members.
   132  	mustNext(c, memberWatcher)
   133  	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1 2"))
   134  
   135  	// Update the status of the new members
   136  	// and check that they become voting.
   137  	c.Logf("updating new member status")
   138  	st.session.setStatus(mkStatuses("0p 1s 2s"))
   139  	mustNext(c, memberWatcher)
   140  	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1v 2v"))
   141  
   142  	c.Logf("adding another machine")
   143  	// Add another machine.
   144  	m13 := st.addMachine("13", false)
   145  	m13.setStateHostPort(fmt.Sprintf("0.1.2.%d:%d", 13, mongoPort))
   146  	st.setStateServers("10", "11", "12", "13")
   147  
   148  	c.Logf("waiting for new member to be added")
   149  	mustNext(c, memberWatcher)
   150  	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1v 2v 3"))
   151  
   152  	// Remove vote from an existing member;
   153  	// and give it to the new machine.
   154  	// Also set the status of the new machine to
   155  	// healthy.
   156  	c.Logf("removing vote from machine 10 and adding it to machine 13")
   157  	st.machine("10").setWantsVote(false)
   158  	st.machine("13").setWantsVote(true)
   159  
   160  	st.session.setStatus(mkStatuses("0p 1s 2s 3s"))
   161  
   162  	// Check that the new machine gets the vote and the
   163  	// old machine loses it.
   164  	c.Logf("waiting for vote switch")
   165  	mustNext(c, memberWatcher)
   166  	assertMembers(c, memberWatcher.Value(), mkMembers("0 1v 2v 3v"))
   167  
   168  	c.Logf("removing old machine")
   169  	// Remove the old machine.
   170  	st.removeMachine("10")
   171  	st.setStateServers("11", "12", "13")
   172  
   173  	// Check that it's removed from the members.
   174  	c.Logf("waiting for removal")
   175  	mustNext(c, memberWatcher)
   176  	assertMembers(c, memberWatcher.Value(), mkMembers("1v 2v 3v"))
   177  }
   178  
   179  func (s *workerSuite) TestHasVoteMaintainedEvenWhenReplicaSetFails(c *gc.C) {
   180  	st := newFakeState()
   181  
   182  	// Simulate a state where we have four state servers,
   183  	// one has gone down, and we're replacing it:
   184  	// 0 - hasvote true, wantsvote false, down
   185  	// 1 - hasvote true, wantsvote true
   186  	// 2 - hasvote true, wantsvote true
   187  	// 3 - hasvote false, wantsvote true
   188  	//
   189  	// When it starts, the worker should move the vote from
   190  	// 0 to 3. We'll arrange things so that it will succeed in
   191  	// setting the membership but fail setting the HasVote
   192  	// to false.
   193  	initState(c, st, 4)
   194  	st.machine("10").SetHasVote(true)
   195  	st.machine("11").SetHasVote(true)
   196  	st.machine("12").SetHasVote(true)
   197  	st.machine("13").SetHasVote(false)
   198  
   199  	st.machine("10").setWantsVote(false)
   200  	st.machine("11").setWantsVote(true)
   201  	st.machine("12").setWantsVote(true)
   202  	st.machine("13").setWantsVote(true)
   203  
   204  	st.session.Set(mkMembers("0v 1v 2v 3"))
   205  	st.session.setStatus(mkStatuses("0H 1p 2s 3s"))
   206  
   207  	// Make the worker fail to set HasVote to false
   208  	// after changing the replica set membership.
   209  	setErrorFor("Machine.SetHasVote * false", errors.New("frood"))
   210  
   211  	memberWatcher := st.session.members.Watch()
   212  	mustNext(c, memberWatcher)
   213  	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1v 2v 3"))
   214  
   215  	w := newWorker(st, noPublisher{})
   216  	done := make(chan error)
   217  	go func() {
   218  		done <- w.Wait()
   219  	}()
   220  
   221  	// Wait for the worker to set the initial members.
   222  	mustNext(c, memberWatcher)
   223  	assertMembers(c, memberWatcher.Value(), mkMembers("0 1v 2v 3v"))
   224  
   225  	// The worker should encounter an error setting the
   226  	// has-vote status to false and exit.
   227  	select {
   228  	case err := <-done:
   229  		c.Assert(err, gc.ErrorMatches, `cannot set voting status of "[0-9]+" to false: frood`)
   230  	case <-time.After(coretesting.LongWait):
   231  		c.Fatalf("timed out waiting for worker to exit")
   232  	}
   233  
   234  	// Start the worker again - although the membership should
   235  	// not change, the HasVote status should be updated correctly.
   236  	resetErrors()
   237  	w = newWorker(st, noPublisher{})
   238  
   239  	// Watch all the machines for changes, so we can check
   240  	// their has-vote status without polling.
   241  	changed := make(chan struct{}, 1)
   242  	for i := 10; i < 14; i++ {
   243  		watcher := st.machine(fmt.Sprint(i)).val.Watch()
   244  		defer watcher.Close()
   245  		go func() {
   246  			for watcher.Next() {
   247  				select {
   248  				case changed <- struct{}{}:
   249  				default:
   250  				}
   251  			}
   252  		}()
   253  	}
   254  	timeout := time.After(coretesting.LongWait)
   255  loop:
   256  	for {
   257  		select {
   258  		case <-changed:
   259  			correct := true
   260  			for i := 10; i < 14; i++ {
   261  				hasVote := st.machine(fmt.Sprint(i)).HasVote()
   262  				expectHasVote := i != 10
   263  				if hasVote != expectHasVote {
   264  					correct = false
   265  				}
   266  			}
   267  			if correct {
   268  				break loop
   269  			}
   270  		case <-timeout:
   271  			c.Fatalf("timed out waiting for vote to be set")
   272  		}
   273  	}
   274  }
   275  
   276  func (s *workerSuite) TestAddressChange(c *gc.C) {
   277  	st := newFakeState()
   278  	initState(c, st, 3)
   279  
   280  	memberWatcher := st.session.members.Watch()
   281  	mustNext(c, memberWatcher)
   282  	assertMembers(c, memberWatcher.Value(), mkMembers("0v"))
   283  
   284  	logger.Infof("starting worker")
   285  	w := newWorker(st, noPublisher{})
   286  	defer func() {
   287  		c.Check(worker.Stop(w), gc.IsNil)
   288  	}()
   289  
   290  	// Wait for the worker to set the initial members.
   291  	mustNext(c, memberWatcher)
   292  	assertMembers(c, memberWatcher.Value(), mkMembers("0v 1 2"))
   293  
   294  	// Change an address and wait for it to be changed in the
   295  	// members.
   296  	st.machine("11").setStateHostPort("0.1.99.99:9876")
   297  
   298  	mustNext(c, memberWatcher)
   299  	expectMembers := mkMembers("0v 1 2")
   300  	expectMembers[1].Address = "0.1.99.99:9876"
   301  	assertMembers(c, memberWatcher.Value(), expectMembers)
   302  }
   303  
   304  var fatalErrorsTests = []struct {
   305  	errPattern string
   306  	err        error
   307  	expectErr  string
   308  }{{
   309  	errPattern: "State.StateServerInfo",
   310  	expectErr:  "cannot get state server info: sample",
   311  }, {
   312  	errPattern: "Machine.SetHasVote 11 true",
   313  	expectErr:  `cannot set voting status of "11" to true: sample`,
   314  }, {
   315  	errPattern: "Session.CurrentStatus",
   316  	expectErr:  "cannot get replica set status: sample",
   317  }, {
   318  	errPattern: "Session.CurrentMembers",
   319  	expectErr:  "cannot get replica set members: sample",
   320  }, {
   321  	errPattern: "State.Machine *",
   322  	expectErr:  `cannot get machine "10": sample`,
   323  }, {
   324  	errPattern: "Machine.InstanceId *",
   325  	expectErr:  `cannot get API server info: sample`,
   326  }}
   327  
   328  func (s *workerSuite) TestFatalErrors(c *gc.C) {
   329  	s.PatchValue(&pollInterval, 5*time.Millisecond)
   330  	for i, test := range fatalErrorsTests {
   331  		c.Logf("test %d: %s -> %s", i, test.errPattern, test.expectErr)
   332  		resetErrors()
   333  		st := newFakeState()
   334  		st.session.InstantlyReady = true
   335  		initState(c, st, 3)
   336  		setErrorFor(test.errPattern, errors.New("sample"))
   337  		w := newWorker(st, noPublisher{})
   338  		done := make(chan error)
   339  		go func() {
   340  			done <- w.Wait()
   341  		}()
   342  		select {
   343  		case err := <-done:
   344  			c.Assert(err, gc.ErrorMatches, test.expectErr)
   345  		case <-time.After(coretesting.LongWait):
   346  			c.Fatalf("timed out waiting for error")
   347  		}
   348  	}
   349  }
   350  
   351  func (s *workerSuite) TestSetMembersErrorIsNotFatal(c *gc.C) {
   352  	st := newFakeState()
   353  	initState(c, st, 3)
   354  	st.session.setStatus(mkStatuses("0p 1s 2s"))
   355  	var isSet voyeur.Value
   356  	count := 0
   357  	setErrorFuncFor("Session.Set", func() error {
   358  		isSet.Set(count)
   359  		count++
   360  		return errors.New("sample")
   361  	})
   362  	s.PatchValue(&initialRetryInterval, 10*time.Microsecond)
   363  	s.PatchValue(&maxRetryInterval, coretesting.ShortWait/4)
   364  
   365  	expectedIterations := 0
   366  	for d := initialRetryInterval; d < maxRetryInterval*2; d *= 2 {
   367  		expectedIterations++
   368  	}
   369  
   370  	w := newWorker(st, noPublisher{})
   371  	defer func() {
   372  		c.Check(worker.Stop(w), gc.IsNil)
   373  	}()
   374  	isSetWatcher := isSet.Watch()
   375  
   376  	n0 := mustNext(c, isSetWatcher).(int)
   377  	time.Sleep(maxRetryInterval * 2)
   378  	n1 := mustNext(c, isSetWatcher).(int)
   379  
   380  	// The worker should have backed off exponentially...
   381  	c.Assert(n1-n0, jc.LessThan, expectedIterations+1)
   382  	c.Logf("actual iterations %d; expected iterations %d", n1-n0, expectedIterations)
   383  
   384  	// ... but only up to the maximum retry interval
   385  	n0 = mustNext(c, isSetWatcher).(int)
   386  	time.Sleep(maxRetryInterval * 2)
   387  	n1 = mustNext(c, isSetWatcher).(int)
   388  
   389  	c.Assert(n1-n0, jc.LessThan, 3)
   390  }
   391  
   392  type publisherFunc func(apiServers [][]network.HostPort, instanceIds []instance.Id) error
   393  
   394  func (f publisherFunc) publishAPIServers(apiServers [][]network.HostPort, instanceIds []instance.Id) error {
   395  	return f(apiServers, instanceIds)
   396  }
   397  
   398  func (s *workerSuite) TestStateServersArePublished(c *gc.C) {
   399  	publishCh := make(chan [][]network.HostPort)
   400  	publish := func(apiServers [][]network.HostPort, instanceIds []instance.Id) error {
   401  		publishCh <- apiServers
   402  		return nil
   403  	}
   404  
   405  	st := newFakeState()
   406  	initState(c, st, 3)
   407  	w := newWorker(st, publisherFunc(publish))
   408  	defer func() {
   409  		c.Check(worker.Stop(w), gc.IsNil)
   410  	}()
   411  	select {
   412  	case servers := <-publishCh:
   413  		assertAPIHostPorts(c, servers, expectedAPIHostPorts(3))
   414  	case <-time.After(coretesting.LongWait):
   415  		c.Fatalf("timed out waiting for publish")
   416  	}
   417  
   418  	// Change one of the servers' API addresses and check that it's published.
   419  
   420  	newMachine10APIHostPorts := addressesWithPort(apiPort, "0.2.8.124")
   421  	st.machine("10").setAPIHostPorts(newMachine10APIHostPorts)
   422  	select {
   423  	case servers := <-publishCh:
   424  		expected := expectedAPIHostPorts(3)
   425  		expected[0] = newMachine10APIHostPorts
   426  		assertAPIHostPorts(c, servers, expected)
   427  	case <-time.After(coretesting.LongWait):
   428  		c.Fatalf("timed out waiting for publish")
   429  	}
   430  }
   431  
   432  func (s *workerSuite) TestWorkerRetriesOnPublishError(c *gc.C) {
   433  	s.PatchValue(&pollInterval, coretesting.LongWait+time.Second)
   434  	s.PatchValue(&initialRetryInterval, 5*time.Millisecond)
   435  	s.PatchValue(&maxRetryInterval, initialRetryInterval)
   436  
   437  	publishCh := make(chan [][]network.HostPort, 100)
   438  
   439  	count := 0
   440  	publish := func(apiServers [][]network.HostPort, instanceIds []instance.Id) error {
   441  		publishCh <- apiServers
   442  		count++
   443  		if count <= 3 {
   444  			return fmt.Errorf("publish error")
   445  		}
   446  		return nil
   447  	}
   448  	st := newFakeState()
   449  	initState(c, st, 3)
   450  
   451  	w := newWorker(st, publisherFunc(publish))
   452  	defer func() {
   453  		c.Check(worker.Stop(w), gc.IsNil)
   454  	}()
   455  
   456  	for i := 0; i < 4; i++ {
   457  		select {
   458  		case servers := <-publishCh:
   459  			assertAPIHostPorts(c, servers, expectedAPIHostPorts(3))
   460  		case <-time.After(coretesting.LongWait):
   461  			c.Fatalf("timed out waiting for publish #%d", i)
   462  		}
   463  	}
   464  	select {
   465  	case <-publishCh:
   466  		c.Errorf("unexpected publish event")
   467  	case <-time.After(coretesting.ShortWait):
   468  	}
   469  }
   470  
   471  func (s *workerSuite) TestWorkerPublishesInstanceIds(c *gc.C) {
   472  	s.PatchValue(&pollInterval, coretesting.LongWait+time.Second)
   473  	s.PatchValue(&initialRetryInterval, 5*time.Millisecond)
   474  	s.PatchValue(&maxRetryInterval, initialRetryInterval)
   475  
   476  	publishCh := make(chan []instance.Id, 100)
   477  
   478  	publish := func(apiServers [][]network.HostPort, instanceIds []instance.Id) error {
   479  		publishCh <- instanceIds
   480  		return nil
   481  	}
   482  	st := newFakeState()
   483  	initState(c, st, 3)
   484  
   485  	w := newWorker(st, publisherFunc(publish))
   486  	defer func() {
   487  		c.Check(worker.Stop(w), gc.IsNil)
   488  	}()
   489  
   490  	select {
   491  	case instanceIds := <-publishCh:
   492  		c.Assert(instanceIds, jc.SameContents, []instance.Id{"id-10", "id-11", "id-12"})
   493  	case <-time.After(coretesting.LongWait):
   494  		c.Errorf("timed out waiting for publish")
   495  	}
   496  }
   497  
   498  // mustNext waits for w's value to be set and returns it.
   499  func mustNext(c *gc.C, w *voyeur.Watcher) (val interface{}) {
   500  	done := make(chan bool)
   501  	go func() {
   502  		c.Logf("mustNext %p", w)
   503  		ok := w.Next()
   504  		val = w.Value()
   505  		c.Logf("mustNext done %p, ok %v", w, ok)
   506  		done <- ok
   507  	}()
   508  	select {
   509  	case ok := <-done:
   510  		c.Assert(ok, jc.IsTrue)
   511  		return
   512  	case <-time.After(coretesting.LongWait):
   513  		c.Fatalf("timed out waiting for value to be set")
   514  	}
   515  	panic("unreachable")
   516  }
   517  
   518  type noPublisher struct{}
   519  
   520  func (noPublisher) publishAPIServers(apiServers [][]network.HostPort, instanceIds []instance.Id) error {
   521  	return nil
   522  }