github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/peergrouper/worker_test.go

// Copyright 2014 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package peergrouper

import (
	"errors"
	"fmt"
	"net"
	"sort"
	"strconv"
	"sync"
	"time"

	"github.com/juju/clock/testclock"
	"github.com/juju/loggo"
	"github.com/juju/pubsub/v2"
	"github.com/juju/replicaset/v3"
	jc "github.com/juju/testing/checkers"
	"github.com/juju/utils/v3/voyeur"
	"github.com/juju/worker/v3"
	"github.com/juju/worker/v3/workertest"
	"github.com/kr/pretty"
	"github.com/prometheus/client_golang/prometheus"
	gc "gopkg.in/check.v1"

	"github.com/juju/juju/core/network"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/pubsub/apiserver"
	"github.com/juju/juju/state"
	coretesting "github.com/juju/juju/testing"
)

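// TestIPVersion holds the address-formatting details for one IP protocol
// version, so that each test can be run against both IPv4 and IPv6.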
type TestIPVersion struct {
	version     string
	formatHost  string
	extraHost   string
	addressType network.AddressType
}

var (
	testIPv4 = TestIPVersion{
		version:     "IPv4",
		formatHost:  "0.1.2.%d",
		extraHost:   "0.1.99.13",
		addressType: network.IPv4Address,
	}
	testIPv6 = TestIPVersion{
		version:     "IPv6",
		formatHost:  "2001:DB8::%d",
		extraHost:   "2001:DB8::99:13",
		addressType: network.IPv6Address,
	}
)

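// workerSuite exercises the peergrouper worker against a fake state, a fake
// mongo session and a manual test clock.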
type workerSuite struct {
	coretesting.BaseSuite
	clock *testclock.Clock
	hub   Hub
	idle  chan struct{}
	mu    sync.Mutex

	memberUpdates [][]replicaset.Member
}

var _ = gc.Suite(&workerSuite{})

func (s *workerSuite) SetUpTest(c *gc.C) {
	s.BaseSuite.SetUpTest(c)
	s.clock = testclock.NewClock(time.Now())
	s.hub = nopHub{}
	logger.SetLogLevel(loggo.TRACE)
	s.PatchValue(&IdleFunc, s.idleNotify)
}

type testSuite interface {
	SetUpTest(c *gc.C)
	TearDownTest(c *gc.C)
}

// DoTestForIPv4AndIPv6 runs the passed test for IPv4 and IPv6.
//
// TODO(axw) the type of address has little to do with the
// behaviour of this worker, so we should not need to run the
// tests for each address type. We can introduce a limited
// number (probably one) of feature tests to check that we
// handle both address types as expected.
func DoTestForIPv4AndIPv6(c *gc.C, s testSuite, t func(ipVersion TestIPVersion)) {
	t(testIPv4)
	s.TearDownTest(c)
	s.SetUpTest(c)
	t(testIPv6)
}

// InitState initializes the fake state with a single replica-set member and
// numNodes nodes primed to vote.
func InitState(c *gc.C, st *fakeState, numNodes int, ipVersion TestIPVersion) {
	var ids []string
	for i := 10; i < 10+numNodes; i++ {
		id := fmt.Sprint(i)
		m := st.addController(id, true)
		m.setAddresses(network.NewSpaceAddress(fmt.Sprintf(ipVersion.formatHost, i)))
		ids = append(ids, id)
		c.Assert(m.Addresses(), gc.HasLen, 1)
	}
	st.setControllers(ids...)
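	// Note: mkMembers and mkStatuses (defined in this package's test
	// fixtures) build replica-set state from a compact spec: each token is
	// a member index (controller id = index + 10); a "v" suffix marks a
	// voting member, and in status specs "p" and "s" mark the primary and
	// a secondary respectively.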
	err := st.session.Set(mkMembers("0v", ipVersion))
	c.Assert(err, jc.ErrorIsNil)
	st.session.setStatus(mkStatuses("0p", ipVersion))
	err = st.controller("10").SetHasVote(true)
	c.Assert(err, jc.ErrorIsNil)
	st.setCheck(checkInvariants)
}

// ExpectedAPIHostPorts returns the expected addresses
// of the nodes as created by InitState.
func ExpectedAPIHostPorts(n int, ipVersion TestIPVersion) []network.SpaceHostPorts {
	servers := make([]network.SpaceHostPorts, n)
	for i := range servers {
		servers[i] = network.NewSpaceHostPorts(
			apiPort,
			fmt.Sprintf(ipVersion.formatHost, i+10),
		)
	}
	return servers
}

func (s *workerSuite) TestSetsAndUpdatesMembersIPv4(c *gc.C) {
	s.doTestSetAndUpdateMembers(c, testIPv4)
}

func (s *workerSuite) TestSetsAndUpdatesMembersIPv6(c *gc.C) {
	s.doTestSetAndUpdateMembers(c, testIPv6)
}

func (s *workerSuite) doTestSetAndUpdateMembers(c *gc.C, ipVersion TestIPVersion) {
	c.Logf("\n\nTestSetsAndUpdatesMembers: %s", ipVersion.version)
	st := NewFakeState()
	InitState(c, st, 3, ipVersion)
	memberWatcher := st.session.members.Watch()
	defer memberWatcher.Close()

	s.recordMemberChanges(c, memberWatcher)
	update := s.mustNext(c, "init")
	assertMembers(c, update, mkMembers("0v", ipVersion))

	logger.Infof("starting worker")
	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
	defer workertest.CleanKill(c, w)

	// Due to the inherent complexity of the multiple goroutines running
	// and listening to different watchers, there is no way to manually
	// advance the testing clock in a controlled manner, as the clock.After
	// calls can be replaced in response to other watcher events. Hence
	// using the standard testing clock wait / advance method does not
	// work. So we use the real clock to advance the test clock for this
	// test.
	// Every 5ms we advance the testing clock by pollInterval (1min).
	done := make(chan struct{})
	clockAdvancerFinished := make(chan struct{})
	defer func() {
		close(done)
		select {
		case <-clockAdvancerFinished:
			return
		case <-time.After(coretesting.LongWait):
			c.Error("advancing goroutine didn't finish")
		}
	}()
	go func() {
		defer close(clockAdvancerFinished)
		for {
			select {
			case <-time.After(5 * time.Millisecond):
				s.clock.Advance(pollInterval)
			case <-done:
				return
			}
		}
	}()

	// Wait for the worker to set the initial members.
	update = s.mustNext(c, "initial members")
	assertMembers(c, update, mkMembers("0v 1 2", ipVersion))

	// Update the status of the new members
	// and check that they become voting.
	c.Logf("\nupdating new member status")
	st.session.setStatus(mkStatuses("0p 1s 2s", ipVersion))
	update = s.mustNext(c, "new member status")
	assertMembers(c, update, mkMembers("0v 1v 2v", ipVersion))

	c.Logf("\nadding another controller")
	m13 := st.addController("13", false)
	m13.setAddresses(network.NewSpaceAddress(fmt.Sprintf(ipVersion.formatHost, 13)))
	st.setControllers("10", "11", "12", "13")

	update = s.mustNext(c, "waiting for new member to be added")
	assertMembers(c, update, mkMembers("0v 1v 2v 3", ipVersion))

	// Remove the vote from an existing member and give it to the new
	// controller. Also set the status of the new controller to healthy.
	c.Logf("\nremoving vote from controller 10 and adding it to controller 13")
	st.controller("10").setWantsVote(false)
	// Controller 11 or 12 becomes the new primary (it is randomised).
	update = s.mustNext(c, "waiting for vote switch")

	if st.session.currentPrimary() == "11" {
		assertMembers(c, update, mkMembers("0 1v 2 3", ipVersion))
	} else {
		assertMembers(c, update, mkMembers("0 1 2v 3", ipVersion))
	}

	st.controller("13").setWantsVote(true)

	st.session.setStatus(mkStatuses("0s 1p 2s 3s", ipVersion))

	// Check that the new controller gets the vote and the
	// old controller loses it.
	update = s.mustNext(c, "waiting for vote switch")
	assertMembers(c, update, mkMembers("0 1v 2v 3v", ipVersion))

	c.Logf("\nremoving old controller")
	// Remove the old controller.
	st.removeController("10")
	st.setControllers("11", "12", "13")

	// Check that it's removed from the members.
	update = s.mustNext(c, "waiting for removal")
	assertMembers(c, update, mkMembers("1v 2v 3v", ipVersion))
}

func (s *workerSuite) TestHasVoteMaintainedEvenWhenReplicaSetFailsIPv4(c *gc.C) {
	s.doTestHasVoteMaintainsEvenWhenReplicaSetFails(c, testIPv4)
}

func (s *workerSuite) TestHasVoteMaintainedEvenWhenReplicaSetFailsIPv6(c *gc.C) {
	s.doTestHasVoteMaintainsEvenWhenReplicaSetFails(c, testIPv6)
}

func (s *workerSuite) doTestHasVoteMaintainsEvenWhenReplicaSetFails(c *gc.C, ipVersion TestIPVersion) {
	st := NewFakeState()

	// Simulate a state where we have four controllers,
	// one has gone down, and we're replacing it:
	// 0 - hasvote true, wantsvote true, primary
	// 1 - hasvote true, wantsvote false, down
	// 2 - hasvote true, wantsvote true
	// 3 - hasvote false, wantsvote true
	//
	// When it starts, the worker should move the vote from
	// 0 to 3. We'll arrange things so that it will succeed in
	// setting the membership but fail setting the HasVote
	// to false.
	InitState(c, st, 4, ipVersion)
	err := st.controller("10").SetHasVote(true)
	c.Assert(err, jc.ErrorIsNil)
	err = st.controller("11").SetHasVote(true)
	c.Assert(err, jc.ErrorIsNil)
	err = st.controller("12").SetHasVote(true)
	c.Assert(err, jc.ErrorIsNil)
	err = st.controller("13").SetHasVote(false)
	c.Assert(err, jc.ErrorIsNil)

	st.controller("10").setWantsVote(true)
	st.controller("11").setWantsVote(false)
	st.controller("12").setWantsVote(true)
	st.controller("13").setWantsVote(true)

	err = st.session.Set(mkMembers("0v 1v 2v 3", ipVersion))
	c.Assert(err, jc.ErrorIsNil)
	st.session.setStatus(mkStatuses("0p 1H 2s 3s", ipVersion))

	// Make the worker fail to set HasVote to false
	// after changing the replica set membership.
	st.errors.setErrorFor("Controller.SetHasVote * false", errors.New("frood"))

	memberWatcher := st.session.members.Watch()
	defer memberWatcher.Close()

	s.recordMemberChanges(c, memberWatcher)
	update := s.mustNext(c, "waiting for SetHasVote failure")
	assertMembers(c, update, mkMembers("0v 1v 2v 3", ipVersion))

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
	defer workertest.DirtyKill(c, w)

	// Wait for the worker to set the initial members.
	update = s.mustNext(c, "initial members")
	assertMembers(c, update, mkMembers("0v 1 2v 3v", ipVersion))

	// The worker should encounter an error setting the
	// has-vote status to false and exit.
	err = workertest.CheckKilled(c, w)
	c.Assert(err, gc.ErrorMatches, `removing non-voters: cannot set voting status of "[0-9]+" to false: frood`)

	// Start the worker again - although the membership should
	// not change, the HasVote status should be updated correctly.
	st.errors.resetErrors()
	w = s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
	defer workertest.CleanKill(c, w)

	// Watch all the controllers for changes, so we can check
	// their has-vote status without polling.
	changed := make(chan struct{}, 1)
	for i := 10; i < 14; i++ {
		watcher := st.controller(fmt.Sprint(i)).val.Watch()
		defer watcher.Close()
		go func() {
			for watcher.Next() {
				select {
				case changed <- struct{}{}:
				default:
				}
			}
		}()
	}
	timeout := time.After(coretesting.LongWait)
loop:
	for {
		select {
		case <-changed:
			correct := true
			for i := 10; i < 14; i++ {
				hasVote := st.controller(fmt.Sprint(i)).HasVote()
				expectHasVote := i != 11
				if hasVote != expectHasVote {
					correct = false
				}
			}
			if correct {
				break loop
			}
		case <-timeout:
			c.Fatalf("timed out waiting for vote to be set")
		}
	}
}

func (s *workerSuite) TestAddressChange(c *gc.C) {
	DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
		st := NewFakeState()
		InitState(c, st, 3, ipVersion)

		memberWatcher := st.session.members.Watch()
		defer memberWatcher.Close()

		s.recordMemberChanges(c, memberWatcher)
		update := s.mustNext(c, "init")
		assertMembers(c, update, mkMembers("0v", ipVersion))

		logger.Infof("starting worker")
		w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
		defer workertest.CleanKill(c, w)

		// Wait for the worker to set the initial members.
		update = s.mustNext(c, "initial members")
		assertMembers(c, update, mkMembers("0v 1 2", ipVersion))

		// Change an address and wait for it to be changed in the
		// members.
		st.controller("11").setAddresses(network.NewSpaceAddress(ipVersion.extraHost))

		update = s.mustNext(c, "waiting for new address")
		expectMembers := mkMembers("0v 1 2", ipVersion)
		expectMembers[1].Address = net.JoinHostPort(ipVersion.extraHost, fmt.Sprint(mongoPort))
		assertMembers(c, update, expectMembers)
	})
}

func (s *workerSuite) TestAddressChangeNoHA(c *gc.C) {
	DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
		st := NewFakeState()
		InitState(c, st, 3, ipVersion)

		memberWatcher := st.session.members.Watch()
		defer memberWatcher.Close()

		s.recordMemberChanges(c, memberWatcher)
		update := s.mustNext(c, "init")
		assertMembers(c, update, mkMembers("0v", ipVersion))

		logger.Infof("starting worker")
		w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, false)
		defer workertest.CleanKill(c, w)

		// There must be no replicaset updates.
		type voyeurResult struct {
			ok  bool
			val interface{}
		}
		done := make(chan voyeurResult)
		go func() {
			ok := memberWatcher.Next()
			val := memberWatcher.Value()
			if ok {
				members := val.([]replicaset.Member)
				val = "\n" + prettyReplicaSetMembersSlice(members)
			}
			done <- voyeurResult{ok, val}
		}()
		select {
		case <-done:
			c.Fatalf("unexpected event")
		case <-time.After(coretesting.ShortWait):
		}
	})
}

var fatalErrorsTests = []struct {
	errPattern string
	err        error
	expectErr  string
}{{
	errPattern: "State.ControllerIds",
	expectErr:  "cannot get controller ids: sample",
}, {
	errPattern: "Session.CurrentStatus",
	expectErr:  "creating peer group info: cannot get replica set status: sample",
}, {
	errPattern: "Session.CurrentMembers",
	expectErr:  "creating peer group info: cannot get replica set members: sample",
}, {
	errPattern: "State.ControllerNode *",
	expectErr:  `cannot get controller "10": sample`,
}, {
	errPattern: "State.ControllerHost *",
	expectErr:  `cannot get controller "10": sample`,
}}

func (s *workerSuite) TestFatalErrors(c *gc.C) {
	DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
		s.PatchValue(&pollInterval, 5*time.Millisecond)
		for i, testCase := range fatalErrorsTests {
			c.Logf("\n(%s) test %d: %s -> %s", ipVersion.version, i, testCase.errPattern, testCase.expectErr)
			st := NewFakeState()
			st.session.InstantlyReady = true
			InitState(c, st, 3, ipVersion)
			st.errors.setErrorFor(testCase.errPattern, errors.New("sample"))

			w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
			defer workertest.DirtyKill(c, w)

			done := make(chan error)
			go func() {
				done <- w.Wait()
			}()
			select {
			case err := <-done:
				c.Assert(err, gc.ErrorMatches, testCase.expectErr)
			case <-time.After(coretesting.LongWait):
				c.Fatalf("timed out waiting for error")
			}
		}
	})
}

func (s *workerSuite) TestSetMembersErrorIsNotFatal(c *gc.C) {
	DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
		st := NewFakeState()
		InitState(c, st, 3, ipVersion)
		st.session.setStatus(mkStatuses("0p 1s 2s", ipVersion))
		called := make(chan error)
		setErr := errors.New("sample")
		st.errors.setErrorFuncFor("Session.Set", func() error {
			called <- setErr
			return setErr
		})

		w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
		defer workertest.CleanKill(c, w)

		// Just watch three error retries
		retryInterval := initialRetryInterval
		for i := 0; i < 3; i++ {
			_ = s.clock.WaitAdvance(retryInterval, coretesting.ShortWait, 1)
			retryInterval = scaleRetry(retryInterval)
			select {
			case err := <-called:
				c.Check(err, gc.Equals, setErr)
			case <-time.After(coretesting.LongWait):
				c.Fatalf("timed out waiting for loop #%d", i)
			}
		}
	})
}

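// SetAPIHostPortsFunc adapts a plain function to the APIHostPortsSetter
// interface expected by the worker config.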
type SetAPIHostPortsFunc func(apiServers []network.SpaceHostPorts) error

func (f SetAPIHostPortsFunc) SetAPIHostPorts(apiServers []network.SpaceHostPorts) error {
	return f(apiServers)
}

func (s *workerSuite) TestControllersArePublished(c *gc.C) {
	DoTestForIPv4AndIPv6(c, s, func(ipVersion TestIPVersion) {
		publishCh := make(chan []network.SpaceHostPorts)
		publish := func(apiServers []network.SpaceHostPorts) error {
			publishCh <- apiServers
			return nil
		}

		st := NewFakeState()
		InitState(c, st, 3, ipVersion)
		w := s.newWorker(c, st, st.session, SetAPIHostPortsFunc(publish), true)
		defer workertest.CleanKill(c, w)

		select {
		case servers := <-publishCh:
			AssertAPIHostPorts(c, servers, ExpectedAPIHostPorts(3, ipVersion))
		case <-time.After(coretesting.LongWait):
			c.Fatalf("timed out waiting for publish")
		}

		// If a config change wakes up the loop *after* the controller topology
		// is published, then we will get another call to setAPIHostPorts.
		select {
		case <-publishCh:
		case <-time.After(coretesting.ShortWait):
		}

		// Change one of the server API addresses and check that it is
		// published.
		newMachine10Addresses := network.NewSpaceAddresses(ipVersion.extraHost)
		st.controller("10").setAddresses(newMachine10Addresses...)
		select {
		case servers := <-publishCh:
			expected := ExpectedAPIHostPorts(3, ipVersion)
			expected[0] = network.SpaceAddressesWithPort(newMachine10Addresses, apiPort)
			AssertAPIHostPorts(c, servers, expected)
		case <-time.After(coretesting.LongWait):
			c.Fatalf("timed out waiting for publish")
		}
	})
}

func (s *workerSuite) TestControllersArePublishedOverHub(c *gc.C) {
	st := NewFakeState()
	InitState(c, st, 3, testIPv4)

	hub := pubsub.NewStructuredHub(nil)
	event := make(chan apiserver.Details)
	_, err := hub.Subscribe(apiserver.DetailsTopic, func(topic string, data apiserver.Details, err error) {
		c.Check(err, jc.ErrorIsNil)
		event <- data
	})
	c.Assert(err, jc.ErrorIsNil)
	s.hub = hub

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
	defer workertest.CleanKill(c, w)

	expected := apiserver.Details{
		Servers: map[string]apiserver.APIServer{
			"10": {ID: "10", Addresses: []string{"0.1.2.10:5678"}, InternalAddress: "0.1.2.10:5678"},
			"11": {ID: "11", Addresses: []string{"0.1.2.11:5678"}, InternalAddress: "0.1.2.11:5678"},
			"12": {ID: "12", Addresses: []string{"0.1.2.12:5678"}, InternalAddress: "0.1.2.12:5678"},
		},
		LocalOnly: true,
	}

	select {
	case obtained := <-event:
		c.Assert(obtained, jc.DeepEquals, expected)
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}
}

func (s *workerSuite) TestControllersPublishedWithControllerAPIPort(c *gc.C) {
	st := NewFakeState()
	InitState(c, st, 3, testIPv4)

	hub := pubsub.NewStructuredHub(nil)
	event := make(chan apiserver.Details)
	_, err := hub.Subscribe(apiserver.DetailsTopic, func(topic string, data apiserver.Details, err error) {
		c.Check(err, jc.ErrorIsNil)
		event <- data
	})
	c.Assert(err, jc.ErrorIsNil)
	s.hub = hub

	w := s.newWorkerWithConfig(c, Config{
		Clock:                s.clock,
		State:                st,
		MongoSession:         st.session,
		APIHostPortsSetter:   nopAPIHostPortsSetter{},
		ControllerId:         func() string { return "10" },
		MongoPort:            mongoPort,
		APIPort:              apiPort,
		ControllerAPIPort:    controllerAPIPort,
		Hub:                  s.hub,
		SupportsHA:           true,
		PrometheusRegisterer: noopRegisterer{},
	})
	defer workertest.CleanKill(c, w)

	expected := apiserver.Details{
		Servers: map[string]apiserver.APIServer{
			"10": {ID: "10", Addresses: []string{"0.1.2.10:5678"}, InternalAddress: "0.1.2.10:9876"},
			"11": {ID: "11", Addresses: []string{"0.1.2.11:5678"}, InternalAddress: "0.1.2.11:9876"},
			"12": {ID: "12", Addresses: []string{"0.1.2.12:5678"}, InternalAddress: "0.1.2.12:9876"},
		},
		LocalOnly: true,
	}

	select {
	case obtained := <-event:
		c.Assert(obtained, jc.DeepEquals, expected)
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}
}

func (s *workerSuite) TestControllersArePublishedOverHubWithNewVoters(c *gc.C) {
	st := NewFakeState()
	var ids []string
	for i := 10; i < 13; i++ {
		id := fmt.Sprint(i)
		m := st.addController(id, true)
		err := m.SetHasVote(true)
		c.Assert(err, jc.ErrorIsNil)
		m.setAddresses(network.NewSpaceAddress(fmt.Sprintf(testIPv4.formatHost, i)))
		ids = append(ids, id)
		c.Assert(m.Addresses(), gc.HasLen, 1)
	}
	st.setControllers(ids...)
	err := st.session.Set(mkMembers("0v 1 2", testIPv4))
	c.Assert(err, jc.ErrorIsNil)
	st.session.setStatus(mkStatuses("0p 1s 2s", testIPv4))
	st.setCheck(checkInvariants)

	hub := pubsub.NewStructuredHub(nil)
	event := make(chan apiserver.Details)
	_, err = hub.Subscribe(apiserver.DetailsTopic, func(topic string, data apiserver.Details, err error) {
		c.Check(err, jc.ErrorIsNil)
		event <- data
	})
	c.Assert(err, jc.ErrorIsNil)
	s.hub = hub

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
	defer workertest.CleanKill(c, w)

	expected := apiserver.Details{
		Servers: map[string]apiserver.APIServer{
			"10": {ID: "10", Addresses: []string{"0.1.2.10:5678"}, InternalAddress: "0.1.2.10:5678"},
			"11": {ID: "11", Addresses: []string{"0.1.2.11:5678"}, InternalAddress: "0.1.2.11:5678"},
			"12": {ID: "12", Addresses: []string{"0.1.2.12:5678"}, InternalAddress: "0.1.2.12:5678"},
		},
		LocalOnly: true,
	}

	select {
	case obtained := <-event:
		c.Assert(obtained, jc.DeepEquals, expected)
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}

	// And check that they can be republished on request.
	_, err = hub.Publish(apiserver.DetailsRequestTopic, apiserver.DetailsRequest{
		Requester: "dad",
		LocalOnly: true,
	})
	c.Assert(err, jc.ErrorIsNil)
	select {
	case obtained := <-event:
		c.Assert(obtained, jc.DeepEquals, expected)
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}
}

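// haSpaceTestCommonSetup seeds the fake state with three voting controllers,
// each holding one cloud-local address in each of the spaces "one", "two"
// and "three", and sets the replica set from the given members spec.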
func haSpaceTestCommonSetup(c *gc.C, ipVersion TestIPVersion, members string) *fakeState {
	st := NewFakeState()
	InitState(c, st, 3, ipVersion)

	addrs := network.NewSpaceAddresses(
		fmt.Sprintf(ipVersion.formatHost, 1),
		fmt.Sprintf(ipVersion.formatHost, 2),
		fmt.Sprintf(ipVersion.formatHost, 3),
	)
	for i := range addrs {
		addrs[i].Scope = network.ScopeCloudLocal
	}

	spaces := []string{"one", "two", "three"}
	controllers := []int{10, 11, 12}
	for _, id := range controllers {
		controller := st.controller(strconv.Itoa(id))
		err := controller.SetHasVote(true)
		c.Assert(err, jc.ErrorIsNil)
		controller.setWantsVote(true)

		// Each controller gets one address in each of the three spaces.
		// The host part is i*10+id, where i is the space index, so
		// controller 10's addresses end with "10" (space "one"),
		// "20" (space "two") and "30" (space "three"); controller 11's
		// end with "11", "21" and "31"; and so on.
		addrs := make(network.SpaceAddresses, 3)
		for i, name := range spaces {
			addr := network.NewSpaceAddress(
				fmt.Sprintf(ipVersion.formatHost, i*10+id), network.WithScope(network.ScopeCloudLocal))
			addr.SpaceID = name
			addrs[i] = addr
		}
		controller.setAddresses(addrs...)
	}

	err := st.session.Set(mkMembers(members, ipVersion))
	c.Assert(err, jc.ErrorIsNil)
	return st
}

func (s *workerSuite) TestUsesConfiguredHASpaceIPv4(c *gc.C) {
	s.doTestUsesConfiguredHASpace(c, testIPv4)
}

func (s *workerSuite) TestUsesConfiguredHASpaceIPv6(c *gc.C) {
	s.doTestUsesConfiguredHASpace(c, testIPv6)
}

func (s *workerSuite) doTestUsesConfiguredHASpace(c *gc.C, ipVersion TestIPVersion) {
	st := haSpaceTestCommonSetup(c, ipVersion, "0v 1v 2v")

	// Set one of the statuses to ensure it is cleared upon determination
	// of a new peer group.
	now := time.Now()
	err := st.controller("11").SetStatus(status.StatusInfo{
		Status:  status.Started,
		Message: "You said that would be bad, Egon",
		Since:   &now,
	})
	c.Assert(err, gc.IsNil)

	st.setHASpace("two")
	s.runUntilPublish(c, st, "")
	assertMemberAddresses(c, st, ipVersion.formatHost, 2)

	sInfo, err := st.controller("11").Status()
	c.Assert(err, gc.IsNil)
	c.Check(sInfo.Status, gc.Equals, status.Started)
	c.Check(sInfo.Message, gc.Equals, "")
}

// runUntilPublish runs a worker until addresses are published over the pub/sub
// hub. Note that the replica-set is updated earlier than the publish,
// so this sync can be used to check for those changes.
// If errMsg is not empty, it is used to check for a matching error.
func (s *workerSuite) runUntilPublish(c *gc.C, st *fakeState, errMsg string) {
	hub := pubsub.NewStructuredHub(nil)
	event := make(chan apiserver.Details)
	_, err := hub.Subscribe(apiserver.DetailsTopic, func(topic string, data apiserver.Details, err error) {
		c.Check(err, jc.ErrorIsNil)
		event <- data
	})
	c.Assert(err, jc.ErrorIsNil)
	s.hub = hub

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
	defer func() {
		if errMsg == "" {
			workertest.CleanKill(c, w)
		} else {
			err := workertest.CheckKill(c, w)
			c.Assert(err, gc.ErrorMatches, errMsg)
		}
	}()

	select {
	case <-event:
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}
}

func (s *workerSuite) TestDetectsAndUsesHASpaceChangeIPv4(c *gc.C) {
	s.doTestDetectsAndUsesHASpaceChange(c, testIPv4)
}

func (s *workerSuite) TestDetectsAndUsesHASpaceChangeIPv6(c *gc.C) {
	s.doTestDetectsAndUsesHASpaceChange(c, testIPv6)
}

func (s *workerSuite) doTestDetectsAndUsesHASpaceChange(c *gc.C, ipVersion TestIPVersion) {
	st := haSpaceTestCommonSetup(c, ipVersion, "0v 1v 2v")
	st.setHASpace("one")

	// Set up a hub and channel on which to receive notifications.
	hub := pubsub.NewStructuredHub(nil)
	event := make(chan apiserver.Details)
	_, err := hub.Subscribe(apiserver.DetailsTopic, func(topic string, data apiserver.Details, err error) {
		c.Check(err, jc.ErrorIsNil)
		event <- data
	})
	c.Assert(err, jc.ErrorIsNil)
	s.hub = hub

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
	defer workertest.CleanKill(c, w)

	select {
	case <-event:
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for event")
	}
	assertMemberAddresses(c, st, ipVersion.formatHost, 1)

	// Changing the space does not change the API server details, so the
	// change will not be broadcast via the hub.
	// We watch the members collection, which *will* change.
	memberWatcher := st.session.members.Watch()
	defer memberWatcher.Close()

	s.recordMemberChanges(c, memberWatcher)
	s.mustNext(c, "initial watch")

	// An HA-space config change should wake the worker, and the
	// replica-set addresses should change to the new space.
	st.setHASpace("three")
	s.mustNext(c, "waiting for members to be updated for space change")
	assertMemberAddresses(c, st, ipVersion.formatHost, 3)
}

func assertMemberAddresses(c *gc.C, st *fakeState, addrTemplate string, addrDesignator int) {
	members, _ := st.session.CurrentMembers()
	obtained := make([]string, 3)
	for i, m := range members {
		obtained[i] = m.Address
	}
	sort.Strings(obtained)

	expected := make([]string, 3)
	for i := 0; i < 3; i++ {
		expected[i] = net.JoinHostPort(fmt.Sprintf(addrTemplate, 10*addrDesignator+i), fmt.Sprint(mongoPort))
	}

	c.Check(obtained, gc.DeepEquals, expected)
}

func (s *workerSuite) TestErrorAndStatusForNewPeersAndNoHASpaceAndMachinesWithMultiAddrIPv4(c *gc.C) {
	s.doTestErrorAndStatusForNewPeersAndNoHASpaceAndMachinesWithMultiAddr(c, testIPv4)
}

func (s *workerSuite) TestErrorAndStatusForNewPeersAndNoHASpaceAndMachinesWithMultiAddrIPv6(c *gc.C) {
	s.doTestErrorAndStatusForNewPeersAndNoHASpaceAndMachinesWithMultiAddr(c, testIPv6)
}

func (s *workerSuite) doTestErrorAndStatusForNewPeersAndNoHASpaceAndMachinesWithMultiAddr(
	c *gc.C, ipVersion TestIPVersion,
) {
	st := haSpaceTestCommonSetup(c, ipVersion, "0v")
	err := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true).Wait()
	errMsg := `computing desired peer group: updating member addresses: ` +
		`juju-ha-space is not set and these nodes have more than one usable address: 1[12], 1[12]` +
		"\nrun \"juju controller-config juju-ha-space=<name>\" to set a space for Mongo peer communication"
	c.Check(err, gc.ErrorMatches, errMsg)

	for _, id := range []string{"11", "12"} {
		sInfo, err := st.controller(id).Status()
		c.Assert(err, gc.IsNil)
		c.Check(sInfo.Status, gc.Equals, status.Started)
		c.Check(sInfo.Message, gc.Not(gc.Equals), "")
	}
}

func (s *workerSuite) TestErrorAndStatusForHASpaceWithNoAddressesAddrIPv4(c *gc.C) {
	s.doTestErrorAndStatusForHASpaceWithNoAddresses(c, testIPv4)
}

func (s *workerSuite) TestErrorAndStatusForHASpaceWithNoAddressesAddrIPv6(c *gc.C) {
	s.doTestErrorAndStatusForHASpaceWithNoAddresses(c, testIPv6)
}

func (s *workerSuite) doTestErrorAndStatusForHASpaceWithNoAddresses(
	c *gc.C, ipVersion TestIPVersion,
) {
	st := haSpaceTestCommonSetup(c, ipVersion, "0v")
	st.setHASpace("nope")

	err := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true).Wait()
	errMsg := `computing desired peer group: updating member addresses: ` +
		`no usable Mongo addresses found in configured juju-ha-space "nope" for nodes: 1[012], 1[012], 1[012]`
	c.Check(err, gc.ErrorMatches, errMsg)

	for _, id := range []string{"10", "11", "12"} {
		sInfo, err := st.controller(id).Status()
		c.Assert(err, gc.IsNil)
		c.Check(sInfo.Status, gc.Equals, status.Started)
		c.Check(sInfo.Message, gc.Not(gc.Equals), "")
	}
}

func (s *workerSuite) TestSamePeersAndNoHASpaceAndMachinesWithMultiAddrIPv4(c *gc.C) {
	s.doTestSamePeersAndNoHASpaceAndMachinesWithMultiAddr(c, testIPv4)
}

func (s *workerSuite) TestSamePeersAndNoHASpaceAndMachinesWithMultiAddrIPv6(c *gc.C) {
	s.doTestSamePeersAndNoHASpaceAndMachinesWithMultiAddr(c, testIPv6)
}

func (s *workerSuite) doTestSamePeersAndNoHASpaceAndMachinesWithMultiAddr(c *gc.C, ipVersion TestIPVersion) {
	st := haSpaceTestCommonSetup(c, ipVersion, "0v 1v 2v")
	s.runUntilPublish(c, st, "")
	assertMemberAddresses(c, st, ipVersion.formatHost, 1)
}

func (s *workerSuite) TestWorkerRetriesOnSetAPIHostPortsErrorIPv4(c *gc.C) {
	s.doTestWorkerRetriesOnSetAPIHostPortsError(c, testIPv4)
}

func (s *workerSuite) TestWorkerRetriesOnSetAPIHostPortsErrorIPv6(c *gc.C) {
	s.doTestWorkerRetriesOnSetAPIHostPortsError(c, testIPv6)
}

func (s *workerSuite) doTestWorkerRetriesOnSetAPIHostPortsError(c *gc.C, ipVersion TestIPVersion) {
	logger.SetLogLevel(loggo.TRACE)

	publishCh := make(chan []network.SpaceHostPorts, 10)
	failedOnce := false
	publish := func(apiServers []network.SpaceHostPorts) error {
		if !failedOnce {
			failedOnce = true
			return fmt.Errorf("publish error")
		}
		publishCh <- apiServers
		return nil
	}
	st := NewFakeState()
	InitState(c, st, 3, ipVersion)

	w := s.newWorker(c, st, st.session, SetAPIHostPortsFunc(publish), true)
	defer workertest.CleanKill(c, w)

	retryInterval := initialRetryInterval
	_ = s.clock.WaitAdvance(retryInterval, coretesting.ShortWait, 1)
	select {
	case servers := <-publishCh:
		AssertAPIHostPorts(c, servers, ExpectedAPIHostPorts(3, ipVersion))
		break
	case <-time.After(coretesting.ShortWait):
		c.Fatal("APIHostPorts were not published")
	}
	// There isn't any point checking for additional publish calls, as we
	// are also racing against the config-changed event, which will also
	// call SetAPIHostPorts; but we may not get that call.
}

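// initialize3Voters brings up a replica set with a single voter, then adds
// two more controllers and waits until all three are voting members.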
func (s *workerSuite) initialize3Voters(c *gc.C) (*fakeState, worker.Worker, *voyeur.Watcher) {
	st := NewFakeState()
	InitState(c, st, 1, testIPv4)
	err := st.controller("10").SetHasVote(true)
	c.Assert(err, jc.ErrorIsNil)
	st.session.setStatus(mkStatuses("0p", testIPv4))

	w := s.newWorker(c, st, st.session, nopAPIHostPortsSetter{}, true)
	defer func() {
		if r := recover(); r != nil {
			// we aren't exiting cleanly, so kill the worker
			workertest.CleanKill(c, w)
			// but let the stack trace continue
			panic(r)
		}
	}()

	memberWatcher := st.session.members.Watch()
	s.recordMemberChanges(c, memberWatcher)

	update := s.mustNext(c, "init")
	assertMembers(c, update, mkMembers("0v", testIPv4))
	// Now that 1 has come up successfully, bring in the next 2
	for i := 11; i < 13; i++ {
		id := fmt.Sprint(i)
		m := st.addController(id, true)
		m.setAddresses(network.NewSpaceAddress(fmt.Sprintf(testIPv4.formatHost, i)))
		c.Check(m.Addresses(), gc.HasLen, 1)
	}
	// Now that we've added 2 more, flag them as started and mark them as participating
	st.session.setStatus(mkStatuses("0p 1 2", testIPv4))
	st.setControllers("10", "11", "12")
	update = s.mustNext(c, "nonvoting members")
	assertMembers(c, update, mkMembers("0v 1 2", testIPv4))
	st.session.setStatus(mkStatuses("0p 1s 2s", testIPv4))
	s.waitUntilIdle(c)
	s.clock.Advance(pollInterval)
	update = s.mustNext(c, "status ok")
	assertMembers(c, update, mkMembers("0v 1v 2v", testIPv4))
	err = st.controller("11").SetHasVote(true)
	c.Assert(err, jc.ErrorIsNil)
	err = st.controller("12").SetHasVote(true)
	c.Assert(err, jc.ErrorIsNil)
	return st, w, memberWatcher
}

func (s *workerSuite) TestDyingMachinesAreRemoved(c *gc.C) {
	st, w, memberWatcher := s.initialize3Voters(c)
	defer workertest.CleanKill(c, w)
	defer memberWatcher.Close()

	// When we advance the lifecycle (aka controller.Destroy()), we should
	// notice that the controller no longer wants a vote;
	// controller.Destroy() advances to Dying and sets WantsVote to false.
	st.controller("11").advanceLifecycle(state.Dying, false)
	// Once we see the controller is Dying, we should remove it.
	update := s.mustNext(c, "remove dying controller")
	assertMembers(c, update, mkMembers("0v 2", testIPv4))

	// Now controller 12 no longer has the vote, but if we flag it as
	// dying, it should be progressed to Dead as well.
	st.controller("12").advanceLifecycle(state.Dying, false)
	update = s.mustNext(c, "removing dying controller")
	assertMembers(c, update, mkMembers("0v", testIPv4))
}

func (s *workerSuite) TestRemovePrimaryValidSecondaries(c *gc.C) {
	st, w, memberWatcher := s.initialize3Voters(c)
	defer workertest.CleanKill(c, w)
	defer memberWatcher.Close()

	statusWatcher := st.session.status.Watch()
	defer statusWatcher.Close()
	testStatus := mustNextStatus(c, statusWatcher, "init")
	c.Check(testStatus.Members, gc.DeepEquals, mkStatuses("0p 1s 2s", testIPv4))
	primaryMemberIndex := 0

	st.controller("10").setWantsVote(false)
	// We should notice that the primary no longer wants the vote and call
	// StepDownPrimary, which should ultimately cause a change in the Status.
	testStatus = mustNextStatus(c, statusWatcher, "stepping down primary")
	// Find out which member is now primary; it should be member 1 or 2.
	c.Assert(testStatus.Members, gc.HasLen, 3)
	c.Check(testStatus.Members[0].State, gc.Equals, replicaset.MemberState(replicaset.SecondaryState))
	if testStatus.Members[1].State == replicaset.PrimaryState {
		primaryMemberIndex = 1
		c.Check(testStatus.Members[2].State, gc.Equals, replicaset.MemberState(replicaset.SecondaryState))
	} else {
		primaryMemberIndex = 2
		c.Check(testStatus.Members[2].State, gc.Equals, replicaset.MemberState(replicaset.PrimaryState))
	}
	// Now we have to advance the clock for the worker to re-evaluate the system.
	s.waitUntilIdle(c)
	s.clock.Advance(2 * pollInterval)
	update := s.mustNext(c, "re-evaluating member post-step-down")
	// We should now have switched the vote over to whoever became the primary.
	if primaryMemberIndex == 1 {
		assertMembers(c, update, mkMembers("0 1v 2", testIPv4))
	} else {
		assertMembers(c, update, mkMembers("0 1 2v", testIPv4))
	}
	// Now we ask the primary to step down again; we should first reconfigure
	// the group to include the other secondary. We unset the invariant
	// checker first, because we are intentionally going to an even number of
	// voters, which is not the normal condition.
	st.setCheck(nil)
	st.controller(st.session.currentPrimary()).setWantsVote(false)
	// The member watcher must fire first.
	update = s.mustNext(c, "observing member step down")
	assertMembers(c, update, mkMembers("0 1v 2v", testIPv4))
	// As part of stepping down the only primary, we re-enable the vote for
	// the other secondary; only then can we call StepDownPrimary and remove
	// the old primary's vote.
	// Now we let the poll interval elapse so that the worker notices we
	// really do still want to step down the primary, and asks for it again.
	s.waitUntilIdle(c)
	s.clock.Advance(2 * pollInterval)
	testStatus = mustNextStatus(c, statusWatcher, "stepping down new primary")
	if primaryMemberIndex == 1 {
		c.Check(testStatus.Members[1].State, gc.Equals, replicaset.MemberState(replicaset.SecondaryState))
		c.Check(testStatus.Members[2].State, gc.Equals, replicaset.MemberState(replicaset.PrimaryState))
	} else {
		c.Check(testStatus.Members[1].State, gc.Equals, replicaset.MemberState(replicaset.PrimaryState))
		c.Check(testStatus.Members[2].State, gc.Equals, replicaset.MemberState(replicaset.SecondaryState))
	}
	// And then we again notice that a new primary has been elected and the
	// member votes change again.
	s.waitUntilIdle(c)
	s.clock.Advance(pollInterval)
	update = s.mustNext(c, "re-evaluating member post-step-down")
	if primaryMemberIndex == 1 {
		// The primary was 11; now 12 is the only voter.
		assertMembers(c, update, mkMembers("0 1 2v", testIPv4))
	} else {
		// The primary was 12; now 11 is the only voter.
		assertMembers(c, update, mkMembers("0 1v 2", testIPv4))
	}
}

// recordMemberChanges starts a goroutine that records member changes in
// s.memberUpdates for mustNext to consume.
func (s *workerSuite) recordMemberChanges(c *gc.C, w *voyeur.Watcher) {
	go func() {
		for {
			c.Logf("waiting for next update")
			ok := w.Next()
			if !ok {
				c.Logf("watcher closed")
				return
			}
			val := w.Value()
			members := val.([]replicaset.Member)
			c.Logf("next update, val: %v", "\n"+prettyReplicaSetMembersSlice(members))
			s.mu.Lock()
			s.memberUpdates = append(s.memberUpdates, members)
			s.mu.Unlock()
		}
	}()
}

// mustNext waits for the next recorded member update and returns it.
func (s *workerSuite) mustNext(c *gc.C, context string) []replicaset.Member {
	c.Logf("waiting for next update: %v", context)
	for a := coretesting.LongAttempt.Start(); a.Next(); {
		s.mu.Lock()
		if len(s.memberUpdates) == 0 {
			s.mu.Unlock()
			continue
		}
		update := s.memberUpdates[0]
		s.memberUpdates = s.memberUpdates[1:]
		s.mu.Unlock()
		return update
	}
	c.Fatalf("no replicaset update: %v", context)
	return nil
}

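// mustNextStatus waits for the status watcher to fire and returns the new
// replica-set status, failing the test on timeout.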
func mustNextStatus(c *gc.C, w *voyeur.Watcher, context string) *replicaset.Status {
	type voyeurResult struct {
		ok  bool
		val *replicaset.Status
	}
	done := make(chan voyeurResult)
	go func() {
		c.Logf("mustNextStatus %v", context)
		var result voyeurResult
		result.ok = w.Next()
		if result.ok {
			val := w.Value()
			result.val = val.(*replicaset.Status)
		}
		c.Logf("mustNextStatus %v done, ok: %v, val: %v", context, result.ok, pretty.Sprint(result.val))
		done <- result
	}()
	select {
	case result := <-done:
		c.Assert(result.ok, jc.IsTrue)
		return result.val
	case <-time.After(coretesting.LongWait):
		c.Fatalf("timed out waiting for value to be set %v", context)
	}
	panic("unreachable")
}

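// nopAPIHostPortsSetter satisfies the APIHostPortsSetter interface while
// doing nothing.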
type nopAPIHostPortsSetter struct{}

func (nopAPIHostPortsSetter) SetAPIHostPorts(apiServers []network.SpaceHostPorts) error {
	return nil
}

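// nopHub satisfies the Hub interface while discarding all publications and
// subscriptions.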
type nopHub struct{}

func (nopHub) Publish(topic string, data interface{}) (func(), error) {
	return func() {}, nil
}

func (nopHub) Subscribe(topic string, handler interface{}) (func(), error) {
	return func() {}, nil
}

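// noopRegisterer satisfies prometheus.Registerer without recording any
// metrics.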
type noopRegisterer struct {
	prometheus.Registerer
}

func (noopRegisterer) Register(prometheus.Collector) error {
	return nil
}

func (noopRegisterer) Unregister(prometheus.Collector) bool {
	return true
}

func (s *workerSuite) newWorkerWithConfig(
	c *gc.C,
	config Config,
) worker.Worker {
	// We create a new clock for the worker so we can wait on alarms even
	// when a single test exercises both IPv4 and IPv6 and so creates two
	// workers.
	s.clock = testclock.NewClock(time.Now())
	config.Clock = s.clock
	w, err := New(config)
	c.Assert(err, jc.ErrorIsNil)
	s.AddCleanup(func(c *gc.C) { workertest.DirtyKill(c, w) })
	return w
}

func (s *workerSuite) newWorker(
	c *gc.C,
	st State,
	session *fakeMongoSession,
	apiHostPortsSetter APIHostPortsSetter,
	supportsHA bool,
) worker.Worker {
	return s.newWorkerWithConfig(c, Config{
		Clock:                s.clock,
		State:                st,
		MongoSession:         session,
		APIHostPortsSetter:   apiHostPortsSetter,
		ControllerId:         session.currentPrimary,
		MongoPort:            mongoPort,
		APIPort:              apiPort,
		Hub:                  s.hub,
		SupportsHA:           supportsHA,
		PrometheusRegisterer: noopRegisterer{},
	})
}

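// idleNotify is patched in as IdleFunc in SetUpTest so that tests can
// synchronise with the worker's main loop going idle via waitUntilIdle.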
func (s *workerSuite) idleNotify() {
	logger.Infof("idleNotify signalled")
	s.mu.Lock()
	idle := s.idle
	s.mu.Unlock()
	if idle == nil {
		return
	}
	// Send down the idle channel if it is set.
	select {
	case idle <- struct{}{}:
	case <-time.After(coretesting.LongWait):
		// no-op
		logger.Infof("... no one watching")
	}
}

func (s *workerSuite) waitUntilIdle(c *gc.C) {
	logger.Infof("wait for idle")
	s.mu.Lock()
	s.idle = make(chan struct{})
	s.mu.Unlock()

	select {
	case <-s.idle:
		// All good.
	case <-time.After(coretesting.LongWait):
		c.Fatalf("idle channel not signalled in worker")
	}

	s.mu.Lock()
	s.idle = nil
	s.mu.Unlock()
}