github.com/niedbalski/juju@v0.0.0-20190215020005-8ff100488e47/worker/raft/worker_test.go (about)

     1  // Copyright 2018 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package raft_test
     5  
     6  import (
     7  	"log"
     8  	"time"
     9  
    10  	coreraft "github.com/hashicorp/raft"
    11  	"github.com/juju/clock"
    12  	"github.com/juju/clock/testclock"
    13  	"github.com/juju/loggo"
    14  	"github.com/juju/testing"
    15  	jc "github.com/juju/testing/checkers"
    16  	gc "gopkg.in/check.v1"
    17  	"gopkg.in/juju/worker.v1/workertest"
    18  
    19  	coretesting "github.com/juju/juju/testing"
    20  	"github.com/juju/juju/worker/raft"
    21  	"github.com/juju/juju/worker/raft/rafttest"
    22  	"github.com/juju/juju/worker/raft/raftutil"
    23  )
    24  
    25  type workerFixture struct {
    26  	testing.IsolationSuite
    27  	fsm    *raft.SimpleFSM
    28  	config raft.Config
    29  }
    30  
    31  func (s *workerFixture) SetUpTest(c *gc.C) {
    32  	s.IsolationSuite.SetUpTest(c)
    33  	s.fsm = &raft.SimpleFSM{}
    34  	s.config = raft.Config{
    35  		FSM:        s.fsm,
    36  		Logger:     loggo.GetLogger("juju.worker.raft_test"),
    37  		StorageDir: c.MkDir(),
    38  		LocalID:    "123",
    39  		Transport:  s.newTransport("123"),
    40  		Clock:      testclock.NewClock(time.Time{}),
    41  	}
    42  }
    43  
    44  func (s *workerFixture) newTransport(address coreraft.ServerAddress) *coreraft.InmemTransport {
    45  	_, transport := coreraft.NewInmemTransport(address)
    46  	s.AddCleanup(func(c *gc.C) {
    47  		c.Assert(transport.Close(), jc.ErrorIsNil)
    48  	})
    49  	return transport
    50  }
    51  
    52  type WorkerValidationSuite struct {
    53  	workerFixture
    54  }
    55  
    56  var _ = gc.Suite(&WorkerValidationSuite{})
    57  
    58  func (s *WorkerValidationSuite) TestValidateErrors(c *gc.C) {
    59  	type test struct {
    60  		f      func(*raft.Config)
    61  		expect string
    62  	}
    63  	tests := []test{{
    64  		func(cfg *raft.Config) { cfg.FSM = nil },
    65  		"nil FSM not valid",
    66  	}, {
    67  		func(cfg *raft.Config) { cfg.Logger = nil },
    68  		"nil Logger not valid",
    69  	}, {
    70  		func(cfg *raft.Config) { cfg.StorageDir = "" },
    71  		"empty StorageDir not valid",
    72  	}, {
    73  		func(cfg *raft.Config) { cfg.LocalID = "" },
    74  		"empty LocalID not valid",
    75  	}, {
    76  		func(cfg *raft.Config) { cfg.HeartbeatTimeout = time.Millisecond },
    77  		"validating raft config: Heartbeat timeout is too low",
    78  	}, {
    79  		func(cfg *raft.Config) { cfg.Transport = nil },
    80  		"nil Transport not valid",
    81  	}, {
    82  		func(cfg *raft.Config) { cfg.Clock = nil },
    83  		"nil Clock not valid",
    84  	}}
    85  	for i, test := range tests {
    86  		c.Logf("test #%d (%s)", i, test.expect)
    87  		s.testValidateError(c, test.f, test.expect)
    88  	}
    89  }
    90  
    91  func (s *WorkerValidationSuite) testValidateError(c *gc.C, f func(*raft.Config), expect string) {
    92  	config := s.config
    93  	f(&config)
    94  	w, err := raft.NewWorker(config)
    95  	if !c.Check(err, gc.NotNil) {
    96  		workertest.DirtyKill(c, w)
    97  		return
    98  	}
    99  	c.Check(w, gc.IsNil)
   100  	c.Check(err, gc.ErrorMatches, expect)
   101  }
   102  
   103  func (s *WorkerValidationSuite) TestBootstrapFSM(c *gc.C) {
   104  	s.config.Transport = nil
   105  	err := raft.Bootstrap(s.config)
   106  	c.Assert(err, gc.ErrorMatches, "non-nil FSM during Bootstrap not valid")
   107  }
   108  
   109  func (s *WorkerValidationSuite) TestBootstrapTransport(c *gc.C) {
   110  	s.config.FSM = nil
   111  	err := raft.Bootstrap(s.config)
   112  	c.Assert(err, gc.ErrorMatches, "non-nil Transport during Bootstrap not valid")
   113  }
   114  
   115  type WorkerSuite struct {
   116  	workerFixture
   117  	worker *raft.Worker
   118  	clock  *testclock.Clock
   119  }
   120  
   121  var _ = gc.Suite(&WorkerSuite{})
   122  
   123  func (s *WorkerSuite) SetUpTest(c *gc.C) {
   124  	s.workerFixture.SetUpTest(c)
   125  
   126  	// Speed up the tests.
   127  	s.config.HeartbeatTimeout = 100 * time.Millisecond
   128  	s.config.ElectionTimeout = s.config.HeartbeatTimeout
   129  	s.config.LeaderLeaseTimeout = s.config.HeartbeatTimeout
   130  
   131  	// Bootstrap before starting the worker.
   132  	transport := s.config.Transport
   133  	fsm := s.config.FSM
   134  	s.config.Transport = nil
   135  	s.config.FSM = nil
   136  	err := raft.Bootstrap(s.config)
   137  	c.Assert(err, jc.ErrorIsNil)
   138  
   139  	// Make a new clock so the waits from the bootstrap aren't hanging
   140  	// around. Use time.Now() as the start so the time can be compared
   141  	// to raft.LastContact(), which unfortunately uses wallclock time.
   142  	s.clock = testclock.NewClock(time.Now())
   143  	s.config.Clock = s.clock
   144  	s.config.NoLeaderTimeout = 4 * time.Second
   145  
   146  	s.config.Transport = transport
   147  	s.config.FSM = fsm
   148  	worker, err := raft.NewWorker(s.config)
   149  	c.Assert(err, jc.ErrorIsNil)
   150  	s.AddCleanup(func(c *gc.C) {
   151  		workertest.DirtyKill(c, worker)
   152  	})
   153  	s.worker = worker.(*raft.Worker)
   154  }
   155  
   156  func (s *WorkerSuite) waitLeader(c *gc.C) *coreraft.Raft {
   157  	r, err := s.worker.Raft()
   158  	c.Assert(err, jc.ErrorIsNil)
   159  	c.Assert(r, gc.NotNil)
   160  
   161  	select {
   162  	case leader := <-r.LeaderCh():
   163  		c.Assert(leader, jc.IsTrue)
   164  	case <-time.After(coretesting.LongWait):
   165  		c.Fatal("timed out waiting for leadership change")
   166  	}
   167  	return r
   168  }
   169  
   170  func (s *WorkerSuite) TestBootstrapAddress(c *gc.C) {
   171  	r := s.waitLeader(c)
   172  
   173  	f := r.GetConfiguration()
   174  	c.Assert(f.Error(), jc.ErrorIsNil)
   175  	c.Assert(f.Configuration().Servers, jc.DeepEquals, []coreraft.Server{{
   176  		Suffrage: coreraft.Voter,
   177  		ID:       "123",
   178  		Address:  "localhost",
   179  	}})
   180  }
   181  
   182  func (s *WorkerSuite) TestRaft(c *gc.C) {
   183  	r := s.waitLeader(c)
   184  
   185  	f := r.Apply([]byte("command1"), time.Minute)
   186  	c.Assert(f.Error(), jc.ErrorIsNil)
   187  	c.Assert(f.Index(), gc.Equals, uint64(3))
   188  	c.Assert(f.Response(), gc.Equals, 1)
   189  
   190  	f = r.Apply([]byte("command2"), time.Minute)
   191  	c.Assert(f.Error(), jc.ErrorIsNil)
   192  	c.Assert(f.Index(), gc.Equals, uint64(4))
   193  	c.Assert(f.Response(), gc.Equals, 2)
   194  
   195  	c.Assert(s.fsm.Logs(), jc.DeepEquals, [][]byte{
   196  		[]byte("command1"),
   197  		[]byte("command2"),
   198  	})
   199  }
   200  
   201  func (s *WorkerSuite) TestRaftWorkerStopped(c *gc.C) {
   202  	s.worker.Kill()
   203  
   204  	r, err := s.worker.Raft()
   205  	c.Assert(err, gc.Equals, raft.ErrWorkerStopped)
   206  	c.Assert(r, gc.IsNil)
   207  }
   208  
   209  func (s *WorkerSuite) TestRestoreSnapshot(c *gc.C) {
   210  	r := s.waitLeader(c)
   211  
   212  	f := r.Apply([]byte("command1"), time.Minute)
   213  	c.Assert(f.Error(), jc.ErrorIsNil)
   214  	c.Assert(f.Index(), gc.Equals, uint64(3))
   215  	c.Assert(f.Response(), gc.Equals, 1)
   216  
   217  	sf := r.Snapshot()
   218  	c.Assert(sf.Error(), jc.ErrorIsNil)
   219  	meta, rc, err := sf.Open()
   220  	c.Assert(err, jc.ErrorIsNil)
   221  	defer rc.Close()
   222  
   223  	f = r.Apply([]byte("command2"), time.Minute)
   224  	c.Assert(f.Error(), jc.ErrorIsNil)
   225  	c.Assert(f.Index(), gc.Equals, uint64(4))
   226  	c.Assert(f.Response(), gc.Equals, 2)
   227  
   228  	err = r.Restore(meta, rc, time.Minute)
   229  	c.Assert(err, jc.ErrorIsNil)
   230  	c.Assert(s.fsm.Logs(), jc.DeepEquals, [][]byte{
   231  		[]byte("command1"),
   232  	})
   233  }
   234  
   235  func (s *WorkerSuite) TestStartStop(c *gc.C) {
   236  	workertest.CleanKill(c, s.worker)
   237  }
   238  
   239  func (s *WorkerSuite) TestShutdownRaftKillsWorker(c *gc.C) {
   240  	r := s.waitLeader(c)
   241  	c.Assert(r.Shutdown().Error(), jc.ErrorIsNil)
   242  
   243  	err := workertest.CheckKilled(c, s.worker)
   244  	c.Assert(err, gc.ErrorMatches, "raft shutdown")
   245  }
   246  
   247  func (s *WorkerSuite) TestLogStore(c *gc.C) {
   248  	_, err := s.worker.LogStore()
   249  	c.Assert(err, jc.ErrorIsNil)
   250  }
   251  
   252  func (s *WorkerSuite) newRaft(c *gc.C, id coreraft.ServerID) (
   253  	*coreraft.Raft, *coreraft.InmemTransport,
   254  ) {
   255  	transport := s.newTransport("")
   256  	store := coreraft.NewInmemStore()
   257  	raftConfig := coreraft.DefaultConfig()
   258  	raftConfig.LocalID = id
   259  	raftConfig.HeartbeatTimeout = 100 * time.Millisecond
   260  	raftConfig.ElectionTimeout = raftConfig.HeartbeatTimeout
   261  	raftConfig.LeaderLeaseTimeout = raftConfig.HeartbeatTimeout
   262  	raftConfig.Logger = log.New(&raftutil.LoggoWriter{
   263  		loggo.GetLogger("juju.worker.raft_test_" + string(id)),
   264  		loggo.DEBUG,
   265  	}, "", 0)
   266  	r, err := coreraft.NewRaft(
   267  		raftConfig,
   268  		&raft.SimpleFSM{},
   269  		store,
   270  		store,
   271  		coreraft.NewInmemSnapshotStore(),
   272  		transport,
   273  	)
   274  	c.Assert(err, jc.ErrorIsNil)
   275  	s.AddCleanup(func(c *gc.C) {
   276  		c.Assert(r.Shutdown().Error(), jc.ErrorIsNil)
   277  	})
   278  	return r, transport
   279  }
   280  
   281  func (s *WorkerSuite) TestNoLeaderTimeout(c *gc.C) {
   282  	// Get the raft node into a state where it has no contact with the
   283  	// leader by adding 2 more nodes, demoting the local one so that
   284  	// it isn't the leader, then stopping the other nodes.
   285  	transport0 := s.config.Transport.(coreraft.LoopbackTransport)
   286  	raft1, transport1 := s.newRaft(c, "1")
   287  	raft2, transport2 := s.newRaft(c, "2")
   288  	connectTransports(transport0, transport1, transport2)
   289  
   290  	raft0 := s.waitLeader(c)
   291  	f1 := raft0.AddVoter("1", transport1.LocalAddr(), 0, 0)
   292  	f2 := raft0.AddVoter("2", transport2.LocalAddr(), 0, 0)
   293  	c.Assert(f1.Error(), jc.ErrorIsNil)
   294  	c.Assert(f2.Error(), jc.ErrorIsNil)
   295  
   296  	rafttest.CheckConfiguration(c, raft0, []coreraft.Server{{
   297  		ID:       "123",
   298  		Address:  coreraft.ServerAddress("localhost"),
   299  		Suffrage: coreraft.Voter,
   300  	}, {
   301  		ID:       "1",
   302  		Address:  transport1.LocalAddr(),
   303  		Suffrage: coreraft.Voter,
   304  	}, {
   305  		ID:       "2",
   306  		Address:  transport2.LocalAddr(),
   307  		Suffrage: coreraft.Voter,
   308  	}})
   309  
   310  	f3 := raft0.DemoteVoter("123", 0, 0)
   311  	c.Assert(f3.Error(), jc.ErrorIsNil)
   312  
   313  	// Wait until raft0 isn't the leader anymore.
   314  	leader := true
   315  	for a := coretesting.LongAttempt.Start(); a.Next(); {
   316  		leader = raft0.Leader() == coreraft.ServerAddress("localhost")
   317  		if !leader {
   318  			break
   319  		}
   320  	}
   321  	c.Assert(leader, jc.IsFalse)
   322  
   323  	f4 := raft1.Shutdown()
   324  	f5 := raft2.Shutdown()
   325  	c.Assert(f4.Error(), jc.ErrorIsNil)
   326  	c.Assert(f5.Error(), jc.ErrorIsNil)
   327  
   328  	// Now advance time to trigger the timeout. There should be 2
   329  	// waits when we advance:
   330  	// * the loop timeout wait from starting the worker
   331  	// * the no leader timeout check in loop.
   332  	c.Assert(s.clock.WaitAdvance(10*time.Second, coretesting.LongWait, 2), jc.ErrorIsNil)
   333  	c.Assert(workertest.CheckKilled(c, s.worker), gc.Equals, raft.ErrNoLeaderTimeout)
   334  }
   335  
   336  // Connect the provided transport bidirectionally.
   337  func connectTransports(transports ...coreraft.LoopbackTransport) {
   338  	for _, t1 := range transports {
   339  		for _, t2 := range transports {
   340  			if t1 == t2 {
   341  				continue
   342  			}
   343  			t1.Connect(t2.LocalAddr(), t2)
   344  		}
   345  	}
   346  }
   347  
   348  type WorkerTimeoutSuite struct {
   349  	workerFixture
   350  }
   351  
   352  var _ = gc.Suite(&WorkerTimeoutSuite{})
   353  
   354  func (s *WorkerTimeoutSuite) SetUpTest(c *gc.C) {
   355  	s.workerFixture.SetUpTest(c)
   356  
   357  	// Speed up the tests.
   358  	s.config.HeartbeatTimeout = 100 * time.Millisecond
   359  	s.config.ElectionTimeout = s.config.HeartbeatTimeout
   360  	s.config.LeaderLeaseTimeout = s.config.HeartbeatTimeout
   361  
   362  	// Bootstrap before starting the worker.
   363  	transport := s.config.Transport
   364  	fsm := s.config.FSM
   365  	s.config.Transport = nil
   366  	s.config.FSM = nil
   367  	err := raft.Bootstrap(s.config)
   368  	c.Assert(err, jc.ErrorIsNil)
   369  
   370  	s.config.Transport = transport
   371  	s.config.FSM = fsm
   372  }
   373  
   374  func (s *WorkerTimeoutSuite) TestNewWorkerTimesOut(c *gc.C) {
   375  	// If for some reason it takes a long time to create the Raft
   376  	// object we don't want to just hang - that can make it really
   377  	// hard to work out what's going on. Instead we should timeout if
   378  	// the raft loop doesn't get started.
   379  	testClock := testclock.NewClock(time.Time{})
   380  	s.config.Clock = testClock
   381  	_, underlying := coreraft.NewInmemTransport("something")
   382  	s.config.Transport = &hangingTransport{
   383  		Transport: underlying,
   384  		clock:     testClock,
   385  	}
   386  	errChan := make(chan error)
   387  	go func() {
   388  		w, err := raft.NewWorker(s.config)
   389  		c.Check(w, gc.IsNil)
   390  		errChan <- err
   391  	}()
   392  
   393  	// We wait for the transport and the worker to be waiting for the
   394  	// clock, then we move it past the timeout.
   395  	err := testClock.WaitAdvance(2*raft.LoopTimeout, coretesting.LongWait, 2)
   396  	c.Assert(err, jc.ErrorIsNil)
   397  
   398  	select {
   399  	case err := <-errChan:
   400  		c.Assert(err, gc.ErrorMatches, "timed out waiting for worker loop")
   401  	case <-time.After(coretesting.LongWait):
   402  		c.Fatalf("timed out waiting for worker error")
   403  	}
   404  }
   405  
   406  type hangingTransport struct {
   407  	coreraft.Transport
   408  	clock clock.Clock
   409  }
   410  
   411  func (t *hangingTransport) LocalAddr() coreraft.ServerAddress {
   412  	<-t.clock.After(5 * raft.LoopTimeout)
   413  	return t.Transport.LocalAddr()
   414  }