github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/leader_test.go

     1  package nomad
     2  
     3  import (
     4  	"errors"
     5  	"fmt"
     6  	"strconv"
     7  	"testing"
     8  	"time"
     9  
    10  	"github.com/hashicorp/consul/testutil/retry"
    11  	memdb "github.com/hashicorp/go-memdb"
    12  	"github.com/hashicorp/nomad/nomad/mock"
    13  	"github.com/hashicorp/nomad/nomad/state"
    14  	"github.com/hashicorp/nomad/nomad/structs"
    15  	"github.com/hashicorp/nomad/testutil"
    16  	"github.com/hashicorp/raft"
    17  	"github.com/hashicorp/serf/serf"
    18  	"github.com/stretchr/testify/assert"
    19  	"github.com/stretchr/testify/require"
    20  )
    21  
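        // TestLeader_LeftServer shuts down a non-leader server, force-removes it from
        // the cluster, and waits for the remaining servers to see only two raft peers.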
    22  func TestLeader_LeftServer(t *testing.T) {
    23  	s1 := TestServer(t, nil)
    24  	defer s1.Shutdown()
    25  
    26  	s2 := TestServer(t, func(c *Config) {
    27  		c.DevDisableBootstrap = true
    28  	})
    29  	defer s2.Shutdown()
    30  
    31  	s3 := TestServer(t, func(c *Config) {
    32  		c.DevDisableBootstrap = true
    33  	})
    34  	defer s3.Shutdown()
    35  	servers := []*Server{s1, s2, s3}
    36  	TestJoin(t, s1, s2, s3)
    37  
    38  	for _, s := range servers {
    39  		testutil.WaitForResult(func() (bool, error) {
    40  			peers, _ := s.numPeers()
    41  			return peers == 3, nil
    42  		}, func(err error) {
    43  			t.Fatalf("should have 3 peers")
    44  		})
    45  	}
    46  
    47  	// Kill a non-leader server
    48  	var peer *Server
    49  	for _, s := range servers {
    50  		if !s.IsLeader() {
    51  			peer = s
    52  			break
    53  		}
    54  	}
    55  	if peer == nil {
    56  		t.Fatalf("Should have a non-leader")
    57  	}
    58  	peer.Shutdown()
    59  	name := fmt.Sprintf("%s.%s", peer.config.NodeName, peer.config.Region)
    60  
    61  	testutil.WaitForResult(func() (bool, error) {
    62  		for _, s := range servers {
    63  			if s == peer {
    64  				continue
    65  			}
    66  
    67  			// Force remove the non-leader (transition to left state)
    68  			if err := s.RemoveFailedNode(name); err != nil {
    69  				return false, err
    70  			}
    71  
    72  			peers, _ := s.numPeers()
    73  			return peers == 2, fmt.Errorf("expected 2 peers, got %d", peers)
    74  		}
    75  
    76  		return true, nil
    77  	}, func(err error) {
    78  		t.Fatalf("err: %s", err)
    79  	})
    80  }
    81  
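        // TestLeader_LeftLeader has the leader gracefully leave (Leave then Shutdown)
        // and verifies the remaining servers converge on two raft peers.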
    82  func TestLeader_LeftLeader(t *testing.T) {
    83  	s1 := TestServer(t, nil)
    84  	defer s1.Shutdown()
    85  
    86  	s2 := TestServer(t, func(c *Config) {
    87  		c.DevDisableBootstrap = true
    88  	})
    89  	defer s2.Shutdown()
    90  
    91  	s3 := TestServer(t, func(c *Config) {
    92  		c.DevDisableBootstrap = true
    93  	})
    94  	defer s3.Shutdown()
    95  	servers := []*Server{s1, s2, s3}
    96  	TestJoin(t, s1, s2, s3)
    97  
    98  	for _, s := range servers {
    99  		testutil.WaitForResult(func() (bool, error) {
   100  			peers, _ := s.numPeers()
   101  			return peers == 3, nil
   102  		}, func(err error) {
   103  			t.Fatalf("should have 3 peers")
   104  		})
   105  	}
   106  
   107  	// Have the leader gracefully leave, then shut it down
   108  	var leader *Server
   109  	for _, s := range servers {
   110  		if s.IsLeader() {
   111  			leader = s
   112  			break
   113  		}
   114  	}
   115  	if leader == nil {
   116  		t.Fatalf("Should have a leader")
   117  	}
   118  	leader.Leave()
   119  	leader.Shutdown()
   120  
   121  	for _, s := range servers {
   122  		if s == leader {
   123  			continue
   124  		}
   125  		testutil.WaitForResult(func() (bool, error) {
   126  			peers, _ := s.numPeers()
   127  			return peers == 2, fmt.Errorf("expected 2 peers, got %d", peers)
   128  		}, func(err error) {
   129  			t.Fatalf("should have 2 peers: %v", err)
   130  		})
   131  	}
   132  }
   133  
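        // TestLeader_MultiBootstrap joins two independently bootstrapped servers and
        // verifies they gossip with each other without merging their raft clusters.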
   134  func TestLeader_MultiBootstrap(t *testing.T) {
   135  	s1 := TestServer(t, nil)
   136  	defer s1.Shutdown()
   137  
   138  	s2 := TestServer(t, nil)
   139  	defer s2.Shutdown()
   140  	servers := []*Server{s1, s2}
   141  	TestJoin(t, s1, s2)
   142  
   143  	for _, s := range servers {
   144  		testutil.WaitForResult(func() (bool, error) {
   145  			peers := s.Members()
   146  			return len(peers) == 2, nil
   147  		}, func(err error) {
   148  			t.Fatalf("should have 2 peers")
   149  		})
   150  	}
   151  
   152  	// Ensure we don't have multiple raft peers
   153  	for _, s := range servers {
   154  		peers, _ := s.numPeers()
   155  		if peers != 1 {
   156  			t.Fatalf("should only have 1 raft peer!")
   157  		}
   158  	}
   159  }
   160  
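        // TestLeader_PlanQueue_Reset verifies the plan queue is enabled only on the
        // leader and is re-enabled on the new leader after a failover.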
   161  func TestLeader_PlanQueue_Reset(t *testing.T) {
   162  	s1 := TestServer(t, nil)
   163  	defer s1.Shutdown()
   164  
   165  	s2 := TestServer(t, func(c *Config) {
   166  		c.DevDisableBootstrap = true
   167  	})
   168  	defer s2.Shutdown()
   169  
   170  	s3 := TestServer(t, func(c *Config) {
   171  		c.DevDisableBootstrap = true
   172  	})
   173  	defer s3.Shutdown()
   174  	servers := []*Server{s1, s2, s3}
   175  	TestJoin(t, s1, s2, s3)
   176  
   177  	for _, s := range servers {
   178  		testutil.WaitForResult(func() (bool, error) {
   179  			peers, _ := s.numPeers()
   180  			return peers == 3, nil
   181  		}, func(err error) {
   182  			t.Fatalf("should have 3 peers")
   183  		})
   184  	}
   185  
   186  	var leader *Server
   187  	for _, s := range servers {
   188  		if s.IsLeader() {
   189  			leader = s
   190  			break
   191  		}
   192  	}
   193  	if leader == nil {
   194  		t.Fatalf("Should have a leader")
   195  	}
   196  
   197  	if !leader.planQueue.Enabled() {
   198  		t.Fatalf("should enable plan queue")
   199  	}
   200  
   201  	for _, s := range servers {
   202  		if !s.IsLeader() && s.planQueue.Enabled() {
   203  			t.Fatalf("plan queue should not be enabled")
   204  		}
   205  	}
   206  
   207  	// Kill the leader
   208  	leader.Shutdown()
   209  	time.Sleep(100 * time.Millisecond)
   210  
   211  	// Wait for a new leader
   212  	leader = nil
   213  	testutil.WaitForResult(func() (bool, error) {
   214  		for _, s := range servers {
   215  			if s.IsLeader() {
   216  				leader = s
   217  				return true, nil
   218  			}
   219  		}
   220  		return false, nil
   221  	}, func(err error) {
   222  		t.Fatalf("should have leader")
   223  	})
   224  
   225  	// Check that the new leader re-enabled the plan queue
   226  	testutil.WaitForResult(func() (bool, error) {
   227  		return leader.planQueue.Enabled(), nil
   228  	}, func(err error) {
   229  		t.Fatalf("should enable plan queue")
   230  	})
   231  }
   232  
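        // TestLeader_EvalBroker_Reset injects a pending evaluation, kills the leader,
        // and verifies the new leader restores the evaluation into its eval broker.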
   233  func TestLeader_EvalBroker_Reset(t *testing.T) {
   234  	s1 := TestServer(t, func(c *Config) {
   235  		c.NumSchedulers = 0
   236  	})
   237  	defer s1.Shutdown()
   238  
   239  	s2 := TestServer(t, func(c *Config) {
   240  		c.NumSchedulers = 0
   241  		c.DevDisableBootstrap = true
   242  	})
   243  	defer s2.Shutdown()
   244  
   245  	s3 := TestServer(t, func(c *Config) {
   246  		c.NumSchedulers = 0
   247  		c.DevDisableBootstrap = true
   248  	})
   249  	defer s3.Shutdown()
   250  	servers := []*Server{s1, s2, s3}
   251  	TestJoin(t, s1, s2, s3)
   252  	testutil.WaitForLeader(t, s1.RPC)
   253  
   254  	for _, s := range servers {
   255  		testutil.WaitForResult(func() (bool, error) {
   256  			peers, _ := s.numPeers()
   257  			return peers == 3, nil
   258  		}, func(err error) {
   259  			t.Fatalf("should have 3 peers")
   260  		})
   261  	}
   262  
   263  	var leader *Server
   264  	for _, s := range servers {
   265  		if s.IsLeader() {
   266  			leader = s
   267  			break
   268  		}
   269  	}
   270  	if leader == nil {
   271  		t.Fatalf("Should have a leader")
   272  	}
   273  
   274  	// Inject a pending eval
   275  	req := structs.EvalUpdateRequest{
   276  		Evals: []*structs.Evaluation{mock.Eval()},
   277  	}
   278  	_, _, err := leader.raftApply(structs.EvalUpdateRequestType, req)
   279  	if err != nil {
   280  		t.Fatalf("err: %v", err)
   281  	}
   282  
   283  	// Kill the leader
   284  	leader.Shutdown()
   285  	time.Sleep(100 * time.Millisecond)
   286  
   287  	// Wait for a new leader
   288  	leader = nil
   289  	testutil.WaitForResult(func() (bool, error) {
   290  		for _, s := range servers {
   291  			if s.IsLeader() {
   292  				leader = s
   293  				return true, nil
   294  			}
   295  		}
   296  		return false, nil
   297  	}, func(err error) {
   298  		t.Fatalf("should have leader")
   299  	})
   300  
   301  	// Check that the new leader has a pending evaluation
   302  	testutil.WaitForResult(func() (bool, error) {
   303  		stats := leader.evalBroker.Stats()
   304  		return stats.TotalReady == 1, nil
   305  	}, func(err error) {
   306  		t.Fatalf("should have pending evaluation")
   307  	})
   308  }
   309  
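        // TestLeader_PeriodicDispatcher_Restore_Adds verifies that after a failover the
        // new leader tracks periodic jobs but not non-periodic or parameterized jobs.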
   310  func TestLeader_PeriodicDispatcher_Restore_Adds(t *testing.T) {
   311  	s1 := TestServer(t, func(c *Config) {
   312  		c.NumSchedulers = 0
   313  	})
   314  	defer s1.Shutdown()
   315  
   316  	s2 := TestServer(t, func(c *Config) {
   317  		c.NumSchedulers = 0
   318  		c.DevDisableBootstrap = true
   319  	})
   320  	defer s2.Shutdown()
   321  
   322  	s3 := TestServer(t, func(c *Config) {
   323  		c.NumSchedulers = 0
   324  		c.DevDisableBootstrap = true
   325  	})
   326  	defer s3.Shutdown()
   327  	servers := []*Server{s1, s2, s3}
   328  	TestJoin(t, s1, s2, s3)
   329  	testutil.WaitForLeader(t, s1.RPC)
   330  
   331  	for _, s := range servers {
   332  		testutil.WaitForResult(func() (bool, error) {
   333  			peers, _ := s.numPeers()
   334  			return peers == 3, nil
   335  		}, func(err error) {
   336  			t.Fatalf("should have 3 peers")
   337  		})
   338  	}
   339  
   340  	var leader *Server
   341  	for _, s := range servers {
   342  		if s.IsLeader() {
   343  			leader = s
   344  			break
   345  		}
   346  	}
   347  	if leader == nil {
   348  		t.Fatalf("Should have a leader")
   349  	}
   350  
   351  	// Inject a periodic job, a parameterized periodic job and a non-periodic job
   352  	periodic := mock.PeriodicJob()
   353  	nonPeriodic := mock.Job()
   354  	parameterizedPeriodic := mock.PeriodicJob()
   355  	parameterizedPeriodic.ParameterizedJob = &structs.ParameterizedJobConfig{}
   356  	for _, job := range []*structs.Job{nonPeriodic, periodic, parameterizedPeriodic} {
   357  		req := structs.JobRegisterRequest{
   358  			Job: job,
   359  			WriteRequest: structs.WriteRequest{
   360  				Namespace: job.Namespace,
   361  			},
   362  		}
   363  		_, _, err := leader.raftApply(structs.JobRegisterRequestType, req)
   364  		if err != nil {
   365  			t.Fatalf("err: %v", err)
   366  		}
   367  	}
   368  
   369  	// Kill the leader
   370  	leader.Shutdown()
   371  	time.Sleep(100 * time.Millisecond)
   372  
   373  	// Wait for a new leader
   374  	leader = nil
   375  	testutil.WaitForResult(func() (bool, error) {
   376  		for _, s := range servers {
   377  			if s.IsLeader() {
   378  				leader = s
   379  				return true, nil
   380  			}
   381  		}
   382  		return false, nil
   383  	}, func(err error) {
   384  		t.Fatalf("should have leader")
   385  	})
   386  
   387  	tuplePeriodic := structs.NamespacedID{
   388  		ID:        periodic.ID,
   389  		Namespace: periodic.Namespace,
   390  	}
   391  	tupleNonPeriodic := structs.NamespacedID{
   392  		ID:        nonPeriodic.ID,
   393  		Namespace: nonPeriodic.Namespace,
   394  	}
   395  	tupleParameterized := structs.NamespacedID{
   396  		ID:        parameterizedPeriodic.ID,
   397  		Namespace: parameterizedPeriodic.Namespace,
   398  	}
   399  
   400  	// Check that the new leader is tracking the periodic job only
   401  	testutil.WaitForResult(func() (bool, error) {
   402  		if _, tracked := leader.periodicDispatcher.tracked[tuplePeriodic]; !tracked {
   403  			return false, fmt.Errorf("periodic job not tracked")
   404  		}
   405  		if _, tracked := leader.periodicDispatcher.tracked[tupleNonPeriodic]; tracked {
   406  			return false, fmt.Errorf("non periodic job tracked")
   407  		}
   408  		if _, tracked := leader.periodicDispatcher.tracked[tupleParameterized]; tracked {
   409  			return false, fmt.Errorf("parameterized periodic job tracked")
   410  		}
   411  		return true, nil
   412  	}, func(err error) {
   413  		t.Fatalf("%s", err)
   414  	})
   415  }
   416  
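        // TestLeader_PeriodicDispatcher_Restore_NoEvals registers a periodic job whose
        // launch time passes while the dispatcher is disabled, then verifies that
        // restoring the dispatcher tracks the job and forces a launch.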
   417  func TestLeader_PeriodicDispatcher_Restore_NoEvals(t *testing.T) {
   418  	s1 := TestServer(t, func(c *Config) {
   419  		c.NumSchedulers = 0
   420  	})
   421  	defer s1.Shutdown()
   422  	testutil.WaitForLeader(t, s1.RPC)
   423  
   424  	// Inject a periodic job that will be triggered soon.
   425  	launch := time.Now().Add(1 * time.Second)
   426  	job := testPeriodicJob(launch)
   427  	req := structs.JobRegisterRequest{
   428  		Job: job,
   429  		WriteRequest: structs.WriteRequest{
   430  			Namespace: job.Namespace,
   431  		},
   432  	}
   433  	_, _, err := s1.raftApply(structs.JobRegisterRequestType, req)
   434  	if err != nil {
   435  		t.Fatalf("err: %v", err)
   436  	}
   437  
   438  	// Flush the periodic dispatcher, ensuring that no evals will be created.
   439  	s1.periodicDispatcher.SetEnabled(false)
   440  
   441  	// Get the current time to ensure the launch time is after this once we
   442  	// restore.
   443  	now := time.Now()
   444  
   445  	// Sleep till after the job should have been launched.
   446  	time.Sleep(3 * time.Second)
   447  
   448  	// Restore the periodic dispatcher.
   449  	s1.periodicDispatcher.SetEnabled(true)
   450  	s1.restorePeriodicDispatcher()
   451  
   452  	// Ensure the job is tracked.
   453  	tuple := structs.NamespacedID{
   454  		ID:        job.ID,
   455  		Namespace: job.Namespace,
   456  	}
   457  	if _, tracked := s1.periodicDispatcher.tracked[tuple]; !tracked {
   458  		t.Fatalf("periodic job not restored")
   459  	}
   460  
   461  	// Check that the restore forced a new launch.
   462  	ws := memdb.NewWatchSet()
   463  	last, err := s1.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
   464  	if err != nil || last == nil {
   465  		t.Fatalf("failed to get periodic launch time: %v", err)
   466  	}
   467  
   468  	if last.Launch.Before(now) {
   469  		t.Fatalf("restorePeriodicDispatcher did not force launch: last %v; want after %v", last.Launch, now)
   470  	}
   471  }
   472  
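        // TestLeader_PeriodicDispatcher_Restore_Evals verifies that restoring the
        // dispatcher re-tracks a periodic job and launches it again even when an eval
        // already exists for a past launch.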
   473  func TestLeader_PeriodicDispatcher_Restore_Evals(t *testing.T) {
   474  	s1 := TestServer(t, func(c *Config) {
   475  		c.NumSchedulers = 0
   476  	})
   477  	defer s1.Shutdown()
   478  	testutil.WaitForLeader(t, s1.RPC)
   479  
   480  	// Inject a periodic job that triggered once in the past, should trigger
   481  	// now, and will trigger again in the future.
   482  	now := time.Now()
   483  	past := now.Add(-1 * time.Second)
   484  	future := now.Add(10 * time.Second)
   485  	job := testPeriodicJob(past, now, future)
   486  	req := structs.JobRegisterRequest{
   487  		Job: job,
   488  		WriteRequest: structs.WriteRequest{
   489  			Namespace: job.Namespace,
   490  		},
   491  	}
   492  	_, _, err := s1.raftApply(structs.JobRegisterRequestType, req)
   493  	if err != nil {
   494  		t.Fatalf("err: %v", err)
   495  	}
   496  
   497  	// Create an eval for the past launch.
   498  	s1.periodicDispatcher.createEval(job, past)
   499  
   500  	// Flush the periodic dispatcher, ensuring that no evals will be created.
   501  	s1.periodicDispatcher.SetEnabled(false)
   502  
   503  	// Sleep till after the job should have been launched.
   504  	time.Sleep(3 * time.Second)
   505  
   506  	// Restore the periodic dispatcher.
   507  	s1.periodicDispatcher.SetEnabled(true)
   508  	s1.restorePeriodicDispatcher()
   509  
   510  	// Ensure the job is tracked.
   511  	tuple := structs.NamespacedID{
   512  		ID:        job.ID,
   513  		Namespace: job.Namespace,
   514  	}
   515  	if _, tracked := s1.periodicDispatcher.tracked[tuple]; !tracked {
   516  		t.Fatalf("periodic job not restored")
   517  	}
   518  
   519  	// Check that the restore forced a new launch.
   520  	ws := memdb.NewWatchSet()
   521  	last, err := s1.fsm.State().PeriodicLaunchByID(ws, job.Namespace, job.ID)
   522  	if err != nil || last == nil {
   523  		t.Fatalf("failed to get periodic launch time: %v", err)
   524  	}
   525  	if last.Launch.Equal(past) {
   526  		t.Fatalf("restorePeriodicDispatcher did not force launch")
   527  	}
   528  }
   529  
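        // TestLeader_PeriodicDispatch verifies the leader's periodic dispatch loop
        // enqueues core scheduler evaluations (driven here by a short EvalGCInterval).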
   530  func TestLeader_PeriodicDispatch(t *testing.T) {
   531  	s1 := TestServer(t, func(c *Config) {
   532  		c.NumSchedulers = 0
   533  		c.EvalGCInterval = 5 * time.Millisecond
   534  	})
   535  	defer s1.Shutdown()
   536  
   537  	// Wait for a periodic dispatch
   538  	testutil.WaitForResult(func() (bool, error) {
   539  		stats := s1.evalBroker.Stats()
   540  		bySched, ok := stats.ByScheduler[structs.JobTypeCore]
   541  		if !ok {
   542  			return false, nil
   543  		}
   544  		return bySched.Ready > 0, nil
   545  	}, func(err error) {
   546  		t.Fatalf("should have a pending core job evaluation")
   547  	})
   548  }
   549  
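        // TestLeader_ReapFailedEval Nacks an evaluation past its delivery limit and
        // verifies the leader marks it failed and creates a pending follow-up eval.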
   550  func TestLeader_ReapFailedEval(t *testing.T) {
   551  	s1 := TestServer(t, func(c *Config) {
   552  		c.NumSchedulers = 0
   553  		c.EvalDeliveryLimit = 1
   554  	})
   555  	defer s1.Shutdown()
   556  	testutil.WaitForLeader(t, s1.RPC)
   557  
   558  	// Enqueue an evaluation
   559  	eval := mock.Eval()
   560  	s1.evalBroker.Enqueue(eval)
   561  
   562  	// Dequeue and Nack
   563  	out, token, err := s1.evalBroker.Dequeue(defaultSched, time.Second)
   564  	if err != nil {
   565  		t.Fatalf("err: %v", err)
   566  	}
   567  	s1.evalBroker.Nack(out.ID, token)
   568  
   569  	// Wait for an updated and followup evaluation
   570  	state := s1.fsm.State()
   571  	testutil.WaitForResult(func() (bool, error) {
   572  		ws := memdb.NewWatchSet()
   573  		out, err := state.EvalByID(ws, eval.ID)
   574  		if err != nil {
   575  			return false, err
   576  		}
   577  		if out == nil {
   578  			return false, fmt.Errorf("expect original evaluation to exist")
   579  		}
   580  		if out.Status != structs.EvalStatusFailed {
   581  			return false, fmt.Errorf("got status %v; want %v", out.Status, structs.EvalStatusFailed)
   582  		}
   583  
   584  		// See if there is a followup
   585  		evals, err := state.EvalsByJob(ws, eval.Namespace, eval.JobID)
   586  		if err != nil {
   587  			return false, err
   588  		}
   589  
   590  		if l := len(evals); l != 2 {
   591  			return false, fmt.Errorf("got %d evals, want 2", l)
   592  		}
   593  
   594  		for _, e := range evals {
   595  			if e.ID == eval.ID {
   596  				continue
   597  			}
   598  
   599  			if e.Status != structs.EvalStatusPending {
   600  				return false, fmt.Errorf("follow up eval has status %v; want %v",
   601  					e.Status, structs.EvalStatusPending)
   602  			}
   603  
   604  			if e.Wait < s1.config.EvalFailedFollowupBaselineDelay ||
   605  				e.Wait > s1.config.EvalFailedFollowupBaselineDelay+s1.config.EvalFailedFollowupDelayRange {
   606  				return false, fmt.Errorf("bad wait: %v", e.Wait)
   607  			}
   608  
   609  			if e.TriggeredBy != structs.EvalTriggerFailedFollowUp {
   610  				return false, fmt.Errorf("follow up eval TriggeredBy %v; want %v",
   611  					e.TriggeredBy, structs.EvalTriggerFailedFollowUp)
   612  			}
   613  		}
   614  
   615  		return true, nil
   616  	}, func(err error) {
   617  		t.Fatalf("err: %v", err)
   618  	})
   619  }
   620  
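        // TestLeader_ReapDuplicateEval blocks two evaluations for the same job and
        // verifies the leader cancels the duplicate.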
   621  func TestLeader_ReapDuplicateEval(t *testing.T) {
   622  	s1 := TestServer(t, func(c *Config) {
   623  		c.NumSchedulers = 0
   624  	})
   625  	defer s1.Shutdown()
   626  	testutil.WaitForLeader(t, s1.RPC)
   627  
   628  	// Create a duplicate blocked eval
   629  	eval := mock.Eval()
   630  	eval2 := mock.Eval()
   631  	eval2.JobID = eval.JobID
   632  	s1.blockedEvals.Block(eval)
   633  	s1.blockedEvals.Block(eval2)
   634  
   635  	// Wait for the duplicate evaluation to be marked as cancelled
   636  	state := s1.fsm.State()
   637  	testutil.WaitForResult(func() (bool, error) {
   638  		ws := memdb.NewWatchSet()
   639  		out, err := state.EvalByID(ws, eval2.ID)
   640  		if err != nil {
   641  			return false, err
   642  		}
   643  		return out != nil && out.Status == structs.EvalStatusCancelled, nil
   644  	}, func(err error) {
   645  		t.Fatalf("err: %v", err)
   646  	})
   647  }
   648  
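        // TestLeader_RestoreVaultAccessors verifies that restoreRevokingAccessors
        // revokes Vault accessors that are already present in state.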
   649  func TestLeader_RestoreVaultAccessors(t *testing.T) {
   650  	s1 := TestServer(t, func(c *Config) {
   651  		c.NumSchedulers = 0
   652  	})
   653  	defer s1.Shutdown()
   654  	testutil.WaitForLeader(t, s1.RPC)
   655  
   656  	// Insert a vault accessor that should be revoked
   657  	state := s1.fsm.State()
   658  	va := mock.VaultAccessor()
   659  	if err := state.UpsertVaultAccessor(100, []*structs.VaultAccessor{va}); err != nil {
   660  		t.Fatalf("bad: %v", err)
   661  	}
   662  
   663  	// Swap the Vault client
   664  	tvc := &TestVaultClient{}
   665  	s1.vault = tvc
   666  
   667  	// Do a restore
   668  	if err := s1.restoreRevokingAccessors(); err != nil {
   669  		t.Fatalf("Failed to restore: %v", err)
   670  	}
   671  
   672  	if len(tvc.RevokedTokens) != 1 || tvc.RevokedTokens[0].Accessor != va.Accessor {
   673  		t.Fatalf("Bad revoked accessors: %v", tvc.RevokedTokens)
   674  	}
   675  }
   676  
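        // TestLeader_ReplicateACLPolicies verifies that ACL policies written in the
        // authoritative region are replicated to a non-authoritative region.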
   677  func TestLeader_ReplicateACLPolicies(t *testing.T) {
   678  	t.Parallel()
   679  	s1, root := TestACLServer(t, func(c *Config) {
   680  		c.Region = "region1"
   681  		c.AuthoritativeRegion = "region1"
   682  		c.ACLEnabled = true
   683  	})
   684  	defer s1.Shutdown()
   685  	s2, _ := TestACLServer(t, func(c *Config) {
   686  		c.Region = "region2"
   687  		c.AuthoritativeRegion = "region1"
   688  		c.ACLEnabled = true
   689  		c.ReplicationBackoff = 20 * time.Millisecond
   690  		c.ReplicationToken = root.SecretID
   691  	})
   692  	defer s2.Shutdown()
   693  	TestJoin(t, s1, s2)
   694  	testutil.WaitForLeader(t, s1.RPC)
   695  	testutil.WaitForLeader(t, s2.RPC)
   696  
   697  	// Write a policy to the authoritative region
   698  	p1 := mock.ACLPolicy()
   699  	if err := s1.State().UpsertACLPolicies(100, []*structs.ACLPolicy{p1}); err != nil {
   700  		t.Fatalf("bad: %v", err)
   701  	}
   702  
   703  	// Wait for the policy to replicate
   704  	testutil.WaitForResult(func() (bool, error) {
   705  		state := s2.State()
   706  		out, err := state.ACLPolicyByName(nil, p1.Name)
   707  		return out != nil, err
   708  	}, func(err error) {
   709  		t.Fatalf("should replicate policy")
   710  	})
   711  }
   712  
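        // TestLeader_DiffACLPolicies verifies diffACLPolicies returns the local-only
        // policies to delete and the new or modified remote policies to update.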
   713  func TestLeader_DiffACLPolicies(t *testing.T) {
   714  	t.Parallel()
   715  
   716  	state := state.TestStateStore(t)
   717  
   718  	// Populate the local state
   719  	p1 := mock.ACLPolicy()
   720  	p2 := mock.ACLPolicy()
   721  	p3 := mock.ACLPolicy()
   722  	assert.Nil(t, state.UpsertACLPolicies(100, []*structs.ACLPolicy{p1, p2, p3}))
   723  
   724  	// Simulate a remote list
   725  	p2Stub := p2.Stub()
   726  	p2Stub.ModifyIndex = 50 // Ignored, same index
   727  	p3Stub := p3.Stub()
   728  	p3Stub.ModifyIndex = 100 // Updated, higher index
   729  	p3Stub.Hash = []byte{0, 1, 2, 3}
   730  	p4 := mock.ACLPolicy()
   731  	remoteList := []*structs.ACLPolicyListStub{
   732  		p2Stub,
   733  		p3Stub,
   734  		p4.Stub(),
   735  	}
   736  	delete, update := diffACLPolicies(state, 50, remoteList)
   737  
   738  	// P1 does not exist on the remote side, should delete
   739  	assert.Equal(t, []string{p1.Name}, delete)
   740  
   741  	// P2 is un-modified - ignore. P3 modified, P4 new.
   742  	assert.Equal(t, []string{p3.Name, p4.Name}, update)
   743  }
   744  
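        // TestLeader_ReplicateACLTokens verifies that global ACL tokens written in the
        // authoritative region are replicated to a non-authoritative region.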
   745  func TestLeader_ReplicateACLTokens(t *testing.T) {
   746  	t.Parallel()
   747  	s1, root := TestACLServer(t, func(c *Config) {
   748  		c.Region = "region1"
   749  		c.AuthoritativeRegion = "region1"
   750  		c.ACLEnabled = true
   751  	})
   752  	defer s1.Shutdown()
   753  	s2, _ := TestACLServer(t, func(c *Config) {
   754  		c.Region = "region2"
   755  		c.AuthoritativeRegion = "region1"
   756  		c.ACLEnabled = true
   757  		c.ReplicationBackoff = 20 * time.Millisecond
   758  		c.ReplicationToken = root.SecretID
   759  	})
   760  	defer s2.Shutdown()
   761  	TestJoin(t, s1, s2)
   762  	testutil.WaitForLeader(t, s1.RPC)
   763  	testutil.WaitForLeader(t, s2.RPC)
   764  
   765  	// Write a token to the authoritative region
   766  	p1 := mock.ACLToken()
   767  	p1.Global = true
   768  	if err := s1.State().UpsertACLTokens(100, []*structs.ACLToken{p1}); err != nil {
   769  		t.Fatalf("bad: %v", err)
   770  	}
   771  
   772  	// Wait for the token to replicate
   773  	testutil.WaitForResult(func() (bool, error) {
   774  		state := s2.State()
   775  		out, err := state.ACLTokenByAccessorID(nil, p1.AccessorID)
   776  		return out != nil, err
   777  	}, func(err error) {
   778  		t.Fatalf("should replicate token")
   779  	})
   780  }
   781  
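        // TestLeader_DiffACLTokens verifies diffACLTokens ignores local tokens and
        // returns the global tokens to delete and update.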
   782  func TestLeader_DiffACLTokens(t *testing.T) {
   783  	t.Parallel()
   784  
   785  	state := state.TestStateStore(t)
   786  
   787  	// Populate the local state
   788  	p0 := mock.ACLToken()
   789  	p1 := mock.ACLToken()
   790  	p1.Global = true
   791  	p2 := mock.ACLToken()
   792  	p2.Global = true
   793  	p3 := mock.ACLToken()
   794  	p3.Global = true
   795  	assert.Nil(t, state.UpsertACLTokens(100, []*structs.ACLToken{p0, p1, p2, p3}))
   796  
   797  	// Simulate a remote list
   798  	p2Stub := p2.Stub()
   799  	p2Stub.ModifyIndex = 50 // Ignored, same index
   800  	p3Stub := p3.Stub()
   801  	p3Stub.ModifyIndex = 100 // Updated, higher index
   802  	p3Stub.Hash = []byte{0, 1, 2, 3}
   803  	p4 := mock.ACLToken()
   804  	p4.Global = true
   805  	remoteList := []*structs.ACLTokenListStub{
   806  		p2Stub,
   807  		p3Stub,
   808  		p4.Stub(),
   809  	}
   810  	delete, update := diffACLTokens(state, 50, remoteList)
   811  
   812  	// P0 is local and should be ignored
   813  	// P1 does not exist on the remote side, should delete
   814  	assert.Equal(t, []string{p1.AccessorID}, delete)
   815  
   816  	// P2 is un-modified - ignore. P3 modified, P4 new.
   817  	assert.Equal(t, []string{p3.AccessorID, p4.AccessorID}, update)
   818  }
   819  
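        // TestLeader_UpgradeRaftVersion removes a raft protocol v1 server, replaces it
        // with a v3 server, and verifies the new server joins raft addressed by ID.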
   820  func TestLeader_UpgradeRaftVersion(t *testing.T) {
   821  	t.Parallel()
   822  	s1 := TestServer(t, func(c *Config) {
   823  		c.Datacenter = "dc1"
   824  		c.RaftConfig.ProtocolVersion = 2
   825  	})
   826  	defer s1.Shutdown()
   827  
   828  	s2 := TestServer(t, func(c *Config) {
   829  		c.DevDisableBootstrap = true
   830  		c.RaftConfig.ProtocolVersion = 1
   831  	})
   832  	defer s2.Shutdown()
   833  
   834  	s3 := TestServer(t, func(c *Config) {
   835  		c.DevDisableBootstrap = true
   836  		c.RaftConfig.ProtocolVersion = 2
   837  	})
   838  	defer s3.Shutdown()
   839  
   840  	servers := []*Server{s1, s2, s3}
   841  
   842  	// Try to join
   843  	TestJoin(t, s1, s2, s3)
   844  
   845  	for _, s := range servers {
   846  		testutil.WaitForResult(func() (bool, error) {
   847  			peers, _ := s.numPeers()
   848  			return peers == 3, nil
   849  		}, func(err error) {
   850  			t.Fatalf("should have 3 peers")
   851  		})
   852  	}
   853  
   854  	// Have the v1 server gracefully leave the cluster
   855  	if err := s2.Leave(); err != nil {
   856  		t.Fatal(err)
   857  	}
   858  
   859  	for _, s := range []*Server{s1, s3} {
   860  		minVer, err := s.autopilot.MinRaftProtocol()
   861  		if err != nil {
   862  			t.Fatal(err)
   863  		}
   864  		if got, want := minVer, 2; got != want {
   865  			t.Fatalf("got min raft version %d want %d", got, want)
   866  		}
   867  	}
   868  
   869  	// Replace the dead server with one running raft protocol v3
   870  	s4 := TestServer(t, func(c *Config) {
   871  		c.DevDisableBootstrap = true
   872  		c.Datacenter = "dc1"
   873  		c.RaftConfig.ProtocolVersion = 3
   874  	})
   875  	defer s4.Shutdown()
   876  	TestJoin(t, s1, s4)
   877  	servers[1] = s4
   878  
   879  	// Make sure we're back to 3 total peers with the new one added via ID
   880  	for _, s := range servers {
   881  		testutil.WaitForResult(func() (bool, error) {
   882  			addrs := 0
   883  			ids := 0
   884  			future := s.raft.GetConfiguration()
   885  			if err := future.Error(); err != nil {
   886  				return false, err
   887  			}
   888  			for _, server := range future.Configuration().Servers {
   889  				if string(server.ID) == string(server.Address) {
   890  					addrs++
   891  				} else {
   892  					ids++
   893  				}
   894  			}
   895  			if got, want := addrs, 2; got != want {
   896  				return false, fmt.Errorf("got %d server addresses want %d", got, want)
   897  			}
   898  			if got, want := ids, 1; got != want {
   899  				return false, fmt.Errorf("got %d server ids want %d", got, want)
   900  			}
   901  
   902  			return true, nil
   903  		}, func(err error) {
   904  			t.Fatal(err)
   905  		})
   906  	}
   907  }
   908  
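        // TestLeader_Reelection exercises leader failover across raft protocol
        // versions 1 through 3.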
   909  func TestLeader_Reelection(t *testing.T) {
   910  	raftProtocols := []int{1, 2, 3}
   911  	for _, p := range raftProtocols {
   912  		t.Run(fmt.Sprintf("Leader Election - Protocol version %d", p), func(t *testing.T) {
   913  			leaderElectionTest(t, raft.ProtocolVersion(p))
   914  		})
   915  	}
   916  
   917  }
   918  
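        // leaderElectionTest forms a three server cluster on the given raft protocol,
        // shuts down the leader, and waits for a new leader to be elected.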
   919  func leaderElectionTest(t *testing.T, raftProtocol raft.ProtocolVersion) {
   920  	s1 := TestServer(t, func(c *Config) {
   921  		c.BootstrapExpect = 3
   922  		c.RaftConfig.ProtocolVersion = raftProtocol
   923  	})
   924  	defer s1.Shutdown()
   925  
   926  	s2 := TestServer(t, func(c *Config) {
   927  		c.BootstrapExpect = 3
   928  		c.DevDisableBootstrap = true
   929  		c.RaftConfig.ProtocolVersion = raftProtocol
   930  	})
   931  	defer s2.Shutdown()
   932  
   933  	s3 := TestServer(t, func(c *Config) {
   934  		c.BootstrapExpect = 3
   935  		c.DevDisableBootstrap = true
   936  		c.RaftConfig.ProtocolVersion = raftProtocol
   937  	})
        	defer s3.Shutdown()
   938  
   939  	servers := []*Server{s1, s2, s3}
   940  
   941  	// Try to join
   942  	TestJoin(t, s1, s2, s3)
   943  	testutil.WaitForLeader(t, s1.RPC)
   944  
   945  	testutil.WaitForResult(func() (bool, error) {
   946  		future := s1.raft.GetConfiguration()
   947  		if err := future.Error(); err != nil {
   948  			return false, err
   949  		}
   950  
   951  		for _, server := range future.Configuration().Servers {
   952  			if server.Suffrage == raft.Nonvoter {
   953  				return false, fmt.Errorf("non-voter %v", server)
   954  			}
   955  		}
   956  
   957  		return true, nil
   958  	}, func(err error) {
   959  		t.Fatal(err)
   960  	})
   961  
   962  	var leader, nonLeader *Server
   963  	for _, s := range servers {
   964  		if s.IsLeader() {
   965  			leader = s
   966  		} else {
   967  			nonLeader = s
   968  		}
   969  	}
   970  
   971  	// Shutdown the leader
   972  	leader.Shutdown()
   973  	// Wait for new leader to elect
   974  	testutil.WaitForLeader(t, nonLeader.RPC)
   975  }
   976  
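        // TestLeader_RollRaftServer replaces raft protocol v2 servers one at a time
        // with v3 servers and verifies the cluster ends up with three ID-addressed
        // raft peers.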
   977  func TestLeader_RollRaftServer(t *testing.T) {
   978  	t.Parallel()
   979  	s1 := TestServer(t, func(c *Config) {
   980  		c.RaftConfig.ProtocolVersion = 2
   981  	})
   982  	defer s1.Shutdown()
   983  
   984  	s2 := TestServer(t, func(c *Config) {
   985  		c.DevDisableBootstrap = true
   986  		c.RaftConfig.ProtocolVersion = 2
   987  	})
   988  	defer s2.Shutdown()
   989  
   990  	s3 := TestServer(t, func(c *Config) {
   991  		c.DevDisableBootstrap = true
   992  		c.RaftConfig.ProtocolVersion = 2
   993  	})
   994  	defer s3.Shutdown()
   995  
   996  	servers := []*Server{s1, s2, s3}
   997  
   998  	// Try to join
   999  	TestJoin(t, s1, s2, s3)
  1000  
  1001  	for _, s := range servers {
  1002  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
  1003  	}
  1004  
  1005  	// Kill the first v2 server
  1006  	s1.Shutdown()
  1007  
  1008  	for _, s := range []*Server{s2, s3} {
  1009  		retry.Run(t, func(r *retry.R) {
  1010  			minVer, err := s.autopilot.MinRaftProtocol()
  1011  			if err != nil {
  1012  				r.Fatal(err)
  1013  			}
  1014  			if got, want := minVer, 2; got != want {
  1015  				r.Fatalf("got min raft version %d want %d", got, want)
  1016  			}
  1017  		})
  1018  	}
  1019  
  1020  	// Replace the dead server with one running raft protocol v3
  1021  	s4 := TestServer(t, func(c *Config) {
  1022  		c.DevDisableBootstrap = true
  1023  		c.RaftConfig.ProtocolVersion = 3
  1024  	})
  1025  	defer s4.Shutdown()
  1026  	TestJoin(t, s4, s2)
  1027  	servers[0] = s4
  1028  
  1029  	// Kill the second v2 server
  1030  	s2.Shutdown()
  1031  
  1032  	for _, s := range []*Server{s3, s4} {
  1033  		retry.Run(t, func(r *retry.R) {
  1034  			minVer, err := s.autopilot.MinRaftProtocol()
  1035  			if err != nil {
  1036  				r.Fatal(err)
  1037  			}
  1038  			if got, want := minVer, 2; got != want {
  1039  				r.Fatalf("got min raft version %d want %d", got, want)
  1040  			}
  1041  		})
  1042  	}
  1043  	// Replace another dead server with one running raft protocol v3
  1044  	s5 := TestServer(t, func(c *Config) {
  1045  		c.DevDisableBootstrap = true
  1046  		c.RaftConfig.ProtocolVersion = 3
  1047  	})
  1048  	defer s5.Shutdown()
  1049  	TestJoin(t, s5, s4)
  1050  	servers[1] = s5
  1051  
  1052  	// Kill the last v2 server, now minRaftProtocol should be 3
  1053  	s3.Shutdown()
  1054  
  1055  	for _, s := range []*Server{s4, s5} {
  1056  		retry.Run(t, func(r *retry.R) {
  1057  			minVer, err := s.autopilot.MinRaftProtocol()
  1058  			if err != nil {
  1059  				r.Fatal(err)
  1060  			}
  1061  			if got, want := minVer, 3; got != want {
  1062  				r.Fatalf("got min raft version %d want %d", got, want)
  1063  			}
  1064  		})
  1065  	}
  1066  
  1067  	// Replace the last dead server with one running raft protocol v3
  1068  	s6 := TestServer(t, func(c *Config) {
  1069  		c.DevDisableBootstrap = true
  1070  		c.RaftConfig.ProtocolVersion = 3
  1071  	})
  1072  	defer s6.Shutdown()
  1073  	TestJoin(t, s6, s4)
  1074  	servers[2] = s6
  1075  
  1076  	// Make sure all the dead servers are removed and we're back to 3 total peers
  1077  	for _, s := range servers {
  1078  		retry.Run(t, func(r *retry.R) {
  1079  			addrs := 0
  1080  			ids := 0
  1081  			future := s.raft.GetConfiguration()
  1082  			if err := future.Error(); err != nil {
  1083  				r.Fatal(err)
  1084  			}
  1085  			for _, server := range future.Configuration().Servers {
  1086  				if string(server.ID) == string(server.Address) {
  1087  					addrs++
  1088  				} else {
  1089  					ids++
  1090  				}
  1091  			}
  1092  			if got, want := addrs, 0; got != want {
  1093  				r.Fatalf("got %d server addresses want %d", got, want)
  1094  			}
  1095  			if got, want := ids, 3; got != want {
  1096  				r.Fatalf("got %d server ids want %d", got, want)
  1097  			}
  1098  		})
  1099  	}
  1100  }
  1101  
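        // TestLeader_RevokeLeadership_MultipleTimes verifies revokeLeadership is
        // idempotent and can be called repeatedly without error.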
  1102  func TestLeader_RevokeLeadership_MultipleTimes(t *testing.T) {
  1103  	s1 := TestServer(t, nil)
  1104  	defer s1.Shutdown()
  1105  	testutil.WaitForLeader(t, s1.RPC)
  1106  
  1107  	testutil.WaitForResult(func() (bool, error) {
  1108  		return s1.evalBroker.Enabled(), nil
  1109  	}, func(err error) {
  1110  		t.Fatalf("should have finished establish leader loop")
  1111  	})
  1112  
  1113  	require.Nil(t, s1.revokeLeadership())
  1114  	require.Nil(t, s1.revokeLeadership())
  1115  	require.Nil(t, s1.revokeLeadership())
  1116  }
  1117  
  1118  // Test doing an inplace upgrade on a server from raft protocol 2 to 3
  1119  // This verifies that removing the server and adding it back with a uuid works
  1120  // even if the server's address stays the same.
  1121  func TestServer_ReconcileMember(t *testing.T) {
  1122  	// Create a three node cluster
  1123  	t.Parallel()
  1124  	s1 := TestServer(t, func(c *Config) {
  1125  		c.DevDisableBootstrap = true
  1126  		c.RaftConfig.ProtocolVersion = 3
  1127  	})
  1128  	defer s1.Shutdown()
  1129  
  1130  	s2 := TestServer(t, func(c *Config) {
  1131  		c.DevDisableBootstrap = true
  1132  		c.RaftConfig.ProtocolVersion = 3
  1133  	})
  1134  	defer s2.Shutdown()
  1135  
  1136  	s3 := TestServer(t, func(c *Config) {
  1137  		c.DevDisableBootstrap = true
  1138  		c.RaftConfig.ProtocolVersion = 2
  1139  	})
  1140  	defer s3.Shutdown()
  1141  	TestJoin(t, s1, s2, s3)
  1142  	testutil.WaitForLeader(t, s1.RPC)
  1143  
  1144  	// Create a memberlist object for s3, with raft protocol upgraded to 3
  1145  	upgradedS3Member := serf.Member{
  1146  		Name:   s3.config.NodeName,
  1147  		Addr:   s3.config.RPCAddr.IP,
  1148  		Status: serf.StatusAlive,
  1149  		Tags:   make(map[string]string),
  1150  	}
  1151  	upgradedS3Member.Tags["role"] = "nomad"
  1152  	upgradedS3Member.Tags["id"] = s3.config.NodeID
  1153  	upgradedS3Member.Tags["region"] = s3.config.Region
  1154  	upgradedS3Member.Tags["dc"] = s3.config.Datacenter
  1155  	upgradedS3Member.Tags["rpc_addr"] = "127.0.0.1"
  1156  	upgradedS3Member.Tags["port"] = strconv.Itoa(s3.config.RPCAddr.Port)
  1157  	upgradedS3Member.Tags["build"] = "0.8.0"
  1158  	upgradedS3Member.Tags["vsn"] = "2"
  1159  	upgradedS3Member.Tags["mvn"] = "1"
  1160  	upgradedS3Member.Tags["raft_vsn"] = "3"
  1161  
  1162  	// Find the leader so that we can call reconcile member on it
  1163  	var leader *Server
  1164  	for _, s := range []*Server{s1, s2, s3} {
  1165  		if s.IsLeader() {
  1166  			leader = s
  1167  		}
  1168  	}
  1169  	leader.reconcileMember(upgradedS3Member)
  1170  	// This should remove s3 from the config and potentially cause a leader election
  1171  	testutil.WaitForLeader(t, s1.RPC)
  1172  
  1173  	// Figure out the new leader and call reconcile again, this should add s3 with the new ID format
  1174  	for _, s := range []*Server{s1, s2, s3} {
  1175  		if s.IsLeader() {
  1176  			leader = s
  1177  		}
  1178  	}
  1179  	leader.reconcileMember(upgradedS3Member)
  1180  	testutil.WaitForLeader(t, s1.RPC)
  1181  	future := s2.raft.GetConfiguration()
  1182  	if err := future.Error(); err != nil {
  1183  		t.Fatal(err)
  1184  	}
  1185  	addrs := 0
  1186  	ids := 0
  1187  	for _, server := range future.Configuration().Servers {
  1188  		if string(server.ID) == string(server.Address) {
  1189  			addrs++
  1190  		} else {
  1191  			ids++
  1192  		}
  1193  	}
  1194  	// After this, all three servers should have IDs in raft
  1195  	if got, want := addrs, 0; got != want {
  1196  		t.Fatalf("got %d server addresses want %d", got, want)
  1197  	}
  1198  	if got, want := ids, 3; got != want {
  1199  		t.Fatalf("got %d server ids want %d", got, want)
  1200  	}
  1201  }