github.com/clly/consul@v1.4.5/agent/consul/leader_test.go

     1  package consul
     2  
     3  import (
     4  	"os"
     5  	"reflect"
     6  	"testing"
     7  	"time"
     8  
     9  	"github.com/hashicorp/consul/agent/connect"
    10  	"github.com/hashicorp/consul/agent/structs"
    11  	"github.com/hashicorp/consul/api"
    12  	"github.com/hashicorp/consul/testrpc"
    13  	"github.com/hashicorp/consul/testutil/retry"
    14  	"github.com/hashicorp/net-rpc-msgpackrpc"
    15  	"github.com/hashicorp/serf/serf"
    16  	"github.com/stretchr/testify/require"
    17  )
    18  
    19  func TestLeader_RegisterMember(t *testing.T) {
    20  	t.Parallel()
    21  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
    22  		c.ACLDatacenter = "dc1"
    23  		c.ACLsEnabled = true
    24  		c.ACLMasterToken = "root"
    25  		c.ACLDefaultPolicy = "deny"
    26  		c.ACLEnforceVersion8 = true
    27  	})
    28  	defer os.RemoveAll(dir1)
    29  	defer s1.Shutdown()
    30  
    31  	dir2, c1 := testClient(t)
    32  	defer os.RemoveAll(dir2)
    33  	defer c1.Shutdown()
    34  
    35  	// Try to join
    36  	joinLAN(t, c1, s1)
    37  
    38  	testrpc.WaitForLeader(t, s1.RPC, "dc1")
    39  
    40  	// Client should be registered
    41  	state := s1.fsm.State()
    42  	retry.Run(t, func(r *retry.R) {
    43  		_, node, err := state.GetNode(c1.config.NodeName)
    44  		if err != nil {
    45  			r.Fatalf("err: %v", err)
    46  		}
    47  		if node == nil {
    48  			r.Fatal("client not registered")
    49  		}
    50  	})
    51  
    52  	// Should have a check
    53  	_, checks, err := state.NodeChecks(nil, c1.config.NodeName)
    54  	if err != nil {
    55  		t.Fatalf("err: %v", err)
    56  	}
    57  	if len(checks) != 1 {
    58  		t.Fatalf("client missing check")
    59  	}
    60  	if checks[0].CheckID != structs.SerfCheckID {
    61  		t.Fatalf("bad check: %v", checks[0])
    62  	}
    63  	if checks[0].Name != structs.SerfCheckName {
    64  		t.Fatalf("bad check: %v", checks[0])
    65  	}
    66  	if checks[0].Status != api.HealthPassing {
    67  		t.Fatalf("bad check: %v", checks[0])
    68  	}
    69  
    70  	// Server should be registered
    71  	_, node, err := state.GetNode(s1.config.NodeName)
    72  	if err != nil {
    73  		t.Fatalf("err: %v", err)
    74  	}
    75  	if node == nil {
    76  		t.Fatalf("server not registered")
    77  	}
    78  
    79  	// Service should be registered
    80  	_, services, err := state.NodeServices(nil, s1.config.NodeName)
    81  	if err != nil {
    82  		t.Fatalf("err: %v", err)
    83  	}
    84  	if _, ok := services.Services["consul"]; !ok {
    85  		t.Fatalf("consul service not registered: %v", services)
    86  	}
    87  }
    88  
    89  func TestLeader_FailedMember(t *testing.T) {
    90  	t.Parallel()
    91  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
    92  		c.ACLDatacenter = "dc1"
    93  		c.ACLsEnabled = true
    94  		c.ACLMasterToken = "root"
    95  		c.ACLDefaultPolicy = "deny"
    96  		c.ACLEnforceVersion8 = true
    97  	})
    98  	defer os.RemoveAll(dir1)
    99  	defer s1.Shutdown()
   100  
   101  	dir2, c1 := testClient(t)
   102  	defer os.RemoveAll(dir2)
   103  	defer c1.Shutdown()
   104  
   105  	testrpc.WaitForLeader(t, s1.RPC, "dc1")
   106  
   107  	// Try to join
   108  	joinLAN(t, c1, s1)
   109  
   110  	// Fail the member
   111  	c1.Shutdown()
   112  
   113  	// Should be registered
   114  	state := s1.fsm.State()
   115  	retry.Run(t, func(r *retry.R) {
   116  		_, node, err := state.GetNode(c1.config.NodeName)
   117  		if err != nil {
   118  			r.Fatalf("err: %v", err)
   119  		}
   120  		if node == nil {
   121  			r.Fatal("client not registered")
   122  		}
   123  	})
   124  
   125  	// Should have a check
   126  	_, checks, err := state.NodeChecks(nil, c1.config.NodeName)
   127  	if err != nil {
   128  		t.Fatalf("err: %v", err)
   129  	}
   130  	if len(checks) != 1 {
   131  		t.Fatalf("client missing check")
   132  	}
   133  	if checks[0].CheckID != structs.SerfCheckID {
   134  		t.Fatalf("bad check: %v", checks[0])
   135  	}
   136  	if checks[0].Name != structs.SerfCheckName {
   137  		t.Fatalf("bad check: %v", checks[0])
   138  	}
   139  
   140  	retry.Run(t, func(r *retry.R) {
   141  		_, checks, err = state.NodeChecks(nil, c1.config.NodeName)
   142  		if err != nil {
   143  			r.Fatalf("err: %v", err)
   144  		}
   145  		if got, want := checks[0].Status, api.HealthCritical; got != want {
   146  			r.Fatalf("got status %q want %q", got, want)
   147  		}
   148  	})
   149  }
   150  
   151  func TestLeader_LeftMember(t *testing.T) {
   152  	t.Parallel()
   153  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
   154  		c.ACLDatacenter = "dc1"
   155  		c.ACLsEnabled = true
   156  		c.ACLMasterToken = "root"
   157  		c.ACLDefaultPolicy = "deny"
   158  		c.ACLEnforceVersion8 = true
   159  	})
   160  	defer os.RemoveAll(dir1)
   161  	defer s1.Shutdown()
   162  
   163  	dir2, c1 := testClient(t)
   164  	defer os.RemoveAll(dir2)
   165  	defer c1.Shutdown()
   166  
   167  	// Try to join
   168  	joinLAN(t, c1, s1)
   169  
   170  	state := s1.fsm.State()
   171  
   172  	// Should be registered
   173  	retry.Run(t, func(r *retry.R) {
   174  		_, node, err := state.GetNode(c1.config.NodeName)
   175  		if err != nil {
   176  			r.Fatalf("err: %v", err)
   177  		}
   178  		if node == nil {
   179  			r.Fatal("client not registered")
   180  		}
   181  	})
   182  
   183  	// Node should leave
   184  	c1.Leave()
   185  	c1.Shutdown()
   186  
   187  	// Should be deregistered
   188  	retry.Run(t, func(r *retry.R) {
   189  		_, node, err := state.GetNode(c1.config.NodeName)
   190  		if err != nil {
   191  			r.Fatalf("err: %v", err)
   192  		}
   193  		if node != nil {
   194  			r.Fatal("client still registered")
   195  		}
   196  	})
   197  }
   198  func TestLeader_ReapMember(t *testing.T) {
   199  	t.Parallel()
   200  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
   201  		c.ACLDatacenter = "dc1"
   202  		c.ACLsEnabled = true
   203  		c.ACLMasterToken = "root"
   204  		c.ACLDefaultPolicy = "deny"
   205  		c.ACLEnforceVersion8 = true
   206  	})
   207  	defer os.RemoveAll(dir1)
   208  	defer s1.Shutdown()
   209  
   210  	dir2, c1 := testClient(t)
   211  	defer os.RemoveAll(dir2)
   212  	defer c1.Shutdown()
   213  
   214  	// Try to join
   215  	joinLAN(t, c1, s1)
   216  
   217  	state := s1.fsm.State()
   218  
   219  	// Should be registered
   220  	retry.Run(t, func(r *retry.R) {
   221  		_, node, err := state.GetNode(c1.config.NodeName)
   222  		if err != nil {
   223  			r.Fatalf("err: %v", err)
   224  		}
   225  		if node == nil {
   226  			r.Fatal("client not registered")
   227  		}
   228  	})
   229  
   230  	// Simulate a node reaping
   231  	mems := s1.LANMembers()
   232  	var c1mem serf.Member
   233  	for _, m := range mems {
   234  		if m.Name == c1.config.NodeName {
   235  			c1mem = m
   236  			c1mem.Status = StatusReap
   237  			break
   238  		}
   239  	}
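        	// Feed the synthetic reaped member to the leader's reconcile loop so it gets deregistered.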
   240  	s1.reconcileCh <- c1mem
   241  
   242  	// Should be deregistered; we have to poll quickly here because
   243  	// anti-entropy will put it back.
   244  	reaped := false
   245  	for start := time.Now(); time.Since(start) < 5*time.Second; {
   246  		_, node, err := state.GetNode(c1.config.NodeName)
   247  		if err != nil {
   248  			t.Fatalf("err: %v", err)
   249  		}
   250  		if node == nil {
   251  			reaped = true
   252  			break
   253  		}
   254  	}
   255  	if !reaped {
   256  		t.Fatalf("client should not be registered")
   257  	}
   258  }
   259  
   260  func TestLeader_ReapServer(t *testing.T) {
   261  	t.Parallel()
   262  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
   263  		c.ACLDatacenter = "dc1"
   264  		c.ACLsEnabled = true
   265  		c.ACLMasterToken = "root"
   266  		c.ACLDefaultPolicy = "allow"
   267  		c.ACLEnforceVersion8 = true
   268  		c.Bootstrap = true
   269  	})
   270  	defer os.RemoveAll(dir1)
   271  	defer s1.Shutdown()
   272  
   273  	dir2, s2 := testServerWithConfig(t, func(c *Config) {
   274  		c.ACLDatacenter = "dc1"
   275  		c.ACLsEnabled = true
   276  		c.ACLMasterToken = "root"
   277  		c.ACLDefaultPolicy = "allow"
   278  		c.ACLEnforceVersion8 = true
   279  		c.Bootstrap = false
   280  	})
   281  	defer os.RemoveAll(dir2)
   282  	defer s2.Shutdown()
   283  
   284  	dir3, s3 := testServerWithConfig(t, func(c *Config) {
   285  		c.ACLDatacenter = "dc1"
   286  		c.ACLsEnabled = true
   287  		c.ACLMasterToken = "root"
   288  		c.ACLDefaultPolicy = "allow"
   289  		c.ACLEnforceVersion8 = true
   290  		c.Bootstrap = false
   291  	})
   292  	defer os.RemoveAll(dir3)
   293  	defer s3.Shutdown()
   294  
   295  	// Try to join
   296  	joinLAN(t, s1, s2)
   297  	joinLAN(t, s1, s3)
   298  
   299  	testrpc.WaitForLeader(t, s1.RPC, "dc1")
   300  	testrpc.WaitForLeader(t, s2.RPC, "dc1")
   301  	testrpc.WaitForLeader(t, s3.RPC, "dc1")
   302  	state := s1.fsm.State()
   303  
   304  	// s3 should be registered
   305  	retry.Run(t, func(r *retry.R) {
   306  		_, node, err := state.GetNode(s3.config.NodeName)
   307  		if err != nil {
   308  			r.Fatalf("err: %v", err)
   309  		}
   310  		if node == nil {
   311  			r.Fatal("server not registered")
   312  		}
   313  	})
   314  
   315  	// Call reconcileReaped with a map that does not contain s3
   316  	knownMembers := make(map[string]struct{})
   317  	knownMembers[s1.config.NodeName] = struct{}{}
   318  	knownMembers[s2.config.NodeName] = struct{}{}
   319  
   320  	err := s1.reconcileReaped(knownMembers)
   321  
   322  	if err != nil {
   323  		t.Fatalf("unexpected error: %v", err)
   324  	}
   325  	// s3 should be deregistered
   326  	retry.Run(t, func(r *retry.R) {
   327  		_, node, err := state.GetNode(s3.config.NodeName)
   328  		if err != nil {
   329  			r.Fatalf("err: %v", err)
   330  		}
   331  		if node != nil {
   332  			r.Fatalf("server with id %v should not be registered", s3.config.NodeID)
   333  		}
   334  	})
   335  
   336  }
   337  
   338  func TestLeader_Reconcile_ReapMember(t *testing.T) {
   339  	t.Parallel()
   340  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
   341  		c.ACLDatacenter = "dc1"
   342  		c.ACLsEnabled = true
   343  		c.ACLMasterToken = "root"
   344  		c.ACLDefaultPolicy = "deny"
   345  		c.ACLEnforceVersion8 = true
   346  	})
   347  	defer os.RemoveAll(dir1)
   348  	defer s1.Shutdown()
   349  
   350  	testrpc.WaitForLeader(t, s1.RPC, "dc1")
   351  
   352  	// Register a member that no longer exists
   353  	dead := structs.RegisterRequest{
   354  		Datacenter: s1.config.Datacenter,
   355  		Node:       "no-longer-around",
   356  		Address:    "127.1.1.1",
   357  		Check: &structs.HealthCheck{
   358  			Node:    "no-longer-around",
   359  			CheckID: structs.SerfCheckID,
   360  			Name:    structs.SerfCheckName,
   361  			Status:  api.HealthCritical,
   362  		},
   363  		WriteRequest: structs.WriteRequest{
   364  			Token: "root",
   365  		},
   366  	}
   367  	var out struct{}
   368  	if err := s1.RPC("Catalog.Register", &dead, &out); err != nil {
   369  		t.Fatalf("err: %v", err)
   370  	}
   371  
   372  	// Force a reconciliation
   373  	if err := s1.reconcile(); err != nil {
   374  		t.Fatalf("err: %v", err)
   375  	}
   376  
   377  	// Node should be gone
   378  	state := s1.fsm.State()
   379  	_, node, err := state.GetNode("no-longer-around")
   380  	if err != nil {
   381  		t.Fatalf("err: %v", err)
   382  	}
   383  	if node != nil {
   384  		t.Fatalf("client registered")
   385  	}
   386  }
   387  
   388  func TestLeader_Reconcile(t *testing.T) {
   389  	t.Parallel()
   390  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
   391  		c.ACLDatacenter = "dc1"
   392  		c.ACLsEnabled = true
   393  		c.ACLMasterToken = "root"
   394  		c.ACLDefaultPolicy = "deny"
   395  		c.ACLEnforceVersion8 = true
   396  	})
   397  	defer os.RemoveAll(dir1)
   398  	defer s1.Shutdown()
   399  
   400  	dir2, c1 := testClient(t)
   401  	defer os.RemoveAll(dir2)
   402  	defer c1.Shutdown()
   403  
   404  	// Join before we have a leader; this should cause a reconcile!
   405  	joinLAN(t, c1, s1)
   406  
   407  	// Should not be registered
   408  	state := s1.fsm.State()
   409  	_, node, err := state.GetNode(c1.config.NodeName)
   410  	if err != nil {
   411  		t.Fatalf("err: %v", err)
   412  	}
   413  	if node != nil {
   414  		t.Fatalf("client registered")
   415  	}
   416  
   417  	// Should be registered
   418  	retry.Run(t, func(r *retry.R) {
   419  		_, node, err := state.GetNode(c1.config.NodeName)
   420  		if err != nil {
   421  			r.Fatalf("err: %v", err)
   422  		}
   423  		if node == nil {
   424  			r.Fatal("client not registered")
   425  		}
   426  	})
   427  }
   428  
   429  func TestLeader_Reconcile_Races(t *testing.T) {
   430  	t.Parallel()
   431  	dir1, s1 := testServer(t)
   432  	defer os.RemoveAll(dir1)
   433  	defer s1.Shutdown()
   434  
   435  	testrpc.WaitForLeader(t, s1.RPC, "dc1")
   436  
   437  	dir2, c1 := testClient(t)
   438  	defer os.RemoveAll(dir2)
   439  	defer c1.Shutdown()
   440  
   441  	joinLAN(t, c1, s1)
   442  
   443  	// Wait for the server to reconcile the client and register it.
   444  	state := s1.fsm.State()
   445  	var nodeAddr string
   446  	retry.Run(t, func(r *retry.R) {
   447  		_, node, err := state.GetNode(c1.config.NodeName)
   448  		if err != nil {
   449  			r.Fatalf("err: %v", err)
   450  		}
   451  		if node == nil {
   452  			r.Fatal("client not registered")
   453  		}
   454  		nodeAddr = node.Address
   455  	})
   456  
   457  	// Add in some metadata via the catalog (as if the agent synced it
   458  	// there). We also set the serfHealth check to failing so the reconcile
   459  	// will attempt to flip it back
   460  	req := structs.RegisterRequest{
   461  		Datacenter: s1.config.Datacenter,
   462  		Node:       c1.config.NodeName,
   463  		ID:         c1.config.NodeID,
   464  		Address:    nodeAddr,
   465  		NodeMeta:   map[string]string{"hello": "world"},
   466  		Check: &structs.HealthCheck{
   467  			Node:    c1.config.NodeName,
   468  			CheckID: structs.SerfCheckID,
   469  			Name:    structs.SerfCheckName,
   470  			Status:  api.HealthCritical,
   471  			Output:  "",
   472  		},
   473  	}
   474  	var out struct{}
   475  	if err := s1.RPC("Catalog.Register", &req, &out); err != nil {
   476  		t.Fatalf("err: %v", err)
   477  	}
   478  
   479  	// Force a reconcile and make sure the metadata stuck around.
   480  	if err := s1.reconcile(); err != nil {
   481  		t.Fatalf("err: %v", err)
   482  	}
   483  	_, node, err := state.GetNode(c1.config.NodeName)
   484  	if err != nil {
   485  		t.Fatalf("err: %v", err)
   486  	}
   487  	if node == nil {
   488  		t.Fatalf("node %q not found", c1.config.NodeName)
   489  	}
   490  	if hello, ok := node.Meta["hello"]; !ok || hello != "world" {
   491  		t.Fatalf("node meta missing hello=world: %v", node.Meta)
   492  	}
   493  
   494  	// Fail the member and wait for the health to go critical.
   495  	c1.Shutdown()
   496  	retry.Run(t, func(r *retry.R) {
   497  		_, checks, err := state.NodeChecks(nil, c1.config.NodeName)
   498  		if err != nil {
   499  			r.Fatalf("err: %v", err)
   500  		}
   501  		if got, want := checks[0].Status, api.HealthCritical; got != want {
   502  			r.Fatalf("got status %q want %q", got, want)
   503  		}
   504  	})
   505  
   506  	// Make sure the metadata didn't get clobbered.
   507  	_, node, err = state.GetNode(c1.config.NodeName)
   508  	if err != nil {
   509  		t.Fatalf("err: %v", err)
   510  	}
   511  	if node == nil {
   512  		t.Fatalf("node %q not found", c1.config.NodeName)
   513  	}
   514  	if hello, ok := node.Meta["hello"]; !ok || hello != "world" {
   515  		t.Fatalf("node meta was clobbered: %v", node.Meta)
   516  	}
   517  }
   518  
   519  func TestLeader_LeftServer(t *testing.T) {
   520  	t.Parallel()
   521  	dir1, s1 := testServer(t)
   522  	defer os.RemoveAll(dir1)
   523  	defer s1.Shutdown()
   524  
   525  	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
   526  	defer os.RemoveAll(dir2)
   527  	defer s2.Shutdown()
   528  
   529  	dir3, s3 := testServerDCBootstrap(t, "dc1", false)
   530  	defer os.RemoveAll(dir3)
   531  	defer s3.Shutdown()
   532  
   533  	// Put s1 last so we don't trigger a leader election.
   534  	servers := []*Server{s2, s3, s1}
   535  
   536  	// Try to join
   537  	joinLAN(t, s2, s1)
   538  	joinLAN(t, s3, s1)
   539  	for _, s := range servers {
   540  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   541  	}
   542  
   543  	// Kill any server
   544  	servers[0].Shutdown()
   545  
   546  	// Force remove the non-leader (transition to left state)
   547  	if err := servers[1].RemoveFailedNode(servers[0].config.NodeName); err != nil {
   548  		t.Fatalf("err: %v", err)
   549  	}
   550  
   551  	// Wait until the remaining servers show only 2 peers.
   552  	for _, s := range servers[1:] {
   553  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 2)) })
   554  	}
   555  	s1.Shutdown()
   556  }
   557  
   558  func TestLeader_LeftLeader(t *testing.T) {
   559  	t.Parallel()
   560  	dir1, s1 := testServer(t)
   561  	defer os.RemoveAll(dir1)
   562  	defer s1.Shutdown()
   563  
   564  	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
   565  	defer os.RemoveAll(dir2)
   566  	defer s2.Shutdown()
   567  
   568  	dir3, s3 := testServerDCBootstrap(t, "dc1", false)
   569  	defer os.RemoveAll(dir3)
   570  	defer s3.Shutdown()
   571  	servers := []*Server{s1, s2, s3}
   572  
   573  	// Try to join
   574  	joinLAN(t, s2, s1)
   575  	joinLAN(t, s3, s1)
   576  
   577  	for _, s := range servers {
   578  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   579  	}
   580  
   581  	// Kill the leader!
   582  	var leader *Server
   583  	for _, s := range servers {
   584  		if s.IsLeader() {
   585  			leader = s
   586  			break
   587  		}
   588  	}
   589  	if leader == nil {
   590  		t.Fatalf("Should have a leader")
   591  	}
   592  	if !leader.isReadyForConsistentReads() {
   593  		t.Fatalf("expected leader to be ready for consistent reads")
   594  	}
   595  	leader.Leave()
   596  	if leader.isReadyForConsistentReads() {
   597  		t.Fatalf("expected leader to no longer be ready for consistent reads after Leave")
   598  	}
   599  	leader.Shutdown()
   600  	time.Sleep(100 * time.Millisecond)
   601  
   602  	var remain *Server
   603  	for _, s := range servers {
   604  		if s == leader {
   605  			continue
   606  		}
   607  		remain = s
   608  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 2)) })
   609  	}
   610  
   611  	// Verify the old leader is deregistered
   612  	state := remain.fsm.State()
   613  	retry.Run(t, func(r *retry.R) {
   614  		_, node, err := state.GetNode(leader.config.NodeName)
   615  		if err != nil {
   616  			r.Fatalf("err: %v", err)
   617  		}
   618  		if node != nil {
   619  			r.Fatal("leader should be deregistered")
   620  		}
   621  	})
   622  }
   623  
   624  func TestLeader_MultiBootstrap(t *testing.T) {
   625  	t.Parallel()
   626  	dir1, s1 := testServer(t)
   627  	defer os.RemoveAll(dir1)
   628  	defer s1.Shutdown()
   629  
   630  	dir2, s2 := testServer(t)
   631  	defer os.RemoveAll(dir2)
   632  	defer s2.Shutdown()
   633  
   634  	servers := []*Server{s1, s2}
   635  
   636  	// Try to join
   637  	joinLAN(t, s2, s1)
   638  
   639  	for _, s := range servers {
   640  		retry.Run(t, func(r *retry.R) {
   641  			if got, want := len(s.serfLAN.Members()), 2; got != want {
   642  				r.Fatalf("got %d peers want %d", got, want)
   643  			}
   644  		})
   645  	}
   646  
   647  	// Ensure we don't have multiple raft peers
   648  	for _, s := range servers {
   649  		peers, _ := s.numPeers()
   650  		if peers != 1 {
   651  			t.Fatalf("should only have 1 raft peer!")
   652  		}
   653  	}
   654  }
   655  
   656  func TestLeader_TombstoneGC_Reset(t *testing.T) {
   657  	t.Parallel()
   658  	dir1, s1 := testServer(t)
   659  	defer os.RemoveAll(dir1)
   660  	defer s1.Shutdown()
   661  
   662  	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
   663  	defer os.RemoveAll(dir2)
   664  	defer s2.Shutdown()
   665  
   666  	dir3, s3 := testServerDCBootstrap(t, "dc1", false)
   667  	defer os.RemoveAll(dir3)
   668  	defer s3.Shutdown()
   669  	servers := []*Server{s1, s2, s3}
   670  
   671  	// Try to join
   672  	joinLAN(t, s2, s1)
   673  	joinLAN(t, s3, s1)
   674  
   675  	for _, s := range servers {
   676  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   677  	}
   678  
   679  	var leader *Server
   680  	for _, s := range servers {
   681  		if s.IsLeader() {
   682  			leader = s
   683  			break
   684  		}
   685  	}
   686  	if leader == nil {
   687  		t.Fatalf("Should have a leader")
   688  	}
   689  
   690  	// Check that the leader has a pending GC expiration
   691  	if !leader.tombstoneGC.PendingExpiration() {
   692  		t.Fatalf("should have pending expiration")
   693  	}
   694  
   695  	// Kill the leader
   696  	leader.Shutdown()
   697  	time.Sleep(100 * time.Millisecond)
   698  
   699  	// Wait for a new leader
   700  	leader = nil
   701  	retry.Run(t, func(r *retry.R) {
   702  		for _, s := range servers {
   703  			if s.IsLeader() {
   704  				leader = s
   705  				return
   706  			}
   707  		}
   708  		r.Fatal("no leader")
   709  	})
   710  
   711  	retry.Run(t, func(r *retry.R) {
   712  		if !leader.tombstoneGC.PendingExpiration() {
   713  			r.Fatal("leader has no pending GC expiration")
   714  		}
   715  	})
   716  }
   717  
   718  func TestLeader_ReapTombstones(t *testing.T) {
   719  	t.Parallel()
   720  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
   721  		c.ACLDatacenter = "dc1"
   722  		c.ACLsEnabled = true
   723  		c.ACLMasterToken = "root"
   724  		c.ACLDefaultPolicy = "deny"
   725  		c.TombstoneTTL = 50 * time.Millisecond
   726  		c.TombstoneTTLGranularity = 10 * time.Millisecond
   727  	})
   728  	defer os.RemoveAll(dir1)
   729  	defer s1.Shutdown()
   730  	codec := rpcClient(t, s1)
        	defer codec.Close()
   731  
   732  	testrpc.WaitForLeader(t, s1.RPC, "dc1")
   733  
   734  	// Create a KV entry
   735  	arg := structs.KVSRequest{
   736  		Datacenter: "dc1",
   737  		Op:         api.KVSet,
   738  		DirEnt: structs.DirEntry{
   739  			Key:   "test",
   740  			Value: []byte("test"),
   741  		},
   742  		WriteRequest: structs.WriteRequest{
   743  			Token: "root",
   744  		},
   745  	}
   746  	var out bool
   747  	if err := msgpackrpc.CallWithCodec(codec, "KVS.Apply", &arg, &out); err != nil {
   748  		t.Fatalf("err: %v", err)
   749  	}
   750  
   751  	// Delete the KV entry (tombstoned).
   752  	arg.Op = api.KVDelete
   753  	if err := msgpackrpc.CallWithCodec(codec, "KVS.Apply", &arg, &out); err != nil {
   754  		t.Fatalf("err: %v", err)
   755  	}
   756  
   757  	// Make sure there's a tombstone.
   758  	state := s1.fsm.State()
   759  	func() {
   760  		snap := state.Snapshot()
   761  		defer snap.Close()
   762  		stones, err := snap.Tombstones()
   763  		if err != nil {
   764  			t.Fatalf("err: %s", err)
   765  		}
   766  		if stones.Next() == nil {
   767  			t.Fatalf("missing tombstones")
   768  		}
   769  		if stones.Next() != nil {
   770  			t.Fatalf("unexpected extra tombstones")
   771  		}
   772  	}()
   773  
   774  	// Watch for the tombstone to be reaped once the TTL and GC
   775  	// granularity have elapsed.
   776  	retry.Run(t, func(r *retry.R) {
   777  		snap := state.Snapshot()
   778  		defer snap.Close()
   779  		stones, err := snap.Tombstones()
   780  		if err != nil {
   781  			r.Fatal(err)
   782  		}
   783  		if stones.Next() != nil {
   784  			r.Fatal("should have no tombstones")
   785  		}
   786  	})
   787  }
   788  
   789  func TestLeader_RollRaftServer(t *testing.T) {
   790  	t.Parallel()
   791  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
   792  		c.Bootstrap = true
   793  		c.Datacenter = "dc1"
   794  		c.RaftConfig.ProtocolVersion = 2
   795  	})
   796  	defer os.RemoveAll(dir1)
   797  	defer s1.Shutdown()
   798  
   799  	dir2, s2 := testServerWithConfig(t, func(c *Config) {
   800  		c.Bootstrap = false
   801  		c.Datacenter = "dc1"
   802  		c.RaftConfig.ProtocolVersion = 1
   803  	})
   804  	defer os.RemoveAll(dir2)
   805  	defer s2.Shutdown()
   806  
   807  	dir3, s3 := testServerWithConfig(t, func(c *Config) {
   808  		c.Bootstrap = false
   809  		c.Datacenter = "dc1"
   810  		c.RaftConfig.ProtocolVersion = 2
   811  	})
   812  	defer os.RemoveAll(dir3)
   813  	defer s3.Shutdown()
   814  
   815  	servers := []*Server{s1, s2, s3}
   816  
   817  	// Try to join
   818  	joinLAN(t, s2, s1)
   819  	joinLAN(t, s3, s1)
   820  
   821  	for _, s := range servers {
   822  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   823  	}
   824  
   825  	// Kill the v1 server
   826  	s2.Shutdown()
   827  
   828  	for _, s := range []*Server{s1, s3} {
   829  		retry.Run(t, func(r *retry.R) {
   830  			minVer, err := s.autopilot.MinRaftProtocol()
   831  			if err != nil {
   832  				r.Fatal(err)
   833  			}
   834  			if got, want := minVer, 2; got != want {
   835  				r.Fatalf("got min raft version %d want %d", got, want)
   836  			}
   837  		})
   838  	}
   839  
   840  	// Replace the dead server with one running raft protocol v3
   841  	dir4, s4 := testServerWithConfig(t, func(c *Config) {
   842  		c.Bootstrap = false
   843  		c.Datacenter = "dc1"
   844  		c.RaftConfig.ProtocolVersion = 3
   845  	})
   846  	defer os.RemoveAll(dir4)
   847  	defer s4.Shutdown()
   848  	joinLAN(t, s4, s1)
   849  	servers[1] = s4
   850  
   851  	// Make sure the dead server is removed and we're back to 3 total peers
   852  	for _, s := range servers {
   853  		retry.Run(t, func(r *retry.R) {
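        			// Raft protocol v2 servers use their address as the raft ID, while the
        			// v3 server registers with a UUID, so count both forms separately.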
   854  			addrs := 0
   855  			ids := 0
   856  			future := s.raft.GetConfiguration()
   857  			if err := future.Error(); err != nil {
   858  				r.Fatal(err)
   859  			}
   860  			for _, server := range future.Configuration().Servers {
   861  				if string(server.ID) == string(server.Address) {
   862  					addrs++
   863  				} else {
   864  					ids++
   865  				}
   866  			}
   867  			if got, want := addrs, 2; got != want {
   868  				r.Fatalf("got %d server addresses want %d", got, want)
   869  			}
   870  			if got, want := ids, 1; got != want {
   871  				r.Fatalf("got %d server ids want %d", got, want)
   872  			}
   873  		})
   874  	}
   875  }
   876  
   877  func TestLeader_ChangeServerID(t *testing.T) {
   878  	t.Parallel()
   879  	conf := func(c *Config) {
   880  		c.Bootstrap = false
   881  		c.BootstrapExpect = 3
   882  		c.Datacenter = "dc1"
   883  		c.RaftConfig.ProtocolVersion = 3
   884  	}
   885  	dir1, s1 := testServerWithConfig(t, conf)
   886  	defer os.RemoveAll(dir1)
   887  	defer s1.Shutdown()
   888  
   889  	dir2, s2 := testServerWithConfig(t, conf)
   890  	defer os.RemoveAll(dir2)
   891  	defer s2.Shutdown()
   892  
   893  	dir3, s3 := testServerWithConfig(t, conf)
   894  	defer os.RemoveAll(dir3)
   895  	defer s3.Shutdown()
   896  
   897  	servers := []*Server{s1, s2, s3}
   898  
   899  	// Try to join and wait for all servers to get promoted
   900  	joinLAN(t, s2, s1)
   901  	joinLAN(t, s3, s1)
   902  	for _, s := range servers {
   903  		testrpc.WaitForTestAgent(t, s.RPC, "dc1")
   904  		retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
   905  	}
   906  
   907  	// Shut down a server, freeing up its address/port
   908  	s3.Shutdown()
   909  
   910  	retry.Run(t, func(r *retry.R) {
   911  		alive := 0
   912  		for _, m := range s1.LANMembers() {
   913  			if m.Status == serf.StatusAlive {
   914  				alive++
   915  			}
   916  		}
   917  		if got, want := alive, 2; got != want {
   918  			r.Fatalf("got %d alive members want %d", got, want)
   919  		}
   920  	})
   921  
   922  	// Bring up a new server with s3's address that will get a different ID
   923  	dir4, s4 := testServerWithConfig(t, func(c *Config) {
   924  		c.Bootstrap = false
   925  		c.BootstrapExpect = 3
   926  		c.Datacenter = "dc1"
   927  		c.RaftConfig.ProtocolVersion = 3
   928  		c.SerfLANConfig.MemberlistConfig = s3.config.SerfLANConfig.MemberlistConfig
   929  		c.RPCAddr = s3.config.RPCAddr
   930  		c.RPCAdvertise = s3.config.RPCAdvertise
   931  	})
   932  	defer os.RemoveAll(dir4)
   933  	defer s4.Shutdown()
   934  	joinLAN(t, s4, s1)
   935  	testrpc.WaitForTestAgent(t, s1.RPC, "dc1")
   936  	testrpc.WaitForTestAgent(t, s4.RPC, "dc1")
   937  	servers[2] = s4
   938  
   939  	// While integrating #3327 it uncovered that this test was flaky. The
   940  	// connection pool would use the same TCP connection to the old server
   941  	// which would give EOF errors to the autopilot health check RPC call.
   942  	// To make this more reliable we changed the connection pool to throw
   943  	// away the connection if it sees an EOF error, since there's no way
   944  	// that connection is going to work again. This made this test reliable
   945  	// since it will make a new connection to s4.
   946  
   947  	// Make sure the dead server is removed and we're back to 3 total peers
   948  	retry.Run(t, func(r *retry.R) {
   949  		r.Check(wantRaft(servers))
   950  		for _, s := range servers {
   951  			r.Check(wantPeers(s, 3))
   952  		}
   953  	})
   954  }
   955  
   956  func TestLeader_ACL_Initialization(t *testing.T) {
   957  	t.Parallel()
   958  
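        	// bootstrap records whether ACL bootstrapping should remain possible,
        	// which is expected only when no master token is configured.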
   959  	tests := []struct {
   960  		name      string
   961  		build     string
   962  		master    string
   963  		bootstrap bool
   964  	}{
   965  		{"old version, no master", "0.8.0", "", true},
   966  		{"old version, master", "0.8.0", "root", false},
   967  		{"new version, no master", "0.9.1", "", true},
   968  		{"new version, master", "0.9.1", "root", false},
   969  	}
   970  	for _, tt := range tests {
   971  		t.Run(tt.name, func(t *testing.T) {
   972  			conf := func(c *Config) {
   973  				c.Build = tt.build
   974  				c.Bootstrap = true
   975  				c.Datacenter = "dc1"
   976  				c.ACLDatacenter = "dc1"
   977  				c.ACLsEnabled = true
   978  				c.ACLMasterToken = tt.master
   979  			}
   980  			dir1, s1 := testServerWithConfig(t, conf)
   981  			defer os.RemoveAll(dir1)
   982  			defer s1.Shutdown()
   983  			testrpc.WaitForTestAgent(t, s1.RPC, "dc1")
   984  
   985  			if tt.master != "" {
   986  				_, master, err := s1.fsm.State().ACLTokenGetBySecret(nil, tt.master)
   987  				require.NoError(t, err)
   988  				require.NotNil(t, master)
   989  			}
   990  
   991  			_, anon, err := s1.fsm.State().ACLTokenGetBySecret(nil, anonymousToken)
   992  			require.NoError(t, err)
   993  			require.NotNil(t, anon)
   994  
   995  			canBootstrap, _, err := s1.fsm.State().CanBootstrapACLToken()
   996  			require.NoError(t, err)
   997  			require.Equal(t, tt.bootstrap, canBootstrap)
   998  
   999  			_, policy, err := s1.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID)
  1000  			require.NoError(t, err)
  1001  			require.NotNil(t, policy)
  1002  		})
  1003  	}
  1004  }
  1005  
  1006  func TestLeader_CARootPruning(t *testing.T) {
  1007  	t.Parallel()
  1008  
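        	// Shorten the pruning interval so the stale root is removed while the test waits.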
  1009  	caRootPruneInterval = 200 * time.Millisecond
  1010  
  1011  	require := require.New(t)
  1012  	dir1, s1 := testServer(t)
  1013  	defer os.RemoveAll(dir1)
  1014  	defer s1.Shutdown()
  1015  	codec := rpcClient(t, s1)
  1016  	defer codec.Close()
  1017  
  1018  	testrpc.WaitForTestAgent(t, s1.RPC, "dc1")
  1019  
  1020  	// Get the current root
  1021  	rootReq := &structs.DCSpecificRequest{
  1022  		Datacenter: "dc1",
  1023  	}
  1024  	var rootList structs.IndexedCARoots
  1025  	require.Nil(msgpackrpc.CallWithCodec(codec, "ConnectCA.Roots", rootReq, &rootList))
  1026  	require.Len(rootList.Roots, 1)
  1027  	oldRoot := rootList.Roots[0]
  1028  
  1029  	// Update the provider config to use a new private key, which should
  1030  	// cause a rotation.
  1031  	_, newKey, err := connect.GeneratePrivateKey()
  1032  	require.NoError(err)
  1033  	newConfig := &structs.CAConfiguration{
  1034  		Provider: "consul",
  1035  		Config: map[string]interface{}{
  1036  			"LeafCertTTL":    "500ms",
  1037  			"PrivateKey":     newKey,
  1038  			"RootCert":       "",
  1039  			"RotationPeriod": "2160h",
  1040  			"SkipValidate":   true,
  1041  		},
  1042  	}
  1043  	{
  1044  		args := &structs.CARequest{
  1045  			Datacenter: "dc1",
  1046  			Config:     newConfig,
  1047  		}
  1048  		var reply interface{}
  1049  
  1050  		require.NoError(msgpackrpc.CallWithCodec(codec, "ConnectCA.ConfigurationSet", args, &reply))
  1051  	}
  1052  
  1053  	// Should have 2 roots now.
  1054  	_, roots, err := s1.fsm.State().CARoots(nil)
  1055  	require.NoError(err)
  1056  	require.Len(roots, 2)
  1057  
  1058  	time.Sleep(2 * time.Second)
  1059  
  1060  	// Now the old root should be pruned.
  1061  	_, roots, err = s1.fsm.State().CARoots(nil)
  1062  	require.NoError(err)
  1063  	require.Len(roots, 1)
  1064  	require.True(roots[0].Active)
  1065  	require.NotEqual(roots[0].ID, oldRoot.ID)
  1066  }
  1067  
  1068  func TestLeader_PersistIntermediateCAs(t *testing.T) {
  1069  	t.Parallel()
  1070  
  1071  	require := require.New(t)
  1072  	dir1, s1 := testServer(t)
  1073  	defer os.RemoveAll(dir1)
  1074  	defer s1.Shutdown()
  1075  	codec := rpcClient(t, s1)
  1076  	defer codec.Close()
  1077  
  1078  	dir2, s2 := testServerDCBootstrap(t, "dc1", false)
  1079  	defer os.RemoveAll(dir2)
  1080  	defer s2.Shutdown()
  1081  
  1082  	dir3, s3 := testServerDCBootstrap(t, "dc1", false)
  1083  	defer os.RemoveAll(dir3)
  1084  	defer s3.Shutdown()
  1085  
  1086  	joinLAN(t, s2, s1)
  1087  	joinLAN(t, s3, s1)
  1088  
  1089  	testrpc.WaitForLeader(t, s1.RPC, "dc1")
  1090  
  1091  	// Get the current root
  1092  	rootReq := &structs.DCSpecificRequest{
  1093  		Datacenter: "dc1",
  1094  	}
  1095  	var rootList structs.IndexedCARoots
  1096  	require.Nil(msgpackrpc.CallWithCodec(codec, "ConnectCA.Roots", rootReq, &rootList))
  1097  	require.Len(rootList.Roots, 1)
  1098  
  1099  	// Update the provider config to use a new private key, which should
  1100  	// cause a rotation.
  1101  	_, newKey, err := connect.GeneratePrivateKey()
  1102  	require.NoError(err)
  1103  	newConfig := &structs.CAConfiguration{
  1104  		Provider: "consul",
  1105  		Config: map[string]interface{}{
  1106  			"PrivateKey":     newKey,
  1107  			"RootCert":       "",
  1108  			"RotationPeriod": 90 * 24 * time.Hour,
  1109  		},
  1110  	}
  1111  	{
  1112  		args := &structs.CARequest{
  1113  			Datacenter: "dc1",
  1114  			Config:     newConfig,
  1115  		}
  1116  		var reply interface{}
  1117  
  1118  		require.NoError(msgpackrpc.CallWithCodec(codec, "ConnectCA.ConfigurationSet", args, &reply))
  1119  	}
  1120  
  1121  	// Get the active root before leader change.
  1122  	_, root := s1.getCAProvider()
  1123  	require.Len(root.IntermediateCerts, 1)
  1124  
  1125  	// Force a leader change and make sure the root CA values are preserved.
  1126  	s1.Leave()
  1127  	s1.Shutdown()
  1128  
  1129  	retry.Run(t, func(r *retry.R) {
  1130  		var leader *Server
  1131  		for _, s := range []*Server{s2, s3} {
  1132  			if s.IsLeader() {
  1133  				leader = s
  1134  				break
  1135  			}
  1136  		}
  1137  		if leader == nil {
  1138  			r.Fatal("no leader")
  1139  		}
  1140  
  1141  		_, newLeaderRoot := leader.getCAProvider()
  1142  		if !reflect.DeepEqual(newLeaderRoot, root) {
  1143  			r.Fatalf("got %v, want %v", newLeaderRoot, root)
  1144  		}
  1145  	})
  1146  }
  1147  
  1148  func TestLeader_ACLUpgrade(t *testing.T) {
  1149  	t.Parallel()
  1150  	dir1, s1 := testServerWithConfig(t, func(c *Config) {
  1151  		c.ACLsEnabled = true
  1152  		c.ACLMasterToken = "root"
  1153  	})
  1154  	defer os.RemoveAll(dir1)
  1155  	defer s1.Shutdown()
  1156  	testrpc.WaitForTestAgent(t, s1.RPC, "dc1")
  1157  	codec := rpcClient(t, s1)
  1158  	defer codec.Close()
  1159  
  1160  	// create a legacy management ACL
  1161  	mgmt := structs.ACLRequest{
  1162  		Datacenter: "dc1",
  1163  		Op:         structs.ACLSet,
  1164  		ACL: structs.ACL{
  1165  			Name: "Management token",
  1166  			Type: structs.ACLTokenTypeManagement,
  1167  		},
  1168  		WriteRequest: structs.WriteRequest{Token: "root"},
  1169  	}
  1170  	var mgmtID string
  1171  	require.NoError(t, msgpackrpc.CallWithCodec(codec, "ACL.Apply", &mgmt, &mgmtID))
  1172  
  1173  	// wait for it to be upgraded
  1174  	retry.Run(t, func(r *retry.R) {
  1175  		_, token, err := s1.fsm.State().ACLTokenGetBySecret(nil, mgmtID)
  1176  		require.NoError(r, err)
  1177  		require.NotNil(r, token)
  1178  		require.NotEqual(r, "", token.AccessorID)
  1179  		require.Equal(r, structs.ACLTokenTypeManagement, token.Type)
  1180  		require.Len(r, token.Policies, 1)
  1181  		require.Equal(r, structs.ACLPolicyGlobalManagementID, token.Policies[0].ID)
  1182  	})
  1183  
  1184  	// create a legacy client ACL
  1185  	client := structs.ACLRequest{
  1186  		Datacenter: "dc1",
  1187  		Op:         structs.ACLSet,
  1188  		ACL: structs.ACL{
  1189  			Name:  "Client token",
  1190  			Type:  structs.ACLTokenTypeClient,
  1191  			Rules: `node "" { policy = "read"}`,
  1192  		},
  1193  		WriteRequest: structs.WriteRequest{Token: "root"},
  1194  	}
  1195  	var clientID string
  1196  	require.NoError(t, msgpackrpc.CallWithCodec(codec, "ACL.Apply", &client, &clientID))
  1197  
  1198  	// wait for it to be upgraded
  1199  	retry.Run(t, func(r *retry.R) {
  1200  		_, token, err := s1.fsm.State().ACLTokenGetBySecret(nil, clientID)
  1201  		require.NoError(r, err)
  1202  		require.NotNil(r, token)
  1203  		require.NotEqual(r, "", token.AccessorID)
  1204  		require.Len(r, token.Policies, 0)
  1205  		require.Equal(r, structs.ACLTokenTypeClient, token.Type)
  1206  		require.Equal(r, client.ACL.Rules, token.Rules)
  1207  	})
  1208  }