github.com/Iqoqo/consul@v1.4.5/agent/consul/leader.go

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"net"
     7  	"strconv"
     8  	"strings"
     9  	"sync"
    10  	"sync/atomic"
    11  	"time"
    12  
    13  	"github.com/armon/go-metrics"
    14  	"github.com/hashicorp/consul/acl"
    15  	"github.com/hashicorp/consul/agent/connect"
    16  	ca "github.com/hashicorp/consul/agent/connect/ca"
    17  	"github.com/hashicorp/consul/agent/consul/autopilot"
    18  	"github.com/hashicorp/consul/agent/metadata"
    19  	"github.com/hashicorp/consul/agent/structs"
    20  	"github.com/hashicorp/consul/api"
    21  	"github.com/hashicorp/consul/lib"
    22  	"github.com/hashicorp/consul/types"
    23  	memdb "github.com/hashicorp/go-memdb"
    24  	uuid "github.com/hashicorp/go-uuid"
    25  	"github.com/hashicorp/go-version"
    26  	"github.com/hashicorp/raft"
    27  	"github.com/hashicorp/serf/serf"
    28  	"golang.org/x/time/rate"
    29  )
    30  
    31  const (
    32  	newLeaderEvent      = "consul:new-leader"
    33  	barrierWriteTimeout = 2 * time.Minute
    34  )
    35  
    36  var (
    37  	// caRootPruneInterval is how often we check for stale CARoots to remove.
    38  	caRootPruneInterval = time.Hour
    39  
    40  	// minAutopilotVersion is the minimum Consul version in which Autopilot features
    41  	// are supported.
    42  	minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))
    43  )
    44  
    45  // monitorLeadership is used to monitor if we acquire or lose our role
    46  // as the leader in the Raft cluster. There is some work the leader is
    47  // expected to do, so we must react to changes
    48  func (s *Server) monitorLeadership() {
    49  	// We use the notify channel we configured Raft with, NOT Raft's
    50  	// leaderCh, which is only notified best-effort. Doing this ensures
    51  	// that we get all notifications in order, which is required for
    52  	// cleanup and to ensure we never run multiple leader loops.
    53  	raftNotifyCh := s.raftNotifyCh
    54  
    55  	aclModeCheckWait := aclModeCheckMinInterval
    56  	var aclUpgradeCh <-chan time.Time
    57  	if s.ACLsEnabled() {
    58  		aclUpgradeCh = time.After(aclModeCheckWait)
    59  	}
    60  	var weAreLeaderCh chan struct{}
    61  	var leaderLoop sync.WaitGroup
    62  	for {
    63  		select {
    64  		case isLeader := <-raftNotifyCh:
    65  			switch {
    66  			case isLeader:
    67  				if weAreLeaderCh != nil {
    68  					s.logger.Printf("[ERR] consul: attempted to start the leader loop while running")
    69  					continue
    70  				}
    71  
    72  				weAreLeaderCh = make(chan struct{})
    73  				leaderLoop.Add(1)
    74  				go func(ch chan struct{}) {
    75  					defer leaderLoop.Done()
    76  					s.leaderLoop(ch)
    77  				}(weAreLeaderCh)
    78  				s.logger.Printf("[INFO] consul: cluster leadership acquired")
    79  
    80  			default:
    81  				if weAreLeaderCh == nil {
    82  					s.logger.Printf("[ERR] consul: attempted to stop the leader loop while not running")
    83  					continue
    84  				}
    85  
    86  				s.logger.Printf("[DEBUG] consul: shutting down leader loop")
    87  				close(weAreLeaderCh)
    88  				leaderLoop.Wait()
    89  				weAreLeaderCh = nil
    90  				s.logger.Printf("[INFO] consul: cluster leadership lost")
    91  			}
    92  		case <-aclUpgradeCh:
    93  			if atomic.LoadInt32(&s.useNewACLs) == 0 {
    94  				aclModeCheckWait = aclModeCheckWait * 2
    95  				if aclModeCheckWait > aclModeCheckMaxInterval {
    96  					aclModeCheckWait = aclModeCheckMaxInterval
    97  				}
    98  				aclUpgradeCh = time.After(aclModeCheckWait)
    99  
   100  				if canUpgrade := s.canUpgradeToNewACLs(weAreLeaderCh != nil); canUpgrade {
   101  					if weAreLeaderCh != nil {
   102  						if err := s.initializeACLs(true); err != nil {
   103  							s.logger.Printf("[ERR] consul: error transitioning to using new ACLs: %v", err)
   104  							continue
   105  						}
   106  					}
   107  
   108  					s.logger.Printf("[DEBUG] acl: transitioning out of legacy ACL mode")
   109  					atomic.StoreInt32(&s.useNewACLs, 1)
   110  					s.updateACLAdvertisement()
   111  
   112  					// setting this to nil ensures that we will never hit this case again
   113  					aclUpgradeCh = nil
   114  				}
   115  			} else {
   116  				// establishLeadership probably transitioned us
   117  				aclUpgradeCh = nil
   118  			}
   119  		case <-s.shutdownCh:
   120  			return
   121  		}
   122  	}
   123  }
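
// A minimal sketch of how such a notify channel is handed to hashicorp/raft at
// construction time (the conf and notifyCh names here are illustrative, not the
// server's actual fields):
//
//	notifyCh := make(chan bool, 1)
//	conf := raft.DefaultConfig()
//	conf.NotifyCh = notifyCh // Raft delivers true/false on every leadership change
//
// Because Raft blocks writing to this channel rather than dropping
// notifications (unlike the best-effort LeaderCh), the loop above sees every
// transition in order and can pair each leaderLoop start with exactly one stop.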
   124  
   125  // leaderLoop runs as long as we are the leader to run various
   126  // maintenance activities
   127  func (s *Server) leaderLoop(stopCh chan struct{}) {
   128  	// Fire a user event indicating a new leader
   129  	payload := []byte(s.config.NodeName)
   130  	for name, segment := range s.LANSegments() {
   131  		if err := segment.UserEvent(newLeaderEvent, payload, false); err != nil {
   132  			s.logger.Printf("[WARN] consul: failed to broadcast new leader event on segment %q: %v", name, err)
   133  		}
   134  	}
   135  
   136  	// Reconcile channel is only used once initial reconcile
   137  	// has succeeded
   138  	var reconcileCh chan serf.Member
   139  	establishedLeader := false
   140  
   141  	reassert := func() error {
   142  		if !establishedLeader {
   143  			return fmt.Errorf("leadership has not been established")
   144  		}
   145  		if err := s.revokeLeadership(); err != nil {
   146  			return err
   147  		}
   148  		if err := s.establishLeadership(); err != nil {
   149  			return err
   150  		}
   151  		return nil
   152  	}
   153  
   154  RECONCILE:
   155  	// Setup a reconciliation timer
   156  	reconcileCh = nil
   157  	interval := time.After(s.config.ReconcileInterval)
   158  
   159  	// Apply a raft barrier to ensure our FSM is caught up
   160  	start := time.Now()
   161  	barrier := s.raft.Barrier(barrierWriteTimeout)
   162  	if err := barrier.Error(); err != nil {
   163  		s.logger.Printf("[ERR] consul: failed to wait for barrier: %v", err)
   164  		goto WAIT
   165  	}
   166  	metrics.MeasureSince([]string{"leader", "barrier"}, start)
   167  
   168  	// Check if we need to handle initial leadership actions
   169  	if !establishedLeader {
   170  		if err := s.establishLeadership(); err != nil {
   171  			s.logger.Printf("[ERR] consul: failed to establish leadership: %v", err)
   172  			// Immediately revoke leadership since we didn't successfully
   173  			// establish leadership.
   174  			if err := s.revokeLeadership(); err != nil {
   175  				s.logger.Printf("[ERR] consul: failed to revoke leadership: %v", err)
   176  			}
   177  			goto WAIT
   178  		}
   179  		establishedLeader = true
   180  		defer func() {
   181  			if err := s.revokeLeadership(); err != nil {
   182  				s.logger.Printf("[ERR] consul: failed to revoke leadership: %v", err)
   183  			}
   184  		}()
   185  	}
   186  
   187  	// Reconcile any missing data
   188  	if err := s.reconcile(); err != nil {
   189  		s.logger.Printf("[ERR] consul: failed to reconcile: %v", err)
   190  		goto WAIT
   191  	}
   192  
   193  	// Initial reconcile worked, now we can process the channel
   194  	// updates
   195  	reconcileCh = s.reconcileCh
   196  
   197  WAIT:
   198  	// Poll the stop channel to give it priority so we don't waste time
   199  	// trying to perform the other operations if we have been asked to shut
   200  	// down.
   201  	select {
   202  	case <-stopCh:
   203  		return
   204  	default:
   205  	}
   206  
   207  	// Periodically reconcile as long as we are the leader,
   208  	// or when Serf events arrive
   209  	for {
   210  		select {
   211  		case <-stopCh:
   212  			return
   213  		case <-s.shutdownCh:
   214  			return
   215  		case <-interval:
   216  			goto RECONCILE
   217  		case member := <-reconcileCh:
   218  			s.reconcileMember(member)
   219  		case index := <-s.tombstoneGC.ExpireCh():
   220  			go s.reapTombstones(index)
   221  		case errCh := <-s.reassertLeaderCh:
   222  			errCh <- reassert()
   223  		}
   224  	}
   225  }
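
// A hypothetical caller-side view of the reassertLeaderCh handshake handled in
// the select above: the requester hands over a fresh error channel and blocks
// until the leader loop has re-run revokeLeadership/establishLeadership.
//
//	errCh := make(chan error, 1) // buffered so the reply never blocks the leader loop
//	s.reassertLeaderCh <- errCh
//	if err := <-errCh; err != nil {
//		// leadership could not be re-asserted
//	}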
   226  
   227  // establishLeadership is invoked once we become leader and are able
   228  // to invoke an initial barrier. The barrier is used to ensure any
   229  // previously inflight transactions have been committed and that our
   230  // state is up-to-date.
   231  func (s *Server) establishLeadership() error {
   232  	// check for the upgrade here - this helps us transition to new ACLs much
   233  	// quicker if this is a new cluster or this is a test agent
   234  	if canUpgrade := s.canUpgradeToNewACLs(true); canUpgrade {
   235  		if err := s.initializeACLs(true); err != nil {
   236  			return err
   237  		}
   238  		atomic.StoreInt32(&s.useNewACLs, 1)
   239  		s.updateACLAdvertisement()
   240  	} else if err := s.initializeACLs(false); err != nil {
   241  		return err
   242  	}
   243  
   244  	// Hint the tombstone expiration timer. When we freshly establish leadership
   245  	// we become the authoritative timer, and so we need to start the clock
   246  	// on any pending GC events.
   247  	s.tombstoneGC.SetEnabled(true)
   248  	lastIndex := s.raft.LastIndex()
   249  	s.tombstoneGC.Hint(lastIndex)
   250  
    251  	// Set up the session timers. This is done both when starting up and when
    252  	// a leader failover happens. Since the timers are maintained by the leader
    253  	// node alone, this effectively means all the timers are renewed at the
   254  	// time of failover. The TTL contract is that the session will not be expired
   255  	// before the TTL, so expiring it later is allowable.
   256  	//
   257  	// This MUST be done after the initial barrier to ensure the latest Sessions
   258  	// are available to be initialized. Otherwise initialization may use stale
   259  	// data.
   260  	if err := s.initializeSessionTimers(); err != nil {
   261  		return err
   262  	}
   263  
   264  	s.getOrCreateAutopilotConfig()
   265  	s.autopilot.Start()
   266  
   267  	// todo(kyhavlov): start a goroutine here for handling periodic CA rotation
   268  	if err := s.initializeCA(); err != nil {
   269  		return err
   270  	}
   271  
   272  	s.startEnterpriseLeader()
   273  
   274  	s.startCARootPruning()
   275  
   276  	s.setConsistentReadReady()
   277  	return nil
   278  }
   279  
   280  // revokeLeadership is invoked once we step down as leader.
   281  // This is used to cleanup any state that may be specific to a leader.
   282  func (s *Server) revokeLeadership() error {
   283  	// Disable the tombstone GC, since it is only useful as a leader
   284  	s.tombstoneGC.SetEnabled(false)
   285  
   286  	// Clear the session timers on either shutdown or step down, since we
   287  	// are no longer responsible for session expirations.
   288  	if err := s.clearAllSessionTimers(); err != nil {
   289  		return err
   290  	}
   291  
   292  	s.stopEnterpriseLeader()
   293  
   294  	s.stopCARootPruning()
   295  
   296  	s.setCAProvider(nil, nil)
   297  
   298  	s.stopACLUpgrade()
   299  
   300  	s.resetConsistentReadReady()
   301  	s.autopilot.Stop()
   302  	return nil
   303  }
   304  
   305  // DEPRECATED (ACL-Legacy-Compat) - Remove once old ACL compatibility is removed
   306  func (s *Server) initializeLegacyACL() error {
   307  	if !s.ACLsEnabled() {
   308  		return nil
   309  	}
   310  
   311  	authDC := s.config.ACLDatacenter
   312  
   313  	// Create anonymous token if missing.
   314  	state := s.fsm.State()
   315  	_, token, err := state.ACLTokenGetBySecret(nil, anonymousToken)
   316  	if err != nil {
   317  		return fmt.Errorf("failed to get anonymous token: %v", err)
   318  	}
   319  	if token == nil {
   320  		req := structs.ACLRequest{
   321  			Datacenter: authDC,
   322  			Op:         structs.ACLSet,
   323  			ACL: structs.ACL{
   324  				ID:   anonymousToken,
   325  				Name: "Anonymous Token",
   326  				Type: structs.ACLTokenTypeClient,
   327  			},
   328  		}
   329  		_, err := s.raftApply(structs.ACLRequestType, &req)
   330  		if err != nil {
   331  			return fmt.Errorf("failed to create anonymous token: %v", err)
   332  		}
   333  		s.logger.Printf("[INFO] acl: Created the anonymous token")
   334  	}
   335  
   336  	// Check for configured master token.
   337  	if master := s.config.ACLMasterToken; len(master) > 0 {
   338  		_, token, err = state.ACLTokenGetBySecret(nil, master)
   339  		if err != nil {
   340  			return fmt.Errorf("failed to get master token: %v", err)
   341  		}
   342  		if token == nil {
   343  			req := structs.ACLRequest{
   344  				Datacenter: authDC,
   345  				Op:         structs.ACLSet,
   346  				ACL: structs.ACL{
   347  					ID:   master,
   348  					Name: "Master Token",
   349  					Type: structs.ACLTokenTypeManagement,
   350  				},
   351  			}
   352  			_, err := s.raftApply(structs.ACLRequestType, &req)
   353  			if err != nil {
   354  				return fmt.Errorf("failed to create master token: %v", err)
   355  			}
   356  			s.logger.Printf("[INFO] consul: Created ACL master token from configuration")
   357  		}
   358  	}
   359  
   360  	// Check to see if we need to initialize the ACL bootstrap info. This
   361  	// needs a Consul version check since it introduces a new Raft operation
   362  	// that'll produce an error on older servers, and it also makes a piece
   363  	// of state in the state store that will cause problems with older
   364  	// servers consuming snapshots, so we have to wait to create it.
   365  	var minVersion = version.Must(version.NewVersion("0.9.1"))
   366  	if ServersMeetMinimumVersion(s.LANMembers(), minVersion) {
   367  		canBootstrap, _, err := state.CanBootstrapACLToken()
   368  		if err != nil {
   369  			return fmt.Errorf("failed looking for ACL bootstrap info: %v", err)
   370  		}
   371  		if canBootstrap {
   372  			req := structs.ACLRequest{
   373  				Datacenter: authDC,
   374  				Op:         structs.ACLBootstrapInit,
   375  			}
   376  			resp, err := s.raftApply(structs.ACLRequestType, &req)
   377  			if err != nil {
   378  				return fmt.Errorf("failed to initialize ACL bootstrap: %v", err)
   379  			}
   380  			switch v := resp.(type) {
   381  			case error:
   382  				return fmt.Errorf("failed to initialize ACL bootstrap: %v", v)
   383  
   384  			case bool:
   385  				if v {
   386  					s.logger.Printf("[INFO] consul: ACL bootstrap enabled")
   387  				} else {
   388  					s.logger.Printf("[INFO] consul: ACL bootstrap disabled, existing management tokens found")
   389  				}
   390  
   391  			default:
   392  				return fmt.Errorf("unexpected response trying to initialize ACL bootstrap: %T", v)
   393  			}
   394  		}
   395  	} else {
   396  		s.logger.Printf("[WARN] consul: Can't initialize ACL bootstrap until all servers are >= %s", minVersion.String())
   397  	}
   398  
   399  	return nil
   400  }
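
// The ServersMeetMinimumVersion gate used above follows the usual
// hashicorp/go-version comparison pattern. A minimal sketch, assuming a
// hypothetical serverVersion already parsed from a member's build tag:
//
//	minVersion := version.Must(version.NewVersion("0.9.1"))
//	serverVersion, err := version.NewVersion("0.9.3")
//	if err == nil && serverVersion.Compare(minVersion) >= 0 {
//		// this server understands the ACL bootstrap Raft operation
//	}
//
// Only when every LAN server clears the minimum is the bootstrap request
// applied, since the new Raft operation would produce an error on older
// servers (as the comment above notes).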
   401  
   402  // initializeACLs is used to setup the ACLs if we are the leader
   403  // and need to do this.
   404  func (s *Server) initializeACLs(upgrade bool) error {
   405  	if !s.ACLsEnabled() {
   406  		return nil
   407  	}
   408  
   409  	// Purge the cache, since it could've changed while we were not the
   410  	// leader.
   411  	s.acls.cache.Purge()
   412  
   413  	// Remove any token affected by CVE-2019-8336
   414  	if !s.InACLDatacenter() {
   415  		_, token, err := s.fsm.State().ACLTokenGetBySecret(nil, redactedToken)
   416  		if err == nil && token != nil {
   417  			req := structs.ACLTokenBatchDeleteRequest{
   418  				TokenIDs: []string{token.AccessorID},
   419  			}
   420  
   421  			_, err := s.raftApply(structs.ACLTokenDeleteRequestType, &req)
   422  			if err != nil {
   423  				return fmt.Errorf("failed to remove token with a redacted secret: %v", err)
   424  			}
   425  		}
   426  	}
   427  
   428  	if s.InACLDatacenter() {
   429  		if s.UseLegacyACLs() && !upgrade {
   430  			s.logger.Printf("[INFO] acl: initializing legacy acls")
   431  			return s.initializeLegacyACL()
   432  		}
   433  
   434  		s.logger.Printf("[INFO] acl: initializing acls")
   435  
   436  		// Create the builtin global-management policy
   437  		_, policy, err := s.fsm.State().ACLPolicyGetByID(nil, structs.ACLPolicyGlobalManagementID)
   438  		if err != nil {
   439  			return fmt.Errorf("failed to get the builtin global-management policy")
   440  		}
   441  		if policy == nil {
   442  			policy := structs.ACLPolicy{
   443  				ID:          structs.ACLPolicyGlobalManagementID,
   444  				Name:        "global-management",
   445  				Description: "Builtin Policy that grants unlimited access",
   446  				Rules:       structs.ACLPolicyGlobalManagement,
   447  				Syntax:      acl.SyntaxCurrent,
   448  			}
   449  			policy.SetHash(true)
   450  
   451  			req := structs.ACLPolicyBatchSetRequest{
   452  				Policies: structs.ACLPolicies{&policy},
   453  			}
   454  			_, err := s.raftApply(structs.ACLPolicySetRequestType, &req)
   455  			if err != nil {
   456  				return fmt.Errorf("failed to create global-management policy: %v", err)
   457  			}
   458  			s.logger.Printf("[INFO] consul: Created ACL 'global-management' policy")
   459  		}
   460  
   461  		// Check for configured master token.
   462  		if master := s.config.ACLMasterToken; len(master) > 0 {
   463  			state := s.fsm.State()
   464  			if _, err := uuid.ParseUUID(master); err != nil {
   465  				s.logger.Printf("[WARN] consul: Configuring a non-UUID master token is deprecated")
   466  			}
   467  
   468  			_, token, err := state.ACLTokenGetBySecret(nil, master)
   469  			if err != nil {
   470  				return fmt.Errorf("failed to get master token: %v", err)
   471  			}
   472  			if token == nil {
   473  				accessor, err := lib.GenerateUUID(s.checkTokenUUID)
   474  				if err != nil {
   475  					return fmt.Errorf("failed to generate the accessor ID for the master token: %v", err)
   476  				}
   477  
   478  				token := structs.ACLToken{
   479  					AccessorID:  accessor,
   480  					SecretID:    master,
   481  					Description: "Master Token",
   482  					Policies: []structs.ACLTokenPolicyLink{
   483  						{
   484  							ID: structs.ACLPolicyGlobalManagementID,
   485  						},
   486  					},
   487  					CreateTime: time.Now(),
   488  					Local:      false,
   489  
   490  					// DEPRECATED (ACL-Legacy-Compat) - only needed for compatibility
   491  					Type: structs.ACLTokenTypeManagement,
   492  				}
   493  
   494  				token.SetHash(true)
   495  
   496  				done := false
   497  				if canBootstrap, _, err := state.CanBootstrapACLToken(); err == nil && canBootstrap {
   498  					req := structs.ACLTokenBootstrapRequest{
   499  						Token:      token,
   500  						ResetIndex: 0,
   501  					}
   502  					if _, err := s.raftApply(structs.ACLBootstrapRequestType, &req); err == nil {
   503  						s.logger.Printf("[INFO] consul: Bootstrapped ACL master token from configuration")
   504  						done = true
   505  					} else {
   506  						if err.Error() != structs.ACLBootstrapNotAllowedErr.Error() &&
   507  							err.Error() != structs.ACLBootstrapInvalidResetIndexErr.Error() {
   508  							return fmt.Errorf("failed to bootstrap master token: %v", err)
   509  						}
   510  					}
   511  				}
   512  
   513  				if !done {
    514  					// either we didn't attempt to bootstrap, or setting the token with a bootstrap request failed.
   515  					req := structs.ACLTokenBatchSetRequest{
   516  						Tokens: structs.ACLTokens{&token},
   517  						CAS:    false,
   518  					}
   519  					if _, err := s.raftApply(structs.ACLTokenSetRequestType, &req); err != nil {
   520  						return fmt.Errorf("failed to create master token: %v", err)
   521  					}
   522  
   523  					s.logger.Printf("[INFO] consul: Created ACL master token from configuration")
   524  				}
   525  			}
   526  		}
   527  
   528  		state := s.fsm.State()
   529  		_, token, err := state.ACLTokenGetBySecret(nil, structs.ACLTokenAnonymousID)
   530  		if err != nil {
   531  			return fmt.Errorf("failed to get anonymous token: %v", err)
   532  		}
   533  		if token == nil {
   534  			// DEPRECATED (ACL-Legacy-Compat) - Don't need to query for previous "anonymous" token
   535  			// check for legacy token that needs an upgrade
   536  			_, legacyToken, err := state.ACLTokenGetBySecret(nil, anonymousToken)
   537  			if err != nil {
   538  				return fmt.Errorf("failed to get anonymous token: %v", err)
   539  			}
   540  
   541  			// the token upgrade routine will take care of upgrading the token if a legacy version exists
   542  			if legacyToken == nil {
   543  				token = &structs.ACLToken{
   544  					AccessorID:  structs.ACLTokenAnonymousID,
   545  					SecretID:    anonymousToken,
   546  					Description: "Anonymous Token",
   547  					CreateTime:  time.Now(),
   548  				}
   549  				token.SetHash(true)
   550  
   551  				req := structs.ACLTokenBatchSetRequest{
   552  					Tokens: structs.ACLTokens{token},
   553  					CAS:    false,
   554  				}
   555  				_, err := s.raftApply(structs.ACLTokenSetRequestType, &req)
   556  				if err != nil {
   557  					return fmt.Errorf("failed to create anonymous token: %v", err)
   558  				}
   559  				s.logger.Printf("[INFO] consul: Created ACL anonymous token from configuration")
   560  			}
   561  		}
   562  		s.startACLUpgrade()
   563  	} else {
   564  		if s.UseLegacyACLs() && !upgrade {
   565  			if s.IsACLReplicationEnabled() {
   566  				s.startLegacyACLReplication()
   567  			}
   568  		}
   569  
   570  		if upgrade {
   571  			s.stopACLReplication()
   572  		}
   573  
   574  		// ACL replication is now mandatory
   575  		s.startACLReplication()
   576  	}
   577  
    578  	// launch the upgrade goroutine to generate accessors for everything
   579  
   580  	return nil
   581  }
   582  
   583  func (s *Server) startACLUpgrade() {
   584  	s.aclUpgradeLock.Lock()
   585  	defer s.aclUpgradeLock.Unlock()
   586  
   587  	if s.aclUpgradeEnabled {
   588  		return
   589  	}
   590  
   591  	ctx, cancel := context.WithCancel(context.Background())
   592  	s.aclUpgradeCancel = cancel
   593  
   594  	go func() {
   595  		limiter := rate.NewLimiter(aclUpgradeRateLimit, int(aclUpgradeRateLimit))
   596  		for {
   597  			if err := limiter.Wait(ctx); err != nil {
   598  				return
   599  			}
   600  
   601  			// actually run the upgrade here
   602  			state := s.fsm.State()
   603  			tokens, waitCh, err := state.ACLTokenListUpgradeable(aclUpgradeBatchSize)
   604  			if err != nil {
   605  				s.logger.Printf("[WARN] acl: encountered an error while searching for tokens without accessor ids: %v", err)
   606  			}
   607  
   608  			if len(tokens) == 0 {
   609  				ws := memdb.NewWatchSet()
   610  				ws.Add(state.AbandonCh())
   611  				ws.Add(waitCh)
   612  				ws.Add(ctx.Done())
   613  
    614  				// wait for more tokens to need upgrading or the context to be cancelled
   615  				ws.Watch(nil)
   616  				continue
   617  			}
   618  
   619  			var newTokens structs.ACLTokens
   620  			for _, token := range tokens {
   621  				// This should be entirely unnecessary but is just a small safeguard against changing accessor IDs
   622  				if token.AccessorID != "" {
   623  					continue
   624  				}
   625  
   626  				newToken := *token
   627  				if token.SecretID == anonymousToken {
   628  					newToken.AccessorID = structs.ACLTokenAnonymousID
   629  				} else {
   630  					accessor, err := lib.GenerateUUID(s.checkTokenUUID)
   631  					if err != nil {
   632  						s.logger.Printf("[WARN] acl: failed to generate accessor during token auto-upgrade: %v", err)
   633  						continue
   634  					}
   635  					newToken.AccessorID = accessor
   636  				}
   637  
   638  				// Assign the global-management policy to legacy management tokens
   639  				if len(newToken.Policies) == 0 && newToken.Type == structs.ACLTokenTypeManagement {
   640  					newToken.Policies = append(newToken.Policies, structs.ACLTokenPolicyLink{ID: structs.ACLPolicyGlobalManagementID})
   641  				}
   642  
   643  				// need to copy these as we are going to do a CAS operation.
   644  				newToken.CreateIndex = token.CreateIndex
   645  				newToken.ModifyIndex = token.ModifyIndex
   646  
   647  				newToken.SetHash(true)
   648  
   649  				newTokens = append(newTokens, &newToken)
   650  			}
   651  
   652  			req := &structs.ACLTokenBatchSetRequest{Tokens: newTokens, CAS: true}
   653  
   654  			resp, err := s.raftApply(structs.ACLTokenSetRequestType, req)
   655  			if err != nil {
   656  				s.logger.Printf("[ERR] acl: failed to apply acl token upgrade batch: %v", err)
   657  			}
   658  
   659  			if err, ok := resp.(error); ok {
   660  				s.logger.Printf("[ERR] acl: failed to apply acl token upgrade batch: %v", err)
   661  			}
   662  		}
   663  	}()
   664  
   665  	s.aclUpgradeEnabled = true
   666  }
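
// The upgrade loop above leans on two generic building blocks: a token-bucket
// rate limiter and a go-memdb watch set. A standalone sketch with illustrative
// names (ctx, stateChangeCh):
//
//	limiter := rate.NewLimiter(aclUpgradeRateLimit, int(aclUpgradeRateLimit))
//	if err := limiter.Wait(ctx); err != nil {
//		return // context cancelled while waiting for a token
//	}
//
//	ws := memdb.NewWatchSet()
//	ws.Add(stateChangeCh) // fires when the watched state changes
//	ws.Add(ctx.Done())
//	ws.Watch(nil) // nil timeout channel: block until any watched channel fires
//
// The limiter caps how quickly upgrade batches are applied, and the watch set
// parks the goroutine until there is more work (or shutdown) instead of polling.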
   667  
   668  func (s *Server) stopACLUpgrade() {
   669  	s.aclUpgradeLock.Lock()
   670  	defer s.aclUpgradeLock.Unlock()
   671  
   672  	if !s.aclUpgradeEnabled {
   673  		return
   674  	}
   675  
   676  	s.aclUpgradeCancel()
   677  	s.aclUpgradeCancel = nil
   678  	s.aclUpgradeEnabled = false
   679  }
   680  
   681  func (s *Server) startLegacyACLReplication() {
   682  	s.aclReplicationLock.Lock()
   683  	defer s.aclReplicationLock.Unlock()
   684  
   685  	if s.aclReplicationEnabled {
   686  		return
   687  	}
   688  
   689  	s.initReplicationStatus()
   690  	ctx, cancel := context.WithCancel(context.Background())
   691  	s.aclReplicationCancel = cancel
   692  
   693  	go func() {
   694  		var lastRemoteIndex uint64
   695  		limiter := rate.NewLimiter(rate.Limit(s.config.ACLReplicationRate), s.config.ACLReplicationBurst)
   696  
   697  		for {
   698  			if err := limiter.Wait(ctx); err != nil {
   699  				return
   700  			}
   701  
   702  			if s.tokens.ReplicationToken() == "" {
   703  				continue
   704  			}
   705  
   706  			index, exit, err := s.replicateLegacyACLs(lastRemoteIndex, ctx)
   707  			if exit {
   708  				return
   709  			}
   710  
   711  			if err != nil {
   712  				lastRemoteIndex = 0
   713  				s.updateACLReplicationStatusError()
   714  				s.logger.Printf("[WARN] consul: Legacy ACL replication error (will retry if still leader): %v", err)
   715  			} else {
   716  				lastRemoteIndex = index
   717  				s.updateACLReplicationStatusIndex(index)
   718  				s.logger.Printf("[DEBUG] consul: Legacy ACL replication completed through remote index %d", index)
   719  			}
   720  		}
   721  	}()
   722  
   723  	s.updateACLReplicationStatusRunning(structs.ACLReplicateLegacy)
   724  	s.aclReplicationEnabled = true
   725  }
   726  
   727  func (s *Server) startACLReplication() {
   728  	s.aclReplicationLock.Lock()
   729  	defer s.aclReplicationLock.Unlock()
   730  
   731  	if s.aclReplicationEnabled {
   732  		return
   733  	}
   734  
   735  	s.initReplicationStatus()
   736  	ctx, cancel := context.WithCancel(context.Background())
   737  	s.aclReplicationCancel = cancel
   738  
   739  	replicationType := structs.ACLReplicatePolicies
   740  
   741  	go func() {
   742  		var failedAttempts uint
   743  		limiter := rate.NewLimiter(rate.Limit(s.config.ACLReplicationRate), s.config.ACLReplicationBurst)
   744  
   745  		var lastRemoteIndex uint64
   746  		for {
   747  			if err := limiter.Wait(ctx); err != nil {
   748  				return
   749  			}
   750  
   751  			if s.tokens.ReplicationToken() == "" {
   752  				continue
   753  			}
   754  
   755  			index, exit, err := s.replicateACLPolicies(lastRemoteIndex, ctx)
   756  			if exit {
   757  				return
   758  			}
   759  
   760  			if err != nil {
   761  				lastRemoteIndex = 0
   762  				s.updateACLReplicationStatusError()
   763  				s.logger.Printf("[WARN] consul: ACL policy replication error (will retry if still leader): %v", err)
   764  				if (1 << failedAttempts) < aclReplicationMaxRetryBackoff {
   765  					failedAttempts++
   766  				}
   767  
   768  				select {
   769  				case <-ctx.Done():
   770  					return
   771  				case <-time.After((1 << failedAttempts) * time.Second):
   772  					// do nothing
   773  				}
   774  			} else {
   775  				lastRemoteIndex = index
   776  				s.updateACLReplicationStatusIndex(index)
   777  				s.logger.Printf("[DEBUG] consul: ACL policy replication completed through remote index %d", index)
   778  				failedAttempts = 0
   779  			}
   780  		}
   781  	}()
   782  
   783  	s.logger.Printf("[INFO] acl: started ACL Policy replication")
   784  
   785  	if s.config.ACLTokenReplication {
   786  		replicationType = structs.ACLReplicateTokens
   787  
   788  		go func() {
   789  			var failedAttempts uint
   790  			limiter := rate.NewLimiter(rate.Limit(s.config.ACLReplicationRate), s.config.ACLReplicationBurst)
   791  			var lastRemoteIndex uint64
   792  			for {
   793  				if err := limiter.Wait(ctx); err != nil {
   794  					return
   795  				}
   796  
   797  				if s.tokens.ReplicationToken() == "" {
   798  					continue
   799  				}
   800  
   801  				index, exit, err := s.replicateACLTokens(lastRemoteIndex, ctx)
   802  				if exit {
   803  					return
   804  				}
   805  
   806  				if err != nil {
   807  					lastRemoteIndex = 0
   808  					s.updateACLReplicationStatusError()
   809  					s.logger.Printf("[WARN] consul: ACL token replication error (will retry if still leader): %v", err)
   810  					if (1 << failedAttempts) < aclReplicationMaxRetryBackoff {
   811  						failedAttempts++
   812  					}
   813  
   814  					select {
   815  					case <-ctx.Done():
   816  						return
   817  					case <-time.After((1 << failedAttempts) * time.Second):
   818  						// do nothing
   819  					}
   820  				} else {
   821  					lastRemoteIndex = index
   822  					s.updateACLReplicationStatusTokenIndex(index)
   823  					s.logger.Printf("[DEBUG] consul: ACL token replication completed through remote index %d", index)
   824  					failedAttempts = 0
   825  				}
   826  			}
   827  		}()
   828  
   829  		s.logger.Printf("[INFO] acl: started ACL Token replication")
   830  	}
   831  
   832  	s.updateACLReplicationStatusRunning(replicationType)
   833  
   834  	s.aclReplicationEnabled = true
   835  }
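
// The retry delay used in both replication goroutines above is a plain capped
// exponential backoff, written out here on its own:
//
//	if (1 << failedAttempts) < aclReplicationMaxRetryBackoff {
//		failedAttempts++
//	}
//	retryAfter := (1 << failedAttempts) * time.Second // 2s, 4s, 8s, ...
//
// failedAttempts stops growing once 1<<failedAttempts reaches
// aclReplicationMaxRetryBackoff, which bounds the sleep, and any successful
// replication round resets it to zero.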
   836  
   837  func (s *Server) stopACLReplication() {
   838  	s.aclReplicationLock.Lock()
   839  	defer s.aclReplicationLock.Unlock()
   840  
   841  	if !s.aclReplicationEnabled {
   842  		return
   843  	}
   844  
   845  	s.aclReplicationCancel()
   846  	s.aclReplicationCancel = nil
   847  	s.updateACLReplicationStatusStopped()
   848  	s.aclReplicationEnabled = false
   849  }
   850  
   851  // getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
   852  func (s *Server) getOrCreateAutopilotConfig() *autopilot.Config {
   853  	state := s.fsm.State()
   854  	_, config, err := state.AutopilotConfig()
   855  	if err != nil {
   856  		s.logger.Printf("[ERR] autopilot: failed to get config: %v", err)
   857  		return nil
   858  	}
   859  	if config != nil {
   860  		return config
   861  	}
   862  
   863  	if !ServersMeetMinimumVersion(s.LANMembers(), minAutopilotVersion) {
   864  		s.logger.Printf("[WARN] autopilot: can't initialize until all servers are >= %s", minAutopilotVersion.String())
   865  		return nil
   866  	}
   867  
   868  	config = s.config.AutopilotConfig
   869  	req := structs.AutopilotSetConfigRequest{Config: *config}
   870  	if _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
   871  		s.logger.Printf("[ERR] autopilot: failed to initialize config: %v", err)
   872  		return nil
   873  	}
   874  
   875  	return config
   876  }
   877  
   878  // initializeCAConfig is used to initialize the CA config if necessary
   879  // when setting up the CA during establishLeadership
   880  func (s *Server) initializeCAConfig() (*structs.CAConfiguration, error) {
   881  	state := s.fsm.State()
   882  	_, config, err := state.CAConfig()
   883  	if err != nil {
   884  		return nil, err
   885  	}
   886  	if config != nil {
   887  		return config, nil
   888  	}
   889  
   890  	config = s.config.CAConfig
   891  	if config.ClusterID == "" {
   892  		id, err := uuid.GenerateUUID()
   893  		if err != nil {
   894  			return nil, err
   895  		}
   896  		config.ClusterID = id
   897  	}
   898  
   899  	req := structs.CARequest{
   900  		Op:     structs.CAOpSetConfig,
   901  		Config: config,
   902  	}
   903  	if _, err = s.raftApply(structs.ConnectCARequestType, req); err != nil {
   904  		return nil, err
   905  	}
   906  
   907  	return config, nil
   908  }
   909  
   910  // initializeRootCA runs the initialization logic for a root CA.
   911  func (s *Server) initializeRootCA(provider ca.Provider, conf *structs.CAConfiguration) error {
   912  	if err := provider.Configure(conf.ClusterID, true, conf.Config); err != nil {
   913  		return fmt.Errorf("error configuring provider: %v", err)
   914  	}
   915  	if err := provider.GenerateRoot(); err != nil {
   916  		return fmt.Errorf("error generating CA root certificate: %v", err)
   917  	}
   918  
   919  	// Get the active root cert from the CA
   920  	rootPEM, err := provider.ActiveRoot()
   921  	if err != nil {
   922  		return fmt.Errorf("error getting root cert: %v", err)
   923  	}
   924  
   925  	rootCA, err := parseCARoot(rootPEM, conf.Provider, conf.ClusterID)
   926  	if err != nil {
   927  		return err
   928  	}
   929  
   930  	// Check if the CA root is already initialized and exit if it is,
   931  	// adding on any existing intermediate certs since they aren't directly
   932  	// tied to the provider.
   933  	// Every change to the CA after this initial bootstrapping should
   934  	// be done through the rotation process.
   935  	state := s.fsm.State()
   936  	_, activeRoot, err := state.CARootActive(nil)
   937  	if err != nil {
   938  		return err
   939  	}
   940  	if activeRoot != nil {
   941  		// This state shouldn't be possible to get into because we update the root and
   942  		// CA config in the same FSM operation.
   943  		if activeRoot.ID != rootCA.ID {
    944  			return fmt.Errorf("the CA root from the provider (%q) does not match the stored active root (%s)", rootCA.ID, activeRoot.ID)
   945  		}
   946  
   947  		rootCA.IntermediateCerts = activeRoot.IntermediateCerts
   948  		s.setCAProvider(provider, rootCA)
   949  
   950  		return nil
   951  	}
   952  
   953  	// Get the highest index
   954  	idx, _, err := state.CARoots(nil)
   955  	if err != nil {
   956  		return err
   957  	}
   958  
   959  	// Store the root cert in raft
   960  	resp, err := s.raftApply(structs.ConnectCARequestType, &structs.CARequest{
   961  		Op:    structs.CAOpSetRoots,
   962  		Index: idx,
   963  		Roots: []*structs.CARoot{rootCA},
   964  	})
   965  	if err != nil {
   966  		s.logger.Printf("[ERR] connect: Apply failed %v", err)
   967  		return err
   968  	}
   969  	if respErr, ok := resp.(error); ok {
   970  		return respErr
   971  	}
   972  
   973  	s.setCAProvider(provider, rootCA)
   974  
   975  	s.logger.Printf("[INFO] connect: initialized primary datacenter CA with provider %q", conf.Provider)
   976  
   977  	return nil
   978  }
   979  
   980  // parseCARoot returns a filled-in structs.CARoot from a raw PEM value.
   981  func parseCARoot(pemValue, provider, clusterID string) (*structs.CARoot, error) {
   982  	id, err := connect.CalculateCertFingerprint(pemValue)
   983  	if err != nil {
   984  		return nil, fmt.Errorf("error parsing root fingerprint: %v", err)
   985  	}
   986  	rootCert, err := connect.ParseCert(pemValue)
   987  	if err != nil {
   988  		return nil, fmt.Errorf("error parsing root cert: %v", err)
   989  	}
   990  	return &structs.CARoot{
   991  		ID:                  id,
   992  		Name:                fmt.Sprintf("%s CA Root Cert", strings.Title(provider)),
   993  		SerialNumber:        rootCert.SerialNumber.Uint64(),
   994  		SigningKeyID:        connect.HexString(rootCert.AuthorityKeyId),
   995  		ExternalTrustDomain: clusterID,
   996  		NotBefore:           rootCert.NotBefore,
   997  		NotAfter:            rootCert.NotAfter,
   998  		RootCert:            pemValue,
   999  		Active:              true,
  1000  	}, nil
  1001  }
  1002  
   1003  // createCAProvider returns a connect CA provider from the given config.
  1004  func (s *Server) createCAProvider(conf *structs.CAConfiguration) (ca.Provider, error) {
  1005  	switch conf.Provider {
  1006  	case structs.ConsulCAProvider:
  1007  		return &ca.ConsulProvider{Delegate: &consulCADelegate{s}}, nil
  1008  	case structs.VaultCAProvider:
  1009  		return &ca.VaultProvider{}, nil
  1010  	default:
  1011  		return nil, fmt.Errorf("unknown CA provider %q", conf.Provider)
  1012  	}
  1013  }
  1014  
  1015  func (s *Server) getCAProvider() (ca.Provider, *structs.CARoot) {
  1016  	retries := 0
  1017  	var result ca.Provider
  1018  	var resultRoot *structs.CARoot
  1019  	for result == nil {
  1020  		s.caProviderLock.RLock()
  1021  		result = s.caProvider
  1022  		resultRoot = s.caProviderRoot
  1023  		s.caProviderLock.RUnlock()
  1024  
  1025  		// In cases where an agent is started with managed proxies, we may ask
  1026  		// for the provider before establishLeadership completes. If we're the
  1027  		// leader, then wait and get the provider again
  1028  		if result == nil && s.IsLeader() && retries < 10 {
  1029  			retries++
  1030  			time.Sleep(50 * time.Millisecond)
  1031  			continue
  1032  		}
  1033  
  1034  		break
  1035  	}
  1036  
  1037  	return result, resultRoot
  1038  }
  1039  
  1040  func (s *Server) setCAProvider(newProvider ca.Provider, root *structs.CARoot) {
  1041  	s.caProviderLock.Lock()
  1042  	defer s.caProviderLock.Unlock()
  1043  	s.caProvider = newProvider
  1044  	s.caProviderRoot = root
  1045  }
  1046  
  1047  // startCARootPruning starts a goroutine that looks for stale CARoots
  1048  // and removes them from the state store.
  1049  func (s *Server) startCARootPruning() {
  1050  	s.caPruningLock.Lock()
  1051  	defer s.caPruningLock.Unlock()
  1052  
  1053  	if s.caPruningEnabled {
  1054  		return
  1055  	}
  1056  
  1057  	s.caPruningCh = make(chan struct{})
  1058  
  1059  	go func() {
  1060  		ticker := time.NewTicker(caRootPruneInterval)
  1061  		defer ticker.Stop()
  1062  
  1063  		for {
  1064  			select {
  1065  			case <-s.caPruningCh:
  1066  				return
  1067  			case <-ticker.C:
  1068  				if err := s.pruneCARoots(); err != nil {
  1069  					s.logger.Printf("[ERR] connect: error pruning CA roots: %v", err)
  1070  				}
  1071  			}
  1072  		}
  1073  	}()
  1074  
  1075  	s.caPruningEnabled = true
  1076  }
  1077  
  1078  // pruneCARoots looks for any CARoots that have been rotated out and expired.
  1079  func (s *Server) pruneCARoots() error {
  1080  	if !s.config.ConnectEnabled {
  1081  		return nil
  1082  	}
  1083  
  1084  	state := s.fsm.State()
  1085  	idx, roots, err := state.CARoots(nil)
  1086  	if err != nil {
  1087  		return err
  1088  	}
  1089  
  1090  	_, caConf, err := state.CAConfig()
  1091  	if err != nil {
  1092  		return err
  1093  	}
  1094  
  1095  	common, err := caConf.GetCommonConfig()
  1096  	if err != nil {
  1097  		return err
  1098  	}
  1099  
  1100  	var newRoots structs.CARoots
  1101  	for _, r := range roots {
   1102  		if !r.Active && !r.RotatedOutAt.IsZero() && time.Since(r.RotatedOutAt) > common.LeafCertTTL*2 {
  1103  			s.logger.Printf("[INFO] connect: pruning old unused root CA (ID: %s)", r.ID)
  1104  			continue
  1105  		}
  1106  		newRoot := *r
  1107  		newRoots = append(newRoots, &newRoot)
  1108  	}
  1109  
  1110  	// Return early if there's nothing to remove.
  1111  	if len(newRoots) == len(roots) {
  1112  		return nil
  1113  	}
  1114  
  1115  	// Commit the new root state.
  1116  	var args structs.CARequest
  1117  	args.Op = structs.CAOpSetRoots
  1118  	args.Index = idx
  1119  	args.Roots = newRoots
  1120  	resp, err := s.raftApply(structs.ConnectCARequestType, args)
  1121  	if err != nil {
  1122  		return err
  1123  	}
  1124  	if respErr, ok := resp.(error); ok {
  1125  		return respErr
  1126  	}
  1127  
  1128  	return nil
  1129  }
  1130  
  1131  // stopCARootPruning stops the CARoot pruning process.
  1132  func (s *Server) stopCARootPruning() {
  1133  	s.caPruningLock.Lock()
  1134  	defer s.caPruningLock.Unlock()
  1135  
  1136  	if !s.caPruningEnabled {
  1137  		return
  1138  	}
  1139  
  1140  	close(s.caPruningCh)
  1141  	s.caPruningEnabled = false
  1142  }
  1143  
  1144  // reconcileReaped is used to reconcile nodes that have failed and been reaped
  1145  // from Serf but remain in the catalog. This is done by looking for unknown nodes with serfHealth checks registered.
  1146  // We generate a "reap" event to cause the node to be cleaned up.
  1147  func (s *Server) reconcileReaped(known map[string]struct{}) error {
  1148  	state := s.fsm.State()
  1149  	_, checks, err := state.ChecksInState(nil, api.HealthAny)
  1150  	if err != nil {
  1151  		return err
  1152  	}
  1153  	for _, check := range checks {
   1154  		// Ignore any non-serf checks
  1155  		if check.CheckID != structs.SerfCheckID {
  1156  			continue
  1157  		}
  1158  
  1159  		// Check if this node is "known" by serf
  1160  		if _, ok := known[check.Node]; ok {
  1161  			continue
  1162  		}
  1163  
  1164  		// Get the node services, look for ConsulServiceID
  1165  		_, services, err := state.NodeServices(nil, check.Node)
  1166  		if err != nil {
  1167  			return err
  1168  		}
  1169  		serverPort := 0
  1170  		serverAddr := ""
  1171  		serverID := ""
  1172  
  1173  	CHECKS:
  1174  		for _, service := range services.Services {
  1175  			if service.ID == structs.ConsulServiceID {
  1176  				_, node, err := state.GetNode(check.Node)
  1177  				if err != nil {
  1178  					s.logger.Printf("[ERR] consul: Unable to look up node with name %q: %v", check.Node, err)
  1179  					continue CHECKS
  1180  				}
  1181  
  1182  				serverAddr = node.Address
  1183  				serverPort = service.Port
  1184  				lookupAddr := net.JoinHostPort(serverAddr, strconv.Itoa(serverPort))
  1185  				svr := s.serverLookup.Server(raft.ServerAddress(lookupAddr))
  1186  				if svr != nil {
  1187  					serverID = svr.ID
  1188  				}
  1189  				break
  1190  			}
  1191  		}
  1192  
  1193  		// Create a fake member
  1194  		member := serf.Member{
  1195  			Name: check.Node,
  1196  			Tags: map[string]string{
  1197  				"dc":   s.config.Datacenter,
  1198  				"role": "node",
  1199  			},
  1200  		}
  1201  
  1202  		// Create the appropriate tags if this was a server node
  1203  		if serverPort > 0 {
  1204  			member.Tags["role"] = "consul"
  1205  			member.Tags["port"] = strconv.FormatUint(uint64(serverPort), 10)
  1206  			member.Tags["id"] = serverID
  1207  			member.Addr = net.ParseIP(serverAddr)
  1208  		}
  1209  
  1210  		// Attempt to reap this member
  1211  		if err := s.handleReapMember(member); err != nil {
  1212  			return err
  1213  		}
  1214  	}
  1215  	return nil
  1216  }
  1217  
  1218  // reconcileMember is used to do an async reconcile of a single
  1219  // serf member
  1220  func (s *Server) reconcileMember(member serf.Member) error {
  1221  	// Check if this is a member we should handle
  1222  	if !s.shouldHandleMember(member) {
  1223  		s.logger.Printf("[WARN] consul: skipping reconcile of node %v", member)
  1224  		return nil
  1225  	}
  1226  	defer metrics.MeasureSince([]string{"leader", "reconcileMember"}, time.Now())
  1227  	var err error
  1228  	switch member.Status {
  1229  	case serf.StatusAlive:
  1230  		err = s.handleAliveMember(member)
  1231  	case serf.StatusFailed:
  1232  		err = s.handleFailedMember(member)
  1233  	case serf.StatusLeft:
  1234  		err = s.handleLeftMember(member)
  1235  	case StatusReap:
  1236  		err = s.handleReapMember(member)
  1237  	}
  1238  	if err != nil {
  1239  		s.logger.Printf("[ERR] consul: failed to reconcile member: %v: %v",
  1240  			member, err)
  1241  
  1242  		// Permission denied should not bubble up
  1243  		if acl.IsErrPermissionDenied(err) {
  1244  			return nil
  1245  		}
  1246  	}
  1247  	return nil
  1248  }
  1249  
  1250  // shouldHandleMember checks if this is a Consul pool member
  1251  func (s *Server) shouldHandleMember(member serf.Member) bool {
  1252  	if valid, dc := isConsulNode(member); valid && dc == s.config.Datacenter {
  1253  		return true
  1254  	}
  1255  	if valid, parts := metadata.IsConsulServer(member); valid &&
  1256  		parts.Segment == "" &&
  1257  		parts.Datacenter == s.config.Datacenter {
  1258  		return true
  1259  	}
  1260  	return false
  1261  }
  1262  
  1263  // handleAliveMember is used to ensure the node
  1264  // is registered, with a passing health check.
  1265  func (s *Server) handleAliveMember(member serf.Member) error {
  1266  	// Register consul service if a server
  1267  	var service *structs.NodeService
  1268  	if valid, parts := metadata.IsConsulServer(member); valid {
  1269  		service = &structs.NodeService{
  1270  			ID:      structs.ConsulServiceID,
  1271  			Service: structs.ConsulServiceName,
  1272  			Port:    parts.Port,
  1273  		}
  1274  
  1275  		// Attempt to join the consul server
  1276  		if err := s.joinConsulServer(member, parts); err != nil {
  1277  			return err
  1278  		}
  1279  	}
  1280  
  1281  	// Check if the node exists
  1282  	state := s.fsm.State()
  1283  	_, node, err := state.GetNode(member.Name)
  1284  	if err != nil {
  1285  		return err
  1286  	}
  1287  	if node != nil && node.Address == member.Addr.String() {
  1288  		// Check if the associated service is available
  1289  		if service != nil {
  1290  			match := false
  1291  			_, services, err := state.NodeServices(nil, member.Name)
  1292  			if err != nil {
  1293  				return err
  1294  			}
  1295  			if services != nil {
  1296  				for id := range services.Services {
  1297  					if id == service.ID {
  1298  						match = true
  1299  					}
  1300  				}
  1301  			}
  1302  			if !match {
  1303  				goto AFTER_CHECK
  1304  			}
  1305  		}
  1306  
  1307  		// Check if the serfCheck is in the passing state
  1308  		_, checks, err := state.NodeChecks(nil, member.Name)
  1309  		if err != nil {
  1310  			return err
  1311  		}
  1312  		for _, check := range checks {
  1313  			if check.CheckID == structs.SerfCheckID && check.Status == api.HealthPassing {
  1314  				return nil
  1315  			}
  1316  		}
  1317  	}
  1318  AFTER_CHECK:
  1319  	s.logger.Printf("[INFO] consul: member '%s' joined, marking health alive", member.Name)
  1320  
  1321  	// Register with the catalog.
  1322  	req := structs.RegisterRequest{
  1323  		Datacenter: s.config.Datacenter,
  1324  		Node:       member.Name,
  1325  		ID:         types.NodeID(member.Tags["id"]),
  1326  		Address:    member.Addr.String(),
  1327  		Service:    service,
  1328  		Check: &structs.HealthCheck{
  1329  			Node:    member.Name,
  1330  			CheckID: structs.SerfCheckID,
  1331  			Name:    structs.SerfCheckName,
  1332  			Status:  api.HealthPassing,
  1333  			Output:  structs.SerfCheckAliveOutput,
  1334  		},
  1335  
  1336  		// If there's existing information about the node, do not
  1337  		// clobber it.
  1338  		SkipNodeUpdate: true,
  1339  	}
  1340  	_, err = s.raftApply(structs.RegisterRequestType, &req)
  1341  	return err
  1342  }
  1343  
  1344  // handleFailedMember is used to mark the node's status
  1345  // as being critical, along with all checks as unknown.
  1346  func (s *Server) handleFailedMember(member serf.Member) error {
  1347  	// Check if the node exists
  1348  	state := s.fsm.State()
  1349  	_, node, err := state.GetNode(member.Name)
  1350  	if err != nil {
  1351  		return err
  1352  	}
  1353  	if node != nil && node.Address == member.Addr.String() {
  1354  		// Check if the serfCheck is in the critical state
  1355  		_, checks, err := state.NodeChecks(nil, member.Name)
  1356  		if err != nil {
  1357  			return err
  1358  		}
  1359  		for _, check := range checks {
  1360  			if check.CheckID == structs.SerfCheckID && check.Status == api.HealthCritical {
  1361  				return nil
  1362  			}
  1363  		}
  1364  	}
  1365  	s.logger.Printf("[INFO] consul: member '%s' failed, marking health critical", member.Name)
  1366  
  1367  	// Register with the catalog
  1368  	req := structs.RegisterRequest{
  1369  		Datacenter: s.config.Datacenter,
  1370  		Node:       member.Name,
  1371  		ID:         types.NodeID(member.Tags["id"]),
  1372  		Address:    member.Addr.String(),
  1373  		Check: &structs.HealthCheck{
  1374  			Node:    member.Name,
  1375  			CheckID: structs.SerfCheckID,
  1376  			Name:    structs.SerfCheckName,
  1377  			Status:  api.HealthCritical,
  1378  			Output:  structs.SerfCheckFailedOutput,
  1379  		},
  1380  
  1381  		// If there's existing information about the node, do not
  1382  		// clobber it.
  1383  		SkipNodeUpdate: true,
  1384  	}
  1385  	_, err = s.raftApply(structs.RegisterRequestType, &req)
  1386  	return err
  1387  }
  1388  
  1389  // handleLeftMember is used to handle members that gracefully
  1390  // left. They are deregistered if necessary.
  1391  func (s *Server) handleLeftMember(member serf.Member) error {
  1392  	return s.handleDeregisterMember("left", member)
  1393  }
  1394  
  1395  // handleReapMember is used to handle members that have been
  1396  // reaped after a prolonged failure. They are deregistered.
  1397  func (s *Server) handleReapMember(member serf.Member) error {
  1398  	return s.handleDeregisterMember("reaped", member)
  1399  }
  1400  
   1401  // handleDeregisterMember is used to deregister a member for a given reason
  1402  func (s *Server) handleDeregisterMember(reason string, member serf.Member) error {
   1403  	// Do not deregister ourselves. This can only happen if the current leader
   1404  	// is leaving. Instead, we should allow a follower to take over and
  1405  	// deregister us later.
  1406  	if member.Name == s.config.NodeName {
  1407  		s.logger.Printf("[WARN] consul: deregistering self (%s) should be done by follower", s.config.NodeName)
  1408  		return nil
  1409  	}
  1410  
  1411  	// Remove from Raft peers if this was a server
  1412  	if valid, parts := metadata.IsConsulServer(member); valid {
  1413  		if err := s.removeConsulServer(member, parts.Port); err != nil {
  1414  			return err
  1415  		}
  1416  	}
  1417  
  1418  	// Check if the node does not exist
  1419  	state := s.fsm.State()
  1420  	_, node, err := state.GetNode(member.Name)
  1421  	if err != nil {
  1422  		return err
  1423  	}
  1424  	if node == nil {
  1425  		return nil
  1426  	}
  1427  
  1428  	// Deregister the node
  1429  	s.logger.Printf("[INFO] consul: member '%s' %s, deregistering", member.Name, reason)
  1430  	req := structs.DeregisterRequest{
  1431  		Datacenter: s.config.Datacenter,
  1432  		Node:       member.Name,
  1433  	}
  1434  	_, err = s.raftApply(structs.DeregisterRequestType, &req)
  1435  	return err
  1436  }
  1437  
  1438  // joinConsulServer is used to try to join another consul server
  1439  func (s *Server) joinConsulServer(m serf.Member, parts *metadata.Server) error {
  1440  	// Check for possibility of multiple bootstrap nodes
  1441  	if parts.Bootstrap {
  1442  		members := s.serfLAN.Members()
  1443  		for _, member := range members {
  1444  			valid, p := metadata.IsConsulServer(member)
  1445  			if valid && member.Name != m.Name && p.Bootstrap {
  1446  				s.logger.Printf("[ERR] consul: '%v' and '%v' are both in bootstrap mode. Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name)
  1447  				return nil
  1448  			}
  1449  		}
  1450  	}
  1451  
  1452  	// Processing ourselves could result in trying to remove ourselves to
  1453  	// fix up our address, which would make us step down. This is only
  1454  	// safe to attempt if there are multiple servers available.
  1455  	configFuture := s.raft.GetConfiguration()
  1456  	if err := configFuture.Error(); err != nil {
  1457  		s.logger.Printf("[ERR] consul: failed to get raft configuration: %v", err)
  1458  		return err
  1459  	}
  1460  	if m.Name == s.config.NodeName {
  1461  		if l := len(configFuture.Configuration().Servers); l < 3 {
  1462  			s.logger.Printf("[DEBUG] consul: Skipping self join check for %q since the cluster is too small", m.Name)
  1463  			return nil
  1464  		}
  1465  	}
  1466  
  1467  	// See if it's already in the configuration. It's harmless to re-add it
  1468  	// but we want to avoid doing that if possible to prevent useless Raft
  1469  	// log entries. If the address is the same but the ID changed, remove the
  1470  	// old server before adding the new one.
  1471  	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
  1472  	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
  1473  	if err != nil {
  1474  		return err
  1475  	}
  1476  	for _, server := range configFuture.Configuration().Servers {
  1477  		// No-op if the raft version is too low
  1478  		if server.Address == raft.ServerAddress(addr) && (minRaftProtocol < 2 || parts.RaftVersion < 3) {
  1479  			return nil
  1480  		}
  1481  
  1482  		// If the address or ID matches an existing server, see if we need to remove the old one first
  1483  		if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
  1484  			// Exit with no-op if this is being called on an existing server
  1485  			if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
  1486  				return nil
  1487  			}
  1488  			future := s.raft.RemoveServer(server.ID, 0, 0)
  1489  			if server.Address == raft.ServerAddress(addr) {
  1490  				if err := future.Error(); err != nil {
  1491  					return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err)
  1492  				}
  1493  				s.logger.Printf("[INFO] consul: removed server with duplicate address: %s", server.Address)
  1494  			} else {
  1495  				if err := future.Error(); err != nil {
  1496  					return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err)
  1497  				}
  1498  				s.logger.Printf("[INFO] consul: removed server with duplicate ID: %s", server.ID)
  1499  			}
  1500  		}
  1501  	}
  1502  
  1503  	// Attempt to add as a peer
  1504  	switch {
  1505  	case minRaftProtocol >= 3:
  1506  		addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
  1507  		if err := addFuture.Error(); err != nil {
  1508  			s.logger.Printf("[ERR] consul: failed to add raft peer: %v", err)
  1509  			return err
  1510  		}
  1511  	case minRaftProtocol == 2 && parts.RaftVersion >= 3:
  1512  		addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
  1513  		if err := addFuture.Error(); err != nil {
  1514  			s.logger.Printf("[ERR] consul: failed to add raft peer: %v", err)
  1515  			return err
  1516  		}
  1517  	default:
  1518  		addFuture := s.raft.AddPeer(raft.ServerAddress(addr))
  1519  		if err := addFuture.Error(); err != nil {
  1520  			s.logger.Printf("[ERR] consul: failed to add raft peer: %v", err)
  1521  			return err
  1522  		}
  1523  	}
  1524  
  1525  	// Trigger a check to remove dead servers
  1526  	s.autopilot.RemoveDeadServers()
  1527  
  1528  	return nil
  1529  }
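
// Summarizing the add-peer decision above (the protocol versions come from
// autopilot.MinRaftProtocol and the joining server's advertised raft version):
//
//	minRaftProtocol >= 3                      -> AddNonvoter by ID (voter promotion happens outside this function)
//	minRaftProtocol == 2 && RaftVersion >= 3  -> AddVoter by ID
//	otherwise                                 -> AddPeer by address (legacy API)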
  1530  
  1531  // removeConsulServer is used to try to remove a consul server that has left
  1532  func (s *Server) removeConsulServer(m serf.Member, port int) error {
  1533  	addr := (&net.TCPAddr{IP: m.Addr, Port: port}).String()
  1534  
  1535  	// See if it's already in the configuration. It's harmless to re-remove it
  1536  	// but we want to avoid doing that if possible to prevent useless Raft
  1537  	// log entries.
  1538  	configFuture := s.raft.GetConfiguration()
  1539  	if err := configFuture.Error(); err != nil {
  1540  		s.logger.Printf("[ERR] consul: failed to get raft configuration: %v", err)
  1541  		return err
  1542  	}
  1543  
  1544  	minRaftProtocol, err := s.autopilot.MinRaftProtocol()
  1545  	if err != nil {
  1546  		return err
  1547  	}
  1548  
  1549  	_, parts := metadata.IsConsulServer(m)
  1550  
  1551  	// Pick which remove API to use based on how the server was added.
  1552  	for _, server := range configFuture.Configuration().Servers {
  1553  		// If we understand the new add/remove APIs and the server was added by ID, use the new remove API
  1554  		if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) {
  1555  			s.logger.Printf("[INFO] consul: removing server by ID: %q", server.ID)
  1556  			future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0)
  1557  			if err := future.Error(); err != nil {
  1558  				s.logger.Printf("[ERR] consul: failed to remove raft peer '%v': %v",
  1559  					server.ID, err)
  1560  				return err
  1561  			}
  1562  			break
  1563  		} else if server.Address == raft.ServerAddress(addr) {
  1564  			// If not, use the old remove API
  1565  			s.logger.Printf("[INFO] consul: removing server by address: %q", server.Address)
  1566  			future := s.raft.RemovePeer(raft.ServerAddress(addr))
  1567  			if err := future.Error(); err != nil {
  1568  				s.logger.Printf("[ERR] consul: failed to remove raft peer '%v': %v",
  1569  					addr, err)
  1570  				return err
  1571  			}
  1572  			break
  1573  		}
  1574  	}
  1575  
  1576  	return nil
  1577  }
  1578  
  1579  // reapTombstones is invoked by the current leader to manage garbage
  1580  // collection of tombstones. When a key is deleted, we trigger a tombstone
  1581  // GC clock. Once the expiration is reached, this routine is invoked
  1582  // to clear all tombstones before this index. This must be replicated
  1583  // through Raft to ensure consistency. We do this outside the leader loop
  1584  // to avoid blocking.
  1585  func (s *Server) reapTombstones(index uint64) {
  1586  	defer metrics.MeasureSince([]string{"leader", "reapTombstones"}, time.Now())
  1587  	req := structs.TombstoneRequest{
  1588  		Datacenter: s.config.Datacenter,
  1589  		Op:         structs.TombstoneReap,
  1590  		ReapIndex:  index,
  1591  	}
  1592  	_, err := s.raftApply(structs.TombstoneRequestType, &req)
  1593  	if err != nil {
  1594  		s.logger.Printf("[ERR] consul: failed to reap tombstones up to %d: %v",
  1595  			index, err)
  1596  	}
  1597  }