github.com/criteo-forks/consul@v1.4.5-criteonogrpc/agent/agent.go (about)

     1  package agent
     2  
     3  import (
     4  	"context"
     5  	"crypto/sha512"
     6  	"crypto/tls"
     7  	"encoding/json"
     8  	"fmt"
     9  	"io"
    10  	"io/ioutil"
    11  	"log"
    12  	"net"
    13  	"net/http"
    14  	"os"
    15  	"path/filepath"
    16  	"strconv"
    17  	"strings"
    18  	"sync"
    19  	"time"
    20  
    21  	"google.golang.org/grpc"
    22  
    23  	metrics "github.com/armon/go-metrics"
    24  	"github.com/hashicorp/consul/acl"
    25  	"github.com/hashicorp/consul/agent/ae"
    26  	"github.com/hashicorp/consul/agent/cache"
    27  	cachetype "github.com/hashicorp/consul/agent/cache-types"
    28  	"github.com/hashicorp/consul/agent/checks"
    29  	"github.com/hashicorp/consul/agent/config"
    30  	"github.com/hashicorp/consul/agent/consul"
    31  	"github.com/hashicorp/consul/agent/local"
    32  	"github.com/hashicorp/consul/agent/proxycfg"
    33  	"github.com/hashicorp/consul/agent/proxyprocess"
    34  	"github.com/hashicorp/consul/agent/structs"
    35  	"github.com/hashicorp/consul/agent/systemd"
    36  	"github.com/hashicorp/consul/agent/token"
    37  	"github.com/hashicorp/consul/agent/xds"
    38  	"github.com/hashicorp/consul/api"
    39  	"github.com/hashicorp/consul/ipaddr"
    40  	"github.com/hashicorp/consul/lib"
    41  	"github.com/hashicorp/consul/lib/file"
    42  	"github.com/hashicorp/consul/logger"
    43  	"github.com/hashicorp/consul/tlsutil"
    44  	"github.com/hashicorp/consul/types"
    45  	"github.com/hashicorp/consul/watch"
    46  	multierror "github.com/hashicorp/go-multierror"
    47  	uuid "github.com/hashicorp/go-uuid"
    48  	"github.com/hashicorp/memberlist"
    49  	"github.com/hashicorp/raft"
    50  	"github.com/hashicorp/serf/serf"
    51  	"github.com/shirou/gopsutil/host"
    52  	"golang.org/x/net/http2"
    53  )
    54  
const (
	// servicesDir is the path used to save agent service definitions.
	// NOTE(review): presumably relative to the agent's data directory —
	// confirm against the persistence helpers.
	servicesDir = "services"

	// proxyDir is the path used to save agent proxy definitions.
	proxyDir = "proxies"

	// checksDir is the path used to save local agent checks and
	// checkStateDir is the path used for their state.
	checksDir     = "checks"
	checkStateDir = "checks/state"

	// tokensPath is the name of the file tokens will be persisted within.
	tokensPath = "acl-tokens.json"

	// defaultNodeMaintReason and defaultServiceMaintReason are the default
	// reasons for node/service maintenance mode, used when no reason was
	// provided.
	defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
		"but no reason was provided. This is a default message."
	defaultServiceMaintReason = "Maintenance mode is enabled for this " +
		"service, but no reason was provided. This is a default message."
)
    75  
// configSource is an integer enumeration identifying where a piece of
// configuration came from.
type configSource int

const (
	// ConfigSourceLocal is the zero value of configSource.
	// NOTE(review): presumably marks definitions that originate from the
	// local agent configuration — confirm against callers.
	ConfigSourceLocal configSource = iota
	// ConfigSourceRemote follows ConfigSourceLocal.
	// NOTE(review): presumably marks definitions received from a remote
	// source such as the API — confirm against callers.
	ConfigSourceRemote
)
    82  
// delegate defines the interface shared by both
// consul.Client and consul.Server. The Agent stores whichever of the two
// it created in its delegate field and talks to the cluster exclusively
// through these methods. The embedded enterpriseDelegate is declared
// outside this part of the file.
type delegate interface {
	Encrypted() bool
	GetLANCoordinate() (lib.CoordinateSet, error)
	Leave() error
	LANMembers() []serf.Member
	LANMembersAllSegments() ([]serf.Member, error)
	LANSegmentMembers(segment string) ([]serf.Member, error)
	LocalMember() serf.Member
	JoinLAN(addrs []string) (n int, err error)
	RemoveFailedNode(node string) error
	ResolveToken(secretID string) (acl.Authorizer, error)
	RPC(method string, args interface{}, reply interface{}) error
	ACLsEnabled() bool
	UseLegacyACLs() bool
	SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer, replyFn structs.SnapshotReplyFn) error
	Shutdown() error
	Stats() map[string]map[string]string
	ReloadConfig(config *consul.Config) error
	enterpriseDelegate
}
   105  
// notifier is called after a successful JoinLAN. New installs a
// *systemd.Notifier as the agent's default implementation.
type notifier interface {
	// Notify delivers the given message and reports any delivery error.
	Notify(string) error
}
   110  
// The agent is the long running process that is run on every machine.
// It exposes an RPC interface that is used by the CLI to control the
// agent. The agent runs the query interfaces like HTTP, DNS, and RPC.
// However, it can run in either a client, or server mode. In server
// mode, it runs a full Consul server. In client-only mode, it only forwards
// requests to other Consul servers.
type Agent struct {
	// config is the agent configuration.
	config *config.RuntimeConfig

	// logger is used for writing our logs. Start creates a default logger
	// writing to LogOutput (or os.Stderr) when it is nil.
	logger *log.Logger

	// LogOutput is the output sink for logs.
	LogOutput io.Writer

	// LogWriter is used for streaming logs to.
	LogWriter *logger.LogWriter

	// MemSink is the in-memory sink used for collecting metrics.
	MemSink *metrics.InmemSink

	// delegate is either a *consul.Server or *consul.Client
	// depending on the configuration
	delegate delegate

	// aclMasterAuthorizer is an object that helps manage local ACL enforcement.
	aclMasterAuthorizer acl.Authorizer

	// State stores a local representation of the node,
	// services and checks. Used for anti-entropy.
	State *local.State

	// sync manages the synchronization of the local
	// and the remote state.
	sync *ae.StateSyncer

	// syncMu and syncCh are used to coordinate agent endpoints that are blocking
	// on local state during a config reload.
	syncMu sync.Mutex
	syncCh chan struct{}

	// cache is the in-memory cache for data the Agent requests.
	cache *cache.Cache

	// checkReapAfter maps the check ID to a timeout after which we should
	// reap its associated service
	checkReapAfter map[types.CheckID]time.Duration

	// checkMonitors maps the check ID to an associated monitor
	checkMonitors map[types.CheckID]*checks.CheckMonitor

	// checkHTTPs maps the check ID to an associated HTTP check
	checkHTTPs map[types.CheckID]*checks.CheckHTTP

	// checkTCPs maps the check ID to an associated TCP check
	checkTCPs map[types.CheckID]*checks.CheckTCP

	// checkGRPCs maps the check ID to an associated GRPC check
	checkGRPCs map[types.CheckID]*checks.CheckGRPC

	// checkTTLs maps the check ID to an associated check TTL
	checkTTLs map[types.CheckID]*checks.CheckTTL

	// checkDockers maps the check ID to an associated Docker Exec based check
	checkDockers map[types.CheckID]*checks.CheckDocker

	// checkAliases maps the check ID to an associated Alias check
	checkAliases map[types.CheckID]*checks.CheckAlias

	// stateLock protects the agent state. Start holds it for its entire
	// duration.
	stateLock sync.Mutex

	// dockerClient is the client for performing docker health checks.
	dockerClient *checks.DockerClient

	// eventCh is used to receive user events
	eventCh chan serf.UserEvent

	// eventBuf stores the most recent events in a ring buffer
	// using eventIndex as the next index to insert into. This
	// is guarded by eventLock. When an insert happens, the
	// eventNotify group is notified.
	eventBuf    []*UserEvent
	eventIndex  int
	eventLock   sync.RWMutex
	eventNotify NotifyGroup

	// reloadCh carries channels of errors.
	// NOTE(review): presumably each value is a reply channel on which the
	// outcome of a config reload request is reported — confirm against the
	// reload handling code.
	reloadCh chan chan error

	// shutdown, shutdownCh and shutdownLock track agent shutdown.
	// shutdownCh is handed to the state syncer in Start.
	// NOTE(review): presumably shutdownLock guards shutdown and shutdownCh
	// is closed when the agent shuts down — confirm against the shutdown
	// code.
	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex

	// joinLANNotifier is called after a successful JoinLAN.
	joinLANNotifier notifier

	// retryJoinCh transports errors from the retry join
	// attempts.
	retryJoinCh chan error

	// endpoints maps unique RPC endpoint names to common ones
	// to allow overriding of RPC handlers since the golang
	// net/rpc server does not allow this.
	endpoints     map[string]string
	endpointsLock sync.RWMutex

	// dnsServers provide the DNS API; listenAndServeDNS appends one server
	// per configured DNS address.
	dnsServers []*DNSServer

	// httpServers provides the HTTP API on various endpoints
	httpServers []*HTTPServer

	// wgServers is the wait group for all HTTP and DNS servers
	wgServers sync.WaitGroup

	// watchPlans tracks all the currently-running watch plans for the
	// agent.
	watchPlans []*watch.Plan

	// tokens holds ACL tokens initially from the configuration, but can
	// be updated at runtime, so should always be used instead of going to
	// the configuration directly.
	tokens *token.Store

	// proxyManager is the proxy process manager for managed Connect proxies.
	proxyManager *proxyprocess.Manager

	// proxyConfig is the manager for proxy service (Kind = connect-proxy)
	// configuration state. This ensures all state needed by a proxy registration
	// is maintained in cache and handles pushing updates to that state into XDS
	// server to be pushed out to Envoy. This is NOT related to managed proxies
	// directly.
	proxyConfig *proxycfg.Manager

	// xdsServer is the Server instance that serves xDS gRPC API.
	xdsServer *xds.Server

	// grpcServer is the server instance used currently to serve xDS API for
	// Envoy.
	grpcServer *grpc.Server

	// tlsConfigurator is the central instance to provide a *tls.Config
	// based on the current consul configuration.
	tlsConfigurator *tlsutil.Configurator

	// persistedTokensLock is used to synchronize access to the persisted token
	// store within the data directory. This will prevent loading while writing as
	// well as multiple concurrent writes.
	persistedTokensLock sync.RWMutex
}
   262  
   263  func New(c *config.RuntimeConfig) (*Agent, error) {
   264  	if c.Datacenter == "" {
   265  		return nil, fmt.Errorf("Must configure a Datacenter")
   266  	}
   267  	if c.DataDir == "" && !c.DevMode {
   268  		return nil, fmt.Errorf("Must configure a DataDir")
   269  	}
   270  
   271  	a := &Agent{
   272  		config:          c,
   273  		checkReapAfter:  make(map[types.CheckID]time.Duration),
   274  		checkMonitors:   make(map[types.CheckID]*checks.CheckMonitor),
   275  		checkTTLs:       make(map[types.CheckID]*checks.CheckTTL),
   276  		checkHTTPs:      make(map[types.CheckID]*checks.CheckHTTP),
   277  		checkTCPs:       make(map[types.CheckID]*checks.CheckTCP),
   278  		checkGRPCs:      make(map[types.CheckID]*checks.CheckGRPC),
   279  		checkDockers:    make(map[types.CheckID]*checks.CheckDocker),
   280  		checkAliases:    make(map[types.CheckID]*checks.CheckAlias),
   281  		eventCh:         make(chan serf.UserEvent, 1024),
   282  		eventBuf:        make([]*UserEvent, 256),
   283  		joinLANNotifier: &systemd.Notifier{},
   284  		reloadCh:        make(chan chan error),
   285  		retryJoinCh:     make(chan error),
   286  		shutdownCh:      make(chan struct{}),
   287  		endpoints:       make(map[string]string),
   288  		tokens:          new(token.Store),
   289  	}
   290  
   291  	if err := a.initializeACLs(); err != nil {
   292  		return nil, err
   293  	}
   294  
   295  	return a, nil
   296  }
   297  
   298  func LocalConfig(cfg *config.RuntimeConfig) local.Config {
   299  	lc := local.Config{
   300  		AdvertiseAddr:       cfg.AdvertiseAddrLAN.String(),
   301  		CheckUpdateInterval: cfg.CheckUpdateInterval,
   302  		Datacenter:          cfg.Datacenter,
   303  		DiscardCheckOutput:  cfg.DiscardCheckOutput,
   304  		NodeID:              cfg.NodeID,
   305  		NodeName:            cfg.NodeName,
   306  		TaggedAddresses:     map[string]string{},
   307  		ProxyBindMinPort:    cfg.ConnectProxyBindMinPort,
   308  		ProxyBindMaxPort:    cfg.ConnectProxyBindMaxPort,
   309  	}
   310  	for k, v := range cfg.TaggedAddresses {
   311  		lc.TaggedAddresses[k] = v
   312  	}
   313  	return lc
   314  }
   315  
   316  func (a *Agent) setupProxyManager() error {
   317  	acfg, err := a.config.APIConfig(true)
   318  	if err != nil {
   319  		return fmt.Errorf("[INFO] agent: Connect managed proxies are disabled due to providing an invalid HTTP configuration")
   320  	}
   321  	a.proxyManager = proxyprocess.NewManager()
   322  	a.proxyManager.AllowRoot = a.config.ConnectProxyAllowManagedRoot
   323  	a.proxyManager.State = a.State
   324  	a.proxyManager.Logger = a.logger
   325  	if a.config.DataDir != "" {
   326  		// DataDir is required for all non-dev mode agents, but we want
   327  		// to allow setting the data dir for demos and so on for the agent,
   328  		// so do the check above instead.
   329  		a.proxyManager.DataDir = filepath.Join(a.config.DataDir, "proxy")
   330  
   331  		// Restore from our snapshot (if it exists)
   332  		if err := a.proxyManager.Restore(a.proxyManager.SnapshotPath()); err != nil {
   333  			a.logger.Printf("[WARN] agent: error restoring proxy state: %s", err)
   334  		}
   335  	}
   336  	a.proxyManager.ProxyEnv = acfg.GenerateEnv()
   337  	return nil
   338  }
   339  
   340  func (a *Agent) Start() error {
   341  	a.stateLock.Lock()
   342  	defer a.stateLock.Unlock()
   343  
   344  	c := a.config
   345  
   346  	logOutput := a.LogOutput
   347  	if a.logger == nil {
   348  		if logOutput == nil {
   349  			logOutput = os.Stderr
   350  		}
   351  		a.logger = log.New(logOutput, "", log.LstdFlags)
   352  	}
   353  
   354  	// Retrieve or generate the node ID before setting up the rest of the
   355  	// agent, which depends on it.
   356  	if err := a.setupNodeID(c); err != nil {
   357  		return fmt.Errorf("Failed to setup node ID: %v", err)
   358  	}
   359  
   360  	// Warn if the node name is incompatible with DNS
   361  	if InvalidDnsRe.MatchString(a.config.NodeName) {
   362  		a.logger.Printf("[WARN] agent: Node name %q will not be discoverable "+
   363  			"via DNS due to invalid characters. Valid characters include "+
   364  			"all alpha-numerics and dashes.", a.config.NodeName)
   365  	} else if len(a.config.NodeName) > MaxDNSLabelLength {
   366  		a.logger.Printf("[WARN] agent: Node name %q will not be discoverable "+
   367  			"via DNS due to it being too long. Valid lengths are between "+
   368  			"1 and 63 bytes.", a.config.NodeName)
   369  	}
   370  
   371  	// load the tokens - this requires the logger to be setup
   372  	// which is why we can't do this in New
   373  	a.loadTokens(a.config)
   374  
   375  	// create the local state
   376  	a.State = local.NewState(LocalConfig(c), a.logger, a.tokens)
   377  
   378  	// create the state synchronization manager which performs
   379  	// regular and on-demand state synchronizations (anti-entropy).
   380  	a.sync = ae.NewStateSyncer(a.State, c.AEInterval, a.shutdownCh, a.logger)
   381  
   382  	// create the cache
   383  	a.cache = cache.New(nil)
   384  
   385  	// create the config for the rpc server/client
   386  	consulCfg, err := a.consulConfig()
   387  	if err != nil {
   388  		return err
   389  	}
   390  
   391  	// ServerUp is used to inform that a new consul server is now
   392  	// up. This can be used to speed up the sync process if we are blocking
   393  	// waiting to discover a consul server
   394  	consulCfg.ServerUp = a.sync.SyncFull.Trigger
   395  
   396  	tlsConfigurator, err := tlsutil.NewConfigurator(c.ToTLSUtilConfig(), a.logger)
   397  	if err != nil {
   398  		return err
   399  	}
   400  	a.tlsConfigurator = tlsConfigurator
   401  
   402  	// Setup either the client or the server.
   403  	if c.ServerMode {
   404  		server, err := consul.NewServerLogger(consulCfg, a.logger, a.tokens, a.tlsConfigurator)
   405  		if err != nil {
   406  			return fmt.Errorf("Failed to start Consul server: %v", err)
   407  		}
   408  		a.delegate = server
   409  	} else {
   410  		client, err := consul.NewClientLogger(consulCfg, a.logger, a.tlsConfigurator)
   411  		if err != nil {
   412  			return fmt.Errorf("Failed to start Consul client: %v", err)
   413  		}
   414  		a.delegate = client
   415  	}
   416  
   417  	// the staggering of the state syncing depends on the cluster size.
   418  	a.sync.ClusterSize = func() int { return len(a.delegate.LANMembers()) }
   419  
   420  	// link the state with the consul server/client and the state syncer
   421  	// via callbacks. After several attempts this was easier than using
   422  	// channels since the event notification needs to be non-blocking
   423  	// and that should be hidden in the state syncer implementation.
   424  	a.State.Delegate = a.delegate
   425  	a.State.TriggerSyncChanges = a.sync.SyncChanges.Trigger
   426  
   427  	// Register the cache. We do this much later so the delegate is
   428  	// populated from above.
   429  	a.registerCache()
   430  
   431  	// Load checks/services/metadata.
   432  	if err := a.loadServices(c); err != nil {
   433  		return err
   434  	}
   435  	if err := a.loadProxies(c); err != nil {
   436  		return err
   437  	}
   438  	if err := a.loadChecks(c); err != nil {
   439  		return err
   440  	}
   441  	if err := a.loadMetadata(c); err != nil {
   442  		return err
   443  	}
   444  
   445  	// create the proxy process manager and start it. This is purposely
   446  	// done here after the local state above is loaded in so we can have
   447  	// a more accurate initial state view.
   448  	if !c.ConnectTestDisableManagedProxies {
   449  		if err := a.setupProxyManager(); err != nil {
   450  			a.logger.Printf(err.Error())
   451  		} else {
   452  			go a.proxyManager.Run()
   453  		}
   454  	}
   455  
   456  	// Start the proxy config manager.
   457  	a.proxyConfig, err = proxycfg.NewManager(proxycfg.ManagerConfig{
   458  		Cache:  a.cache,
   459  		Logger: a.logger,
   460  		State:  a.State,
   461  		Source: &structs.QuerySource{
   462  			Node:       a.config.NodeName,
   463  			Datacenter: a.config.Datacenter,
   464  			Segment:    a.config.SegmentName,
   465  		},
   466  	})
   467  	if err != nil {
   468  		return err
   469  	}
   470  	go func() {
   471  		if err := a.proxyConfig.Run(); err != nil {
   472  			a.logger.Printf("[ERR] Proxy Config Manager exited: %s", err)
   473  		}
   474  	}()
   475  
   476  	// Start watching for critical services to deregister, based on their
   477  	// checks.
   478  	go a.reapServices()
   479  
   480  	// Start handling events.
   481  	go a.handleEvents()
   482  
   483  	// Start sending network coordinate to the server.
   484  	if !c.DisableCoordinates {
   485  		go a.sendCoordinate()
   486  	}
   487  
   488  	// Write out the PID file if necessary.
   489  	if err := a.storePid(); err != nil {
   490  		return err
   491  	}
   492  
   493  	// start DNS servers
   494  	if err := a.listenAndServeDNS(); err != nil {
   495  		return err
   496  	}
   497  
   498  	// Create listeners and unstarted servers; see comment on listenHTTP why
   499  	// we are doing this.
   500  	servers, err := a.listenHTTP()
   501  	if err != nil {
   502  		return err
   503  	}
   504  
   505  	// Start HTTP and HTTPS servers.
   506  	for _, srv := range servers {
   507  		if err := a.serveHTTP(srv); err != nil {
   508  			return err
   509  		}
   510  		a.httpServers = append(a.httpServers, srv)
   511  	}
   512  
   513  	// Start gRPC server.
   514  	if err := a.listenAndServeGRPC(); err != nil {
   515  		return err
   516  	}
   517  
   518  	// register watches
   519  	if err := a.reloadWatches(a.config); err != nil {
   520  		return err
   521  	}
   522  
   523  	// start retry join
   524  	go a.retryJoinLAN()
   525  	go a.retryJoinWAN()
   526  
   527  	return nil
   528  }
   529  
   530  func (a *Agent) listenAndServeGRPC() error {
   531  	if len(a.config.GRPCAddrs) < 1 {
   532  		return nil
   533  	}
   534  
   535  	a.xdsServer = &xds.Server{
   536  		Logger:       a.logger,
   537  		CfgMgr:       a.proxyConfig,
   538  		Authz:        a,
   539  		ResolveToken: a.resolveToken,
   540  	}
   541  	a.xdsServer.Initialize()
   542  
   543  	var err error
   544  	if a.config.HTTPSPort > 0 {
   545  		// gRPC uses the same TLS settings as the HTTPS API. If HTTPS is
   546  		// enabled then gRPC will require HTTPS as well.
   547  		a.grpcServer, err = a.xdsServer.GRPCServer(a.config.CertFile, a.config.KeyFile)
   548  	} else {
   549  		a.grpcServer, err = a.xdsServer.GRPCServer("", "")
   550  	}
   551  	if err != nil {
   552  		return err
   553  	}
   554  
   555  	ln, err := a.startListeners(a.config.GRPCAddrs)
   556  	if err != nil {
   557  		return err
   558  	}
   559  
   560  	for _, l := range ln {
   561  		go func(innerL net.Listener) {
   562  			a.logger.Printf("[INFO] agent: Started gRPC server on %s (%s)",
   563  				innerL.Addr().String(), innerL.Addr().Network())
   564  			err := a.grpcServer.Serve(innerL)
   565  			if err != nil {
   566  				a.logger.Printf("[ERR] gRPC server failed: %s", err)
   567  			}
   568  		}(l)
   569  	}
   570  	return nil
   571  }
   572  
   573  func (a *Agent) listenAndServeDNS() error {
   574  	notif := make(chan net.Addr, len(a.config.DNSAddrs))
   575  	errCh := make(chan error, len(a.config.DNSAddrs))
   576  	for _, addr := range a.config.DNSAddrs {
   577  		// create server
   578  		s, err := NewDNSServer(a)
   579  		if err != nil {
   580  			return err
   581  		}
   582  		a.dnsServers = append(a.dnsServers, s)
   583  
   584  		// start server
   585  		a.wgServers.Add(1)
   586  		go func(addr net.Addr) {
   587  			defer a.wgServers.Done()
   588  			err := s.ListenAndServe(addr.Network(), addr.String(), func() { notif <- addr })
   589  			if err != nil && !strings.Contains(err.Error(), "accept") {
   590  				errCh <- err
   591  			}
   592  		}(addr)
   593  	}
   594  
   595  	// wait for servers to be up
   596  	timeout := time.After(time.Second)
   597  	var merr *multierror.Error
   598  	for range a.config.DNSAddrs {
   599  		select {
   600  		case addr := <-notif:
   601  			a.logger.Printf("[INFO] agent: Started DNS server %s (%s)", addr.String(), addr.Network())
   602  
   603  		case err := <-errCh:
   604  			merr = multierror.Append(merr, err)
   605  		case <-timeout:
   606  			merr = multierror.Append(merr, fmt.Errorf("agent: timeout starting DNS servers"))
   607  			break
   608  		}
   609  	}
   610  	return merr.ErrorOrNil()
   611  }
   612  
   613  func (a *Agent) startListeners(addrs []net.Addr) ([]net.Listener, error) {
   614  	var ln []net.Listener
   615  	for _, addr := range addrs {
   616  		var l net.Listener
   617  		var err error
   618  
   619  		switch x := addr.(type) {
   620  		case *net.UnixAddr:
   621  			l, err = a.listenSocket(x.Name)
   622  			if err != nil {
   623  				return nil, err
   624  			}
   625  
   626  		case *net.TCPAddr:
   627  			l, err = net.Listen("tcp", x.String())
   628  			if err != nil {
   629  				return nil, err
   630  			}
   631  			l = &tcpKeepAliveListener{l.(*net.TCPListener)}
   632  
   633  		default:
   634  			return nil, fmt.Errorf("unsupported address type %T", addr)
   635  		}
   636  		ln = append(ln, l)
   637  	}
   638  	return ln, nil
   639  }
   640  
   641  // listenHTTP binds listeners to the provided addresses and also returns
   642  // pre-configured HTTP servers which are not yet started. The motivation is
   643  // that in the current startup/shutdown setup we de-couple the listener
   644  // creation from the server startup assuming that if any of the listeners
   645  // cannot be bound we fail immediately and later failures do not occur.
   646  // Therefore, starting a server with a running listener is assumed to not
   647  // produce an error.
   648  //
   649  // The second motivation is that an HTTPS server needs to use the same TLSConfig
   650  // on both the listener and the HTTP server. When listeners and servers are
   651  // created at different times this becomes difficult to handle without keeping
   652  // the TLS configuration somewhere or recreating it.
   653  //
   654  // This approach should ultimately be refactored to the point where we just
   655  // start the server and any error should trigger a proper shutdown of the agent.
   656  func (a *Agent) listenHTTP() ([]*HTTPServer, error) {
   657  	var ln []net.Listener
   658  	var servers []*HTTPServer
   659  	start := func(proto string, addrs []net.Addr) error {
   660  		listeners, err := a.startListeners(addrs)
   661  		if err != nil {
   662  			return err
   663  		}
   664  
   665  		for _, l := range listeners {
   666  			var tlscfg *tls.Config
   667  			_, isTCP := l.(*tcpKeepAliveListener)
   668  			if isTCP && proto == "https" {
   669  				tlscfg = a.tlsConfigurator.IncomingHTTPSConfig()
   670  				l = tls.NewListener(l, tlscfg)
   671  			}
   672  			srv := &HTTPServer{
   673  				Server: &http.Server{
   674  					Addr:      l.Addr().String(),
   675  					TLSConfig: tlscfg,
   676  				},
   677  				ln:        l,
   678  				agent:     a,
   679  				blacklist: NewBlacklist(a.config.HTTPBlockEndpoints),
   680  				proto:     proto,
   681  			}
   682  			srv.Server.Handler = srv.handler(a.config.EnableDebug)
   683  
   684  			// This will enable upgrading connections to HTTP/2 as
   685  			// part of TLS negotiation.
   686  			if proto == "https" {
   687  				err = http2.ConfigureServer(srv.Server, nil)
   688  				if err != nil {
   689  					return err
   690  				}
   691  			}
   692  
   693  			ln = append(ln, l)
   694  			servers = append(servers, srv)
   695  		}
   696  		return nil
   697  	}
   698  
   699  	if err := start("http", a.config.HTTPAddrs); err != nil {
   700  		for _, l := range ln {
   701  			l.Close()
   702  		}
   703  		return nil, err
   704  	}
   705  	if err := start("https", a.config.HTTPSAddrs); err != nil {
   706  		for _, l := range ln {
   707  			l.Close()
   708  		}
   709  		return nil, err
   710  	}
   711  	return servers, nil
   712  }
   713  
// tcpKeepAliveListener sets TCP keep-alive timeouts on accepted
// connections. It's used so dead TCP connections eventually go away.
// It embeds the *net.TCPListener it wraps, so all listener methods other
// than Accept are the ones of the underlying TCP listener.
type tcpKeepAliveListener struct {
	*net.TCPListener
}
   719  
   720  func (ln tcpKeepAliveListener) Accept() (c net.Conn, err error) {
   721  	tc, err := ln.AcceptTCP()
   722  	if err != nil {
   723  		return
   724  	}
   725  	tc.SetKeepAlive(true)
   726  	tc.SetKeepAlivePeriod(30 * time.Second)
   727  	return tc, nil
   728  }
   729  
   730  func (a *Agent) listenSocket(path string) (net.Listener, error) {
   731  	if _, err := os.Stat(path); !os.IsNotExist(err) {
   732  		a.logger.Printf("[WARN] agent: Replacing socket %q", path)
   733  	}
   734  	if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
   735  		return nil, fmt.Errorf("error removing socket file: %s", err)
   736  	}
   737  	l, err := net.Listen("unix", path)
   738  	if err != nil {
   739  		return nil, err
   740  	}
   741  	user, group, mode := a.config.UnixSocketUser, a.config.UnixSocketGroup, a.config.UnixSocketMode
   742  	if err := setFilePermissions(path, user, group, mode); err != nil {
   743  		return nil, fmt.Errorf("Failed setting up socket: %s", err)
   744  	}
   745  	return l, nil
   746  }
   747  
   748  func (a *Agent) serveHTTP(srv *HTTPServer) error {
   749  	// https://github.com/golang/go/issues/20239
   750  	//
   751  	// In go.8.1 there is a race between Serve and Shutdown. If
   752  	// Shutdown is called before the Serve go routine was scheduled then
   753  	// the Serve go routine never returns. This deadlocks the agent
   754  	// shutdown for some tests since it will wait forever.
   755  	notif := make(chan net.Addr)
   756  	a.wgServers.Add(1)
   757  	go func() {
   758  		defer a.wgServers.Done()
   759  		notif <- srv.ln.Addr()
   760  		err := srv.Serve(srv.ln)
   761  		if err != nil && err != http.ErrServerClosed {
   762  			a.logger.Print(err)
   763  		}
   764  	}()
   765  
   766  	select {
   767  	case addr := <-notif:
   768  		if srv.proto == "https" {
   769  			a.logger.Printf("[INFO] agent: Started HTTPS server on %s (%s)", addr.String(), addr.Network())
   770  		} else {
   771  			a.logger.Printf("[INFO] agent: Started HTTP server on %s (%s)", addr.String(), addr.Network())
   772  		}
   773  		return nil
   774  	case <-time.After(time.Second):
   775  		return fmt.Errorf("agent: timeout starting HTTP servers")
   776  	}
   777  }
   778  
// reloadWatches stops any existing watch plans and attempts to load the given
// set of watches.
//
// All watches in cfg are validated and compiled first; the first invalid
// watch aborts the call with an error. At that point the previous plans have
// already been stopped and no new plan has been started. Afterwards one
// goroutine is started per compiled plan. Failures to build the API config
// or to run a plan are only logged and do not fail the call.
func (a *Agent) reloadWatches(cfg *config.RuntimeConfig) error {
	// Stop the current watches.
	for _, wp := range a.watchPlans {
		wp.Stop()
	}
	a.watchPlans = nil

	// Return if there are no watches now.
	if len(cfg.Watches) == 0 {
		return nil
	}

	// Watches use the API to talk to this agent, so that must be enabled.
	if len(cfg.HTTPAddrs) == 0 && len(cfg.HTTPSAddrs) == 0 {
		return fmt.Errorf("watch plans require an HTTP or HTTPS endpoint")
	}

	// Compile the watches
	var watchPlans []*watch.Plan
	for _, params := range cfg.Watches {
		// Default the handler type to "script". Note that this writes the
		// default into the watch parameters held by cfg.
		if handlerType, ok := params["handler_type"]; !ok {
			params["handler_type"] = "script"
		} else if handlerType != "http" && handlerType != "script" {
			return fmt.Errorf("Handler type '%s' not recognized", params["handler_type"])
		}

		// Don't let people use connect watches via this mechanism for now as it
		// needs thought about how to do securely and shouldn't be necessary. Note
		// that if the type assertion fails and the type is not a string then
		// ParseExempt below will error so we don't need to handle that case.
		if typ, ok := params["type"].(string); ok {
			if strings.HasPrefix(typ, "connect_") {
				return fmt.Errorf("Watch type %s is not allowed in agent config", typ)
			}
		}

		// Parse the watches, excluding 'handler' and 'args'
		wp, err := watch.ParseExempt(params, []string{"handler", "args"})
		if err != nil {
			return fmt.Errorf("Failed to parse watch (%#v): %v", params, err)
		}

		// Get the handler and subprocess arguments
		handler, hasHandler := wp.Exempt["handler"]
		args, hasArgs := wp.Exempt["args"]
		if hasHandler {
			a.logger.Printf("[WARN] agent: The 'handler' field in watches has been deprecated " +
				"and replaced with the 'args' field. See https://www.consul.io/docs/agent/watches.html")
		}
		if _, ok := handler.(string); hasHandler && !ok {
			return fmt.Errorf("Watch handler must be a string")
		}
		// Convert 'args' from a generic list into a list of strings and
		// store the converted list back into the plan.
		if raw, ok := args.([]interface{}); hasArgs && ok {
			var parsed []string
			for _, arg := range raw {
				v, ok := arg.(string)
				if !ok {
					return fmt.Errorf("Watch args must be a list of strings")
				}

				parsed = append(parsed, v)
			}
			wp.Exempt["args"] = parsed
		} else if hasArgs && !ok {
			return fmt.Errorf("Watch args must be a list of strings")
		}
		// 'handler', 'args' and the http handler type are mutually
		// exclusive, and one of them is required.
		if hasHandler && hasArgs || hasHandler && wp.HandlerType == "http" || hasArgs && wp.HandlerType == "http" {
			return fmt.Errorf("Only one watch handler allowed")
		}
		if !hasHandler && !hasArgs && wp.HandlerType != "http" {
			return fmt.Errorf("Must define a watch handler")
		}

		// Store the watch plan
		watchPlans = append(watchPlans, wp)
	}

	// Fire off a goroutine for each new watch plan.
	for _, wp := range watchPlans {
		// The API config is derived from the agent's current config
		// (a.config), not from the cfg argument.
		config, err := a.config.APIConfig(true)
		if err != nil {
			a.logger.Printf("[ERR] agent: Failed to run watch: %v", err)
			continue
		}

		a.watchPlans = append(a.watchPlans, wp)
		go func(wp *watch.Plan) {
			// Pick the handler: 'handler' first, then 'args', otherwise
			// the plan is an http watch.
			if h, ok := wp.Exempt["handler"]; ok {
				wp.Handler = makeWatchHandler(a.LogOutput, h)
			} else if h, ok := wp.Exempt["args"]; ok {
				wp.Handler = makeWatchHandler(a.LogOutput, h)
			} else {
				// NOTE(review): unchecked type assertion; assumes
				// ParseExempt always sets "http_handler_config" for http
				// watches — confirm, as a miss would panic here.
				httpConfig := wp.Exempt["http_handler_config"].(*watch.HttpHandlerConfig)
				wp.Handler = makeHTTPWatchHandler(a.LogOutput, httpConfig)
			}
			wp.LogOutput = a.LogOutput

			addr := config.Address
			if config.Scheme == "https" {
				addr = "https://" + addr
			}

			if err := wp.RunWithConfig(addr, config); err != nil {
				a.logger.Printf("[ERR] agent: Failed to run watch: %v", err)
			}
		}(wp)
	}
	return nil
}
   890  
// consulConfig builds the consul.Config used to start the server or client
// delegate. It starts from consul.DefaultConfig() and overrides it with the
// values found in the agent's runtime configuration (a.config).
//
// An error is returned if the network segment configuration cannot be built,
// if a Connect CA cluster_id was supplied that is not a valid UUID, or if the
// gossip keyrings cannot be set up.
func (a *Agent) consulConfig() (*consul.Config, error) {
	// Start with the provided config or default config
	base := consul.DefaultConfig()

	// This is set when the agent starts up
	base.NodeID = a.config.NodeID

	// Apply dev mode
	base.DevMode = a.config.DevMode

	// Override with our config
	// todo(fs): these are now always set in the runtime config so we can simplify this
	// todo(fs): or is there a reason to keep it like that?
	base.Datacenter = a.config.Datacenter
	base.PrimaryDatacenter = a.config.PrimaryDatacenter
	base.DataDir = a.config.DataDir
	base.NodeName = a.config.NodeName

	base.CoordinateUpdateBatchSize = a.config.ConsulCoordinateUpdateBatchSize
	base.CoordinateUpdateMaxBatches = a.config.ConsulCoordinateUpdateMaxBatches
	base.CoordinateUpdatePeriod = a.config.ConsulCoordinateUpdatePeriod

	base.RaftConfig.HeartbeatTimeout = a.config.ConsulRaftHeartbeatTimeout
	base.RaftConfig.LeaderLeaseTimeout = a.config.ConsulRaftLeaderLeaseTimeout
	base.RaftConfig.ElectionTimeout = a.config.ConsulRaftElectionTimeout

	// LAN gossip pool: addresses, encryption verification and gossip tuning.
	base.SerfLANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrLAN.IP.String()
	base.SerfLANConfig.MemberlistConfig.BindPort = a.config.SerfBindAddrLAN.Port
	base.SerfLANConfig.MemberlistConfig.AdvertiseAddr = a.config.SerfAdvertiseAddrLAN.IP.String()
	base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrLAN.Port
	base.SerfLANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
	base.SerfLANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
	base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.GossipLANGossipInterval
	base.SerfLANConfig.MemberlistConfig.GossipNodes = a.config.GossipLANGossipNodes
	base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.GossipLANProbeInterval
	base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipLANProbeTimeout
	base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.GossipLANSuspicionMult
	base.SerfLANConfig.MemberlistConfig.RetransmitMult = a.config.GossipLANRetransmitMult
	if a.config.ReconnectTimeoutLAN != 0 {
		base.SerfLANConfig.ReconnectTimeout = a.config.ReconnectTimeoutLAN
	}

	// WAN gossip pool: only configured when a WAN bind address is present.
	if a.config.SerfBindAddrWAN != nil {
		base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrWAN.IP.String()
		base.SerfWANConfig.MemberlistConfig.BindPort = a.config.SerfBindAddrWAN.Port
		base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.SerfAdvertiseAddrWAN.IP.String()
		base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrWAN.Port
		base.SerfWANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
		base.SerfWANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
		base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.GossipWANGossipInterval
		base.SerfWANConfig.MemberlistConfig.GossipNodes = a.config.GossipWANGossipNodes
		base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.GossipWANProbeInterval
		base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipWANProbeTimeout
		base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.GossipWANSuspicionMult
		base.SerfWANConfig.MemberlistConfig.RetransmitMult = a.config.GossipWANRetransmitMult
		if a.config.ReconnectTimeoutWAN != 0 {
			base.SerfWANConfig.ReconnectTimeout = a.config.ReconnectTimeoutWAN
		}
	} else {
		// Disable serf WAN federation
		base.SerfWANConfig = nil
	}

	base.RPCAddr = a.config.RPCBindAddr
	base.RPCAdvertise = a.config.RPCAdvertiseAddr

	base.Segment = a.config.SegmentName
	if len(a.config.Segments) > 0 {
		segments, err := a.segmentConfig()
		if err != nil {
			return nil, err
		}
		base.Segments = segments
	}

	// The remaining overrides are only applied when the runtime config holds
	// a non-zero value, so the defaults from consul.DefaultConfig() survive
	// otherwise.
	if a.config.Bootstrap {
		base.Bootstrap = true
	}
	if a.config.RejoinAfterLeave {
		base.RejoinAfterLeave = true
	}
	if a.config.BootstrapExpect != 0 {
		base.BootstrapExpect = a.config.BootstrapExpect
	}
	if a.config.RPCProtocol > 0 {
		base.ProtocolVersion = uint8(a.config.RPCProtocol)
	}
	if a.config.RaftProtocol != 0 {
		base.RaftConfig.ProtocolVersion = raft.ProtocolVersion(a.config.RaftProtocol)
	}
	if a.config.RaftSnapshotThreshold != 0 {
		base.RaftConfig.SnapshotThreshold = uint64(a.config.RaftSnapshotThreshold)
	}
	if a.config.RaftSnapshotInterval != 0 {
		base.RaftConfig.SnapshotInterval = a.config.RaftSnapshotInterval
	}
	if a.config.ACLMasterToken != "" {
		base.ACLMasterToken = a.config.ACLMasterToken
	}
	if a.config.ACLDatacenter != "" {
		base.ACLDatacenter = a.config.ACLDatacenter
	}
	if a.config.ACLTokenTTL != 0 {
		base.ACLTokenTTL = a.config.ACLTokenTTL
	}
	if a.config.ACLPolicyTTL != 0 {
		base.ACLPolicyTTL = a.config.ACLPolicyTTL
	}
	if a.config.ACLDefaultPolicy != "" {
		base.ACLDefaultPolicy = a.config.ACLDefaultPolicy
	}
	if a.config.ACLDownPolicy != "" {
		base.ACLDownPolicy = a.config.ACLDownPolicy
	}
	base.ACLEnforceVersion8 = a.config.ACLEnforceVersion8
	base.ACLTokenReplication = a.config.ACLTokenReplication
	base.ACLsEnabled = a.config.ACLsEnabled
	if a.config.ACLEnableKeyListPolicy {
		base.ACLEnableKeyListPolicy = a.config.ACLEnableKeyListPolicy
	}
	if a.config.SessionTTLMin != 0 {
		base.SessionTTLMin = a.config.SessionTTLMin
	}
	if a.config.NonVotingServer {
		base.NonVoter = a.config.NonVotingServer
	}

	// These are fully specified in the agent defaults, so we can simply
	// copy them over.
	base.AutopilotConfig.CleanupDeadServers = a.config.AutopilotCleanupDeadServers
	base.AutopilotConfig.LastContactThreshold = a.config.AutopilotLastContactThreshold
	base.AutopilotConfig.MaxTrailingLogs = uint64(a.config.AutopilotMaxTrailingLogs)
	base.AutopilotConfig.ServerStabilizationTime = a.config.AutopilotServerStabilizationTime
	base.AutopilotConfig.RedundancyZoneTag = a.config.AutopilotRedundancyZoneTag
	base.AutopilotConfig.DisableUpgradeMigration = a.config.AutopilotDisableUpgradeMigration
	base.AutopilotConfig.UpgradeVersionTag = a.config.AutopilotUpgradeVersionTag

	// make sure the advertise address is always set
	if base.RPCAdvertise == nil {
		base.RPCAdvertise = base.RPCAddr
	}

	// Rate limiting for RPC calls.
	if a.config.RPCRateLimit > 0 {
		base.RPCRate = a.config.RPCRateLimit
	}
	if a.config.RPCMaxBurst > 0 {
		base.RPCMaxBurst = a.config.RPCMaxBurst
	}

	// RPC-related performance configs.
	if a.config.RPCHoldTimeout > 0 {
		base.RPCHoldTimeout = a.config.RPCHoldTimeout
	}
	if a.config.LeaveDrainTime > 0 {
		base.LeaveDrainTime = a.config.LeaveDrainTime
	}

	// set the src address for outgoing rpc connections
	// Use port 0 so that outgoing connections use a random port.
	if !ipaddr.IsAny(base.RPCAddr.IP) {
		base.RPCSrcAddr = &net.TCPAddr{IP: base.RPCAddr.IP}
	}

	// Format the build string; the revision is truncated to at most 8
	// characters.
	revision := a.config.Revision
	if len(revision) > 8 {
		revision = revision[:8]
	}
	base.Build = fmt.Sprintf("%s%s:%s", a.config.Version, a.config.VersionPrerelease, revision)

	// Copy the TLS configuration. UseTLS is switched on as soon as a CA file
	// or CA path is configured.
	base.VerifyIncoming = a.config.VerifyIncoming || a.config.VerifyIncomingRPC
	if a.config.CAPath != "" || a.config.CAFile != "" {
		base.UseTLS = true
	}
	base.VerifyOutgoing = a.config.VerifyOutgoing
	base.VerifyServerHostname = a.config.VerifyServerHostname
	base.CAFile = a.config.CAFile
	base.CAPath = a.config.CAPath
	base.CertFile = a.config.CertFile
	base.KeyFile = a.config.KeyFile
	base.ServerName = a.config.ServerName
	base.Domain = a.config.DNSDomain
	base.TLSMinVersion = a.config.TLSMinVersion
	base.TLSCipherSuites = a.config.TLSCipherSuites
	base.TLSPreferServerCipherSuites = a.config.TLSPreferServerCipherSuites

	// Copy the Connect CA bootstrap config
	if a.config.ConnectEnabled {
		base.ConnectEnabled = true

		// Allow config to specify cluster_id provided it's a valid UUID. This is
		// meant only for tests where a deterministic ID makes fixtures much simpler
		// to work with but since it's only read on initial cluster bootstrap it's not
		// that much of a liability in production. The worst a user could do is
		// configure logically separate clusters with same ID by mistake but we can
		// avoid documenting this is even an option.
		if clusterID, ok := a.config.ConnectCAConfig["cluster_id"]; ok {
			if cIDStr, ok := clusterID.(string); ok {
				if _, err := uuid.ParseUUID(cIDStr); err == nil {
					// Valid UUID configured, use that
					base.CAConfig.ClusterID = cIDStr
				}
			}
			if base.CAConfig.ClusterID == "" {
				// If they tried to specify an ID but typoed it, don't ignore it:
				// they would then bootstrap with a new ID and have to throw away
				// the whole cluster and start again.
				a.logger.Println("[ERR] connect CA config cluster_id specified but " +
					"is not a valid UUID, aborting startup")
				return nil, fmt.Errorf("cluster_id was supplied but was not a valid UUID")
			}
		}

		if a.config.ConnectCAProvider != "" {
			base.CAConfig.Provider = a.config.ConnectCAProvider
		}

		// Merge connect CA Config regardless of provider (since there are some
		// common config options valid to all like leaf TTL).
		for k, v := range a.config.ConnectCAConfig {
			base.CAConfig.Config[k] = v
		}
	}

	// Setup the user event callback. The event is forwarded to a.eventCh
	// unless the agent's shutdown channel is closed first.
	base.UserEventHandler = func(e serf.UserEvent) {
		select {
		case a.eventCh <- e:
		case <-a.shutdownCh:
		}
	}

	// Setup the loggers
	base.LogOutput = a.LogOutput

	// This will set up the LAN keyring, as well as the WAN and any segments
	// for servers.
	if err := a.setupKeyrings(base); err != nil {
		return nil, fmt.Errorf("Failed to configure keyring: %v", err)
	}

	base.WatchSoftLimit = a.config.WatchSoftLimit

	return base, nil
}
  1138  
  1139  // Setup the serf and memberlist config for any defined network segments.
  1140  func (a *Agent) segmentConfig() ([]consul.NetworkSegment, error) {
  1141  	var segments []consul.NetworkSegment
  1142  	config := a.config
  1143  
  1144  	for _, s := range config.Segments {
  1145  		serfConf := consul.DefaultConfig().SerfLANConfig
  1146  
  1147  		serfConf.MemberlistConfig.BindAddr = s.Bind.IP.String()
  1148  		serfConf.MemberlistConfig.BindPort = s.Bind.Port
  1149  		serfConf.MemberlistConfig.AdvertiseAddr = s.Advertise.IP.String()
  1150  		serfConf.MemberlistConfig.AdvertisePort = s.Advertise.Port
  1151  
  1152  		if config.ReconnectTimeoutLAN != 0 {
  1153  			serfConf.ReconnectTimeout = config.ReconnectTimeoutLAN
  1154  		}
  1155  		if config.EncryptVerifyIncoming {
  1156  			serfConf.MemberlistConfig.GossipVerifyIncoming = config.EncryptVerifyIncoming
  1157  		}
  1158  		if config.EncryptVerifyOutgoing {
  1159  			serfConf.MemberlistConfig.GossipVerifyOutgoing = config.EncryptVerifyOutgoing
  1160  		}
  1161  
  1162  		var rpcAddr *net.TCPAddr
  1163  		if s.RPCListener {
  1164  			rpcAddr = &net.TCPAddr{
  1165  				IP:   s.Bind.IP,
  1166  				Port: a.config.ServerPort,
  1167  			}
  1168  		}
  1169  
  1170  		segments = append(segments, consul.NetworkSegment{
  1171  			Name:       s.Name,
  1172  			Bind:       serfConf.MemberlistConfig.BindAddr,
  1173  			Advertise:  serfConf.MemberlistConfig.AdvertiseAddr,
  1174  			Port:       s.Bind.Port,
  1175  			RPCAddr:    rpcAddr,
  1176  			SerfConfig: serfConf,
  1177  		})
  1178  	}
  1179  
  1180  	return segments, nil
  1181  }
  1182  
  1183  // makeRandomID will generate a random UUID for a node.
  1184  func (a *Agent) makeRandomID() (string, error) {
  1185  	id, err := uuid.GenerateUUID()
  1186  	if err != nil {
  1187  		return "", err
  1188  	}
  1189  
  1190  	a.logger.Printf("[DEBUG] agent: Using random ID %q as node ID", id)
  1191  	return id, nil
  1192  }
  1193  
  1194  // makeNodeID will try to find a host-specific ID, or else will generate a
  1195  // random ID. The returned ID will always be formatted as a GUID. We don't tell
  1196  // the caller whether this ID is random or stable since the consequences are
  1197  // high for us if this changes, so we will persist it either way. This will let
  1198  // gopsutil change implementations without affecting in-place upgrades of nodes.
  1199  func (a *Agent) makeNodeID() (string, error) {
  1200  	// If they've disabled host-based IDs then just make a random one.
  1201  	if a.config.DisableHostNodeID {
  1202  		return a.makeRandomID()
  1203  	}
  1204  
  1205  	// Try to get a stable ID associated with the host itself.
  1206  	info, err := host.Info()
  1207  	if err != nil {
  1208  		a.logger.Printf("[DEBUG] agent: Couldn't get a unique ID from the host: %v", err)
  1209  		return a.makeRandomID()
  1210  	}
  1211  
  1212  	// Make sure the host ID parses as a UUID, since we don't have complete
  1213  	// control over this process.
  1214  	id := strings.ToLower(info.HostID)
  1215  	if _, err := uuid.ParseUUID(id); err != nil {
  1216  		a.logger.Printf("[DEBUG] agent: Unique ID %q from host isn't formatted as a UUID: %v",
  1217  			id, err)
  1218  		return a.makeRandomID()
  1219  	}
  1220  
  1221  	// Hash the input to make it well distributed. The reported Host UUID may be
  1222  	// similar across nodes if they are on a cloud provider or on motherboards
  1223  	// created from the same batch.
  1224  	buf := sha512.Sum512([]byte(id))
  1225  	id = fmt.Sprintf("%08x-%04x-%04x-%04x-%12x",
  1226  		buf[0:4],
  1227  		buf[4:6],
  1228  		buf[6:8],
  1229  		buf[8:10],
  1230  		buf[10:16])
  1231  
  1232  	a.logger.Printf("[DEBUG] agent: Using unique ID %q from host as node ID", id)
  1233  	return id, nil
  1234  }
  1235  
  1236  // setupNodeID will pull the persisted node ID, if any, or create a random one
  1237  // and persist it.
  1238  func (a *Agent) setupNodeID(config *config.RuntimeConfig) error {
  1239  	// If they've configured a node ID manually then just use that, as
  1240  	// long as it's valid.
  1241  	if config.NodeID != "" {
  1242  		config.NodeID = types.NodeID(strings.ToLower(string(config.NodeID)))
  1243  		if _, err := uuid.ParseUUID(string(config.NodeID)); err != nil {
  1244  			return err
  1245  		}
  1246  
  1247  		return nil
  1248  	}
  1249  
  1250  	// For dev mode we have no filesystem access so just make one.
  1251  	if a.config.DataDir == "" {
  1252  		id, err := a.makeNodeID()
  1253  		if err != nil {
  1254  			return err
  1255  		}
  1256  
  1257  		config.NodeID = types.NodeID(id)
  1258  		return nil
  1259  	}
  1260  
  1261  	// Load saved state, if any. Since a user could edit this, we also
  1262  	// validate it.
  1263  	fileID := filepath.Join(config.DataDir, "node-id")
  1264  	if _, err := os.Stat(fileID); err == nil {
  1265  		rawID, err := ioutil.ReadFile(fileID)
  1266  		if err != nil {
  1267  			return err
  1268  		}
  1269  
  1270  		nodeID := strings.TrimSpace(string(rawID))
  1271  		nodeID = strings.ToLower(nodeID)
  1272  		if _, err := uuid.ParseUUID(nodeID); err != nil {
  1273  			return err
  1274  		}
  1275  
  1276  		config.NodeID = types.NodeID(nodeID)
  1277  	}
  1278  
  1279  	// If we still don't have a valid node ID, make one.
  1280  	if config.NodeID == "" {
  1281  		id, err := a.makeNodeID()
  1282  		if err != nil {
  1283  			return err
  1284  		}
  1285  		if err := lib.EnsurePath(fileID, false); err != nil {
  1286  			return err
  1287  		}
  1288  		if err := ioutil.WriteFile(fileID, []byte(id), 0600); err != nil {
  1289  			return err
  1290  		}
  1291  
  1292  		config.NodeID = types.NodeID(id)
  1293  	}
  1294  	return nil
  1295  }
  1296  
  1297  // setupBaseKeyrings configures the LAN and WAN keyrings.
  1298  func (a *Agent) setupBaseKeyrings(config *consul.Config) error {
  1299  	// If the keyring file is disabled then just poke the provided key
  1300  	// into the in-memory keyring.
  1301  	federationEnabled := config.SerfWANConfig != nil
  1302  	if a.config.DisableKeyringFile {
  1303  		if a.config.EncryptKey == "" {
  1304  			return nil
  1305  		}
  1306  
  1307  		keys := []string{a.config.EncryptKey}
  1308  		if err := loadKeyring(config.SerfLANConfig, keys); err != nil {
  1309  			return err
  1310  		}
  1311  		if a.config.ServerMode && federationEnabled {
  1312  			if err := loadKeyring(config.SerfWANConfig, keys); err != nil {
  1313  				return err
  1314  			}
  1315  		}
  1316  		return nil
  1317  	}
  1318  
  1319  	// Otherwise, we need to deal with the keyring files.
  1320  	fileLAN := filepath.Join(a.config.DataDir, SerfLANKeyring)
  1321  	fileWAN := filepath.Join(a.config.DataDir, SerfWANKeyring)
  1322  
  1323  	if a.config.EncryptKey == "" {
  1324  		goto LOAD
  1325  	}
  1326  	if _, err := os.Stat(fileLAN); err != nil {
  1327  		if err := initKeyring(fileLAN, a.config.EncryptKey); err != nil {
  1328  			return err
  1329  		}
  1330  	}
  1331  	if a.config.ServerMode && federationEnabled {
  1332  		if _, err := os.Stat(fileWAN); err != nil {
  1333  			if err := initKeyring(fileWAN, a.config.EncryptKey); err != nil {
  1334  				return err
  1335  			}
  1336  		}
  1337  	}
  1338  
  1339  LOAD:
  1340  	if _, err := os.Stat(fileLAN); err == nil {
  1341  		config.SerfLANConfig.KeyringFile = fileLAN
  1342  	}
  1343  	if err := loadKeyringFile(config.SerfLANConfig); err != nil {
  1344  		return err
  1345  	}
  1346  	if a.config.ServerMode && federationEnabled {
  1347  		if _, err := os.Stat(fileWAN); err == nil {
  1348  			config.SerfWANConfig.KeyringFile = fileWAN
  1349  		}
  1350  		if err := loadKeyringFile(config.SerfWANConfig); err != nil {
  1351  			return err
  1352  		}
  1353  	}
  1354  
  1355  	return nil
  1356  }
  1357  
  1358  // setupKeyrings is used to initialize and load keyrings during agent startup.
  1359  func (a *Agent) setupKeyrings(config *consul.Config) error {
  1360  	// First set up the LAN and WAN keyrings.
  1361  	if err := a.setupBaseKeyrings(config); err != nil {
  1362  		return err
  1363  	}
  1364  
  1365  	// If there's no LAN keyring then there's nothing else to set up for
  1366  	// any segments.
  1367  	lanKeyring := config.SerfLANConfig.MemberlistConfig.Keyring
  1368  	if lanKeyring == nil {
  1369  		return nil
  1370  	}
  1371  
  1372  	// Copy the initial state of the LAN keyring into each segment config.
  1373  	// Segments don't have their own keyring file, they rely on the LAN
  1374  	// holding the state so things can't get out of sync.
  1375  	k, pk := lanKeyring.GetKeys(), lanKeyring.GetPrimaryKey()
  1376  	for _, segment := range config.Segments {
  1377  		keyring, err := memberlist.NewKeyring(k, pk)
  1378  		if err != nil {
  1379  			return err
  1380  		}
  1381  		segment.SerfConfig.MemberlistConfig.Keyring = keyring
  1382  	}
  1383  	return nil
  1384  }
  1385  
  1386  // registerEndpoint registers a handler for the consul RPC server
  1387  // under a unique name while making it accessible under the provided
  1388  // name. This allows overwriting handlers for the golang net/rpc
  1389  // service which does not allow this.
  1390  func (a *Agent) registerEndpoint(name string, handler interface{}) error {
  1391  	srv, ok := a.delegate.(*consul.Server)
  1392  	if !ok {
  1393  		panic("agent must be a server")
  1394  	}
  1395  	realname := fmt.Sprintf("%s-%d", name, time.Now().UnixNano())
  1396  	a.endpointsLock.Lock()
  1397  	a.endpoints[name] = realname
  1398  	a.endpointsLock.Unlock()
  1399  	return srv.RegisterEndpoint(realname, handler)
  1400  }
  1401  
  1402  // RPC is used to make an RPC call to the Consul servers
  1403  // This allows the agent to implement the Consul.Interface
  1404  func (a *Agent) RPC(method string, args interface{}, reply interface{}) error {
  1405  	a.endpointsLock.RLock()
  1406  	// fast path: only translate if there are overrides
  1407  	if len(a.endpoints) > 0 {
  1408  		p := strings.SplitN(method, ".", 2)
  1409  		if e := a.endpoints[p[0]]; e != "" {
  1410  			method = e + "." + p[1]
  1411  		}
  1412  	}
  1413  	a.endpointsLock.RUnlock()
  1414  	return a.delegate.RPC(method, args, reply)
  1415  }
  1416  
// SnapshotRPC performs the requested snapshot RPC against the Consul server in
// a streaming manner. The contents of in will be read and passed along as the
// payload, and the response message will determine the error status, and any
// return payload will be written to out.
//
// The call is forwarded unchanged to the agent's delegate and its error is
// returned as-is.
func (a *Agent) SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer,
	replyFn structs.SnapshotReplyFn) error {
	return a.delegate.SnapshotRPC(args, in, out, replyFn)
}
  1425  
// Leave is used to prepare the agent for a graceful shutdown. It forwards to
// the delegate's Leave and returns its error.
func (a *Agent) Leave() error {
	return a.delegate.Leave()
}
  1430  
  1431  // ShutdownAgent is used to hard stop the agent. Should be preceded by
  1432  // Leave to do it gracefully. Should be followed by ShutdownEndpoints to
  1433  // terminate the HTTP and DNS servers as well.
  1434  func (a *Agent) ShutdownAgent() error {
  1435  	a.shutdownLock.Lock()
  1436  	defer a.shutdownLock.Unlock()
  1437  
  1438  	if a.shutdown {
  1439  		return nil
  1440  	}
  1441  	a.logger.Println("[INFO] agent: Requesting shutdown")
  1442  
  1443  	// Stop all the checks
  1444  	a.stateLock.Lock()
  1445  	defer a.stateLock.Unlock()
  1446  	for _, chk := range a.checkMonitors {
  1447  		chk.Stop()
  1448  	}
  1449  	for _, chk := range a.checkTTLs {
  1450  		chk.Stop()
  1451  	}
  1452  	for _, chk := range a.checkHTTPs {
  1453  		chk.Stop()
  1454  	}
  1455  	for _, chk := range a.checkTCPs {
  1456  		chk.Stop()
  1457  	}
  1458  	for _, chk := range a.checkGRPCs {
  1459  		chk.Stop()
  1460  	}
  1461  	for _, chk := range a.checkDockers {
  1462  		chk.Stop()
  1463  	}
  1464  	for _, chk := range a.checkAliases {
  1465  		chk.Stop()
  1466  	}
  1467  
  1468  	// Stop gRPC
  1469  	if a.grpcServer != nil {
  1470  		a.grpcServer.Stop()
  1471  	}
  1472  
  1473  	// Stop the proxy config manager
  1474  	if a.proxyConfig != nil {
  1475  		a.proxyConfig.Close()
  1476  	}
  1477  
  1478  	// Stop the proxy process manager
  1479  	if a.proxyManager != nil {
  1480  		// If persistence is disabled (implies DevMode but a subset of DevMode) then
  1481  		// don't leave the proxies running since the agent will not be able to
  1482  		// recover them later.
  1483  		if a.config.DataDir == "" {
  1484  			a.logger.Printf("[WARN] agent: dev mode disabled persistence, killing " +
  1485  				"all proxies since we can't recover them")
  1486  			if err := a.proxyManager.Kill(); err != nil {
  1487  				a.logger.Printf("[WARN] agent: error shutting down proxy manager: %s", err)
  1488  			}
  1489  		} else {
  1490  			if err := a.proxyManager.Close(); err != nil {
  1491  				a.logger.Printf("[WARN] agent: error shutting down proxy manager: %s", err)
  1492  			}
  1493  		}
  1494  	}
  1495  
  1496  	// Stop the cache background work
  1497  	if a.cache != nil {
  1498  		a.cache.Close()
  1499  	}
  1500  
  1501  	var err error
  1502  	if a.delegate != nil {
  1503  		err = a.delegate.Shutdown()
  1504  		if _, ok := a.delegate.(*consul.Server); ok {
  1505  			a.logger.Print("[INFO] agent: consul server down")
  1506  		} else {
  1507  			a.logger.Print("[INFO] agent: consul client down")
  1508  		}
  1509  	}
  1510  
  1511  	pidErr := a.deletePid()
  1512  	if pidErr != nil {
  1513  		a.logger.Println("[WARN] agent: could not delete pid file ", pidErr)
  1514  	}
  1515  
  1516  	a.logger.Println("[INFO] agent: shutdown complete")
  1517  	a.shutdown = true
  1518  	close(a.shutdownCh)
  1519  	return err
  1520  }
  1521  
  1522  // ShutdownEndpoints terminates the HTTP and DNS servers. Should be
  1523  // preceded by ShutdownAgent.
  1524  func (a *Agent) ShutdownEndpoints() {
  1525  	a.shutdownLock.Lock()
  1526  	defer a.shutdownLock.Unlock()
  1527  
  1528  	if len(a.dnsServers) == 0 && len(a.httpServers) == 0 {
  1529  		return
  1530  	}
  1531  
  1532  	for _, srv := range a.dnsServers {
  1533  		a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
  1534  		srv.Shutdown()
  1535  	}
  1536  	a.dnsServers = nil
  1537  
  1538  	for _, srv := range a.httpServers {
  1539  		a.logger.Printf("[INFO] agent: Stopping %s server %s (%s)", strings.ToUpper(srv.proto), srv.ln.Addr().String(), srv.ln.Addr().Network())
  1540  		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
  1541  		defer cancel()
  1542  		srv.Shutdown(ctx)
  1543  		if ctx.Err() == context.DeadlineExceeded {
  1544  			a.logger.Printf("[WARN] agent: Timeout stopping %s server %s (%s)", strings.ToUpper(srv.proto), srv.ln.Addr().String(), srv.ln.Addr().Network())
  1545  		}
  1546  	}
  1547  	a.httpServers = nil
  1548  
  1549  	a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
  1550  	a.wgServers.Wait()
  1551  	a.logger.Print("[INFO] agent: Endpoints down")
  1552  }
  1553  
// ReloadCh is used to return a channel that can be
// used for triggering reloads and returning a response.
// Each value carried by the returned channel is itself a channel of error;
// the returned channel is the agent's reloadCh field.
func (a *Agent) ReloadCh() chan chan error {
	return a.reloadCh
}
  1559  
// RetryJoinCh is a channel that transports errors
// from the retry join process. It is exposed receive-only to callers.
func (a *Agent) RetryJoinCh() <-chan error {
	return a.retryJoinCh
}
  1565  
// ShutdownCh is used to return a channel that can be
// selected to wait for the agent to perform a shutdown.
// The channel is closed by ShutdownAgent once shutdown has completed.
func (a *Agent) ShutdownCh() <-chan struct{} {
	return a.shutdownCh
}
  1571  
  1572  // JoinLAN is used to have the agent join a LAN cluster
  1573  func (a *Agent) JoinLAN(addrs []string) (n int, err error) {
  1574  	a.logger.Printf("[INFO] agent: (LAN) joining: %v", addrs)
  1575  	n, err = a.delegate.JoinLAN(addrs)
  1576  	a.logger.Printf("[INFO] agent: (LAN) joined: %d Err: %v", n, err)
  1577  	if err == nil && a.joinLANNotifier != nil {
  1578  		if notifErr := a.joinLANNotifier.Notify(systemd.Ready); notifErr != nil {
  1579  			a.logger.Printf("[DEBUG] agent: systemd notify failed: %v", notifErr)
  1580  		}
  1581  	}
  1582  	return
  1583  }
  1584  
  1585  // JoinWAN is used to have the agent join a WAN cluster
  1586  func (a *Agent) JoinWAN(addrs []string) (n int, err error) {
  1587  	a.logger.Printf("[INFO] agent: (WAN) joining: %v", addrs)
  1588  	if srv, ok := a.delegate.(*consul.Server); ok {
  1589  		n, err = srv.JoinWAN(addrs)
  1590  	} else {
  1591  		err = fmt.Errorf("Must be a server to join WAN cluster")
  1592  	}
  1593  	a.logger.Printf("[INFO] agent: (WAN) joined: %d Err: %v", n, err)
  1594  	return
  1595  }
  1596  
  1597  // ForceLeave is used to remove a failed node from the cluster
  1598  func (a *Agent) ForceLeave(node string) (err error) {
  1599  	a.logger.Printf("[INFO] agent: Force leaving node: %v", node)
  1600  	err = a.delegate.RemoveFailedNode(node)
  1601  	if err != nil {
  1602  		a.logger.Printf("[WARN] agent: Failed to remove node: %v", err)
  1603  	}
  1604  	return err
  1605  }
  1606  
// LocalMember is used to return the local node, as reported by the
// delegate's LocalMember.
func (a *Agent) LocalMember() serf.Member {
	return a.delegate.LocalMember()
}
  1611  
// LANMembers is used to retrieve the LAN members, as reported by the
// delegate's LANMembers.
func (a *Agent) LANMembers() []serf.Member {
	return a.delegate.LANMembers()
}
  1616  
  1617  // WANMembers is used to retrieve the WAN members
  1618  func (a *Agent) WANMembers() []serf.Member {
  1619  	if srv, ok := a.delegate.(*consul.Server); ok {
  1620  		return srv.WANMembers()
  1621  	}
  1622  	return nil
  1623  }
  1624  
// StartSync is called once Services and Checks are registered.
// This is called to prevent a race between clients and the anti-entropy routines.
// The state syncer's Run loop is started in its own goroutine, so this method
// returns right after logging.
func (a *Agent) StartSync() {
	go a.sync.Run()
	a.logger.Printf("[INFO] agent: started state syncer")
}
  1631  
  1632  // PauseSync is used to pause anti-entropy while bulk changes are made. It also
  1633  // sets state that agent-local watches use to "ride out" config reloads and bulk
  1634  // updates which might spuriously unload state and reload it again.
  1635  func (a *Agent) PauseSync() {
  1636  	// Do this outside of lock as it has it's own locking
  1637  	a.sync.Pause()
  1638  
  1639  	// Coordinate local state watchers
  1640  	a.syncMu.Lock()
  1641  	defer a.syncMu.Unlock()
  1642  	if a.syncCh == nil {
  1643  		a.syncCh = make(chan struct{})
  1644  	}
  1645  }
  1646  
  1647  // ResumeSync is used to unpause anti-entropy after bulk changes are make
  1648  func (a *Agent) ResumeSync() {
  1649  	// a.sync maintains a stack/ref count of Pause calls since we call
  1650  	// Pause/Resume in nested way during a reload and AddService. We only want to
  1651  	// trigger local state watchers if this Resume call actually started sync back
  1652  	// up again (i.e. was the last resume on the stack). We could check that
  1653  	// separately with a.sync.Paused but that is racey since another Pause call
  1654  	// might be made between our Resume and checking Paused.
  1655  	resumed := a.sync.Resume()
  1656  
  1657  	if !resumed {
  1658  		// Return early so we don't notify local watchers until we are actually
  1659  		// resumed.
  1660  		return
  1661  	}
  1662  
  1663  	// Coordinate local state watchers
  1664  	a.syncMu.Lock()
  1665  	defer a.syncMu.Unlock()
  1666  
  1667  	if a.syncCh != nil {
  1668  		close(a.syncCh)
  1669  		a.syncCh = nil
  1670  	}
  1671  }
  1672  
  1673  // syncPausedCh returns either a channel or nil. If nil sync is not paused. If
  1674  // non-nil, the channel will be closed when sync resumes.
  1675  func (a *Agent) syncPausedCh() <-chan struct{} {
  1676  	a.syncMu.Lock()
  1677  	defer a.syncMu.Unlock()
  1678  	return a.syncCh
  1679  }
  1680  
  1681  // GetLANCoordinate returns the coordinates of this node in the local pools
  1682  // (assumes coordinates are enabled, so check that before calling).
  1683  func (a *Agent) GetLANCoordinate() (lib.CoordinateSet, error) {
  1684  	return a.delegate.GetLANCoordinate()
  1685  }
  1686  
  1687  // sendCoordinate is a long-running loop that periodically sends our coordinate
  1688  // to the server. Closing the agent's shutdownChannel will cause this to exit.
  1689  func (a *Agent) sendCoordinate() {
  1690  OUTER:
  1691  	for {
  1692  		rate := a.config.SyncCoordinateRateTarget
  1693  		min := a.config.SyncCoordinateIntervalMin
  1694  		intv := lib.RateScaledInterval(rate, min, len(a.LANMembers()))
  1695  		intv = intv + lib.RandomStagger(intv)
  1696  
  1697  		select {
  1698  		case <-time.After(intv):
  1699  			members := a.LANMembers()
  1700  			grok, err := consul.CanServersUnderstandProtocol(members, 3)
  1701  			if err != nil {
  1702  				a.logger.Printf("[ERR] agent: Failed to check servers: %s", err)
  1703  				continue
  1704  			}
  1705  			if !grok {
  1706  				a.logger.Printf("[DEBUG] agent: Skipping coordinate updates until servers are upgraded")
  1707  				continue
  1708  			}
  1709  
  1710  			cs, err := a.GetLANCoordinate()
  1711  			if err != nil {
  1712  				a.logger.Printf("[ERR] agent: Failed to get coordinate: %s", err)
  1713  				continue
  1714  			}
  1715  
  1716  			for segment, coord := range cs {
  1717  				req := structs.CoordinateUpdateRequest{
  1718  					Datacenter:   a.config.Datacenter,
  1719  					Node:         a.config.NodeName,
  1720  					Segment:      segment,
  1721  					Coord:        coord,
  1722  					WriteRequest: structs.WriteRequest{Token: a.tokens.AgentToken()},
  1723  				}
  1724  				var reply struct{}
  1725  				if err := a.RPC("Coordinate.Update", &req, &reply); err != nil {
  1726  					if acl.IsErrPermissionDenied(err) {
  1727  						a.logger.Printf("[WARN] agent: Coordinate update blocked by ACLs")
  1728  					} else {
  1729  						a.logger.Printf("[ERR] agent: Coordinate update error: %v", err)
  1730  					}
  1731  					continue OUTER
  1732  				}
  1733  			}
  1734  		case <-a.shutdownCh:
  1735  			return
  1736  		}
  1737  	}
  1738  }
  1739  
  1740  // reapServicesInternal does a single pass, looking for services to reap.
  1741  func (a *Agent) reapServicesInternal() {
  1742  	reaped := make(map[string]bool)
  1743  	for checkID, cs := range a.State.CriticalCheckStates() {
  1744  		serviceID := cs.Check.ServiceID
  1745  
  1746  		// There's nothing to do if there's no service.
  1747  		if serviceID == "" {
  1748  			continue
  1749  		}
  1750  
  1751  		// There might be multiple checks for one service, so
  1752  		// we don't need to reap multiple times.
  1753  		if reaped[serviceID] {
  1754  			continue
  1755  		}
  1756  
  1757  		// See if there's a timeout.
  1758  		// todo(fs): this looks fishy... why is there another data structure in the agent with its own lock?
  1759  		a.stateLock.Lock()
  1760  		timeout := a.checkReapAfter[checkID]
  1761  		a.stateLock.Unlock()
  1762  
  1763  		// Reap, if necessary. We keep track of which service
  1764  		// this is so that we won't try to remove it again.
  1765  		if timeout > 0 && cs.CriticalFor() > timeout {
  1766  			reaped[serviceID] = true
  1767  			if err := a.RemoveService(serviceID, true); err != nil {
  1768  				a.logger.Printf("[ERR] agent: unable to deregister service %q after check %q has been critical for too long: %s",
  1769  					serviceID, checkID, err)
  1770  			} else {
  1771  				a.logger.Printf("[INFO] agent: Check %q for service %q has been critical for too long; deregistered service",
  1772  					checkID, serviceID)
  1773  			}
  1774  		}
  1775  	}
  1776  }
  1777  
  1778  // reapServices is a long running goroutine that looks for checks that have been
  1779  // critical too long and deregisters their associated services.
  1780  func (a *Agent) reapServices() {
  1781  	for {
  1782  		select {
  1783  		case <-time.After(a.config.CheckReapInterval):
  1784  			a.reapServicesInternal()
  1785  
  1786  		case <-a.shutdownCh:
  1787  			return
  1788  		}
  1789  	}
  1790  
  1791  }
  1792  
// persistedService is used to wrap a service definition and bundle it
// with an ACL token so we can restore both at a later agent start.
// It is the value that persistService encodes as JSON.
type persistedService struct {
	// Token is the token recorded for the service in local state.
	Token   string
	// Service is the service definition being persisted.
	Service *structs.NodeService
}
  1799  
  1800  // persistService saves a service definition to a JSON file in the data dir
  1801  func (a *Agent) persistService(service *structs.NodeService) error {
  1802  	svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(service.ID))
  1803  
  1804  	wrapped := persistedService{
  1805  		Token:   a.State.ServiceToken(service.ID),
  1806  		Service: service,
  1807  	}
  1808  	encoded, err := json.Marshal(wrapped)
  1809  	if err != nil {
  1810  		return err
  1811  	}
  1812  
  1813  	return file.WriteAtomic(svcPath, encoded)
  1814  }
  1815  
  1816  // purgeService removes a persisted service definition file from the data dir
  1817  func (a *Agent) purgeService(serviceID string) error {
  1818  	svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(serviceID))
  1819  	if _, err := os.Stat(svcPath); err == nil {
  1820  		return os.Remove(svcPath)
  1821  	}
  1822  	return nil
  1823  }
  1824  
// persistedProxy is used to wrap a proxy definition and bundle it with a proxy
// token so we can continue to authenticate the running proxy after a restart.
// It is the value that persistProxy encodes as JSON.
type persistedProxy struct {
	// ProxyToken is the token stored alongside the proxy definition.
	ProxyToken string
	// Proxy is the managed proxy definition being persisted.
	Proxy      *structs.ConnectManagedProxy

	// Set to true when the proxy information originated from the agent's
	// configuration as opposed to API registration.
	FromFile bool
}
  1835  
  1836  // persistProxy saves a proxy definition to a JSON file in the data dir
  1837  func (a *Agent) persistProxy(proxy *local.ManagedProxy, FromFile bool) error {
  1838  	proxyPath := filepath.Join(a.config.DataDir, proxyDir,
  1839  		stringHash(proxy.Proxy.ProxyService.ID))
  1840  
  1841  	wrapped := persistedProxy{
  1842  		ProxyToken: proxy.ProxyToken,
  1843  		Proxy:      proxy.Proxy,
  1844  		FromFile:   FromFile,
  1845  	}
  1846  	encoded, err := json.Marshal(wrapped)
  1847  	if err != nil {
  1848  		return err
  1849  	}
  1850  
  1851  	return file.WriteAtomic(proxyPath, encoded)
  1852  }
  1853  
  1854  // purgeProxy removes a persisted proxy definition file from the data dir
  1855  func (a *Agent) purgeProxy(proxyID string) error {
  1856  	proxyPath := filepath.Join(a.config.DataDir, proxyDir, stringHash(proxyID))
  1857  	if _, err := os.Stat(proxyPath); err == nil {
  1858  		return os.Remove(proxyPath)
  1859  	}
  1860  	return nil
  1861  }
  1862  
  1863  // persistCheck saves a check definition to the local agent's state directory
  1864  func (a *Agent) persistCheck(check *structs.HealthCheck, chkType *structs.CheckType) error {
  1865  	checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(check.CheckID))
  1866  
  1867  	// Create the persisted check
  1868  	wrapped := persistedCheck{
  1869  		Check:   check,
  1870  		ChkType: chkType,
  1871  		Token:   a.State.CheckToken(check.CheckID),
  1872  	}
  1873  
  1874  	encoded, err := json.Marshal(wrapped)
  1875  	if err != nil {
  1876  		return err
  1877  	}
  1878  
  1879  	return file.WriteAtomic(checkPath, encoded)
  1880  }
  1881  
  1882  // purgeCheck removes a persisted check definition file from the data dir
  1883  func (a *Agent) purgeCheck(checkID types.CheckID) error {
  1884  	checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(checkID))
  1885  	if _, err := os.Stat(checkPath); err == nil {
  1886  		return os.Remove(checkPath)
  1887  	}
  1888  	return nil
  1889  }
  1890  
  1891  // AddService is used to add a service entry.
  1892  // This entry is persistent and the agent will make a best effort to
  1893  // ensure it is registered
  1894  func (a *Agent) AddService(service *structs.NodeService, chkTypes []*structs.CheckType, persist bool, token string, source configSource) error {
  1895  	a.stateLock.Lock()
  1896  	defer a.stateLock.Unlock()
  1897  	return a.addServiceLocked(service, chkTypes, persist, token, source)
  1898  }
  1899  
  1900  func (a *Agent) addServiceLocked(service *structs.NodeService, chkTypes []*structs.CheckType, persist bool, token string, source configSource) error {
  1901  	if service.Service == "" {
  1902  		return fmt.Errorf("Service name missing")
  1903  	}
  1904  	if service.ID == "" && service.Service != "" {
  1905  		service.ID = service.Service
  1906  	}
  1907  	for _, check := range chkTypes {
  1908  		if err := check.Validate(); err != nil {
  1909  			return fmt.Errorf("Check is not valid: %v", err)
  1910  		}
  1911  	}
  1912  
  1913  	// Set default weights if not specified. This is important as it ensures AE
  1914  	// doesn't consider the service different since it has nil weights.
  1915  	if service.Weights == nil {
  1916  		service.Weights = &structs.Weights{Passing: 1, Warning: 1}
  1917  	}
  1918  
  1919  	// Warn if the service name is incompatible with DNS
  1920  	if InvalidDnsRe.MatchString(service.Service) {
  1921  		a.logger.Printf("[WARN] agent: Service name %q will not be discoverable "+
  1922  			"via DNS due to invalid characters. Valid characters include "+
  1923  			"all alpha-numerics and dashes.", service.Service)
  1924  	} else if len(service.Service) > MaxDNSLabelLength {
  1925  		a.logger.Printf("[WARN] agent: Service name %q will not be discoverable "+
  1926  			"via DNS due to it being too long. Valid lengths are between "+
  1927  			"1 and 63 bytes.", service.Service)
  1928  	}
  1929  
  1930  	// Warn if any tags are incompatible with DNS
  1931  	for _, tag := range service.Tags {
  1932  		if InvalidDnsRe.MatchString(tag) {
  1933  			a.logger.Printf("[DEBUG] agent: Service tag %q will not be discoverable "+
  1934  				"via DNS due to invalid characters. Valid characters include "+
  1935  				"all alpha-numerics and dashes.", tag)
  1936  		} else if len(tag) > MaxDNSLabelLength {
  1937  			a.logger.Printf("[DEBUG] agent: Service tag %q will not be discoverable "+
  1938  				"via DNS due to it being too long. Valid lengths are between "+
  1939  				"1 and 63 bytes.", tag)
  1940  		}
  1941  	}
  1942  
  1943  	// Pause the service syncs during modification
  1944  	a.PauseSync()
  1945  	defer a.ResumeSync()
  1946  
  1947  	// Take a snapshot of the current state of checks (if any), and
  1948  	// restore them before resuming anti-entropy.
  1949  	snap := a.snapshotCheckState()
  1950  	defer a.restoreCheckState(snap)
  1951  
  1952  	var checks []*structs.HealthCheck
  1953  
  1954  	// Create an associated health check
  1955  	for i, chkType := range chkTypes {
  1956  		checkID := string(chkType.CheckID)
  1957  		if checkID == "" {
  1958  			checkID = fmt.Sprintf("service:%s", service.ID)
  1959  			if len(chkTypes) > 1 {
  1960  				checkID += fmt.Sprintf(":%d", i+1)
  1961  			}
  1962  		}
  1963  		name := chkType.Name
  1964  		if name == "" {
  1965  			name = fmt.Sprintf("Service '%s' check", service.Service)
  1966  		}
  1967  		check := &structs.HealthCheck{
  1968  			Node:        a.config.NodeName,
  1969  			CheckID:     types.CheckID(checkID),
  1970  			Name:        name,
  1971  			Status:      api.HealthCritical,
  1972  			Notes:       chkType.Notes,
  1973  			ServiceID:   service.ID,
  1974  			ServiceName: service.Service,
  1975  			ServiceTags: service.Tags,
  1976  		}
  1977  		if chkType.Status != "" {
  1978  			check.Status = chkType.Status
  1979  		}
  1980  
  1981  		checks = append(checks, check)
  1982  	}
  1983  
  1984  	// cleanup, store the ids of services and checks that weren't previously
  1985  	// registered so we clean them up if somthing fails halfway through the
  1986  	// process.
  1987  	var cleanupServices []string
  1988  	var cleanupChecks []types.CheckID
  1989  
  1990  	if s := a.State.Service(service.ID); s == nil {
  1991  		cleanupServices = append(cleanupServices, service.ID)
  1992  	}
  1993  
  1994  	for _, check := range checks {
  1995  		if c := a.State.Check(check.CheckID); c == nil {
  1996  			cleanupChecks = append(cleanupChecks, check.CheckID)
  1997  		}
  1998  	}
  1999  
  2000  	err := a.State.AddServiceWithChecks(service, checks, token)
  2001  	if err != nil {
  2002  		a.cleanupRegistration(cleanupServices, cleanupChecks)
  2003  		return err
  2004  	}
  2005  
  2006  	for i := range checks {
  2007  		if err := a.addCheck(checks[i], chkTypes[i], service, persist, token, source); err != nil {
  2008  			a.cleanupRegistration(cleanupServices, cleanupChecks)
  2009  			return err
  2010  		}
  2011  
  2012  		if persist && a.config.DataDir != "" {
  2013  			if err := a.persistCheck(checks[i], chkTypes[i]); err != nil {
  2014  				a.cleanupRegistration(cleanupServices, cleanupChecks)
  2015  				return err
  2016  
  2017  			}
  2018  		}
  2019  	}
  2020  
  2021  	// Persist the service to a file
  2022  	if persist && a.config.DataDir != "" {
  2023  		if err := a.persistService(service); err != nil {
  2024  			a.cleanupRegistration(cleanupServices, cleanupChecks)
  2025  			return err
  2026  		}
  2027  	}
  2028  
  2029  	return nil
  2030  }
  2031  
  2032  // cleanupRegistration is called on  registration error to ensure no there are no
  2033  // leftovers after a partial failure
  2034  func (a *Agent) cleanupRegistration(serviceIDs []string, checksIDs []types.CheckID) {
  2035  	for _, s := range serviceIDs {
  2036  		if err := a.State.RemoveService(s); err != nil {
  2037  			a.logger.Printf("[ERR] consul: service registration: cleanup: failed to remove service %s: %s", s, err)
  2038  		}
  2039  		if err := a.purgeService(s); err != nil {
  2040  			a.logger.Printf("[ERR] consul: service registration: cleanup: failed to purge service %s file: %s", s, err)
  2041  		}
  2042  	}
  2043  
  2044  	for _, c := range checksIDs {
  2045  		a.cancelCheckMonitors(c)
  2046  		if err := a.State.RemoveCheck(c); err != nil {
  2047  			a.logger.Printf("[ERR] consul: service registration: cleanup: failed to remove check %s: %s", c, err)
  2048  		}
  2049  		if err := a.purgeCheck(c); err != nil {
  2050  			a.logger.Printf("[ERR] consul: service registration: cleanup: failed to purge check %s file: %s", c, err)
  2051  		}
  2052  	}
  2053  }
  2054  
  2055  // RemoveService is used to remove a service entry.
  2056  // The agent will make a best effort to ensure it is deregistered
  2057  func (a *Agent) RemoveService(serviceID string, persist bool) error {
  2058  	a.stateLock.Lock()
  2059  	defer a.stateLock.Unlock()
  2060  	return a.removeServiceLocked(serviceID, persist)
  2061  }
  2062  
  2063  // removeServiceLocked is used to remove a service entry.
  2064  // The agent will make a best effort to ensure it is deregistered
  2065  func (a *Agent) removeServiceLocked(serviceID string, persist bool) error {
  2066  	// Validate ServiceID
  2067  	if serviceID == "" {
  2068  		return fmt.Errorf("ServiceID missing")
  2069  	}
  2070  
  2071  	checks := a.State.Checks()
  2072  	var checkIDs []types.CheckID
  2073  	for id, check := range checks {
  2074  		if check.ServiceID != serviceID {
  2075  			continue
  2076  		}
  2077  		checkIDs = append(checkIDs, id)
  2078  	}
  2079  
  2080  	// Remove the associated managed proxy if it exists
  2081  	// This has to be DONE before purging configuration as might might have issues
  2082  	// With ACLs otherwise
  2083  	for proxyID, p := range a.State.Proxies() {
  2084  		if p.Proxy.TargetServiceID == serviceID {
  2085  			if err := a.removeProxyLocked(proxyID, true); err != nil {
  2086  				return err
  2087  			}
  2088  		}
  2089  	}
  2090  
  2091  	// Remove service immediately
  2092  	if err := a.State.RemoveServiceWithChecks(serviceID, checkIDs); err != nil {
  2093  		a.logger.Printf("[WARN] agent: Failed to deregister service %q: %s", serviceID, err)
  2094  		return nil
  2095  	}
  2096  
  2097  	// Remove the service from the data dir
  2098  	if persist {
  2099  		if err := a.purgeService(serviceID); err != nil {
  2100  			return err
  2101  		}
  2102  	}
  2103  
  2104  	// Deregister any associated health checks
  2105  	for checkID, check := range checks {
  2106  		if check.ServiceID != serviceID {
  2107  			continue
  2108  		}
  2109  		if err := a.removeCheckLocked(checkID, persist); err != nil {
  2110  			return err
  2111  		}
  2112  	}
  2113  
  2114  	a.logger.Printf("[DEBUG] agent: removed service %q", serviceID)
  2115  
  2116  	// If any Sidecar services exist for the removed service ID, remove them too.
  2117  	if sidecar := a.State.Service(a.sidecarServiceID(serviceID)); sidecar != nil {
  2118  		// Double check that it's not just an ID collision and we actually added
  2119  		// this from a sidecar.
  2120  		if sidecar.LocallyRegisteredAsSidecar {
  2121  			// Remove it!
  2122  			err := a.removeServiceLocked(a.sidecarServiceID(serviceID), persist)
  2123  			if err != nil {
  2124  				return err
  2125  			}
  2126  		}
  2127  	}
  2128  
  2129  	return nil
  2130  }
  2131  
  2132  // AddCheck is used to add a health check to the agent.
  2133  // This entry is persistent and the agent will make a best effort to
  2134  // ensure it is registered. The Check may include a CheckType which
  2135  // is used to automatically update the check status
  2136  func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, persist bool, token string, source configSource) error {
  2137  	a.stateLock.Lock()
  2138  	defer a.stateLock.Unlock()
  2139  	return a.addCheckLocked(check, chkType, persist, token, source)
  2140  }
  2141  
  2142  func (a *Agent) addCheckLocked(check *structs.HealthCheck, chkType *structs.CheckType, persist bool, token string, source configSource) error {
  2143  	var service *structs.NodeService
  2144  
  2145  	if check.ServiceID != "" {
  2146  		service = a.State.Service(check.ServiceID)
  2147  		if service == nil {
  2148  			return fmt.Errorf("ServiceID %q does not exist", check.ServiceID)
  2149  		}
  2150  	}
  2151  
  2152  	// snapshot the current state of the health check to avoid potential flapping
  2153  	existing := a.State.Check(check.CheckID)
  2154  	defer func() {
  2155  		if existing != nil {
  2156  			a.State.UpdateCheck(check.CheckID, existing.Status, existing.Output)
  2157  		}
  2158  	}()
  2159  
  2160  	err := a.addCheck(check, chkType, service, persist, token, source)
  2161  	if err != nil {
  2162  		a.State.RemoveCheck(check.CheckID)
  2163  		return err
  2164  	}
  2165  
  2166  	// Add to the local state for anti-entropy
  2167  	err = a.State.AddCheck(check, token)
  2168  	if err != nil {
  2169  		return err
  2170  	}
  2171  
  2172  	// Persist the check
  2173  	if persist && a.config.DataDir != "" {
  2174  		return a.persistCheck(check, chkType)
  2175  	}
  2176  
  2177  	return nil
  2178  }
  2179  
  2180  func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType, service *structs.NodeService, persist bool, token string, source configSource) error {
  2181  	if check.CheckID == "" {
  2182  		return fmt.Errorf("CheckID missing")
  2183  	}
  2184  
  2185  	if chkType != nil {
  2186  		if err := chkType.Validate(); err != nil {
  2187  			return fmt.Errorf("Check is not valid: %v", err)
  2188  		}
  2189  
  2190  		if chkType.IsScript() {
  2191  			if source == ConfigSourceLocal && !a.config.EnableLocalScriptChecks {
  2192  				return fmt.Errorf("Scripts are disabled on this agent; to enable, configure 'enable_script_checks' or 'enable_local_script_checks' to true")
  2193  			}
  2194  
  2195  			if source == ConfigSourceRemote && !a.config.EnableRemoteScriptChecks {
  2196  				return fmt.Errorf("Scripts are disabled on this agent from remote calls; to enable, configure 'enable_script_checks' to true")
  2197  			}
  2198  		}
  2199  	}
  2200  
  2201  	if check.ServiceID != "" {
  2202  		check.ServiceName = service.Service
  2203  		check.ServiceTags = service.Tags
  2204  	}
  2205  
  2206  	// Check if already registered
  2207  	if chkType != nil {
  2208  		switch {
  2209  
  2210  		case chkType.IsTTL():
  2211  			if existing, ok := a.checkTTLs[check.CheckID]; ok {
  2212  				existing.Stop()
  2213  				delete(a.checkTTLs, check.CheckID)
  2214  			}
  2215  
  2216  			ttl := &checks.CheckTTL{
  2217  				Notify:  a.State,
  2218  				CheckID: check.CheckID,
  2219  				TTL:     chkType.TTL,
  2220  				Logger:  a.logger,
  2221  			}
  2222  
  2223  			// Restore persisted state, if any
  2224  			if err := a.loadCheckState(check); err != nil {
  2225  				a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s",
  2226  					check.CheckID, err)
  2227  			}
  2228  
  2229  			ttl.Start()
  2230  			a.checkTTLs[check.CheckID] = ttl
  2231  
  2232  		case chkType.IsHTTP():
  2233  			if existing, ok := a.checkHTTPs[check.CheckID]; ok {
  2234  				existing.Stop()
  2235  				delete(a.checkHTTPs, check.CheckID)
  2236  			}
  2237  			if chkType.Interval < checks.MinInterval {
  2238  				a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  2239  					check.CheckID, checks.MinInterval))
  2240  				chkType.Interval = checks.MinInterval
  2241  			}
  2242  
  2243  			tlsClientConfig := a.tlsConfigurator.OutgoingTLSConfigForCheck(chkType.TLSSkipVerify)
  2244  
  2245  			http := &checks.CheckHTTP{
  2246  				Notify:          a.State,
  2247  				CheckID:         check.CheckID,
  2248  				HTTP:            chkType.HTTP,
  2249  				Header:          chkType.Header,
  2250  				Method:          chkType.Method,
  2251  				Interval:        chkType.Interval,
  2252  				Timeout:         chkType.Timeout,
  2253  				Logger:          a.logger,
  2254  				TLSClientConfig: tlsClientConfig,
  2255  			}
  2256  			http.Start()
  2257  			a.checkHTTPs[check.CheckID] = http
  2258  
  2259  		case chkType.IsTCP():
  2260  			if existing, ok := a.checkTCPs[check.CheckID]; ok {
  2261  				existing.Stop()
  2262  				delete(a.checkTCPs, check.CheckID)
  2263  			}
  2264  			if chkType.Interval < checks.MinInterval {
  2265  				a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  2266  					check.CheckID, checks.MinInterval))
  2267  				chkType.Interval = checks.MinInterval
  2268  			}
  2269  
  2270  			tcp := &checks.CheckTCP{
  2271  				Notify:   a.State,
  2272  				CheckID:  check.CheckID,
  2273  				TCP:      chkType.TCP,
  2274  				Interval: chkType.Interval,
  2275  				Timeout:  chkType.Timeout,
  2276  				Logger:   a.logger,
  2277  			}
  2278  			tcp.Start()
  2279  			a.checkTCPs[check.CheckID] = tcp
  2280  
  2281  		case chkType.IsGRPC():
  2282  			if existing, ok := a.checkGRPCs[check.CheckID]; ok {
  2283  				existing.Stop()
  2284  				delete(a.checkGRPCs, check.CheckID)
  2285  			}
  2286  			if chkType.Interval < checks.MinInterval {
  2287  				a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  2288  					check.CheckID, checks.MinInterval))
  2289  				chkType.Interval = checks.MinInterval
  2290  			}
  2291  
  2292  			var tlsClientConfig *tls.Config
  2293  			if chkType.GRPCUseTLS {
  2294  				tlsClientConfig = a.tlsConfigurator.OutgoingTLSConfigForCheck(chkType.TLSSkipVerify)
  2295  			}
  2296  
  2297  			grpc := &checks.CheckGRPC{
  2298  				Notify:          a.State,
  2299  				CheckID:         check.CheckID,
  2300  				GRPC:            chkType.GRPC,
  2301  				Interval:        chkType.Interval,
  2302  				Timeout:         chkType.Timeout,
  2303  				Logger:          a.logger,
  2304  				TLSClientConfig: tlsClientConfig,
  2305  			}
  2306  			grpc.Start()
  2307  			a.checkGRPCs[check.CheckID] = grpc
  2308  
  2309  		case chkType.IsDocker():
  2310  			if existing, ok := a.checkDockers[check.CheckID]; ok {
  2311  				existing.Stop()
  2312  				delete(a.checkDockers, check.CheckID)
  2313  			}
  2314  			if chkType.Interval < checks.MinInterval {
  2315  				a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  2316  					check.CheckID, checks.MinInterval))
  2317  				chkType.Interval = checks.MinInterval
  2318  			}
  2319  
  2320  			if a.dockerClient == nil {
  2321  				dc, err := checks.NewDockerClient(os.Getenv("DOCKER_HOST"), checks.BufSize)
  2322  				if err != nil {
  2323  					a.logger.Printf("[ERR] agent: error creating docker client: %s", err)
  2324  					return err
  2325  				}
  2326  				a.logger.Printf("[DEBUG] agent: created docker client for %s", dc.Host())
  2327  				a.dockerClient = dc
  2328  			}
  2329  
  2330  			dockerCheck := &checks.CheckDocker{
  2331  				Notify:            a.State,
  2332  				CheckID:           check.CheckID,
  2333  				DockerContainerID: chkType.DockerContainerID,
  2334  				Shell:             chkType.Shell,
  2335  				ScriptArgs:        chkType.ScriptArgs,
  2336  				Interval:          chkType.Interval,
  2337  				Logger:            a.logger,
  2338  				Client:            a.dockerClient,
  2339  			}
  2340  			if prev := a.checkDockers[check.CheckID]; prev != nil {
  2341  				prev.Stop()
  2342  			}
  2343  			dockerCheck.Start()
  2344  			a.checkDockers[check.CheckID] = dockerCheck
  2345  
  2346  		case chkType.IsMonitor():
  2347  			if existing, ok := a.checkMonitors[check.CheckID]; ok {
  2348  				existing.Stop()
  2349  				delete(a.checkMonitors, check.CheckID)
  2350  			}
  2351  			if chkType.Interval < checks.MinInterval {
  2352  				a.logger.Printf("[WARN] agent: check '%s' has interval below minimum of %v",
  2353  					check.CheckID, checks.MinInterval)
  2354  				chkType.Interval = checks.MinInterval
  2355  			}
  2356  
  2357  			monitor := &checks.CheckMonitor{
  2358  				Notify:     a.State,
  2359  				CheckID:    check.CheckID,
  2360  				ScriptArgs: chkType.ScriptArgs,
  2361  				Interval:   chkType.Interval,
  2362  				Timeout:    chkType.Timeout,
  2363  				Logger:     a.logger,
  2364  			}
  2365  			monitor.Start()
  2366  			a.checkMonitors[check.CheckID] = monitor
  2367  
  2368  		case chkType.IsAlias():
  2369  			if existing, ok := a.checkAliases[check.CheckID]; ok {
  2370  				existing.Stop()
  2371  				delete(a.checkAliases, check.CheckID)
  2372  			}
  2373  
  2374  			var rpcReq structs.NodeSpecificRequest
  2375  			rpcReq.Datacenter = a.config.Datacenter
  2376  
  2377  			// The token to set is really important. The behavior below follows
  2378  			// the same behavior as anti-entropy: we use the user-specified token
  2379  			// if set (either on the service or check definition), otherwise
  2380  			// we use the "UserToken" on the agent. This is tested.
  2381  			rpcReq.Token = a.tokens.UserToken()
  2382  			if token != "" {
  2383  				rpcReq.Token = token
  2384  			}
  2385  
  2386  			chkImpl := &checks.CheckAlias{
  2387  				Notify:    a.State,
  2388  				RPC:       a.delegate,
  2389  				RPCReq:    rpcReq,
  2390  				CheckID:   check.CheckID,
  2391  				Node:      chkType.AliasNode,
  2392  				ServiceID: chkType.AliasService,
  2393  			}
  2394  			chkImpl.Start()
  2395  			a.checkAliases[check.CheckID] = chkImpl
  2396  
  2397  		default:
  2398  			return fmt.Errorf("Check type is not valid")
  2399  		}
  2400  
  2401  		if chkType.DeregisterCriticalServiceAfter > 0 {
  2402  			timeout := chkType.DeregisterCriticalServiceAfter
  2403  			if timeout < a.config.CheckDeregisterIntervalMin {
  2404  				timeout = a.config.CheckDeregisterIntervalMin
  2405  				a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has deregister interval below minimum of %v",
  2406  					check.CheckID, a.config.CheckDeregisterIntervalMin))
  2407  			}
  2408  			a.checkReapAfter[check.CheckID] = timeout
  2409  		} else {
  2410  			delete(a.checkReapAfter, check.CheckID)
  2411  		}
  2412  	}
  2413  
  2414  	return nil
  2415  }
  2416  
  2417  // RemoveCheck is used to remove a health check.
  2418  // The agent will make a best effort to ensure it is deregistered
  2419  func (a *Agent) RemoveCheck(checkID types.CheckID, persist bool) error {
  2420  	a.stateLock.Lock()
  2421  	defer a.stateLock.Unlock()
  2422  	return a.removeCheckLocked(checkID, persist)
  2423  }
  2424  
  2425  // removeCheckLocked is used to remove a health check.
  2426  // The agent will make a best effort to ensure it is deregistered
  2427  func (a *Agent) removeCheckLocked(checkID types.CheckID, persist bool) error {
  2428  	// Validate CheckID
  2429  	if checkID == "" {
  2430  		return fmt.Errorf("CheckID missing")
  2431  	}
  2432  
  2433  	a.cancelCheckMonitors(checkID)
  2434  	a.State.RemoveCheck(checkID)
  2435  
  2436  	if persist {
  2437  		if err := a.purgeCheck(checkID); err != nil {
  2438  			return err
  2439  		}
  2440  		if err := a.purgeCheckState(checkID); err != nil {
  2441  			return err
  2442  		}
  2443  	}
  2444  	a.logger.Printf("[DEBUG] agent: removed check %q", checkID)
  2445  	return nil
  2446  }
  2447  
  2448  // addProxyLocked adds a new local Connect Proxy instance to be managed by the agent.
  2449  //
  2450  // This assumes that the agent's proxyLock is already held
  2451  //
  2452  // It REQUIRES that the service that is being proxied is already present in the
  2453  // local state. Note that this is only used for agent-managed proxies so we can
  2454  // ensure that we always make this true. For externally managed and registered
  2455  // proxies we explicitly allow the proxy to be registered first to make
  2456  // bootstrap ordering of a new service simpler but the same is not true here
  2457  // since this is only ever called when setting up a _managed_ proxy which was
  2458  // registered as part of a service registration either from config or HTTP API
  2459  // call.
  2460  //
  2461  // The restoredProxyToken argument should only be used when restoring proxy
  2462  // definitions from disk; new proxies must leave it blank to get a new token
  2463  // assigned. We need to restore from disk to enable to continue authenticating
  2464  // running proxies that already had that credential injected.
  2465  func (a *Agent) addProxyLocked(proxy *structs.ConnectManagedProxy, persist, FromFile bool,
  2466  	restoredProxyToken string, source configSource) error {
  2467  	// Lookup the target service token in state if there is one.
  2468  	token := a.State.ServiceToken(proxy.TargetServiceID)
  2469  
  2470  	// Copy the basic proxy structure so it isn't modified w/ defaults
  2471  	proxyCopy := *proxy
  2472  	proxy = &proxyCopy
  2473  	if err := a.applyProxyDefaults(proxy); err != nil {
  2474  		return err
  2475  	}
  2476  
  2477  	// Add the proxy to local state first since we may need to assign a port which
  2478  	// needs to be coordinate under state lock. AddProxy will generate the
  2479  	// NodeService for the proxy populated with the allocated (or configured) port
  2480  	// and an ID, but it doesn't add it to the agent directly since that could
  2481  	// deadlock and we may need to coordinate adding it and persisting etc.
  2482  	proxyState, err := a.State.AddProxy(proxy, token, restoredProxyToken)
  2483  	if err != nil {
  2484  		return err
  2485  	}
  2486  	proxyService := proxyState.Proxy.ProxyService
  2487  
  2488  	// Register proxy TCP check. The built in proxy doesn't listen publically
  2489  	// until it's loaded certs so this ensures we won't route traffic until it's
  2490  	// ready.
  2491  	proxyCfg, err := a.applyProxyConfigDefaults(proxyState.Proxy)
  2492  	if err != nil {
  2493  		return err
  2494  	}
  2495  	chkAddr := a.resolveProxyCheckAddress(proxyCfg)
  2496  	chkTypes := []*structs.CheckType{}
  2497  	if chkAddr != "" {
  2498  		chkTypes = []*structs.CheckType{
  2499  			&structs.CheckType{
  2500  				Name: "Connect Proxy Listening",
  2501  				TCP: fmt.Sprintf("%s:%d", chkAddr,
  2502  					proxyCfg["bind_port"]),
  2503  				Interval: 10 * time.Second,
  2504  			},
  2505  		}
  2506  	}
  2507  
  2508  	err = a.addServiceLocked(proxyService, chkTypes, persist, token, source)
  2509  	if err != nil {
  2510  		// Remove the state too
  2511  		a.State.RemoveProxy(proxyService.ID)
  2512  		return err
  2513  	}
  2514  
  2515  	// Persist the proxy
  2516  	if persist && a.config.DataDir != "" {
  2517  		return a.persistProxy(proxyState, FromFile)
  2518  	}
  2519  	return nil
  2520  }
  2521  
  2522  // AddProxy adds a new local Connect Proxy instance to be managed by the agent.
  2523  //
  2524  // It REQUIRES that the service that is being proxied is already present in the
  2525  // local state. Note that this is only used for agent-managed proxies so we can
  2526  // ensure that we always make this true. For externally managed and registered
  2527  // proxies we explicitly allow the proxy to be registered first to make
  2528  // bootstrap ordering of a new service simpler but the same is not true here
  2529  // since this is only ever called when setting up a _managed_ proxy which was
  2530  // registered as part of a service registration either from config or HTTP API
  2531  // call.
  2532  //
  2533  // The restoredProxyToken argument should only be used when restoring proxy
  2534  // definitions from disk; new proxies must leave it blank to get a new token
  2535  // assigned. We need to restore from disk to enable to continue authenticating
  2536  // running proxies that already had that credential injected.
  2537  func (a *Agent) AddProxy(proxy *structs.ConnectManagedProxy, persist, FromFile bool,
  2538  	restoredProxyToken string, source configSource) error {
  2539  	a.stateLock.Lock()
  2540  	defer a.stateLock.Unlock()
  2541  	return a.addProxyLocked(proxy, persist, FromFile, restoredProxyToken, source)
  2542  }
  2543  
  2544  // resolveProxyCheckAddress returns the best address to use for a TCP check of
  2545  // the proxy's public listener. It expects the input to already have default
  2546  // values populated by applyProxyConfigDefaults. It may return an empty string
  2547  // indicating that the TCP check should not be created at all.
  2548  //
  2549  // By default this uses the proxy's bind address which in turn defaults to the
  2550  // agent's bind address. If the proxy bind address ends up being 0.0.0.0 we have
  2551  // to assume the agent can dial it over loopback which is usually true.
  2552  //
  2553  // In some topologies such as proxy being in a different container, the IP the
  2554  // agent used to dial proxy over a local bridge might not be the same as the
  2555  // container's public routable IP address so we allow a manual override of the
  2556  // check address in config "tcp_check_address" too.
  2557  //
  2558  // Finally the TCP check can be disabled by another manual override
  2559  // "disable_tcp_check" in cases where the agent will never be able to dial the
  2560  // proxy directly for some reason.
  2561  func (a *Agent) resolveProxyCheckAddress(proxyCfg map[string]interface{}) string {
  2562  	// If user disabled the check return empty string
  2563  	if disable, ok := proxyCfg["disable_tcp_check"].(bool); ok && disable {
  2564  		return ""
  2565  	}
  2566  
  2567  	// If user specified a custom one, use that
  2568  	if chkAddr, ok := proxyCfg["tcp_check_address"].(string); ok && chkAddr != "" {
  2569  		return chkAddr
  2570  	}
  2571  
  2572  	// If we have a bind address and its diallable, use that
  2573  	if bindAddr, ok := proxyCfg["bind_address"].(string); ok &&
  2574  		bindAddr != "" && bindAddr != "0.0.0.0" && bindAddr != "[::]" {
  2575  		return bindAddr
  2576  	}
  2577  
  2578  	// Default to localhost
  2579  	return "127.0.0.1"
  2580  }
  2581  
  2582  // applyProxyConfigDefaults takes a *structs.ConnectManagedProxy and returns
  2583  // it's Config map merged with any defaults from the Agent's config. It would be
  2584  // nicer if this were defined as a method on structs.ConnectManagedProxy but we
  2585  // can't do that because ot the import cycle it causes with agent/config.
  2586  func (a *Agent) applyProxyConfigDefaults(p *structs.ConnectManagedProxy) (map[string]interface{}, error) {
  2587  	if p == nil || p.ProxyService == nil {
  2588  		// Should never happen but protect from panic
  2589  		return nil, fmt.Errorf("invalid proxy state")
  2590  	}
  2591  
  2592  	// Lookup the target service
  2593  	target := a.State.Service(p.TargetServiceID)
  2594  	if target == nil {
  2595  		// Can happen during deregistration race between proxy and scheduler.
  2596  		return nil, fmt.Errorf("unknown target service ID: %s", p.TargetServiceID)
  2597  	}
  2598  
  2599  	// Merge globals defaults
  2600  	config := make(map[string]interface{})
  2601  	for k, v := range a.config.ConnectProxyDefaultConfig {
  2602  		if _, ok := config[k]; !ok {
  2603  			config[k] = v
  2604  		}
  2605  	}
  2606  
  2607  	// Copy config from the proxy
  2608  	for k, v := range p.Config {
  2609  		config[k] = v
  2610  	}
  2611  
  2612  	// Set defaults for anything that is still not specified but required.
  2613  	// Note that these are not included in the content hash. Since we expect
  2614  	// them to be static in general but some like the default target service
  2615  	// port might not be. In that edge case services can set that explicitly
  2616  	// when they re-register which will be caught though.
  2617  	if _, ok := config["bind_port"]; !ok {
  2618  		config["bind_port"] = p.ProxyService.Port
  2619  	}
  2620  	if _, ok := config["bind_address"]; !ok {
  2621  		// Default to binding to the same address the agent is configured to
  2622  		// bind to.
  2623  		config["bind_address"] = a.config.BindAddr.String()
  2624  	}
  2625  	if _, ok := config["local_service_address"]; !ok {
  2626  		// Default to localhost and the port the service registered with
  2627  		config["local_service_address"] = fmt.Sprintf("127.0.0.1:%d", target.Port)
  2628  	}
  2629  
  2630  	// Basic type conversions for expected types.
  2631  	if raw, ok := config["bind_port"]; ok {
  2632  		switch v := raw.(type) {
  2633  		case float64:
  2634  			// Common since HCL/JSON parse as float64
  2635  			config["bind_port"] = int(v)
  2636  
  2637  			// NOTE(mitchellh): No default case since errors and validation
  2638  			// are handled by the ServiceDefinition.Validate function.
  2639  		}
  2640  	}
  2641  
  2642  	return config, nil
  2643  }
  2644  
  2645  // applyProxyDefaults modifies the given proxy by applying any configured
  2646  // defaults, such as the default execution mode, command, etc.
  2647  func (a *Agent) applyProxyDefaults(proxy *structs.ConnectManagedProxy) error {
  2648  	// Set the default exec mode
  2649  	if proxy.ExecMode == structs.ProxyExecModeUnspecified {
  2650  		mode, err := structs.NewProxyExecMode(a.config.ConnectProxyDefaultExecMode)
  2651  		if err != nil {
  2652  			return err
  2653  		}
  2654  
  2655  		proxy.ExecMode = mode
  2656  	}
  2657  	if proxy.ExecMode == structs.ProxyExecModeUnspecified {
  2658  		proxy.ExecMode = structs.ProxyExecModeDaemon
  2659  	}
  2660  
  2661  	// Set the default command to the globally configured default
  2662  	if len(proxy.Command) == 0 {
  2663  		switch proxy.ExecMode {
  2664  		case structs.ProxyExecModeDaemon:
  2665  			proxy.Command = a.config.ConnectProxyDefaultDaemonCommand
  2666  
  2667  		case structs.ProxyExecModeScript:
  2668  			proxy.Command = a.config.ConnectProxyDefaultScriptCommand
  2669  		}
  2670  	}
  2671  
  2672  	// If there is no globally configured default we need to get the
  2673  	// default command so we can do "consul connect proxy"
  2674  	if len(proxy.Command) == 0 {
  2675  		command, err := defaultProxyCommand(a.config)
  2676  		if err != nil {
  2677  			return err
  2678  		}
  2679  
  2680  		proxy.Command = command
  2681  	}
  2682  
  2683  	return nil
  2684  }
  2685  
  2686  // removeProxyLocked stops and removes a local proxy instance.
  2687  //
  2688  // It is assumed that this function is called while holding the proxyLock already
  2689  func (a *Agent) removeProxyLocked(proxyID string, persist bool) error {
  2690  	// Validate proxyID
  2691  	if proxyID == "" {
  2692  		return fmt.Errorf("proxyID missing")
  2693  	}
  2694  
  2695  	// Remove the proxy from the local state
  2696  	p, err := a.State.RemoveProxy(proxyID)
  2697  	if err != nil {
  2698  		return err
  2699  	}
  2700  
  2701  	// Remove the proxy service as well. The proxy ID is also the ID
  2702  	// of the servie, but we might as well use the service pointer.
  2703  	if err := a.removeServiceLocked(p.Proxy.ProxyService.ID, persist); err != nil {
  2704  		return err
  2705  	}
  2706  
  2707  	if persist && a.config.DataDir != "" {
  2708  		return a.purgeProxy(proxyID)
  2709  	}
  2710  
  2711  	return nil
  2712  }
  2713  
  2714  // RemoveProxy stops and removes a local proxy instance.
  2715  func (a *Agent) RemoveProxy(proxyID string, persist bool) error {
  2716  	a.stateLock.Lock()
  2717  	defer a.stateLock.Unlock()
  2718  	return a.removeProxyLocked(proxyID, persist)
  2719  }
  2720  
  2721  // verifyProxyToken takes a token and attempts to verify it against the
  2722  // targetService name. If targetProxy is specified, then the local proxy token
  2723  // must exactly match the given proxy ID. cert, config, etc.).
  2724  //
  2725  // The given token may be a local-only proxy token or it may be an ACL token. We
  2726  // will attempt to verify the local proxy token first.
  2727  //
  2728  // The effective ACL token is returned along with a boolean which is true if the
  2729  // match was against a proxy token rather than an ACL token, and any error. In
  2730  // the case the token matches a proxy token, then the ACL token used to register
  2731  // that proxy's target service is returned for use in any RPC calls the proxy
  2732  // needs to make on behalf of that service. If the token was an ACL token
  2733  // already then it is always returned. Provided error is nil, a valid ACL token
  2734  // is always returned.
  2735  func (a *Agent) verifyProxyToken(token, targetService,
  2736  	targetProxy string) (string, bool, error) {
  2737  	// If we specify a target proxy, we look up that proxy directly. Otherwise,
  2738  	// we resolve with any proxy we can find.
  2739  	var proxy *local.ManagedProxy
  2740  	if targetProxy != "" {
  2741  		proxy = a.State.Proxy(targetProxy)
  2742  		if proxy == nil {
  2743  			return "", false, fmt.Errorf("unknown proxy service ID: %q", targetProxy)
  2744  		}
  2745  
  2746  		// If the token DOESN'T match, then we reset the proxy which will
  2747  		// cause the logic below to fall back to normal ACLs. Otherwise,
  2748  		// we keep the proxy set because we also have to verify that the
  2749  		// target service matches on the proxy.
  2750  		if token != proxy.ProxyToken {
  2751  			proxy = nil
  2752  		}
  2753  	} else {
  2754  		proxy = a.resolveProxyToken(token)
  2755  	}
  2756  
  2757  	// The existence of a token isn't enough, we also need to verify
  2758  	// that the service name of the matching proxy matches our target
  2759  	// service.
  2760  	if proxy != nil {
  2761  		// Get the target service since we only have the name. The nil
  2762  		// check below should never be true since a proxy token always
  2763  		// represents the existence of a local service.
  2764  		target := a.State.Service(proxy.Proxy.TargetServiceID)
  2765  		if target == nil {
  2766  			return "", false, fmt.Errorf("proxy target service not found: %q",
  2767  				proxy.Proxy.TargetServiceID)
  2768  		}
  2769  
  2770  		if target.Service != targetService {
  2771  			return "", false, acl.ErrPermissionDenied
  2772  		}
  2773  
  2774  		// Resolve the actual ACL token used to register the proxy/service and
  2775  		// return that for use in RPC calls.
  2776  		return a.State.ServiceToken(proxy.Proxy.TargetServiceID), true, nil
  2777  	}
  2778  
  2779  	// Doesn't match, we have to do a full token resolution. The required
  2780  	// permission for any proxy-related endpoint is service:write, since
  2781  	// to register a proxy you require that permission and sensitive data
  2782  	// is usually present in the configuration.
  2783  	rule, err := a.resolveToken(token)
  2784  	if err != nil {
  2785  		return "", false, err
  2786  	}
  2787  	if rule != nil && !rule.ServiceWrite(targetService, nil) {
  2788  		return "", false, acl.ErrPermissionDenied
  2789  	}
  2790  
  2791  	return token, false, nil
  2792  }
  2793  
  2794  func (a *Agent) cancelCheckMonitors(checkID types.CheckID) {
  2795  	// Stop any monitors
  2796  	delete(a.checkReapAfter, checkID)
  2797  	if check, ok := a.checkMonitors[checkID]; ok {
  2798  		check.Stop()
  2799  		delete(a.checkMonitors, checkID)
  2800  	}
  2801  	if check, ok := a.checkHTTPs[checkID]; ok {
  2802  		check.Stop()
  2803  		delete(a.checkHTTPs, checkID)
  2804  	}
  2805  	if check, ok := a.checkTCPs[checkID]; ok {
  2806  		check.Stop()
  2807  		delete(a.checkTCPs, checkID)
  2808  	}
  2809  	if check, ok := a.checkGRPCs[checkID]; ok {
  2810  		check.Stop()
  2811  		delete(a.checkGRPCs, checkID)
  2812  	}
  2813  	if check, ok := a.checkTTLs[checkID]; ok {
  2814  		check.Stop()
  2815  		delete(a.checkTTLs, checkID)
  2816  	}
  2817  	if check, ok := a.checkDockers[checkID]; ok {
  2818  		check.Stop()
  2819  		delete(a.checkDockers, checkID)
  2820  	}
  2821  }
  2822  
  2823  // updateTTLCheck is used to update the status of a TTL check via the Agent API.
  2824  func (a *Agent) updateTTLCheck(checkID types.CheckID, status, output string) error {
  2825  	a.stateLock.Lock()
  2826  	defer a.stateLock.Unlock()
  2827  
  2828  	// Grab the TTL check.
  2829  	check, ok := a.checkTTLs[checkID]
  2830  	if !ok {
  2831  		return fmt.Errorf("CheckID %q does not have associated TTL", checkID)
  2832  	}
  2833  
  2834  	// Set the status through CheckTTL to reset the TTL.
  2835  	check.SetStatus(status, output)
  2836  
  2837  	// We don't write any files in dev mode so bail here.
  2838  	if a.config.DataDir == "" {
  2839  		return nil
  2840  	}
  2841  
  2842  	// Persist the state so the TTL check can come up in a good state after
  2843  	// an agent restart, especially with long TTL values.
  2844  	if err := a.persistCheckState(check, status, output); err != nil {
  2845  		return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
  2846  	}
  2847  
  2848  	return nil
  2849  }
  2850  
  2851  // persistCheckState is used to record the check status into the data dir.
  2852  // This allows the state to be restored on a later agent start. Currently
  2853  // only useful for TTL based checks.
  2854  func (a *Agent) persistCheckState(check *checks.CheckTTL, status, output string) error {
  2855  	// Create the persisted state
  2856  	state := persistedCheckState{
  2857  		CheckID: check.CheckID,
  2858  		Status:  status,
  2859  		Output:  output,
  2860  		Expires: time.Now().Add(check.TTL).Unix(),
  2861  	}
  2862  
  2863  	// Encode the state
  2864  	buf, err := json.Marshal(state)
  2865  	if err != nil {
  2866  		return err
  2867  	}
  2868  
  2869  	// Create the state dir if it doesn't exist
  2870  	dir := filepath.Join(a.config.DataDir, checkStateDir)
  2871  	if err := os.MkdirAll(dir, 0700); err != nil {
  2872  		return fmt.Errorf("failed creating check state dir %q: %s", dir, err)
  2873  	}
  2874  
  2875  	// Write the state to the file
  2876  	file := filepath.Join(dir, checkIDHash(check.CheckID))
  2877  
  2878  	// Create temp file in same dir, to make more likely atomic
  2879  	tempFile := file + ".tmp"
  2880  
  2881  	// persistCheckState is called frequently, so don't use writeFileAtomic to avoid calling fsync here
  2882  	if err := ioutil.WriteFile(tempFile, buf, 0600); err != nil {
  2883  		return fmt.Errorf("failed writing temp file %q: %s", tempFile, err)
  2884  	}
  2885  	if err := os.Rename(tempFile, file); err != nil {
  2886  		return fmt.Errorf("failed to rename temp file from %q to %q: %s", tempFile, file, err)
  2887  	}
  2888  
  2889  	return nil
  2890  }
  2891  
  2892  // loadCheckState is used to restore the persisted state of a check.
  2893  func (a *Agent) loadCheckState(check *structs.HealthCheck) error {
  2894  	// Try to read the persisted state for this check
  2895  	file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(check.CheckID))
  2896  	buf, err := ioutil.ReadFile(file)
  2897  	if err != nil {
  2898  		if os.IsNotExist(err) {
  2899  			return nil
  2900  		}
  2901  		return fmt.Errorf("failed reading file %q: %s", file, err)
  2902  	}
  2903  
  2904  	// Decode the state data
  2905  	var p persistedCheckState
  2906  	if err := json.Unmarshal(buf, &p); err != nil {
  2907  		a.logger.Printf("[ERR] agent: failed decoding check state: %s", err)
  2908  		return a.purgeCheckState(check.CheckID)
  2909  	}
  2910  
  2911  	// Check if the state has expired
  2912  	if time.Now().Unix() >= p.Expires {
  2913  		a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID)
  2914  		return a.purgeCheckState(check.CheckID)
  2915  	}
  2916  
  2917  	// Restore the fields from the state
  2918  	check.Output = p.Output
  2919  	check.Status = p.Status
  2920  	return nil
  2921  }
  2922  
  2923  // purgeCheckState is used to purge the state of a check from the data dir
  2924  func (a *Agent) purgeCheckState(checkID types.CheckID) error {
  2925  	file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(checkID))
  2926  	err := os.Remove(file)
  2927  	if os.IsNotExist(err) {
  2928  		return nil
  2929  	}
  2930  	return err
  2931  }
  2932  
  2933  func (a *Agent) GossipEncrypted() bool {
  2934  	return a.delegate.Encrypted()
  2935  }
  2936  
  2937  // Stats is used to get various debugging state from the sub-systems
  2938  func (a *Agent) Stats() map[string]map[string]string {
  2939  	stats := a.delegate.Stats()
  2940  	stats["agent"] = map[string]string{
  2941  		"check_monitors": strconv.Itoa(len(a.checkMonitors)),
  2942  		"check_ttls":     strconv.Itoa(len(a.checkTTLs)),
  2943  	}
  2944  	for k, v := range a.State.Stats() {
  2945  		stats["agent"][k] = v
  2946  	}
  2947  
  2948  	revision := a.config.Revision
  2949  	if len(revision) > 8 {
  2950  		revision = revision[:8]
  2951  	}
  2952  	stats["build"] = map[string]string{
  2953  		"revision":   revision,
  2954  		"version":    a.config.Version,
  2955  		"prerelease": a.config.VersionPrerelease,
  2956  	}
  2957  	return stats
  2958  }
  2959  
  2960  // storePid is used to write out our PID to a file if necessary
  2961  func (a *Agent) storePid() error {
  2962  	// Quit fast if no pidfile
  2963  	pidPath := a.config.PidFile
  2964  	if pidPath == "" {
  2965  		return nil
  2966  	}
  2967  
  2968  	// Open the PID file
  2969  	pidFile, err := os.OpenFile(pidPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0666)
  2970  	if err != nil {
  2971  		return fmt.Errorf("Could not open pid file: %v", err)
  2972  	}
  2973  	defer pidFile.Close()
  2974  
  2975  	// Write out the PID
  2976  	pid := os.Getpid()
  2977  	_, err = pidFile.WriteString(fmt.Sprintf("%d", pid))
  2978  	if err != nil {
  2979  		return fmt.Errorf("Could not write to pid file: %s", err)
  2980  	}
  2981  	return nil
  2982  }
  2983  
  2984  // deletePid is used to delete our PID on exit
  2985  func (a *Agent) deletePid() error {
  2986  	// Quit fast if no pidfile
  2987  	pidPath := a.config.PidFile
  2988  	if pidPath == "" {
  2989  		return nil
  2990  	}
  2991  
  2992  	stat, err := os.Stat(pidPath)
  2993  	if err != nil {
  2994  		return fmt.Errorf("Could not remove pid file: %s", err)
  2995  	}
  2996  
  2997  	if stat.IsDir() {
  2998  		return fmt.Errorf("Specified pid file path is directory")
  2999  	}
  3000  
  3001  	err = os.Remove(pidPath)
  3002  	if err != nil {
  3003  		return fmt.Errorf("Could not remove pid file: %s", err)
  3004  	}
  3005  	return nil
  3006  }
  3007  
  3008  // loadServices will load service definitions from configuration and persisted
  3009  // definitions on disk, and load them into the local agent.
  3010  func (a *Agent) loadServices(conf *config.RuntimeConfig) error {
  3011  	// Register the services from config
  3012  	for _, service := range conf.Services {
  3013  		ns := service.NodeService()
  3014  		chkTypes, err := service.CheckTypes()
  3015  		if err != nil {
  3016  			return fmt.Errorf("Failed to validate checks for service %q: %v", service.Name, err)
  3017  		}
  3018  
  3019  		// Grab and validate sidecar if there is one too
  3020  		sidecar, sidecarChecks, sidecarToken, err := a.sidecarServiceFromNodeService(ns, service.Token)
  3021  		if err != nil {
  3022  			return fmt.Errorf("Failed to validate sidecar for service %q: %v", service.Name, err)
  3023  		}
  3024  
  3025  		// Remove sidecar from NodeService now it's done it's job it's just a config
  3026  		// syntax sugar and shouldn't be persisted in local or server state.
  3027  		ns.Connect.SidecarService = nil
  3028  
  3029  		if err := a.addServiceLocked(ns, chkTypes, false, service.Token, ConfigSourceLocal); err != nil {
  3030  			return fmt.Errorf("Failed to register service %q: %v", service.Name, err)
  3031  		}
  3032  
  3033  		// If there is a sidecar service, register that too.
  3034  		if sidecar != nil {
  3035  			if err := a.addServiceLocked(sidecar, sidecarChecks, false, sidecarToken, ConfigSourceLocal); err != nil {
  3036  				return fmt.Errorf("Failed to register sidecar for service %q: %v", service.Name, err)
  3037  			}
  3038  		}
  3039  	}
  3040  
  3041  	// Load any persisted services
  3042  	svcDir := filepath.Join(a.config.DataDir, servicesDir)
  3043  	files, err := ioutil.ReadDir(svcDir)
  3044  	if err != nil {
  3045  		if os.IsNotExist(err) {
  3046  			return nil
  3047  		}
  3048  		return fmt.Errorf("Failed reading services dir %q: %s", svcDir, err)
  3049  	}
  3050  	for _, fi := range files {
  3051  		// Skip all dirs
  3052  		if fi.IsDir() {
  3053  			continue
  3054  		}
  3055  
  3056  		// Skip all partially written temporary files
  3057  		if strings.HasSuffix(fi.Name(), "tmp") {
  3058  			a.logger.Printf("[WARN] agent: Ignoring temporary service file %v", fi.Name())
  3059  			continue
  3060  		}
  3061  
  3062  		// Open the file for reading
  3063  		file := filepath.Join(svcDir, fi.Name())
  3064  		fh, err := os.Open(file)
  3065  		if err != nil {
  3066  			return fmt.Errorf("failed opening service file %q: %s", file, err)
  3067  		}
  3068  
  3069  		// Read the contents into a buffer
  3070  		buf, err := ioutil.ReadAll(fh)
  3071  		fh.Close()
  3072  		if err != nil {
  3073  			return fmt.Errorf("failed reading service file %q: %s", file, err)
  3074  		}
  3075  
  3076  		// Try decoding the service definition
  3077  		var p persistedService
  3078  		if err := json.Unmarshal(buf, &p); err != nil {
  3079  			// Backwards-compatibility for pre-0.5.1 persisted services
  3080  			if err := json.Unmarshal(buf, &p.Service); err != nil {
  3081  				a.logger.Printf("[ERR] agent: Failed decoding service file %q: %s", file, err)
  3082  				continue
  3083  			}
  3084  		}
  3085  		serviceID := p.Service.ID
  3086  
  3087  		if a.State.Service(serviceID) != nil {
  3088  			// Purge previously persisted service. This allows config to be
  3089  			// preferred over services persisted from the API.
  3090  			a.logger.Printf("[DEBUG] agent: service %q exists, not restoring from %q",
  3091  				serviceID, file)
  3092  			if err := a.purgeService(serviceID); err != nil {
  3093  				return fmt.Errorf("failed purging service %q: %s", serviceID, err)
  3094  			}
  3095  		} else {
  3096  			a.logger.Printf("[DEBUG] agent: restored service definition %q from %q",
  3097  				serviceID, file)
  3098  			if err := a.addServiceLocked(p.Service, nil, false, p.Token, ConfigSourceLocal); err != nil {
  3099  				return fmt.Errorf("failed adding service %q: %s", serviceID, err)
  3100  			}
  3101  		}
  3102  	}
  3103  
  3104  	return nil
  3105  }
  3106  
  3107  // unloadServices will deregister all services.
  3108  func (a *Agent) unloadServices() error {
  3109  	for id := range a.State.Services() {
  3110  		if err := a.removeServiceLocked(id, false); err != nil {
  3111  			return fmt.Errorf("Failed deregistering service '%s': %v", id, err)
  3112  		}
  3113  	}
  3114  	return nil
  3115  }
  3116  
  3117  // loadChecks loads check definitions and/or persisted check definitions from
  3118  // disk and re-registers them with the local agent.
  3119  func (a *Agent) loadChecks(conf *config.RuntimeConfig) error {
  3120  	// Register the checks from config
  3121  	for _, check := range conf.Checks {
  3122  		health := check.HealthCheck(conf.NodeName)
  3123  		chkType := check.CheckType()
  3124  		if err := a.addCheckLocked(health, chkType, false, check.Token, ConfigSourceLocal); err != nil {
  3125  			return fmt.Errorf("Failed to register check '%s': %v %v", check.Name, err, check)
  3126  		}
  3127  	}
  3128  
  3129  	// Load any persisted checks
  3130  	checkDir := filepath.Join(a.config.DataDir, checksDir)
  3131  	files, err := ioutil.ReadDir(checkDir)
  3132  	if err != nil {
  3133  		if os.IsNotExist(err) {
  3134  			return nil
  3135  		}
  3136  		return fmt.Errorf("Failed reading checks dir %q: %s", checkDir, err)
  3137  	}
  3138  	for _, fi := range files {
  3139  		// Ignore dirs - we only care about the check definition files
  3140  		if fi.IsDir() {
  3141  			continue
  3142  		}
  3143  
  3144  		// Open the file for reading
  3145  		file := filepath.Join(checkDir, fi.Name())
  3146  		fh, err := os.Open(file)
  3147  		if err != nil {
  3148  			return fmt.Errorf("Failed opening check file %q: %s", file, err)
  3149  		}
  3150  
  3151  		// Read the contents into a buffer
  3152  		buf, err := ioutil.ReadAll(fh)
  3153  		fh.Close()
  3154  		if err != nil {
  3155  			return fmt.Errorf("failed reading check file %q: %s", file, err)
  3156  		}
  3157  
  3158  		// Decode the check
  3159  		var p persistedCheck
  3160  		if err := json.Unmarshal(buf, &p); err != nil {
  3161  			a.logger.Printf("[ERR] agent: Failed decoding check file %q: %s", file, err)
  3162  			continue
  3163  		}
  3164  		checkID := p.Check.CheckID
  3165  
  3166  		if a.State.Check(checkID) != nil {
  3167  			// Purge previously persisted check. This allows config to be
  3168  			// preferred over persisted checks from the API.
  3169  			a.logger.Printf("[DEBUG] agent: check %q exists, not restoring from %q",
  3170  				checkID, file)
  3171  			if err := a.purgeCheck(checkID); err != nil {
  3172  				return fmt.Errorf("Failed purging check %q: %s", checkID, err)
  3173  			}
  3174  		} else {
  3175  			// Default check to critical to avoid placing potentially unhealthy
  3176  			// services into the active pool
  3177  			p.Check.Status = api.HealthCritical
  3178  
  3179  			if err := a.addCheckLocked(p.Check, p.ChkType, false, p.Token, ConfigSourceLocal); err != nil {
  3180  				// Purge the check if it is unable to be restored.
  3181  				a.logger.Printf("[WARN] agent: Failed to restore check %q: %s",
  3182  					checkID, err)
  3183  				if err := a.purgeCheck(checkID); err != nil {
  3184  					return fmt.Errorf("Failed purging check %q: %s", checkID, err)
  3185  				}
  3186  			}
  3187  			a.logger.Printf("[DEBUG] agent: restored health check %q from %q",
  3188  				p.Check.CheckID, file)
  3189  		}
  3190  	}
  3191  
  3192  	return nil
  3193  }
  3194  
  3195  // unloadChecks will deregister all checks known to the local agent.
  3196  func (a *Agent) unloadChecks() error {
  3197  	for id := range a.State.Checks() {
  3198  		if err := a.removeCheckLocked(id, false); err != nil {
  3199  			return fmt.Errorf("Failed deregistering check '%s': %s", id, err)
  3200  		}
  3201  	}
  3202  	return nil
  3203  }
  3204  
  3205  // loadPersistedProxies will load connect proxy definitions from their
  3206  // persisted state on disk and return a slice of them
  3207  //
  3208  // This does not add them to the local
  3209  func (a *Agent) loadPersistedProxies() (map[string]persistedProxy, error) {
  3210  	persistedProxies := make(map[string]persistedProxy)
  3211  
  3212  	proxyDir := filepath.Join(a.config.DataDir, proxyDir)
  3213  	files, err := ioutil.ReadDir(proxyDir)
  3214  	if err != nil {
  3215  		if !os.IsNotExist(err) {
  3216  			return nil, fmt.Errorf("Failed reading proxies dir %q: %s", proxyDir, err)
  3217  		}
  3218  	}
  3219  
  3220  	for _, fi := range files {
  3221  		// Skip all dirs
  3222  		if fi.IsDir() {
  3223  			continue
  3224  		}
  3225  
  3226  		// Skip all partially written temporary files
  3227  		if strings.HasSuffix(fi.Name(), "tmp") {
  3228  			return nil, fmt.Errorf("Ignoring temporary proxy file %v", fi.Name())
  3229  		}
  3230  
  3231  		// Open the file for reading
  3232  		file := filepath.Join(proxyDir, fi.Name())
  3233  		fh, err := os.Open(file)
  3234  		if err != nil {
  3235  			return nil, fmt.Errorf("failed opening proxy file %q: %s", file, err)
  3236  		}
  3237  
  3238  		// Read the contents into a buffer
  3239  		buf, err := ioutil.ReadAll(fh)
  3240  		fh.Close()
  3241  		if err != nil {
  3242  			return nil, fmt.Errorf("failed reading proxy file %q: %s", file, err)
  3243  		}
  3244  
  3245  		// Try decoding the proxy definition
  3246  		var p persistedProxy
  3247  		if err := json.Unmarshal(buf, &p); err != nil {
  3248  			return nil, fmt.Errorf("Failed decoding proxy file %q: %s", file, err)
  3249  		}
  3250  		svcID := p.Proxy.TargetServiceID
  3251  
  3252  		persistedProxies[svcID] = p
  3253  	}
  3254  
  3255  	return persistedProxies, nil
  3256  }
  3257  
  3258  // loadProxies will load connect proxy definitions from configuration and
  3259  // persisted definitions on disk, and load them into the local agent.
  3260  func (a *Agent) loadProxies(conf *config.RuntimeConfig) error {
  3261  	persistedProxies, persistenceErr := a.loadPersistedProxies()
  3262  
  3263  	for _, svc := range conf.Services {
  3264  		if svc.Connect != nil {
  3265  			proxy, err := svc.ConnectManagedProxy()
  3266  			if err != nil {
  3267  				return fmt.Errorf("failed adding proxy: %s", err)
  3268  			}
  3269  			if proxy == nil {
  3270  				continue
  3271  			}
  3272  			restoredToken := ""
  3273  			if persisted, ok := persistedProxies[proxy.TargetServiceID]; ok {
  3274  				restoredToken = persisted.ProxyToken
  3275  			}
  3276  
  3277  			if err := a.addProxyLocked(proxy, true, true, restoredToken, ConfigSourceLocal); err != nil {
  3278  				return fmt.Errorf("failed adding proxy: %s", err)
  3279  			}
  3280  		}
  3281  	}
  3282  
  3283  	for _, persisted := range persistedProxies {
  3284  		proxyID := persisted.Proxy.ProxyService.ID
  3285  		if persisted.FromFile && a.State.Proxy(proxyID) == nil {
  3286  			// Purge proxies that were configured previously but are no longer in the config
  3287  			a.logger.Printf("[DEBUG] agent: purging stale persisted proxy %q", proxyID)
  3288  			if err := a.purgeProxy(proxyID); err != nil {
  3289  				return fmt.Errorf("failed purging proxy %q: %v", proxyID, err)
  3290  			}
  3291  		} else if !persisted.FromFile {
  3292  			if a.State.Proxy(proxyID) == nil {
  3293  				a.logger.Printf("[DEBUG] agent: restored proxy definition %q", proxyID)
  3294  				if err := a.addProxyLocked(persisted.Proxy, false, false, persisted.ProxyToken, ConfigSourceLocal); err != nil {
  3295  					return fmt.Errorf("failed adding proxy %q: %v", proxyID, err)
  3296  				}
  3297  			} else {
  3298  				a.logger.Printf("[WARN] agent: proxy definition %q was overwritten by a proxy definition within a config file", proxyID)
  3299  			}
  3300  		}
  3301  	}
  3302  
  3303  	return persistenceErr
  3304  }
  3305  
// persistedTokens is the JSON document that getPersistedTokens decodes from
// the tokens file in the agent's data directory. Empty fields are omitted
// when encoding; loadTokens treats an empty field as "nothing persisted" and
// falls back to the corresponding value from the runtime configuration.
type persistedTokens struct {
	// Replication is handed to the token store's UpdateReplicationToken.
	Replication string `json:"replication,omitempty"`
	// AgentMaster is handed to the token store's UpdateAgentMasterToken.
	AgentMaster string `json:"agent_master,omitempty"`
	// Default is handed to the token store's UpdateUserToken.
	Default     string `json:"default,omitempty"`
	// Agent is handed to the token store's UpdateAgentToken.
	Agent       string `json:"agent,omitempty"`
}
  3312  
  3313  func (a *Agent) getPersistedTokens() (*persistedTokens, error) {
  3314  	persistedTokens := &persistedTokens{}
  3315  	if !a.config.ACLEnableTokenPersistence {
  3316  		return persistedTokens, nil
  3317  	}
  3318  
  3319  	a.persistedTokensLock.RLock()
  3320  	defer a.persistedTokensLock.RUnlock()
  3321  
  3322  	tokensFullPath := filepath.Join(a.config.DataDir, tokensPath)
  3323  
  3324  	buf, err := ioutil.ReadFile(tokensFullPath)
  3325  	if err != nil {
  3326  		if os.IsNotExist(err) {
  3327  			// non-existence is not an error we care about
  3328  			return persistedTokens, nil
  3329  		}
  3330  		return persistedTokens, fmt.Errorf("failed reading tokens file %q: %s", tokensFullPath, err)
  3331  	}
  3332  
  3333  	if err := json.Unmarshal(buf, persistedTokens); err != nil {
  3334  		return persistedTokens, fmt.Errorf("failed to decode tokens file %q: %s", tokensFullPath, err)
  3335  	}
  3336  
  3337  	return persistedTokens, nil
  3338  }
  3339  
  3340  func (a *Agent) loadTokens(conf *config.RuntimeConfig) error {
  3341  	persistedTokens, persistenceErr := a.getPersistedTokens()
  3342  
  3343  	if persistenceErr != nil {
  3344  		a.logger.Printf("[WARN] unable to load persisted tokens: %v", persistenceErr)
  3345  	}
  3346  
  3347  	if persistedTokens.Default != "" {
  3348  		a.tokens.UpdateUserToken(persistedTokens.Default, token.TokenSourceAPI)
  3349  
  3350  		if conf.ACLToken != "" {
  3351  			a.logger.Printf("[WARN] \"default\" token present in both the configuration and persisted token store, using the persisted token")
  3352  		}
  3353  	} else {
  3354  		a.tokens.UpdateUserToken(conf.ACLToken, token.TokenSourceConfig)
  3355  	}
  3356  
  3357  	if persistedTokens.Agent != "" {
  3358  		a.tokens.UpdateAgentToken(persistedTokens.Agent, token.TokenSourceAPI)
  3359  
  3360  		if conf.ACLAgentToken != "" {
  3361  			a.logger.Printf("[WARN] \"agent\" token present in both the configuration and persisted token store, using the persisted token")
  3362  		}
  3363  	} else {
  3364  		a.tokens.UpdateAgentToken(conf.ACLAgentToken, token.TokenSourceConfig)
  3365  	}
  3366  
  3367  	if persistedTokens.AgentMaster != "" {
  3368  		a.tokens.UpdateAgentMasterToken(persistedTokens.AgentMaster, token.TokenSourceAPI)
  3369  
  3370  		if conf.ACLAgentMasterToken != "" {
  3371  			a.logger.Printf("[WARN] \"agent_master\" token present in both the configuration and persisted token store, using the persisted token")
  3372  		}
  3373  	} else {
  3374  		a.tokens.UpdateAgentMasterToken(conf.ACLAgentMasterToken, token.TokenSourceConfig)
  3375  	}
  3376  
  3377  	if persistedTokens.Replication != "" {
  3378  		a.tokens.UpdateReplicationToken(persistedTokens.Replication, token.TokenSourceAPI)
  3379  
  3380  		if conf.ACLReplicationToken != "" {
  3381  			a.logger.Printf("[WARN] \"replication\" token present in both the configuration and persisted token store, using the persisted token")
  3382  		}
  3383  	} else {
  3384  		a.tokens.UpdateReplicationToken(conf.ACLReplicationToken, token.TokenSourceConfig)
  3385  	}
  3386  
  3387  	return persistenceErr
  3388  }
  3389  
  3390  // unloadProxies will deregister all proxies known to the local agent.
  3391  func (a *Agent) unloadProxies() error {
  3392  	for id := range a.State.Proxies() {
  3393  		if err := a.removeProxyLocked(id, false); err != nil {
  3394  			return fmt.Errorf("Failed deregistering proxy '%s': %s", id, err)
  3395  		}
  3396  	}
  3397  	return nil
  3398  }
  3399  
  3400  // snapshotCheckState is used to snapshot the current state of the health
  3401  // checks. This is done before we reload our checks, so that we can properly
  3402  // restore into the same state.
  3403  func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck {
  3404  	return a.State.Checks()
  3405  }
  3406  
  3407  // restoreCheckState is used to reset the health state based on a snapshot.
  3408  // This is done after we finish the reload to avoid any unnecessary flaps
  3409  // in health state and potential session invalidations.
  3410  func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
  3411  	for id, check := range snap {
  3412  		a.State.UpdateCheck(id, check.Status, check.Output)
  3413  	}
  3414  }
  3415  
  3416  // loadMetadata loads node metadata fields from the agent config and
  3417  // updates them on the local agent.
  3418  func (a *Agent) loadMetadata(conf *config.RuntimeConfig) error {
  3419  	meta := map[string]string{}
  3420  	for k, v := range conf.NodeMeta {
  3421  		meta[k] = v
  3422  	}
  3423  	meta[structs.MetaSegmentKey] = conf.SegmentName
  3424  	return a.State.LoadMetadata(meta)
  3425  }
  3426  
// unloadMetadata resets the node metadata held in the local agent state by
// delegating to State.UnloadMetadata. ReloadConfig calls it before loading
// the metadata from the new configuration.
func (a *Agent) unloadMetadata() {
	a.State.UnloadMetadata()
}
  3431  
  3432  // serviceMaintCheckID returns the ID of a given service's maintenance check
  3433  func serviceMaintCheckID(serviceID string) types.CheckID {
  3434  	return types.CheckID(structs.ServiceMaintPrefix + serviceID)
  3435  }
  3436  
  3437  // EnableServiceMaintenance will register a false health check against the given
  3438  // service ID with critical status. This will exclude the service from queries.
  3439  func (a *Agent) EnableServiceMaintenance(serviceID, reason, token string) error {
  3440  	service, ok := a.State.Services()[serviceID]
  3441  	if !ok {
  3442  		return fmt.Errorf("No service registered with ID %q", serviceID)
  3443  	}
  3444  
  3445  	// Check if maintenance mode is not already enabled
  3446  	checkID := serviceMaintCheckID(serviceID)
  3447  	if _, ok := a.State.Checks()[checkID]; ok {
  3448  		return nil
  3449  	}
  3450  
  3451  	// Use default notes if no reason provided
  3452  	if reason == "" {
  3453  		reason = defaultServiceMaintReason
  3454  	}
  3455  
  3456  	// Create and register the critical health check
  3457  	check := &structs.HealthCheck{
  3458  		Node:        a.config.NodeName,
  3459  		CheckID:     checkID,
  3460  		Name:        "Service Maintenance Mode",
  3461  		Notes:       reason,
  3462  		ServiceID:   service.ID,
  3463  		ServiceName: service.Service,
  3464  		Status:      api.HealthCritical,
  3465  	}
  3466  	a.AddCheck(check, nil, true, token, ConfigSourceLocal)
  3467  	a.logger.Printf("[INFO] agent: Service %q entered maintenance mode", serviceID)
  3468  
  3469  	return nil
  3470  }
  3471  
  3472  // DisableServiceMaintenance will deregister the fake maintenance mode check
  3473  // if the service has been marked as in maintenance.
  3474  func (a *Agent) DisableServiceMaintenance(serviceID string) error {
  3475  	if _, ok := a.State.Services()[serviceID]; !ok {
  3476  		return fmt.Errorf("No service registered with ID %q", serviceID)
  3477  	}
  3478  
  3479  	// Check if maintenance mode is enabled
  3480  	checkID := serviceMaintCheckID(serviceID)
  3481  	if _, ok := a.State.Checks()[checkID]; !ok {
  3482  		return nil
  3483  	}
  3484  
  3485  	// Deregister the maintenance check
  3486  	a.RemoveCheck(checkID, true)
  3487  	a.logger.Printf("[INFO] agent: Service %q left maintenance mode", serviceID)
  3488  
  3489  	return nil
  3490  }
  3491  
  3492  // EnableNodeMaintenance places a node into maintenance mode.
  3493  func (a *Agent) EnableNodeMaintenance(reason, token string) {
  3494  	// Ensure node maintenance is not already enabled
  3495  	if _, ok := a.State.Checks()[structs.NodeMaint]; ok {
  3496  		return
  3497  	}
  3498  
  3499  	// Use a default notes value
  3500  	if reason == "" {
  3501  		reason = defaultNodeMaintReason
  3502  	}
  3503  
  3504  	// Create and register the node maintenance check
  3505  	check := &structs.HealthCheck{
  3506  		Node:    a.config.NodeName,
  3507  		CheckID: structs.NodeMaint,
  3508  		Name:    "Node Maintenance Mode",
  3509  		Notes:   reason,
  3510  		Status:  api.HealthCritical,
  3511  	}
  3512  	a.AddCheck(check, nil, true, token, ConfigSourceLocal)
  3513  	a.logger.Printf("[INFO] agent: Node entered maintenance mode")
  3514  }
  3515  
  3516  // DisableNodeMaintenance removes a node from maintenance mode
  3517  func (a *Agent) DisableNodeMaintenance() {
  3518  	if _, ok := a.State.Checks()[structs.NodeMaint]; !ok {
  3519  		return
  3520  	}
  3521  	a.RemoveCheck(structs.NodeMaint, true)
  3522  	a.logger.Printf("[INFO] agent: Node left maintenance mode")
  3523  }
  3524  
  3525  func (a *Agent) loadLimits(conf *config.RuntimeConfig) {
  3526  	a.config.RPCRateLimit = conf.RPCRateLimit
  3527  	a.config.RPCMaxBurst = conf.RPCMaxBurst
  3528  }
  3529  
// ReloadConfig applies newCfg to the running agent. With anti-entropy sync
// paused and stateLock held, it unloads all proxies, services, checks, and
// metadata, reloads tokens and the TLS configuration, and then loads
// services, proxies, checks, metadata, and watches from newCfg. Finally it
// refreshes the RPC limits, reloads the delegate's configuration, and updates
// the metrics filter and the discard-check-output flag.
//
// The first failing step aborts the reload and its error is returned; steps
// that already ran are not rolled back. The check status snapshot taken at
// the start is restored on every return path because it is deferred.
func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error {
	// Bulk update the services and checks
	a.PauseSync()
	defer a.ResumeSync()

	a.stateLock.Lock()
	defer a.stateLock.Unlock()

	// Snapshot the current state, and restore it afterwards
	snap := a.snapshotCheckState()
	defer a.restoreCheckState(snap)

	// First unload all checks, services, and metadata. This lets us begin the reload
	// with a clean slate.
	if err := a.unloadProxies(); err != nil {
		return fmt.Errorf("Failed unloading proxies: %s", err)
	}
	if err := a.unloadServices(); err != nil {
		return fmt.Errorf("Failed unloading services: %s", err)
	}
	if err := a.unloadChecks(); err != nil {
		return fmt.Errorf("Failed unloading checks: %s", err)
	}
	a.unloadMetadata()

	// Reload tokens - should be done before all the other loading
	// to ensure the correct tokens are available for attaching to
	// the checks and service registrations.
	// The returned error is not checked here: loadTokens itself logs a
	// warning when the persisted tokens cannot be read and still applies the
	// tokens from the configuration.
	a.loadTokens(newCfg)

	if err := a.tlsConfigurator.Update(newCfg.ToTLSUtilConfig()); err != nil {
		return fmt.Errorf("Failed reloading tls configuration: %s", err)
	}

	// Reload service/check definitions and metadata.
	if err := a.loadServices(newCfg); err != nil {
		return fmt.Errorf("Failed reloading services: %s", err)
	}
	if err := a.loadProxies(newCfg); err != nil {
		return fmt.Errorf("Failed reloading proxies: %s", err)
	}
	if err := a.loadChecks(newCfg); err != nil {
		return fmt.Errorf("Failed reloading checks: %s", err)
	}
	if err := a.loadMetadata(newCfg); err != nil {
		return fmt.Errorf("Failed reloading metadata: %s", err)
	}

	if err := a.reloadWatches(newCfg); err != nil {
		return fmt.Errorf("Failed reloading watches: %v", err)
	}

	// Copy the new RPC limits into a.config before the consul config is
	// built below.
	a.loadLimits(newCfg)

	// create the config for the rpc server/client
	consulCfg, err := a.consulConfig()
	if err != nil {
		return err
	}

	if err := a.delegate.ReloadConfig(consulCfg); err != nil {
		return err
	}

	// Update filtered metrics
	metrics.UpdateFilter(newCfg.Telemetry.AllowedPrefixes,
		newCfg.Telemetry.BlockedPrefixes)

	a.State.SetDiscardCheckOutput(newCfg.DiscardCheckOutput)

	return nil
}
  3602  
  3603  // registerCache configures the cache and registers all the supported
  3604  // types onto the cache. This is NOT safe to call multiple times so
  3605  // care should be taken to call this exactly once after the cache
  3606  // field has been initialized.
  3607  func (a *Agent) registerCache() {
  3608  	// Note that you should register the _agent_ as the RPC implementation and not
  3609  	// the a.delegate directly, otherwise tests that rely on overriding RPC
  3610  	// routing via a.registerEndpoint will not work.
  3611  
  3612  	a.cache.RegisterType(cachetype.ConnectCARootName, &cachetype.ConnectCARoot{
  3613  		RPC: a,
  3614  	}, &cache.RegisterOptions{
  3615  		// Maintain a blocking query, retry dropped connections quickly
  3616  		Refresh:        true,
  3617  		RefreshTimer:   0 * time.Second,
  3618  		RefreshTimeout: 10 * time.Minute,
  3619  	})
  3620  
  3621  	a.cache.RegisterType(cachetype.ConnectCALeafName, &cachetype.ConnectCALeaf{
  3622  		RPC:                              a,
  3623  		Cache:                            a.cache,
  3624  		Datacenter:                       a.config.Datacenter,
  3625  		TestOverrideCAChangeInitialDelay: a.config.ConnectTestCALeafRootChangeSpread,
  3626  	}, &cache.RegisterOptions{
  3627  		// Maintain a blocking query, retry dropped connections quickly
  3628  		Refresh:        true,
  3629  		RefreshTimer:   0 * time.Second,
  3630  		RefreshTimeout: 10 * time.Minute,
  3631  	})
  3632  
  3633  	a.cache.RegisterType(cachetype.IntentionMatchName, &cachetype.IntentionMatch{
  3634  		RPC: a,
  3635  	}, &cache.RegisterOptions{
  3636  		// Maintain a blocking query, retry dropped connections quickly
  3637  		Refresh:        true,
  3638  		RefreshTimer:   0 * time.Second,
  3639  		RefreshTimeout: 10 * time.Minute,
  3640  	})
  3641  
  3642  	a.cache.RegisterType(cachetype.CatalogServicesName, &cachetype.CatalogServices{
  3643  		RPC: a,
  3644  	}, &cache.RegisterOptions{
  3645  		// Maintain a blocking query, retry dropped connections quickly
  3646  		Refresh:        true,
  3647  		RefreshTimer:   0 * time.Second,
  3648  		RefreshTimeout: 10 * time.Minute,
  3649  	})
  3650  
  3651  	a.cache.RegisterType(cachetype.HealthServicesName, &cachetype.HealthServices{
  3652  		RPC: a,
  3653  	}, &cache.RegisterOptions{
  3654  		// Maintain a blocking query, retry dropped connections quickly
  3655  		Refresh:        true,
  3656  		RefreshTimer:   0 * time.Second,
  3657  		RefreshTimeout: 10 * time.Minute,
  3658  	})
  3659  
  3660  	a.cache.RegisterType(cachetype.PreparedQueryName, &cachetype.PreparedQuery{
  3661  		RPC: a,
  3662  	}, &cache.RegisterOptions{
  3663  		// Prepared queries don't support blocking
  3664  		Refresh: false,
  3665  	})
  3666  
  3667  	a.cache.RegisterType(cachetype.NodeServicesName, &cachetype.NodeServices{
  3668  		RPC: a,
  3669  	}, &cache.RegisterOptions{
  3670  		// Maintain a blocking query, retry dropped connections quickly
  3671  		Refresh:        true,
  3672  		RefreshTimer:   0 * time.Second,
  3673  		RefreshTimeout: 10 * time.Minute,
  3674  	})
  3675  }
  3676  
  3677  // defaultProxyCommand returns the default Connect managed proxy command.
  3678  func defaultProxyCommand(agentCfg *config.RuntimeConfig) ([]string, error) {
  3679  	// Get the path to the current executable. This is cached once by the
  3680  	// library so this is effectively just a variable read.
  3681  	execPath, err := os.Executable()
  3682  	if err != nil {
  3683  		return nil, err
  3684  	}
  3685  
  3686  	// "consul connect proxy" default value for managed daemon proxy
  3687  	cmd := []string{execPath, "connect", "proxy"}
  3688  
  3689  	if agentCfg != nil && agentCfg.LogLevel != "INFO" {
  3690  		cmd = append(cmd, "-log-level", agentCfg.LogLevel)
  3691  	}
  3692  	return cmd, nil
  3693  }