github.com/kjdelisle/consul@v1.4.5/agent/agent.go (about)

     1  package agent
     2  
     3  import (
     4  	"context"
     5  	"crypto/sha512"
     6  	"crypto/tls"
     7  	"encoding/json"
     8  	"fmt"
     9  	"io"
    10  	"io/ioutil"
    11  	"log"
    12  	"net"
    13  	"net/http"
    14  	"os"
    15  	"path/filepath"
    16  	"strconv"
    17  	"strings"
    18  	"sync"
    19  	"time"
    20  
    21  	"google.golang.org/grpc"
    22  
    23  	metrics "github.com/armon/go-metrics"
    24  	"github.com/hashicorp/consul/acl"
    25  	"github.com/hashicorp/consul/agent/ae"
    26  	"github.com/hashicorp/consul/agent/cache"
    27  	cachetype "github.com/hashicorp/consul/agent/cache-types"
    28  	"github.com/hashicorp/consul/agent/checks"
    29  	"github.com/hashicorp/consul/agent/config"
    30  	"github.com/hashicorp/consul/agent/consul"
    31  	"github.com/hashicorp/consul/agent/local"
    32  	"github.com/hashicorp/consul/agent/proxycfg"
    33  	"github.com/hashicorp/consul/agent/proxyprocess"
    34  	"github.com/hashicorp/consul/agent/structs"
    35  	"github.com/hashicorp/consul/agent/systemd"
    36  	"github.com/hashicorp/consul/agent/token"
    37  	"github.com/hashicorp/consul/agent/xds"
    38  	"github.com/hashicorp/consul/api"
    39  	"github.com/hashicorp/consul/ipaddr"
    40  	"github.com/hashicorp/consul/lib"
    41  	"github.com/hashicorp/consul/lib/file"
    42  	"github.com/hashicorp/consul/logger"
    43  	"github.com/hashicorp/consul/tlsutil"
    44  	"github.com/hashicorp/consul/types"
    45  	"github.com/hashicorp/consul/watch"
    46  	multierror "github.com/hashicorp/go-multierror"
    47  	uuid "github.com/hashicorp/go-uuid"
    48  	"github.com/hashicorp/memberlist"
    49  	"github.com/hashicorp/raft"
    50  	"github.com/hashicorp/serf/serf"
    51  	"github.com/shirou/gopsutil/host"
    52  	"golang.org/x/net/http2"
    53  )
    54  
const (
	// servicesDir is the directory name under which agent service
	// definitions are saved.
	servicesDir = "services"

	// proxyDir is the directory name under which agent proxy definitions
	// are saved.
	proxyDir = "proxies"

	// checksDir is the directory name under which local agent checks are
	// saved; checkStateDir is a "state" sub-directory below it.
	checksDir     = "checks"
	checkStateDir = "checks/state"

	// tokensPath is the name of the file that tokens are persisted within.
	tokensPath = "acl-tokens.json"

	// defaultNodeMaintReason and defaultServiceMaintReason are the default
	// reasons reported for node and service maintenance mode when no reason
	// was provided.
	defaultNodeMaintReason = "Maintenance mode is enabled for this node, " +
		"but no reason was provided. This is a default message."
	defaultServiceMaintReason = "Maintenance mode is enabled for this " +
		"service, but no reason was provided. This is a default message."
)
    75  
// configSource is an integer enumeration with the two values declared below.
// NOTE(review): judging by the names it distinguishes locally supplied from
// remotely supplied configuration - confirm against the callers, which are
// not visible in this part of the file.
type configSource int

const (
	// ConfigSourceLocal is the zero value of configSource.
	ConfigSourceLocal configSource = iota
	// ConfigSourceRemote is the second configSource value.
	ConfigSourceRemote
)
    82  
// delegate defines the interface shared by both
// consul.Client and consul.Server.
//
// Start stores whichever of the two it creates in Agent.delegate, so the
// rest of the agent can talk to the cluster without knowing whether it runs
// in client or server mode. The embedded enterpriseDelegate is declared
// elsewhere.
type delegate interface {
	Encrypted() bool
	GetLANCoordinate() (lib.CoordinateSet, error)
	Leave() error
	LANMembers() []serf.Member
	LANMembersAllSegments() ([]serf.Member, error)
	LANSegmentMembers(segment string) ([]serf.Member, error)
	LocalMember() serf.Member
	JoinLAN(addrs []string) (n int, err error)
	RemoveFailedNode(node string) error
	ResolveToken(secretID string) (acl.Authorizer, error)
	RPC(method string, args interface{}, reply interface{}) error
	ACLsEnabled() bool
	UseLegacyACLs() bool
	SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer, replyFn structs.SnapshotReplyFn) error
	Shutdown() error
	Stats() map[string]map[string]string
	ReloadConfig(config *consul.Config) error
	enterpriseDelegate
}
   105  
// notifier is called after a successful JoinLAN.
//
// New installs a *systemd.Notifier as the agent's joinLANNotifier.
type notifier interface {
	// Notify delivers the given string and reports any failure to do so.
	Notify(string) error
}
   110  
// Agent is the long running process that is run on every machine.
// It exposes an RPC interface that is used by the CLI to control the
// agent. The agent runs the query interfaces like HTTP, DNS, and RPC.
// However, it can run in either a client, or server mode. In server
// mode, it runs a full Consul server. In client-only mode, it only forwards
// requests to other Consul servers.
type Agent struct {
	// config is the agent configuration.
	config *config.RuntimeConfig

	// logger is used for writing our logs. Start creates one if it is nil.
	logger *log.Logger

	// LogOutput is the output sink for logs. Start falls back to os.Stderr
	// when it has to create the logger and LogOutput is nil.
	LogOutput io.Writer

	// LogWriter is used for streaming logs to
	LogWriter *logger.LogWriter

	// MemSink is the in-memory sink used for collecting metrics
	MemSink *metrics.InmemSink

	// delegate is either a *consul.Server or *consul.Client
	// depending on the configuration
	delegate delegate

	// aclMasterAuthorizer is an object that helps manage local ACL enforcement.
	aclMasterAuthorizer acl.Authorizer

	// State stores a local representation of the node,
	// services and checks. Used for anti-entropy.
	State *local.State

	// sync manages the synchronization of the local
	// and the remote state.
	sync *ae.StateSyncer

	// syncMu and syncCh are used to coordinate agent endpoints that are blocking
	// on local state during a config reload.
	syncMu sync.Mutex
	syncCh chan struct{}

	// cache is the in-memory cache for data the Agent requests.
	cache *cache.Cache

	// checkReapAfter maps the check ID to a timeout after which we should
	// reap its associated service
	checkReapAfter map[types.CheckID]time.Duration

	// checkMonitors maps the check ID to an associated monitor
	checkMonitors map[types.CheckID]*checks.CheckMonitor

	// checkHTTPs maps the check ID to an associated HTTP check
	checkHTTPs map[types.CheckID]*checks.CheckHTTP

	// checkTCPs maps the check ID to an associated TCP check
	checkTCPs map[types.CheckID]*checks.CheckTCP

	// checkGRPCs maps the check ID to an associated GRPC check
	checkGRPCs map[types.CheckID]*checks.CheckGRPC

	// checkTTLs maps the check ID to an associated check TTL
	checkTTLs map[types.CheckID]*checks.CheckTTL

	// checkDockers maps the check ID to an associated Docker Exec based check
	checkDockers map[types.CheckID]*checks.CheckDocker

	// checkAliases maps the check ID to an associated Alias checks
	checkAliases map[types.CheckID]*checks.CheckAlias

	// stateLock protects the agent state. Start holds it for its whole
	// duration.
	stateLock sync.Mutex

	// dockerClient is the client for performing docker health checks.
	dockerClient *checks.DockerClient

	// eventCh is used to receive user events
	eventCh chan serf.UserEvent

	// eventBuf stores the most recent events in a ring buffer
	// using eventIndex as the next index to insert into. This
	// is guarded by eventLock. When an insert happens, the
	// eventNotify group is notified.
	eventBuf    []*UserEvent
	eventIndex  int
	eventLock   sync.RWMutex
	eventNotify NotifyGroup

	// reloadCh carries channels of error values.
	// NOTE(review): presumably each element is a reply channel for the
	// outcome of one reload request - confirm against the reload handler.
	reloadCh chan chan error

	// shutdown, shutdownCh and shutdownLock track agent shutdown.
	// shutdownCh is handed to the state syncer in Start.
	// NOTE(review): presumably shutdownLock guards shutdown - confirm
	// against the shutdown code.
	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex

	// joinLANNotifier is called after a successful JoinLAN.
	joinLANNotifier notifier

	// retryJoinCh transports errors from the retry join
	// attempts.
	retryJoinCh chan error

	// endpoints maps unique RPC endpoint names to common ones
	// to allow overriding of RPC handlers since the golang
	// net/rpc server does not allow this.
	endpoints     map[string]string
	endpointsLock sync.RWMutex

	// dnsServers provide the DNS API, one server per configured DNS address.
	dnsServers []*DNSServer

	// httpServers provides the HTTP API on various endpoints
	httpServers []*HTTPServer

	// wgServers is the wait group for all HTTP and DNS servers
	wgServers sync.WaitGroup

	// watchPlans tracks all the currently-running watch plans for the
	// agent.
	watchPlans []*watch.Plan

	// tokens holds ACL tokens initially from the configuration, but can
	// be updated at runtime, so should always be used instead of going to
	// the configuration directly.
	tokens *token.Store

	// proxyManager is the proxy process manager for managed Connect proxies.
	proxyManager *proxyprocess.Manager

	// proxyConfig is the manager for proxy service (Kind = connect-proxy)
	// configuration state. This ensures all state needed by a proxy registration
	// is maintained in cache and handles pushing updates to that state into XDS
	// server to be pushed out to Envoy. This is NOT related to managed proxies
	// directly.
	proxyConfig *proxycfg.Manager

	// xdsServer is the Server instance that serves xDS gRPC API.
	xdsServer *xds.Server

	// grpcServer is the server instance used currently to serve xDS API for
	// Envoy.
	grpcServer *grpc.Server

	// tlsConfigurator is the central instance to provide a *tls.Config
	// based on the current consul configuration.
	tlsConfigurator *tlsutil.Configurator

	// persistedTokensLock is used to synchronize access to the persisted token
	// store within the data directory. This will prevent loading while writing as
	// well as multiple concurrent writes.
	persistedTokensLock sync.RWMutex
}
   262  
   263  func New(c *config.RuntimeConfig) (*Agent, error) {
   264  	if c.Datacenter == "" {
   265  		return nil, fmt.Errorf("Must configure a Datacenter")
   266  	}
   267  	if c.DataDir == "" && !c.DevMode {
   268  		return nil, fmt.Errorf("Must configure a DataDir")
   269  	}
   270  
   271  	a := &Agent{
   272  		config:          c,
   273  		checkReapAfter:  make(map[types.CheckID]time.Duration),
   274  		checkMonitors:   make(map[types.CheckID]*checks.CheckMonitor),
   275  		checkTTLs:       make(map[types.CheckID]*checks.CheckTTL),
   276  		checkHTTPs:      make(map[types.CheckID]*checks.CheckHTTP),
   277  		checkTCPs:       make(map[types.CheckID]*checks.CheckTCP),
   278  		checkGRPCs:      make(map[types.CheckID]*checks.CheckGRPC),
   279  		checkDockers:    make(map[types.CheckID]*checks.CheckDocker),
   280  		checkAliases:    make(map[types.CheckID]*checks.CheckAlias),
   281  		eventCh:         make(chan serf.UserEvent, 1024),
   282  		eventBuf:        make([]*UserEvent, 256),
   283  		joinLANNotifier: &systemd.Notifier{},
   284  		reloadCh:        make(chan chan error),
   285  		retryJoinCh:     make(chan error),
   286  		shutdownCh:      make(chan struct{}),
   287  		endpoints:       make(map[string]string),
   288  		tokens:          new(token.Store),
   289  	}
   290  
   291  	if err := a.initializeACLs(); err != nil {
   292  		return nil, err
   293  	}
   294  
   295  	return a, nil
   296  }
   297  
   298  func LocalConfig(cfg *config.RuntimeConfig) local.Config {
   299  	lc := local.Config{
   300  		AdvertiseAddr:       cfg.AdvertiseAddrLAN.String(),
   301  		CheckUpdateInterval: cfg.CheckUpdateInterval,
   302  		Datacenter:          cfg.Datacenter,
   303  		DiscardCheckOutput:  cfg.DiscardCheckOutput,
   304  		NodeID:              cfg.NodeID,
   305  		NodeName:            cfg.NodeName,
   306  		TaggedAddresses:     map[string]string{},
   307  		ProxyBindMinPort:    cfg.ConnectProxyBindMinPort,
   308  		ProxyBindMaxPort:    cfg.ConnectProxyBindMaxPort,
   309  	}
   310  	for k, v := range cfg.TaggedAddresses {
   311  		lc.TaggedAddresses[k] = v
   312  	}
   313  	return lc
   314  }
   315  
   316  func (a *Agent) setupProxyManager() error {
   317  	acfg, err := a.config.APIConfig(true)
   318  	if err != nil {
   319  		return fmt.Errorf("[INFO] agent: Connect managed proxies are disabled due to providing an invalid HTTP configuration")
   320  	}
   321  	a.proxyManager = proxyprocess.NewManager()
   322  	a.proxyManager.AllowRoot = a.config.ConnectProxyAllowManagedRoot
   323  	a.proxyManager.State = a.State
   324  	a.proxyManager.Logger = a.logger
   325  	if a.config.DataDir != "" {
   326  		// DataDir is required for all non-dev mode agents, but we want
   327  		// to allow setting the data dir for demos and so on for the agent,
   328  		// so do the check above instead.
   329  		a.proxyManager.DataDir = filepath.Join(a.config.DataDir, "proxy")
   330  
   331  		// Restore from our snapshot (if it exists)
   332  		if err := a.proxyManager.Restore(a.proxyManager.SnapshotPath()); err != nil {
   333  			a.logger.Printf("[WARN] agent: error restoring proxy state: %s", err)
   334  		}
   335  	}
   336  	a.proxyManager.ProxyEnv = acfg.GenerateEnv()
   337  	return nil
   338  }
   339  
   340  func (a *Agent) Start() error {
   341  	a.stateLock.Lock()
   342  	defer a.stateLock.Unlock()
   343  
   344  	c := a.config
   345  
   346  	logOutput := a.LogOutput
   347  	if a.logger == nil {
   348  		if logOutput == nil {
   349  			logOutput = os.Stderr
   350  		}
   351  		a.logger = log.New(logOutput, "", log.LstdFlags)
   352  	}
   353  
   354  	// Retrieve or generate the node ID before setting up the rest of the
   355  	// agent, which depends on it.
   356  	if err := a.setupNodeID(c); err != nil {
   357  		return fmt.Errorf("Failed to setup node ID: %v", err)
   358  	}
   359  
   360  	// Warn if the node name is incompatible with DNS
   361  	if InvalidDnsRe.MatchString(a.config.NodeName) {
   362  		a.logger.Printf("[WARN] agent: Node name %q will not be discoverable "+
   363  			"via DNS due to invalid characters. Valid characters include "+
   364  			"all alpha-numerics and dashes.", a.config.NodeName)
   365  	} else if len(a.config.NodeName) > MaxDNSLabelLength {
   366  		a.logger.Printf("[WARN] agent: Node name %q will not be discoverable "+
   367  			"via DNS due to it being too long. Valid lengths are between "+
   368  			"1 and 63 bytes.", a.config.NodeName)
   369  	}
   370  
   371  	// load the tokens - this requires the logger to be setup
   372  	// which is why we can't do this in New
   373  	a.loadTokens(a.config)
   374  
   375  	// create the local state
   376  	a.State = local.NewState(LocalConfig(c), a.logger, a.tokens)
   377  
   378  	// create the state synchronization manager which performs
   379  	// regular and on-demand state synchronizations (anti-entropy).
   380  	a.sync = ae.NewStateSyncer(a.State, c.AEInterval, a.shutdownCh, a.logger)
   381  
   382  	// create the cache
   383  	a.cache = cache.New(nil)
   384  
   385  	// create the config for the rpc server/client
   386  	consulCfg, err := a.consulConfig()
   387  	if err != nil {
   388  		return err
   389  	}
   390  
   391  	// ServerUp is used to inform that a new consul server is now
   392  	// up. This can be used to speed up the sync process if we are blocking
   393  	// waiting to discover a consul server
   394  	consulCfg.ServerUp = a.sync.SyncFull.Trigger
   395  
   396  	tlsConfigurator, err := tlsutil.NewConfigurator(c.ToTLSUtilConfig(), a.logger)
   397  	if err != nil {
   398  		return err
   399  	}
   400  	a.tlsConfigurator = tlsConfigurator
   401  
   402  	// Setup either the client or the server.
   403  	if c.ServerMode {
   404  		server, err := consul.NewServerLogger(consulCfg, a.logger, a.tokens, a.tlsConfigurator)
   405  		if err != nil {
   406  			return fmt.Errorf("Failed to start Consul server: %v", err)
   407  		}
   408  		a.delegate = server
   409  	} else {
   410  		client, err := consul.NewClientLogger(consulCfg, a.logger, a.tlsConfigurator)
   411  		if err != nil {
   412  			return fmt.Errorf("Failed to start Consul client: %v", err)
   413  		}
   414  		a.delegate = client
   415  	}
   416  
   417  	// the staggering of the state syncing depends on the cluster size.
   418  	a.sync.ClusterSize = func() int { return len(a.delegate.LANMembers()) }
   419  
   420  	// link the state with the consul server/client and the state syncer
   421  	// via callbacks. After several attempts this was easier than using
   422  	// channels since the event notification needs to be non-blocking
   423  	// and that should be hidden in the state syncer implementation.
   424  	a.State.Delegate = a.delegate
   425  	a.State.TriggerSyncChanges = a.sync.SyncChanges.Trigger
   426  
   427  	// Register the cache. We do this much later so the delegate is
   428  	// populated from above.
   429  	a.registerCache()
   430  
   431  	// Load checks/services/metadata.
   432  	if err := a.loadServices(c); err != nil {
   433  		return err
   434  	}
   435  	if err := a.loadProxies(c); err != nil {
   436  		return err
   437  	}
   438  	if err := a.loadChecks(c); err != nil {
   439  		return err
   440  	}
   441  	if err := a.loadMetadata(c); err != nil {
   442  		return err
   443  	}
   444  
   445  	// create the proxy process manager and start it. This is purposely
   446  	// done here after the local state above is loaded in so we can have
   447  	// a more accurate initial state view.
   448  	if !c.ConnectTestDisableManagedProxies {
   449  		if err := a.setupProxyManager(); err != nil {
   450  			a.logger.Printf(err.Error())
   451  		} else {
   452  			go a.proxyManager.Run()
   453  		}
   454  	}
   455  
   456  	// Start the proxy config manager.
   457  	a.proxyConfig, err = proxycfg.NewManager(proxycfg.ManagerConfig{
   458  		Cache:  a.cache,
   459  		Logger: a.logger,
   460  		State:  a.State,
   461  		Source: &structs.QuerySource{
   462  			Node:       a.config.NodeName,
   463  			Datacenter: a.config.Datacenter,
   464  			Segment:    a.config.SegmentName,
   465  		},
   466  	})
   467  	if err != nil {
   468  		return err
   469  	}
   470  	go func() {
   471  		if err := a.proxyConfig.Run(); err != nil {
   472  			a.logger.Printf("[ERR] Proxy Config Manager exited: %s", err)
   473  		}
   474  	}()
   475  
   476  	// Start watching for critical services to deregister, based on their
   477  	// checks.
   478  	go a.reapServices()
   479  
   480  	// Start handling events.
   481  	go a.handleEvents()
   482  
   483  	// Start sending network coordinate to the server.
   484  	if !c.DisableCoordinates {
   485  		go a.sendCoordinate()
   486  	}
   487  
   488  	// Write out the PID file if necessary.
   489  	if err := a.storePid(); err != nil {
   490  		return err
   491  	}
   492  
   493  	// start DNS servers
   494  	if err := a.listenAndServeDNS(); err != nil {
   495  		return err
   496  	}
   497  
   498  	// Create listeners and unstarted servers; see comment on listenHTTP why
   499  	// we are doing this.
   500  	servers, err := a.listenHTTP()
   501  	if err != nil {
   502  		return err
   503  	}
   504  
   505  	// Start HTTP and HTTPS servers.
   506  	for _, srv := range servers {
   507  		if err := a.serveHTTP(srv); err != nil {
   508  			return err
   509  		}
   510  		a.httpServers = append(a.httpServers, srv)
   511  	}
   512  
   513  	// Start gRPC server.
   514  	if err := a.listenAndServeGRPC(); err != nil {
   515  		return err
   516  	}
   517  
   518  	// register watches
   519  	if err := a.reloadWatches(a.config); err != nil {
   520  		return err
   521  	}
   522  
   523  	// start retry join
   524  	go a.retryJoinLAN()
   525  	go a.retryJoinWAN()
   526  
   527  	return nil
   528  }
   529  
   530  func (a *Agent) listenAndServeGRPC() error {
   531  	if len(a.config.GRPCAddrs) < 1 {
   532  		return nil
   533  	}
   534  
   535  	a.xdsServer = &xds.Server{
   536  		Logger:       a.logger,
   537  		CfgMgr:       a.proxyConfig,
   538  		Authz:        a,
   539  		ResolveToken: a.resolveToken,
   540  	}
   541  	a.xdsServer.Initialize()
   542  
   543  	var err error
   544  	if a.config.HTTPSPort > 0 {
   545  		// gRPC uses the same TLS settings as the HTTPS API. If HTTPS is
   546  		// enabled then gRPC will require HTTPS as well.
   547  		a.grpcServer, err = a.xdsServer.GRPCServer(a.config.CertFile, a.config.KeyFile)
   548  	} else {
   549  		a.grpcServer, err = a.xdsServer.GRPCServer("", "")
   550  	}
   551  	if err != nil {
   552  		return err
   553  	}
   554  
   555  	ln, err := a.startListeners(a.config.GRPCAddrs)
   556  	if err != nil {
   557  		return err
   558  	}
   559  
   560  	for _, l := range ln {
   561  		go func(innerL net.Listener) {
   562  			a.logger.Printf("[INFO] agent: Started gRPC server on %s (%s)",
   563  				innerL.Addr().String(), innerL.Addr().Network())
   564  			err := a.grpcServer.Serve(innerL)
   565  			if err != nil {
   566  				a.logger.Printf("[ERR] gRPC server failed: %s", err)
   567  			}
   568  		}(l)
   569  	}
   570  	return nil
   571  }
   572  
   573  func (a *Agent) listenAndServeDNS() error {
   574  	notif := make(chan net.Addr, len(a.config.DNSAddrs))
   575  	errCh := make(chan error, len(a.config.DNSAddrs))
   576  	for _, addr := range a.config.DNSAddrs {
   577  		// create server
   578  		s, err := NewDNSServer(a)
   579  		if err != nil {
   580  			return err
   581  		}
   582  		a.dnsServers = append(a.dnsServers, s)
   583  
   584  		// start server
   585  		a.wgServers.Add(1)
   586  		go func(addr net.Addr) {
   587  			defer a.wgServers.Done()
   588  			err := s.ListenAndServe(addr.Network(), addr.String(), func() { notif <- addr })
   589  			if err != nil && !strings.Contains(err.Error(), "accept") {
   590  				errCh <- err
   591  			}
   592  		}(addr)
   593  	}
   594  
   595  	// wait for servers to be up
   596  	timeout := time.After(time.Second)
   597  	var merr *multierror.Error
   598  	for range a.config.DNSAddrs {
   599  		select {
   600  		case addr := <-notif:
   601  			a.logger.Printf("[INFO] agent: Started DNS server %s (%s)", addr.String(), addr.Network())
   602  
   603  		case err := <-errCh:
   604  			merr = multierror.Append(merr, err)
   605  		case <-timeout:
   606  			merr = multierror.Append(merr, fmt.Errorf("agent: timeout starting DNS servers"))
   607  			break
   608  		}
   609  	}
   610  	return merr.ErrorOrNil()
   611  }
   612  
   613  func (a *Agent) startListeners(addrs []net.Addr) ([]net.Listener, error) {
   614  	var ln []net.Listener
   615  	for _, addr := range addrs {
   616  		var l net.Listener
   617  		var err error
   618  
   619  		switch x := addr.(type) {
   620  		case *net.UnixAddr:
   621  			l, err = a.listenSocket(x.Name)
   622  			if err != nil {
   623  				return nil, err
   624  			}
   625  
   626  		case *net.TCPAddr:
   627  			l, err = net.Listen("tcp", x.String())
   628  			if err != nil {
   629  				return nil, err
   630  			}
   631  			l = &tcpKeepAliveListener{l.(*net.TCPListener)}
   632  
   633  		default:
   634  			return nil, fmt.Errorf("unsupported address type %T", addr)
   635  		}
   636  		ln = append(ln, l)
   637  	}
   638  	return ln, nil
   639  }
   640  
   641  // listenHTTP binds listeners to the provided addresses and also returns
   642  // pre-configured HTTP servers which are not yet started. The motivation is
   643  // that in the current startup/shutdown setup we de-couple the listener
   644  // creation from the server startup assuming that if any of the listeners
   645  // cannot be bound we fail immediately and later failures do not occur.
   646  // Therefore, starting a server with a running listener is assumed to not
   647  // produce an error.
   648  //
   649  // The second motivation is that an HTTPS server needs to use the same TLSConfig
   650  // on both the listener and the HTTP server. When listeners and servers are
   651  // created at different times this becomes difficult to handle without keeping
   652  // the TLS configuration somewhere or recreating it.
   653  //
   654  // This approach should ultimately be refactored to the point where we just
   655  // start the server and any error should trigger a proper shutdown of the agent.
   656  func (a *Agent) listenHTTP() ([]*HTTPServer, error) {
   657  	var ln []net.Listener
   658  	var servers []*HTTPServer
   659  	start := func(proto string, addrs []net.Addr) error {
   660  		listeners, err := a.startListeners(addrs)
   661  		if err != nil {
   662  			return err
   663  		}
   664  
   665  		for _, l := range listeners {
   666  			var tlscfg *tls.Config
   667  			_, isTCP := l.(*tcpKeepAliveListener)
   668  			if isTCP && proto == "https" {
   669  				tlscfg = a.tlsConfigurator.IncomingHTTPSConfig()
   670  				l = tls.NewListener(l, tlscfg)
   671  			}
   672  			srv := &HTTPServer{
   673  				Server: &http.Server{
   674  					Addr:      l.Addr().String(),
   675  					TLSConfig: tlscfg,
   676  				},
   677  				ln:        l,
   678  				agent:     a,
   679  				blacklist: NewBlacklist(a.config.HTTPBlockEndpoints),
   680  				proto:     proto,
   681  			}
   682  			srv.Server.Handler = srv.handler(a.config.EnableDebug)
   683  
   684  			// This will enable upgrading connections to HTTP/2 as
   685  			// part of TLS negotiation.
   686  			if proto == "https" {
   687  				err = http2.ConfigureServer(srv.Server, nil)
   688  				if err != nil {
   689  					return err
   690  				}
   691  			}
   692  
   693  			ln = append(ln, l)
   694  			servers = append(servers, srv)
   695  		}
   696  		return nil
   697  	}
   698  
   699  	if err := start("http", a.config.HTTPAddrs); err != nil {
   700  		for _, l := range ln {
   701  			l.Close()
   702  		}
   703  		return nil, err
   704  	}
   705  	if err := start("https", a.config.HTTPSAddrs); err != nil {
   706  		for _, l := range ln {
   707  			l.Close()
   708  		}
   709  		return nil, err
   710  	}
   711  	return servers, nil
   712  }
   713  
   714  // tcpKeepAliveListener sets TCP keep-alive timeouts on accepted
   715  // connections. It's used so dead TCP connections eventually go away.
   716  type tcpKeepAliveListener struct {
   717  	*net.TCPListener
   718  }
   719  
   720  func (ln tcpKeepAliveListener) Accept() (c net.Conn, err error) {
   721  	tc, err := ln.AcceptTCP()
   722  	if err != nil {
   723  		return
   724  	}
   725  	tc.SetKeepAlive(true)
   726  	tc.SetKeepAlivePeriod(30 * time.Second)
   727  	return tc, nil
   728  }
   729  
   730  func (a *Agent) listenSocket(path string) (net.Listener, error) {
   731  	if _, err := os.Stat(path); !os.IsNotExist(err) {
   732  		a.logger.Printf("[WARN] agent: Replacing socket %q", path)
   733  	}
   734  	if err := os.Remove(path); err != nil && !os.IsNotExist(err) {
   735  		return nil, fmt.Errorf("error removing socket file: %s", err)
   736  	}
   737  	l, err := net.Listen("unix", path)
   738  	if err != nil {
   739  		return nil, err
   740  	}
   741  	user, group, mode := a.config.UnixSocketUser, a.config.UnixSocketGroup, a.config.UnixSocketMode
   742  	if err := setFilePermissions(path, user, group, mode); err != nil {
   743  		return nil, fmt.Errorf("Failed setting up socket: %s", err)
   744  	}
   745  	return l, nil
   746  }
   747  
   748  func (a *Agent) serveHTTP(srv *HTTPServer) error {
   749  	// https://github.com/golang/go/issues/20239
   750  	//
   751  	// In go.8.1 there is a race between Serve and Shutdown. If
   752  	// Shutdown is called before the Serve go routine was scheduled then
   753  	// the Serve go routine never returns. This deadlocks the agent
   754  	// shutdown for some tests since it will wait forever.
   755  	notif := make(chan net.Addr)
   756  	a.wgServers.Add(1)
   757  	go func() {
   758  		defer a.wgServers.Done()
   759  		notif <- srv.ln.Addr()
   760  		err := srv.Serve(srv.ln)
   761  		if err != nil && err != http.ErrServerClosed {
   762  			a.logger.Print(err)
   763  		}
   764  	}()
   765  
   766  	select {
   767  	case addr := <-notif:
   768  		if srv.proto == "https" {
   769  			a.logger.Printf("[INFO] agent: Started HTTPS server on %s (%s)", addr.String(), addr.Network())
   770  		} else {
   771  			a.logger.Printf("[INFO] agent: Started HTTP server on %s (%s)", addr.String(), addr.Network())
   772  		}
   773  		return nil
   774  	case <-time.After(time.Second):
   775  		return fmt.Errorf("agent: timeout starting HTTP servers")
   776  	}
   777  }
   778  
// reloadWatches stops any existing watch plans and attempts to load the given
// set of watches.
//
// All watches in cfg.Watches are validated and compiled before any of them is
// started, so a validation error leaves the agent with no running watch
// plans (the old ones have already been stopped). Each compiled plan is then
// run in its own goroutine; failures at that stage are logged, not returned.
func (a *Agent) reloadWatches(cfg *config.RuntimeConfig) error {
	// Stop the current watches.
	for _, wp := range a.watchPlans {
		wp.Stop()
	}
	a.watchPlans = nil

	// Return if there are no watches now.
	if len(cfg.Watches) == 0 {
		return nil
	}

	// Watches use the API to talk to this agent, so that must be enabled.
	if len(cfg.HTTPAddrs) == 0 && len(cfg.HTTPSAddrs) == 0 {
		return fmt.Errorf("watch plans require an HTTP or HTTPS endpoint")
	}

	// Compile the watches
	var watchPlans []*watch.Plan
	for _, params := range cfg.Watches {
		// The handler type defaults to "script". Note that the default is
		// written back into the watch's parameter map in cfg.
		if handlerType, ok := params["handler_type"]; !ok {
			params["handler_type"] = "script"
		} else if handlerType != "http" && handlerType != "script" {
			return fmt.Errorf("Handler type '%s' not recognized", params["handler_type"])
		}

		// Don't let people use connect watches via this mechanism for now as it
		// needs thought about how to do securely and shouldn't be necessary. Note
		// that if the type assertion fails and type is not a string then
		// ParseExempt below will error so we don't need to handle that case.
		if typ, ok := params["type"].(string); ok {
			if strings.HasPrefix(typ, "connect_") {
				return fmt.Errorf("Watch type %s is not allowed in agent config", typ)
			}
		}

		// Parse the watches, excluding 'handler' and 'args'
		wp, err := watch.ParseExempt(params, []string{"handler", "args"})
		if err != nil {
			return fmt.Errorf("Failed to parse watch (%#v): %v", params, err)
		}

		// Get the handler and subprocess arguments
		handler, hasHandler := wp.Exempt["handler"]
		args, hasArgs := wp.Exempt["args"]
		if hasHandler {
			a.logger.Printf("[WARN] agent: The 'handler' field in watches has been deprecated " +
				"and replaced with the 'args' field. See https://www.consul.io/docs/agent/watches.html")
		}
		if _, ok := handler.(string); hasHandler && !ok {
			return fmt.Errorf("Watch handler must be a string")
		}
		// Convert 'args' from a generic list into a []string, rejecting
		// anything that is not a list of strings.
		if raw, ok := args.([]interface{}); hasArgs && ok {
			var parsed []string
			for _, arg := range raw {
				v, ok := arg.(string)
				if !ok {
					return fmt.Errorf("Watch args must be a list of strings")
				}

				parsed = append(parsed, v)
			}
			wp.Exempt["args"] = parsed
		} else if hasArgs && !ok {
			return fmt.Errorf("Watch args must be a list of strings")
		}
		// Exactly one way of handling the watch must be given: 'handler',
		// 'args', or the "http" handler type.
		if hasHandler && hasArgs || hasHandler && wp.HandlerType == "http" || hasArgs && wp.HandlerType == "http" {
			return fmt.Errorf("Only one watch handler allowed")
		}
		if !hasHandler && !hasArgs && wp.HandlerType != "http" {
			return fmt.Errorf("Must define a watch handler")
		}

		// Store the watch plan
		watchPlans = append(watchPlans, wp)
	}

	// Fire off a goroutine for each new watch plan.
	for _, wp := range watchPlans {
		// The API client configuration is derived from a.config, not from
		// the cfg argument. A plan for which it cannot be built is skipped
		// and not recorded in a.watchPlans.
		config, err := a.config.APIConfig(true)
		if err != nil {
			a.logger.Printf("[ERR] agent: Failed to run watch: %v", err)
			continue
		}

		a.watchPlans = append(a.watchPlans, wp)
		go func(wp *watch.Plan) {
			// Pick the handler in the same precedence used during
			// validation: 'handler', then 'args', then the HTTP handler.
			if h, ok := wp.Exempt["handler"]; ok {
				wp.Handler = makeWatchHandler(a.LogOutput, h)
			} else if h, ok := wp.Exempt["args"]; ok {
				wp.Handler = makeWatchHandler(a.LogOutput, h)
			} else {
				// NOTE(review): unchecked type assertion; this panics if
				// "http_handler_config" is missing or of another type.
				// Presumably the watch parser guarantees it for the "http"
				// handler type - confirm.
				httpConfig := wp.Exempt["http_handler_config"].(*watch.HttpHandlerConfig)
				wp.Handler = makeHTTPWatchHandler(a.LogOutput, httpConfig)
			}
			wp.LogOutput = a.LogOutput

			addr := config.Address
			if config.Scheme == "https" {
				addr = "https://" + addr
			}

			if err := wp.RunWithConfig(addr, config); err != nil {
				a.logger.Printf("[ERR] agent: Failed to run watch: %v", err)
			}
		}(wp)
	}
	return nil
}
   890  
// consulConfig builds the *consul.Config handed to the underlying consul
// client/server. It starts from consul.DefaultConfig() and overlays values
// taken from the agent's runtime configuration (a.config).
//
// It returns an error when the network segment configuration cannot be built,
// when a Connect CA "cluster_id" is supplied but is not a string holding a
// valid UUID, or when keyring setup fails.
func (a *Agent) consulConfig() (*consul.Config, error) {
	// Start with the provided config or default config
	base := consul.DefaultConfig()

	// This is set when the agent starts up
	base.NodeID = a.config.NodeID

	// Apply dev mode
	base.DevMode = a.config.DevMode

	// Override with our config
	// todo(fs): these are now always set in the runtime config so we can simplify this
	// todo(fs): or is there a reason to keep it like that?
	base.Datacenter = a.config.Datacenter
	base.PrimaryDatacenter = a.config.PrimaryDatacenter
	base.DataDir = a.config.DataDir
	base.NodeName = a.config.NodeName

	// Coordinate update batching.
	base.CoordinateUpdateBatchSize = a.config.ConsulCoordinateUpdateBatchSize
	base.CoordinateUpdateMaxBatches = a.config.ConsulCoordinateUpdateMaxBatches
	base.CoordinateUpdatePeriod = a.config.ConsulCoordinateUpdatePeriod

	// Raft timing.
	base.RaftConfig.HeartbeatTimeout = a.config.ConsulRaftHeartbeatTimeout
	base.RaftConfig.LeaderLeaseTimeout = a.config.ConsulRaftLeaderLeaseTimeout
	base.RaftConfig.ElectionTimeout = a.config.ConsulRaftElectionTimeout

	// Serf LAN: bind/advertise addresses and gossip tuning are always copied.
	base.SerfLANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrLAN.IP.String()
	base.SerfLANConfig.MemberlistConfig.BindPort = a.config.SerfBindAddrLAN.Port
	base.SerfLANConfig.MemberlistConfig.AdvertiseAddr = a.config.SerfAdvertiseAddrLAN.IP.String()
	base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrLAN.Port
	base.SerfLANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
	base.SerfLANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
	base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.GossipLANGossipInterval
	base.SerfLANConfig.MemberlistConfig.GossipNodes = a.config.GossipLANGossipNodes
	base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.GossipLANProbeInterval
	base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipLANProbeTimeout
	base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.GossipLANSuspicionMult
	base.SerfLANConfig.MemberlistConfig.RetransmitMult = a.config.GossipLANRetransmitMult
	// A zero reconnect timeout keeps whatever the default config carries.
	if a.config.ReconnectTimeoutLAN != 0 {
		base.SerfLANConfig.ReconnectTimeout = a.config.ReconnectTimeoutLAN
	}

	// Serf WAN is only configured when a WAN bind address is present.
	if a.config.SerfBindAddrWAN != nil {
		base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrWAN.IP.String()
		base.SerfWANConfig.MemberlistConfig.BindPort = a.config.SerfBindAddrWAN.Port
		base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.SerfAdvertiseAddrWAN.IP.String()
		base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrWAN.Port
		base.SerfWANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming
		base.SerfWANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing
		base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.GossipWANGossipInterval
		base.SerfWANConfig.MemberlistConfig.GossipNodes = a.config.GossipWANGossipNodes
		base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.GossipWANProbeInterval
		base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipWANProbeTimeout
		base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.GossipWANSuspicionMult
		base.SerfWANConfig.MemberlistConfig.RetransmitMult = a.config.GossipWANRetransmitMult
		if a.config.ReconnectTimeoutWAN != 0 {
			base.SerfWANConfig.ReconnectTimeout = a.config.ReconnectTimeoutWAN
		}
	} else {
		// Disable serf WAN federation
		base.SerfWANConfig = nil
	}

	base.RPCAddr = a.config.RPCBindAddr
	base.RPCAdvertise = a.config.RPCAdvertiseAddr

	// Network segments: the per-segment configs are only built when at least
	// one segment is configured.
	base.Segment = a.config.SegmentName
	if len(a.config.Segments) > 0 {
		segments, err := a.segmentConfig()
		if err != nil {
			return nil, err
		}
		base.Segments = segments
	}

	// The settings below only override the defaults when the runtime config
	// carries a non-zero (or true) value.
	if a.config.Bootstrap {
		base.Bootstrap = true
	}
	if a.config.RejoinAfterLeave {
		base.RejoinAfterLeave = true
	}
	if a.config.BootstrapExpect != 0 {
		base.BootstrapExpect = a.config.BootstrapExpect
	}
	if a.config.RPCProtocol > 0 {
		base.ProtocolVersion = uint8(a.config.RPCProtocol)
	}
	if a.config.RaftProtocol != 0 {
		base.RaftConfig.ProtocolVersion = raft.ProtocolVersion(a.config.RaftProtocol)
	}
	if a.config.RaftSnapshotThreshold != 0 {
		base.RaftConfig.SnapshotThreshold = uint64(a.config.RaftSnapshotThreshold)
	}
	if a.config.RaftSnapshotInterval != 0 {
		base.RaftConfig.SnapshotInterval = a.config.RaftSnapshotInterval
	}
	if a.config.ACLMasterToken != "" {
		base.ACLMasterToken = a.config.ACLMasterToken
	}
	if a.config.ACLDatacenter != "" {
		base.ACLDatacenter = a.config.ACLDatacenter
	}
	if a.config.ACLTokenTTL != 0 {
		base.ACLTokenTTL = a.config.ACLTokenTTL
	}
	if a.config.ACLPolicyTTL != 0 {
		base.ACLPolicyTTL = a.config.ACLPolicyTTL
	}
	if a.config.ACLDefaultPolicy != "" {
		base.ACLDefaultPolicy = a.config.ACLDefaultPolicy
	}
	if a.config.ACLDownPolicy != "" {
		base.ACLDownPolicy = a.config.ACLDownPolicy
	}
	// These ACL flags are copied unconditionally.
	base.ACLEnforceVersion8 = a.config.ACLEnforceVersion8
	base.ACLTokenReplication = a.config.ACLTokenReplication
	base.ACLsEnabled = a.config.ACLsEnabled
	if a.config.ACLEnableKeyListPolicy {
		base.ACLEnableKeyListPolicy = a.config.ACLEnableKeyListPolicy
	}
	if a.config.SessionTTLMin != 0 {
		base.SessionTTLMin = a.config.SessionTTLMin
	}
	if a.config.NonVotingServer {
		base.NonVoter = a.config.NonVotingServer
	}

	// These are fully specified in the agent defaults, so we can simply
	// copy them over.
	base.AutopilotConfig.CleanupDeadServers = a.config.AutopilotCleanupDeadServers
	base.AutopilotConfig.LastContactThreshold = a.config.AutopilotLastContactThreshold
	base.AutopilotConfig.MaxTrailingLogs = uint64(a.config.AutopilotMaxTrailingLogs)
	base.AutopilotConfig.ServerStabilizationTime = a.config.AutopilotServerStabilizationTime
	base.AutopilotConfig.RedundancyZoneTag = a.config.AutopilotRedundancyZoneTag
	base.AutopilotConfig.DisableUpgradeMigration = a.config.AutopilotDisableUpgradeMigration
	base.AutopilotConfig.UpgradeVersionTag = a.config.AutopilotUpgradeVersionTag

	// make sure the advertise address is always set
	if base.RPCAdvertise == nil {
		base.RPCAdvertise = base.RPCAddr
	}

	// Rate limiting for RPC calls.
	if a.config.RPCRateLimit > 0 {
		base.RPCRate = a.config.RPCRateLimit
	}
	if a.config.RPCMaxBurst > 0 {
		base.RPCMaxBurst = a.config.RPCMaxBurst
	}

	// RPC-related performance configs.
	if a.config.RPCHoldTimeout > 0 {
		base.RPCHoldTimeout = a.config.RPCHoldTimeout
	}
	if a.config.LeaveDrainTime > 0 {
		base.LeaveDrainTime = a.config.LeaveDrainTime
	}

	// set the src address for outgoing rpc connections
	// Use port 0 so that outgoing connections use a random port.
	// (The Port field is left at its zero value; only the IP is set.)
	if !ipaddr.IsAny(base.RPCAddr.IP) {
		base.RPCSrcAddr = &net.TCPAddr{IP: base.RPCAddr.IP}
	}

	// Format the build string as "<version><prerelease>:<revision>", with the
	// revision truncated to at most 8 characters.
	revision := a.config.Revision
	if len(revision) > 8 {
		revision = revision[:8]
	}
	base.Build = fmt.Sprintf("%s%s:%s", a.config.Version, a.config.VersionPrerelease, revision)

	// Copy the TLS configuration
	base.VerifyIncoming = a.config.VerifyIncoming || a.config.VerifyIncomingRPC
	// UseTLS is switched on as soon as any CA source (path or file) is given.
	if a.config.CAPath != "" || a.config.CAFile != "" {
		base.UseTLS = true
	}
	base.VerifyOutgoing = a.config.VerifyOutgoing
	base.VerifyServerHostname = a.config.VerifyServerHostname
	base.CAFile = a.config.CAFile
	base.CAPath = a.config.CAPath
	base.CertFile = a.config.CertFile
	base.KeyFile = a.config.KeyFile
	base.ServerName = a.config.ServerName
	base.Domain = a.config.DNSDomain
	base.TLSMinVersion = a.config.TLSMinVersion
	base.TLSCipherSuites = a.config.TLSCipherSuites
	base.TLSPreferServerCipherSuites = a.config.TLSPreferServerCipherSuites

	// Copy the Connect CA bootstrap config
	if a.config.ConnectEnabled {
		base.ConnectEnabled = true

		// Allow config to specify cluster_id provided it's a valid UUID. This is
		// meant only for tests where a deterministic ID makes fixtures much simpler
		// to work with but since it's only read on initial cluster bootstrap it's not
		// that much of a liability in production. The worst a user could do is
		// configure logically separate clusters with same ID by mistake but we can
		// avoid documenting this is even an option.
		if clusterID, ok := a.config.ConnectCAConfig["cluster_id"]; ok {
			if cIDStr, ok := clusterID.(string); ok {
				if _, err := uuid.ParseUUID(cIDStr); err == nil {
					// Valid UUID configured, use that
					base.CAConfig.ClusterID = cIDStr
				}
			}
			if base.CAConfig.ClusterID == "" {
				// If they tried to specify an ID but typoed it don't ignore as they will
				// then bootstrap with a new ID and have to throw away the whole cluster
				// and start again.
				a.logger.Println("[ERR] connect CA config cluster_id specified but " +
					"is not a valid UUID, aborting startup")
				return nil, fmt.Errorf("cluster_id was supplied but was not a valid UUID")
			}
		}

		if a.config.ConnectCAProvider != "" {
			base.CAConfig.Provider = a.config.ConnectCAProvider
		}

		// Merge connect CA Config regardless of provider (since there are some
		// common config options valid to all like leaf TTL).
		// NOTE(review): this writes into base.CAConfig.Config and so assumes
		// consul.DefaultConfig() returns it as a non-nil map — confirm.
		for k, v := range a.config.ConnectCAConfig {
			base.CAConfig.Config[k] = v
		}
	}

	// Setup the user event callback. The event is forwarded on a.eventCh; the
	// send is abandoned if a.shutdownCh becomes readable first.
	base.UserEventHandler = func(e serf.UserEvent) {
		select {
		case a.eventCh <- e:
		case <-a.shutdownCh:
		}
	}

	// Setup the loggers
	base.LogOutput = a.LogOutput

	// This will set up the LAN keyring, as well as the WAN and any segments
	// for servers.
	if err := a.setupKeyrings(base); err != nil {
		return nil, fmt.Errorf("Failed to configure keyring: %v", err)
	}

	return base, nil
}
  1136  
  1137  // Setup the serf and memberlist config for any defined network segments.
  1138  func (a *Agent) segmentConfig() ([]consul.NetworkSegment, error) {
  1139  	var segments []consul.NetworkSegment
  1140  	config := a.config
  1141  
  1142  	for _, s := range config.Segments {
  1143  		serfConf := consul.DefaultConfig().SerfLANConfig
  1144  
  1145  		serfConf.MemberlistConfig.BindAddr = s.Bind.IP.String()
  1146  		serfConf.MemberlistConfig.BindPort = s.Bind.Port
  1147  		serfConf.MemberlistConfig.AdvertiseAddr = s.Advertise.IP.String()
  1148  		serfConf.MemberlistConfig.AdvertisePort = s.Advertise.Port
  1149  
  1150  		if config.ReconnectTimeoutLAN != 0 {
  1151  			serfConf.ReconnectTimeout = config.ReconnectTimeoutLAN
  1152  		}
  1153  		if config.EncryptVerifyIncoming {
  1154  			serfConf.MemberlistConfig.GossipVerifyIncoming = config.EncryptVerifyIncoming
  1155  		}
  1156  		if config.EncryptVerifyOutgoing {
  1157  			serfConf.MemberlistConfig.GossipVerifyOutgoing = config.EncryptVerifyOutgoing
  1158  		}
  1159  
  1160  		var rpcAddr *net.TCPAddr
  1161  		if s.RPCListener {
  1162  			rpcAddr = &net.TCPAddr{
  1163  				IP:   s.Bind.IP,
  1164  				Port: a.config.ServerPort,
  1165  			}
  1166  		}
  1167  
  1168  		segments = append(segments, consul.NetworkSegment{
  1169  			Name:       s.Name,
  1170  			Bind:       serfConf.MemberlistConfig.BindAddr,
  1171  			Advertise:  serfConf.MemberlistConfig.AdvertiseAddr,
  1172  			Port:       s.Bind.Port,
  1173  			RPCAddr:    rpcAddr,
  1174  			SerfConfig: serfConf,
  1175  		})
  1176  	}
  1177  
  1178  	return segments, nil
  1179  }
  1180  
  1181  // makeRandomID will generate a random UUID for a node.
  1182  func (a *Agent) makeRandomID() (string, error) {
  1183  	id, err := uuid.GenerateUUID()
  1184  	if err != nil {
  1185  		return "", err
  1186  	}
  1187  
  1188  	a.logger.Printf("[DEBUG] agent: Using random ID %q as node ID", id)
  1189  	return id, nil
  1190  }
  1191  
  1192  // makeNodeID will try to find a host-specific ID, or else will generate a
  1193  // random ID. The returned ID will always be formatted as a GUID. We don't tell
  1194  // the caller whether this ID is random or stable since the consequences are
  1195  // high for us if this changes, so we will persist it either way. This will let
  1196  // gopsutil change implementations without affecting in-place upgrades of nodes.
  1197  func (a *Agent) makeNodeID() (string, error) {
  1198  	// If they've disabled host-based IDs then just make a random one.
  1199  	if a.config.DisableHostNodeID {
  1200  		return a.makeRandomID()
  1201  	}
  1202  
  1203  	// Try to get a stable ID associated with the host itself.
  1204  	info, err := host.Info()
  1205  	if err != nil {
  1206  		a.logger.Printf("[DEBUG] agent: Couldn't get a unique ID from the host: %v", err)
  1207  		return a.makeRandomID()
  1208  	}
  1209  
  1210  	// Make sure the host ID parses as a UUID, since we don't have complete
  1211  	// control over this process.
  1212  	id := strings.ToLower(info.HostID)
  1213  	if _, err := uuid.ParseUUID(id); err != nil {
  1214  		a.logger.Printf("[DEBUG] agent: Unique ID %q from host isn't formatted as a UUID: %v",
  1215  			id, err)
  1216  		return a.makeRandomID()
  1217  	}
  1218  
  1219  	// Hash the input to make it well distributed. The reported Host UUID may be
  1220  	// similar across nodes if they are on a cloud provider or on motherboards
  1221  	// created from the same batch.
  1222  	buf := sha512.Sum512([]byte(id))
  1223  	id = fmt.Sprintf("%08x-%04x-%04x-%04x-%12x",
  1224  		buf[0:4],
  1225  		buf[4:6],
  1226  		buf[6:8],
  1227  		buf[8:10],
  1228  		buf[10:16])
  1229  
  1230  	a.logger.Printf("[DEBUG] agent: Using unique ID %q from host as node ID", id)
  1231  	return id, nil
  1232  }
  1233  
  1234  // setupNodeID will pull the persisted node ID, if any, or create a random one
  1235  // and persist it.
  1236  func (a *Agent) setupNodeID(config *config.RuntimeConfig) error {
  1237  	// If they've configured a node ID manually then just use that, as
  1238  	// long as it's valid.
  1239  	if config.NodeID != "" {
  1240  		config.NodeID = types.NodeID(strings.ToLower(string(config.NodeID)))
  1241  		if _, err := uuid.ParseUUID(string(config.NodeID)); err != nil {
  1242  			return err
  1243  		}
  1244  
  1245  		return nil
  1246  	}
  1247  
  1248  	// For dev mode we have no filesystem access so just make one.
  1249  	if a.config.DataDir == "" {
  1250  		id, err := a.makeNodeID()
  1251  		if err != nil {
  1252  			return err
  1253  		}
  1254  
  1255  		config.NodeID = types.NodeID(id)
  1256  		return nil
  1257  	}
  1258  
  1259  	// Load saved state, if any. Since a user could edit this, we also
  1260  	// validate it.
  1261  	fileID := filepath.Join(config.DataDir, "node-id")
  1262  	if _, err := os.Stat(fileID); err == nil {
  1263  		rawID, err := ioutil.ReadFile(fileID)
  1264  		if err != nil {
  1265  			return err
  1266  		}
  1267  
  1268  		nodeID := strings.TrimSpace(string(rawID))
  1269  		nodeID = strings.ToLower(nodeID)
  1270  		if _, err := uuid.ParseUUID(nodeID); err != nil {
  1271  			return err
  1272  		}
  1273  
  1274  		config.NodeID = types.NodeID(nodeID)
  1275  	}
  1276  
  1277  	// If we still don't have a valid node ID, make one.
  1278  	if config.NodeID == "" {
  1279  		id, err := a.makeNodeID()
  1280  		if err != nil {
  1281  			return err
  1282  		}
  1283  		if err := lib.EnsurePath(fileID, false); err != nil {
  1284  			return err
  1285  		}
  1286  		if err := ioutil.WriteFile(fileID, []byte(id), 0600); err != nil {
  1287  			return err
  1288  		}
  1289  
  1290  		config.NodeID = types.NodeID(id)
  1291  	}
  1292  	return nil
  1293  }
  1294  
  1295  // setupBaseKeyrings configures the LAN and WAN keyrings.
  1296  func (a *Agent) setupBaseKeyrings(config *consul.Config) error {
  1297  	// If the keyring file is disabled then just poke the provided key
  1298  	// into the in-memory keyring.
  1299  	federationEnabled := config.SerfWANConfig != nil
  1300  	if a.config.DisableKeyringFile {
  1301  		if a.config.EncryptKey == "" {
  1302  			return nil
  1303  		}
  1304  
  1305  		keys := []string{a.config.EncryptKey}
  1306  		if err := loadKeyring(config.SerfLANConfig, keys); err != nil {
  1307  			return err
  1308  		}
  1309  		if a.config.ServerMode && federationEnabled {
  1310  			if err := loadKeyring(config.SerfWANConfig, keys); err != nil {
  1311  				return err
  1312  			}
  1313  		}
  1314  		return nil
  1315  	}
  1316  
  1317  	// Otherwise, we need to deal with the keyring files.
  1318  	fileLAN := filepath.Join(a.config.DataDir, SerfLANKeyring)
  1319  	fileWAN := filepath.Join(a.config.DataDir, SerfWANKeyring)
  1320  
  1321  	if a.config.EncryptKey == "" {
  1322  		goto LOAD
  1323  	}
  1324  	if _, err := os.Stat(fileLAN); err != nil {
  1325  		if err := initKeyring(fileLAN, a.config.EncryptKey); err != nil {
  1326  			return err
  1327  		}
  1328  	}
  1329  	if a.config.ServerMode && federationEnabled {
  1330  		if _, err := os.Stat(fileWAN); err != nil {
  1331  			if err := initKeyring(fileWAN, a.config.EncryptKey); err != nil {
  1332  				return err
  1333  			}
  1334  		}
  1335  	}
  1336  
  1337  LOAD:
  1338  	if _, err := os.Stat(fileLAN); err == nil {
  1339  		config.SerfLANConfig.KeyringFile = fileLAN
  1340  	}
  1341  	if err := loadKeyringFile(config.SerfLANConfig); err != nil {
  1342  		return err
  1343  	}
  1344  	if a.config.ServerMode && federationEnabled {
  1345  		if _, err := os.Stat(fileWAN); err == nil {
  1346  			config.SerfWANConfig.KeyringFile = fileWAN
  1347  		}
  1348  		if err := loadKeyringFile(config.SerfWANConfig); err != nil {
  1349  			return err
  1350  		}
  1351  	}
  1352  
  1353  	return nil
  1354  }
  1355  
  1356  // setupKeyrings is used to initialize and load keyrings during agent startup.
  1357  func (a *Agent) setupKeyrings(config *consul.Config) error {
  1358  	// First set up the LAN and WAN keyrings.
  1359  	if err := a.setupBaseKeyrings(config); err != nil {
  1360  		return err
  1361  	}
  1362  
  1363  	// If there's no LAN keyring then there's nothing else to set up for
  1364  	// any segments.
  1365  	lanKeyring := config.SerfLANConfig.MemberlistConfig.Keyring
  1366  	if lanKeyring == nil {
  1367  		return nil
  1368  	}
  1369  
  1370  	// Copy the initial state of the LAN keyring into each segment config.
  1371  	// Segments don't have their own keyring file, they rely on the LAN
  1372  	// holding the state so things can't get out of sync.
  1373  	k, pk := lanKeyring.GetKeys(), lanKeyring.GetPrimaryKey()
  1374  	for _, segment := range config.Segments {
  1375  		keyring, err := memberlist.NewKeyring(k, pk)
  1376  		if err != nil {
  1377  			return err
  1378  		}
  1379  		segment.SerfConfig.MemberlistConfig.Keyring = keyring
  1380  	}
  1381  	return nil
  1382  }
  1383  
  1384  // registerEndpoint registers a handler for the consul RPC server
  1385  // under a unique name while making it accessible under the provided
  1386  // name. This allows overwriting handlers for the golang net/rpc
  1387  // service which does not allow this.
  1388  func (a *Agent) registerEndpoint(name string, handler interface{}) error {
  1389  	srv, ok := a.delegate.(*consul.Server)
  1390  	if !ok {
  1391  		panic("agent must be a server")
  1392  	}
  1393  	realname := fmt.Sprintf("%s-%d", name, time.Now().UnixNano())
  1394  	a.endpointsLock.Lock()
  1395  	a.endpoints[name] = realname
  1396  	a.endpointsLock.Unlock()
  1397  	return srv.RegisterEndpoint(realname, handler)
  1398  }
  1399  
  1400  // RPC is used to make an RPC call to the Consul servers
  1401  // This allows the agent to implement the Consul.Interface
  1402  func (a *Agent) RPC(method string, args interface{}, reply interface{}) error {
  1403  	a.endpointsLock.RLock()
  1404  	// fast path: only translate if there are overrides
  1405  	if len(a.endpoints) > 0 {
  1406  		p := strings.SplitN(method, ".", 2)
  1407  		if e := a.endpoints[p[0]]; e != "" {
  1408  			method = e + "." + p[1]
  1409  		}
  1410  	}
  1411  	a.endpointsLock.RUnlock()
  1412  	return a.delegate.RPC(method, args, reply)
  1413  }
  1414  
  1415  // SnapshotRPC performs the requested snapshot RPC against the Consul server in
  1416  // a streaming manner. The contents of in will be read and passed along as the
  1417  // payload, and the response message will determine the error status, and any
  1418  // return payload will be written to out.
  1419  func (a *Agent) SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer,
  1420  	replyFn structs.SnapshotReplyFn) error {
  1421  	return a.delegate.SnapshotRPC(args, in, out, replyFn)
  1422  }
  1423  
  1424  // Leave is used to prepare the agent for a graceful shutdown
  1425  func (a *Agent) Leave() error {
  1426  	return a.delegate.Leave()
  1427  }
  1428  
  1429  // ShutdownAgent is used to hard stop the agent. Should be preceded by
  1430  // Leave to do it gracefully. Should be followed by ShutdownEndpoints to
  1431  // terminate the HTTP and DNS servers as well.
  1432  func (a *Agent) ShutdownAgent() error {
  1433  	a.shutdownLock.Lock()
  1434  	defer a.shutdownLock.Unlock()
  1435  
  1436  	if a.shutdown {
  1437  		return nil
  1438  	}
  1439  	a.logger.Println("[INFO] agent: Requesting shutdown")
  1440  
  1441  	// Stop all the checks
  1442  	a.stateLock.Lock()
  1443  	defer a.stateLock.Unlock()
  1444  	for _, chk := range a.checkMonitors {
  1445  		chk.Stop()
  1446  	}
  1447  	for _, chk := range a.checkTTLs {
  1448  		chk.Stop()
  1449  	}
  1450  	for _, chk := range a.checkHTTPs {
  1451  		chk.Stop()
  1452  	}
  1453  	for _, chk := range a.checkTCPs {
  1454  		chk.Stop()
  1455  	}
  1456  	for _, chk := range a.checkGRPCs {
  1457  		chk.Stop()
  1458  	}
  1459  	for _, chk := range a.checkDockers {
  1460  		chk.Stop()
  1461  	}
  1462  	for _, chk := range a.checkAliases {
  1463  		chk.Stop()
  1464  	}
  1465  
  1466  	// Stop gRPC
  1467  	if a.grpcServer != nil {
  1468  		a.grpcServer.Stop()
  1469  	}
  1470  
  1471  	// Stop the proxy config manager
  1472  	if a.proxyConfig != nil {
  1473  		a.proxyConfig.Close()
  1474  	}
  1475  
  1476  	// Stop the proxy process manager
  1477  	if a.proxyManager != nil {
  1478  		// If persistence is disabled (implies DevMode but a subset of DevMode) then
  1479  		// don't leave the proxies running since the agent will not be able to
  1480  		// recover them later.
  1481  		if a.config.DataDir == "" {
  1482  			a.logger.Printf("[WARN] agent: dev mode disabled persistence, killing " +
  1483  				"all proxies since we can't recover them")
  1484  			if err := a.proxyManager.Kill(); err != nil {
  1485  				a.logger.Printf("[WARN] agent: error shutting down proxy manager: %s", err)
  1486  			}
  1487  		} else {
  1488  			if err := a.proxyManager.Close(); err != nil {
  1489  				a.logger.Printf("[WARN] agent: error shutting down proxy manager: %s", err)
  1490  			}
  1491  		}
  1492  	}
  1493  
  1494  	// Stop the cache background work
  1495  	if a.cache != nil {
  1496  		a.cache.Close()
  1497  	}
  1498  
  1499  	var err error
  1500  	if a.delegate != nil {
  1501  		err = a.delegate.Shutdown()
  1502  		if _, ok := a.delegate.(*consul.Server); ok {
  1503  			a.logger.Print("[INFO] agent: consul server down")
  1504  		} else {
  1505  			a.logger.Print("[INFO] agent: consul client down")
  1506  		}
  1507  	}
  1508  
  1509  	pidErr := a.deletePid()
  1510  	if pidErr != nil {
  1511  		a.logger.Println("[WARN] agent: could not delete pid file ", pidErr)
  1512  	}
  1513  
  1514  	a.logger.Println("[INFO] agent: shutdown complete")
  1515  	a.shutdown = true
  1516  	close(a.shutdownCh)
  1517  	return err
  1518  }
  1519  
  1520  // ShutdownEndpoints terminates the HTTP and DNS servers. Should be
  1521  // preceded by ShutdownAgent.
  1522  func (a *Agent) ShutdownEndpoints() {
  1523  	a.shutdownLock.Lock()
  1524  	defer a.shutdownLock.Unlock()
  1525  
  1526  	if len(a.dnsServers) == 0 && len(a.httpServers) == 0 {
  1527  		return
  1528  	}
  1529  
  1530  	for _, srv := range a.dnsServers {
  1531  		a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net)
  1532  		srv.Shutdown()
  1533  	}
  1534  	a.dnsServers = nil
  1535  
  1536  	for _, srv := range a.httpServers {
  1537  		a.logger.Printf("[INFO] agent: Stopping %s server %s (%s)", strings.ToUpper(srv.proto), srv.ln.Addr().String(), srv.ln.Addr().Network())
  1538  		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
  1539  		defer cancel()
  1540  		srv.Shutdown(ctx)
  1541  		if ctx.Err() == context.DeadlineExceeded {
  1542  			a.logger.Printf("[WARN] agent: Timeout stopping %s server %s (%s)", strings.ToUpper(srv.proto), srv.ln.Addr().String(), srv.ln.Addr().Network())
  1543  		}
  1544  	}
  1545  	a.httpServers = nil
  1546  
  1547  	a.logger.Println("[INFO] agent: Waiting for endpoints to shut down")
  1548  	a.wgServers.Wait()
  1549  	a.logger.Print("[INFO] agent: Endpoints down")
  1550  }
  1551  
  1552  // ReloadCh is used to return a channel that can be
  1553  // used for triggering reloads and returning a response.
  1554  func (a *Agent) ReloadCh() chan chan error {
  1555  	return a.reloadCh
  1556  }
  1557  
  1558  // RetryJoinCh is a channel that transports errors
  1559  // from the retry join process.
  1560  func (a *Agent) RetryJoinCh() <-chan error {
  1561  	return a.retryJoinCh
  1562  }
  1563  
  1564  // ShutdownCh is used to return a channel that can be
  1565  // selected to wait for the agent to perform a shutdown.
  1566  func (a *Agent) ShutdownCh() <-chan struct{} {
  1567  	return a.shutdownCh
  1568  }
  1569  
  1570  // JoinLAN is used to have the agent join a LAN cluster
  1571  func (a *Agent) JoinLAN(addrs []string) (n int, err error) {
  1572  	a.logger.Printf("[INFO] agent: (LAN) joining: %v", addrs)
  1573  	n, err = a.delegate.JoinLAN(addrs)
  1574  	a.logger.Printf("[INFO] agent: (LAN) joined: %d Err: %v", n, err)
  1575  	if err == nil && a.joinLANNotifier != nil {
  1576  		if notifErr := a.joinLANNotifier.Notify(systemd.Ready); notifErr != nil {
  1577  			a.logger.Printf("[DEBUG] agent: systemd notify failed: %v", notifErr)
  1578  		}
  1579  	}
  1580  	return
  1581  }
  1582  
  1583  // JoinWAN is used to have the agent join a WAN cluster
  1584  func (a *Agent) JoinWAN(addrs []string) (n int, err error) {
  1585  	a.logger.Printf("[INFO] agent: (WAN) joining: %v", addrs)
  1586  	if srv, ok := a.delegate.(*consul.Server); ok {
  1587  		n, err = srv.JoinWAN(addrs)
  1588  	} else {
  1589  		err = fmt.Errorf("Must be a server to join WAN cluster")
  1590  	}
  1591  	a.logger.Printf("[INFO] agent: (WAN) joined: %d Err: %v", n, err)
  1592  	return
  1593  }
  1594  
  1595  // ForceLeave is used to remove a failed node from the cluster
  1596  func (a *Agent) ForceLeave(node string) (err error) {
  1597  	a.logger.Printf("[INFO] agent: Force leaving node: %v", node)
  1598  	err = a.delegate.RemoveFailedNode(node)
  1599  	if err != nil {
  1600  		a.logger.Printf("[WARN] agent: Failed to remove node: %v", err)
  1601  	}
  1602  	return err
  1603  }
  1604  
  1605  // LocalMember is used to return the local node
  1606  func (a *Agent) LocalMember() serf.Member {
  1607  	return a.delegate.LocalMember()
  1608  }
  1609  
  1610  // LANMembers is used to retrieve the LAN members
  1611  func (a *Agent) LANMembers() []serf.Member {
  1612  	return a.delegate.LANMembers()
  1613  }
  1614  
  1615  // WANMembers is used to retrieve the WAN members
  1616  func (a *Agent) WANMembers() []serf.Member {
  1617  	if srv, ok := a.delegate.(*consul.Server); ok {
  1618  		return srv.WANMembers()
  1619  	}
  1620  	return nil
  1621  }
  1622  
  1623  // StartSync is called once Services and Checks are registered.
  1624  // This is called to prevent a race between clients and the anti-entropy routines
  1625  func (a *Agent) StartSync() {
  1626  	go a.sync.Run()
  1627  	a.logger.Printf("[INFO] agent: started state syncer")
  1628  }
  1629  
  1630  // PauseSync is used to pause anti-entropy while bulk changes are made. It also
  1631  // sets state that agent-local watches use to "ride out" config reloads and bulk
  1632  // updates which might spuriously unload state and reload it again.
  1633  func (a *Agent) PauseSync() {
  1634  	// Do this outside of lock as it has it's own locking
  1635  	a.sync.Pause()
  1636  
  1637  	// Coordinate local state watchers
  1638  	a.syncMu.Lock()
  1639  	defer a.syncMu.Unlock()
  1640  	if a.syncCh == nil {
  1641  		a.syncCh = make(chan struct{})
  1642  	}
  1643  }
  1644  
  1645  // ResumeSync is used to unpause anti-entropy after bulk changes are make
  1646  func (a *Agent) ResumeSync() {
  1647  	// a.sync maintains a stack/ref count of Pause calls since we call
  1648  	// Pause/Resume in nested way during a reload and AddService. We only want to
  1649  	// trigger local state watchers if this Resume call actually started sync back
  1650  	// up again (i.e. was the last resume on the stack). We could check that
  1651  	// separately with a.sync.Paused but that is racey since another Pause call
  1652  	// might be made between our Resume and checking Paused.
  1653  	resumed := a.sync.Resume()
  1654  
  1655  	if !resumed {
  1656  		// Return early so we don't notify local watchers until we are actually
  1657  		// resumed.
  1658  		return
  1659  	}
  1660  
  1661  	// Coordinate local state watchers
  1662  	a.syncMu.Lock()
  1663  	defer a.syncMu.Unlock()
  1664  
  1665  	if a.syncCh != nil {
  1666  		close(a.syncCh)
  1667  		a.syncCh = nil
  1668  	}
  1669  }
  1670  
  1671  // syncPausedCh returns either a channel or nil. If nil sync is not paused. If
  1672  // non-nil, the channel will be closed when sync resumes.
  1673  func (a *Agent) syncPausedCh() <-chan struct{} {
  1674  	a.syncMu.Lock()
  1675  	defer a.syncMu.Unlock()
  1676  	return a.syncCh
  1677  }
  1678  
  1679  // GetLANCoordinate returns the coordinates of this node in the local pools
  1680  // (assumes coordinates are enabled, so check that before calling).
  1681  func (a *Agent) GetLANCoordinate() (lib.CoordinateSet, error) {
  1682  	return a.delegate.GetLANCoordinate()
  1683  }
  1684  
  1685  // sendCoordinate is a long-running loop that periodically sends our coordinate
  1686  // to the server. Closing the agent's shutdownChannel will cause this to exit.
  1687  func (a *Agent) sendCoordinate() {
  1688  OUTER:
  1689  	for {
  1690  		rate := a.config.SyncCoordinateRateTarget
  1691  		min := a.config.SyncCoordinateIntervalMin
  1692  		intv := lib.RateScaledInterval(rate, min, len(a.LANMembers()))
  1693  		intv = intv + lib.RandomStagger(intv)
  1694  
  1695  		select {
  1696  		case <-time.After(intv):
  1697  			members := a.LANMembers()
  1698  			grok, err := consul.CanServersUnderstandProtocol(members, 3)
  1699  			if err != nil {
  1700  				a.logger.Printf("[ERR] agent: Failed to check servers: %s", err)
  1701  				continue
  1702  			}
  1703  			if !grok {
  1704  				a.logger.Printf("[DEBUG] agent: Skipping coordinate updates until servers are upgraded")
  1705  				continue
  1706  			}
  1707  
  1708  			cs, err := a.GetLANCoordinate()
  1709  			if err != nil {
  1710  				a.logger.Printf("[ERR] agent: Failed to get coordinate: %s", err)
  1711  				continue
  1712  			}
  1713  
  1714  			for segment, coord := range cs {
  1715  				req := structs.CoordinateUpdateRequest{
  1716  					Datacenter:   a.config.Datacenter,
  1717  					Node:         a.config.NodeName,
  1718  					Segment:      segment,
  1719  					Coord:        coord,
  1720  					WriteRequest: structs.WriteRequest{Token: a.tokens.AgentToken()},
  1721  				}
  1722  				var reply struct{}
  1723  				if err := a.RPC("Coordinate.Update", &req, &reply); err != nil {
  1724  					if acl.IsErrPermissionDenied(err) {
  1725  						a.logger.Printf("[WARN] agent: Coordinate update blocked by ACLs")
  1726  					} else {
  1727  						a.logger.Printf("[ERR] agent: Coordinate update error: %v", err)
  1728  					}
  1729  					continue OUTER
  1730  				}
  1731  			}
  1732  		case <-a.shutdownCh:
  1733  			return
  1734  		}
  1735  	}
  1736  }
  1737  
  1738  // reapServicesInternal does a single pass, looking for services to reap.
  1739  func (a *Agent) reapServicesInternal() {
  1740  	reaped := make(map[string]bool)
  1741  	for checkID, cs := range a.State.CriticalCheckStates() {
  1742  		serviceID := cs.Check.ServiceID
  1743  
  1744  		// There's nothing to do if there's no service.
  1745  		if serviceID == "" {
  1746  			continue
  1747  		}
  1748  
  1749  		// There might be multiple checks for one service, so
  1750  		// we don't need to reap multiple times.
  1751  		if reaped[serviceID] {
  1752  			continue
  1753  		}
  1754  
  1755  		// See if there's a timeout.
  1756  		// todo(fs): this looks fishy... why is there another data structure in the agent with its own lock?
  1757  		a.stateLock.Lock()
  1758  		timeout := a.checkReapAfter[checkID]
  1759  		a.stateLock.Unlock()
  1760  
  1761  		// Reap, if necessary. We keep track of which service
  1762  		// this is so that we won't try to remove it again.
  1763  		if timeout > 0 && cs.CriticalFor() > timeout {
  1764  			reaped[serviceID] = true
  1765  			if err := a.RemoveService(serviceID, true); err != nil {
  1766  				a.logger.Printf("[ERR] agent: unable to deregister service %q after check %q has been critical for too long: %s",
  1767  					serviceID, checkID, err)
  1768  			} else {
  1769  				a.logger.Printf("[INFO] agent: Check %q for service %q has been critical for too long; deregistered service",
  1770  					checkID, serviceID)
  1771  			}
  1772  		}
  1773  	}
  1774  }
  1775  
  1776  // reapServices is a long running goroutine that looks for checks that have been
  1777  // critical too long and deregisters their associated services.
  1778  func (a *Agent) reapServices() {
  1779  	for {
  1780  		select {
  1781  		case <-time.After(a.config.CheckReapInterval):
  1782  			a.reapServicesInternal()
  1783  
  1784  		case <-a.shutdownCh:
  1785  			return
  1786  		}
  1787  	}
  1788  
  1789  }
  1790  
// persistedService is the on-disk wrapper for a service definition. It
// bundles the definition with an ACL token so that both can be restored at a
// later agent start.
//
// Token is the token stored alongside the definition (persistService fills it
// from a.State.ServiceToken for the service ID); Service is the definition
// itself. The struct is serialized with encoding/json using these field names.
type persistedService struct {
	Token   string
	Service *structs.NodeService
}
  1797  
  1798  // persistService saves a service definition to a JSON file in the data dir
  1799  func (a *Agent) persistService(service *structs.NodeService) error {
  1800  	svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(service.ID))
  1801  
  1802  	wrapped := persistedService{
  1803  		Token:   a.State.ServiceToken(service.ID),
  1804  		Service: service,
  1805  	}
  1806  	encoded, err := json.Marshal(wrapped)
  1807  	if err != nil {
  1808  		return err
  1809  	}
  1810  
  1811  	return file.WriteAtomic(svcPath, encoded)
  1812  }
  1813  
  1814  // purgeService removes a persisted service definition file from the data dir
  1815  func (a *Agent) purgeService(serviceID string) error {
  1816  	svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(serviceID))
  1817  	if _, err := os.Stat(svcPath); err == nil {
  1818  		return os.Remove(svcPath)
  1819  	}
  1820  	return nil
  1821  }
  1822  
// persistedProxy is the on-disk wrapper for a managed proxy definition. It
// bundles the definition with its proxy token so that a running proxy can
// still be authenticated after an agent restart.
//
// ProxyToken is the token stored alongside the definition and Proxy is the
// definition itself. The struct is serialized with encoding/json using these
// field names.
type persistedProxy struct {
	ProxyToken string
	Proxy      *structs.ConnectManagedProxy

	// Set to true when the proxy information originated from the agent's
	// configuration as opposed to API registration.
	FromFile bool
}
  1833  
  1834  // persistProxy saves a proxy definition to a JSON file in the data dir
  1835  func (a *Agent) persistProxy(proxy *local.ManagedProxy, FromFile bool) error {
  1836  	proxyPath := filepath.Join(a.config.DataDir, proxyDir,
  1837  		stringHash(proxy.Proxy.ProxyService.ID))
  1838  
  1839  	wrapped := persistedProxy{
  1840  		ProxyToken: proxy.ProxyToken,
  1841  		Proxy:      proxy.Proxy,
  1842  		FromFile:   FromFile,
  1843  	}
  1844  	encoded, err := json.Marshal(wrapped)
  1845  	if err != nil {
  1846  		return err
  1847  	}
  1848  
  1849  	return file.WriteAtomic(proxyPath, encoded)
  1850  }
  1851  
  1852  // purgeProxy removes a persisted proxy definition file from the data dir
  1853  func (a *Agent) purgeProxy(proxyID string) error {
  1854  	proxyPath := filepath.Join(a.config.DataDir, proxyDir, stringHash(proxyID))
  1855  	if _, err := os.Stat(proxyPath); err == nil {
  1856  		return os.Remove(proxyPath)
  1857  	}
  1858  	return nil
  1859  }
  1860  
  1861  // persistCheck saves a check definition to the local agent's state directory
  1862  func (a *Agent) persistCheck(check *structs.HealthCheck, chkType *structs.CheckType) error {
  1863  	checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(check.CheckID))
  1864  
  1865  	// Create the persisted check
  1866  	wrapped := persistedCheck{
  1867  		Check:   check,
  1868  		ChkType: chkType,
  1869  		Token:   a.State.CheckToken(check.CheckID),
  1870  	}
  1871  
  1872  	encoded, err := json.Marshal(wrapped)
  1873  	if err != nil {
  1874  		return err
  1875  	}
  1876  
  1877  	return file.WriteAtomic(checkPath, encoded)
  1878  }
  1879  
  1880  // purgeCheck removes a persisted check definition file from the data dir
  1881  func (a *Agent) purgeCheck(checkID types.CheckID) error {
  1882  	checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(checkID))
  1883  	if _, err := os.Stat(checkPath); err == nil {
  1884  		return os.Remove(checkPath)
  1885  	}
  1886  	return nil
  1887  }
  1888  
  1889  // AddService is used to add a service entry.
  1890  // This entry is persistent and the agent will make a best effort to
  1891  // ensure it is registered
  1892  func (a *Agent) AddService(service *structs.NodeService, chkTypes []*structs.CheckType, persist bool, token string, source configSource) error {
  1893  	a.stateLock.Lock()
  1894  	defer a.stateLock.Unlock()
  1895  	return a.addServiceLocked(service, chkTypes, persist, token, source)
  1896  }
  1897  
  1898  func (a *Agent) addServiceLocked(service *structs.NodeService, chkTypes []*structs.CheckType, persist bool, token string, source configSource) error {
  1899  	if service.Service == "" {
  1900  		return fmt.Errorf("Service name missing")
  1901  	}
  1902  	if service.ID == "" && service.Service != "" {
  1903  		service.ID = service.Service
  1904  	}
  1905  	for _, check := range chkTypes {
  1906  		if err := check.Validate(); err != nil {
  1907  			return fmt.Errorf("Check is not valid: %v", err)
  1908  		}
  1909  	}
  1910  
  1911  	// Set default weights if not specified. This is important as it ensures AE
  1912  	// doesn't consider the service different since it has nil weights.
  1913  	if service.Weights == nil {
  1914  		service.Weights = &structs.Weights{Passing: 1, Warning: 1}
  1915  	}
  1916  
  1917  	// Warn if the service name is incompatible with DNS
  1918  	if InvalidDnsRe.MatchString(service.Service) {
  1919  		a.logger.Printf("[WARN] agent: Service name %q will not be discoverable "+
  1920  			"via DNS due to invalid characters. Valid characters include "+
  1921  			"all alpha-numerics and dashes.", service.Service)
  1922  	} else if len(service.Service) > MaxDNSLabelLength {
  1923  		a.logger.Printf("[WARN] agent: Service name %q will not be discoverable "+
  1924  			"via DNS due to it being too long. Valid lengths are between "+
  1925  			"1 and 63 bytes.", service.Service)
  1926  	}
  1927  
  1928  	// Warn if any tags are incompatible with DNS
  1929  	for _, tag := range service.Tags {
  1930  		if InvalidDnsRe.MatchString(tag) {
  1931  			a.logger.Printf("[DEBUG] agent: Service tag %q will not be discoverable "+
  1932  				"via DNS due to invalid characters. Valid characters include "+
  1933  				"all alpha-numerics and dashes.", tag)
  1934  		} else if len(tag) > MaxDNSLabelLength {
  1935  			a.logger.Printf("[DEBUG] agent: Service tag %q will not be discoverable "+
  1936  				"via DNS due to it being too long. Valid lengths are between "+
  1937  				"1 and 63 bytes.", tag)
  1938  		}
  1939  	}
  1940  
  1941  	// Pause the service syncs during modification
  1942  	a.PauseSync()
  1943  	defer a.ResumeSync()
  1944  
  1945  	// Take a snapshot of the current state of checks (if any), and
  1946  	// restore them before resuming anti-entropy.
  1947  	snap := a.snapshotCheckState()
  1948  	defer a.restoreCheckState(snap)
  1949  
  1950  	var checks []*structs.HealthCheck
  1951  
  1952  	// Create an associated health check
  1953  	for i, chkType := range chkTypes {
  1954  		checkID := string(chkType.CheckID)
  1955  		if checkID == "" {
  1956  			checkID = fmt.Sprintf("service:%s", service.ID)
  1957  			if len(chkTypes) > 1 {
  1958  				checkID += fmt.Sprintf(":%d", i+1)
  1959  			}
  1960  		}
  1961  		name := chkType.Name
  1962  		if name == "" {
  1963  			name = fmt.Sprintf("Service '%s' check", service.Service)
  1964  		}
  1965  		check := &structs.HealthCheck{
  1966  			Node:        a.config.NodeName,
  1967  			CheckID:     types.CheckID(checkID),
  1968  			Name:        name,
  1969  			Status:      api.HealthCritical,
  1970  			Notes:       chkType.Notes,
  1971  			ServiceID:   service.ID,
  1972  			ServiceName: service.Service,
  1973  			ServiceTags: service.Tags,
  1974  		}
  1975  		if chkType.Status != "" {
  1976  			check.Status = chkType.Status
  1977  		}
  1978  
  1979  		checks = append(checks, check)
  1980  	}
  1981  
  1982  	// cleanup, store the ids of services and checks that weren't previously
  1983  	// registered so we clean them up if somthing fails halfway through the
  1984  	// process.
  1985  	var cleanupServices []string
  1986  	var cleanupChecks []types.CheckID
  1987  
  1988  	if s := a.State.Service(service.ID); s == nil {
  1989  		cleanupServices = append(cleanupServices, service.ID)
  1990  	}
  1991  
  1992  	for _, check := range checks {
  1993  		if c := a.State.Check(check.CheckID); c == nil {
  1994  			cleanupChecks = append(cleanupChecks, check.CheckID)
  1995  		}
  1996  	}
  1997  
  1998  	err := a.State.AddServiceWithChecks(service, checks, token)
  1999  	if err != nil {
  2000  		a.cleanupRegistration(cleanupServices, cleanupChecks)
  2001  		return err
  2002  	}
  2003  
  2004  	for i := range checks {
  2005  		if err := a.addCheck(checks[i], chkTypes[i], service, persist, token, source); err != nil {
  2006  			a.cleanupRegistration(cleanupServices, cleanupChecks)
  2007  			return err
  2008  		}
  2009  
  2010  		if persist && a.config.DataDir != "" {
  2011  			if err := a.persistCheck(checks[i], chkTypes[i]); err != nil {
  2012  				a.cleanupRegistration(cleanupServices, cleanupChecks)
  2013  				return err
  2014  
  2015  			}
  2016  		}
  2017  	}
  2018  
  2019  	// Persist the service to a file
  2020  	if persist && a.config.DataDir != "" {
  2021  		if err := a.persistService(service); err != nil {
  2022  			a.cleanupRegistration(cleanupServices, cleanupChecks)
  2023  			return err
  2024  		}
  2025  	}
  2026  
  2027  	return nil
  2028  }
  2029  
  2030  // cleanupRegistration is called on  registration error to ensure no there are no
  2031  // leftovers after a partial failure
  2032  func (a *Agent) cleanupRegistration(serviceIDs []string, checksIDs []types.CheckID) {
  2033  	for _, s := range serviceIDs {
  2034  		if err := a.State.RemoveService(s); err != nil {
  2035  			a.logger.Printf("[ERR] consul: service registration: cleanup: failed to remove service %s: %s", s, err)
  2036  		}
  2037  		if err := a.purgeService(s); err != nil {
  2038  			a.logger.Printf("[ERR] consul: service registration: cleanup: failed to purge service %s file: %s", s, err)
  2039  		}
  2040  	}
  2041  
  2042  	for _, c := range checksIDs {
  2043  		a.cancelCheckMonitors(c)
  2044  		if err := a.State.RemoveCheck(c); err != nil {
  2045  			a.logger.Printf("[ERR] consul: service registration: cleanup: failed to remove check %s: %s", c, err)
  2046  		}
  2047  		if err := a.purgeCheck(c); err != nil {
  2048  			a.logger.Printf("[ERR] consul: service registration: cleanup: failed to purge check %s file: %s", c, err)
  2049  		}
  2050  	}
  2051  }
  2052  
  2053  // RemoveService is used to remove a service entry.
  2054  // The agent will make a best effort to ensure it is deregistered
  2055  func (a *Agent) RemoveService(serviceID string, persist bool) error {
  2056  	a.stateLock.Lock()
  2057  	defer a.stateLock.Unlock()
  2058  	return a.removeServiceLocked(serviceID, persist)
  2059  }
  2060  
  2061  // removeServiceLocked is used to remove a service entry.
  2062  // The agent will make a best effort to ensure it is deregistered
  2063  func (a *Agent) removeServiceLocked(serviceID string, persist bool) error {
  2064  	// Validate ServiceID
  2065  	if serviceID == "" {
  2066  		return fmt.Errorf("ServiceID missing")
  2067  	}
  2068  
  2069  	checks := a.State.Checks()
  2070  	var checkIDs []types.CheckID
  2071  	for id, check := range checks {
  2072  		if check.ServiceID != serviceID {
  2073  			continue
  2074  		}
  2075  		checkIDs = append(checkIDs, id)
  2076  	}
  2077  
  2078  	// Remove the associated managed proxy if it exists
  2079  	// This has to be DONE before purging configuration as might might have issues
  2080  	// With ACLs otherwise
  2081  	for proxyID, p := range a.State.Proxies() {
  2082  		if p.Proxy.TargetServiceID == serviceID {
  2083  			if err := a.removeProxyLocked(proxyID, true); err != nil {
  2084  				return err
  2085  			}
  2086  		}
  2087  	}
  2088  
  2089  	// Remove service immediately
  2090  	if err := a.State.RemoveServiceWithChecks(serviceID, checkIDs); err != nil {
  2091  		a.logger.Printf("[WARN] agent: Failed to deregister service %q: %s", serviceID, err)
  2092  		return nil
  2093  	}
  2094  
  2095  	// Remove the service from the data dir
  2096  	if persist {
  2097  		if err := a.purgeService(serviceID); err != nil {
  2098  			return err
  2099  		}
  2100  	}
  2101  
  2102  	// Deregister any associated health checks
  2103  	for checkID, check := range checks {
  2104  		if check.ServiceID != serviceID {
  2105  			continue
  2106  		}
  2107  		if err := a.removeCheckLocked(checkID, persist); err != nil {
  2108  			return err
  2109  		}
  2110  	}
  2111  
  2112  	a.logger.Printf("[DEBUG] agent: removed service %q", serviceID)
  2113  
  2114  	// If any Sidecar services exist for the removed service ID, remove them too.
  2115  	if sidecar := a.State.Service(a.sidecarServiceID(serviceID)); sidecar != nil {
  2116  		// Double check that it's not just an ID collision and we actually added
  2117  		// this from a sidecar.
  2118  		if sidecar.LocallyRegisteredAsSidecar {
  2119  			// Remove it!
  2120  			err := a.removeServiceLocked(a.sidecarServiceID(serviceID), persist)
  2121  			if err != nil {
  2122  				return err
  2123  			}
  2124  		}
  2125  	}
  2126  
  2127  	return nil
  2128  }
  2129  
  2130  // AddCheck is used to add a health check to the agent.
  2131  // This entry is persistent and the agent will make a best effort to
  2132  // ensure it is registered. The Check may include a CheckType which
  2133  // is used to automatically update the check status
  2134  func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, persist bool, token string, source configSource) error {
  2135  	a.stateLock.Lock()
  2136  	defer a.stateLock.Unlock()
  2137  	return a.addCheckLocked(check, chkType, persist, token, source)
  2138  }
  2139  
  2140  func (a *Agent) addCheckLocked(check *structs.HealthCheck, chkType *structs.CheckType, persist bool, token string, source configSource) error {
  2141  	var service *structs.NodeService
  2142  
  2143  	if check.ServiceID != "" {
  2144  		service = a.State.Service(check.ServiceID)
  2145  		if service == nil {
  2146  			return fmt.Errorf("ServiceID %q does not exist", check.ServiceID)
  2147  		}
  2148  	}
  2149  
  2150  	// snapshot the current state of the health check to avoid potential flapping
  2151  	existing := a.State.Check(check.CheckID)
  2152  	defer func() {
  2153  		if existing != nil {
  2154  			a.State.UpdateCheck(check.CheckID, existing.Status, existing.Output)
  2155  		}
  2156  	}()
  2157  
  2158  	err := a.addCheck(check, chkType, service, persist, token, source)
  2159  	if err != nil {
  2160  		a.State.RemoveCheck(check.CheckID)
  2161  		return err
  2162  	}
  2163  
  2164  	// Add to the local state for anti-entropy
  2165  	err = a.State.AddCheck(check, token)
  2166  	if err != nil {
  2167  		return err
  2168  	}
  2169  
  2170  	// Persist the check
  2171  	if persist && a.config.DataDir != "" {
  2172  		return a.persistCheck(check, chkType)
  2173  	}
  2174  
  2175  	return nil
  2176  }
  2177  
  2178  func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType, service *structs.NodeService, persist bool, token string, source configSource) error {
  2179  	if check.CheckID == "" {
  2180  		return fmt.Errorf("CheckID missing")
  2181  	}
  2182  
  2183  	if chkType != nil {
  2184  		if err := chkType.Validate(); err != nil {
  2185  			return fmt.Errorf("Check is not valid: %v", err)
  2186  		}
  2187  
  2188  		if chkType.IsScript() {
  2189  			if source == ConfigSourceLocal && !a.config.EnableLocalScriptChecks {
  2190  				return fmt.Errorf("Scripts are disabled on this agent; to enable, configure 'enable_script_checks' or 'enable_local_script_checks' to true")
  2191  			}
  2192  
  2193  			if source == ConfigSourceRemote && !a.config.EnableRemoteScriptChecks {
  2194  				return fmt.Errorf("Scripts are disabled on this agent from remote calls; to enable, configure 'enable_script_checks' to true")
  2195  			}
  2196  		}
  2197  	}
  2198  
  2199  	if check.ServiceID != "" {
  2200  		check.ServiceName = service.Service
  2201  		check.ServiceTags = service.Tags
  2202  	}
  2203  
  2204  	// Check if already registered
  2205  	if chkType != nil {
  2206  		switch {
  2207  
  2208  		case chkType.IsTTL():
  2209  			if existing, ok := a.checkTTLs[check.CheckID]; ok {
  2210  				existing.Stop()
  2211  				delete(a.checkTTLs, check.CheckID)
  2212  			}
  2213  
  2214  			ttl := &checks.CheckTTL{
  2215  				Notify:  a.State,
  2216  				CheckID: check.CheckID,
  2217  				TTL:     chkType.TTL,
  2218  				Logger:  a.logger,
  2219  			}
  2220  
  2221  			// Restore persisted state, if any
  2222  			if err := a.loadCheckState(check); err != nil {
  2223  				a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s",
  2224  					check.CheckID, err)
  2225  			}
  2226  
  2227  			ttl.Start()
  2228  			a.checkTTLs[check.CheckID] = ttl
  2229  
  2230  		case chkType.IsHTTP():
  2231  			if existing, ok := a.checkHTTPs[check.CheckID]; ok {
  2232  				existing.Stop()
  2233  				delete(a.checkHTTPs, check.CheckID)
  2234  			}
  2235  			if chkType.Interval < checks.MinInterval {
  2236  				a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  2237  					check.CheckID, checks.MinInterval))
  2238  				chkType.Interval = checks.MinInterval
  2239  			}
  2240  
  2241  			tlsClientConfig := a.tlsConfigurator.OutgoingTLSConfigForCheck(chkType.TLSSkipVerify)
  2242  
  2243  			http := &checks.CheckHTTP{
  2244  				Notify:          a.State,
  2245  				CheckID:         check.CheckID,
  2246  				HTTP:            chkType.HTTP,
  2247  				Header:          chkType.Header,
  2248  				Method:          chkType.Method,
  2249  				Interval:        chkType.Interval,
  2250  				Timeout:         chkType.Timeout,
  2251  				Logger:          a.logger,
  2252  				TLSClientConfig: tlsClientConfig,
  2253  			}
  2254  			http.Start()
  2255  			a.checkHTTPs[check.CheckID] = http
  2256  
  2257  		case chkType.IsTCP():
  2258  			if existing, ok := a.checkTCPs[check.CheckID]; ok {
  2259  				existing.Stop()
  2260  				delete(a.checkTCPs, check.CheckID)
  2261  			}
  2262  			if chkType.Interval < checks.MinInterval {
  2263  				a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  2264  					check.CheckID, checks.MinInterval))
  2265  				chkType.Interval = checks.MinInterval
  2266  			}
  2267  
  2268  			tcp := &checks.CheckTCP{
  2269  				Notify:   a.State,
  2270  				CheckID:  check.CheckID,
  2271  				TCP:      chkType.TCP,
  2272  				Interval: chkType.Interval,
  2273  				Timeout:  chkType.Timeout,
  2274  				Logger:   a.logger,
  2275  			}
  2276  			tcp.Start()
  2277  			a.checkTCPs[check.CheckID] = tcp
  2278  
  2279  		case chkType.IsGRPC():
  2280  			if existing, ok := a.checkGRPCs[check.CheckID]; ok {
  2281  				existing.Stop()
  2282  				delete(a.checkGRPCs, check.CheckID)
  2283  			}
  2284  			if chkType.Interval < checks.MinInterval {
  2285  				a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  2286  					check.CheckID, checks.MinInterval))
  2287  				chkType.Interval = checks.MinInterval
  2288  			}
  2289  
  2290  			var tlsClientConfig *tls.Config
  2291  			if chkType.GRPCUseTLS {
  2292  				tlsClientConfig = a.tlsConfigurator.OutgoingTLSConfigForCheck(chkType.TLSSkipVerify)
  2293  			}
  2294  
  2295  			grpc := &checks.CheckGRPC{
  2296  				Notify:          a.State,
  2297  				CheckID:         check.CheckID,
  2298  				GRPC:            chkType.GRPC,
  2299  				Interval:        chkType.Interval,
  2300  				Timeout:         chkType.Timeout,
  2301  				Logger:          a.logger,
  2302  				TLSClientConfig: tlsClientConfig,
  2303  			}
  2304  			grpc.Start()
  2305  			a.checkGRPCs[check.CheckID] = grpc
  2306  
  2307  		case chkType.IsDocker():
  2308  			if existing, ok := a.checkDockers[check.CheckID]; ok {
  2309  				existing.Stop()
  2310  				delete(a.checkDockers, check.CheckID)
  2311  			}
  2312  			if chkType.Interval < checks.MinInterval {
  2313  				a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v",
  2314  					check.CheckID, checks.MinInterval))
  2315  				chkType.Interval = checks.MinInterval
  2316  			}
  2317  
  2318  			if a.dockerClient == nil {
  2319  				dc, err := checks.NewDockerClient(os.Getenv("DOCKER_HOST"), checks.BufSize)
  2320  				if err != nil {
  2321  					a.logger.Printf("[ERR] agent: error creating docker client: %s", err)
  2322  					return err
  2323  				}
  2324  				a.logger.Printf("[DEBUG] agent: created docker client for %s", dc.Host())
  2325  				a.dockerClient = dc
  2326  			}
  2327  
  2328  			dockerCheck := &checks.CheckDocker{
  2329  				Notify:            a.State,
  2330  				CheckID:           check.CheckID,
  2331  				DockerContainerID: chkType.DockerContainerID,
  2332  				Shell:             chkType.Shell,
  2333  				ScriptArgs:        chkType.ScriptArgs,
  2334  				Interval:          chkType.Interval,
  2335  				Logger:            a.logger,
  2336  				Client:            a.dockerClient,
  2337  			}
  2338  			if prev := a.checkDockers[check.CheckID]; prev != nil {
  2339  				prev.Stop()
  2340  			}
  2341  			dockerCheck.Start()
  2342  			a.checkDockers[check.CheckID] = dockerCheck
  2343  
  2344  		case chkType.IsMonitor():
  2345  			if existing, ok := a.checkMonitors[check.CheckID]; ok {
  2346  				existing.Stop()
  2347  				delete(a.checkMonitors, check.CheckID)
  2348  			}
  2349  			if chkType.Interval < checks.MinInterval {
  2350  				a.logger.Printf("[WARN] agent: check '%s' has interval below minimum of %v",
  2351  					check.CheckID, checks.MinInterval)
  2352  				chkType.Interval = checks.MinInterval
  2353  			}
  2354  
  2355  			monitor := &checks.CheckMonitor{
  2356  				Notify:     a.State,
  2357  				CheckID:    check.CheckID,
  2358  				ScriptArgs: chkType.ScriptArgs,
  2359  				Interval:   chkType.Interval,
  2360  				Timeout:    chkType.Timeout,
  2361  				Logger:     a.logger,
  2362  			}
  2363  			monitor.Start()
  2364  			a.checkMonitors[check.CheckID] = monitor
  2365  
  2366  		case chkType.IsAlias():
  2367  			if existing, ok := a.checkAliases[check.CheckID]; ok {
  2368  				existing.Stop()
  2369  				delete(a.checkAliases, check.CheckID)
  2370  			}
  2371  
  2372  			var rpcReq structs.NodeSpecificRequest
  2373  			rpcReq.Datacenter = a.config.Datacenter
  2374  
  2375  			// The token to set is really important. The behavior below follows
  2376  			// the same behavior as anti-entropy: we use the user-specified token
  2377  			// if set (either on the service or check definition), otherwise
  2378  			// we use the "UserToken" on the agent. This is tested.
  2379  			rpcReq.Token = a.tokens.UserToken()
  2380  			if token != "" {
  2381  				rpcReq.Token = token
  2382  			}
  2383  
  2384  			chkImpl := &checks.CheckAlias{
  2385  				Notify:    a.State,
  2386  				RPC:       a.delegate,
  2387  				RPCReq:    rpcReq,
  2388  				CheckID:   check.CheckID,
  2389  				Node:      chkType.AliasNode,
  2390  				ServiceID: chkType.AliasService,
  2391  			}
  2392  			chkImpl.Start()
  2393  			a.checkAliases[check.CheckID] = chkImpl
  2394  
  2395  		default:
  2396  			return fmt.Errorf("Check type is not valid")
  2397  		}
  2398  
  2399  		if chkType.DeregisterCriticalServiceAfter > 0 {
  2400  			timeout := chkType.DeregisterCriticalServiceAfter
  2401  			if timeout < a.config.CheckDeregisterIntervalMin {
  2402  				timeout = a.config.CheckDeregisterIntervalMin
  2403  				a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has deregister interval below minimum of %v",
  2404  					check.CheckID, a.config.CheckDeregisterIntervalMin))
  2405  			}
  2406  			a.checkReapAfter[check.CheckID] = timeout
  2407  		} else {
  2408  			delete(a.checkReapAfter, check.CheckID)
  2409  		}
  2410  	}
  2411  
  2412  	return nil
  2413  }
  2414  
  2415  // RemoveCheck is used to remove a health check.
  2416  // The agent will make a best effort to ensure it is deregistered
  2417  func (a *Agent) RemoveCheck(checkID types.CheckID, persist bool) error {
  2418  	a.stateLock.Lock()
  2419  	defer a.stateLock.Unlock()
  2420  	return a.removeCheckLocked(checkID, persist)
  2421  }
  2422  
  2423  // removeCheckLocked is used to remove a health check.
  2424  // The agent will make a best effort to ensure it is deregistered
  2425  func (a *Agent) removeCheckLocked(checkID types.CheckID, persist bool) error {
  2426  	// Validate CheckID
  2427  	if checkID == "" {
  2428  		return fmt.Errorf("CheckID missing")
  2429  	}
  2430  
  2431  	a.cancelCheckMonitors(checkID)
  2432  	a.State.RemoveCheck(checkID)
  2433  
  2434  	if persist {
  2435  		if err := a.purgeCheck(checkID); err != nil {
  2436  			return err
  2437  		}
  2438  		if err := a.purgeCheckState(checkID); err != nil {
  2439  			return err
  2440  		}
  2441  	}
  2442  	a.logger.Printf("[DEBUG] agent: removed check %q", checkID)
  2443  	return nil
  2444  }
  2445  
// addProxyLocked adds a new local Connect Proxy instance to be managed by the agent.
//
// This assumes the caller already holds the lock guarding agent state; the
// exported AddProxy wrapper takes a.stateLock before delegating here.
//
// It REQUIRES that the service that is being proxied is already present in the
// local state. Note that this is only used for agent-managed proxies so we can
// ensure that we always make this true. For externally managed and registered
// proxies we explicitly allow the proxy to be registered first to make
// bootstrap ordering of a new service simpler but the same is not true here
// since this is only ever called when setting up a _managed_ proxy which was
// registered as part of a service registration either from config or HTTP API
// call.
//
// Parameters:
//   - proxy: the proxy definition. It is copied before defaults are applied,
//     so the caller's value is not modified.
//   - persist: forwarded to addServiceLocked; additionally, when true and a
//     data dir is configured, the proxy itself is persisted via persistProxy.
//   - FromFile: passed through to persistProxy unchanged.
//   - restoredProxyToken: should only be used when restoring proxy definitions
//     from disk; new proxies must leave it blank to get a new token assigned.
//     We need to restore from disk to be able to continue authenticating
//     running proxies that already had that credential injected.
//   - source: forwarded to addServiceLocked.
//
// Any error from applying defaults, adding the proxy to local state,
// registering the proxy service, or persisting is returned.
func (a *Agent) addProxyLocked(proxy *structs.ConnectManagedProxy, persist, FromFile bool,
	restoredProxyToken string, source configSource) error {
	// Lookup the target service token in state if there is one.
	token := a.State.ServiceToken(proxy.TargetServiceID)

	// Copy the basic proxy structure so the caller's value isn't modified when
	// defaults are applied.
	proxyCopy := *proxy
	proxy = &proxyCopy
	if err := a.applyProxyDefaults(proxy); err != nil {
		return err
	}

	// Add the proxy to local state first since we may need to assign a port which
	// needs to be coordinated under state lock. AddProxy will generate the
	// NodeService for the proxy populated with the allocated (or configured) port
	// and an ID, but it doesn't add it to the agent directly since that could
	// deadlock and we may need to coordinate adding it and persisting etc.
	proxyState, err := a.State.AddProxy(proxy, token, restoredProxyToken)
	if err != nil {
		return err
	}
	proxyService := proxyState.Proxy.ProxyService

	// Register proxy TCP check. The built in proxy doesn't listen publicly
	// until it's loaded certs so this ensures we won't route traffic until it's
	// ready.
	//
	// NOTE(review): if this step fails the proxy added to local state above is
	// not removed again, unlike the addServiceLocked failure path below.
	proxyCfg, err := a.applyProxyConfigDefaults(proxyState.Proxy)
	if err != nil {
		return err
	}
	// An empty check address means no TCP check should be registered.
	chkAddr := a.resolveProxyCheckAddress(proxyCfg)
	chkTypes := []*structs.CheckType{}
	if chkAddr != "" {
		chkTypes = []*structs.CheckType{
			&structs.CheckType{
				Name: "Connect Proxy Listening",
				TCP: fmt.Sprintf("%s:%d", chkAddr,
					proxyCfg["bind_port"]),
				Interval: 10 * time.Second,
			},
		}
	}

	err = a.addServiceLocked(proxyService, chkTypes, persist, token, source)
	if err != nil {
		// Registration failed, so remove the proxy from local state too.
		a.State.RemoveProxy(proxyService.ID)
		return err
	}

	// Persist the proxy; nothing is written when no data dir is configured.
	if persist && a.config.DataDir != "" {
		return a.persistProxy(proxyState, FromFile)
	}
	return nil
}
  2519  
// AddProxy adds a new local Connect Proxy instance to be managed by the agent.
//
// It holds a.stateLock for the duration of the call and forwards every
// argument unchanged to addProxyLocked, returning its error.
//
// It REQUIRES that the service that is being proxied is already present in the
// local state. Note that this is only used for agent-managed proxies so we can
// ensure that we always make this true. For externally managed and registered
// proxies we explicitly allow the proxy to be registered first to make
// bootstrap ordering of a new service simpler but the same is not true here
// since this is only ever called when setting up a _managed_ proxy which was
// registered as part of a service registration either from config or HTTP API
// call.
//
// The restoredProxyToken argument should only be used when restoring proxy
// definitions from disk; new proxies must leave it blank to get a new token
// assigned. We need to restore from disk to be able to continue authenticating
// running proxies that already had that credential injected.
func (a *Agent) AddProxy(proxy *structs.ConnectManagedProxy, persist, FromFile bool,
	restoredProxyToken string, source configSource) error {
	a.stateLock.Lock()
	defer a.stateLock.Unlock()
	return a.addProxyLocked(proxy, persist, FromFile, restoredProxyToken, source)
}
  2541  
  2542  // resolveProxyCheckAddress returns the best address to use for a TCP check of
  2543  // the proxy's public listener. It expects the input to already have default
  2544  // values populated by applyProxyConfigDefaults. It may return an empty string
  2545  // indicating that the TCP check should not be created at all.
  2546  //
  2547  // By default this uses the proxy's bind address which in turn defaults to the
  2548  // agent's bind address. If the proxy bind address ends up being 0.0.0.0 we have
  2549  // to assume the agent can dial it over loopback which is usually true.
  2550  //
  2551  // In some topologies such as proxy being in a different container, the IP the
  2552  // agent used to dial proxy over a local bridge might not be the same as the
  2553  // container's public routable IP address so we allow a manual override of the
  2554  // check address in config "tcp_check_address" too.
  2555  //
  2556  // Finally the TCP check can be disabled by another manual override
  2557  // "disable_tcp_check" in cases where the agent will never be able to dial the
  2558  // proxy directly for some reason.
  2559  func (a *Agent) resolveProxyCheckAddress(proxyCfg map[string]interface{}) string {
  2560  	// If user disabled the check return empty string
  2561  	if disable, ok := proxyCfg["disable_tcp_check"].(bool); ok && disable {
  2562  		return ""
  2563  	}
  2564  
  2565  	// If user specified a custom one, use that
  2566  	if chkAddr, ok := proxyCfg["tcp_check_address"].(string); ok && chkAddr != "" {
  2567  		return chkAddr
  2568  	}
  2569  
  2570  	// If we have a bind address and its diallable, use that
  2571  	if bindAddr, ok := proxyCfg["bind_address"].(string); ok &&
  2572  		bindAddr != "" && bindAddr != "0.0.0.0" && bindAddr != "[::]" {
  2573  		return bindAddr
  2574  	}
  2575  
  2576  	// Default to localhost
  2577  	return "127.0.0.1"
  2578  }
  2579  
  2580  // applyProxyConfigDefaults takes a *structs.ConnectManagedProxy and returns
  2581  // it's Config map merged with any defaults from the Agent's config. It would be
  2582  // nicer if this were defined as a method on structs.ConnectManagedProxy but we
  2583  // can't do that because ot the import cycle it causes with agent/config.
  2584  func (a *Agent) applyProxyConfigDefaults(p *structs.ConnectManagedProxy) (map[string]interface{}, error) {
  2585  	if p == nil || p.ProxyService == nil {
  2586  		// Should never happen but protect from panic
  2587  		return nil, fmt.Errorf("invalid proxy state")
  2588  	}
  2589  
  2590  	// Lookup the target service
  2591  	target := a.State.Service(p.TargetServiceID)
  2592  	if target == nil {
  2593  		// Can happen during deregistration race between proxy and scheduler.
  2594  		return nil, fmt.Errorf("unknown target service ID: %s", p.TargetServiceID)
  2595  	}
  2596  
  2597  	// Merge globals defaults
  2598  	config := make(map[string]interface{})
  2599  	for k, v := range a.config.ConnectProxyDefaultConfig {
  2600  		if _, ok := config[k]; !ok {
  2601  			config[k] = v
  2602  		}
  2603  	}
  2604  
  2605  	// Copy config from the proxy
  2606  	for k, v := range p.Config {
  2607  		config[k] = v
  2608  	}
  2609  
  2610  	// Set defaults for anything that is still not specified but required.
  2611  	// Note that these are not included in the content hash. Since we expect
  2612  	// them to be static in general but some like the default target service
  2613  	// port might not be. In that edge case services can set that explicitly
  2614  	// when they re-register which will be caught though.
  2615  	if _, ok := config["bind_port"]; !ok {
  2616  		config["bind_port"] = p.ProxyService.Port
  2617  	}
  2618  	if _, ok := config["bind_address"]; !ok {
  2619  		// Default to binding to the same address the agent is configured to
  2620  		// bind to.
  2621  		config["bind_address"] = a.config.BindAddr.String()
  2622  	}
  2623  	if _, ok := config["local_service_address"]; !ok {
  2624  		// Default to localhost and the port the service registered with
  2625  		config["local_service_address"] = fmt.Sprintf("127.0.0.1:%d", target.Port)
  2626  	}
  2627  
  2628  	// Basic type conversions for expected types.
  2629  	if raw, ok := config["bind_port"]; ok {
  2630  		switch v := raw.(type) {
  2631  		case float64:
  2632  			// Common since HCL/JSON parse as float64
  2633  			config["bind_port"] = int(v)
  2634  
  2635  			// NOTE(mitchellh): No default case since errors and validation
  2636  			// are handled by the ServiceDefinition.Validate function.
  2637  		}
  2638  	}
  2639  
  2640  	return config, nil
  2641  }
  2642  
  2643  // applyProxyDefaults modifies the given proxy by applying any configured
  2644  // defaults, such as the default execution mode, command, etc.
  2645  func (a *Agent) applyProxyDefaults(proxy *structs.ConnectManagedProxy) error {
  2646  	// Set the default exec mode
  2647  	if proxy.ExecMode == structs.ProxyExecModeUnspecified {
  2648  		mode, err := structs.NewProxyExecMode(a.config.ConnectProxyDefaultExecMode)
  2649  		if err != nil {
  2650  			return err
  2651  		}
  2652  
  2653  		proxy.ExecMode = mode
  2654  	}
  2655  	if proxy.ExecMode == structs.ProxyExecModeUnspecified {
  2656  		proxy.ExecMode = structs.ProxyExecModeDaemon
  2657  	}
  2658  
  2659  	// Set the default command to the globally configured default
  2660  	if len(proxy.Command) == 0 {
  2661  		switch proxy.ExecMode {
  2662  		case structs.ProxyExecModeDaemon:
  2663  			proxy.Command = a.config.ConnectProxyDefaultDaemonCommand
  2664  
  2665  		case structs.ProxyExecModeScript:
  2666  			proxy.Command = a.config.ConnectProxyDefaultScriptCommand
  2667  		}
  2668  	}
  2669  
  2670  	// If there is no globally configured default we need to get the
  2671  	// default command so we can do "consul connect proxy"
  2672  	if len(proxy.Command) == 0 {
  2673  		command, err := defaultProxyCommand(a.config)
  2674  		if err != nil {
  2675  			return err
  2676  		}
  2677  
  2678  		proxy.Command = command
  2679  	}
  2680  
  2681  	return nil
  2682  }
  2683  
  2684  // removeProxyLocked stops and removes a local proxy instance.
  2685  //
  2686  // It is assumed that this function is called while holding the proxyLock already
  2687  func (a *Agent) removeProxyLocked(proxyID string, persist bool) error {
  2688  	// Validate proxyID
  2689  	if proxyID == "" {
  2690  		return fmt.Errorf("proxyID missing")
  2691  	}
  2692  
  2693  	// Remove the proxy from the local state
  2694  	p, err := a.State.RemoveProxy(proxyID)
  2695  	if err != nil {
  2696  		return err
  2697  	}
  2698  
  2699  	// Remove the proxy service as well. The proxy ID is also the ID
  2700  	// of the servie, but we might as well use the service pointer.
  2701  	if err := a.removeServiceLocked(p.Proxy.ProxyService.ID, persist); err != nil {
  2702  		return err
  2703  	}
  2704  
  2705  	if persist && a.config.DataDir != "" {
  2706  		return a.purgeProxy(proxyID)
  2707  	}
  2708  
  2709  	return nil
  2710  }
  2711  
// RemoveProxy stops and removes a local proxy instance.
//
// It holds a.stateLock for the duration of the call and delegates to
// removeProxyLocked, forwarding proxyID and persist unchanged and returning
// its error.
func (a *Agent) RemoveProxy(proxyID string, persist bool) error {
	a.stateLock.Lock()
	defer a.stateLock.Unlock()
	return a.removeProxyLocked(proxyID, persist)
}
  2718  
// verifyProxyToken takes a token and attempts to verify it against the
// targetService name. If targetProxy is specified, a local proxy token is only
// accepted when it is the token of that exact proxy ID; an unknown proxy ID is
// an error.
//
// The given token may be a local-only proxy token or it may be an ACL token. We
// will attempt to verify the local proxy token first.
//
// The effective ACL token is returned along with a boolean which is true if the
// match was against a proxy token rather than an ACL token, and any error. In
// the case the token matches a proxy token, then the ACL token used to register
// that proxy's target service is returned for use in any RPC calls the proxy
// needs to make on behalf of that service. If the token was an ACL token
// already then it is always returned. Provided error is nil, a valid ACL token
// is always returned.
//
// acl.ErrPermissionDenied is returned when a matching proxy token belongs to a
// proxy for a different service, or when the resolved ACL rule does not grant
// service write on targetService.
func (a *Agent) verifyProxyToken(token, targetService,
	targetProxy string) (string, bool, error) {
	// If we specify a target proxy, we look up that proxy directly. Otherwise,
	// we resolve with any proxy we can find.
	var proxy *local.ManagedProxy
	if targetProxy != "" {
		proxy = a.State.Proxy(targetProxy)
		if proxy == nil {
			return "", false, fmt.Errorf("unknown proxy service ID: %q", targetProxy)
		}

		// If the token DOESN'T match, then we reset the proxy which will
		// cause the logic below to fall back to normal ACLs. Otherwise,
		// we keep the proxy set because we also have to verify that the
		// target service matches on the proxy.
		if token != proxy.ProxyToken {
			proxy = nil
		}
	} else {
		proxy = a.resolveProxyToken(token)
	}

	// The existence of a token isn't enough, we also need to verify
	// that the service name of the matching proxy matches our target
	// service.
	if proxy != nil {
		// Get the target service since we only have the name. The nil
		// check below should never be true since a proxy token always
		// represents the existence of a local service.
		target := a.State.Service(proxy.Proxy.TargetServiceID)
		if target == nil {
			return "", false, fmt.Errorf("proxy target service not found: %q",
				proxy.Proxy.TargetServiceID)
		}

		// A proxy token is only valid for the service its proxy fronts.
		if target.Service != targetService {
			return "", false, acl.ErrPermissionDenied
		}

		// Resolve the actual ACL token used to register the proxy/service and
		// return that for use in RPC calls.
		return a.State.ServiceToken(proxy.Proxy.TargetServiceID), true, nil
	}

	// Doesn't match, we have to do a full token resolution. The required
	// permission for any proxy-related endpoint is service:write, since
	// to register a proxy you require that permission and sensitive data
	// is usually present in the configuration.
	rule, err := a.resolveToken(token)
	if err != nil {
		return "", false, err
	}
	// NOTE(review): a nil rule is treated as allowed here — presumably that
	// is what resolveToken returns when ACLs are disabled; confirm.
	if rule != nil && !rule.ServiceWrite(targetService, nil) {
		return "", false, acl.ErrPermissionDenied
	}

	return token, false, nil
}
  2791  
// cancelCheckMonitors stops and forgets every runner registered for checkID.
//
// It deletes the checkReapAfter entry for the check and, for each of the
// per-type check maps (monitors, HTTP, TCP, gRPC, TTL, Docker), stops the
// entry for checkID if there is one and removes it from the map. It is a no-op
// for maps that have no entry for the check.
//
// The maps are modified without taking a lock here; the visible caller,
// removeCheckLocked, runs under a.stateLock — presumably every caller must
// hold it, confirm before adding new call sites.
func (a *Agent) cancelCheckMonitors(checkID types.CheckID) {
	// Stop any monitors
	delete(a.checkReapAfter, checkID)
	if check, ok := a.checkMonitors[checkID]; ok {
		check.Stop()
		delete(a.checkMonitors, checkID)
	}
	if check, ok := a.checkHTTPs[checkID]; ok {
		check.Stop()
		delete(a.checkHTTPs, checkID)
	}
	if check, ok := a.checkTCPs[checkID]; ok {
		check.Stop()
		delete(a.checkTCPs, checkID)
	}
	if check, ok := a.checkGRPCs[checkID]; ok {
		check.Stop()
		delete(a.checkGRPCs, checkID)
	}
	if check, ok := a.checkTTLs[checkID]; ok {
		check.Stop()
		delete(a.checkTTLs, checkID)
	}
	if check, ok := a.checkDockers[checkID]; ok {
		check.Stop()
		delete(a.checkDockers, checkID)
	}
}
  2820  
  2821  // updateTTLCheck is used to update the status of a TTL check via the Agent API.
  2822  func (a *Agent) updateTTLCheck(checkID types.CheckID, status, output string) error {
  2823  	a.stateLock.Lock()
  2824  	defer a.stateLock.Unlock()
  2825  
  2826  	// Grab the TTL check.
  2827  	check, ok := a.checkTTLs[checkID]
  2828  	if !ok {
  2829  		return fmt.Errorf("CheckID %q does not have associated TTL", checkID)
  2830  	}
  2831  
  2832  	// Set the status through CheckTTL to reset the TTL.
  2833  	check.SetStatus(status, output)
  2834  
  2835  	// We don't write any files in dev mode so bail here.
  2836  	if a.config.DataDir == "" {
  2837  		return nil
  2838  	}
  2839  
  2840  	// Persist the state so the TTL check can come up in a good state after
  2841  	// an agent restart, especially with long TTL values.
  2842  	if err := a.persistCheckState(check, status, output); err != nil {
  2843  		return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
  2844  	}
  2845  
  2846  	return nil
  2847  }
  2848  
  2849  // persistCheckState is used to record the check status into the data dir.
  2850  // This allows the state to be restored on a later agent start. Currently
  2851  // only useful for TTL based checks.
  2852  func (a *Agent) persistCheckState(check *checks.CheckTTL, status, output string) error {
  2853  	// Create the persisted state
  2854  	state := persistedCheckState{
  2855  		CheckID: check.CheckID,
  2856  		Status:  status,
  2857  		Output:  output,
  2858  		Expires: time.Now().Add(check.TTL).Unix(),
  2859  	}
  2860  
  2861  	// Encode the state
  2862  	buf, err := json.Marshal(state)
  2863  	if err != nil {
  2864  		return err
  2865  	}
  2866  
  2867  	// Create the state dir if it doesn't exist
  2868  	dir := filepath.Join(a.config.DataDir, checkStateDir)
  2869  	if err := os.MkdirAll(dir, 0700); err != nil {
  2870  		return fmt.Errorf("failed creating check state dir %q: %s", dir, err)
  2871  	}
  2872  
  2873  	// Write the state to the file
  2874  	file := filepath.Join(dir, checkIDHash(check.CheckID))
  2875  
  2876  	// Create temp file in same dir, to make more likely atomic
  2877  	tempFile := file + ".tmp"
  2878  
  2879  	// persistCheckState is called frequently, so don't use writeFileAtomic to avoid calling fsync here
  2880  	if err := ioutil.WriteFile(tempFile, buf, 0600); err != nil {
  2881  		return fmt.Errorf("failed writing temp file %q: %s", tempFile, err)
  2882  	}
  2883  	if err := os.Rename(tempFile, file); err != nil {
  2884  		return fmt.Errorf("failed to rename temp file from %q to %q: %s", tempFile, file, err)
  2885  	}
  2886  
  2887  	return nil
  2888  }
  2889  
  2890  // loadCheckState is used to restore the persisted state of a check.
  2891  func (a *Agent) loadCheckState(check *structs.HealthCheck) error {
  2892  	// Try to read the persisted state for this check
  2893  	file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(check.CheckID))
  2894  	buf, err := ioutil.ReadFile(file)
  2895  	if err != nil {
  2896  		if os.IsNotExist(err) {
  2897  			return nil
  2898  		}
  2899  		return fmt.Errorf("failed reading file %q: %s", file, err)
  2900  	}
  2901  
  2902  	// Decode the state data
  2903  	var p persistedCheckState
  2904  	if err := json.Unmarshal(buf, &p); err != nil {
  2905  		a.logger.Printf("[ERR] agent: failed decoding check state: %s", err)
  2906  		return a.purgeCheckState(check.CheckID)
  2907  	}
  2908  
  2909  	// Check if the state has expired
  2910  	if time.Now().Unix() >= p.Expires {
  2911  		a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID)
  2912  		return a.purgeCheckState(check.CheckID)
  2913  	}
  2914  
  2915  	// Restore the fields from the state
  2916  	check.Output = p.Output
  2917  	check.Status = p.Status
  2918  	return nil
  2919  }
  2920  
  2921  // purgeCheckState is used to purge the state of a check from the data dir
  2922  func (a *Agent) purgeCheckState(checkID types.CheckID) error {
  2923  	file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(checkID))
  2924  	err := os.Remove(file)
  2925  	if os.IsNotExist(err) {
  2926  		return nil
  2927  	}
  2928  	return err
  2929  }
  2930  
// GossipEncrypted returns the result of the agent delegate's Encrypted
// method. Judging by the name this reports whether gossip encryption is in
// use — confirm against the delegate implementation.
func (a *Agent) GossipEncrypted() bool {
	return a.delegate.Encrypted()
}
  2934  
  2935  // Stats is used to get various debugging state from the sub-systems
  2936  func (a *Agent) Stats() map[string]map[string]string {
  2937  	stats := a.delegate.Stats()
  2938  	stats["agent"] = map[string]string{
  2939  		"check_monitors": strconv.Itoa(len(a.checkMonitors)),
  2940  		"check_ttls":     strconv.Itoa(len(a.checkTTLs)),
  2941  	}
  2942  	for k, v := range a.State.Stats() {
  2943  		stats["agent"][k] = v
  2944  	}
  2945  
  2946  	revision := a.config.Revision
  2947  	if len(revision) > 8 {
  2948  		revision = revision[:8]
  2949  	}
  2950  	stats["build"] = map[string]string{
  2951  		"revision":   revision,
  2952  		"version":    a.config.Version,
  2953  		"prerelease": a.config.VersionPrerelease,
  2954  	}
  2955  	return stats
  2956  }
  2957  
  2958  // storePid is used to write out our PID to a file if necessary
  2959  func (a *Agent) storePid() error {
  2960  	// Quit fast if no pidfile
  2961  	pidPath := a.config.PidFile
  2962  	if pidPath == "" {
  2963  		return nil
  2964  	}
  2965  
  2966  	// Open the PID file
  2967  	pidFile, err := os.OpenFile(pidPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0666)
  2968  	if err != nil {
  2969  		return fmt.Errorf("Could not open pid file: %v", err)
  2970  	}
  2971  	defer pidFile.Close()
  2972  
  2973  	// Write out the PID
  2974  	pid := os.Getpid()
  2975  	_, err = pidFile.WriteString(fmt.Sprintf("%d", pid))
  2976  	if err != nil {
  2977  		return fmt.Errorf("Could not write to pid file: %s", err)
  2978  	}
  2979  	return nil
  2980  }
  2981  
  2982  // deletePid is used to delete our PID on exit
  2983  func (a *Agent) deletePid() error {
  2984  	// Quit fast if no pidfile
  2985  	pidPath := a.config.PidFile
  2986  	if pidPath == "" {
  2987  		return nil
  2988  	}
  2989  
  2990  	stat, err := os.Stat(pidPath)
  2991  	if err != nil {
  2992  		return fmt.Errorf("Could not remove pid file: %s", err)
  2993  	}
  2994  
  2995  	if stat.IsDir() {
  2996  		return fmt.Errorf("Specified pid file path is directory")
  2997  	}
  2998  
  2999  	err = os.Remove(pidPath)
  3000  	if err != nil {
  3001  		return fmt.Errorf("Could not remove pid file: %s", err)
  3002  	}
  3003  	return nil
  3004  }
  3005  
// loadServices will load service definitions from configuration and persisted
// definitions on disk, and load them into the local agent.
//
// Services from conf are registered first, each followed by its sidecar
// service if it defines one. Then every file in the persisted services
// directory is considered: directories and files whose name ends in "tmp" are
// skipped, files that cannot be decoded are logged and skipped, a persisted
// service whose ID is already registered is purged from disk (config wins),
// and anything else is registered.
//
// A missing services directory is not an error. Registration goes through
// addServiceLocked, so presumably the caller must hold the agent state lock —
// confirm at the call site.
func (a *Agent) loadServices(conf *config.RuntimeConfig) error {
	// Register the services from config
	for _, service := range conf.Services {
		ns := service.NodeService()
		chkTypes, err := service.CheckTypes()
		if err != nil {
			return fmt.Errorf("Failed to validate checks for service %q: %v", service.Name, err)
		}

		// Grab and validate sidecar if there is one too
		sidecar, sidecarChecks, sidecarToken, err := a.sidecarServiceFromNodeService(ns, service.Token)
		if err != nil {
			return fmt.Errorf("Failed to validate sidecar for service %q: %v", service.Name, err)
		}

		// Remove the sidecar from the NodeService now that it has done its
		// job; it is just config syntax sugar and shouldn't be persisted in
		// local or server state.
		ns.Connect.SidecarService = nil

		if err := a.addServiceLocked(ns, chkTypes, false, service.Token, ConfigSourceLocal); err != nil {
			return fmt.Errorf("Failed to register service %q: %v", service.Name, err)
		}

		// If there is a sidecar service, register that too.
		if sidecar != nil {
			if err := a.addServiceLocked(sidecar, sidecarChecks, false, sidecarToken, ConfigSourceLocal); err != nil {
				return fmt.Errorf("Failed to register sidecar for service %q: %v", service.Name, err)
			}
		}
	}

	// Load any persisted services
	svcDir := filepath.Join(a.config.DataDir, servicesDir)
	files, err := ioutil.ReadDir(svcDir)
	if err != nil {
		// Nothing has been persisted yet.
		if os.IsNotExist(err) {
			return nil
		}
		return fmt.Errorf("Failed reading services dir %q: %s", svcDir, err)
	}
	for _, fi := range files {
		// Skip all dirs
		if fi.IsDir() {
			continue
		}

		// Skip all partially written temporary files
		if strings.HasSuffix(fi.Name(), "tmp") {
			a.logger.Printf("[WARN] agent: Ignoring temporary service file %v", fi.Name())
			continue
		}

		// Open the file for reading
		file := filepath.Join(svcDir, fi.Name())
		fh, err := os.Open(file)
		if err != nil {
			return fmt.Errorf("failed opening service file %q: %s", file, err)
		}

		// Read the contents into a buffer; the handle is closed before the
		// read error is inspected so it never leaks.
		buf, err := ioutil.ReadAll(fh)
		fh.Close()
		if err != nil {
			return fmt.Errorf("failed reading service file %q: %s", file, err)
		}

		// Try decoding the service definition
		var p persistedService
		if err := json.Unmarshal(buf, &p); err != nil {
			// Backwards-compatibility for pre-0.5.1 persisted services
			if err := json.Unmarshal(buf, &p.Service); err != nil {
				a.logger.Printf("[ERR] agent: Failed decoding service file %q: %s", file, err)
				continue
			}
		}
		serviceID := p.Service.ID

		if a.State.Service(serviceID) != nil {
			// Purge previously persisted service. This allows config to be
			// preferred over services persisted from the API.
			a.logger.Printf("[DEBUG] agent: service %q exists, not restoring from %q",
				serviceID, file)
			if err := a.purgeService(serviceID); err != nil {
				return fmt.Errorf("failed purging service %q: %s", serviceID, err)
			}
		} else {
			a.logger.Printf("[DEBUG] agent: restored service definition %q from %q",
				serviceID, file)
			if err := a.addServiceLocked(p.Service, nil, false, p.Token, ConfigSourceLocal); err != nil {
				return fmt.Errorf("failed adding service %q: %s", serviceID, err)
			}
		}
	}

	return nil
}
  3104  
  3105  // unloadServices will deregister all services.
  3106  func (a *Agent) unloadServices() error {
  3107  	for id := range a.State.Services() {
  3108  		if err := a.removeServiceLocked(id, false); err != nil {
  3109  			return fmt.Errorf("Failed deregistering service '%s': %v", id, err)
  3110  		}
  3111  	}
  3112  	return nil
  3113  }
  3114  
  3115  // loadChecks loads check definitions and/or persisted check definitions from
  3116  // disk and re-registers them with the local agent.
  3117  func (a *Agent) loadChecks(conf *config.RuntimeConfig) error {
  3118  	// Register the checks from config
  3119  	for _, check := range conf.Checks {
  3120  		health := check.HealthCheck(conf.NodeName)
  3121  		chkType := check.CheckType()
  3122  		if err := a.addCheckLocked(health, chkType, false, check.Token, ConfigSourceLocal); err != nil {
  3123  			return fmt.Errorf("Failed to register check '%s': %v %v", check.Name, err, check)
  3124  		}
  3125  	}
  3126  
  3127  	// Load any persisted checks
  3128  	checkDir := filepath.Join(a.config.DataDir, checksDir)
  3129  	files, err := ioutil.ReadDir(checkDir)
  3130  	if err != nil {
  3131  		if os.IsNotExist(err) {
  3132  			return nil
  3133  		}
  3134  		return fmt.Errorf("Failed reading checks dir %q: %s", checkDir, err)
  3135  	}
  3136  	for _, fi := range files {
  3137  		// Ignore dirs - we only care about the check definition files
  3138  		if fi.IsDir() {
  3139  			continue
  3140  		}
  3141  
  3142  		// Open the file for reading
  3143  		file := filepath.Join(checkDir, fi.Name())
  3144  		fh, err := os.Open(file)
  3145  		if err != nil {
  3146  			return fmt.Errorf("Failed opening check file %q: %s", file, err)
  3147  		}
  3148  
  3149  		// Read the contents into a buffer
  3150  		buf, err := ioutil.ReadAll(fh)
  3151  		fh.Close()
  3152  		if err != nil {
  3153  			return fmt.Errorf("failed reading check file %q: %s", file, err)
  3154  		}
  3155  
  3156  		// Decode the check
  3157  		var p persistedCheck
  3158  		if err := json.Unmarshal(buf, &p); err != nil {
  3159  			a.logger.Printf("[ERR] agent: Failed decoding check file %q: %s", file, err)
  3160  			continue
  3161  		}
  3162  		checkID := p.Check.CheckID
  3163  
  3164  		if a.State.Check(checkID) != nil {
  3165  			// Purge previously persisted check. This allows config to be
  3166  			// preferred over persisted checks from the API.
  3167  			a.logger.Printf("[DEBUG] agent: check %q exists, not restoring from %q",
  3168  				checkID, file)
  3169  			if err := a.purgeCheck(checkID); err != nil {
  3170  				return fmt.Errorf("Failed purging check %q: %s", checkID, err)
  3171  			}
  3172  		} else {
  3173  			// Default check to critical to avoid placing potentially unhealthy
  3174  			// services into the active pool
  3175  			p.Check.Status = api.HealthCritical
  3176  
  3177  			if err := a.addCheckLocked(p.Check, p.ChkType, false, p.Token, ConfigSourceLocal); err != nil {
  3178  				// Purge the check if it is unable to be restored.
  3179  				a.logger.Printf("[WARN] agent: Failed to restore check %q: %s",
  3180  					checkID, err)
  3181  				if err := a.purgeCheck(checkID); err != nil {
  3182  					return fmt.Errorf("Failed purging check %q: %s", checkID, err)
  3183  				}
  3184  			}
  3185  			a.logger.Printf("[DEBUG] agent: restored health check %q from %q",
  3186  				p.Check.CheckID, file)
  3187  		}
  3188  	}
  3189  
  3190  	return nil
  3191  }
  3192  
  3193  // unloadChecks will deregister all checks known to the local agent.
  3194  func (a *Agent) unloadChecks() error {
  3195  	for id := range a.State.Checks() {
  3196  		if err := a.removeCheckLocked(id, false); err != nil {
  3197  			return fmt.Errorf("Failed deregistering check '%s': %s", id, err)
  3198  		}
  3199  	}
  3200  	return nil
  3201  }
  3202  
  3203  // loadPersistedProxies will load connect proxy definitions from their
  3204  // persisted state on disk and return a slice of them
  3205  //
  3206  // This does not add them to the local
  3207  func (a *Agent) loadPersistedProxies() (map[string]persistedProxy, error) {
  3208  	persistedProxies := make(map[string]persistedProxy)
  3209  
  3210  	proxyDir := filepath.Join(a.config.DataDir, proxyDir)
  3211  	files, err := ioutil.ReadDir(proxyDir)
  3212  	if err != nil {
  3213  		if !os.IsNotExist(err) {
  3214  			return nil, fmt.Errorf("Failed reading proxies dir %q: %s", proxyDir, err)
  3215  		}
  3216  	}
  3217  
  3218  	for _, fi := range files {
  3219  		// Skip all dirs
  3220  		if fi.IsDir() {
  3221  			continue
  3222  		}
  3223  
  3224  		// Skip all partially written temporary files
  3225  		if strings.HasSuffix(fi.Name(), "tmp") {
  3226  			return nil, fmt.Errorf("Ignoring temporary proxy file %v", fi.Name())
  3227  		}
  3228  
  3229  		// Open the file for reading
  3230  		file := filepath.Join(proxyDir, fi.Name())
  3231  		fh, err := os.Open(file)
  3232  		if err != nil {
  3233  			return nil, fmt.Errorf("failed opening proxy file %q: %s", file, err)
  3234  		}
  3235  
  3236  		// Read the contents into a buffer
  3237  		buf, err := ioutil.ReadAll(fh)
  3238  		fh.Close()
  3239  		if err != nil {
  3240  			return nil, fmt.Errorf("failed reading proxy file %q: %s", file, err)
  3241  		}
  3242  
  3243  		// Try decoding the proxy definition
  3244  		var p persistedProxy
  3245  		if err := json.Unmarshal(buf, &p); err != nil {
  3246  			return nil, fmt.Errorf("Failed decoding proxy file %q: %s", file, err)
  3247  		}
  3248  		svcID := p.Proxy.TargetServiceID
  3249  
  3250  		persistedProxies[svcID] = p
  3251  	}
  3252  
  3253  	return persistedProxies, nil
  3254  }
  3255  
  3256  // loadProxies will load connect proxy definitions from configuration and
  3257  // persisted definitions on disk, and load them into the local agent.
  3258  func (a *Agent) loadProxies(conf *config.RuntimeConfig) error {
  3259  	persistedProxies, persistenceErr := a.loadPersistedProxies()
  3260  
  3261  	for _, svc := range conf.Services {
  3262  		if svc.Connect != nil {
  3263  			proxy, err := svc.ConnectManagedProxy()
  3264  			if err != nil {
  3265  				return fmt.Errorf("failed adding proxy: %s", err)
  3266  			}
  3267  			if proxy == nil {
  3268  				continue
  3269  			}
  3270  			restoredToken := ""
  3271  			if persisted, ok := persistedProxies[proxy.TargetServiceID]; ok {
  3272  				restoredToken = persisted.ProxyToken
  3273  			}
  3274  
  3275  			if err := a.addProxyLocked(proxy, true, true, restoredToken, ConfigSourceLocal); err != nil {
  3276  				return fmt.Errorf("failed adding proxy: %s", err)
  3277  			}
  3278  		}
  3279  	}
  3280  
  3281  	for _, persisted := range persistedProxies {
  3282  		proxyID := persisted.Proxy.ProxyService.ID
  3283  		if persisted.FromFile && a.State.Proxy(proxyID) == nil {
  3284  			// Purge proxies that were configured previously but are no longer in the config
  3285  			a.logger.Printf("[DEBUG] agent: purging stale persisted proxy %q", proxyID)
  3286  			if err := a.purgeProxy(proxyID); err != nil {
  3287  				return fmt.Errorf("failed purging proxy %q: %v", proxyID, err)
  3288  			}
  3289  		} else if !persisted.FromFile {
  3290  			if a.State.Proxy(proxyID) == nil {
  3291  				a.logger.Printf("[DEBUG] agent: restored proxy definition %q", proxyID)
  3292  				if err := a.addProxyLocked(persisted.Proxy, false, false, persisted.ProxyToken, ConfigSourceLocal); err != nil {
  3293  					return fmt.Errorf("failed adding proxy %q: %v", proxyID, err)
  3294  				}
  3295  			} else {
  3296  				a.logger.Printf("[WARN] agent: proxy definition %q was overwritten by a proxy definition within a config file", proxyID)
  3297  			}
  3298  		}
  3299  	}
  3300  
  3301  	return persistenceErr
  3302  }
  3303  
// persistedTokens is the JSON document that getPersistedTokens decodes from
// the tokens file in the agent's data directory. Every field is tagged
// omitempty, so empty tokens are left out when the struct is encoded.
type persistedTokens struct {
	// Replication is handed to UpdateReplicationToken by loadTokens.
	Replication string `json:"replication,omitempty"`
	// AgentMaster is handed to UpdateAgentMasterToken by loadTokens.
	AgentMaster string `json:"agent_master,omitempty"`
	// Default is handed to UpdateUserToken by loadTokens.
	Default     string `json:"default,omitempty"`
	// Agent is handed to UpdateAgentToken by loadTokens.
	Agent       string `json:"agent,omitempty"`
}
  3310  
  3311  func (a *Agent) getPersistedTokens() (*persistedTokens, error) {
  3312  	persistedTokens := &persistedTokens{}
  3313  	if !a.config.ACLEnableTokenPersistence {
  3314  		return persistedTokens, nil
  3315  	}
  3316  
  3317  	a.persistedTokensLock.RLock()
  3318  	defer a.persistedTokensLock.RUnlock()
  3319  
  3320  	tokensFullPath := filepath.Join(a.config.DataDir, tokensPath)
  3321  
  3322  	buf, err := ioutil.ReadFile(tokensFullPath)
  3323  	if err != nil {
  3324  		if os.IsNotExist(err) {
  3325  			// non-existence is not an error we care about
  3326  			return persistedTokens, nil
  3327  		}
  3328  		return persistedTokens, fmt.Errorf("failed reading tokens file %q: %s", tokensFullPath, err)
  3329  	}
  3330  
  3331  	if err := json.Unmarshal(buf, persistedTokens); err != nil {
  3332  		return persistedTokens, fmt.Errorf("failed to decode tokens file %q: %s", tokensFullPath, err)
  3333  	}
  3334  
  3335  	return persistedTokens, nil
  3336  }
  3337  
  3338  func (a *Agent) loadTokens(conf *config.RuntimeConfig) error {
  3339  	persistedTokens, persistenceErr := a.getPersistedTokens()
  3340  
  3341  	if persistenceErr != nil {
  3342  		a.logger.Printf("[WARN] unable to load persisted tokens: %v", persistenceErr)
  3343  	}
  3344  
  3345  	if persistedTokens.Default != "" {
  3346  		a.tokens.UpdateUserToken(persistedTokens.Default, token.TokenSourceAPI)
  3347  
  3348  		if conf.ACLToken != "" {
  3349  			a.logger.Printf("[WARN] \"default\" token present in both the configuration and persisted token store, using the persisted token")
  3350  		}
  3351  	} else {
  3352  		a.tokens.UpdateUserToken(conf.ACLToken, token.TokenSourceConfig)
  3353  	}
  3354  
  3355  	if persistedTokens.Agent != "" {
  3356  		a.tokens.UpdateAgentToken(persistedTokens.Agent, token.TokenSourceAPI)
  3357  
  3358  		if conf.ACLAgentToken != "" {
  3359  			a.logger.Printf("[WARN] \"agent\" token present in both the configuration and persisted token store, using the persisted token")
  3360  		}
  3361  	} else {
  3362  		a.tokens.UpdateAgentToken(conf.ACLAgentToken, token.TokenSourceConfig)
  3363  	}
  3364  
  3365  	if persistedTokens.AgentMaster != "" {
  3366  		a.tokens.UpdateAgentMasterToken(persistedTokens.AgentMaster, token.TokenSourceAPI)
  3367  
  3368  		if conf.ACLAgentMasterToken != "" {
  3369  			a.logger.Printf("[WARN] \"agent_master\" token present in both the configuration and persisted token store, using the persisted token")
  3370  		}
  3371  	} else {
  3372  		a.tokens.UpdateAgentMasterToken(conf.ACLAgentMasterToken, token.TokenSourceConfig)
  3373  	}
  3374  
  3375  	if persistedTokens.Replication != "" {
  3376  		a.tokens.UpdateReplicationToken(persistedTokens.Replication, token.TokenSourceAPI)
  3377  
  3378  		if conf.ACLReplicationToken != "" {
  3379  			a.logger.Printf("[WARN] \"replication\" token present in both the configuration and persisted token store, using the persisted token")
  3380  		}
  3381  	} else {
  3382  		a.tokens.UpdateReplicationToken(conf.ACLReplicationToken, token.TokenSourceConfig)
  3383  	}
  3384  
  3385  	return persistenceErr
  3386  }
  3387  
  3388  // unloadProxies will deregister all proxies known to the local agent.
  3389  func (a *Agent) unloadProxies() error {
  3390  	for id := range a.State.Proxies() {
  3391  		if err := a.removeProxyLocked(id, false); err != nil {
  3392  			return fmt.Errorf("Failed deregistering proxy '%s': %s", id, err)
  3393  		}
  3394  	}
  3395  	return nil
  3396  }
  3397  
  3398  // snapshotCheckState is used to snapshot the current state of the health
  3399  // checks. This is done before we reload our checks, so that we can properly
  3400  // restore into the same state.
  3401  func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck {
  3402  	return a.State.Checks()
  3403  }
  3404  
  3405  // restoreCheckState is used to reset the health state based on a snapshot.
  3406  // This is done after we finish the reload to avoid any unnecessary flaps
  3407  // in health state and potential session invalidations.
  3408  func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) {
  3409  	for id, check := range snap {
  3410  		a.State.UpdateCheck(id, check.Status, check.Output)
  3411  	}
  3412  }
  3413  
  3414  // loadMetadata loads node metadata fields from the agent config and
  3415  // updates them on the local agent.
  3416  func (a *Agent) loadMetadata(conf *config.RuntimeConfig) error {
  3417  	meta := map[string]string{}
  3418  	for k, v := range conf.NodeMeta {
  3419  		meta[k] = v
  3420  	}
  3421  	meta[structs.MetaSegmentKey] = conf.SegmentName
  3422  	return a.State.LoadMetadata(meta)
  3423  }
  3424  
// unloadMetadata resets the local metadata state by delegating to
// State.UnloadMetadata. ReloadConfig calls it before loading the metadata
// from the new configuration.
func (a *Agent) unloadMetadata() {
	a.State.UnloadMetadata()
}
  3429  
  3430  // serviceMaintCheckID returns the ID of a given service's maintenance check
  3431  func serviceMaintCheckID(serviceID string) types.CheckID {
  3432  	return types.CheckID(structs.ServiceMaintPrefix + serviceID)
  3433  }
  3434  
  3435  // EnableServiceMaintenance will register a false health check against the given
  3436  // service ID with critical status. This will exclude the service from queries.
  3437  func (a *Agent) EnableServiceMaintenance(serviceID, reason, token string) error {
  3438  	service, ok := a.State.Services()[serviceID]
  3439  	if !ok {
  3440  		return fmt.Errorf("No service registered with ID %q", serviceID)
  3441  	}
  3442  
  3443  	// Check if maintenance mode is not already enabled
  3444  	checkID := serviceMaintCheckID(serviceID)
  3445  	if _, ok := a.State.Checks()[checkID]; ok {
  3446  		return nil
  3447  	}
  3448  
  3449  	// Use default notes if no reason provided
  3450  	if reason == "" {
  3451  		reason = defaultServiceMaintReason
  3452  	}
  3453  
  3454  	// Create and register the critical health check
  3455  	check := &structs.HealthCheck{
  3456  		Node:        a.config.NodeName,
  3457  		CheckID:     checkID,
  3458  		Name:        "Service Maintenance Mode",
  3459  		Notes:       reason,
  3460  		ServiceID:   service.ID,
  3461  		ServiceName: service.Service,
  3462  		Status:      api.HealthCritical,
  3463  	}
  3464  	a.AddCheck(check, nil, true, token, ConfigSourceLocal)
  3465  	a.logger.Printf("[INFO] agent: Service %q entered maintenance mode", serviceID)
  3466  
  3467  	return nil
  3468  }
  3469  
  3470  // DisableServiceMaintenance will deregister the fake maintenance mode check
  3471  // if the service has been marked as in maintenance.
  3472  func (a *Agent) DisableServiceMaintenance(serviceID string) error {
  3473  	if _, ok := a.State.Services()[serviceID]; !ok {
  3474  		return fmt.Errorf("No service registered with ID %q", serviceID)
  3475  	}
  3476  
  3477  	// Check if maintenance mode is enabled
  3478  	checkID := serviceMaintCheckID(serviceID)
  3479  	if _, ok := a.State.Checks()[checkID]; !ok {
  3480  		return nil
  3481  	}
  3482  
  3483  	// Deregister the maintenance check
  3484  	a.RemoveCheck(checkID, true)
  3485  	a.logger.Printf("[INFO] agent: Service %q left maintenance mode", serviceID)
  3486  
  3487  	return nil
  3488  }
  3489  
  3490  // EnableNodeMaintenance places a node into maintenance mode.
  3491  func (a *Agent) EnableNodeMaintenance(reason, token string) {
  3492  	// Ensure node maintenance is not already enabled
  3493  	if _, ok := a.State.Checks()[structs.NodeMaint]; ok {
  3494  		return
  3495  	}
  3496  
  3497  	// Use a default notes value
  3498  	if reason == "" {
  3499  		reason = defaultNodeMaintReason
  3500  	}
  3501  
  3502  	// Create and register the node maintenance check
  3503  	check := &structs.HealthCheck{
  3504  		Node:    a.config.NodeName,
  3505  		CheckID: structs.NodeMaint,
  3506  		Name:    "Node Maintenance Mode",
  3507  		Notes:   reason,
  3508  		Status:  api.HealthCritical,
  3509  	}
  3510  	a.AddCheck(check, nil, true, token, ConfigSourceLocal)
  3511  	a.logger.Printf("[INFO] agent: Node entered maintenance mode")
  3512  }
  3513  
  3514  // DisableNodeMaintenance removes a node from maintenance mode
  3515  func (a *Agent) DisableNodeMaintenance() {
  3516  	if _, ok := a.State.Checks()[structs.NodeMaint]; !ok {
  3517  		return
  3518  	}
  3519  	a.RemoveCheck(structs.NodeMaint, true)
  3520  	a.logger.Printf("[INFO] agent: Node left maintenance mode")
  3521  }
  3522  
// loadLimits copies the RPC rate limit and maximum burst settings from conf
// into the agent's current configuration.
func (a *Agent) loadLimits(conf *config.RuntimeConfig) {
	a.config.RPCRateLimit = conf.RPCRateLimit
	a.config.RPCMaxBurst = conf.RPCMaxBurst
}
  3527  
// ReloadConfig applies newCfg to the running agent. With sync paused and the
// state lock held, it unloads all proxies, services, checks and metadata,
// reloads tokens and the TLS configuration, and then loads services, proxies,
// checks, metadata and watches from newCfg. Afterwards it updates the RPC
// limits, hands a freshly built consul config to the delegate, and refreshes
// the metrics filter and the discard-check-output setting.
//
// The first failing step aborts the reload and its error is returned; steps
// that already ran are not rolled back. The check state snapshotted at the
// start is restored on every return path, including errors.
func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error {
	// Bulk update the services and checks
	a.PauseSync()
	defer a.ResumeSync()

	a.stateLock.Lock()
	defer a.stateLock.Unlock()

	// Snapshot the current state, and restore it afterwards. Deferred calls
	// run in reverse order, so the check state is restored first, then the
	// state lock is released, and finally sync is resumed.
	snap := a.snapshotCheckState()
	defer a.restoreCheckState(snap)

	// First unload all checks, services, and metadata. This lets us begin the reload
	// with a clean slate.
	if err := a.unloadProxies(); err != nil {
		return fmt.Errorf("Failed unloading proxies: %s", err)
	}
	if err := a.unloadServices(); err != nil {
		return fmt.Errorf("Failed unloading services: %s", err)
	}
	if err := a.unloadChecks(); err != nil {
		return fmt.Errorf("Failed unloading checks: %s", err)
	}
	a.unloadMetadata()

	// Reload tokens - should be done before all the other loading
	// to ensure the correct tokens are available for attaching to
	// the checks and service registrations.
	// The returned error is not checked here: loadTokens already logs a
	// warning when persisted tokens cannot be loaded and still applies the
	// tokens from newCfg.
	a.loadTokens(newCfg)

	if err := a.tlsConfigurator.Update(newCfg.ToTLSUtilConfig()); err != nil {
		return fmt.Errorf("Failed reloading tls configuration: %s", err)
	}

	// Reload service/check definitions and metadata.
	if err := a.loadServices(newCfg); err != nil {
		return fmt.Errorf("Failed reloading services: %s", err)
	}
	if err := a.loadProxies(newCfg); err != nil {
		return fmt.Errorf("Failed reloading proxies: %s", err)
	}
	if err := a.loadChecks(newCfg); err != nil {
		return fmt.Errorf("Failed reloading checks: %s", err)
	}
	if err := a.loadMetadata(newCfg); err != nil {
		return fmt.Errorf("Failed reloading metadata: %s", err)
	}

	if err := a.reloadWatches(newCfg); err != nil {
		return fmt.Errorf("Failed reloading watches: %v", err)
	}

	// Copy the RPC rate limit settings before building the consul config
	// below.
	a.loadLimits(newCfg)

	// create the config for the rpc server/client
	consulCfg, err := a.consulConfig()
	if err != nil {
		return err
	}

	if err := a.delegate.ReloadConfig(consulCfg); err != nil {
		return err
	}

	// Update filtered metrics
	metrics.UpdateFilter(newCfg.Telemetry.AllowedPrefixes,
		newCfg.Telemetry.BlockedPrefixes)

	a.State.SetDiscardCheckOutput(newCfg.DiscardCheckOutput)

	return nil
}
  3600  
  3601  // registerCache configures the cache and registers all the supported
  3602  // types onto the cache. This is NOT safe to call multiple times so
  3603  // care should be taken to call this exactly once after the cache
  3604  // field has been initialized.
  3605  func (a *Agent) registerCache() {
  3606  	// Note that you should register the _agent_ as the RPC implementation and not
  3607  	// the a.delegate directly, otherwise tests that rely on overriding RPC
  3608  	// routing via a.registerEndpoint will not work.
  3609  
  3610  	a.cache.RegisterType(cachetype.ConnectCARootName, &cachetype.ConnectCARoot{
  3611  		RPC: a,
  3612  	}, &cache.RegisterOptions{
  3613  		// Maintain a blocking query, retry dropped connections quickly
  3614  		Refresh:        true,
  3615  		RefreshTimer:   0 * time.Second,
  3616  		RefreshTimeout: 10 * time.Minute,
  3617  	})
  3618  
  3619  	a.cache.RegisterType(cachetype.ConnectCALeafName, &cachetype.ConnectCALeaf{
  3620  		RPC:                              a,
  3621  		Cache:                            a.cache,
  3622  		Datacenter:                       a.config.Datacenter,
  3623  		TestOverrideCAChangeInitialDelay: a.config.ConnectTestCALeafRootChangeSpread,
  3624  	}, &cache.RegisterOptions{
  3625  		// Maintain a blocking query, retry dropped connections quickly
  3626  		Refresh:        true,
  3627  		RefreshTimer:   0 * time.Second,
  3628  		RefreshTimeout: 10 * time.Minute,
  3629  	})
  3630  
  3631  	a.cache.RegisterType(cachetype.IntentionMatchName, &cachetype.IntentionMatch{
  3632  		RPC: a,
  3633  	}, &cache.RegisterOptions{
  3634  		// Maintain a blocking query, retry dropped connections quickly
  3635  		Refresh:        true,
  3636  		RefreshTimer:   0 * time.Second,
  3637  		RefreshTimeout: 10 * time.Minute,
  3638  	})
  3639  
  3640  	a.cache.RegisterType(cachetype.CatalogServicesName, &cachetype.CatalogServices{
  3641  		RPC: a,
  3642  	}, &cache.RegisterOptions{
  3643  		// Maintain a blocking query, retry dropped connections quickly
  3644  		Refresh:        true,
  3645  		RefreshTimer:   0 * time.Second,
  3646  		RefreshTimeout: 10 * time.Minute,
  3647  	})
  3648  
  3649  	a.cache.RegisterType(cachetype.HealthServicesName, &cachetype.HealthServices{
  3650  		RPC: a,
  3651  	}, &cache.RegisterOptions{
  3652  		// Maintain a blocking query, retry dropped connections quickly
  3653  		Refresh:        true,
  3654  		RefreshTimer:   0 * time.Second,
  3655  		RefreshTimeout: 10 * time.Minute,
  3656  	})
  3657  
  3658  	a.cache.RegisterType(cachetype.PreparedQueryName, &cachetype.PreparedQuery{
  3659  		RPC: a,
  3660  	}, &cache.RegisterOptions{
  3661  		// Prepared queries don't support blocking
  3662  		Refresh: false,
  3663  	})
  3664  
  3665  	a.cache.RegisterType(cachetype.NodeServicesName, &cachetype.NodeServices{
  3666  		RPC: a,
  3667  	}, &cache.RegisterOptions{
  3668  		// Maintain a blocking query, retry dropped connections quickly
  3669  		Refresh:        true,
  3670  		RefreshTimer:   0 * time.Second,
  3671  		RefreshTimeout: 10 * time.Minute,
  3672  	})
  3673  }
  3674  
  3675  // defaultProxyCommand returns the default Connect managed proxy command.
  3676  func defaultProxyCommand(agentCfg *config.RuntimeConfig) ([]string, error) {
  3677  	// Get the path to the current executable. This is cached once by the
  3678  	// library so this is effectively just a variable read.
  3679  	execPath, err := os.Executable()
  3680  	if err != nil {
  3681  		return nil, err
  3682  	}
  3683  
  3684  	// "consul connect proxy" default value for managed daemon proxy
  3685  	cmd := []string{execPath, "connect", "proxy"}
  3686  
  3687  	if agentCfg != nil && agentCfg.LogLevel != "INFO" {
  3688  		cmd = append(cmd, "-log-level", agentCfg.LogLevel)
  3689  	}
  3690  	return cmd, nil
  3691  }