github.com/Iqoqo/consul@v1.4.5/agent/agent.go (about) 1 package agent 2 3 import ( 4 "context" 5 "crypto/sha512" 6 "crypto/tls" 7 "encoding/json" 8 "fmt" 9 "io" 10 "io/ioutil" 11 "log" 12 "net" 13 "net/http" 14 "os" 15 "path/filepath" 16 "strconv" 17 "strings" 18 "sync" 19 "time" 20 21 "google.golang.org/grpc" 22 23 metrics "github.com/armon/go-metrics" 24 "github.com/hashicorp/consul/acl" 25 "github.com/hashicorp/consul/agent/ae" 26 "github.com/hashicorp/consul/agent/cache" 27 cachetype "github.com/hashicorp/consul/agent/cache-types" 28 "github.com/hashicorp/consul/agent/checks" 29 "github.com/hashicorp/consul/agent/config" 30 "github.com/hashicorp/consul/agent/consul" 31 "github.com/hashicorp/consul/agent/local" 32 "github.com/hashicorp/consul/agent/proxycfg" 33 "github.com/hashicorp/consul/agent/proxyprocess" 34 "github.com/hashicorp/consul/agent/structs" 35 "github.com/hashicorp/consul/agent/systemd" 36 "github.com/hashicorp/consul/agent/token" 37 "github.com/hashicorp/consul/agent/xds" 38 "github.com/hashicorp/consul/api" 39 "github.com/hashicorp/consul/ipaddr" 40 "github.com/hashicorp/consul/lib" 41 "github.com/hashicorp/consul/lib/file" 42 "github.com/hashicorp/consul/logger" 43 "github.com/hashicorp/consul/tlsutil" 44 "github.com/hashicorp/consul/types" 45 "github.com/hashicorp/consul/watch" 46 multierror "github.com/hashicorp/go-multierror" 47 uuid "github.com/hashicorp/go-uuid" 48 "github.com/hashicorp/memberlist" 49 "github.com/hashicorp/raft" 50 "github.com/hashicorp/serf/serf" 51 "github.com/shirou/gopsutil/host" 52 "golang.org/x/net/http2" 53 ) 54 55 const ( 56 // Path to save agent service definitions 57 servicesDir = "services" 58 59 // Path to save agent proxy definitions 60 proxyDir = "proxies" 61 62 // Path to save local agent checks 63 checksDir = "checks" 64 checkStateDir = "checks/state" 65 66 // Name of the file tokens will be persisted within 67 tokensPath = "acl-tokens.json" 68 69 // Default reasons for node/service maintenance mode 70 defaultNodeMaintReason = "Maintenance mode is enabled for this node, " + 71 "but no reason was provided. This is a default message." 72 defaultServiceMaintReason = "Maintenance mode is enabled for this " + 73 "service, but no reason was provided. This is a default message." 74 ) 75 76 type configSource int 77 78 const ( 79 ConfigSourceLocal configSource = iota 80 ConfigSourceRemote 81 ) 82 83 // delegate defines the interface shared by both 84 // consul.Client and consul.Server. 85 type delegate interface { 86 Encrypted() bool 87 GetLANCoordinate() (lib.CoordinateSet, error) 88 Leave() error 89 LANMembers() []serf.Member 90 LANMembersAllSegments() ([]serf.Member, error) 91 LANSegmentMembers(segment string) ([]serf.Member, error) 92 LocalMember() serf.Member 93 JoinLAN(addrs []string) (n int, err error) 94 RemoveFailedNode(node string) error 95 ResolveToken(secretID string) (acl.Authorizer, error) 96 RPC(method string, args interface{}, reply interface{}) error 97 ACLsEnabled() bool 98 UseLegacyACLs() bool 99 SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer, replyFn structs.SnapshotReplyFn) error 100 Shutdown() error 101 Stats() map[string]map[string]string 102 ReloadConfig(config *consul.Config) error 103 enterpriseDelegate 104 } 105 106 // notifier is called after a successful JoinLAN. 107 type notifier interface { 108 Notify(string) error 109 } 110 111 // The agent is the long running process that is run on every machine. 112 // It exposes an RPC interface that is used by the CLI to control the 113 // agent. The agent runs the query interfaces like HTTP, DNS, and RPC. 114 // However, it can run in either a client, or server mode. In server 115 // mode, it runs a full Consul server. In client-only mode, it only forwards 116 // requests to other Consul servers. 117 type Agent struct { 118 // config is the agent configuration. 119 config *config.RuntimeConfig 120 121 // Used for writing our logs 122 logger *log.Logger 123 124 // Output sink for logs 125 LogOutput io.Writer 126 127 // Used for streaming logs to 128 LogWriter *logger.LogWriter 129 130 // In-memory sink used for collecting metrics 131 MemSink *metrics.InmemSink 132 133 // delegate is either a *consul.Server or *consul.Client 134 // depending on the configuration 135 delegate delegate 136 137 // aclMasterAuthorizer is an object that helps manage local ACL enforcement. 138 aclMasterAuthorizer acl.Authorizer 139 140 // state stores a local representation of the node, 141 // services and checks. Used for anti-entropy. 142 State *local.State 143 144 // sync manages the synchronization of the local 145 // and the remote state. 146 sync *ae.StateSyncer 147 148 // syncMu and syncCh are used to coordinate agent endpoints that are blocking 149 // on local state during a config reload. 150 syncMu sync.Mutex 151 syncCh chan struct{} 152 153 // cache is the in-memory cache for data the Agent requests. 154 cache *cache.Cache 155 156 // checkReapAfter maps the check ID to a timeout after which we should 157 // reap its associated service 158 checkReapAfter map[types.CheckID]time.Duration 159 160 // checkMonitors maps the check ID to an associated monitor 161 checkMonitors map[types.CheckID]*checks.CheckMonitor 162 163 // checkHTTPs maps the check ID to an associated HTTP check 164 checkHTTPs map[types.CheckID]*checks.CheckHTTP 165 166 // checkTCPs maps the check ID to an associated TCP check 167 checkTCPs map[types.CheckID]*checks.CheckTCP 168 169 // checkGRPCs maps the check ID to an associated GRPC check 170 checkGRPCs map[types.CheckID]*checks.CheckGRPC 171 172 // checkTTLs maps the check ID to an associated check TTL 173 checkTTLs map[types.CheckID]*checks.CheckTTL 174 175 // checkDockers maps the check ID to an associated Docker Exec based check 176 checkDockers map[types.CheckID]*checks.CheckDocker 177 178 // checkAliases maps the check ID to an associated Alias checks 179 checkAliases map[types.CheckID]*checks.CheckAlias 180 181 // stateLock protects the agent state 182 stateLock sync.Mutex 183 184 // dockerClient is the client for performing docker health checks. 185 dockerClient *checks.DockerClient 186 187 // eventCh is used to receive user events 188 eventCh chan serf.UserEvent 189 190 // eventBuf stores the most recent events in a ring buffer 191 // using eventIndex as the next index to insert into. This 192 // is guarded by eventLock. When an insert happens, the 193 // eventNotify group is notified. 194 eventBuf []*UserEvent 195 eventIndex int 196 eventLock sync.RWMutex 197 eventNotify NotifyGroup 198 199 reloadCh chan chan error 200 201 shutdown bool 202 shutdownCh chan struct{} 203 shutdownLock sync.Mutex 204 205 // joinLANNotifier is called after a successful JoinLAN. 206 joinLANNotifier notifier 207 208 // retryJoinCh transports errors from the retry join 209 // attempts. 210 retryJoinCh chan error 211 212 // endpoints maps unique RPC endpoint names to common ones 213 // to allow overriding of RPC handlers since the golang 214 // net/rpc server does not allow this. 215 endpoints map[string]string 216 endpointsLock sync.RWMutex 217 218 // dnsServer provides the DNS API 219 dnsServers []*DNSServer 220 221 // httpServers provides the HTTP API on various endpoints 222 httpServers []*HTTPServer 223 224 // wgServers is the wait group for all HTTP and DNS servers 225 wgServers sync.WaitGroup 226 227 // watchPlans tracks all the currently-running watch plans for the 228 // agent. 229 watchPlans []*watch.Plan 230 231 // tokens holds ACL tokens initially from the configuration, but can 232 // be updated at runtime, so should always be used instead of going to 233 // the configuration directly. 234 tokens *token.Store 235 236 // proxyManager is the proxy process manager for managed Connect proxies. 237 proxyManager *proxyprocess.Manager 238 239 // proxyConfig is the manager for proxy service (Kind = connect-proxy) 240 // configuration state. This ensures all state needed by a proxy registration 241 // is maintained in cache and handles pushing updates to that state into XDS 242 // server to be pushed out to Envoy. This is NOT related to managed proxies 243 // directly. 244 proxyConfig *proxycfg.Manager 245 246 // xdsServer is the Server instance that serves xDS gRPC API. 247 xdsServer *xds.Server 248 249 // grpcServer is the server instance used currently to serve xDS API for 250 // Envoy. 251 grpcServer *grpc.Server 252 253 // tlsConfigurator is the central instance to provide a *tls.Config 254 // based on the current consul configuration. 255 tlsConfigurator *tlsutil.Configurator 256 257 // persistedTokensLock is used to synchronize access to the persisted token 258 // store within the data directory. This will prevent loading while writing as 259 // well as multiple concurrent writes. 260 persistedTokensLock sync.RWMutex 261 } 262 263 func New(c *config.RuntimeConfig) (*Agent, error) { 264 if c.Datacenter == "" { 265 return nil, fmt.Errorf("Must configure a Datacenter") 266 } 267 if c.DataDir == "" && !c.DevMode { 268 return nil, fmt.Errorf("Must configure a DataDir") 269 } 270 271 a := &Agent{ 272 config: c, 273 checkReapAfter: make(map[types.CheckID]time.Duration), 274 checkMonitors: make(map[types.CheckID]*checks.CheckMonitor), 275 checkTTLs: make(map[types.CheckID]*checks.CheckTTL), 276 checkHTTPs: make(map[types.CheckID]*checks.CheckHTTP), 277 checkTCPs: make(map[types.CheckID]*checks.CheckTCP), 278 checkGRPCs: make(map[types.CheckID]*checks.CheckGRPC), 279 checkDockers: make(map[types.CheckID]*checks.CheckDocker), 280 checkAliases: make(map[types.CheckID]*checks.CheckAlias), 281 eventCh: make(chan serf.UserEvent, 1024), 282 eventBuf: make([]*UserEvent, 256), 283 joinLANNotifier: &systemd.Notifier{}, 284 reloadCh: make(chan chan error), 285 retryJoinCh: make(chan error), 286 shutdownCh: make(chan struct{}), 287 endpoints: make(map[string]string), 288 tokens: new(token.Store), 289 } 290 291 if err := a.initializeACLs(); err != nil { 292 return nil, err 293 } 294 295 return a, nil 296 } 297 298 func LocalConfig(cfg *config.RuntimeConfig) local.Config { 299 lc := local.Config{ 300 AdvertiseAddr: cfg.AdvertiseAddrLAN.String(), 301 CheckUpdateInterval: cfg.CheckUpdateInterval, 302 Datacenter: cfg.Datacenter, 303 DiscardCheckOutput: cfg.DiscardCheckOutput, 304 NodeID: cfg.NodeID, 305 NodeName: cfg.NodeName, 306 TaggedAddresses: map[string]string{}, 307 ProxyBindMinPort: cfg.ConnectProxyBindMinPort, 308 ProxyBindMaxPort: cfg.ConnectProxyBindMaxPort, 309 } 310 for k, v := range cfg.TaggedAddresses { 311 lc.TaggedAddresses[k] = v 312 } 313 return lc 314 } 315 316 func (a *Agent) setupProxyManager() error { 317 acfg, err := a.config.APIConfig(true) 318 if err != nil { 319 return fmt.Errorf("[INFO] agent: Connect managed proxies are disabled due to providing an invalid HTTP configuration") 320 } 321 a.proxyManager = proxyprocess.NewManager() 322 a.proxyManager.AllowRoot = a.config.ConnectProxyAllowManagedRoot 323 a.proxyManager.State = a.State 324 a.proxyManager.Logger = a.logger 325 if a.config.DataDir != "" { 326 // DataDir is required for all non-dev mode agents, but we want 327 // to allow setting the data dir for demos and so on for the agent, 328 // so do the check above instead. 329 a.proxyManager.DataDir = filepath.Join(a.config.DataDir, "proxy") 330 331 // Restore from our snapshot (if it exists) 332 if err := a.proxyManager.Restore(a.proxyManager.SnapshotPath()); err != nil { 333 a.logger.Printf("[WARN] agent: error restoring proxy state: %s", err) 334 } 335 } 336 a.proxyManager.ProxyEnv = acfg.GenerateEnv() 337 return nil 338 } 339 340 func (a *Agent) Start() error { 341 a.stateLock.Lock() 342 defer a.stateLock.Unlock() 343 344 c := a.config 345 346 logOutput := a.LogOutput 347 if a.logger == nil { 348 if logOutput == nil { 349 logOutput = os.Stderr 350 } 351 a.logger = log.New(logOutput, "", log.LstdFlags) 352 } 353 354 // Retrieve or generate the node ID before setting up the rest of the 355 // agent, which depends on it. 356 if err := a.setupNodeID(c); err != nil { 357 return fmt.Errorf("Failed to setup node ID: %v", err) 358 } 359 360 // Warn if the node name is incompatible with DNS 361 if InvalidDnsRe.MatchString(a.config.NodeName) { 362 a.logger.Printf("[WARN] agent: Node name %q will not be discoverable "+ 363 "via DNS due to invalid characters. Valid characters include "+ 364 "all alpha-numerics and dashes.", a.config.NodeName) 365 } else if len(a.config.NodeName) > MaxDNSLabelLength { 366 a.logger.Printf("[WARN] agent: Node name %q will not be discoverable "+ 367 "via DNS due to it being too long. Valid lengths are between "+ 368 "1 and 63 bytes.", a.config.NodeName) 369 } 370 371 // load the tokens - this requires the logger to be setup 372 // which is why we can't do this in New 373 a.loadTokens(a.config) 374 375 // create the local state 376 a.State = local.NewState(LocalConfig(c), a.logger, a.tokens) 377 378 // create the state synchronization manager which performs 379 // regular and on-demand state synchronizations (anti-entropy). 380 a.sync = ae.NewStateSyncer(a.State, c.AEInterval, a.shutdownCh, a.logger) 381 382 // create the cache 383 a.cache = cache.New(nil) 384 385 // create the config for the rpc server/client 386 consulCfg, err := a.consulConfig() 387 if err != nil { 388 return err 389 } 390 391 // ServerUp is used to inform that a new consul server is now 392 // up. This can be used to speed up the sync process if we are blocking 393 // waiting to discover a consul server 394 consulCfg.ServerUp = a.sync.SyncFull.Trigger 395 396 tlsConfigurator, err := tlsutil.NewConfigurator(c.ToTLSUtilConfig(), a.logger) 397 if err != nil { 398 return err 399 } 400 a.tlsConfigurator = tlsConfigurator 401 402 // Setup either the client or the server. 403 if c.ServerMode { 404 server, err := consul.NewServerLogger(consulCfg, a.logger, a.tokens, a.tlsConfigurator) 405 if err != nil { 406 return fmt.Errorf("Failed to start Consul server: %v", err) 407 } 408 a.delegate = server 409 } else { 410 client, err := consul.NewClientLogger(consulCfg, a.logger, a.tlsConfigurator) 411 if err != nil { 412 return fmt.Errorf("Failed to start Consul client: %v", err) 413 } 414 a.delegate = client 415 } 416 417 // the staggering of the state syncing depends on the cluster size. 418 a.sync.ClusterSize = func() int { return len(a.delegate.LANMembers()) } 419 420 // link the state with the consul server/client and the state syncer 421 // via callbacks. After several attempts this was easier than using 422 // channels since the event notification needs to be non-blocking 423 // and that should be hidden in the state syncer implementation. 424 a.State.Delegate = a.delegate 425 a.State.TriggerSyncChanges = a.sync.SyncChanges.Trigger 426 427 // Register the cache. We do this much later so the delegate is 428 // populated from above. 429 a.registerCache() 430 431 // Load checks/services/metadata. 432 if err := a.loadServices(c); err != nil { 433 return err 434 } 435 if err := a.loadProxies(c); err != nil { 436 return err 437 } 438 if err := a.loadChecks(c); err != nil { 439 return err 440 } 441 if err := a.loadMetadata(c); err != nil { 442 return err 443 } 444 445 // create the proxy process manager and start it. This is purposely 446 // done here after the local state above is loaded in so we can have 447 // a more accurate initial state view. 448 if !c.ConnectTestDisableManagedProxies { 449 if err := a.setupProxyManager(); err != nil { 450 a.logger.Printf(err.Error()) 451 } else { 452 go a.proxyManager.Run() 453 } 454 } 455 456 // Start the proxy config manager. 457 a.proxyConfig, err = proxycfg.NewManager(proxycfg.ManagerConfig{ 458 Cache: a.cache, 459 Logger: a.logger, 460 State: a.State, 461 Source: &structs.QuerySource{ 462 Node: a.config.NodeName, 463 Datacenter: a.config.Datacenter, 464 Segment: a.config.SegmentName, 465 }, 466 }) 467 if err != nil { 468 return err 469 } 470 go func() { 471 if err := a.proxyConfig.Run(); err != nil { 472 a.logger.Printf("[ERR] Proxy Config Manager exited: %s", err) 473 } 474 }() 475 476 // Start watching for critical services to deregister, based on their 477 // checks. 478 go a.reapServices() 479 480 // Start handling events. 481 go a.handleEvents() 482 483 // Start sending network coordinate to the server. 484 if !c.DisableCoordinates { 485 go a.sendCoordinate() 486 } 487 488 // Write out the PID file if necessary. 489 if err := a.storePid(); err != nil { 490 return err 491 } 492 493 // start DNS servers 494 if err := a.listenAndServeDNS(); err != nil { 495 return err 496 } 497 498 // Create listeners and unstarted servers; see comment on listenHTTP why 499 // we are doing this. 500 servers, err := a.listenHTTP() 501 if err != nil { 502 return err 503 } 504 505 // Start HTTP and HTTPS servers. 506 for _, srv := range servers { 507 if err := a.serveHTTP(srv); err != nil { 508 return err 509 } 510 a.httpServers = append(a.httpServers, srv) 511 } 512 513 // Start gRPC server. 514 if err := a.listenAndServeGRPC(); err != nil { 515 return err 516 } 517 518 // register watches 519 if err := a.reloadWatches(a.config); err != nil { 520 return err 521 } 522 523 // start retry join 524 go a.retryJoinLAN() 525 go a.retryJoinWAN() 526 527 return nil 528 } 529 530 func (a *Agent) listenAndServeGRPC() error { 531 if len(a.config.GRPCAddrs) < 1 { 532 return nil 533 } 534 535 a.xdsServer = &xds.Server{ 536 Logger: a.logger, 537 CfgMgr: a.proxyConfig, 538 Authz: a, 539 ResolveToken: a.resolveToken, 540 } 541 a.xdsServer.Initialize() 542 543 var err error 544 if a.config.HTTPSPort > 0 { 545 // gRPC uses the same TLS settings as the HTTPS API. If HTTPS is 546 // enabled then gRPC will require HTTPS as well. 547 a.grpcServer, err = a.xdsServer.GRPCServer(a.config.CertFile, a.config.KeyFile) 548 } else { 549 a.grpcServer, err = a.xdsServer.GRPCServer("", "") 550 } 551 if err != nil { 552 return err 553 } 554 555 ln, err := a.startListeners(a.config.GRPCAddrs) 556 if err != nil { 557 return err 558 } 559 560 for _, l := range ln { 561 go func(innerL net.Listener) { 562 a.logger.Printf("[INFO] agent: Started gRPC server on %s (%s)", 563 innerL.Addr().String(), innerL.Addr().Network()) 564 err := a.grpcServer.Serve(innerL) 565 if err != nil { 566 a.logger.Printf("[ERR] gRPC server failed: %s", err) 567 } 568 }(l) 569 } 570 return nil 571 } 572 573 func (a *Agent) listenAndServeDNS() error { 574 notif := make(chan net.Addr, len(a.config.DNSAddrs)) 575 errCh := make(chan error, len(a.config.DNSAddrs)) 576 for _, addr := range a.config.DNSAddrs { 577 // create server 578 s, err := NewDNSServer(a) 579 if err != nil { 580 return err 581 } 582 a.dnsServers = append(a.dnsServers, s) 583 584 // start server 585 a.wgServers.Add(1) 586 go func(addr net.Addr) { 587 defer a.wgServers.Done() 588 err := s.ListenAndServe(addr.Network(), addr.String(), func() { notif <- addr }) 589 if err != nil && !strings.Contains(err.Error(), "accept") { 590 errCh <- err 591 } 592 }(addr) 593 } 594 595 // wait for servers to be up 596 timeout := time.After(time.Second) 597 var merr *multierror.Error 598 for range a.config.DNSAddrs { 599 select { 600 case addr := <-notif: 601 a.logger.Printf("[INFO] agent: Started DNS server %s (%s)", addr.String(), addr.Network()) 602 603 case err := <-errCh: 604 merr = multierror.Append(merr, err) 605 case <-timeout: 606 merr = multierror.Append(merr, fmt.Errorf("agent: timeout starting DNS servers")) 607 break 608 } 609 } 610 return merr.ErrorOrNil() 611 } 612 613 func (a *Agent) startListeners(addrs []net.Addr) ([]net.Listener, error) { 614 var ln []net.Listener 615 for _, addr := range addrs { 616 var l net.Listener 617 var err error 618 619 switch x := addr.(type) { 620 case *net.UnixAddr: 621 l, err = a.listenSocket(x.Name) 622 if err != nil { 623 return nil, err 624 } 625 626 case *net.TCPAddr: 627 l, err = net.Listen("tcp", x.String()) 628 if err != nil { 629 return nil, err 630 } 631 l = &tcpKeepAliveListener{l.(*net.TCPListener)} 632 633 default: 634 return nil, fmt.Errorf("unsupported address type %T", addr) 635 } 636 ln = append(ln, l) 637 } 638 return ln, nil 639 } 640 641 // listenHTTP binds listeners to the provided addresses and also returns 642 // pre-configured HTTP servers which are not yet started. The motivation is 643 // that in the current startup/shutdown setup we de-couple the listener 644 // creation from the server startup assuming that if any of the listeners 645 // cannot be bound we fail immediately and later failures do not occur. 646 // Therefore, starting a server with a running listener is assumed to not 647 // produce an error. 648 // 649 // The second motivation is that an HTTPS server needs to use the same TLSConfig 650 // on both the listener and the HTTP server. When listeners and servers are 651 // created at different times this becomes difficult to handle without keeping 652 // the TLS configuration somewhere or recreating it. 653 // 654 // This approach should ultimately be refactored to the point where we just 655 // start the server and any error should trigger a proper shutdown of the agent. 656 func (a *Agent) listenHTTP() ([]*HTTPServer, error) { 657 var ln []net.Listener 658 var servers []*HTTPServer 659 start := func(proto string, addrs []net.Addr) error { 660 listeners, err := a.startListeners(addrs) 661 if err != nil { 662 return err 663 } 664 665 for _, l := range listeners { 666 var tlscfg *tls.Config 667 _, isTCP := l.(*tcpKeepAliveListener) 668 if isTCP && proto == "https" { 669 tlscfg = a.tlsConfigurator.IncomingHTTPSConfig() 670 l = tls.NewListener(l, tlscfg) 671 } 672 srv := &HTTPServer{ 673 Server: &http.Server{ 674 Addr: l.Addr().String(), 675 TLSConfig: tlscfg, 676 }, 677 ln: l, 678 agent: a, 679 blacklist: NewBlacklist(a.config.HTTPBlockEndpoints), 680 proto: proto, 681 } 682 srv.Server.Handler = srv.handler(a.config.EnableDebug) 683 684 // This will enable upgrading connections to HTTP/2 as 685 // part of TLS negotiation. 686 if proto == "https" { 687 err = http2.ConfigureServer(srv.Server, nil) 688 if err != nil { 689 return err 690 } 691 } 692 693 ln = append(ln, l) 694 servers = append(servers, srv) 695 } 696 return nil 697 } 698 699 if err := start("http", a.config.HTTPAddrs); err != nil { 700 for _, l := range ln { 701 l.Close() 702 } 703 return nil, err 704 } 705 if err := start("https", a.config.HTTPSAddrs); err != nil { 706 for _, l := range ln { 707 l.Close() 708 } 709 return nil, err 710 } 711 return servers, nil 712 } 713 714 // tcpKeepAliveListener sets TCP keep-alive timeouts on accepted 715 // connections. It's used so dead TCP connections eventually go away. 716 type tcpKeepAliveListener struct { 717 *net.TCPListener 718 } 719 720 func (ln tcpKeepAliveListener) Accept() (c net.Conn, err error) { 721 tc, err := ln.AcceptTCP() 722 if err != nil { 723 return 724 } 725 tc.SetKeepAlive(true) 726 tc.SetKeepAlivePeriod(30 * time.Second) 727 return tc, nil 728 } 729 730 func (a *Agent) listenSocket(path string) (net.Listener, error) { 731 if _, err := os.Stat(path); !os.IsNotExist(err) { 732 a.logger.Printf("[WARN] agent: Replacing socket %q", path) 733 } 734 if err := os.Remove(path); err != nil && !os.IsNotExist(err) { 735 return nil, fmt.Errorf("error removing socket file: %s", err) 736 } 737 l, err := net.Listen("unix", path) 738 if err != nil { 739 return nil, err 740 } 741 user, group, mode := a.config.UnixSocketUser, a.config.UnixSocketGroup, a.config.UnixSocketMode 742 if err := setFilePermissions(path, user, group, mode); err != nil { 743 return nil, fmt.Errorf("Failed setting up socket: %s", err) 744 } 745 return l, nil 746 } 747 748 func (a *Agent) serveHTTP(srv *HTTPServer) error { 749 // https://github.com/golang/go/issues/20239 750 // 751 // In go.8.1 there is a race between Serve and Shutdown. If 752 // Shutdown is called before the Serve go routine was scheduled then 753 // the Serve go routine never returns. This deadlocks the agent 754 // shutdown for some tests since it will wait forever. 755 notif := make(chan net.Addr) 756 a.wgServers.Add(1) 757 go func() { 758 defer a.wgServers.Done() 759 notif <- srv.ln.Addr() 760 err := srv.Serve(srv.ln) 761 if err != nil && err != http.ErrServerClosed { 762 a.logger.Print(err) 763 } 764 }() 765 766 select { 767 case addr := <-notif: 768 if srv.proto == "https" { 769 a.logger.Printf("[INFO] agent: Started HTTPS server on %s (%s)", addr.String(), addr.Network()) 770 } else { 771 a.logger.Printf("[INFO] agent: Started HTTP server on %s (%s)", addr.String(), addr.Network()) 772 } 773 return nil 774 case <-time.After(time.Second): 775 return fmt.Errorf("agent: timeout starting HTTP servers") 776 } 777 } 778 779 // reloadWatches stops any existing watch plans and attempts to load the given 780 // set of watches. 781 func (a *Agent) reloadWatches(cfg *config.RuntimeConfig) error { 782 // Stop the current watches. 783 for _, wp := range a.watchPlans { 784 wp.Stop() 785 } 786 a.watchPlans = nil 787 788 // Return if there are no watches now. 789 if len(cfg.Watches) == 0 { 790 return nil 791 } 792 793 // Watches use the API to talk to this agent, so that must be enabled. 794 if len(cfg.HTTPAddrs) == 0 && len(cfg.HTTPSAddrs) == 0 { 795 return fmt.Errorf("watch plans require an HTTP or HTTPS endpoint") 796 } 797 798 // Compile the watches 799 var watchPlans []*watch.Plan 800 for _, params := range cfg.Watches { 801 if handlerType, ok := params["handler_type"]; !ok { 802 params["handler_type"] = "script" 803 } else if handlerType != "http" && handlerType != "script" { 804 return fmt.Errorf("Handler type '%s' not recognized", params["handler_type"]) 805 } 806 807 // Don't let people use connect watches via this mechanism for now as it 808 // needs thought about how to do securely and shouldn't be necessary. Note 809 // that if the type assertion fails an type is not a string then 810 // ParseExample below will error so we don't need to handle that case. 811 if typ, ok := params["type"].(string); ok { 812 if strings.HasPrefix(typ, "connect_") { 813 return fmt.Errorf("Watch type %s is not allowed in agent config", typ) 814 } 815 } 816 817 // Parse the watches, excluding 'handler' and 'args' 818 wp, err := watch.ParseExempt(params, []string{"handler", "args"}) 819 if err != nil { 820 return fmt.Errorf("Failed to parse watch (%#v): %v", params, err) 821 } 822 823 // Get the handler and subprocess arguments 824 handler, hasHandler := wp.Exempt["handler"] 825 args, hasArgs := wp.Exempt["args"] 826 if hasHandler { 827 a.logger.Printf("[WARN] agent: The 'handler' field in watches has been deprecated " + 828 "and replaced with the 'args' field. See https://www.consul.io/docs/agent/watches.html") 829 } 830 if _, ok := handler.(string); hasHandler && !ok { 831 return fmt.Errorf("Watch handler must be a string") 832 } 833 if raw, ok := args.([]interface{}); hasArgs && ok { 834 var parsed []string 835 for _, arg := range raw { 836 v, ok := arg.(string) 837 if !ok { 838 return fmt.Errorf("Watch args must be a list of strings") 839 } 840 841 parsed = append(parsed, v) 842 } 843 wp.Exempt["args"] = parsed 844 } else if hasArgs && !ok { 845 return fmt.Errorf("Watch args must be a list of strings") 846 } 847 if hasHandler && hasArgs || hasHandler && wp.HandlerType == "http" || hasArgs && wp.HandlerType == "http" { 848 return fmt.Errorf("Only one watch handler allowed") 849 } 850 if !hasHandler && !hasArgs && wp.HandlerType != "http" { 851 return fmt.Errorf("Must define a watch handler") 852 } 853 854 // Store the watch plan 855 watchPlans = append(watchPlans, wp) 856 } 857 858 // Fire off a goroutine for each new watch plan. 859 for _, wp := range watchPlans { 860 config, err := a.config.APIConfig(true) 861 if err != nil { 862 a.logger.Printf("[ERR] agent: Failed to run watch: %v", err) 863 continue 864 } 865 866 a.watchPlans = append(a.watchPlans, wp) 867 go func(wp *watch.Plan) { 868 if h, ok := wp.Exempt["handler"]; ok { 869 wp.Handler = makeWatchHandler(a.LogOutput, h) 870 } else if h, ok := wp.Exempt["args"]; ok { 871 wp.Handler = makeWatchHandler(a.LogOutput, h) 872 } else { 873 httpConfig := wp.Exempt["http_handler_config"].(*watch.HttpHandlerConfig) 874 wp.Handler = makeHTTPWatchHandler(a.LogOutput, httpConfig) 875 } 876 wp.LogOutput = a.LogOutput 877 878 addr := config.Address 879 if config.Scheme == "https" { 880 addr = "https://" + addr 881 } 882 883 if err := wp.RunWithConfig(addr, config); err != nil { 884 a.logger.Printf("[ERR] agent: Failed to run watch: %v", err) 885 } 886 }(wp) 887 } 888 return nil 889 } 890 891 // consulConfig is used to return a consul configuration 892 func (a *Agent) consulConfig() (*consul.Config, error) { 893 // Start with the provided config or default config 894 base := consul.DefaultConfig() 895 896 // This is set when the agent starts up 897 base.NodeID = a.config.NodeID 898 899 // Apply dev mode 900 base.DevMode = a.config.DevMode 901 902 // Override with our config 903 // todo(fs): these are now always set in the runtime config so we can simplify this 904 // todo(fs): or is there a reason to keep it like that? 905 base.Datacenter = a.config.Datacenter 906 base.PrimaryDatacenter = a.config.PrimaryDatacenter 907 base.DataDir = a.config.DataDir 908 base.NodeName = a.config.NodeName 909 910 base.CoordinateUpdateBatchSize = a.config.ConsulCoordinateUpdateBatchSize 911 base.CoordinateUpdateMaxBatches = a.config.ConsulCoordinateUpdateMaxBatches 912 base.CoordinateUpdatePeriod = a.config.ConsulCoordinateUpdatePeriod 913 914 base.RaftConfig.HeartbeatTimeout = a.config.ConsulRaftHeartbeatTimeout 915 base.RaftConfig.LeaderLeaseTimeout = a.config.ConsulRaftLeaderLeaseTimeout 916 base.RaftConfig.ElectionTimeout = a.config.ConsulRaftElectionTimeout 917 918 base.SerfLANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrLAN.IP.String() 919 base.SerfLANConfig.MemberlistConfig.BindPort = a.config.SerfBindAddrLAN.Port 920 base.SerfLANConfig.MemberlistConfig.AdvertiseAddr = a.config.SerfAdvertiseAddrLAN.IP.String() 921 base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrLAN.Port 922 base.SerfLANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming 923 base.SerfLANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing 924 base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.GossipLANGossipInterval 925 base.SerfLANConfig.MemberlistConfig.GossipNodes = a.config.GossipLANGossipNodes 926 base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.GossipLANProbeInterval 927 base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipLANProbeTimeout 928 base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.GossipLANSuspicionMult 929 base.SerfLANConfig.MemberlistConfig.RetransmitMult = a.config.GossipLANRetransmitMult 930 if a.config.ReconnectTimeoutLAN != 0 { 931 base.SerfLANConfig.ReconnectTimeout = a.config.ReconnectTimeoutLAN 932 } 933 934 if a.config.SerfBindAddrWAN != nil { 935 base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrWAN.IP.String() 936 base.SerfWANConfig.MemberlistConfig.BindPort = a.config.SerfBindAddrWAN.Port 937 base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.SerfAdvertiseAddrWAN.IP.String() 938 base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrWAN.Port 939 base.SerfWANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming 940 base.SerfWANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing 941 base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.GossipWANGossipInterval 942 base.SerfWANConfig.MemberlistConfig.GossipNodes = a.config.GossipWANGossipNodes 943 base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.GossipWANProbeInterval 944 base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipWANProbeTimeout 945 base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.GossipWANSuspicionMult 946 base.SerfWANConfig.MemberlistConfig.RetransmitMult = a.config.GossipWANRetransmitMult 947 if a.config.ReconnectTimeoutWAN != 0 { 948 base.SerfWANConfig.ReconnectTimeout = a.config.ReconnectTimeoutWAN 949 } 950 } else { 951 // Disable serf WAN federation 952 base.SerfWANConfig = nil 953 } 954 955 base.RPCAddr = a.config.RPCBindAddr 956 base.RPCAdvertise = a.config.RPCAdvertiseAddr 957 958 base.Segment = a.config.SegmentName 959 if len(a.config.Segments) > 0 { 960 segments, err := a.segmentConfig() 961 if err != nil { 962 return nil, err 963 } 964 base.Segments = segments 965 } 966 if a.config.Bootstrap { 967 base.Bootstrap = true 968 } 969 if a.config.RejoinAfterLeave { 970 base.RejoinAfterLeave = true 971 } 972 if a.config.BootstrapExpect != 0 { 973 base.BootstrapExpect = a.config.BootstrapExpect 974 } 975 if a.config.RPCProtocol > 0 { 976 base.ProtocolVersion = uint8(a.config.RPCProtocol) 977 } 978 if a.config.RaftProtocol != 0 { 979 base.RaftConfig.ProtocolVersion = raft.ProtocolVersion(a.config.RaftProtocol) 980 } 981 if a.config.RaftSnapshotThreshold != 0 { 982 base.RaftConfig.SnapshotThreshold = uint64(a.config.RaftSnapshotThreshold) 983 } 984 if a.config.RaftSnapshotInterval != 0 { 985 base.RaftConfig.SnapshotInterval = a.config.RaftSnapshotInterval 986 } 987 if a.config.ACLMasterToken != "" { 988 base.ACLMasterToken = a.config.ACLMasterToken 989 } 990 if a.config.ACLDatacenter != "" { 991 base.ACLDatacenter = a.config.ACLDatacenter 992 } 993 if a.config.ACLTokenTTL != 0 { 994 base.ACLTokenTTL = a.config.ACLTokenTTL 995 } 996 if a.config.ACLPolicyTTL != 0 { 997 base.ACLPolicyTTL = a.config.ACLPolicyTTL 998 } 999 if a.config.ACLDefaultPolicy != "" { 1000 base.ACLDefaultPolicy = a.config.ACLDefaultPolicy 1001 } 1002 if a.config.ACLDownPolicy != "" { 1003 base.ACLDownPolicy = a.config.ACLDownPolicy 1004 } 1005 base.ACLEnforceVersion8 = a.config.ACLEnforceVersion8 1006 base.ACLTokenReplication = a.config.ACLTokenReplication 1007 base.ACLsEnabled = a.config.ACLsEnabled 1008 if a.config.ACLEnableKeyListPolicy { 1009 base.ACLEnableKeyListPolicy = a.config.ACLEnableKeyListPolicy 1010 } 1011 if a.config.SessionTTLMin != 0 { 1012 base.SessionTTLMin = a.config.SessionTTLMin 1013 } 1014 if a.config.NonVotingServer { 1015 base.NonVoter = a.config.NonVotingServer 1016 } 1017 1018 // These are fully specified in the agent defaults, so we can simply 1019 // copy them over. 1020 base.AutopilotConfig.CleanupDeadServers = a.config.AutopilotCleanupDeadServers 1021 base.AutopilotConfig.LastContactThreshold = a.config.AutopilotLastContactThreshold 1022 base.AutopilotConfig.MaxTrailingLogs = uint64(a.config.AutopilotMaxTrailingLogs) 1023 base.AutopilotConfig.ServerStabilizationTime = a.config.AutopilotServerStabilizationTime 1024 base.AutopilotConfig.RedundancyZoneTag = a.config.AutopilotRedundancyZoneTag 1025 base.AutopilotConfig.DisableUpgradeMigration = a.config.AutopilotDisableUpgradeMigration 1026 base.AutopilotConfig.UpgradeVersionTag = a.config.AutopilotUpgradeVersionTag 1027 1028 // make sure the advertise address is always set 1029 if base.RPCAdvertise == nil { 1030 base.RPCAdvertise = base.RPCAddr 1031 } 1032 1033 // Rate limiting for RPC calls. 1034 if a.config.RPCRateLimit > 0 { 1035 base.RPCRate = a.config.RPCRateLimit 1036 } 1037 if a.config.RPCMaxBurst > 0 { 1038 base.RPCMaxBurst = a.config.RPCMaxBurst 1039 } 1040 1041 // RPC-related performance configs. 1042 if a.config.RPCHoldTimeout > 0 { 1043 base.RPCHoldTimeout = a.config.RPCHoldTimeout 1044 } 1045 if a.config.LeaveDrainTime > 0 { 1046 base.LeaveDrainTime = a.config.LeaveDrainTime 1047 } 1048 1049 // set the src address for outgoing rpc connections 1050 // Use port 0 so that outgoing connections use a random port. 1051 if !ipaddr.IsAny(base.RPCAddr.IP) { 1052 base.RPCSrcAddr = &net.TCPAddr{IP: base.RPCAddr.IP} 1053 } 1054 1055 // Format the build string 1056 revision := a.config.Revision 1057 if len(revision) > 8 { 1058 revision = revision[:8] 1059 } 1060 base.Build = fmt.Sprintf("%s%s:%s", a.config.Version, a.config.VersionPrerelease, revision) 1061 1062 // Copy the TLS configuration 1063 base.VerifyIncoming = a.config.VerifyIncoming || a.config.VerifyIncomingRPC 1064 if a.config.CAPath != "" || a.config.CAFile != "" { 1065 base.UseTLS = true 1066 } 1067 base.VerifyOutgoing = a.config.VerifyOutgoing 1068 base.VerifyServerHostname = a.config.VerifyServerHostname 1069 base.CAFile = a.config.CAFile 1070 base.CAPath = a.config.CAPath 1071 base.CertFile = a.config.CertFile 1072 base.KeyFile = a.config.KeyFile 1073 base.ServerName = a.config.ServerName 1074 base.Domain = a.config.DNSDomain 1075 base.TLSMinVersion = a.config.TLSMinVersion 1076 base.TLSCipherSuites = a.config.TLSCipherSuites 1077 base.TLSPreferServerCipherSuites = a.config.TLSPreferServerCipherSuites 1078 1079 // Copy the Connect CA bootstrap config 1080 if a.config.ConnectEnabled { 1081 base.ConnectEnabled = true 1082 1083 // Allow config to specify cluster_id provided it's a valid UUID. This is 1084 // meant only for tests where a deterministic ID makes fixtures much simpler 1085 // to work with but since it's only read on initial cluster bootstrap it's not 1086 // that much of a liability in production. The worst a user could do is 1087 // configure logically separate clusters with same ID by mistake but we can 1088 // avoid documenting this is even an option. 1089 if clusterID, ok := a.config.ConnectCAConfig["cluster_id"]; ok { 1090 if cIDStr, ok := clusterID.(string); ok { 1091 if _, err := uuid.ParseUUID(cIDStr); err == nil { 1092 // Valid UUID configured, use that 1093 base.CAConfig.ClusterID = cIDStr 1094 } 1095 } 1096 if base.CAConfig.ClusterID == "" { 1097 // If the tried to specify an ID but typoed it don't ignore as they will 1098 // then bootstrap with a new ID and have to throw away the whole cluster 1099 // and start again. 1100 a.logger.Println("[ERR] connect CA config cluster_id specified but " + 1101 "is not a valid UUID, aborting startup") 1102 return nil, fmt.Errorf("cluster_id was supplied but was not a valid UUID") 1103 } 1104 } 1105 1106 if a.config.ConnectCAProvider != "" { 1107 base.CAConfig.Provider = a.config.ConnectCAProvider 1108 } 1109 1110 // Merge connect CA Config regardless of provider (since there are some 1111 // common config options valid to all like leaf TTL). 1112 for k, v := range a.config.ConnectCAConfig { 1113 base.CAConfig.Config[k] = v 1114 } 1115 } 1116 1117 // Setup the user event callback 1118 base.UserEventHandler = func(e serf.UserEvent) { 1119 select { 1120 case a.eventCh <- e: 1121 case <-a.shutdownCh: 1122 } 1123 } 1124 1125 // Setup the loggers 1126 base.LogOutput = a.LogOutput 1127 1128 // This will set up the LAN keyring, as well as the WAN and any segments 1129 // for servers. 1130 if err := a.setupKeyrings(base); err != nil { 1131 return nil, fmt.Errorf("Failed to configure keyring: %v", err) 1132 } 1133 1134 return base, nil 1135 } 1136 1137 // Setup the serf and memberlist config for any defined network segments. 1138 func (a *Agent) segmentConfig() ([]consul.NetworkSegment, error) { 1139 var segments []consul.NetworkSegment 1140 config := a.config 1141 1142 for _, s := range config.Segments { 1143 serfConf := consul.DefaultConfig().SerfLANConfig 1144 1145 serfConf.MemberlistConfig.BindAddr = s.Bind.IP.String() 1146 serfConf.MemberlistConfig.BindPort = s.Bind.Port 1147 serfConf.MemberlistConfig.AdvertiseAddr = s.Advertise.IP.String() 1148 serfConf.MemberlistConfig.AdvertisePort = s.Advertise.Port 1149 1150 if config.ReconnectTimeoutLAN != 0 { 1151 serfConf.ReconnectTimeout = config.ReconnectTimeoutLAN 1152 } 1153 if config.EncryptVerifyIncoming { 1154 serfConf.MemberlistConfig.GossipVerifyIncoming = config.EncryptVerifyIncoming 1155 } 1156 if config.EncryptVerifyOutgoing { 1157 serfConf.MemberlistConfig.GossipVerifyOutgoing = config.EncryptVerifyOutgoing 1158 } 1159 1160 var rpcAddr *net.TCPAddr 1161 if s.RPCListener { 1162 rpcAddr = &net.TCPAddr{ 1163 IP: s.Bind.IP, 1164 Port: a.config.ServerPort, 1165 } 1166 } 1167 1168 segments = append(segments, consul.NetworkSegment{ 1169 Name: s.Name, 1170 Bind: serfConf.MemberlistConfig.BindAddr, 1171 Advertise: serfConf.MemberlistConfig.AdvertiseAddr, 1172 Port: s.Bind.Port, 1173 RPCAddr: rpcAddr, 1174 SerfConfig: serfConf, 1175 }) 1176 } 1177 1178 return segments, nil 1179 } 1180 1181 // makeRandomID will generate a random UUID for a node. 1182 func (a *Agent) makeRandomID() (string, error) { 1183 id, err := uuid.GenerateUUID() 1184 if err != nil { 1185 return "", err 1186 } 1187 1188 a.logger.Printf("[DEBUG] agent: Using random ID %q as node ID", id) 1189 return id, nil 1190 } 1191 1192 // makeNodeID will try to find a host-specific ID, or else will generate a 1193 // random ID. The returned ID will always be formatted as a GUID. We don't tell 1194 // the caller whether this ID is random or stable since the consequences are 1195 // high for us if this changes, so we will persist it either way. This will let 1196 // gopsutil change implementations without affecting in-place upgrades of nodes. 1197 func (a *Agent) makeNodeID() (string, error) { 1198 // If they've disabled host-based IDs then just make a random one. 1199 if a.config.DisableHostNodeID { 1200 return a.makeRandomID() 1201 } 1202 1203 // Try to get a stable ID associated with the host itself. 1204 info, err := host.Info() 1205 if err != nil { 1206 a.logger.Printf("[DEBUG] agent: Couldn't get a unique ID from the host: %v", err) 1207 return a.makeRandomID() 1208 } 1209 1210 // Make sure the host ID parses as a UUID, since we don't have complete 1211 // control over this process. 1212 id := strings.ToLower(info.HostID) 1213 if _, err := uuid.ParseUUID(id); err != nil { 1214 a.logger.Printf("[DEBUG] agent: Unique ID %q from host isn't formatted as a UUID: %v", 1215 id, err) 1216 return a.makeRandomID() 1217 } 1218 1219 // Hash the input to make it well distributed. The reported Host UUID may be 1220 // similar across nodes if they are on a cloud provider or on motherboards 1221 // created from the same batch. 1222 buf := sha512.Sum512([]byte(id)) 1223 id = fmt.Sprintf("%08x-%04x-%04x-%04x-%12x", 1224 buf[0:4], 1225 buf[4:6], 1226 buf[6:8], 1227 buf[8:10], 1228 buf[10:16]) 1229 1230 a.logger.Printf("[DEBUG] agent: Using unique ID %q from host as node ID", id) 1231 return id, nil 1232 } 1233 1234 // setupNodeID will pull the persisted node ID, if any, or create a random one 1235 // and persist it. 1236 func (a *Agent) setupNodeID(config *config.RuntimeConfig) error { 1237 // If they've configured a node ID manually then just use that, as 1238 // long as it's valid. 1239 if config.NodeID != "" { 1240 config.NodeID = types.NodeID(strings.ToLower(string(config.NodeID))) 1241 if _, err := uuid.ParseUUID(string(config.NodeID)); err != nil { 1242 return err 1243 } 1244 1245 return nil 1246 } 1247 1248 // For dev mode we have no filesystem access so just make one. 1249 if a.config.DataDir == "" { 1250 id, err := a.makeNodeID() 1251 if err != nil { 1252 return err 1253 } 1254 1255 config.NodeID = types.NodeID(id) 1256 return nil 1257 } 1258 1259 // Load saved state, if any. Since a user could edit this, we also 1260 // validate it. 1261 fileID := filepath.Join(config.DataDir, "node-id") 1262 if _, err := os.Stat(fileID); err == nil { 1263 rawID, err := ioutil.ReadFile(fileID) 1264 if err != nil { 1265 return err 1266 } 1267 1268 nodeID := strings.TrimSpace(string(rawID)) 1269 nodeID = strings.ToLower(nodeID) 1270 if _, err := uuid.ParseUUID(nodeID); err != nil { 1271 return err 1272 } 1273 1274 config.NodeID = types.NodeID(nodeID) 1275 } 1276 1277 // If we still don't have a valid node ID, make one. 1278 if config.NodeID == "" { 1279 id, err := a.makeNodeID() 1280 if err != nil { 1281 return err 1282 } 1283 if err := lib.EnsurePath(fileID, false); err != nil { 1284 return err 1285 } 1286 if err := ioutil.WriteFile(fileID, []byte(id), 0600); err != nil { 1287 return err 1288 } 1289 1290 config.NodeID = types.NodeID(id) 1291 } 1292 return nil 1293 } 1294 1295 // setupBaseKeyrings configures the LAN and WAN keyrings. 1296 func (a *Agent) setupBaseKeyrings(config *consul.Config) error { 1297 // If the keyring file is disabled then just poke the provided key 1298 // into the in-memory keyring. 1299 federationEnabled := config.SerfWANConfig != nil 1300 if a.config.DisableKeyringFile { 1301 if a.config.EncryptKey == "" { 1302 return nil 1303 } 1304 1305 keys := []string{a.config.EncryptKey} 1306 if err := loadKeyring(config.SerfLANConfig, keys); err != nil { 1307 return err 1308 } 1309 if a.config.ServerMode && federationEnabled { 1310 if err := loadKeyring(config.SerfWANConfig, keys); err != nil { 1311 return err 1312 } 1313 } 1314 return nil 1315 } 1316 1317 // Otherwise, we need to deal with the keyring files. 1318 fileLAN := filepath.Join(a.config.DataDir, SerfLANKeyring) 1319 fileWAN := filepath.Join(a.config.DataDir, SerfWANKeyring) 1320 1321 if a.config.EncryptKey == "" { 1322 goto LOAD 1323 } 1324 if _, err := os.Stat(fileLAN); err != nil { 1325 if err := initKeyring(fileLAN, a.config.EncryptKey); err != nil { 1326 return err 1327 } 1328 } 1329 if a.config.ServerMode && federationEnabled { 1330 if _, err := os.Stat(fileWAN); err != nil { 1331 if err := initKeyring(fileWAN, a.config.EncryptKey); err != nil { 1332 return err 1333 } 1334 } 1335 } 1336 1337 LOAD: 1338 if _, err := os.Stat(fileLAN); err == nil { 1339 config.SerfLANConfig.KeyringFile = fileLAN 1340 } 1341 if err := loadKeyringFile(config.SerfLANConfig); err != nil { 1342 return err 1343 } 1344 if a.config.ServerMode && federationEnabled { 1345 if _, err := os.Stat(fileWAN); err == nil { 1346 config.SerfWANConfig.KeyringFile = fileWAN 1347 } 1348 if err := loadKeyringFile(config.SerfWANConfig); err != nil { 1349 return err 1350 } 1351 } 1352 1353 return nil 1354 } 1355 1356 // setupKeyrings is used to initialize and load keyrings during agent startup. 1357 func (a *Agent) setupKeyrings(config *consul.Config) error { 1358 // First set up the LAN and WAN keyrings. 1359 if err := a.setupBaseKeyrings(config); err != nil { 1360 return err 1361 } 1362 1363 // If there's no LAN keyring then there's nothing else to set up for 1364 // any segments. 1365 lanKeyring := config.SerfLANConfig.MemberlistConfig.Keyring 1366 if lanKeyring == nil { 1367 return nil 1368 } 1369 1370 // Copy the initial state of the LAN keyring into each segment config. 1371 // Segments don't have their own keyring file, they rely on the LAN 1372 // holding the state so things can't get out of sync. 1373 k, pk := lanKeyring.GetKeys(), lanKeyring.GetPrimaryKey() 1374 for _, segment := range config.Segments { 1375 keyring, err := memberlist.NewKeyring(k, pk) 1376 if err != nil { 1377 return err 1378 } 1379 segment.SerfConfig.MemberlistConfig.Keyring = keyring 1380 } 1381 return nil 1382 } 1383 1384 // registerEndpoint registers a handler for the consul RPC server 1385 // under a unique name while making it accessible under the provided 1386 // name. This allows overwriting handlers for the golang net/rpc 1387 // service which does not allow this. 1388 func (a *Agent) registerEndpoint(name string, handler interface{}) error { 1389 srv, ok := a.delegate.(*consul.Server) 1390 if !ok { 1391 panic("agent must be a server") 1392 } 1393 realname := fmt.Sprintf("%s-%d", name, time.Now().UnixNano()) 1394 a.endpointsLock.Lock() 1395 a.endpoints[name] = realname 1396 a.endpointsLock.Unlock() 1397 return srv.RegisterEndpoint(realname, handler) 1398 } 1399 1400 // RPC is used to make an RPC call to the Consul servers 1401 // This allows the agent to implement the Consul.Interface 1402 func (a *Agent) RPC(method string, args interface{}, reply interface{}) error { 1403 a.endpointsLock.RLock() 1404 // fast path: only translate if there are overrides 1405 if len(a.endpoints) > 0 { 1406 p := strings.SplitN(method, ".", 2) 1407 if e := a.endpoints[p[0]]; e != "" { 1408 method = e + "." + p[1] 1409 } 1410 } 1411 a.endpointsLock.RUnlock() 1412 return a.delegate.RPC(method, args, reply) 1413 } 1414 1415 // SnapshotRPC performs the requested snapshot RPC against the Consul server in 1416 // a streaming manner. The contents of in will be read and passed along as the 1417 // payload, and the response message will determine the error status, and any 1418 // return payload will be written to out. 1419 func (a *Agent) SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer, 1420 replyFn structs.SnapshotReplyFn) error { 1421 return a.delegate.SnapshotRPC(args, in, out, replyFn) 1422 } 1423 1424 // Leave is used to prepare the agent for a graceful shutdown 1425 func (a *Agent) Leave() error { 1426 return a.delegate.Leave() 1427 } 1428 1429 // ShutdownAgent is used to hard stop the agent. Should be preceded by 1430 // Leave to do it gracefully. Should be followed by ShutdownEndpoints to 1431 // terminate the HTTP and DNS servers as well. 1432 func (a *Agent) ShutdownAgent() error { 1433 a.shutdownLock.Lock() 1434 defer a.shutdownLock.Unlock() 1435 1436 if a.shutdown { 1437 return nil 1438 } 1439 a.logger.Println("[INFO] agent: Requesting shutdown") 1440 1441 // Stop all the checks 1442 a.stateLock.Lock() 1443 defer a.stateLock.Unlock() 1444 for _, chk := range a.checkMonitors { 1445 chk.Stop() 1446 } 1447 for _, chk := range a.checkTTLs { 1448 chk.Stop() 1449 } 1450 for _, chk := range a.checkHTTPs { 1451 chk.Stop() 1452 } 1453 for _, chk := range a.checkTCPs { 1454 chk.Stop() 1455 } 1456 for _, chk := range a.checkGRPCs { 1457 chk.Stop() 1458 } 1459 for _, chk := range a.checkDockers { 1460 chk.Stop() 1461 } 1462 for _, chk := range a.checkAliases { 1463 chk.Stop() 1464 } 1465 1466 // Stop gRPC 1467 if a.grpcServer != nil { 1468 a.grpcServer.Stop() 1469 } 1470 1471 // Stop the proxy config manager 1472 if a.proxyConfig != nil { 1473 a.proxyConfig.Close() 1474 } 1475 1476 // Stop the proxy process manager 1477 if a.proxyManager != nil { 1478 // If persistence is disabled (implies DevMode but a subset of DevMode) then 1479 // don't leave the proxies running since the agent will not be able to 1480 // recover them later. 1481 if a.config.DataDir == "" { 1482 a.logger.Printf("[WARN] agent: dev mode disabled persistence, killing " + 1483 "all proxies since we can't recover them") 1484 if err := a.proxyManager.Kill(); err != nil { 1485 a.logger.Printf("[WARN] agent: error shutting down proxy manager: %s", err) 1486 } 1487 } else { 1488 if err := a.proxyManager.Close(); err != nil { 1489 a.logger.Printf("[WARN] agent: error shutting down proxy manager: %s", err) 1490 } 1491 } 1492 } 1493 1494 // Stop the cache background work 1495 if a.cache != nil { 1496 a.cache.Close() 1497 } 1498 1499 var err error 1500 if a.delegate != nil { 1501 err = a.delegate.Shutdown() 1502 if _, ok := a.delegate.(*consul.Server); ok { 1503 a.logger.Print("[INFO] agent: consul server down") 1504 } else { 1505 a.logger.Print("[INFO] agent: consul client down") 1506 } 1507 } 1508 1509 pidErr := a.deletePid() 1510 if pidErr != nil { 1511 a.logger.Println("[WARN] agent: could not delete pid file ", pidErr) 1512 } 1513 1514 a.logger.Println("[INFO] agent: shutdown complete") 1515 a.shutdown = true 1516 close(a.shutdownCh) 1517 return err 1518 } 1519 1520 // ShutdownEndpoints terminates the HTTP and DNS servers. Should be 1521 // preceded by ShutdownAgent. 1522 func (a *Agent) ShutdownEndpoints() { 1523 a.shutdownLock.Lock() 1524 defer a.shutdownLock.Unlock() 1525 1526 if len(a.dnsServers) == 0 && len(a.httpServers) == 0 { 1527 return 1528 } 1529 1530 for _, srv := range a.dnsServers { 1531 a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net) 1532 srv.Shutdown() 1533 } 1534 a.dnsServers = nil 1535 1536 for _, srv := range a.httpServers { 1537 a.logger.Printf("[INFO] agent: Stopping %s server %s (%s)", strings.ToUpper(srv.proto), srv.ln.Addr().String(), srv.ln.Addr().Network()) 1538 ctx, cancel := context.WithTimeout(context.Background(), time.Second) 1539 defer cancel() 1540 srv.Shutdown(ctx) 1541 if ctx.Err() == context.DeadlineExceeded { 1542 a.logger.Printf("[WARN] agent: Timeout stopping %s server %s (%s)", strings.ToUpper(srv.proto), srv.ln.Addr().String(), srv.ln.Addr().Network()) 1543 } 1544 } 1545 a.httpServers = nil 1546 1547 a.logger.Println("[INFO] agent: Waiting for endpoints to shut down") 1548 a.wgServers.Wait() 1549 a.logger.Print("[INFO] agent: Endpoints down") 1550 } 1551 1552 // ReloadCh is used to return a channel that can be 1553 // used for triggering reloads and returning a response. 1554 func (a *Agent) ReloadCh() chan chan error { 1555 return a.reloadCh 1556 } 1557 1558 // RetryJoinCh is a channel that transports errors 1559 // from the retry join process. 1560 func (a *Agent) RetryJoinCh() <-chan error { 1561 return a.retryJoinCh 1562 } 1563 1564 // ShutdownCh is used to return a channel that can be 1565 // selected to wait for the agent to perform a shutdown. 1566 func (a *Agent) ShutdownCh() <-chan struct{} { 1567 return a.shutdownCh 1568 } 1569 1570 // JoinLAN is used to have the agent join a LAN cluster 1571 func (a *Agent) JoinLAN(addrs []string) (n int, err error) { 1572 a.logger.Printf("[INFO] agent: (LAN) joining: %v", addrs) 1573 n, err = a.delegate.JoinLAN(addrs) 1574 a.logger.Printf("[INFO] agent: (LAN) joined: %d Err: %v", n, err) 1575 if err == nil && a.joinLANNotifier != nil { 1576 if notifErr := a.joinLANNotifier.Notify(systemd.Ready); notifErr != nil { 1577 a.logger.Printf("[DEBUG] agent: systemd notify failed: %v", notifErr) 1578 } 1579 } 1580 return 1581 } 1582 1583 // JoinWAN is used to have the agent join a WAN cluster 1584 func (a *Agent) JoinWAN(addrs []string) (n int, err error) { 1585 a.logger.Printf("[INFO] agent: (WAN) joining: %v", addrs) 1586 if srv, ok := a.delegate.(*consul.Server); ok { 1587 n, err = srv.JoinWAN(addrs) 1588 } else { 1589 err = fmt.Errorf("Must be a server to join WAN cluster") 1590 } 1591 a.logger.Printf("[INFO] agent: (WAN) joined: %d Err: %v", n, err) 1592 return 1593 } 1594 1595 // ForceLeave is used to remove a failed node from the cluster 1596 func (a *Agent) ForceLeave(node string) (err error) { 1597 a.logger.Printf("[INFO] agent: Force leaving node: %v", node) 1598 err = a.delegate.RemoveFailedNode(node) 1599 if err != nil { 1600 a.logger.Printf("[WARN] agent: Failed to remove node: %v", err) 1601 } 1602 return err 1603 } 1604 1605 // LocalMember is used to return the local node 1606 func (a *Agent) LocalMember() serf.Member { 1607 return a.delegate.LocalMember() 1608 } 1609 1610 // LANMembers is used to retrieve the LAN members 1611 func (a *Agent) LANMembers() []serf.Member { 1612 return a.delegate.LANMembers() 1613 } 1614 1615 // WANMembers is used to retrieve the WAN members 1616 func (a *Agent) WANMembers() []serf.Member { 1617 if srv, ok := a.delegate.(*consul.Server); ok { 1618 return srv.WANMembers() 1619 } 1620 return nil 1621 } 1622 1623 // StartSync is called once Services and Checks are registered. 1624 // This is called to prevent a race between clients and the anti-entropy routines 1625 func (a *Agent) StartSync() { 1626 go a.sync.Run() 1627 a.logger.Printf("[INFO] agent: started state syncer") 1628 } 1629 1630 // PauseSync is used to pause anti-entropy while bulk changes are made. It also 1631 // sets state that agent-local watches use to "ride out" config reloads and bulk 1632 // updates which might spuriously unload state and reload it again. 1633 func (a *Agent) PauseSync() { 1634 // Do this outside of lock as it has it's own locking 1635 a.sync.Pause() 1636 1637 // Coordinate local state watchers 1638 a.syncMu.Lock() 1639 defer a.syncMu.Unlock() 1640 if a.syncCh == nil { 1641 a.syncCh = make(chan struct{}) 1642 } 1643 } 1644 1645 // ResumeSync is used to unpause anti-entropy after bulk changes are make 1646 func (a *Agent) ResumeSync() { 1647 // a.sync maintains a stack/ref count of Pause calls since we call 1648 // Pause/Resume in nested way during a reload and AddService. We only want to 1649 // trigger local state watchers if this Resume call actually started sync back 1650 // up again (i.e. was the last resume on the stack). We could check that 1651 // separately with a.sync.Paused but that is racey since another Pause call 1652 // might be made between our Resume and checking Paused. 1653 resumed := a.sync.Resume() 1654 1655 if !resumed { 1656 // Return early so we don't notify local watchers until we are actually 1657 // resumed. 1658 return 1659 } 1660 1661 // Coordinate local state watchers 1662 a.syncMu.Lock() 1663 defer a.syncMu.Unlock() 1664 1665 if a.syncCh != nil { 1666 close(a.syncCh) 1667 a.syncCh = nil 1668 } 1669 } 1670 1671 // syncPausedCh returns either a channel or nil. If nil sync is not paused. If 1672 // non-nil, the channel will be closed when sync resumes. 1673 func (a *Agent) syncPausedCh() <-chan struct{} { 1674 a.syncMu.Lock() 1675 defer a.syncMu.Unlock() 1676 return a.syncCh 1677 } 1678 1679 // GetLANCoordinate returns the coordinates of this node in the local pools 1680 // (assumes coordinates are enabled, so check that before calling). 1681 func (a *Agent) GetLANCoordinate() (lib.CoordinateSet, error) { 1682 return a.delegate.GetLANCoordinate() 1683 } 1684 1685 // sendCoordinate is a long-running loop that periodically sends our coordinate 1686 // to the server. Closing the agent's shutdownChannel will cause this to exit. 1687 func (a *Agent) sendCoordinate() { 1688 OUTER: 1689 for { 1690 rate := a.config.SyncCoordinateRateTarget 1691 min := a.config.SyncCoordinateIntervalMin 1692 intv := lib.RateScaledInterval(rate, min, len(a.LANMembers())) 1693 intv = intv + lib.RandomStagger(intv) 1694 1695 select { 1696 case <-time.After(intv): 1697 members := a.LANMembers() 1698 grok, err := consul.CanServersUnderstandProtocol(members, 3) 1699 if err != nil { 1700 a.logger.Printf("[ERR] agent: Failed to check servers: %s", err) 1701 continue 1702 } 1703 if !grok { 1704 a.logger.Printf("[DEBUG] agent: Skipping coordinate updates until servers are upgraded") 1705 continue 1706 } 1707 1708 cs, err := a.GetLANCoordinate() 1709 if err != nil { 1710 a.logger.Printf("[ERR] agent: Failed to get coordinate: %s", err) 1711 continue 1712 } 1713 1714 for segment, coord := range cs { 1715 req := structs.CoordinateUpdateRequest{ 1716 Datacenter: a.config.Datacenter, 1717 Node: a.config.NodeName, 1718 Segment: segment, 1719 Coord: coord, 1720 WriteRequest: structs.WriteRequest{Token: a.tokens.AgentToken()}, 1721 } 1722 var reply struct{} 1723 if err := a.RPC("Coordinate.Update", &req, &reply); err != nil { 1724 if acl.IsErrPermissionDenied(err) { 1725 a.logger.Printf("[WARN] agent: Coordinate update blocked by ACLs") 1726 } else { 1727 a.logger.Printf("[ERR] agent: Coordinate update error: %v", err) 1728 } 1729 continue OUTER 1730 } 1731 } 1732 case <-a.shutdownCh: 1733 return 1734 } 1735 } 1736 } 1737 1738 // reapServicesInternal does a single pass, looking for services to reap. 1739 func (a *Agent) reapServicesInternal() { 1740 reaped := make(map[string]bool) 1741 for checkID, cs := range a.State.CriticalCheckStates() { 1742 serviceID := cs.Check.ServiceID 1743 1744 // There's nothing to do if there's no service. 1745 if serviceID == "" { 1746 continue 1747 } 1748 1749 // There might be multiple checks for one service, so 1750 // we don't need to reap multiple times. 1751 if reaped[serviceID] { 1752 continue 1753 } 1754 1755 // See if there's a timeout. 1756 // todo(fs): this looks fishy... why is there another data structure in the agent with its own lock? 1757 a.stateLock.Lock() 1758 timeout := a.checkReapAfter[checkID] 1759 a.stateLock.Unlock() 1760 1761 // Reap, if necessary. We keep track of which service 1762 // this is so that we won't try to remove it again. 1763 if timeout > 0 && cs.CriticalFor() > timeout { 1764 reaped[serviceID] = true 1765 if err := a.RemoveService(serviceID, true); err != nil { 1766 a.logger.Printf("[ERR] agent: unable to deregister service %q after check %q has been critical for too long: %s", 1767 serviceID, checkID, err) 1768 } else { 1769 a.logger.Printf("[INFO] agent: Check %q for service %q has been critical for too long; deregistered service", 1770 checkID, serviceID) 1771 } 1772 } 1773 } 1774 } 1775 1776 // reapServices is a long running goroutine that looks for checks that have been 1777 // critical too long and deregisters their associated services. 1778 func (a *Agent) reapServices() { 1779 for { 1780 select { 1781 case <-time.After(a.config.CheckReapInterval): 1782 a.reapServicesInternal() 1783 1784 case <-a.shutdownCh: 1785 return 1786 } 1787 } 1788 1789 } 1790 1791 // persistedService is used to wrap a service definition and bundle it 1792 // with an ACL token so we can restore both at a later agent start. 1793 type persistedService struct { 1794 Token string 1795 Service *structs.NodeService 1796 } 1797 1798 // persistService saves a service definition to a JSON file in the data dir 1799 func (a *Agent) persistService(service *structs.NodeService) error { 1800 svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(service.ID)) 1801 1802 wrapped := persistedService{ 1803 Token: a.State.ServiceToken(service.ID), 1804 Service: service, 1805 } 1806 encoded, err := json.Marshal(wrapped) 1807 if err != nil { 1808 return err 1809 } 1810 1811 return file.WriteAtomic(svcPath, encoded) 1812 } 1813 1814 // purgeService removes a persisted service definition file from the data dir 1815 func (a *Agent) purgeService(serviceID string) error { 1816 svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(serviceID)) 1817 if _, err := os.Stat(svcPath); err == nil { 1818 return os.Remove(svcPath) 1819 } 1820 return nil 1821 } 1822 1823 // persistedProxy is used to wrap a proxy definition and bundle it with an Proxy 1824 // token so we can continue to authenticate the running proxy after a restart. 1825 type persistedProxy struct { 1826 ProxyToken string 1827 Proxy *structs.ConnectManagedProxy 1828 1829 // Set to true when the proxy information originated from the agents configuration 1830 // as opposed to API registration. 1831 FromFile bool 1832 } 1833 1834 // persistProxy saves a proxy definition to a JSON file in the data dir 1835 func (a *Agent) persistProxy(proxy *local.ManagedProxy, FromFile bool) error { 1836 proxyPath := filepath.Join(a.config.DataDir, proxyDir, 1837 stringHash(proxy.Proxy.ProxyService.ID)) 1838 1839 wrapped := persistedProxy{ 1840 ProxyToken: proxy.ProxyToken, 1841 Proxy: proxy.Proxy, 1842 FromFile: FromFile, 1843 } 1844 encoded, err := json.Marshal(wrapped) 1845 if err != nil { 1846 return err 1847 } 1848 1849 return file.WriteAtomic(proxyPath, encoded) 1850 } 1851 1852 // purgeProxy removes a persisted proxy definition file from the data dir 1853 func (a *Agent) purgeProxy(proxyID string) error { 1854 proxyPath := filepath.Join(a.config.DataDir, proxyDir, stringHash(proxyID)) 1855 if _, err := os.Stat(proxyPath); err == nil { 1856 return os.Remove(proxyPath) 1857 } 1858 return nil 1859 } 1860 1861 // persistCheck saves a check definition to the local agent's state directory 1862 func (a *Agent) persistCheck(check *structs.HealthCheck, chkType *structs.CheckType) error { 1863 checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(check.CheckID)) 1864 1865 // Create the persisted check 1866 wrapped := persistedCheck{ 1867 Check: check, 1868 ChkType: chkType, 1869 Token: a.State.CheckToken(check.CheckID), 1870 } 1871 1872 encoded, err := json.Marshal(wrapped) 1873 if err != nil { 1874 return err 1875 } 1876 1877 return file.WriteAtomic(checkPath, encoded) 1878 } 1879 1880 // purgeCheck removes a persisted check definition file from the data dir 1881 func (a *Agent) purgeCheck(checkID types.CheckID) error { 1882 checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(checkID)) 1883 if _, err := os.Stat(checkPath); err == nil { 1884 return os.Remove(checkPath) 1885 } 1886 return nil 1887 } 1888 1889 // AddService is used to add a service entry. 1890 // This entry is persistent and the agent will make a best effort to 1891 // ensure it is registered 1892 func (a *Agent) AddService(service *structs.NodeService, chkTypes []*structs.CheckType, persist bool, token string, source configSource) error { 1893 a.stateLock.Lock() 1894 defer a.stateLock.Unlock() 1895 return a.addServiceLocked(service, chkTypes, persist, token, source) 1896 } 1897 1898 func (a *Agent) addServiceLocked(service *structs.NodeService, chkTypes []*structs.CheckType, persist bool, token string, source configSource) error { 1899 if service.Service == "" { 1900 return fmt.Errorf("Service name missing") 1901 } 1902 if service.ID == "" && service.Service != "" { 1903 service.ID = service.Service 1904 } 1905 for _, check := range chkTypes { 1906 if err := check.Validate(); err != nil { 1907 return fmt.Errorf("Check is not valid: %v", err) 1908 } 1909 } 1910 1911 // Set default weights if not specified. This is important as it ensures AE 1912 // doesn't consider the service different since it has nil weights. 1913 if service.Weights == nil { 1914 service.Weights = &structs.Weights{Passing: 1, Warning: 1} 1915 } 1916 1917 // Warn if the service name is incompatible with DNS 1918 if InvalidDnsRe.MatchString(service.Service) { 1919 a.logger.Printf("[WARN] agent: Service name %q will not be discoverable "+ 1920 "via DNS due to invalid characters. Valid characters include "+ 1921 "all alpha-numerics and dashes.", service.Service) 1922 } else if len(service.Service) > MaxDNSLabelLength { 1923 a.logger.Printf("[WARN] agent: Service name %q will not be discoverable "+ 1924 "via DNS due to it being too long. Valid lengths are between "+ 1925 "1 and 63 bytes.", service.Service) 1926 } 1927 1928 // Warn if any tags are incompatible with DNS 1929 for _, tag := range service.Tags { 1930 if InvalidDnsRe.MatchString(tag) { 1931 a.logger.Printf("[DEBUG] agent: Service tag %q will not be discoverable "+ 1932 "via DNS due to invalid characters. Valid characters include "+ 1933 "all alpha-numerics and dashes.", tag) 1934 } else if len(tag) > MaxDNSLabelLength { 1935 a.logger.Printf("[DEBUG] agent: Service tag %q will not be discoverable "+ 1936 "via DNS due to it being too long. Valid lengths are between "+ 1937 "1 and 63 bytes.", tag) 1938 } 1939 } 1940 1941 // Pause the service syncs during modification 1942 a.PauseSync() 1943 defer a.ResumeSync() 1944 1945 // Take a snapshot of the current state of checks (if any), and 1946 // restore them before resuming anti-entropy. 1947 snap := a.snapshotCheckState() 1948 defer a.restoreCheckState(snap) 1949 1950 var checks []*structs.HealthCheck 1951 1952 // Create an associated health check 1953 for i, chkType := range chkTypes { 1954 checkID := string(chkType.CheckID) 1955 if checkID == "" { 1956 checkID = fmt.Sprintf("service:%s", service.ID) 1957 if len(chkTypes) > 1 { 1958 checkID += fmt.Sprintf(":%d", i+1) 1959 } 1960 } 1961 name := chkType.Name 1962 if name == "" { 1963 name = fmt.Sprintf("Service '%s' check", service.Service) 1964 } 1965 check := &structs.HealthCheck{ 1966 Node: a.config.NodeName, 1967 CheckID: types.CheckID(checkID), 1968 Name: name, 1969 Status: api.HealthCritical, 1970 Notes: chkType.Notes, 1971 ServiceID: service.ID, 1972 ServiceName: service.Service, 1973 ServiceTags: service.Tags, 1974 } 1975 if chkType.Status != "" { 1976 check.Status = chkType.Status 1977 } 1978 1979 checks = append(checks, check) 1980 } 1981 1982 // cleanup, store the ids of services and checks that weren't previously 1983 // registered so we clean them up if somthing fails halfway through the 1984 // process. 1985 var cleanupServices []string 1986 var cleanupChecks []types.CheckID 1987 1988 if s := a.State.Service(service.ID); s == nil { 1989 cleanupServices = append(cleanupServices, service.ID) 1990 } 1991 1992 for _, check := range checks { 1993 if c := a.State.Check(check.CheckID); c == nil { 1994 cleanupChecks = append(cleanupChecks, check.CheckID) 1995 } 1996 } 1997 1998 err := a.State.AddServiceWithChecks(service, checks, token) 1999 if err != nil { 2000 a.cleanupRegistration(cleanupServices, cleanupChecks) 2001 return err 2002 } 2003 2004 for i := range checks { 2005 if err := a.addCheck(checks[i], chkTypes[i], service, persist, token, source); err != nil { 2006 a.cleanupRegistration(cleanupServices, cleanupChecks) 2007 return err 2008 } 2009 2010 if persist && a.config.DataDir != "" { 2011 if err := a.persistCheck(checks[i], chkTypes[i]); err != nil { 2012 a.cleanupRegistration(cleanupServices, cleanupChecks) 2013 return err 2014 2015 } 2016 } 2017 } 2018 2019 // Persist the service to a file 2020 if persist && a.config.DataDir != "" { 2021 if err := a.persistService(service); err != nil { 2022 a.cleanupRegistration(cleanupServices, cleanupChecks) 2023 return err 2024 } 2025 } 2026 2027 return nil 2028 } 2029 2030 // cleanupRegistration is called on registration error to ensure no there are no 2031 // leftovers after a partial failure 2032 func (a *Agent) cleanupRegistration(serviceIDs []string, checksIDs []types.CheckID) { 2033 for _, s := range serviceIDs { 2034 if err := a.State.RemoveService(s); err != nil { 2035 a.logger.Printf("[ERR] consul: service registration: cleanup: failed to remove service %s: %s", s, err) 2036 } 2037 if err := a.purgeService(s); err != nil { 2038 a.logger.Printf("[ERR] consul: service registration: cleanup: failed to purge service %s file: %s", s, err) 2039 } 2040 } 2041 2042 for _, c := range checksIDs { 2043 a.cancelCheckMonitors(c) 2044 if err := a.State.RemoveCheck(c); err != nil { 2045 a.logger.Printf("[ERR] consul: service registration: cleanup: failed to remove check %s: %s", c, err) 2046 } 2047 if err := a.purgeCheck(c); err != nil { 2048 a.logger.Printf("[ERR] consul: service registration: cleanup: failed to purge check %s file: %s", c, err) 2049 } 2050 } 2051 } 2052 2053 // RemoveService is used to remove a service entry. 2054 // The agent will make a best effort to ensure it is deregistered 2055 func (a *Agent) RemoveService(serviceID string, persist bool) error { 2056 a.stateLock.Lock() 2057 defer a.stateLock.Unlock() 2058 return a.removeServiceLocked(serviceID, persist) 2059 } 2060 2061 // removeServiceLocked is used to remove a service entry. 2062 // The agent will make a best effort to ensure it is deregistered 2063 func (a *Agent) removeServiceLocked(serviceID string, persist bool) error { 2064 // Validate ServiceID 2065 if serviceID == "" { 2066 return fmt.Errorf("ServiceID missing") 2067 } 2068 2069 checks := a.State.Checks() 2070 var checkIDs []types.CheckID 2071 for id, check := range checks { 2072 if check.ServiceID != serviceID { 2073 continue 2074 } 2075 checkIDs = append(checkIDs, id) 2076 } 2077 2078 // Remove the associated managed proxy if it exists 2079 // This has to be DONE before purging configuration as might might have issues 2080 // With ACLs otherwise 2081 for proxyID, p := range a.State.Proxies() { 2082 if p.Proxy.TargetServiceID == serviceID { 2083 if err := a.removeProxyLocked(proxyID, true); err != nil { 2084 return err 2085 } 2086 } 2087 } 2088 2089 // Remove service immediately 2090 if err := a.State.RemoveServiceWithChecks(serviceID, checkIDs); err != nil { 2091 a.logger.Printf("[WARN] agent: Failed to deregister service %q: %s", serviceID, err) 2092 return nil 2093 } 2094 2095 // Remove the service from the data dir 2096 if persist { 2097 if err := a.purgeService(serviceID); err != nil { 2098 return err 2099 } 2100 } 2101 2102 // Deregister any associated health checks 2103 for checkID, check := range checks { 2104 if check.ServiceID != serviceID { 2105 continue 2106 } 2107 if err := a.removeCheckLocked(checkID, persist); err != nil { 2108 return err 2109 } 2110 } 2111 2112 a.logger.Printf("[DEBUG] agent: removed service %q", serviceID) 2113 2114 // If any Sidecar services exist for the removed service ID, remove them too. 2115 if sidecar := a.State.Service(a.sidecarServiceID(serviceID)); sidecar != nil { 2116 // Double check that it's not just an ID collision and we actually added 2117 // this from a sidecar. 2118 if sidecar.LocallyRegisteredAsSidecar { 2119 // Remove it! 2120 err := a.removeServiceLocked(a.sidecarServiceID(serviceID), persist) 2121 if err != nil { 2122 return err 2123 } 2124 } 2125 } 2126 2127 return nil 2128 } 2129 2130 // AddCheck is used to add a health check to the agent. 2131 // This entry is persistent and the agent will make a best effort to 2132 // ensure it is registered. The Check may include a CheckType which 2133 // is used to automatically update the check status 2134 func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, persist bool, token string, source configSource) error { 2135 a.stateLock.Lock() 2136 defer a.stateLock.Unlock() 2137 return a.addCheckLocked(check, chkType, persist, token, source) 2138 } 2139 2140 func (a *Agent) addCheckLocked(check *structs.HealthCheck, chkType *structs.CheckType, persist bool, token string, source configSource) error { 2141 var service *structs.NodeService 2142 2143 if check.ServiceID != "" { 2144 service = a.State.Service(check.ServiceID) 2145 if service == nil { 2146 return fmt.Errorf("ServiceID %q does not exist", check.ServiceID) 2147 } 2148 } 2149 2150 // snapshot the current state of the health check to avoid potential flapping 2151 existing := a.State.Check(check.CheckID) 2152 defer func() { 2153 if existing != nil { 2154 a.State.UpdateCheck(check.CheckID, existing.Status, existing.Output) 2155 } 2156 }() 2157 2158 err := a.addCheck(check, chkType, service, persist, token, source) 2159 if err != nil { 2160 a.State.RemoveCheck(check.CheckID) 2161 return err 2162 } 2163 2164 // Add to the local state for anti-entropy 2165 err = a.State.AddCheck(check, token) 2166 if err != nil { 2167 return err 2168 } 2169 2170 // Persist the check 2171 if persist && a.config.DataDir != "" { 2172 return a.persistCheck(check, chkType) 2173 } 2174 2175 return nil 2176 } 2177 2178 func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType, service *structs.NodeService, persist bool, token string, source configSource) error { 2179 if check.CheckID == "" { 2180 return fmt.Errorf("CheckID missing") 2181 } 2182 2183 if chkType != nil { 2184 if err := chkType.Validate(); err != nil { 2185 return fmt.Errorf("Check is not valid: %v", err) 2186 } 2187 2188 if chkType.IsScript() { 2189 if source == ConfigSourceLocal && !a.config.EnableLocalScriptChecks { 2190 return fmt.Errorf("Scripts are disabled on this agent; to enable, configure 'enable_script_checks' or 'enable_local_script_checks' to true") 2191 } 2192 2193 if source == ConfigSourceRemote && !a.config.EnableRemoteScriptChecks { 2194 return fmt.Errorf("Scripts are disabled on this agent from remote calls; to enable, configure 'enable_script_checks' to true") 2195 } 2196 } 2197 } 2198 2199 if check.ServiceID != "" { 2200 check.ServiceName = service.Service 2201 check.ServiceTags = service.Tags 2202 } 2203 2204 // Check if already registered 2205 if chkType != nil { 2206 switch { 2207 2208 case chkType.IsTTL(): 2209 if existing, ok := a.checkTTLs[check.CheckID]; ok { 2210 existing.Stop() 2211 delete(a.checkTTLs, check.CheckID) 2212 } 2213 2214 ttl := &checks.CheckTTL{ 2215 Notify: a.State, 2216 CheckID: check.CheckID, 2217 TTL: chkType.TTL, 2218 Logger: a.logger, 2219 } 2220 2221 // Restore persisted state, if any 2222 if err := a.loadCheckState(check); err != nil { 2223 a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s", 2224 check.CheckID, err) 2225 } 2226 2227 ttl.Start() 2228 a.checkTTLs[check.CheckID] = ttl 2229 2230 case chkType.IsHTTP(): 2231 if existing, ok := a.checkHTTPs[check.CheckID]; ok { 2232 existing.Stop() 2233 delete(a.checkHTTPs, check.CheckID) 2234 } 2235 if chkType.Interval < checks.MinInterval { 2236 a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", 2237 check.CheckID, checks.MinInterval)) 2238 chkType.Interval = checks.MinInterval 2239 } 2240 2241 tlsClientConfig := a.tlsConfigurator.OutgoingTLSConfigForCheck(chkType.TLSSkipVerify) 2242 2243 http := &checks.CheckHTTP{ 2244 Notify: a.State, 2245 CheckID: check.CheckID, 2246 HTTP: chkType.HTTP, 2247 Header: chkType.Header, 2248 Method: chkType.Method, 2249 Interval: chkType.Interval, 2250 Timeout: chkType.Timeout, 2251 Logger: a.logger, 2252 TLSClientConfig: tlsClientConfig, 2253 } 2254 http.Start() 2255 a.checkHTTPs[check.CheckID] = http 2256 2257 case chkType.IsTCP(): 2258 if existing, ok := a.checkTCPs[check.CheckID]; ok { 2259 existing.Stop() 2260 delete(a.checkTCPs, check.CheckID) 2261 } 2262 if chkType.Interval < checks.MinInterval { 2263 a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", 2264 check.CheckID, checks.MinInterval)) 2265 chkType.Interval = checks.MinInterval 2266 } 2267 2268 tcp := &checks.CheckTCP{ 2269 Notify: a.State, 2270 CheckID: check.CheckID, 2271 TCP: chkType.TCP, 2272 Interval: chkType.Interval, 2273 Timeout: chkType.Timeout, 2274 Logger: a.logger, 2275 } 2276 tcp.Start() 2277 a.checkTCPs[check.CheckID] = tcp 2278 2279 case chkType.IsGRPC(): 2280 if existing, ok := a.checkGRPCs[check.CheckID]; ok { 2281 existing.Stop() 2282 delete(a.checkGRPCs, check.CheckID) 2283 } 2284 if chkType.Interval < checks.MinInterval { 2285 a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", 2286 check.CheckID, checks.MinInterval)) 2287 chkType.Interval = checks.MinInterval 2288 } 2289 2290 var tlsClientConfig *tls.Config 2291 if chkType.GRPCUseTLS { 2292 tlsClientConfig = a.tlsConfigurator.OutgoingTLSConfigForCheck(chkType.TLSSkipVerify) 2293 } 2294 2295 grpc := &checks.CheckGRPC{ 2296 Notify: a.State, 2297 CheckID: check.CheckID, 2298 GRPC: chkType.GRPC, 2299 Interval: chkType.Interval, 2300 Timeout: chkType.Timeout, 2301 Logger: a.logger, 2302 TLSClientConfig: tlsClientConfig, 2303 } 2304 grpc.Start() 2305 a.checkGRPCs[check.CheckID] = grpc 2306 2307 case chkType.IsDocker(): 2308 if existing, ok := a.checkDockers[check.CheckID]; ok { 2309 existing.Stop() 2310 delete(a.checkDockers, check.CheckID) 2311 } 2312 if chkType.Interval < checks.MinInterval { 2313 a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", 2314 check.CheckID, checks.MinInterval)) 2315 chkType.Interval = checks.MinInterval 2316 } 2317 2318 if a.dockerClient == nil { 2319 dc, err := checks.NewDockerClient(os.Getenv("DOCKER_HOST"), checks.BufSize) 2320 if err != nil { 2321 a.logger.Printf("[ERR] agent: error creating docker client: %s", err) 2322 return err 2323 } 2324 a.logger.Printf("[DEBUG] agent: created docker client for %s", dc.Host()) 2325 a.dockerClient = dc 2326 } 2327 2328 dockerCheck := &checks.CheckDocker{ 2329 Notify: a.State, 2330 CheckID: check.CheckID, 2331 DockerContainerID: chkType.DockerContainerID, 2332 Shell: chkType.Shell, 2333 ScriptArgs: chkType.ScriptArgs, 2334 Interval: chkType.Interval, 2335 Logger: a.logger, 2336 Client: a.dockerClient, 2337 } 2338 if prev := a.checkDockers[check.CheckID]; prev != nil { 2339 prev.Stop() 2340 } 2341 dockerCheck.Start() 2342 a.checkDockers[check.CheckID] = dockerCheck 2343 2344 case chkType.IsMonitor(): 2345 if existing, ok := a.checkMonitors[check.CheckID]; ok { 2346 existing.Stop() 2347 delete(a.checkMonitors, check.CheckID) 2348 } 2349 if chkType.Interval < checks.MinInterval { 2350 a.logger.Printf("[WARN] agent: check '%s' has interval below minimum of %v", 2351 check.CheckID, checks.MinInterval) 2352 chkType.Interval = checks.MinInterval 2353 } 2354 2355 monitor := &checks.CheckMonitor{ 2356 Notify: a.State, 2357 CheckID: check.CheckID, 2358 ScriptArgs: chkType.ScriptArgs, 2359 Interval: chkType.Interval, 2360 Timeout: chkType.Timeout, 2361 Logger: a.logger, 2362 } 2363 monitor.Start() 2364 a.checkMonitors[check.CheckID] = monitor 2365 2366 case chkType.IsAlias(): 2367 if existing, ok := a.checkAliases[check.CheckID]; ok { 2368 existing.Stop() 2369 delete(a.checkAliases, check.CheckID) 2370 } 2371 2372 var rpcReq structs.NodeSpecificRequest 2373 rpcReq.Datacenter = a.config.Datacenter 2374 2375 // The token to set is really important. The behavior below follows 2376 // the same behavior as anti-entropy: we use the user-specified token 2377 // if set (either on the service or check definition), otherwise 2378 // we use the "UserToken" on the agent. This is tested. 2379 rpcReq.Token = a.tokens.UserToken() 2380 if token != "" { 2381 rpcReq.Token = token 2382 } 2383 2384 chkImpl := &checks.CheckAlias{ 2385 Notify: a.State, 2386 RPC: a.delegate, 2387 RPCReq: rpcReq, 2388 CheckID: check.CheckID, 2389 Node: chkType.AliasNode, 2390 ServiceID: chkType.AliasService, 2391 } 2392 chkImpl.Start() 2393 a.checkAliases[check.CheckID] = chkImpl 2394 2395 default: 2396 return fmt.Errorf("Check type is not valid") 2397 } 2398 2399 if chkType.DeregisterCriticalServiceAfter > 0 { 2400 timeout := chkType.DeregisterCriticalServiceAfter 2401 if timeout < a.config.CheckDeregisterIntervalMin { 2402 timeout = a.config.CheckDeregisterIntervalMin 2403 a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has deregister interval below minimum of %v", 2404 check.CheckID, a.config.CheckDeregisterIntervalMin)) 2405 } 2406 a.checkReapAfter[check.CheckID] = timeout 2407 } else { 2408 delete(a.checkReapAfter, check.CheckID) 2409 } 2410 } 2411 2412 return nil 2413 } 2414 2415 // RemoveCheck is used to remove a health check. 2416 // The agent will make a best effort to ensure it is deregistered 2417 func (a *Agent) RemoveCheck(checkID types.CheckID, persist bool) error { 2418 a.stateLock.Lock() 2419 defer a.stateLock.Unlock() 2420 return a.removeCheckLocked(checkID, persist) 2421 } 2422 2423 // removeCheckLocked is used to remove a health check. 2424 // The agent will make a best effort to ensure it is deregistered 2425 func (a *Agent) removeCheckLocked(checkID types.CheckID, persist bool) error { 2426 // Validate CheckID 2427 if checkID == "" { 2428 return fmt.Errorf("CheckID missing") 2429 } 2430 2431 a.cancelCheckMonitors(checkID) 2432 a.State.RemoveCheck(checkID) 2433 2434 if persist { 2435 if err := a.purgeCheck(checkID); err != nil { 2436 return err 2437 } 2438 if err := a.purgeCheckState(checkID); err != nil { 2439 return err 2440 } 2441 } 2442 a.logger.Printf("[DEBUG] agent: removed check %q", checkID) 2443 return nil 2444 } 2445 2446 // addProxyLocked adds a new local Connect Proxy instance to be managed by the agent. 2447 // 2448 // This assumes that the agent's proxyLock is already held 2449 // 2450 // It REQUIRES that the service that is being proxied is already present in the 2451 // local state. Note that this is only used for agent-managed proxies so we can 2452 // ensure that we always make this true. For externally managed and registered 2453 // proxies we explicitly allow the proxy to be registered first to make 2454 // bootstrap ordering of a new service simpler but the same is not true here 2455 // since this is only ever called when setting up a _managed_ proxy which was 2456 // registered as part of a service registration either from config or HTTP API 2457 // call. 2458 // 2459 // The restoredProxyToken argument should only be used when restoring proxy 2460 // definitions from disk; new proxies must leave it blank to get a new token 2461 // assigned. We need to restore from disk to enable to continue authenticating 2462 // running proxies that already had that credential injected. 2463 func (a *Agent) addProxyLocked(proxy *structs.ConnectManagedProxy, persist, FromFile bool, 2464 restoredProxyToken string, source configSource) error { 2465 // Lookup the target service token in state if there is one. 2466 token := a.State.ServiceToken(proxy.TargetServiceID) 2467 2468 // Copy the basic proxy structure so it isn't modified w/ defaults 2469 proxyCopy := *proxy 2470 proxy = &proxyCopy 2471 if err := a.applyProxyDefaults(proxy); err != nil { 2472 return err 2473 } 2474 2475 // Add the proxy to local state first since we may need to assign a port which 2476 // needs to be coordinate under state lock. AddProxy will generate the 2477 // NodeService for the proxy populated with the allocated (or configured) port 2478 // and an ID, but it doesn't add it to the agent directly since that could 2479 // deadlock and we may need to coordinate adding it and persisting etc. 2480 proxyState, err := a.State.AddProxy(proxy, token, restoredProxyToken) 2481 if err != nil { 2482 return err 2483 } 2484 proxyService := proxyState.Proxy.ProxyService 2485 2486 // Register proxy TCP check. The built in proxy doesn't listen publically 2487 // until it's loaded certs so this ensures we won't route traffic until it's 2488 // ready. 2489 proxyCfg, err := a.applyProxyConfigDefaults(proxyState.Proxy) 2490 if err != nil { 2491 return err 2492 } 2493 chkAddr := a.resolveProxyCheckAddress(proxyCfg) 2494 chkTypes := []*structs.CheckType{} 2495 if chkAddr != "" { 2496 chkTypes = []*structs.CheckType{ 2497 &structs.CheckType{ 2498 Name: "Connect Proxy Listening", 2499 TCP: fmt.Sprintf("%s:%d", chkAddr, 2500 proxyCfg["bind_port"]), 2501 Interval: 10 * time.Second, 2502 }, 2503 } 2504 } 2505 2506 err = a.addServiceLocked(proxyService, chkTypes, persist, token, source) 2507 if err != nil { 2508 // Remove the state too 2509 a.State.RemoveProxy(proxyService.ID) 2510 return err 2511 } 2512 2513 // Persist the proxy 2514 if persist && a.config.DataDir != "" { 2515 return a.persistProxy(proxyState, FromFile) 2516 } 2517 return nil 2518 } 2519 2520 // AddProxy adds a new local Connect Proxy instance to be managed by the agent. 2521 // 2522 // It REQUIRES that the service that is being proxied is already present in the 2523 // local state. Note that this is only used for agent-managed proxies so we can 2524 // ensure that we always make this true. For externally managed and registered 2525 // proxies we explicitly allow the proxy to be registered first to make 2526 // bootstrap ordering of a new service simpler but the same is not true here 2527 // since this is only ever called when setting up a _managed_ proxy which was 2528 // registered as part of a service registration either from config or HTTP API 2529 // call. 2530 // 2531 // The restoredProxyToken argument should only be used when restoring proxy 2532 // definitions from disk; new proxies must leave it blank to get a new token 2533 // assigned. We need to restore from disk to enable to continue authenticating 2534 // running proxies that already had that credential injected. 2535 func (a *Agent) AddProxy(proxy *structs.ConnectManagedProxy, persist, FromFile bool, 2536 restoredProxyToken string, source configSource) error { 2537 a.stateLock.Lock() 2538 defer a.stateLock.Unlock() 2539 return a.addProxyLocked(proxy, persist, FromFile, restoredProxyToken, source) 2540 } 2541 2542 // resolveProxyCheckAddress returns the best address to use for a TCP check of 2543 // the proxy's public listener. It expects the input to already have default 2544 // values populated by applyProxyConfigDefaults. It may return an empty string 2545 // indicating that the TCP check should not be created at all. 2546 // 2547 // By default this uses the proxy's bind address which in turn defaults to the 2548 // agent's bind address. If the proxy bind address ends up being 0.0.0.0 we have 2549 // to assume the agent can dial it over loopback which is usually true. 2550 // 2551 // In some topologies such as proxy being in a different container, the IP the 2552 // agent used to dial proxy over a local bridge might not be the same as the 2553 // container's public routable IP address so we allow a manual override of the 2554 // check address in config "tcp_check_address" too. 2555 // 2556 // Finally the TCP check can be disabled by another manual override 2557 // "disable_tcp_check" in cases where the agent will never be able to dial the 2558 // proxy directly for some reason. 2559 func (a *Agent) resolveProxyCheckAddress(proxyCfg map[string]interface{}) string { 2560 // If user disabled the check return empty string 2561 if disable, ok := proxyCfg["disable_tcp_check"].(bool); ok && disable { 2562 return "" 2563 } 2564 2565 // If user specified a custom one, use that 2566 if chkAddr, ok := proxyCfg["tcp_check_address"].(string); ok && chkAddr != "" { 2567 return chkAddr 2568 } 2569 2570 // If we have a bind address and its diallable, use that 2571 if bindAddr, ok := proxyCfg["bind_address"].(string); ok && 2572 bindAddr != "" && bindAddr != "0.0.0.0" && bindAddr != "[::]" { 2573 return bindAddr 2574 } 2575 2576 // Default to localhost 2577 return "127.0.0.1" 2578 } 2579 2580 // applyProxyConfigDefaults takes a *structs.ConnectManagedProxy and returns 2581 // it's Config map merged with any defaults from the Agent's config. It would be 2582 // nicer if this were defined as a method on structs.ConnectManagedProxy but we 2583 // can't do that because ot the import cycle it causes with agent/config. 2584 func (a *Agent) applyProxyConfigDefaults(p *structs.ConnectManagedProxy) (map[string]interface{}, error) { 2585 if p == nil || p.ProxyService == nil { 2586 // Should never happen but protect from panic 2587 return nil, fmt.Errorf("invalid proxy state") 2588 } 2589 2590 // Lookup the target service 2591 target := a.State.Service(p.TargetServiceID) 2592 if target == nil { 2593 // Can happen during deregistration race between proxy and scheduler. 2594 return nil, fmt.Errorf("unknown target service ID: %s", p.TargetServiceID) 2595 } 2596 2597 // Merge globals defaults 2598 config := make(map[string]interface{}) 2599 for k, v := range a.config.ConnectProxyDefaultConfig { 2600 if _, ok := config[k]; !ok { 2601 config[k] = v 2602 } 2603 } 2604 2605 // Copy config from the proxy 2606 for k, v := range p.Config { 2607 config[k] = v 2608 } 2609 2610 // Set defaults for anything that is still not specified but required. 2611 // Note that these are not included in the content hash. Since we expect 2612 // them to be static in general but some like the default target service 2613 // port might not be. In that edge case services can set that explicitly 2614 // when they re-register which will be caught though. 2615 if _, ok := config["bind_port"]; !ok { 2616 config["bind_port"] = p.ProxyService.Port 2617 } 2618 if _, ok := config["bind_address"]; !ok { 2619 // Default to binding to the same address the agent is configured to 2620 // bind to. 2621 config["bind_address"] = a.config.BindAddr.String() 2622 } 2623 if _, ok := config["local_service_address"]; !ok { 2624 // Default to localhost and the port the service registered with 2625 config["local_service_address"] = fmt.Sprintf("127.0.0.1:%d", target.Port) 2626 } 2627 2628 // Basic type conversions for expected types. 2629 if raw, ok := config["bind_port"]; ok { 2630 switch v := raw.(type) { 2631 case float64: 2632 // Common since HCL/JSON parse as float64 2633 config["bind_port"] = int(v) 2634 2635 // NOTE(mitchellh): No default case since errors and validation 2636 // are handled by the ServiceDefinition.Validate function. 2637 } 2638 } 2639 2640 return config, nil 2641 } 2642 2643 // applyProxyDefaults modifies the given proxy by applying any configured 2644 // defaults, such as the default execution mode, command, etc. 2645 func (a *Agent) applyProxyDefaults(proxy *structs.ConnectManagedProxy) error { 2646 // Set the default exec mode 2647 if proxy.ExecMode == structs.ProxyExecModeUnspecified { 2648 mode, err := structs.NewProxyExecMode(a.config.ConnectProxyDefaultExecMode) 2649 if err != nil { 2650 return err 2651 } 2652 2653 proxy.ExecMode = mode 2654 } 2655 if proxy.ExecMode == structs.ProxyExecModeUnspecified { 2656 proxy.ExecMode = structs.ProxyExecModeDaemon 2657 } 2658 2659 // Set the default command to the globally configured default 2660 if len(proxy.Command) == 0 { 2661 switch proxy.ExecMode { 2662 case structs.ProxyExecModeDaemon: 2663 proxy.Command = a.config.ConnectProxyDefaultDaemonCommand 2664 2665 case structs.ProxyExecModeScript: 2666 proxy.Command = a.config.ConnectProxyDefaultScriptCommand 2667 } 2668 } 2669 2670 // If there is no globally configured default we need to get the 2671 // default command so we can do "consul connect proxy" 2672 if len(proxy.Command) == 0 { 2673 command, err := defaultProxyCommand(a.config) 2674 if err != nil { 2675 return err 2676 } 2677 2678 proxy.Command = command 2679 } 2680 2681 return nil 2682 } 2683 2684 // removeProxyLocked stops and removes a local proxy instance. 2685 // 2686 // It is assumed that this function is called while holding the proxyLock already 2687 func (a *Agent) removeProxyLocked(proxyID string, persist bool) error { 2688 // Validate proxyID 2689 if proxyID == "" { 2690 return fmt.Errorf("proxyID missing") 2691 } 2692 2693 // Remove the proxy from the local state 2694 p, err := a.State.RemoveProxy(proxyID) 2695 if err != nil { 2696 return err 2697 } 2698 2699 // Remove the proxy service as well. The proxy ID is also the ID 2700 // of the servie, but we might as well use the service pointer. 2701 if err := a.removeServiceLocked(p.Proxy.ProxyService.ID, persist); err != nil { 2702 return err 2703 } 2704 2705 if persist && a.config.DataDir != "" { 2706 return a.purgeProxy(proxyID) 2707 } 2708 2709 return nil 2710 } 2711 2712 // RemoveProxy stops and removes a local proxy instance. 2713 func (a *Agent) RemoveProxy(proxyID string, persist bool) error { 2714 a.stateLock.Lock() 2715 defer a.stateLock.Unlock() 2716 return a.removeProxyLocked(proxyID, persist) 2717 } 2718 2719 // verifyProxyToken takes a token and attempts to verify it against the 2720 // targetService name. If targetProxy is specified, then the local proxy token 2721 // must exactly match the given proxy ID. cert, config, etc.). 2722 // 2723 // The given token may be a local-only proxy token or it may be an ACL token. We 2724 // will attempt to verify the local proxy token first. 2725 // 2726 // The effective ACL token is returned along with a boolean which is true if the 2727 // match was against a proxy token rather than an ACL token, and any error. In 2728 // the case the token matches a proxy token, then the ACL token used to register 2729 // that proxy's target service is returned for use in any RPC calls the proxy 2730 // needs to make on behalf of that service. If the token was an ACL token 2731 // already then it is always returned. Provided error is nil, a valid ACL token 2732 // is always returned. 2733 func (a *Agent) verifyProxyToken(token, targetService, 2734 targetProxy string) (string, bool, error) { 2735 // If we specify a target proxy, we look up that proxy directly. Otherwise, 2736 // we resolve with any proxy we can find. 2737 var proxy *local.ManagedProxy 2738 if targetProxy != "" { 2739 proxy = a.State.Proxy(targetProxy) 2740 if proxy == nil { 2741 return "", false, fmt.Errorf("unknown proxy service ID: %q", targetProxy) 2742 } 2743 2744 // If the token DOESN'T match, then we reset the proxy which will 2745 // cause the logic below to fall back to normal ACLs. Otherwise, 2746 // we keep the proxy set because we also have to verify that the 2747 // target service matches on the proxy. 2748 if token != proxy.ProxyToken { 2749 proxy = nil 2750 } 2751 } else { 2752 proxy = a.resolveProxyToken(token) 2753 } 2754 2755 // The existence of a token isn't enough, we also need to verify 2756 // that the service name of the matching proxy matches our target 2757 // service. 2758 if proxy != nil { 2759 // Get the target service since we only have the name. The nil 2760 // check below should never be true since a proxy token always 2761 // represents the existence of a local service. 2762 target := a.State.Service(proxy.Proxy.TargetServiceID) 2763 if target == nil { 2764 return "", false, fmt.Errorf("proxy target service not found: %q", 2765 proxy.Proxy.TargetServiceID) 2766 } 2767 2768 if target.Service != targetService { 2769 return "", false, acl.ErrPermissionDenied 2770 } 2771 2772 // Resolve the actual ACL token used to register the proxy/service and 2773 // return that for use in RPC calls. 2774 return a.State.ServiceToken(proxy.Proxy.TargetServiceID), true, nil 2775 } 2776 2777 // Doesn't match, we have to do a full token resolution. The required 2778 // permission for any proxy-related endpoint is service:write, since 2779 // to register a proxy you require that permission and sensitive data 2780 // is usually present in the configuration. 2781 rule, err := a.resolveToken(token) 2782 if err != nil { 2783 return "", false, err 2784 } 2785 if rule != nil && !rule.ServiceWrite(targetService, nil) { 2786 return "", false, acl.ErrPermissionDenied 2787 } 2788 2789 return token, false, nil 2790 } 2791 2792 func (a *Agent) cancelCheckMonitors(checkID types.CheckID) { 2793 // Stop any monitors 2794 delete(a.checkReapAfter, checkID) 2795 if check, ok := a.checkMonitors[checkID]; ok { 2796 check.Stop() 2797 delete(a.checkMonitors, checkID) 2798 } 2799 if check, ok := a.checkHTTPs[checkID]; ok { 2800 check.Stop() 2801 delete(a.checkHTTPs, checkID) 2802 } 2803 if check, ok := a.checkTCPs[checkID]; ok { 2804 check.Stop() 2805 delete(a.checkTCPs, checkID) 2806 } 2807 if check, ok := a.checkGRPCs[checkID]; ok { 2808 check.Stop() 2809 delete(a.checkGRPCs, checkID) 2810 } 2811 if check, ok := a.checkTTLs[checkID]; ok { 2812 check.Stop() 2813 delete(a.checkTTLs, checkID) 2814 } 2815 if check, ok := a.checkDockers[checkID]; ok { 2816 check.Stop() 2817 delete(a.checkDockers, checkID) 2818 } 2819 } 2820 2821 // updateTTLCheck is used to update the status of a TTL check via the Agent API. 2822 func (a *Agent) updateTTLCheck(checkID types.CheckID, status, output string) error { 2823 a.stateLock.Lock() 2824 defer a.stateLock.Unlock() 2825 2826 // Grab the TTL check. 2827 check, ok := a.checkTTLs[checkID] 2828 if !ok { 2829 return fmt.Errorf("CheckID %q does not have associated TTL", checkID) 2830 } 2831 2832 // Set the status through CheckTTL to reset the TTL. 2833 check.SetStatus(status, output) 2834 2835 // We don't write any files in dev mode so bail here. 2836 if a.config.DataDir == "" { 2837 return nil 2838 } 2839 2840 // Persist the state so the TTL check can come up in a good state after 2841 // an agent restart, especially with long TTL values. 2842 if err := a.persistCheckState(check, status, output); err != nil { 2843 return fmt.Errorf("failed persisting state for check %q: %s", checkID, err) 2844 } 2845 2846 return nil 2847 } 2848 2849 // persistCheckState is used to record the check status into the data dir. 2850 // This allows the state to be restored on a later agent start. Currently 2851 // only useful for TTL based checks. 2852 func (a *Agent) persistCheckState(check *checks.CheckTTL, status, output string) error { 2853 // Create the persisted state 2854 state := persistedCheckState{ 2855 CheckID: check.CheckID, 2856 Status: status, 2857 Output: output, 2858 Expires: time.Now().Add(check.TTL).Unix(), 2859 } 2860 2861 // Encode the state 2862 buf, err := json.Marshal(state) 2863 if err != nil { 2864 return err 2865 } 2866 2867 // Create the state dir if it doesn't exist 2868 dir := filepath.Join(a.config.DataDir, checkStateDir) 2869 if err := os.MkdirAll(dir, 0700); err != nil { 2870 return fmt.Errorf("failed creating check state dir %q: %s", dir, err) 2871 } 2872 2873 // Write the state to the file 2874 file := filepath.Join(dir, checkIDHash(check.CheckID)) 2875 2876 // Create temp file in same dir, to make more likely atomic 2877 tempFile := file + ".tmp" 2878 2879 // persistCheckState is called frequently, so don't use writeFileAtomic to avoid calling fsync here 2880 if err := ioutil.WriteFile(tempFile, buf, 0600); err != nil { 2881 return fmt.Errorf("failed writing temp file %q: %s", tempFile, err) 2882 } 2883 if err := os.Rename(tempFile, file); err != nil { 2884 return fmt.Errorf("failed to rename temp file from %q to %q: %s", tempFile, file, err) 2885 } 2886 2887 return nil 2888 } 2889 2890 // loadCheckState is used to restore the persisted state of a check. 2891 func (a *Agent) loadCheckState(check *structs.HealthCheck) error { 2892 // Try to read the persisted state for this check 2893 file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(check.CheckID)) 2894 buf, err := ioutil.ReadFile(file) 2895 if err != nil { 2896 if os.IsNotExist(err) { 2897 return nil 2898 } 2899 return fmt.Errorf("failed reading file %q: %s", file, err) 2900 } 2901 2902 // Decode the state data 2903 var p persistedCheckState 2904 if err := json.Unmarshal(buf, &p); err != nil { 2905 a.logger.Printf("[ERR] agent: failed decoding check state: %s", err) 2906 return a.purgeCheckState(check.CheckID) 2907 } 2908 2909 // Check if the state has expired 2910 if time.Now().Unix() >= p.Expires { 2911 a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID) 2912 return a.purgeCheckState(check.CheckID) 2913 } 2914 2915 // Restore the fields from the state 2916 check.Output = p.Output 2917 check.Status = p.Status 2918 return nil 2919 } 2920 2921 // purgeCheckState is used to purge the state of a check from the data dir 2922 func (a *Agent) purgeCheckState(checkID types.CheckID) error { 2923 file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(checkID)) 2924 err := os.Remove(file) 2925 if os.IsNotExist(err) { 2926 return nil 2927 } 2928 return err 2929 } 2930 2931 func (a *Agent) GossipEncrypted() bool { 2932 return a.delegate.Encrypted() 2933 } 2934 2935 // Stats is used to get various debugging state from the sub-systems 2936 func (a *Agent) Stats() map[string]map[string]string { 2937 stats := a.delegate.Stats() 2938 stats["agent"] = map[string]string{ 2939 "check_monitors": strconv.Itoa(len(a.checkMonitors)), 2940 "check_ttls": strconv.Itoa(len(a.checkTTLs)), 2941 } 2942 for k, v := range a.State.Stats() { 2943 stats["agent"][k] = v 2944 } 2945 2946 revision := a.config.Revision 2947 if len(revision) > 8 { 2948 revision = revision[:8] 2949 } 2950 stats["build"] = map[string]string{ 2951 "revision": revision, 2952 "version": a.config.Version, 2953 "prerelease": a.config.VersionPrerelease, 2954 } 2955 return stats 2956 } 2957 2958 // storePid is used to write out our PID to a file if necessary 2959 func (a *Agent) storePid() error { 2960 // Quit fast if no pidfile 2961 pidPath := a.config.PidFile 2962 if pidPath == "" { 2963 return nil 2964 } 2965 2966 // Open the PID file 2967 pidFile, err := os.OpenFile(pidPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0666) 2968 if err != nil { 2969 return fmt.Errorf("Could not open pid file: %v", err) 2970 } 2971 defer pidFile.Close() 2972 2973 // Write out the PID 2974 pid := os.Getpid() 2975 _, err = pidFile.WriteString(fmt.Sprintf("%d", pid)) 2976 if err != nil { 2977 return fmt.Errorf("Could not write to pid file: %s", err) 2978 } 2979 return nil 2980 } 2981 2982 // deletePid is used to delete our PID on exit 2983 func (a *Agent) deletePid() error { 2984 // Quit fast if no pidfile 2985 pidPath := a.config.PidFile 2986 if pidPath == "" { 2987 return nil 2988 } 2989 2990 stat, err := os.Stat(pidPath) 2991 if err != nil { 2992 return fmt.Errorf("Could not remove pid file: %s", err) 2993 } 2994 2995 if stat.IsDir() { 2996 return fmt.Errorf("Specified pid file path is directory") 2997 } 2998 2999 err = os.Remove(pidPath) 3000 if err != nil { 3001 return fmt.Errorf("Could not remove pid file: %s", err) 3002 } 3003 return nil 3004 } 3005 3006 // loadServices will load service definitions from configuration and persisted 3007 // definitions on disk, and load them into the local agent. 3008 func (a *Agent) loadServices(conf *config.RuntimeConfig) error { 3009 // Register the services from config 3010 for _, service := range conf.Services { 3011 ns := service.NodeService() 3012 chkTypes, err := service.CheckTypes() 3013 if err != nil { 3014 return fmt.Errorf("Failed to validate checks for service %q: %v", service.Name, err) 3015 } 3016 3017 // Grab and validate sidecar if there is one too 3018 sidecar, sidecarChecks, sidecarToken, err := a.sidecarServiceFromNodeService(ns, service.Token) 3019 if err != nil { 3020 return fmt.Errorf("Failed to validate sidecar for service %q: %v", service.Name, err) 3021 } 3022 3023 // Remove sidecar from NodeService now it's done it's job it's just a config 3024 // syntax sugar and shouldn't be persisted in local or server state. 3025 ns.Connect.SidecarService = nil 3026 3027 if err := a.addServiceLocked(ns, chkTypes, false, service.Token, ConfigSourceLocal); err != nil { 3028 return fmt.Errorf("Failed to register service %q: %v", service.Name, err) 3029 } 3030 3031 // If there is a sidecar service, register that too. 3032 if sidecar != nil { 3033 if err := a.addServiceLocked(sidecar, sidecarChecks, false, sidecarToken, ConfigSourceLocal); err != nil { 3034 return fmt.Errorf("Failed to register sidecar for service %q: %v", service.Name, err) 3035 } 3036 } 3037 } 3038 3039 // Load any persisted services 3040 svcDir := filepath.Join(a.config.DataDir, servicesDir) 3041 files, err := ioutil.ReadDir(svcDir) 3042 if err != nil { 3043 if os.IsNotExist(err) { 3044 return nil 3045 } 3046 return fmt.Errorf("Failed reading services dir %q: %s", svcDir, err) 3047 } 3048 for _, fi := range files { 3049 // Skip all dirs 3050 if fi.IsDir() { 3051 continue 3052 } 3053 3054 // Skip all partially written temporary files 3055 if strings.HasSuffix(fi.Name(), "tmp") { 3056 a.logger.Printf("[WARN] agent: Ignoring temporary service file %v", fi.Name()) 3057 continue 3058 } 3059 3060 // Open the file for reading 3061 file := filepath.Join(svcDir, fi.Name()) 3062 fh, err := os.Open(file) 3063 if err != nil { 3064 return fmt.Errorf("failed opening service file %q: %s", file, err) 3065 } 3066 3067 // Read the contents into a buffer 3068 buf, err := ioutil.ReadAll(fh) 3069 fh.Close() 3070 if err != nil { 3071 return fmt.Errorf("failed reading service file %q: %s", file, err) 3072 } 3073 3074 // Try decoding the service definition 3075 var p persistedService 3076 if err := json.Unmarshal(buf, &p); err != nil { 3077 // Backwards-compatibility for pre-0.5.1 persisted services 3078 if err := json.Unmarshal(buf, &p.Service); err != nil { 3079 a.logger.Printf("[ERR] agent: Failed decoding service file %q: %s", file, err) 3080 continue 3081 } 3082 } 3083 serviceID := p.Service.ID 3084 3085 if a.State.Service(serviceID) != nil { 3086 // Purge previously persisted service. This allows config to be 3087 // preferred over services persisted from the API. 3088 a.logger.Printf("[DEBUG] agent: service %q exists, not restoring from %q", 3089 serviceID, file) 3090 if err := a.purgeService(serviceID); err != nil { 3091 return fmt.Errorf("failed purging service %q: %s", serviceID, err) 3092 } 3093 } else { 3094 a.logger.Printf("[DEBUG] agent: restored service definition %q from %q", 3095 serviceID, file) 3096 if err := a.addServiceLocked(p.Service, nil, false, p.Token, ConfigSourceLocal); err != nil { 3097 return fmt.Errorf("failed adding service %q: %s", serviceID, err) 3098 } 3099 } 3100 } 3101 3102 return nil 3103 } 3104 3105 // unloadServices will deregister all services. 3106 func (a *Agent) unloadServices() error { 3107 for id := range a.State.Services() { 3108 if err := a.removeServiceLocked(id, false); err != nil { 3109 return fmt.Errorf("Failed deregistering service '%s': %v", id, err) 3110 } 3111 } 3112 return nil 3113 } 3114 3115 // loadChecks loads check definitions and/or persisted check definitions from 3116 // disk and re-registers them with the local agent. 3117 func (a *Agent) loadChecks(conf *config.RuntimeConfig) error { 3118 // Register the checks from config 3119 for _, check := range conf.Checks { 3120 health := check.HealthCheck(conf.NodeName) 3121 chkType := check.CheckType() 3122 if err := a.addCheckLocked(health, chkType, false, check.Token, ConfigSourceLocal); err != nil { 3123 return fmt.Errorf("Failed to register check '%s': %v %v", check.Name, err, check) 3124 } 3125 } 3126 3127 // Load any persisted checks 3128 checkDir := filepath.Join(a.config.DataDir, checksDir) 3129 files, err := ioutil.ReadDir(checkDir) 3130 if err != nil { 3131 if os.IsNotExist(err) { 3132 return nil 3133 } 3134 return fmt.Errorf("Failed reading checks dir %q: %s", checkDir, err) 3135 } 3136 for _, fi := range files { 3137 // Ignore dirs - we only care about the check definition files 3138 if fi.IsDir() { 3139 continue 3140 } 3141 3142 // Open the file for reading 3143 file := filepath.Join(checkDir, fi.Name()) 3144 fh, err := os.Open(file) 3145 if err != nil { 3146 return fmt.Errorf("Failed opening check file %q: %s", file, err) 3147 } 3148 3149 // Read the contents into a buffer 3150 buf, err := ioutil.ReadAll(fh) 3151 fh.Close() 3152 if err != nil { 3153 return fmt.Errorf("failed reading check file %q: %s", file, err) 3154 } 3155 3156 // Decode the check 3157 var p persistedCheck 3158 if err := json.Unmarshal(buf, &p); err != nil { 3159 a.logger.Printf("[ERR] agent: Failed decoding check file %q: %s", file, err) 3160 continue 3161 } 3162 checkID := p.Check.CheckID 3163 3164 if a.State.Check(checkID) != nil { 3165 // Purge previously persisted check. This allows config to be 3166 // preferred over persisted checks from the API. 3167 a.logger.Printf("[DEBUG] agent: check %q exists, not restoring from %q", 3168 checkID, file) 3169 if err := a.purgeCheck(checkID); err != nil { 3170 return fmt.Errorf("Failed purging check %q: %s", checkID, err) 3171 } 3172 } else { 3173 // Default check to critical to avoid placing potentially unhealthy 3174 // services into the active pool 3175 p.Check.Status = api.HealthCritical 3176 3177 if err := a.addCheckLocked(p.Check, p.ChkType, false, p.Token, ConfigSourceLocal); err != nil { 3178 // Purge the check if it is unable to be restored. 3179 a.logger.Printf("[WARN] agent: Failed to restore check %q: %s", 3180 checkID, err) 3181 if err := a.purgeCheck(checkID); err != nil { 3182 return fmt.Errorf("Failed purging check %q: %s", checkID, err) 3183 } 3184 } 3185 a.logger.Printf("[DEBUG] agent: restored health check %q from %q", 3186 p.Check.CheckID, file) 3187 } 3188 } 3189 3190 return nil 3191 } 3192 3193 // unloadChecks will deregister all checks known to the local agent. 3194 func (a *Agent) unloadChecks() error { 3195 for id := range a.State.Checks() { 3196 if err := a.removeCheckLocked(id, false); err != nil { 3197 return fmt.Errorf("Failed deregistering check '%s': %s", id, err) 3198 } 3199 } 3200 return nil 3201 } 3202 3203 // loadPersistedProxies will load connect proxy definitions from their 3204 // persisted state on disk and return a slice of them 3205 // 3206 // This does not add them to the local 3207 func (a *Agent) loadPersistedProxies() (map[string]persistedProxy, error) { 3208 persistedProxies := make(map[string]persistedProxy) 3209 3210 proxyDir := filepath.Join(a.config.DataDir, proxyDir) 3211 files, err := ioutil.ReadDir(proxyDir) 3212 if err != nil { 3213 if !os.IsNotExist(err) { 3214 return nil, fmt.Errorf("Failed reading proxies dir %q: %s", proxyDir, err) 3215 } 3216 } 3217 3218 for _, fi := range files { 3219 // Skip all dirs 3220 if fi.IsDir() { 3221 continue 3222 } 3223 3224 // Skip all partially written temporary files 3225 if strings.HasSuffix(fi.Name(), "tmp") { 3226 return nil, fmt.Errorf("Ignoring temporary proxy file %v", fi.Name()) 3227 } 3228 3229 // Open the file for reading 3230 file := filepath.Join(proxyDir, fi.Name()) 3231 fh, err := os.Open(file) 3232 if err != nil { 3233 return nil, fmt.Errorf("failed opening proxy file %q: %s", file, err) 3234 } 3235 3236 // Read the contents into a buffer 3237 buf, err := ioutil.ReadAll(fh) 3238 fh.Close() 3239 if err != nil { 3240 return nil, fmt.Errorf("failed reading proxy file %q: %s", file, err) 3241 } 3242 3243 // Try decoding the proxy definition 3244 var p persistedProxy 3245 if err := json.Unmarshal(buf, &p); err != nil { 3246 return nil, fmt.Errorf("Failed decoding proxy file %q: %s", file, err) 3247 } 3248 svcID := p.Proxy.TargetServiceID 3249 3250 persistedProxies[svcID] = p 3251 } 3252 3253 return persistedProxies, nil 3254 } 3255 3256 // loadProxies will load connect proxy definitions from configuration and 3257 // persisted definitions on disk, and load them into the local agent. 3258 func (a *Agent) loadProxies(conf *config.RuntimeConfig) error { 3259 persistedProxies, persistenceErr := a.loadPersistedProxies() 3260 3261 for _, svc := range conf.Services { 3262 if svc.Connect != nil { 3263 proxy, err := svc.ConnectManagedProxy() 3264 if err != nil { 3265 return fmt.Errorf("failed adding proxy: %s", err) 3266 } 3267 if proxy == nil { 3268 continue 3269 } 3270 restoredToken := "" 3271 if persisted, ok := persistedProxies[proxy.TargetServiceID]; ok { 3272 restoredToken = persisted.ProxyToken 3273 } 3274 3275 if err := a.addProxyLocked(proxy, true, true, restoredToken, ConfigSourceLocal); err != nil { 3276 return fmt.Errorf("failed adding proxy: %s", err) 3277 } 3278 } 3279 } 3280 3281 for _, persisted := range persistedProxies { 3282 proxyID := persisted.Proxy.ProxyService.ID 3283 if persisted.FromFile && a.State.Proxy(proxyID) == nil { 3284 // Purge proxies that were configured previously but are no longer in the config 3285 a.logger.Printf("[DEBUG] agent: purging stale persisted proxy %q", proxyID) 3286 if err := a.purgeProxy(proxyID); err != nil { 3287 return fmt.Errorf("failed purging proxy %q: %v", proxyID, err) 3288 } 3289 } else if !persisted.FromFile { 3290 if a.State.Proxy(proxyID) == nil { 3291 a.logger.Printf("[DEBUG] agent: restored proxy definition %q", proxyID) 3292 if err := a.addProxyLocked(persisted.Proxy, false, false, persisted.ProxyToken, ConfigSourceLocal); err != nil { 3293 return fmt.Errorf("failed adding proxy %q: %v", proxyID, err) 3294 } 3295 } else { 3296 a.logger.Printf("[WARN] agent: proxy definition %q was overwritten by a proxy definition within a config file", proxyID) 3297 } 3298 } 3299 } 3300 3301 return persistenceErr 3302 } 3303 3304 type persistedTokens struct { 3305 Replication string `json:"replication,omitempty"` 3306 AgentMaster string `json:"agent_master,omitempty"` 3307 Default string `json:"default,omitempty"` 3308 Agent string `json:"agent,omitempty"` 3309 } 3310 3311 func (a *Agent) getPersistedTokens() (*persistedTokens, error) { 3312 persistedTokens := &persistedTokens{} 3313 if !a.config.ACLEnableTokenPersistence { 3314 return persistedTokens, nil 3315 } 3316 3317 a.persistedTokensLock.RLock() 3318 defer a.persistedTokensLock.RUnlock() 3319 3320 tokensFullPath := filepath.Join(a.config.DataDir, tokensPath) 3321 3322 buf, err := ioutil.ReadFile(tokensFullPath) 3323 if err != nil { 3324 if os.IsNotExist(err) { 3325 // non-existence is not an error we care about 3326 return persistedTokens, nil 3327 } 3328 return persistedTokens, fmt.Errorf("failed reading tokens file %q: %s", tokensFullPath, err) 3329 } 3330 3331 if err := json.Unmarshal(buf, persistedTokens); err != nil { 3332 return persistedTokens, fmt.Errorf("failed to decode tokens file %q: %s", tokensFullPath, err) 3333 } 3334 3335 return persistedTokens, nil 3336 } 3337 3338 func (a *Agent) loadTokens(conf *config.RuntimeConfig) error { 3339 persistedTokens, persistenceErr := a.getPersistedTokens() 3340 3341 if persistenceErr != nil { 3342 a.logger.Printf("[WARN] unable to load persisted tokens: %v", persistenceErr) 3343 } 3344 3345 if persistedTokens.Default != "" { 3346 a.tokens.UpdateUserToken(persistedTokens.Default, token.TokenSourceAPI) 3347 3348 if conf.ACLToken != "" { 3349 a.logger.Printf("[WARN] \"default\" token present in both the configuration and persisted token store, using the persisted token") 3350 } 3351 } else { 3352 a.tokens.UpdateUserToken(conf.ACLToken, token.TokenSourceConfig) 3353 } 3354 3355 if persistedTokens.Agent != "" { 3356 a.tokens.UpdateAgentToken(persistedTokens.Agent, token.TokenSourceAPI) 3357 3358 if conf.ACLAgentToken != "" { 3359 a.logger.Printf("[WARN] \"agent\" token present in both the configuration and persisted token store, using the persisted token") 3360 } 3361 } else { 3362 a.tokens.UpdateAgentToken(conf.ACLAgentToken, token.TokenSourceConfig) 3363 } 3364 3365 if persistedTokens.AgentMaster != "" { 3366 a.tokens.UpdateAgentMasterToken(persistedTokens.AgentMaster, token.TokenSourceAPI) 3367 3368 if conf.ACLAgentMasterToken != "" { 3369 a.logger.Printf("[WARN] \"agent_master\" token present in both the configuration and persisted token store, using the persisted token") 3370 } 3371 } else { 3372 a.tokens.UpdateAgentMasterToken(conf.ACLAgentMasterToken, token.TokenSourceConfig) 3373 } 3374 3375 if persistedTokens.Replication != "" { 3376 a.tokens.UpdateReplicationToken(persistedTokens.Replication, token.TokenSourceAPI) 3377 3378 if conf.ACLReplicationToken != "" { 3379 a.logger.Printf("[WARN] \"replication\" token present in both the configuration and persisted token store, using the persisted token") 3380 } 3381 } else { 3382 a.tokens.UpdateReplicationToken(conf.ACLReplicationToken, token.TokenSourceConfig) 3383 } 3384 3385 return persistenceErr 3386 } 3387 3388 // unloadProxies will deregister all proxies known to the local agent. 3389 func (a *Agent) unloadProxies() error { 3390 for id := range a.State.Proxies() { 3391 if err := a.removeProxyLocked(id, false); err != nil { 3392 return fmt.Errorf("Failed deregistering proxy '%s': %s", id, err) 3393 } 3394 } 3395 return nil 3396 } 3397 3398 // snapshotCheckState is used to snapshot the current state of the health 3399 // checks. This is done before we reload our checks, so that we can properly 3400 // restore into the same state. 3401 func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck { 3402 return a.State.Checks() 3403 } 3404 3405 // restoreCheckState is used to reset the health state based on a snapshot. 3406 // This is done after we finish the reload to avoid any unnecessary flaps 3407 // in health state and potential session invalidations. 3408 func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) { 3409 for id, check := range snap { 3410 a.State.UpdateCheck(id, check.Status, check.Output) 3411 } 3412 } 3413 3414 // loadMetadata loads node metadata fields from the agent config and 3415 // updates them on the local agent. 3416 func (a *Agent) loadMetadata(conf *config.RuntimeConfig) error { 3417 meta := map[string]string{} 3418 for k, v := range conf.NodeMeta { 3419 meta[k] = v 3420 } 3421 meta[structs.MetaSegmentKey] = conf.SegmentName 3422 return a.State.LoadMetadata(meta) 3423 } 3424 3425 // unloadMetadata resets the local metadata state 3426 func (a *Agent) unloadMetadata() { 3427 a.State.UnloadMetadata() 3428 } 3429 3430 // serviceMaintCheckID returns the ID of a given service's maintenance check 3431 func serviceMaintCheckID(serviceID string) types.CheckID { 3432 return types.CheckID(structs.ServiceMaintPrefix + serviceID) 3433 } 3434 3435 // EnableServiceMaintenance will register a false health check against the given 3436 // service ID with critical status. This will exclude the service from queries. 3437 func (a *Agent) EnableServiceMaintenance(serviceID, reason, token string) error { 3438 service, ok := a.State.Services()[serviceID] 3439 if !ok { 3440 return fmt.Errorf("No service registered with ID %q", serviceID) 3441 } 3442 3443 // Check if maintenance mode is not already enabled 3444 checkID := serviceMaintCheckID(serviceID) 3445 if _, ok := a.State.Checks()[checkID]; ok { 3446 return nil 3447 } 3448 3449 // Use default notes if no reason provided 3450 if reason == "" { 3451 reason = defaultServiceMaintReason 3452 } 3453 3454 // Create and register the critical health check 3455 check := &structs.HealthCheck{ 3456 Node: a.config.NodeName, 3457 CheckID: checkID, 3458 Name: "Service Maintenance Mode", 3459 Notes: reason, 3460 ServiceID: service.ID, 3461 ServiceName: service.Service, 3462 Status: api.HealthCritical, 3463 } 3464 a.AddCheck(check, nil, true, token, ConfigSourceLocal) 3465 a.logger.Printf("[INFO] agent: Service %q entered maintenance mode", serviceID) 3466 3467 return nil 3468 } 3469 3470 // DisableServiceMaintenance will deregister the fake maintenance mode check 3471 // if the service has been marked as in maintenance. 3472 func (a *Agent) DisableServiceMaintenance(serviceID string) error { 3473 if _, ok := a.State.Services()[serviceID]; !ok { 3474 return fmt.Errorf("No service registered with ID %q", serviceID) 3475 } 3476 3477 // Check if maintenance mode is enabled 3478 checkID := serviceMaintCheckID(serviceID) 3479 if _, ok := a.State.Checks()[checkID]; !ok { 3480 return nil 3481 } 3482 3483 // Deregister the maintenance check 3484 a.RemoveCheck(checkID, true) 3485 a.logger.Printf("[INFO] agent: Service %q left maintenance mode", serviceID) 3486 3487 return nil 3488 } 3489 3490 // EnableNodeMaintenance places a node into maintenance mode. 3491 func (a *Agent) EnableNodeMaintenance(reason, token string) { 3492 // Ensure node maintenance is not already enabled 3493 if _, ok := a.State.Checks()[structs.NodeMaint]; ok { 3494 return 3495 } 3496 3497 // Use a default notes value 3498 if reason == "" { 3499 reason = defaultNodeMaintReason 3500 } 3501 3502 // Create and register the node maintenance check 3503 check := &structs.HealthCheck{ 3504 Node: a.config.NodeName, 3505 CheckID: structs.NodeMaint, 3506 Name: "Node Maintenance Mode", 3507 Notes: reason, 3508 Status: api.HealthCritical, 3509 } 3510 a.AddCheck(check, nil, true, token, ConfigSourceLocal) 3511 a.logger.Printf("[INFO] agent: Node entered maintenance mode") 3512 } 3513 3514 // DisableNodeMaintenance removes a node from maintenance mode 3515 func (a *Agent) DisableNodeMaintenance() { 3516 if _, ok := a.State.Checks()[structs.NodeMaint]; !ok { 3517 return 3518 } 3519 a.RemoveCheck(structs.NodeMaint, true) 3520 a.logger.Printf("[INFO] agent: Node left maintenance mode") 3521 } 3522 3523 func (a *Agent) loadLimits(conf *config.RuntimeConfig) { 3524 a.config.RPCRateLimit = conf.RPCRateLimit 3525 a.config.RPCMaxBurst = conf.RPCMaxBurst 3526 } 3527 3528 func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error { 3529 // Bulk update the services and checks 3530 a.PauseSync() 3531 defer a.ResumeSync() 3532 3533 a.stateLock.Lock() 3534 defer a.stateLock.Unlock() 3535 3536 // Snapshot the current state, and restore it afterwards 3537 snap := a.snapshotCheckState() 3538 defer a.restoreCheckState(snap) 3539 3540 // First unload all checks, services, and metadata. This lets us begin the reload 3541 // with a clean slate. 3542 if err := a.unloadProxies(); err != nil { 3543 return fmt.Errorf("Failed unloading proxies: %s", err) 3544 } 3545 if err := a.unloadServices(); err != nil { 3546 return fmt.Errorf("Failed unloading services: %s", err) 3547 } 3548 if err := a.unloadChecks(); err != nil { 3549 return fmt.Errorf("Failed unloading checks: %s", err) 3550 } 3551 a.unloadMetadata() 3552 3553 // Reload tokens - should be done before all the other loading 3554 // to ensure the correct tokens are available for attaching to 3555 // the checks and service registrations. 3556 a.loadTokens(newCfg) 3557 3558 if err := a.tlsConfigurator.Update(newCfg.ToTLSUtilConfig()); err != nil { 3559 return fmt.Errorf("Failed reloading tls configuration: %s", err) 3560 } 3561 3562 // Reload service/check definitions and metadata. 3563 if err := a.loadServices(newCfg); err != nil { 3564 return fmt.Errorf("Failed reloading services: %s", err) 3565 } 3566 if err := a.loadProxies(newCfg); err != nil { 3567 return fmt.Errorf("Failed reloading proxies: %s", err) 3568 } 3569 if err := a.loadChecks(newCfg); err != nil { 3570 return fmt.Errorf("Failed reloading checks: %s", err) 3571 } 3572 if err := a.loadMetadata(newCfg); err != nil { 3573 return fmt.Errorf("Failed reloading metadata: %s", err) 3574 } 3575 3576 if err := a.reloadWatches(newCfg); err != nil { 3577 return fmt.Errorf("Failed reloading watches: %v", err) 3578 } 3579 3580 a.loadLimits(newCfg) 3581 3582 // create the config for the rpc server/client 3583 consulCfg, err := a.consulConfig() 3584 if err != nil { 3585 return err 3586 } 3587 3588 if err := a.delegate.ReloadConfig(consulCfg); err != nil { 3589 return err 3590 } 3591 3592 // Update filtered metrics 3593 metrics.UpdateFilter(newCfg.Telemetry.AllowedPrefixes, 3594 newCfg.Telemetry.BlockedPrefixes) 3595 3596 a.State.SetDiscardCheckOutput(newCfg.DiscardCheckOutput) 3597 3598 return nil 3599 } 3600 3601 // registerCache configures the cache and registers all the supported 3602 // types onto the cache. This is NOT safe to call multiple times so 3603 // care should be taken to call this exactly once after the cache 3604 // field has been initialized. 3605 func (a *Agent) registerCache() { 3606 // Note that you should register the _agent_ as the RPC implementation and not 3607 // the a.delegate directly, otherwise tests that rely on overriding RPC 3608 // routing via a.registerEndpoint will not work. 3609 3610 a.cache.RegisterType(cachetype.ConnectCARootName, &cachetype.ConnectCARoot{ 3611 RPC: a, 3612 }, &cache.RegisterOptions{ 3613 // Maintain a blocking query, retry dropped connections quickly 3614 Refresh: true, 3615 RefreshTimer: 0 * time.Second, 3616 RefreshTimeout: 10 * time.Minute, 3617 }) 3618 3619 a.cache.RegisterType(cachetype.ConnectCALeafName, &cachetype.ConnectCALeaf{ 3620 RPC: a, 3621 Cache: a.cache, 3622 Datacenter: a.config.Datacenter, 3623 TestOverrideCAChangeInitialDelay: a.config.ConnectTestCALeafRootChangeSpread, 3624 }, &cache.RegisterOptions{ 3625 // Maintain a blocking query, retry dropped connections quickly 3626 Refresh: true, 3627 RefreshTimer: 0 * time.Second, 3628 RefreshTimeout: 10 * time.Minute, 3629 }) 3630 3631 a.cache.RegisterType(cachetype.IntentionMatchName, &cachetype.IntentionMatch{ 3632 RPC: a, 3633 }, &cache.RegisterOptions{ 3634 // Maintain a blocking query, retry dropped connections quickly 3635 Refresh: true, 3636 RefreshTimer: 0 * time.Second, 3637 RefreshTimeout: 10 * time.Minute, 3638 }) 3639 3640 a.cache.RegisterType(cachetype.CatalogServicesName, &cachetype.CatalogServices{ 3641 RPC: a, 3642 }, &cache.RegisterOptions{ 3643 // Maintain a blocking query, retry dropped connections quickly 3644 Refresh: true, 3645 RefreshTimer: 0 * time.Second, 3646 RefreshTimeout: 10 * time.Minute, 3647 }) 3648 3649 a.cache.RegisterType(cachetype.HealthServicesName, &cachetype.HealthServices{ 3650 RPC: a, 3651 }, &cache.RegisterOptions{ 3652 // Maintain a blocking query, retry dropped connections quickly 3653 Refresh: true, 3654 RefreshTimer: 0 * time.Second, 3655 RefreshTimeout: 10 * time.Minute, 3656 }) 3657 3658 a.cache.RegisterType(cachetype.PreparedQueryName, &cachetype.PreparedQuery{ 3659 RPC: a, 3660 }, &cache.RegisterOptions{ 3661 // Prepared queries don't support blocking 3662 Refresh: false, 3663 }) 3664 3665 a.cache.RegisterType(cachetype.NodeServicesName, &cachetype.NodeServices{ 3666 RPC: a, 3667 }, &cache.RegisterOptions{ 3668 // Maintain a blocking query, retry dropped connections quickly 3669 Refresh: true, 3670 RefreshTimer: 0 * time.Second, 3671 RefreshTimeout: 10 * time.Minute, 3672 }) 3673 } 3674 3675 // defaultProxyCommand returns the default Connect managed proxy command. 3676 func defaultProxyCommand(agentCfg *config.RuntimeConfig) ([]string, error) { 3677 // Get the path to the current executable. This is cached once by the 3678 // library so this is effectively just a variable read. 3679 execPath, err := os.Executable() 3680 if err != nil { 3681 return nil, err 3682 } 3683 3684 // "consul connect proxy" default value for managed daemon proxy 3685 cmd := []string{execPath, "connect", "proxy"} 3686 3687 if agentCfg != nil && agentCfg.LogLevel != "INFO" { 3688 cmd = append(cmd, "-log-level", agentCfg.LogLevel) 3689 } 3690 return cmd, nil 3691 }