github.com/criteo-forks/consul@v1.4.5-criteonogrpc/agent/agent.go (about) 1 package agent 2 3 import ( 4 "context" 5 "crypto/sha512" 6 "crypto/tls" 7 "encoding/json" 8 "fmt" 9 "io" 10 "io/ioutil" 11 "log" 12 "net" 13 "net/http" 14 "os" 15 "path/filepath" 16 "strconv" 17 "strings" 18 "sync" 19 "time" 20 21 "google.golang.org/grpc" 22 23 metrics "github.com/armon/go-metrics" 24 "github.com/hashicorp/consul/acl" 25 "github.com/hashicorp/consul/agent/ae" 26 "github.com/hashicorp/consul/agent/cache" 27 cachetype "github.com/hashicorp/consul/agent/cache-types" 28 "github.com/hashicorp/consul/agent/checks" 29 "github.com/hashicorp/consul/agent/config" 30 "github.com/hashicorp/consul/agent/consul" 31 "github.com/hashicorp/consul/agent/local" 32 "github.com/hashicorp/consul/agent/proxycfg" 33 "github.com/hashicorp/consul/agent/proxyprocess" 34 "github.com/hashicorp/consul/agent/structs" 35 "github.com/hashicorp/consul/agent/systemd" 36 "github.com/hashicorp/consul/agent/token" 37 "github.com/hashicorp/consul/agent/xds" 38 "github.com/hashicorp/consul/api" 39 "github.com/hashicorp/consul/ipaddr" 40 "github.com/hashicorp/consul/lib" 41 "github.com/hashicorp/consul/lib/file" 42 "github.com/hashicorp/consul/logger" 43 "github.com/hashicorp/consul/tlsutil" 44 "github.com/hashicorp/consul/types" 45 "github.com/hashicorp/consul/watch" 46 multierror "github.com/hashicorp/go-multierror" 47 uuid "github.com/hashicorp/go-uuid" 48 "github.com/hashicorp/memberlist" 49 "github.com/hashicorp/raft" 50 "github.com/hashicorp/serf/serf" 51 "github.com/shirou/gopsutil/host" 52 "golang.org/x/net/http2" 53 ) 54 55 const ( 56 // Path to save agent service definitions 57 servicesDir = "services" 58 59 // Path to save agent proxy definitions 60 proxyDir = "proxies" 61 62 // Path to save local agent checks 63 checksDir = "checks" 64 checkStateDir = "checks/state" 65 66 // Name of the file tokens will be persisted within 67 tokensPath = "acl-tokens.json" 68 69 // Default reasons for node/service 
maintenance mode 70 defaultNodeMaintReason = "Maintenance mode is enabled for this node, " + 71 "but no reason was provided. This is a default message." 72 defaultServiceMaintReason = "Maintenance mode is enabled for this " + 73 "service, but no reason was provided. This is a default message." 74 ) 75 76 type configSource int 77 78 const ( 79 ConfigSourceLocal configSource = iota 80 ConfigSourceRemote 81 ) 82 83 // delegate defines the interface shared by both 84 // consul.Client and consul.Server. 85 type delegate interface { 86 Encrypted() bool 87 GetLANCoordinate() (lib.CoordinateSet, error) 88 Leave() error 89 LANMembers() []serf.Member 90 LANMembersAllSegments() ([]serf.Member, error) 91 LANSegmentMembers(segment string) ([]serf.Member, error) 92 LocalMember() serf.Member 93 JoinLAN(addrs []string) (n int, err error) 94 RemoveFailedNode(node string) error 95 ResolveToken(secretID string) (acl.Authorizer, error) 96 RPC(method string, args interface{}, reply interface{}) error 97 ACLsEnabled() bool 98 UseLegacyACLs() bool 99 SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer, replyFn structs.SnapshotReplyFn) error 100 Shutdown() error 101 Stats() map[string]map[string]string 102 ReloadConfig(config *consul.Config) error 103 enterpriseDelegate 104 } 105 106 // notifier is called after a successful JoinLAN. 107 type notifier interface { 108 Notify(string) error 109 } 110 111 // The agent is the long running process that is run on every machine. 112 // It exposes an RPC interface that is used by the CLI to control the 113 // agent. The agent runs the query interfaces like HTTP, DNS, and RPC. 114 // However, it can run in either a client, or server mode. In server 115 // mode, it runs a full Consul server. In client-only mode, it only forwards 116 // requests to other Consul servers. 117 type Agent struct { 118 // config is the agent configuration. 
119 config *config.RuntimeConfig 120 121 // Used for writing our logs 122 logger *log.Logger 123 124 // Output sink for logs 125 LogOutput io.Writer 126 127 // Used for streaming logs to 128 LogWriter *logger.LogWriter 129 130 // In-memory sink used for collecting metrics 131 MemSink *metrics.InmemSink 132 133 // delegate is either a *consul.Server or *consul.Client 134 // depending on the configuration 135 delegate delegate 136 137 // aclMasterAuthorizer is an object that helps manage local ACL enforcement. 138 aclMasterAuthorizer acl.Authorizer 139 140 // state stores a local representation of the node, 141 // services and checks. Used for anti-entropy. 142 State *local.State 143 144 // sync manages the synchronization of the local 145 // and the remote state. 146 sync *ae.StateSyncer 147 148 // syncMu and syncCh are used to coordinate agent endpoints that are blocking 149 // on local state during a config reload. 150 syncMu sync.Mutex 151 syncCh chan struct{} 152 153 // cache is the in-memory cache for data the Agent requests. 
154 cache *cache.Cache 155 156 // checkReapAfter maps the check ID to a timeout after which we should 157 // reap its associated service 158 checkReapAfter map[types.CheckID]time.Duration 159 160 // checkMonitors maps the check ID to an associated monitor 161 checkMonitors map[types.CheckID]*checks.CheckMonitor 162 163 // checkHTTPs maps the check ID to an associated HTTP check 164 checkHTTPs map[types.CheckID]*checks.CheckHTTP 165 166 // checkTCPs maps the check ID to an associated TCP check 167 checkTCPs map[types.CheckID]*checks.CheckTCP 168 169 // checkGRPCs maps the check ID to an associated GRPC check 170 checkGRPCs map[types.CheckID]*checks.CheckGRPC 171 172 // checkTTLs maps the check ID to an associated check TTL 173 checkTTLs map[types.CheckID]*checks.CheckTTL 174 175 // checkDockers maps the check ID to an associated Docker Exec based check 176 checkDockers map[types.CheckID]*checks.CheckDocker 177 178 // checkAliases maps the check ID to an associated Alias checks 179 checkAliases map[types.CheckID]*checks.CheckAlias 180 181 // stateLock protects the agent state 182 stateLock sync.Mutex 183 184 // dockerClient is the client for performing docker health checks. 185 dockerClient *checks.DockerClient 186 187 // eventCh is used to receive user events 188 eventCh chan serf.UserEvent 189 190 // eventBuf stores the most recent events in a ring buffer 191 // using eventIndex as the next index to insert into. This 192 // is guarded by eventLock. When an insert happens, the 193 // eventNotify group is notified. 194 eventBuf []*UserEvent 195 eventIndex int 196 eventLock sync.RWMutex 197 eventNotify NotifyGroup 198 199 reloadCh chan chan error 200 201 shutdown bool 202 shutdownCh chan struct{} 203 shutdownLock sync.Mutex 204 205 // joinLANNotifier is called after a successful JoinLAN. 206 joinLANNotifier notifier 207 208 // retryJoinCh transports errors from the retry join 209 // attempts. 
210 retryJoinCh chan error 211 212 // endpoints maps unique RPC endpoint names to common ones 213 // to allow overriding of RPC handlers since the golang 214 // net/rpc server does not allow this. 215 endpoints map[string]string 216 endpointsLock sync.RWMutex 217 218 // dnsServer provides the DNS API 219 dnsServers []*DNSServer 220 221 // httpServers provides the HTTP API on various endpoints 222 httpServers []*HTTPServer 223 224 // wgServers is the wait group for all HTTP and DNS servers 225 wgServers sync.WaitGroup 226 227 // watchPlans tracks all the currently-running watch plans for the 228 // agent. 229 watchPlans []*watch.Plan 230 231 // tokens holds ACL tokens initially from the configuration, but can 232 // be updated at runtime, so should always be used instead of going to 233 // the configuration directly. 234 tokens *token.Store 235 236 // proxyManager is the proxy process manager for managed Connect proxies. 237 proxyManager *proxyprocess.Manager 238 239 // proxyConfig is the manager for proxy service (Kind = connect-proxy) 240 // configuration state. This ensures all state needed by a proxy registration 241 // is maintained in cache and handles pushing updates to that state into XDS 242 // server to be pushed out to Envoy. This is NOT related to managed proxies 243 // directly. 244 proxyConfig *proxycfg.Manager 245 246 // xdsServer is the Server instance that serves xDS gRPC API. 247 xdsServer *xds.Server 248 249 // grpcServer is the server instance used currently to serve xDS API for 250 // Envoy. 251 grpcServer *grpc.Server 252 253 // tlsConfigurator is the central instance to provide a *tls.Config 254 // based on the current consul configuration. 255 tlsConfigurator *tlsutil.Configurator 256 257 // persistedTokensLock is used to synchronize access to the persisted token 258 // store within the data directory. This will prevent loading while writing as 259 // well as multiple concurrent writes. 
260 persistedTokensLock sync.RWMutex 261 } 262 263 func New(c *config.RuntimeConfig) (*Agent, error) { 264 if c.Datacenter == "" { 265 return nil, fmt.Errorf("Must configure a Datacenter") 266 } 267 if c.DataDir == "" && !c.DevMode { 268 return nil, fmt.Errorf("Must configure a DataDir") 269 } 270 271 a := &Agent{ 272 config: c, 273 checkReapAfter: make(map[types.CheckID]time.Duration), 274 checkMonitors: make(map[types.CheckID]*checks.CheckMonitor), 275 checkTTLs: make(map[types.CheckID]*checks.CheckTTL), 276 checkHTTPs: make(map[types.CheckID]*checks.CheckHTTP), 277 checkTCPs: make(map[types.CheckID]*checks.CheckTCP), 278 checkGRPCs: make(map[types.CheckID]*checks.CheckGRPC), 279 checkDockers: make(map[types.CheckID]*checks.CheckDocker), 280 checkAliases: make(map[types.CheckID]*checks.CheckAlias), 281 eventCh: make(chan serf.UserEvent, 1024), 282 eventBuf: make([]*UserEvent, 256), 283 joinLANNotifier: &systemd.Notifier{}, 284 reloadCh: make(chan chan error), 285 retryJoinCh: make(chan error), 286 shutdownCh: make(chan struct{}), 287 endpoints: make(map[string]string), 288 tokens: new(token.Store), 289 } 290 291 if err := a.initializeACLs(); err != nil { 292 return nil, err 293 } 294 295 return a, nil 296 } 297 298 func LocalConfig(cfg *config.RuntimeConfig) local.Config { 299 lc := local.Config{ 300 AdvertiseAddr: cfg.AdvertiseAddrLAN.String(), 301 CheckUpdateInterval: cfg.CheckUpdateInterval, 302 Datacenter: cfg.Datacenter, 303 DiscardCheckOutput: cfg.DiscardCheckOutput, 304 NodeID: cfg.NodeID, 305 NodeName: cfg.NodeName, 306 TaggedAddresses: map[string]string{}, 307 ProxyBindMinPort: cfg.ConnectProxyBindMinPort, 308 ProxyBindMaxPort: cfg.ConnectProxyBindMaxPort, 309 } 310 for k, v := range cfg.TaggedAddresses { 311 lc.TaggedAddresses[k] = v 312 } 313 return lc 314 } 315 316 func (a *Agent) setupProxyManager() error { 317 acfg, err := a.config.APIConfig(true) 318 if err != nil { 319 return fmt.Errorf("[INFO] agent: Connect managed proxies are disabled due to 
providing an invalid HTTP configuration") 320 } 321 a.proxyManager = proxyprocess.NewManager() 322 a.proxyManager.AllowRoot = a.config.ConnectProxyAllowManagedRoot 323 a.proxyManager.State = a.State 324 a.proxyManager.Logger = a.logger 325 if a.config.DataDir != "" { 326 // DataDir is required for all non-dev mode agents, but we want 327 // to allow setting the data dir for demos and so on for the agent, 328 // so do the check above instead. 329 a.proxyManager.DataDir = filepath.Join(a.config.DataDir, "proxy") 330 331 // Restore from our snapshot (if it exists) 332 if err := a.proxyManager.Restore(a.proxyManager.SnapshotPath()); err != nil { 333 a.logger.Printf("[WARN] agent: error restoring proxy state: %s", err) 334 } 335 } 336 a.proxyManager.ProxyEnv = acfg.GenerateEnv() 337 return nil 338 } 339 340 func (a *Agent) Start() error { 341 a.stateLock.Lock() 342 defer a.stateLock.Unlock() 343 344 c := a.config 345 346 logOutput := a.LogOutput 347 if a.logger == nil { 348 if logOutput == nil { 349 logOutput = os.Stderr 350 } 351 a.logger = log.New(logOutput, "", log.LstdFlags) 352 } 353 354 // Retrieve or generate the node ID before setting up the rest of the 355 // agent, which depends on it. 356 if err := a.setupNodeID(c); err != nil { 357 return fmt.Errorf("Failed to setup node ID: %v", err) 358 } 359 360 // Warn if the node name is incompatible with DNS 361 if InvalidDnsRe.MatchString(a.config.NodeName) { 362 a.logger.Printf("[WARN] agent: Node name %q will not be discoverable "+ 363 "via DNS due to invalid characters. Valid characters include "+ 364 "all alpha-numerics and dashes.", a.config.NodeName) 365 } else if len(a.config.NodeName) > MaxDNSLabelLength { 366 a.logger.Printf("[WARN] agent: Node name %q will not be discoverable "+ 367 "via DNS due to it being too long. 
Valid lengths are between "+ 368 "1 and 63 bytes.", a.config.NodeName) 369 } 370 371 // load the tokens - this requires the logger to be setup 372 // which is why we can't do this in New 373 a.loadTokens(a.config) 374 375 // create the local state 376 a.State = local.NewState(LocalConfig(c), a.logger, a.tokens) 377 378 // create the state synchronization manager which performs 379 // regular and on-demand state synchronizations (anti-entropy). 380 a.sync = ae.NewStateSyncer(a.State, c.AEInterval, a.shutdownCh, a.logger) 381 382 // create the cache 383 a.cache = cache.New(nil) 384 385 // create the config for the rpc server/client 386 consulCfg, err := a.consulConfig() 387 if err != nil { 388 return err 389 } 390 391 // ServerUp is used to inform that a new consul server is now 392 // up. This can be used to speed up the sync process if we are blocking 393 // waiting to discover a consul server 394 consulCfg.ServerUp = a.sync.SyncFull.Trigger 395 396 tlsConfigurator, err := tlsutil.NewConfigurator(c.ToTLSUtilConfig(), a.logger) 397 if err != nil { 398 return err 399 } 400 a.tlsConfigurator = tlsConfigurator 401 402 // Setup either the client or the server. 403 if c.ServerMode { 404 server, err := consul.NewServerLogger(consulCfg, a.logger, a.tokens, a.tlsConfigurator) 405 if err != nil { 406 return fmt.Errorf("Failed to start Consul server: %v", err) 407 } 408 a.delegate = server 409 } else { 410 client, err := consul.NewClientLogger(consulCfg, a.logger, a.tlsConfigurator) 411 if err != nil { 412 return fmt.Errorf("Failed to start Consul client: %v", err) 413 } 414 a.delegate = client 415 } 416 417 // the staggering of the state syncing depends on the cluster size. 418 a.sync.ClusterSize = func() int { return len(a.delegate.LANMembers()) } 419 420 // link the state with the consul server/client and the state syncer 421 // via callbacks. 
After several attempts this was easier than using 422 // channels since the event notification needs to be non-blocking 423 // and that should be hidden in the state syncer implementation. 424 a.State.Delegate = a.delegate 425 a.State.TriggerSyncChanges = a.sync.SyncChanges.Trigger 426 427 // Register the cache. We do this much later so the delegate is 428 // populated from above. 429 a.registerCache() 430 431 // Load checks/services/metadata. 432 if err := a.loadServices(c); err != nil { 433 return err 434 } 435 if err := a.loadProxies(c); err != nil { 436 return err 437 } 438 if err := a.loadChecks(c); err != nil { 439 return err 440 } 441 if err := a.loadMetadata(c); err != nil { 442 return err 443 } 444 445 // create the proxy process manager and start it. This is purposely 446 // done here after the local state above is loaded in so we can have 447 // a more accurate initial state view. 448 if !c.ConnectTestDisableManagedProxies { 449 if err := a.setupProxyManager(); err != nil { 450 a.logger.Printf(err.Error()) 451 } else { 452 go a.proxyManager.Run() 453 } 454 } 455 456 // Start the proxy config manager. 457 a.proxyConfig, err = proxycfg.NewManager(proxycfg.ManagerConfig{ 458 Cache: a.cache, 459 Logger: a.logger, 460 State: a.State, 461 Source: &structs.QuerySource{ 462 Node: a.config.NodeName, 463 Datacenter: a.config.Datacenter, 464 Segment: a.config.SegmentName, 465 }, 466 }) 467 if err != nil { 468 return err 469 } 470 go func() { 471 if err := a.proxyConfig.Run(); err != nil { 472 a.logger.Printf("[ERR] Proxy Config Manager exited: %s", err) 473 } 474 }() 475 476 // Start watching for critical services to deregister, based on their 477 // checks. 478 go a.reapServices() 479 480 // Start handling events. 481 go a.handleEvents() 482 483 // Start sending network coordinate to the server. 484 if !c.DisableCoordinates { 485 go a.sendCoordinate() 486 } 487 488 // Write out the PID file if necessary. 
489 if err := a.storePid(); err != nil { 490 return err 491 } 492 493 // start DNS servers 494 if err := a.listenAndServeDNS(); err != nil { 495 return err 496 } 497 498 // Create listeners and unstarted servers; see comment on listenHTTP why 499 // we are doing this. 500 servers, err := a.listenHTTP() 501 if err != nil { 502 return err 503 } 504 505 // Start HTTP and HTTPS servers. 506 for _, srv := range servers { 507 if err := a.serveHTTP(srv); err != nil { 508 return err 509 } 510 a.httpServers = append(a.httpServers, srv) 511 } 512 513 // Start gRPC server. 514 if err := a.listenAndServeGRPC(); err != nil { 515 return err 516 } 517 518 // register watches 519 if err := a.reloadWatches(a.config); err != nil { 520 return err 521 } 522 523 // start retry join 524 go a.retryJoinLAN() 525 go a.retryJoinWAN() 526 527 return nil 528 } 529 530 func (a *Agent) listenAndServeGRPC() error { 531 if len(a.config.GRPCAddrs) < 1 { 532 return nil 533 } 534 535 a.xdsServer = &xds.Server{ 536 Logger: a.logger, 537 CfgMgr: a.proxyConfig, 538 Authz: a, 539 ResolveToken: a.resolveToken, 540 } 541 a.xdsServer.Initialize() 542 543 var err error 544 if a.config.HTTPSPort > 0 { 545 // gRPC uses the same TLS settings as the HTTPS API. If HTTPS is 546 // enabled then gRPC will require HTTPS as well. 
547 a.grpcServer, err = a.xdsServer.GRPCServer(a.config.CertFile, a.config.KeyFile) 548 } else { 549 a.grpcServer, err = a.xdsServer.GRPCServer("", "") 550 } 551 if err != nil { 552 return err 553 } 554 555 ln, err := a.startListeners(a.config.GRPCAddrs) 556 if err != nil { 557 return err 558 } 559 560 for _, l := range ln { 561 go func(innerL net.Listener) { 562 a.logger.Printf("[INFO] agent: Started gRPC server on %s (%s)", 563 innerL.Addr().String(), innerL.Addr().Network()) 564 err := a.grpcServer.Serve(innerL) 565 if err != nil { 566 a.logger.Printf("[ERR] gRPC server failed: %s", err) 567 } 568 }(l) 569 } 570 return nil 571 } 572 573 func (a *Agent) listenAndServeDNS() error { 574 notif := make(chan net.Addr, len(a.config.DNSAddrs)) 575 errCh := make(chan error, len(a.config.DNSAddrs)) 576 for _, addr := range a.config.DNSAddrs { 577 // create server 578 s, err := NewDNSServer(a) 579 if err != nil { 580 return err 581 } 582 a.dnsServers = append(a.dnsServers, s) 583 584 // start server 585 a.wgServers.Add(1) 586 go func(addr net.Addr) { 587 defer a.wgServers.Done() 588 err := s.ListenAndServe(addr.Network(), addr.String(), func() { notif <- addr }) 589 if err != nil && !strings.Contains(err.Error(), "accept") { 590 errCh <- err 591 } 592 }(addr) 593 } 594 595 // wait for servers to be up 596 timeout := time.After(time.Second) 597 var merr *multierror.Error 598 for range a.config.DNSAddrs { 599 select { 600 case addr := <-notif: 601 a.logger.Printf("[INFO] agent: Started DNS server %s (%s)", addr.String(), addr.Network()) 602 603 case err := <-errCh: 604 merr = multierror.Append(merr, err) 605 case <-timeout: 606 merr = multierror.Append(merr, fmt.Errorf("agent: timeout starting DNS servers")) 607 break 608 } 609 } 610 return merr.ErrorOrNil() 611 } 612 613 func (a *Agent) startListeners(addrs []net.Addr) ([]net.Listener, error) { 614 var ln []net.Listener 615 for _, addr := range addrs { 616 var l net.Listener 617 var err error 618 619 switch x := addr.(type) 
{ 620 case *net.UnixAddr: 621 l, err = a.listenSocket(x.Name) 622 if err != nil { 623 return nil, err 624 } 625 626 case *net.TCPAddr: 627 l, err = net.Listen("tcp", x.String()) 628 if err != nil { 629 return nil, err 630 } 631 l = &tcpKeepAliveListener{l.(*net.TCPListener)} 632 633 default: 634 return nil, fmt.Errorf("unsupported address type %T", addr) 635 } 636 ln = append(ln, l) 637 } 638 return ln, nil 639 } 640 641 // listenHTTP binds listeners to the provided addresses and also returns 642 // pre-configured HTTP servers which are not yet started. The motivation is 643 // that in the current startup/shutdown setup we de-couple the listener 644 // creation from the server startup assuming that if any of the listeners 645 // cannot be bound we fail immediately and later failures do not occur. 646 // Therefore, starting a server with a running listener is assumed to not 647 // produce an error. 648 // 649 // The second motivation is that an HTTPS server needs to use the same TLSConfig 650 // on both the listener and the HTTP server. When listeners and servers are 651 // created at different times this becomes difficult to handle without keeping 652 // the TLS configuration somewhere or recreating it. 653 // 654 // This approach should ultimately be refactored to the point where we just 655 // start the server and any error should trigger a proper shutdown of the agent. 
656 func (a *Agent) listenHTTP() ([]*HTTPServer, error) { 657 var ln []net.Listener 658 var servers []*HTTPServer 659 start := func(proto string, addrs []net.Addr) error { 660 listeners, err := a.startListeners(addrs) 661 if err != nil { 662 return err 663 } 664 665 for _, l := range listeners { 666 var tlscfg *tls.Config 667 _, isTCP := l.(*tcpKeepAliveListener) 668 if isTCP && proto == "https" { 669 tlscfg = a.tlsConfigurator.IncomingHTTPSConfig() 670 l = tls.NewListener(l, tlscfg) 671 } 672 srv := &HTTPServer{ 673 Server: &http.Server{ 674 Addr: l.Addr().String(), 675 TLSConfig: tlscfg, 676 }, 677 ln: l, 678 agent: a, 679 blacklist: NewBlacklist(a.config.HTTPBlockEndpoints), 680 proto: proto, 681 } 682 srv.Server.Handler = srv.handler(a.config.EnableDebug) 683 684 // This will enable upgrading connections to HTTP/2 as 685 // part of TLS negotiation. 686 if proto == "https" { 687 err = http2.ConfigureServer(srv.Server, nil) 688 if err != nil { 689 return err 690 } 691 } 692 693 ln = append(ln, l) 694 servers = append(servers, srv) 695 } 696 return nil 697 } 698 699 if err := start("http", a.config.HTTPAddrs); err != nil { 700 for _, l := range ln { 701 l.Close() 702 } 703 return nil, err 704 } 705 if err := start("https", a.config.HTTPSAddrs); err != nil { 706 for _, l := range ln { 707 l.Close() 708 } 709 return nil, err 710 } 711 return servers, nil 712 } 713 714 // tcpKeepAliveListener sets TCP keep-alive timeouts on accepted 715 // connections. It's used so dead TCP connections eventually go away. 
716 type tcpKeepAliveListener struct { 717 *net.TCPListener 718 } 719 720 func (ln tcpKeepAliveListener) Accept() (c net.Conn, err error) { 721 tc, err := ln.AcceptTCP() 722 if err != nil { 723 return 724 } 725 tc.SetKeepAlive(true) 726 tc.SetKeepAlivePeriod(30 * time.Second) 727 return tc, nil 728 } 729 730 func (a *Agent) listenSocket(path string) (net.Listener, error) { 731 if _, err := os.Stat(path); !os.IsNotExist(err) { 732 a.logger.Printf("[WARN] agent: Replacing socket %q", path) 733 } 734 if err := os.Remove(path); err != nil && !os.IsNotExist(err) { 735 return nil, fmt.Errorf("error removing socket file: %s", err) 736 } 737 l, err := net.Listen("unix", path) 738 if err != nil { 739 return nil, err 740 } 741 user, group, mode := a.config.UnixSocketUser, a.config.UnixSocketGroup, a.config.UnixSocketMode 742 if err := setFilePermissions(path, user, group, mode); err != nil { 743 return nil, fmt.Errorf("Failed setting up socket: %s", err) 744 } 745 return l, nil 746 } 747 748 func (a *Agent) serveHTTP(srv *HTTPServer) error { 749 // https://github.com/golang/go/issues/20239 750 // 751 // In go.8.1 there is a race between Serve and Shutdown. If 752 // Shutdown is called before the Serve go routine was scheduled then 753 // the Serve go routine never returns. This deadlocks the agent 754 // shutdown for some tests since it will wait forever. 
755 notif := make(chan net.Addr) 756 a.wgServers.Add(1) 757 go func() { 758 defer a.wgServers.Done() 759 notif <- srv.ln.Addr() 760 err := srv.Serve(srv.ln) 761 if err != nil && err != http.ErrServerClosed { 762 a.logger.Print(err) 763 } 764 }() 765 766 select { 767 case addr := <-notif: 768 if srv.proto == "https" { 769 a.logger.Printf("[INFO] agent: Started HTTPS server on %s (%s)", addr.String(), addr.Network()) 770 } else { 771 a.logger.Printf("[INFO] agent: Started HTTP server on %s (%s)", addr.String(), addr.Network()) 772 } 773 return nil 774 case <-time.After(time.Second): 775 return fmt.Errorf("agent: timeout starting HTTP servers") 776 } 777 } 778 779 // reloadWatches stops any existing watch plans and attempts to load the given 780 // set of watches. 781 func (a *Agent) reloadWatches(cfg *config.RuntimeConfig) error { 782 // Stop the current watches. 783 for _, wp := range a.watchPlans { 784 wp.Stop() 785 } 786 a.watchPlans = nil 787 788 // Return if there are no watches now. 789 if len(cfg.Watches) == 0 { 790 return nil 791 } 792 793 // Watches use the API to talk to this agent, so that must be enabled. 794 if len(cfg.HTTPAddrs) == 0 && len(cfg.HTTPSAddrs) == 0 { 795 return fmt.Errorf("watch plans require an HTTP or HTTPS endpoint") 796 } 797 798 // Compile the watches 799 var watchPlans []*watch.Plan 800 for _, params := range cfg.Watches { 801 if handlerType, ok := params["handler_type"]; !ok { 802 params["handler_type"] = "script" 803 } else if handlerType != "http" && handlerType != "script" { 804 return fmt.Errorf("Handler type '%s' not recognized", params["handler_type"]) 805 } 806 807 // Don't let people use connect watches via this mechanism for now as it 808 // needs thought about how to do securely and shouldn't be necessary. Note 809 // that if the type assertion fails an type is not a string then 810 // ParseExample below will error so we don't need to handle that case. 
811 if typ, ok := params["type"].(string); ok { 812 if strings.HasPrefix(typ, "connect_") { 813 return fmt.Errorf("Watch type %s is not allowed in agent config", typ) 814 } 815 } 816 817 // Parse the watches, excluding 'handler' and 'args' 818 wp, err := watch.ParseExempt(params, []string{"handler", "args"}) 819 if err != nil { 820 return fmt.Errorf("Failed to parse watch (%#v): %v", params, err) 821 } 822 823 // Get the handler and subprocess arguments 824 handler, hasHandler := wp.Exempt["handler"] 825 args, hasArgs := wp.Exempt["args"] 826 if hasHandler { 827 a.logger.Printf("[WARN] agent: The 'handler' field in watches has been deprecated " + 828 "and replaced with the 'args' field. See https://www.consul.io/docs/agent/watches.html") 829 } 830 if _, ok := handler.(string); hasHandler && !ok { 831 return fmt.Errorf("Watch handler must be a string") 832 } 833 if raw, ok := args.([]interface{}); hasArgs && ok { 834 var parsed []string 835 for _, arg := range raw { 836 v, ok := arg.(string) 837 if !ok { 838 return fmt.Errorf("Watch args must be a list of strings") 839 } 840 841 parsed = append(parsed, v) 842 } 843 wp.Exempt["args"] = parsed 844 } else if hasArgs && !ok { 845 return fmt.Errorf("Watch args must be a list of strings") 846 } 847 if hasHandler && hasArgs || hasHandler && wp.HandlerType == "http" || hasArgs && wp.HandlerType == "http" { 848 return fmt.Errorf("Only one watch handler allowed") 849 } 850 if !hasHandler && !hasArgs && wp.HandlerType != "http" { 851 return fmt.Errorf("Must define a watch handler") 852 } 853 854 // Store the watch plan 855 watchPlans = append(watchPlans, wp) 856 } 857 858 // Fire off a goroutine for each new watch plan. 
859 for _, wp := range watchPlans { 860 config, err := a.config.APIConfig(true) 861 if err != nil { 862 a.logger.Printf("[ERR] agent: Failed to run watch: %v", err) 863 continue 864 } 865 866 a.watchPlans = append(a.watchPlans, wp) 867 go func(wp *watch.Plan) { 868 if h, ok := wp.Exempt["handler"]; ok { 869 wp.Handler = makeWatchHandler(a.LogOutput, h) 870 } else if h, ok := wp.Exempt["args"]; ok { 871 wp.Handler = makeWatchHandler(a.LogOutput, h) 872 } else { 873 httpConfig := wp.Exempt["http_handler_config"].(*watch.HttpHandlerConfig) 874 wp.Handler = makeHTTPWatchHandler(a.LogOutput, httpConfig) 875 } 876 wp.LogOutput = a.LogOutput 877 878 addr := config.Address 879 if config.Scheme == "https" { 880 addr = "https://" + addr 881 } 882 883 if err := wp.RunWithConfig(addr, config); err != nil { 884 a.logger.Printf("[ERR] agent: Failed to run watch: %v", err) 885 } 886 }(wp) 887 } 888 return nil 889 } 890 891 // consulConfig is used to return a consul configuration 892 func (a *Agent) consulConfig() (*consul.Config, error) { 893 // Start with the provided config or default config 894 base := consul.DefaultConfig() 895 896 // This is set when the agent starts up 897 base.NodeID = a.config.NodeID 898 899 // Apply dev mode 900 base.DevMode = a.config.DevMode 901 902 // Override with our config 903 // todo(fs): these are now always set in the runtime config so we can simplify this 904 // todo(fs): or is there a reason to keep it like that? 
905 base.Datacenter = a.config.Datacenter 906 base.PrimaryDatacenter = a.config.PrimaryDatacenter 907 base.DataDir = a.config.DataDir 908 base.NodeName = a.config.NodeName 909 910 base.CoordinateUpdateBatchSize = a.config.ConsulCoordinateUpdateBatchSize 911 base.CoordinateUpdateMaxBatches = a.config.ConsulCoordinateUpdateMaxBatches 912 base.CoordinateUpdatePeriod = a.config.ConsulCoordinateUpdatePeriod 913 914 base.RaftConfig.HeartbeatTimeout = a.config.ConsulRaftHeartbeatTimeout 915 base.RaftConfig.LeaderLeaseTimeout = a.config.ConsulRaftLeaderLeaseTimeout 916 base.RaftConfig.ElectionTimeout = a.config.ConsulRaftElectionTimeout 917 918 base.SerfLANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrLAN.IP.String() 919 base.SerfLANConfig.MemberlistConfig.BindPort = a.config.SerfBindAddrLAN.Port 920 base.SerfLANConfig.MemberlistConfig.AdvertiseAddr = a.config.SerfAdvertiseAddrLAN.IP.String() 921 base.SerfLANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrLAN.Port 922 base.SerfLANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming 923 base.SerfLANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing 924 base.SerfLANConfig.MemberlistConfig.GossipInterval = a.config.GossipLANGossipInterval 925 base.SerfLANConfig.MemberlistConfig.GossipNodes = a.config.GossipLANGossipNodes 926 base.SerfLANConfig.MemberlistConfig.ProbeInterval = a.config.GossipLANProbeInterval 927 base.SerfLANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipLANProbeTimeout 928 base.SerfLANConfig.MemberlistConfig.SuspicionMult = a.config.GossipLANSuspicionMult 929 base.SerfLANConfig.MemberlistConfig.RetransmitMult = a.config.GossipLANRetransmitMult 930 if a.config.ReconnectTimeoutLAN != 0 { 931 base.SerfLANConfig.ReconnectTimeout = a.config.ReconnectTimeoutLAN 932 } 933 934 if a.config.SerfBindAddrWAN != nil { 935 base.SerfWANConfig.MemberlistConfig.BindAddr = a.config.SerfBindAddrWAN.IP.String() 936 
base.SerfWANConfig.MemberlistConfig.BindPort = a.config.SerfBindAddrWAN.Port 937 base.SerfWANConfig.MemberlistConfig.AdvertiseAddr = a.config.SerfAdvertiseAddrWAN.IP.String() 938 base.SerfWANConfig.MemberlistConfig.AdvertisePort = a.config.SerfAdvertiseAddrWAN.Port 939 base.SerfWANConfig.MemberlistConfig.GossipVerifyIncoming = a.config.EncryptVerifyIncoming 940 base.SerfWANConfig.MemberlistConfig.GossipVerifyOutgoing = a.config.EncryptVerifyOutgoing 941 base.SerfWANConfig.MemberlistConfig.GossipInterval = a.config.GossipWANGossipInterval 942 base.SerfWANConfig.MemberlistConfig.GossipNodes = a.config.GossipWANGossipNodes 943 base.SerfWANConfig.MemberlistConfig.ProbeInterval = a.config.GossipWANProbeInterval 944 base.SerfWANConfig.MemberlistConfig.ProbeTimeout = a.config.GossipWANProbeTimeout 945 base.SerfWANConfig.MemberlistConfig.SuspicionMult = a.config.GossipWANSuspicionMult 946 base.SerfWANConfig.MemberlistConfig.RetransmitMult = a.config.GossipWANRetransmitMult 947 if a.config.ReconnectTimeoutWAN != 0 { 948 base.SerfWANConfig.ReconnectTimeout = a.config.ReconnectTimeoutWAN 949 } 950 } else { 951 // Disable serf WAN federation 952 base.SerfWANConfig = nil 953 } 954 955 base.RPCAddr = a.config.RPCBindAddr 956 base.RPCAdvertise = a.config.RPCAdvertiseAddr 957 958 base.Segment = a.config.SegmentName 959 if len(a.config.Segments) > 0 { 960 segments, err := a.segmentConfig() 961 if err != nil { 962 return nil, err 963 } 964 base.Segments = segments 965 } 966 if a.config.Bootstrap { 967 base.Bootstrap = true 968 } 969 if a.config.RejoinAfterLeave { 970 base.RejoinAfterLeave = true 971 } 972 if a.config.BootstrapExpect != 0 { 973 base.BootstrapExpect = a.config.BootstrapExpect 974 } 975 if a.config.RPCProtocol > 0 { 976 base.ProtocolVersion = uint8(a.config.RPCProtocol) 977 } 978 if a.config.RaftProtocol != 0 { 979 base.RaftConfig.ProtocolVersion = raft.ProtocolVersion(a.config.RaftProtocol) 980 } 981 if a.config.RaftSnapshotThreshold != 0 { 982 
base.RaftConfig.SnapshotThreshold = uint64(a.config.RaftSnapshotThreshold) 983 } 984 if a.config.RaftSnapshotInterval != 0 { 985 base.RaftConfig.SnapshotInterval = a.config.RaftSnapshotInterval 986 } 987 if a.config.ACLMasterToken != "" { 988 base.ACLMasterToken = a.config.ACLMasterToken 989 } 990 if a.config.ACLDatacenter != "" { 991 base.ACLDatacenter = a.config.ACLDatacenter 992 } 993 if a.config.ACLTokenTTL != 0 { 994 base.ACLTokenTTL = a.config.ACLTokenTTL 995 } 996 if a.config.ACLPolicyTTL != 0 { 997 base.ACLPolicyTTL = a.config.ACLPolicyTTL 998 } 999 if a.config.ACLDefaultPolicy != "" { 1000 base.ACLDefaultPolicy = a.config.ACLDefaultPolicy 1001 } 1002 if a.config.ACLDownPolicy != "" { 1003 base.ACLDownPolicy = a.config.ACLDownPolicy 1004 } 1005 base.ACLEnforceVersion8 = a.config.ACLEnforceVersion8 1006 base.ACLTokenReplication = a.config.ACLTokenReplication 1007 base.ACLsEnabled = a.config.ACLsEnabled 1008 if a.config.ACLEnableKeyListPolicy { 1009 base.ACLEnableKeyListPolicy = a.config.ACLEnableKeyListPolicy 1010 } 1011 if a.config.SessionTTLMin != 0 { 1012 base.SessionTTLMin = a.config.SessionTTLMin 1013 } 1014 if a.config.NonVotingServer { 1015 base.NonVoter = a.config.NonVotingServer 1016 } 1017 1018 // These are fully specified in the agent defaults, so we can simply 1019 // copy them over. 
1020 base.AutopilotConfig.CleanupDeadServers = a.config.AutopilotCleanupDeadServers 1021 base.AutopilotConfig.LastContactThreshold = a.config.AutopilotLastContactThreshold 1022 base.AutopilotConfig.MaxTrailingLogs = uint64(a.config.AutopilotMaxTrailingLogs) 1023 base.AutopilotConfig.ServerStabilizationTime = a.config.AutopilotServerStabilizationTime 1024 base.AutopilotConfig.RedundancyZoneTag = a.config.AutopilotRedundancyZoneTag 1025 base.AutopilotConfig.DisableUpgradeMigration = a.config.AutopilotDisableUpgradeMigration 1026 base.AutopilotConfig.UpgradeVersionTag = a.config.AutopilotUpgradeVersionTag 1027 1028 // make sure the advertise address is always set 1029 if base.RPCAdvertise == nil { 1030 base.RPCAdvertise = base.RPCAddr 1031 } 1032 1033 // Rate limiting for RPC calls. 1034 if a.config.RPCRateLimit > 0 { 1035 base.RPCRate = a.config.RPCRateLimit 1036 } 1037 if a.config.RPCMaxBurst > 0 { 1038 base.RPCMaxBurst = a.config.RPCMaxBurst 1039 } 1040 1041 // RPC-related performance configs. 1042 if a.config.RPCHoldTimeout > 0 { 1043 base.RPCHoldTimeout = a.config.RPCHoldTimeout 1044 } 1045 if a.config.LeaveDrainTime > 0 { 1046 base.LeaveDrainTime = a.config.LeaveDrainTime 1047 } 1048 1049 // set the src address for outgoing rpc connections 1050 // Use port 0 so that outgoing connections use a random port. 
1051 if !ipaddr.IsAny(base.RPCAddr.IP) { 1052 base.RPCSrcAddr = &net.TCPAddr{IP: base.RPCAddr.IP} 1053 } 1054 1055 // Format the build string 1056 revision := a.config.Revision 1057 if len(revision) > 8 { 1058 revision = revision[:8] 1059 } 1060 base.Build = fmt.Sprintf("%s%s:%s", a.config.Version, a.config.VersionPrerelease, revision) 1061 1062 // Copy the TLS configuration 1063 base.VerifyIncoming = a.config.VerifyIncoming || a.config.VerifyIncomingRPC 1064 if a.config.CAPath != "" || a.config.CAFile != "" { 1065 base.UseTLS = true 1066 } 1067 base.VerifyOutgoing = a.config.VerifyOutgoing 1068 base.VerifyServerHostname = a.config.VerifyServerHostname 1069 base.CAFile = a.config.CAFile 1070 base.CAPath = a.config.CAPath 1071 base.CertFile = a.config.CertFile 1072 base.KeyFile = a.config.KeyFile 1073 base.ServerName = a.config.ServerName 1074 base.Domain = a.config.DNSDomain 1075 base.TLSMinVersion = a.config.TLSMinVersion 1076 base.TLSCipherSuites = a.config.TLSCipherSuites 1077 base.TLSPreferServerCipherSuites = a.config.TLSPreferServerCipherSuites 1078 1079 // Copy the Connect CA bootstrap config 1080 if a.config.ConnectEnabled { 1081 base.ConnectEnabled = true 1082 1083 // Allow config to specify cluster_id provided it's a valid UUID. This is 1084 // meant only for tests where a deterministic ID makes fixtures much simpler 1085 // to work with but since it's only read on initial cluster bootstrap it's not 1086 // that much of a liability in production. The worst a user could do is 1087 // configure logically separate clusters with same ID by mistake but we can 1088 // avoid documenting this is even an option. 
1089 if clusterID, ok := a.config.ConnectCAConfig["cluster_id"]; ok { 1090 if cIDStr, ok := clusterID.(string); ok { 1091 if _, err := uuid.ParseUUID(cIDStr); err == nil { 1092 // Valid UUID configured, use that 1093 base.CAConfig.ClusterID = cIDStr 1094 } 1095 } 1096 if base.CAConfig.ClusterID == "" { 1097 // If the tried to specify an ID but typoed it don't ignore as they will 1098 // then bootstrap with a new ID and have to throw away the whole cluster 1099 // and start again. 1100 a.logger.Println("[ERR] connect CA config cluster_id specified but " + 1101 "is not a valid UUID, aborting startup") 1102 return nil, fmt.Errorf("cluster_id was supplied but was not a valid UUID") 1103 } 1104 } 1105 1106 if a.config.ConnectCAProvider != "" { 1107 base.CAConfig.Provider = a.config.ConnectCAProvider 1108 } 1109 1110 // Merge connect CA Config regardless of provider (since there are some 1111 // common config options valid to all like leaf TTL). 1112 for k, v := range a.config.ConnectCAConfig { 1113 base.CAConfig.Config[k] = v 1114 } 1115 } 1116 1117 // Setup the user event callback 1118 base.UserEventHandler = func(e serf.UserEvent) { 1119 select { 1120 case a.eventCh <- e: 1121 case <-a.shutdownCh: 1122 } 1123 } 1124 1125 // Setup the loggers 1126 base.LogOutput = a.LogOutput 1127 1128 // This will set up the LAN keyring, as well as the WAN and any segments 1129 // for servers. 1130 if err := a.setupKeyrings(base); err != nil { 1131 return nil, fmt.Errorf("Failed to configure keyring: %v", err) 1132 } 1133 1134 base.WatchSoftLimit = a.config.WatchSoftLimit 1135 1136 return base, nil 1137 } 1138 1139 // Setup the serf and memberlist config for any defined network segments. 
1140 func (a *Agent) segmentConfig() ([]consul.NetworkSegment, error) { 1141 var segments []consul.NetworkSegment 1142 config := a.config 1143 1144 for _, s := range config.Segments { 1145 serfConf := consul.DefaultConfig().SerfLANConfig 1146 1147 serfConf.MemberlistConfig.BindAddr = s.Bind.IP.String() 1148 serfConf.MemberlistConfig.BindPort = s.Bind.Port 1149 serfConf.MemberlistConfig.AdvertiseAddr = s.Advertise.IP.String() 1150 serfConf.MemberlistConfig.AdvertisePort = s.Advertise.Port 1151 1152 if config.ReconnectTimeoutLAN != 0 { 1153 serfConf.ReconnectTimeout = config.ReconnectTimeoutLAN 1154 } 1155 if config.EncryptVerifyIncoming { 1156 serfConf.MemberlistConfig.GossipVerifyIncoming = config.EncryptVerifyIncoming 1157 } 1158 if config.EncryptVerifyOutgoing { 1159 serfConf.MemberlistConfig.GossipVerifyOutgoing = config.EncryptVerifyOutgoing 1160 } 1161 1162 var rpcAddr *net.TCPAddr 1163 if s.RPCListener { 1164 rpcAddr = &net.TCPAddr{ 1165 IP: s.Bind.IP, 1166 Port: a.config.ServerPort, 1167 } 1168 } 1169 1170 segments = append(segments, consul.NetworkSegment{ 1171 Name: s.Name, 1172 Bind: serfConf.MemberlistConfig.BindAddr, 1173 Advertise: serfConf.MemberlistConfig.AdvertiseAddr, 1174 Port: s.Bind.Port, 1175 RPCAddr: rpcAddr, 1176 SerfConfig: serfConf, 1177 }) 1178 } 1179 1180 return segments, nil 1181 } 1182 1183 // makeRandomID will generate a random UUID for a node. 1184 func (a *Agent) makeRandomID() (string, error) { 1185 id, err := uuid.GenerateUUID() 1186 if err != nil { 1187 return "", err 1188 } 1189 1190 a.logger.Printf("[DEBUG] agent: Using random ID %q as node ID", id) 1191 return id, nil 1192 } 1193 1194 // makeNodeID will try to find a host-specific ID, or else will generate a 1195 // random ID. The returned ID will always be formatted as a GUID. We don't tell 1196 // the caller whether this ID is random or stable since the consequences are 1197 // high for us if this changes, so we will persist it either way. 
This will let 1198 // gopsutil change implementations without affecting in-place upgrades of nodes. 1199 func (a *Agent) makeNodeID() (string, error) { 1200 // If they've disabled host-based IDs then just make a random one. 1201 if a.config.DisableHostNodeID { 1202 return a.makeRandomID() 1203 } 1204 1205 // Try to get a stable ID associated with the host itself. 1206 info, err := host.Info() 1207 if err != nil { 1208 a.logger.Printf("[DEBUG] agent: Couldn't get a unique ID from the host: %v", err) 1209 return a.makeRandomID() 1210 } 1211 1212 // Make sure the host ID parses as a UUID, since we don't have complete 1213 // control over this process. 1214 id := strings.ToLower(info.HostID) 1215 if _, err := uuid.ParseUUID(id); err != nil { 1216 a.logger.Printf("[DEBUG] agent: Unique ID %q from host isn't formatted as a UUID: %v", 1217 id, err) 1218 return a.makeRandomID() 1219 } 1220 1221 // Hash the input to make it well distributed. The reported Host UUID may be 1222 // similar across nodes if they are on a cloud provider or on motherboards 1223 // created from the same batch. 1224 buf := sha512.Sum512([]byte(id)) 1225 id = fmt.Sprintf("%08x-%04x-%04x-%04x-%12x", 1226 buf[0:4], 1227 buf[4:6], 1228 buf[6:8], 1229 buf[8:10], 1230 buf[10:16]) 1231 1232 a.logger.Printf("[DEBUG] agent: Using unique ID %q from host as node ID", id) 1233 return id, nil 1234 } 1235 1236 // setupNodeID will pull the persisted node ID, if any, or create a random one 1237 // and persist it. 1238 func (a *Agent) setupNodeID(config *config.RuntimeConfig) error { 1239 // If they've configured a node ID manually then just use that, as 1240 // long as it's valid. 1241 if config.NodeID != "" { 1242 config.NodeID = types.NodeID(strings.ToLower(string(config.NodeID))) 1243 if _, err := uuid.ParseUUID(string(config.NodeID)); err != nil { 1244 return err 1245 } 1246 1247 return nil 1248 } 1249 1250 // For dev mode we have no filesystem access so just make one. 
1251 if a.config.DataDir == "" { 1252 id, err := a.makeNodeID() 1253 if err != nil { 1254 return err 1255 } 1256 1257 config.NodeID = types.NodeID(id) 1258 return nil 1259 } 1260 1261 // Load saved state, if any. Since a user could edit this, we also 1262 // validate it. 1263 fileID := filepath.Join(config.DataDir, "node-id") 1264 if _, err := os.Stat(fileID); err == nil { 1265 rawID, err := ioutil.ReadFile(fileID) 1266 if err != nil { 1267 return err 1268 } 1269 1270 nodeID := strings.TrimSpace(string(rawID)) 1271 nodeID = strings.ToLower(nodeID) 1272 if _, err := uuid.ParseUUID(nodeID); err != nil { 1273 return err 1274 } 1275 1276 config.NodeID = types.NodeID(nodeID) 1277 } 1278 1279 // If we still don't have a valid node ID, make one. 1280 if config.NodeID == "" { 1281 id, err := a.makeNodeID() 1282 if err != nil { 1283 return err 1284 } 1285 if err := lib.EnsurePath(fileID, false); err != nil { 1286 return err 1287 } 1288 if err := ioutil.WriteFile(fileID, []byte(id), 0600); err != nil { 1289 return err 1290 } 1291 1292 config.NodeID = types.NodeID(id) 1293 } 1294 return nil 1295 } 1296 1297 // setupBaseKeyrings configures the LAN and WAN keyrings. 1298 func (a *Agent) setupBaseKeyrings(config *consul.Config) error { 1299 // If the keyring file is disabled then just poke the provided key 1300 // into the in-memory keyring. 1301 federationEnabled := config.SerfWANConfig != nil 1302 if a.config.DisableKeyringFile { 1303 if a.config.EncryptKey == "" { 1304 return nil 1305 } 1306 1307 keys := []string{a.config.EncryptKey} 1308 if err := loadKeyring(config.SerfLANConfig, keys); err != nil { 1309 return err 1310 } 1311 if a.config.ServerMode && federationEnabled { 1312 if err := loadKeyring(config.SerfWANConfig, keys); err != nil { 1313 return err 1314 } 1315 } 1316 return nil 1317 } 1318 1319 // Otherwise, we need to deal with the keyring files. 
1320 fileLAN := filepath.Join(a.config.DataDir, SerfLANKeyring) 1321 fileWAN := filepath.Join(a.config.DataDir, SerfWANKeyring) 1322 1323 if a.config.EncryptKey == "" { 1324 goto LOAD 1325 } 1326 if _, err := os.Stat(fileLAN); err != nil { 1327 if err := initKeyring(fileLAN, a.config.EncryptKey); err != nil { 1328 return err 1329 } 1330 } 1331 if a.config.ServerMode && federationEnabled { 1332 if _, err := os.Stat(fileWAN); err != nil { 1333 if err := initKeyring(fileWAN, a.config.EncryptKey); err != nil { 1334 return err 1335 } 1336 } 1337 } 1338 1339 LOAD: 1340 if _, err := os.Stat(fileLAN); err == nil { 1341 config.SerfLANConfig.KeyringFile = fileLAN 1342 } 1343 if err := loadKeyringFile(config.SerfLANConfig); err != nil { 1344 return err 1345 } 1346 if a.config.ServerMode && federationEnabled { 1347 if _, err := os.Stat(fileWAN); err == nil { 1348 config.SerfWANConfig.KeyringFile = fileWAN 1349 } 1350 if err := loadKeyringFile(config.SerfWANConfig); err != nil { 1351 return err 1352 } 1353 } 1354 1355 return nil 1356 } 1357 1358 // setupKeyrings is used to initialize and load keyrings during agent startup. 1359 func (a *Agent) setupKeyrings(config *consul.Config) error { 1360 // First set up the LAN and WAN keyrings. 1361 if err := a.setupBaseKeyrings(config); err != nil { 1362 return err 1363 } 1364 1365 // If there's no LAN keyring then there's nothing else to set up for 1366 // any segments. 1367 lanKeyring := config.SerfLANConfig.MemberlistConfig.Keyring 1368 if lanKeyring == nil { 1369 return nil 1370 } 1371 1372 // Copy the initial state of the LAN keyring into each segment config. 1373 // Segments don't have their own keyring file, they rely on the LAN 1374 // holding the state so things can't get out of sync. 
1375 k, pk := lanKeyring.GetKeys(), lanKeyring.GetPrimaryKey() 1376 for _, segment := range config.Segments { 1377 keyring, err := memberlist.NewKeyring(k, pk) 1378 if err != nil { 1379 return err 1380 } 1381 segment.SerfConfig.MemberlistConfig.Keyring = keyring 1382 } 1383 return nil 1384 } 1385 1386 // registerEndpoint registers a handler for the consul RPC server 1387 // under a unique name while making it accessible under the provided 1388 // name. This allows overwriting handlers for the golang net/rpc 1389 // service which does not allow this. 1390 func (a *Agent) registerEndpoint(name string, handler interface{}) error { 1391 srv, ok := a.delegate.(*consul.Server) 1392 if !ok { 1393 panic("agent must be a server") 1394 } 1395 realname := fmt.Sprintf("%s-%d", name, time.Now().UnixNano()) 1396 a.endpointsLock.Lock() 1397 a.endpoints[name] = realname 1398 a.endpointsLock.Unlock() 1399 return srv.RegisterEndpoint(realname, handler) 1400 } 1401 1402 // RPC is used to make an RPC call to the Consul servers 1403 // This allows the agent to implement the Consul.Interface 1404 func (a *Agent) RPC(method string, args interface{}, reply interface{}) error { 1405 a.endpointsLock.RLock() 1406 // fast path: only translate if there are overrides 1407 if len(a.endpoints) > 0 { 1408 p := strings.SplitN(method, ".", 2) 1409 if e := a.endpoints[p[0]]; e != "" { 1410 method = e + "." + p[1] 1411 } 1412 } 1413 a.endpointsLock.RUnlock() 1414 return a.delegate.RPC(method, args, reply) 1415 } 1416 1417 // SnapshotRPC performs the requested snapshot RPC against the Consul server in 1418 // a streaming manner. The contents of in will be read and passed along as the 1419 // payload, and the response message will determine the error status, and any 1420 // return payload will be written to out. 
func (a *Agent) SnapshotRPC(args *structs.SnapshotRequest, in io.Reader, out io.Writer,
	replyFn structs.SnapshotReplyFn) error {
	// Pure pass-through to the delegate.
	return a.delegate.SnapshotRPC(args, in, out, replyFn)
}

// Leave is used to prepare the agent for a graceful shutdown
// It simply forwards to the delegate's Leave and returns its error.
func (a *Agent) Leave() error {
	return a.delegate.Leave()
}

// ShutdownAgent is used to hard stop the agent. Should be preceded by
// Leave to do it gracefully. Should be followed by ShutdownEndpoints to
// terminate the HTTP and DNS servers as well.
//
// The call holds shutdownLock throughout and is a no-op (returning nil) once
// the shutdown flag has been set by an earlier call. The returned error is the
// one reported by the delegate's Shutdown; failures from the proxy manager and
// from deleting the pid file are only logged.
func (a *Agent) ShutdownAgent() error {
	a.shutdownLock.Lock()
	defer a.shutdownLock.Unlock()

	if a.shutdown {
		return nil
	}
	a.logger.Println("[INFO] agent: Requesting shutdown")

	// Stop all the checks
	// stateLock is held (via defer) until the function returns.
	a.stateLock.Lock()
	defer a.stateLock.Unlock()
	for _, chk := range a.checkMonitors {
		chk.Stop()
	}
	for _, chk := range a.checkTTLs {
		chk.Stop()
	}
	for _, chk := range a.checkHTTPs {
		chk.Stop()
	}
	for _, chk := range a.checkTCPs {
		chk.Stop()
	}
	for _, chk := range a.checkGRPCs {
		chk.Stop()
	}
	for _, chk := range a.checkDockers {
		chk.Stop()
	}
	for _, chk := range a.checkAliases {
		chk.Stop()
	}

	// Stop gRPC
	if a.grpcServer != nil {
		a.grpcServer.Stop()
	}

	// Stop the proxy config manager
	if a.proxyConfig != nil {
		a.proxyConfig.Close()
	}

	// Stop the proxy process manager
	if a.proxyManager != nil {
		// If persistence is disabled (implies DevMode but a subset of DevMode) then
		// don't leave the proxies running since the agent will not be able to
		// recover them later.
		if a.config.DataDir == "" {
			a.logger.Printf("[WARN] agent: dev mode disabled persistence, killing " +
				"all proxies since we can't recover them")
			if err := a.proxyManager.Kill(); err != nil {
				a.logger.Printf("[WARN] agent: error shutting down proxy manager: %s", err)
			}
		} else {
			if err := a.proxyManager.Close(); err != nil {
				a.logger.Printf("[WARN] agent: error shutting down proxy manager: %s", err)
			}
		}
	}

	// Stop the cache background work
	if a.cache != nil {
		a.cache.Close()
	}

	// Shut down the delegate and remember its error as the overall result.
	var err error
	if a.delegate != nil {
		err = a.delegate.Shutdown()
		if _, ok := a.delegate.(*consul.Server); ok {
			a.logger.Print("[INFO] agent: consul server down")
		} else {
			a.logger.Print("[INFO] agent: consul client down")
		}
	}

	// A failure to remove the pid file is logged but does not fail shutdown.
	pidErr := a.deletePid()
	if pidErr != nil {
		a.logger.Println("[WARN] agent: could not delete pid file ", pidErr)
	}

	// Mark the agent as shut down and signal everyone selecting on shutdownCh.
	a.logger.Println("[INFO] agent: shutdown complete")
	a.shutdown = true
	close(a.shutdownCh)
	return err
}

// ShutdownEndpoints terminates the HTTP and DNS servers. Should be
// preceded by ShutdownAgent.
1524 func (a *Agent) ShutdownEndpoints() { 1525 a.shutdownLock.Lock() 1526 defer a.shutdownLock.Unlock() 1527 1528 if len(a.dnsServers) == 0 && len(a.httpServers) == 0 { 1529 return 1530 } 1531 1532 for _, srv := range a.dnsServers { 1533 a.logger.Printf("[INFO] agent: Stopping DNS server %s (%s)", srv.Server.Addr, srv.Server.Net) 1534 srv.Shutdown() 1535 } 1536 a.dnsServers = nil 1537 1538 for _, srv := range a.httpServers { 1539 a.logger.Printf("[INFO] agent: Stopping %s server %s (%s)", strings.ToUpper(srv.proto), srv.ln.Addr().String(), srv.ln.Addr().Network()) 1540 ctx, cancel := context.WithTimeout(context.Background(), time.Second) 1541 defer cancel() 1542 srv.Shutdown(ctx) 1543 if ctx.Err() == context.DeadlineExceeded { 1544 a.logger.Printf("[WARN] agent: Timeout stopping %s server %s (%s)", strings.ToUpper(srv.proto), srv.ln.Addr().String(), srv.ln.Addr().Network()) 1545 } 1546 } 1547 a.httpServers = nil 1548 1549 a.logger.Println("[INFO] agent: Waiting for endpoints to shut down") 1550 a.wgServers.Wait() 1551 a.logger.Print("[INFO] agent: Endpoints down") 1552 } 1553 1554 // ReloadCh is used to return a channel that can be 1555 // used for triggering reloads and returning a response. 1556 func (a *Agent) ReloadCh() chan chan error { 1557 return a.reloadCh 1558 } 1559 1560 // RetryJoinCh is a channel that transports errors 1561 // from the retry join process. 1562 func (a *Agent) RetryJoinCh() <-chan error { 1563 return a.retryJoinCh 1564 } 1565 1566 // ShutdownCh is used to return a channel that can be 1567 // selected to wait for the agent to perform a shutdown. 
1568 func (a *Agent) ShutdownCh() <-chan struct{} { 1569 return a.shutdownCh 1570 } 1571 1572 // JoinLAN is used to have the agent join a LAN cluster 1573 func (a *Agent) JoinLAN(addrs []string) (n int, err error) { 1574 a.logger.Printf("[INFO] agent: (LAN) joining: %v", addrs) 1575 n, err = a.delegate.JoinLAN(addrs) 1576 a.logger.Printf("[INFO] agent: (LAN) joined: %d Err: %v", n, err) 1577 if err == nil && a.joinLANNotifier != nil { 1578 if notifErr := a.joinLANNotifier.Notify(systemd.Ready); notifErr != nil { 1579 a.logger.Printf("[DEBUG] agent: systemd notify failed: %v", notifErr) 1580 } 1581 } 1582 return 1583 } 1584 1585 // JoinWAN is used to have the agent join a WAN cluster 1586 func (a *Agent) JoinWAN(addrs []string) (n int, err error) { 1587 a.logger.Printf("[INFO] agent: (WAN) joining: %v", addrs) 1588 if srv, ok := a.delegate.(*consul.Server); ok { 1589 n, err = srv.JoinWAN(addrs) 1590 } else { 1591 err = fmt.Errorf("Must be a server to join WAN cluster") 1592 } 1593 a.logger.Printf("[INFO] agent: (WAN) joined: %d Err: %v", n, err) 1594 return 1595 } 1596 1597 // ForceLeave is used to remove a failed node from the cluster 1598 func (a *Agent) ForceLeave(node string) (err error) { 1599 a.logger.Printf("[INFO] agent: Force leaving node: %v", node) 1600 err = a.delegate.RemoveFailedNode(node) 1601 if err != nil { 1602 a.logger.Printf("[WARN] agent: Failed to remove node: %v", err) 1603 } 1604 return err 1605 } 1606 1607 // LocalMember is used to return the local node 1608 func (a *Agent) LocalMember() serf.Member { 1609 return a.delegate.LocalMember() 1610 } 1611 1612 // LANMembers is used to retrieve the LAN members 1613 func (a *Agent) LANMembers() []serf.Member { 1614 return a.delegate.LANMembers() 1615 } 1616 1617 // WANMembers is used to retrieve the WAN members 1618 func (a *Agent) WANMembers() []serf.Member { 1619 if srv, ok := a.delegate.(*consul.Server); ok { 1620 return srv.WANMembers() 1621 } 1622 return nil 1623 } 1624 1625 // StartSync is 
called once Services and Checks are registered. 1626 // This is called to prevent a race between clients and the anti-entropy routines 1627 func (a *Agent) StartSync() { 1628 go a.sync.Run() 1629 a.logger.Printf("[INFO] agent: started state syncer") 1630 } 1631 1632 // PauseSync is used to pause anti-entropy while bulk changes are made. It also 1633 // sets state that agent-local watches use to "ride out" config reloads and bulk 1634 // updates which might spuriously unload state and reload it again. 1635 func (a *Agent) PauseSync() { 1636 // Do this outside of lock as it has it's own locking 1637 a.sync.Pause() 1638 1639 // Coordinate local state watchers 1640 a.syncMu.Lock() 1641 defer a.syncMu.Unlock() 1642 if a.syncCh == nil { 1643 a.syncCh = make(chan struct{}) 1644 } 1645 } 1646 1647 // ResumeSync is used to unpause anti-entropy after bulk changes are make 1648 func (a *Agent) ResumeSync() { 1649 // a.sync maintains a stack/ref count of Pause calls since we call 1650 // Pause/Resume in nested way during a reload and AddService. We only want to 1651 // trigger local state watchers if this Resume call actually started sync back 1652 // up again (i.e. was the last resume on the stack). We could check that 1653 // separately with a.sync.Paused but that is racey since another Pause call 1654 // might be made between our Resume and checking Paused. 1655 resumed := a.sync.Resume() 1656 1657 if !resumed { 1658 // Return early so we don't notify local watchers until we are actually 1659 // resumed. 1660 return 1661 } 1662 1663 // Coordinate local state watchers 1664 a.syncMu.Lock() 1665 defer a.syncMu.Unlock() 1666 1667 if a.syncCh != nil { 1668 close(a.syncCh) 1669 a.syncCh = nil 1670 } 1671 } 1672 1673 // syncPausedCh returns either a channel or nil. If nil sync is not paused. If 1674 // non-nil, the channel will be closed when sync resumes. 
1675 func (a *Agent) syncPausedCh() <-chan struct{} { 1676 a.syncMu.Lock() 1677 defer a.syncMu.Unlock() 1678 return a.syncCh 1679 } 1680 1681 // GetLANCoordinate returns the coordinates of this node in the local pools 1682 // (assumes coordinates are enabled, so check that before calling). 1683 func (a *Agent) GetLANCoordinate() (lib.CoordinateSet, error) { 1684 return a.delegate.GetLANCoordinate() 1685 } 1686 1687 // sendCoordinate is a long-running loop that periodically sends our coordinate 1688 // to the server. Closing the agent's shutdownChannel will cause this to exit. 1689 func (a *Agent) sendCoordinate() { 1690 OUTER: 1691 for { 1692 rate := a.config.SyncCoordinateRateTarget 1693 min := a.config.SyncCoordinateIntervalMin 1694 intv := lib.RateScaledInterval(rate, min, len(a.LANMembers())) 1695 intv = intv + lib.RandomStagger(intv) 1696 1697 select { 1698 case <-time.After(intv): 1699 members := a.LANMembers() 1700 grok, err := consul.CanServersUnderstandProtocol(members, 3) 1701 if err != nil { 1702 a.logger.Printf("[ERR] agent: Failed to check servers: %s", err) 1703 continue 1704 } 1705 if !grok { 1706 a.logger.Printf("[DEBUG] agent: Skipping coordinate updates until servers are upgraded") 1707 continue 1708 } 1709 1710 cs, err := a.GetLANCoordinate() 1711 if err != nil { 1712 a.logger.Printf("[ERR] agent: Failed to get coordinate: %s", err) 1713 continue 1714 } 1715 1716 for segment, coord := range cs { 1717 req := structs.CoordinateUpdateRequest{ 1718 Datacenter: a.config.Datacenter, 1719 Node: a.config.NodeName, 1720 Segment: segment, 1721 Coord: coord, 1722 WriteRequest: structs.WriteRequest{Token: a.tokens.AgentToken()}, 1723 } 1724 var reply struct{} 1725 if err := a.RPC("Coordinate.Update", &req, &reply); err != nil { 1726 if acl.IsErrPermissionDenied(err) { 1727 a.logger.Printf("[WARN] agent: Coordinate update blocked by ACLs") 1728 } else { 1729 a.logger.Printf("[ERR] agent: Coordinate update error: %v", err) 1730 } 1731 continue OUTER 1732 } 
1733 } 1734 case <-a.shutdownCh: 1735 return 1736 } 1737 } 1738 } 1739 1740 // reapServicesInternal does a single pass, looking for services to reap. 1741 func (a *Agent) reapServicesInternal() { 1742 reaped := make(map[string]bool) 1743 for checkID, cs := range a.State.CriticalCheckStates() { 1744 serviceID := cs.Check.ServiceID 1745 1746 // There's nothing to do if there's no service. 1747 if serviceID == "" { 1748 continue 1749 } 1750 1751 // There might be multiple checks for one service, so 1752 // we don't need to reap multiple times. 1753 if reaped[serviceID] { 1754 continue 1755 } 1756 1757 // See if there's a timeout. 1758 // todo(fs): this looks fishy... why is there another data structure in the agent with its own lock? 1759 a.stateLock.Lock() 1760 timeout := a.checkReapAfter[checkID] 1761 a.stateLock.Unlock() 1762 1763 // Reap, if necessary. We keep track of which service 1764 // this is so that we won't try to remove it again. 1765 if timeout > 0 && cs.CriticalFor() > timeout { 1766 reaped[serviceID] = true 1767 if err := a.RemoveService(serviceID, true); err != nil { 1768 a.logger.Printf("[ERR] agent: unable to deregister service %q after check %q has been critical for too long: %s", 1769 serviceID, checkID, err) 1770 } else { 1771 a.logger.Printf("[INFO] agent: Check %q for service %q has been critical for too long; deregistered service", 1772 checkID, serviceID) 1773 } 1774 } 1775 } 1776 } 1777 1778 // reapServices is a long running goroutine that looks for checks that have been 1779 // critical too long and deregisters their associated services. 1780 func (a *Agent) reapServices() { 1781 for { 1782 select { 1783 case <-time.After(a.config.CheckReapInterval): 1784 a.reapServicesInternal() 1785 1786 case <-a.shutdownCh: 1787 return 1788 } 1789 } 1790 1791 } 1792 1793 // persistedService is used to wrap a service definition and bundle it 1794 // with an ACL token so we can restore both at a later agent start. 
1795 type persistedService struct { 1796 Token string 1797 Service *structs.NodeService 1798 } 1799 1800 // persistService saves a service definition to a JSON file in the data dir 1801 func (a *Agent) persistService(service *structs.NodeService) error { 1802 svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(service.ID)) 1803 1804 wrapped := persistedService{ 1805 Token: a.State.ServiceToken(service.ID), 1806 Service: service, 1807 } 1808 encoded, err := json.Marshal(wrapped) 1809 if err != nil { 1810 return err 1811 } 1812 1813 return file.WriteAtomic(svcPath, encoded) 1814 } 1815 1816 // purgeService removes a persisted service definition file from the data dir 1817 func (a *Agent) purgeService(serviceID string) error { 1818 svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(serviceID)) 1819 if _, err := os.Stat(svcPath); err == nil { 1820 return os.Remove(svcPath) 1821 } 1822 return nil 1823 } 1824 1825 // persistedProxy is used to wrap a proxy definition and bundle it with an Proxy 1826 // token so we can continue to authenticate the running proxy after a restart. 1827 type persistedProxy struct { 1828 ProxyToken string 1829 Proxy *structs.ConnectManagedProxy 1830 1831 // Set to true when the proxy information originated from the agents configuration 1832 // as opposed to API registration. 
1833 FromFile bool 1834 } 1835 1836 // persistProxy saves a proxy definition to a JSON file in the data dir 1837 func (a *Agent) persistProxy(proxy *local.ManagedProxy, FromFile bool) error { 1838 proxyPath := filepath.Join(a.config.DataDir, proxyDir, 1839 stringHash(proxy.Proxy.ProxyService.ID)) 1840 1841 wrapped := persistedProxy{ 1842 ProxyToken: proxy.ProxyToken, 1843 Proxy: proxy.Proxy, 1844 FromFile: FromFile, 1845 } 1846 encoded, err := json.Marshal(wrapped) 1847 if err != nil { 1848 return err 1849 } 1850 1851 return file.WriteAtomic(proxyPath, encoded) 1852 } 1853 1854 // purgeProxy removes a persisted proxy definition file from the data dir 1855 func (a *Agent) purgeProxy(proxyID string) error { 1856 proxyPath := filepath.Join(a.config.DataDir, proxyDir, stringHash(proxyID)) 1857 if _, err := os.Stat(proxyPath); err == nil { 1858 return os.Remove(proxyPath) 1859 } 1860 return nil 1861 } 1862 1863 // persistCheck saves a check definition to the local agent's state directory 1864 func (a *Agent) persistCheck(check *structs.HealthCheck, chkType *structs.CheckType) error { 1865 checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(check.CheckID)) 1866 1867 // Create the persisted check 1868 wrapped := persistedCheck{ 1869 Check: check, 1870 ChkType: chkType, 1871 Token: a.State.CheckToken(check.CheckID), 1872 } 1873 1874 encoded, err := json.Marshal(wrapped) 1875 if err != nil { 1876 return err 1877 } 1878 1879 return file.WriteAtomic(checkPath, encoded) 1880 } 1881 1882 // purgeCheck removes a persisted check definition file from the data dir 1883 func (a *Agent) purgeCheck(checkID types.CheckID) error { 1884 checkPath := filepath.Join(a.config.DataDir, checksDir, checkIDHash(checkID)) 1885 if _, err := os.Stat(checkPath); err == nil { 1886 return os.Remove(checkPath) 1887 } 1888 return nil 1889 } 1890 1891 // AddService is used to add a service entry. 
// This entry is persistent and the agent will make a best effort to
// ensure it is registered
//
// It takes stateLock for the duration of the call and delegates to
// addServiceLocked.
func (a *Agent) AddService(service *structs.NodeService, chkTypes []*structs.CheckType, persist bool, token string, source configSource) error {
	a.stateLock.Lock()
	defer a.stateLock.Unlock()
	return a.addServiceLocked(service, chkTypes, persist, token, source)
}

// addServiceLocked registers the service and one health check per entry of
// chkTypes in the local state. When persist is set and a data dir is
// configured, the checks and the service are also written to disk.
//
// It returns an error when the service name is empty, when a check type fails
// validation, or when adding or persisting the service or one of its checks
// fails. On a failure after registration has started, the services and checks
// that were not registered before this call are cleaned up again.
//
// NOTE(review): presumably the caller must hold stateLock, as AddService
// does - confirm for other callers.
func (a *Agent) addServiceLocked(service *structs.NodeService, chkTypes []*structs.CheckType, persist bool, token string, source configSource) error {
	if service.Service == "" {
		return fmt.Errorf("Service name missing")
	}
	// The ID defaults to the service name.
	if service.ID == "" && service.Service != "" {
		service.ID = service.Service
	}
	for _, check := range chkTypes {
		if err := check.Validate(); err != nil {
			return fmt.Errorf("Check is not valid: %v", err)
		}
	}

	// Set default weights if not specified. This is important as it ensures AE
	// doesn't consider the service different since it has nil weights.
	if service.Weights == nil {
		service.Weights = &structs.Weights{Passing: 1, Warning: 1}
	}

	// Warn if the service name is incompatible with DNS
	if InvalidDnsRe.MatchString(service.Service) {
		a.logger.Printf("[WARN] agent: Service name %q will not be discoverable "+
			"via DNS due to invalid characters. Valid characters include "+
			"all alpha-numerics and dashes.", service.Service)
	} else if len(service.Service) > MaxDNSLabelLength {
		a.logger.Printf("[WARN] agent: Service name %q will not be discoverable "+
			"via DNS due to it being too long. Valid lengths are between "+
			"1 and 63 bytes.", service.Service)
	}

	// Warn if any tags are incompatible with DNS
	for _, tag := range service.Tags {
		if InvalidDnsRe.MatchString(tag) {
			a.logger.Printf("[DEBUG] agent: Service tag %q will not be discoverable "+
				"via DNS due to invalid characters. Valid characters include "+
				"all alpha-numerics and dashes.", tag)
		} else if len(tag) > MaxDNSLabelLength {
			a.logger.Printf("[DEBUG] agent: Service tag %q will not be discoverable "+
				"via DNS due to it being too long. Valid lengths are between "+
				"1 and 63 bytes.", tag)
		}
	}

	// Pause the service syncs during modification
	a.PauseSync()
	defer a.ResumeSync()

	// Take a snapshot of the current state of checks (if any), and
	// restore them before resuming anti-entropy.
	// (Deferred calls run in reverse order, so the restore runs before
	// ResumeSync.)
	snap := a.snapshotCheckState()
	defer a.restoreCheckState(snap)

	var checks []*structs.HealthCheck

	// Create an associated health check
	for i, chkType := range chkTypes {
		// Without an explicit ID the check is named after the service, with a
		// 1-based suffix when the service has more than one check.
		checkID := string(chkType.CheckID)
		if checkID == "" {
			checkID = fmt.Sprintf("service:%s", service.ID)
			if len(chkTypes) > 1 {
				checkID += fmt.Sprintf(":%d", i+1)
			}
		}
		name := chkType.Name
		if name == "" {
			name = fmt.Sprintf("Service '%s' check", service.Service)
		}
		// Checks start out critical unless the check type sets a status.
		check := &structs.HealthCheck{
			Node:        a.config.NodeName,
			CheckID:     types.CheckID(checkID),
			Name:        name,
			Status:      api.HealthCritical,
			Notes:       chkType.Notes,
			ServiceID:   service.ID,
			ServiceName: service.Service,
			ServiceTags: service.Tags,
		}
		if chkType.Status != "" {
			check.Status = chkType.Status
		}

		checks = append(checks, check)
	}

	// cleanup, store the ids of services and checks that weren't previously
	// registered so we clean them up if something fails halfway through the
	// process.
	var cleanupServices []string
	var cleanupChecks []types.CheckID

	if s := a.State.Service(service.ID); s == nil {
		cleanupServices = append(cleanupServices, service.ID)
	}

	for _, check := range checks {
		if c := a.State.Check(check.CheckID); c == nil {
			cleanupChecks = append(cleanupChecks, check.CheckID)
		}
	}

	err := a.State.AddServiceWithChecks(service, checks, token)
	if err != nil {
		a.cleanupRegistration(cleanupServices, cleanupChecks)
		return err
	}

	// checks[i] was built from chkTypes[i] above, so the two slices line up.
	for i := range checks {
		if err := a.addCheck(checks[i], chkTypes[i], service, persist, token, source); err != nil {
			a.cleanupRegistration(cleanupServices, cleanupChecks)
			return err
		}

		if persist && a.config.DataDir != "" {
			if err := a.persistCheck(checks[i], chkTypes[i]); err != nil {
				a.cleanupRegistration(cleanupServices, cleanupChecks)
				return err

			}
		}
	}

	// Persist the service to a file
	if persist && a.config.DataDir != "" {
		if err := a.persistService(service); err != nil {
			a.cleanupRegistration(cleanupServices, cleanupChecks)
			return err
		}
	}

	return nil
}

// cleanupRegistration is called on registration error to ensure no there are no
// leftovers after a partial failure
func (a *Agent) cleanupRegistration(serviceIDs []string, checksIDs []types.CheckID) {
	for _, s := range serviceIDs {
		if err := a.State.RemoveService(s); err != nil {
			a.logger.Printf("[ERR] consul: service registration: cleanup: failed to remove service %s: %s", s, err)
		}
		if err := a.purgeService(s); err != nil {
			a.logger.Printf("[ERR] consul: service registration: cleanup: failed to purge service %s file: %s", s, err)
		}
	}

	for _, c := range checksIDs {
		a.cancelCheckMonitors(c)
		if err := a.State.RemoveCheck(c); err != nil {
			a.logger.Printf("[ERR] consul: service
registration: cleanup: failed to remove check %s: %s", c, err) 2048 } 2049 if err := a.purgeCheck(c); err != nil { 2050 a.logger.Printf("[ERR] consul: service registration: cleanup: failed to purge check %s file: %s", c, err) 2051 } 2052 } 2053 } 2054 2055 // RemoveService is used to remove a service entry. 2056 // The agent will make a best effort to ensure it is deregistered 2057 func (a *Agent) RemoveService(serviceID string, persist bool) error { 2058 a.stateLock.Lock() 2059 defer a.stateLock.Unlock() 2060 return a.removeServiceLocked(serviceID, persist) 2061 } 2062 2063 // removeServiceLocked is used to remove a service entry. 2064 // The agent will make a best effort to ensure it is deregistered 2065 func (a *Agent) removeServiceLocked(serviceID string, persist bool) error { 2066 // Validate ServiceID 2067 if serviceID == "" { 2068 return fmt.Errorf("ServiceID missing") 2069 } 2070 2071 checks := a.State.Checks() 2072 var checkIDs []types.CheckID 2073 for id, check := range checks { 2074 if check.ServiceID != serviceID { 2075 continue 2076 } 2077 checkIDs = append(checkIDs, id) 2078 } 2079 2080 // Remove the associated managed proxy if it exists 2081 // This has to be DONE before purging configuration as might might have issues 2082 // With ACLs otherwise 2083 for proxyID, p := range a.State.Proxies() { 2084 if p.Proxy.TargetServiceID == serviceID { 2085 if err := a.removeProxyLocked(proxyID, true); err != nil { 2086 return err 2087 } 2088 } 2089 } 2090 2091 // Remove service immediately 2092 if err := a.State.RemoveServiceWithChecks(serviceID, checkIDs); err != nil { 2093 a.logger.Printf("[WARN] agent: Failed to deregister service %q: %s", serviceID, err) 2094 return nil 2095 } 2096 2097 // Remove the service from the data dir 2098 if persist { 2099 if err := a.purgeService(serviceID); err != nil { 2100 return err 2101 } 2102 } 2103 2104 // Deregister any associated health checks 2105 for checkID, check := range checks { 2106 if check.ServiceID != serviceID 
{ 2107 continue 2108 } 2109 if err := a.removeCheckLocked(checkID, persist); err != nil { 2110 return err 2111 } 2112 } 2113 2114 a.logger.Printf("[DEBUG] agent: removed service %q", serviceID) 2115 2116 // If any Sidecar services exist for the removed service ID, remove them too. 2117 if sidecar := a.State.Service(a.sidecarServiceID(serviceID)); sidecar != nil { 2118 // Double check that it's not just an ID collision and we actually added 2119 // this from a sidecar. 2120 if sidecar.LocallyRegisteredAsSidecar { 2121 // Remove it! 2122 err := a.removeServiceLocked(a.sidecarServiceID(serviceID), persist) 2123 if err != nil { 2124 return err 2125 } 2126 } 2127 } 2128 2129 return nil 2130 } 2131 2132 // AddCheck is used to add a health check to the agent. 2133 // This entry is persistent and the agent will make a best effort to 2134 // ensure it is registered. The Check may include a CheckType which 2135 // is used to automatically update the check status 2136 func (a *Agent) AddCheck(check *structs.HealthCheck, chkType *structs.CheckType, persist bool, token string, source configSource) error { 2137 a.stateLock.Lock() 2138 defer a.stateLock.Unlock() 2139 return a.addCheckLocked(check, chkType, persist, token, source) 2140 } 2141 2142 func (a *Agent) addCheckLocked(check *structs.HealthCheck, chkType *structs.CheckType, persist bool, token string, source configSource) error { 2143 var service *structs.NodeService 2144 2145 if check.ServiceID != "" { 2146 service = a.State.Service(check.ServiceID) 2147 if service == nil { 2148 return fmt.Errorf("ServiceID %q does not exist", check.ServiceID) 2149 } 2150 } 2151 2152 // snapshot the current state of the health check to avoid potential flapping 2153 existing := a.State.Check(check.CheckID) 2154 defer func() { 2155 if existing != nil { 2156 a.State.UpdateCheck(check.CheckID, existing.Status, existing.Output) 2157 } 2158 }() 2159 2160 err := a.addCheck(check, chkType, service, persist, token, source) 2161 if err != nil { 
2162 a.State.RemoveCheck(check.CheckID) 2163 return err 2164 } 2165 2166 // Add to the local state for anti-entropy 2167 err = a.State.AddCheck(check, token) 2168 if err != nil { 2169 return err 2170 } 2171 2172 // Persist the check 2173 if persist && a.config.DataDir != "" { 2174 return a.persistCheck(check, chkType) 2175 } 2176 2177 return nil 2178 } 2179 2180 func (a *Agent) addCheck(check *structs.HealthCheck, chkType *structs.CheckType, service *structs.NodeService, persist bool, token string, source configSource) error { 2181 if check.CheckID == "" { 2182 return fmt.Errorf("CheckID missing") 2183 } 2184 2185 if chkType != nil { 2186 if err := chkType.Validate(); err != nil { 2187 return fmt.Errorf("Check is not valid: %v", err) 2188 } 2189 2190 if chkType.IsScript() { 2191 if source == ConfigSourceLocal && !a.config.EnableLocalScriptChecks { 2192 return fmt.Errorf("Scripts are disabled on this agent; to enable, configure 'enable_script_checks' or 'enable_local_script_checks' to true") 2193 } 2194 2195 if source == ConfigSourceRemote && !a.config.EnableRemoteScriptChecks { 2196 return fmt.Errorf("Scripts are disabled on this agent from remote calls; to enable, configure 'enable_script_checks' to true") 2197 } 2198 } 2199 } 2200 2201 if check.ServiceID != "" { 2202 check.ServiceName = service.Service 2203 check.ServiceTags = service.Tags 2204 } 2205 2206 // Check if already registered 2207 if chkType != nil { 2208 switch { 2209 2210 case chkType.IsTTL(): 2211 if existing, ok := a.checkTTLs[check.CheckID]; ok { 2212 existing.Stop() 2213 delete(a.checkTTLs, check.CheckID) 2214 } 2215 2216 ttl := &checks.CheckTTL{ 2217 Notify: a.State, 2218 CheckID: check.CheckID, 2219 TTL: chkType.TTL, 2220 Logger: a.logger, 2221 } 2222 2223 // Restore persisted state, if any 2224 if err := a.loadCheckState(check); err != nil { 2225 a.logger.Printf("[WARN] agent: failed restoring state for check %q: %s", 2226 check.CheckID, err) 2227 } 2228 2229 ttl.Start() 2230 
a.checkTTLs[check.CheckID] = ttl 2231 2232 case chkType.IsHTTP(): 2233 if existing, ok := a.checkHTTPs[check.CheckID]; ok { 2234 existing.Stop() 2235 delete(a.checkHTTPs, check.CheckID) 2236 } 2237 if chkType.Interval < checks.MinInterval { 2238 a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", 2239 check.CheckID, checks.MinInterval)) 2240 chkType.Interval = checks.MinInterval 2241 } 2242 2243 tlsClientConfig := a.tlsConfigurator.OutgoingTLSConfigForCheck(chkType.TLSSkipVerify) 2244 2245 http := &checks.CheckHTTP{ 2246 Notify: a.State, 2247 CheckID: check.CheckID, 2248 HTTP: chkType.HTTP, 2249 Header: chkType.Header, 2250 Method: chkType.Method, 2251 Interval: chkType.Interval, 2252 Timeout: chkType.Timeout, 2253 Logger: a.logger, 2254 TLSClientConfig: tlsClientConfig, 2255 } 2256 http.Start() 2257 a.checkHTTPs[check.CheckID] = http 2258 2259 case chkType.IsTCP(): 2260 if existing, ok := a.checkTCPs[check.CheckID]; ok { 2261 existing.Stop() 2262 delete(a.checkTCPs, check.CheckID) 2263 } 2264 if chkType.Interval < checks.MinInterval { 2265 a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", 2266 check.CheckID, checks.MinInterval)) 2267 chkType.Interval = checks.MinInterval 2268 } 2269 2270 tcp := &checks.CheckTCP{ 2271 Notify: a.State, 2272 CheckID: check.CheckID, 2273 TCP: chkType.TCP, 2274 Interval: chkType.Interval, 2275 Timeout: chkType.Timeout, 2276 Logger: a.logger, 2277 } 2278 tcp.Start() 2279 a.checkTCPs[check.CheckID] = tcp 2280 2281 case chkType.IsGRPC(): 2282 if existing, ok := a.checkGRPCs[check.CheckID]; ok { 2283 existing.Stop() 2284 delete(a.checkGRPCs, check.CheckID) 2285 } 2286 if chkType.Interval < checks.MinInterval { 2287 a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", 2288 check.CheckID, checks.MinInterval)) 2289 chkType.Interval = checks.MinInterval 2290 } 2291 2292 var tlsClientConfig *tls.Config 2293 if chkType.GRPCUseTLS { 
2294 tlsClientConfig = a.tlsConfigurator.OutgoingTLSConfigForCheck(chkType.TLSSkipVerify) 2295 } 2296 2297 grpc := &checks.CheckGRPC{ 2298 Notify: a.State, 2299 CheckID: check.CheckID, 2300 GRPC: chkType.GRPC, 2301 Interval: chkType.Interval, 2302 Timeout: chkType.Timeout, 2303 Logger: a.logger, 2304 TLSClientConfig: tlsClientConfig, 2305 } 2306 grpc.Start() 2307 a.checkGRPCs[check.CheckID] = grpc 2308 2309 case chkType.IsDocker(): 2310 if existing, ok := a.checkDockers[check.CheckID]; ok { 2311 existing.Stop() 2312 delete(a.checkDockers, check.CheckID) 2313 } 2314 if chkType.Interval < checks.MinInterval { 2315 a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has interval below minimum of %v", 2316 check.CheckID, checks.MinInterval)) 2317 chkType.Interval = checks.MinInterval 2318 } 2319 2320 if a.dockerClient == nil { 2321 dc, err := checks.NewDockerClient(os.Getenv("DOCKER_HOST"), checks.BufSize) 2322 if err != nil { 2323 a.logger.Printf("[ERR] agent: error creating docker client: %s", err) 2324 return err 2325 } 2326 a.logger.Printf("[DEBUG] agent: created docker client for %s", dc.Host()) 2327 a.dockerClient = dc 2328 } 2329 2330 dockerCheck := &checks.CheckDocker{ 2331 Notify: a.State, 2332 CheckID: check.CheckID, 2333 DockerContainerID: chkType.DockerContainerID, 2334 Shell: chkType.Shell, 2335 ScriptArgs: chkType.ScriptArgs, 2336 Interval: chkType.Interval, 2337 Logger: a.logger, 2338 Client: a.dockerClient, 2339 } 2340 if prev := a.checkDockers[check.CheckID]; prev != nil { 2341 prev.Stop() 2342 } 2343 dockerCheck.Start() 2344 a.checkDockers[check.CheckID] = dockerCheck 2345 2346 case chkType.IsMonitor(): 2347 if existing, ok := a.checkMonitors[check.CheckID]; ok { 2348 existing.Stop() 2349 delete(a.checkMonitors, check.CheckID) 2350 } 2351 if chkType.Interval < checks.MinInterval { 2352 a.logger.Printf("[WARN] agent: check '%s' has interval below minimum of %v", 2353 check.CheckID, checks.MinInterval) 2354 chkType.Interval = checks.MinInterval 2355 
} 2356 2357 monitor := &checks.CheckMonitor{ 2358 Notify: a.State, 2359 CheckID: check.CheckID, 2360 ScriptArgs: chkType.ScriptArgs, 2361 Interval: chkType.Interval, 2362 Timeout: chkType.Timeout, 2363 Logger: a.logger, 2364 } 2365 monitor.Start() 2366 a.checkMonitors[check.CheckID] = monitor 2367 2368 case chkType.IsAlias(): 2369 if existing, ok := a.checkAliases[check.CheckID]; ok { 2370 existing.Stop() 2371 delete(a.checkAliases, check.CheckID) 2372 } 2373 2374 var rpcReq structs.NodeSpecificRequest 2375 rpcReq.Datacenter = a.config.Datacenter 2376 2377 // The token to set is really important. The behavior below follows 2378 // the same behavior as anti-entropy: we use the user-specified token 2379 // if set (either on the service or check definition), otherwise 2380 // we use the "UserToken" on the agent. This is tested. 2381 rpcReq.Token = a.tokens.UserToken() 2382 if token != "" { 2383 rpcReq.Token = token 2384 } 2385 2386 chkImpl := &checks.CheckAlias{ 2387 Notify: a.State, 2388 RPC: a.delegate, 2389 RPCReq: rpcReq, 2390 CheckID: check.CheckID, 2391 Node: chkType.AliasNode, 2392 ServiceID: chkType.AliasService, 2393 } 2394 chkImpl.Start() 2395 a.checkAliases[check.CheckID] = chkImpl 2396 2397 default: 2398 return fmt.Errorf("Check type is not valid") 2399 } 2400 2401 if chkType.DeregisterCriticalServiceAfter > 0 { 2402 timeout := chkType.DeregisterCriticalServiceAfter 2403 if timeout < a.config.CheckDeregisterIntervalMin { 2404 timeout = a.config.CheckDeregisterIntervalMin 2405 a.logger.Println(fmt.Sprintf("[WARN] agent: check '%s' has deregister interval below minimum of %v", 2406 check.CheckID, a.config.CheckDeregisterIntervalMin)) 2407 } 2408 a.checkReapAfter[check.CheckID] = timeout 2409 } else { 2410 delete(a.checkReapAfter, check.CheckID) 2411 } 2412 } 2413 2414 return nil 2415 } 2416 2417 // RemoveCheck is used to remove a health check. 
2418 // The agent will make a best effort to ensure it is deregistered 2419 func (a *Agent) RemoveCheck(checkID types.CheckID, persist bool) error { 2420 a.stateLock.Lock() 2421 defer a.stateLock.Unlock() 2422 return a.removeCheckLocked(checkID, persist) 2423 } 2424 2425 // removeCheckLocked is used to remove a health check. 2426 // The agent will make a best effort to ensure it is deregistered 2427 func (a *Agent) removeCheckLocked(checkID types.CheckID, persist bool) error { 2428 // Validate CheckID 2429 if checkID == "" { 2430 return fmt.Errorf("CheckID missing") 2431 } 2432 2433 a.cancelCheckMonitors(checkID) 2434 a.State.RemoveCheck(checkID) 2435 2436 if persist { 2437 if err := a.purgeCheck(checkID); err != nil { 2438 return err 2439 } 2440 if err := a.purgeCheckState(checkID); err != nil { 2441 return err 2442 } 2443 } 2444 a.logger.Printf("[DEBUG] agent: removed check %q", checkID) 2445 return nil 2446 } 2447 2448 // addProxyLocked adds a new local Connect Proxy instance to be managed by the agent. 2449 // 2450 // This assumes that the agent's proxyLock is already held 2451 // 2452 // It REQUIRES that the service that is being proxied is already present in the 2453 // local state. Note that this is only used for agent-managed proxies so we can 2454 // ensure that we always make this true. For externally managed and registered 2455 // proxies we explicitly allow the proxy to be registered first to make 2456 // bootstrap ordering of a new service simpler but the same is not true here 2457 // since this is only ever called when setting up a _managed_ proxy which was 2458 // registered as part of a service registration either from config or HTTP API 2459 // call. 2460 // 2461 // The restoredProxyToken argument should only be used when restoring proxy 2462 // definitions from disk; new proxies must leave it blank to get a new token 2463 // assigned. 
We need to restore from disk to enable to continue authenticating 2464 // running proxies that already had that credential injected. 2465 func (a *Agent) addProxyLocked(proxy *structs.ConnectManagedProxy, persist, FromFile bool, 2466 restoredProxyToken string, source configSource) error { 2467 // Lookup the target service token in state if there is one. 2468 token := a.State.ServiceToken(proxy.TargetServiceID) 2469 2470 // Copy the basic proxy structure so it isn't modified w/ defaults 2471 proxyCopy := *proxy 2472 proxy = &proxyCopy 2473 if err := a.applyProxyDefaults(proxy); err != nil { 2474 return err 2475 } 2476 2477 // Add the proxy to local state first since we may need to assign a port which 2478 // needs to be coordinate under state lock. AddProxy will generate the 2479 // NodeService for the proxy populated with the allocated (or configured) port 2480 // and an ID, but it doesn't add it to the agent directly since that could 2481 // deadlock and we may need to coordinate adding it and persisting etc. 2482 proxyState, err := a.State.AddProxy(proxy, token, restoredProxyToken) 2483 if err != nil { 2484 return err 2485 } 2486 proxyService := proxyState.Proxy.ProxyService 2487 2488 // Register proxy TCP check. The built in proxy doesn't listen publically 2489 // until it's loaded certs so this ensures we won't route traffic until it's 2490 // ready. 
2491 proxyCfg, err := a.applyProxyConfigDefaults(proxyState.Proxy) 2492 if err != nil { 2493 return err 2494 } 2495 chkAddr := a.resolveProxyCheckAddress(proxyCfg) 2496 chkTypes := []*structs.CheckType{} 2497 if chkAddr != "" { 2498 chkTypes = []*structs.CheckType{ 2499 &structs.CheckType{ 2500 Name: "Connect Proxy Listening", 2501 TCP: fmt.Sprintf("%s:%d", chkAddr, 2502 proxyCfg["bind_port"]), 2503 Interval: 10 * time.Second, 2504 }, 2505 } 2506 } 2507 2508 err = a.addServiceLocked(proxyService, chkTypes, persist, token, source) 2509 if err != nil { 2510 // Remove the state too 2511 a.State.RemoveProxy(proxyService.ID) 2512 return err 2513 } 2514 2515 // Persist the proxy 2516 if persist && a.config.DataDir != "" { 2517 return a.persistProxy(proxyState, FromFile) 2518 } 2519 return nil 2520 } 2521 2522 // AddProxy adds a new local Connect Proxy instance to be managed by the agent. 2523 // 2524 // It REQUIRES that the service that is being proxied is already present in the 2525 // local state. Note that this is only used for agent-managed proxies so we can 2526 // ensure that we always make this true. For externally managed and registered 2527 // proxies we explicitly allow the proxy to be registered first to make 2528 // bootstrap ordering of a new service simpler but the same is not true here 2529 // since this is only ever called when setting up a _managed_ proxy which was 2530 // registered as part of a service registration either from config or HTTP API 2531 // call. 2532 // 2533 // The restoredProxyToken argument should only be used when restoring proxy 2534 // definitions from disk; new proxies must leave it blank to get a new token 2535 // assigned. We need to restore from disk to enable to continue authenticating 2536 // running proxies that already had that credential injected. 
func (a *Agent) AddProxy(proxy *structs.ConnectManagedProxy, persist, FromFile bool,
	restoredProxyToken string, source configSource) error {
	// Hold stateLock for the whole registration; addProxyLocked does the
	// actual work and is written to run with the lock already held.
	a.stateLock.Lock()
	defer a.stateLock.Unlock()
	return a.addProxyLocked(proxy, persist, FromFile, restoredProxyToken, source)
}

// resolveProxyCheckAddress returns the best address to use for a TCP check of
// the proxy's public listener. It expects the input to already have default
// values populated by applyProxyConfigDefaults. It may return an empty string
// indicating that the TCP check should not be created at all.
//
// By default this uses the proxy's bind address which in turn defaults to the
// agent's bind address. If the proxy bind address ends up being 0.0.0.0 we have
// to assume the agent can dial it over loopback which is usually true.
//
// In some topologies such as proxy being in a different container, the IP the
// agent used to dial proxy over a local bridge might not be the same as the
// container's public routable IP address so we allow a manual override of the
// check address in config "tcp_check_address" too.
//
// Finally the TCP check can be disabled by another manual override
// "disable_tcp_check" in cases where the agent will never be able to dial the
// proxy directly for some reason.
2561 func (a *Agent) resolveProxyCheckAddress(proxyCfg map[string]interface{}) string { 2562 // If user disabled the check return empty string 2563 if disable, ok := proxyCfg["disable_tcp_check"].(bool); ok && disable { 2564 return "" 2565 } 2566 2567 // If user specified a custom one, use that 2568 if chkAddr, ok := proxyCfg["tcp_check_address"].(string); ok && chkAddr != "" { 2569 return chkAddr 2570 } 2571 2572 // If we have a bind address and its diallable, use that 2573 if bindAddr, ok := proxyCfg["bind_address"].(string); ok && 2574 bindAddr != "" && bindAddr != "0.0.0.0" && bindAddr != "[::]" { 2575 return bindAddr 2576 } 2577 2578 // Default to localhost 2579 return "127.0.0.1" 2580 } 2581 2582 // applyProxyConfigDefaults takes a *structs.ConnectManagedProxy and returns 2583 // it's Config map merged with any defaults from the Agent's config. It would be 2584 // nicer if this were defined as a method on structs.ConnectManagedProxy but we 2585 // can't do that because ot the import cycle it causes with agent/config. 2586 func (a *Agent) applyProxyConfigDefaults(p *structs.ConnectManagedProxy) (map[string]interface{}, error) { 2587 if p == nil || p.ProxyService == nil { 2588 // Should never happen but protect from panic 2589 return nil, fmt.Errorf("invalid proxy state") 2590 } 2591 2592 // Lookup the target service 2593 target := a.State.Service(p.TargetServiceID) 2594 if target == nil { 2595 // Can happen during deregistration race between proxy and scheduler. 2596 return nil, fmt.Errorf("unknown target service ID: %s", p.TargetServiceID) 2597 } 2598 2599 // Merge globals defaults 2600 config := make(map[string]interface{}) 2601 for k, v := range a.config.ConnectProxyDefaultConfig { 2602 if _, ok := config[k]; !ok { 2603 config[k] = v 2604 } 2605 } 2606 2607 // Copy config from the proxy 2608 for k, v := range p.Config { 2609 config[k] = v 2610 } 2611 2612 // Set defaults for anything that is still not specified but required. 
2613 // Note that these are not included in the content hash. Since we expect 2614 // them to be static in general but some like the default target service 2615 // port might not be. In that edge case services can set that explicitly 2616 // when they re-register which will be caught though. 2617 if _, ok := config["bind_port"]; !ok { 2618 config["bind_port"] = p.ProxyService.Port 2619 } 2620 if _, ok := config["bind_address"]; !ok { 2621 // Default to binding to the same address the agent is configured to 2622 // bind to. 2623 config["bind_address"] = a.config.BindAddr.String() 2624 } 2625 if _, ok := config["local_service_address"]; !ok { 2626 // Default to localhost and the port the service registered with 2627 config["local_service_address"] = fmt.Sprintf("127.0.0.1:%d", target.Port) 2628 } 2629 2630 // Basic type conversions for expected types. 2631 if raw, ok := config["bind_port"]; ok { 2632 switch v := raw.(type) { 2633 case float64: 2634 // Common since HCL/JSON parse as float64 2635 config["bind_port"] = int(v) 2636 2637 // NOTE(mitchellh): No default case since errors and validation 2638 // are handled by the ServiceDefinition.Validate function. 2639 } 2640 } 2641 2642 return config, nil 2643 } 2644 2645 // applyProxyDefaults modifies the given proxy by applying any configured 2646 // defaults, such as the default execution mode, command, etc. 
2647 func (a *Agent) applyProxyDefaults(proxy *structs.ConnectManagedProxy) error { 2648 // Set the default exec mode 2649 if proxy.ExecMode == structs.ProxyExecModeUnspecified { 2650 mode, err := structs.NewProxyExecMode(a.config.ConnectProxyDefaultExecMode) 2651 if err != nil { 2652 return err 2653 } 2654 2655 proxy.ExecMode = mode 2656 } 2657 if proxy.ExecMode == structs.ProxyExecModeUnspecified { 2658 proxy.ExecMode = structs.ProxyExecModeDaemon 2659 } 2660 2661 // Set the default command to the globally configured default 2662 if len(proxy.Command) == 0 { 2663 switch proxy.ExecMode { 2664 case structs.ProxyExecModeDaemon: 2665 proxy.Command = a.config.ConnectProxyDefaultDaemonCommand 2666 2667 case structs.ProxyExecModeScript: 2668 proxy.Command = a.config.ConnectProxyDefaultScriptCommand 2669 } 2670 } 2671 2672 // If there is no globally configured default we need to get the 2673 // default command so we can do "consul connect proxy" 2674 if len(proxy.Command) == 0 { 2675 command, err := defaultProxyCommand(a.config) 2676 if err != nil { 2677 return err 2678 } 2679 2680 proxy.Command = command 2681 } 2682 2683 return nil 2684 } 2685 2686 // removeProxyLocked stops and removes a local proxy instance. 2687 // 2688 // It is assumed that this function is called while holding the proxyLock already 2689 func (a *Agent) removeProxyLocked(proxyID string, persist bool) error { 2690 // Validate proxyID 2691 if proxyID == "" { 2692 return fmt.Errorf("proxyID missing") 2693 } 2694 2695 // Remove the proxy from the local state 2696 p, err := a.State.RemoveProxy(proxyID) 2697 if err != nil { 2698 return err 2699 } 2700 2701 // Remove the proxy service as well. The proxy ID is also the ID 2702 // of the servie, but we might as well use the service pointer. 
2703 if err := a.removeServiceLocked(p.Proxy.ProxyService.ID, persist); err != nil { 2704 return err 2705 } 2706 2707 if persist && a.config.DataDir != "" { 2708 return a.purgeProxy(proxyID) 2709 } 2710 2711 return nil 2712 } 2713 2714 // RemoveProxy stops and removes a local proxy instance. 2715 func (a *Agent) RemoveProxy(proxyID string, persist bool) error { 2716 a.stateLock.Lock() 2717 defer a.stateLock.Unlock() 2718 return a.removeProxyLocked(proxyID, persist) 2719 } 2720 2721 // verifyProxyToken takes a token and attempts to verify it against the 2722 // targetService name. If targetProxy is specified, then the local proxy token 2723 // must exactly match the given proxy ID. cert, config, etc.). 2724 // 2725 // The given token may be a local-only proxy token or it may be an ACL token. We 2726 // will attempt to verify the local proxy token first. 2727 // 2728 // The effective ACL token is returned along with a boolean which is true if the 2729 // match was against a proxy token rather than an ACL token, and any error. In 2730 // the case the token matches a proxy token, then the ACL token used to register 2731 // that proxy's target service is returned for use in any RPC calls the proxy 2732 // needs to make on behalf of that service. If the token was an ACL token 2733 // already then it is always returned. Provided error is nil, a valid ACL token 2734 // is always returned. 2735 func (a *Agent) verifyProxyToken(token, targetService, 2736 targetProxy string) (string, bool, error) { 2737 // If we specify a target proxy, we look up that proxy directly. Otherwise, 2738 // we resolve with any proxy we can find. 2739 var proxy *local.ManagedProxy 2740 if targetProxy != "" { 2741 proxy = a.State.Proxy(targetProxy) 2742 if proxy == nil { 2743 return "", false, fmt.Errorf("unknown proxy service ID: %q", targetProxy) 2744 } 2745 2746 // If the token DOESN'T match, then we reset the proxy which will 2747 // cause the logic below to fall back to normal ACLs. 
Otherwise, 2748 // we keep the proxy set because we also have to verify that the 2749 // target service matches on the proxy. 2750 if token != proxy.ProxyToken { 2751 proxy = nil 2752 } 2753 } else { 2754 proxy = a.resolveProxyToken(token) 2755 } 2756 2757 // The existence of a token isn't enough, we also need to verify 2758 // that the service name of the matching proxy matches our target 2759 // service. 2760 if proxy != nil { 2761 // Get the target service since we only have the name. The nil 2762 // check below should never be true since a proxy token always 2763 // represents the existence of a local service. 2764 target := a.State.Service(proxy.Proxy.TargetServiceID) 2765 if target == nil { 2766 return "", false, fmt.Errorf("proxy target service not found: %q", 2767 proxy.Proxy.TargetServiceID) 2768 } 2769 2770 if target.Service != targetService { 2771 return "", false, acl.ErrPermissionDenied 2772 } 2773 2774 // Resolve the actual ACL token used to register the proxy/service and 2775 // return that for use in RPC calls. 2776 return a.State.ServiceToken(proxy.Proxy.TargetServiceID), true, nil 2777 } 2778 2779 // Doesn't match, we have to do a full token resolution. The required 2780 // permission for any proxy-related endpoint is service:write, since 2781 // to register a proxy you require that permission and sensitive data 2782 // is usually present in the configuration. 
2783 rule, err := a.resolveToken(token) 2784 if err != nil { 2785 return "", false, err 2786 } 2787 if rule != nil && !rule.ServiceWrite(targetService, nil) { 2788 return "", false, acl.ErrPermissionDenied 2789 } 2790 2791 return token, false, nil 2792 }

// cancelCheckMonitors stops whatever runner is registered for checkID and
// removes it from the agent's per-kind bookkeeping maps. The checkReapAfter
// entry for the ID is dropped as well. The function takes no lock itself.
func (a *Agent) cancelCheckMonitors(checkID types.CheckID) {
	delete(a.checkReapAfter, checkID)

	// The ID is looked up in every per-kind map; wherever a runner is found
	// it is stopped first and then forgotten.
	if mon, found := a.checkMonitors[checkID]; found {
		mon.Stop()
		delete(a.checkMonitors, checkID)
	}
	if httpCheck, found := a.checkHTTPs[checkID]; found {
		httpCheck.Stop()
		delete(a.checkHTTPs, checkID)
	}
	if tcpCheck, found := a.checkTCPs[checkID]; found {
		tcpCheck.Stop()
		delete(a.checkTCPs, checkID)
	}
	if grpcCheck, found := a.checkGRPCs[checkID]; found {
		grpcCheck.Stop()
		delete(a.checkGRPCs, checkID)
	}
	if ttlCheck, found := a.checkTTLs[checkID]; found {
		ttlCheck.Stop()
		delete(a.checkTTLs, checkID)
	}
	if dockerCheck, found := a.checkDockers[checkID]; found {
		dockerCheck.Stop()
		delete(a.checkDockers, checkID)
	}
}

// updateTTLCheck applies a new status and output to the TTL check registered
// under checkID while holding stateLock. It returns an error when no TTL
// check exists for the ID or when the new state cannot be persisted.
func (a *Agent) updateTTLCheck(checkID types.CheckID, status, output string) error {
	a.stateLock.Lock()
	defer a.stateLock.Unlock()

	ttl, found := a.checkTTLs[checkID]
	if !found {
		return fmt.Errorf("CheckID %q does not have associated TTL", checkID)
	}

	// Going through the CheckTTL (rather than the state directly) is what
	// resets the TTL.
	ttl.SetStatus(status, output)

	// Only touch the disk when a data dir is configured; dev mode has none.
	// Persisting lets the check come back in a good state after a restart,
	// which matters most for long TTL values.
	if a.config.DataDir != "" {
		if err := a.persistCheckState(ttl, status, output); err != nil {
			return fmt.Errorf("failed persisting state for check %q: %s", checkID, err)
		}
	}

	return nil
}

// persistCheckState writes the given status and output of a TTL check, along
// with an expiry of now+TTL, as JSON into the check state directory under the
// data dir so it can be restored on a later agent start.
func (a *Agent) persistCheckState(check *checks.CheckTTL, status, output string) error {
	encoded, err := json.Marshal(persistedCheckState{
		CheckID: check.CheckID,
		Status:  status,
		Output:  output,
		Expires: time.Now().Add(check.TTL).Unix(),
	})
	if err != nil {
		return err
	}

	// Make sure the destination directory is there.
	stateDir := filepath.Join(a.config.DataDir, checkStateDir)
	if err := os.MkdirAll(stateDir, 0700); err != nil {
		return fmt.Errorf("failed creating check state dir %q: %s", stateDir, err)
	}

	// Write to a sibling temp file and rename it over the target so a reader
	// is unlikely to see a half-written file. This path runs frequently, so
	// writeFileAtomic is deliberately not used here to avoid an fsync.
	target := filepath.Join(stateDir, checkIDHash(check.CheckID))
	scratch := target + ".tmp"
	if err := ioutil.WriteFile(scratch, encoded, 0600); err != nil {
		return fmt.Errorf("failed writing temp file %q: %s", scratch, err)
	}
	if err := os.Rename(scratch, target); err != nil {
		return fmt.Errorf("failed to rename temp file from %q to %q: %s", scratch, target, err)
	}

	return nil
}

// loadCheckState is used to restore the persisted state of a check.
2893 func (a *Agent) loadCheckState(check *structs.HealthCheck) error { 2894 // Try to read the persisted state for this check 2895 file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(check.CheckID)) 2896 buf, err := ioutil.ReadFile(file) 2897 if err != nil { 2898 if os.IsNotExist(err) { 2899 return nil 2900 } 2901 return fmt.Errorf("failed reading file %q: %s", file, err) 2902 } 2903 2904 // Decode the state data 2905 var p persistedCheckState 2906 if err := json.Unmarshal(buf, &p); err != nil { 2907 a.logger.Printf("[ERR] agent: failed decoding check state: %s", err) 2908 return a.purgeCheckState(check.CheckID) 2909 } 2910 2911 // Check if the state has expired 2912 if time.Now().Unix() >= p.Expires { 2913 a.logger.Printf("[DEBUG] agent: check state expired for %q, not restoring", check.CheckID) 2914 return a.purgeCheckState(check.CheckID) 2915 } 2916 2917 // Restore the fields from the state 2918 check.Output = p.Output 2919 check.Status = p.Status 2920 return nil 2921 } 2922 2923 // purgeCheckState is used to purge the state of a check from the data dir 2924 func (a *Agent) purgeCheckState(checkID types.CheckID) error { 2925 file := filepath.Join(a.config.DataDir, checkStateDir, checkIDHash(checkID)) 2926 err := os.Remove(file) 2927 if os.IsNotExist(err) { 2928 return nil 2929 } 2930 return err 2931 } 2932 2933 func (a *Agent) GossipEncrypted() bool { 2934 return a.delegate.Encrypted() 2935 } 2936 2937 // Stats is used to get various debugging state from the sub-systems 2938 func (a *Agent) Stats() map[string]map[string]string { 2939 stats := a.delegate.Stats() 2940 stats["agent"] = map[string]string{ 2941 "check_monitors": strconv.Itoa(len(a.checkMonitors)), 2942 "check_ttls": strconv.Itoa(len(a.checkTTLs)), 2943 } 2944 for k, v := range a.State.Stats() { 2945 stats["agent"][k] = v 2946 } 2947 2948 revision := a.config.Revision 2949 if len(revision) > 8 { 2950 revision = revision[:8] 2951 } 2952 stats["build"] = map[string]string{ 2953 
"revision": revision, 2954 "version": a.config.Version, 2955 "prerelease": a.config.VersionPrerelease, 2956 } 2957 return stats 2958 } 2959 2960 // storePid is used to write out our PID to a file if necessary 2961 func (a *Agent) storePid() error { 2962 // Quit fast if no pidfile 2963 pidPath := a.config.PidFile 2964 if pidPath == "" { 2965 return nil 2966 } 2967 2968 // Open the PID file 2969 pidFile, err := os.OpenFile(pidPath, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0666) 2970 if err != nil { 2971 return fmt.Errorf("Could not open pid file: %v", err) 2972 } 2973 defer pidFile.Close() 2974 2975 // Write out the PID 2976 pid := os.Getpid() 2977 _, err = pidFile.WriteString(fmt.Sprintf("%d", pid)) 2978 if err != nil { 2979 return fmt.Errorf("Could not write to pid file: %s", err) 2980 } 2981 return nil 2982 } 2983 2984 // deletePid is used to delete our PID on exit 2985 func (a *Agent) deletePid() error { 2986 // Quit fast if no pidfile 2987 pidPath := a.config.PidFile 2988 if pidPath == "" { 2989 return nil 2990 } 2991 2992 stat, err := os.Stat(pidPath) 2993 if err != nil { 2994 return fmt.Errorf("Could not remove pid file: %s", err) 2995 } 2996 2997 if stat.IsDir() { 2998 return fmt.Errorf("Specified pid file path is directory") 2999 } 3000 3001 err = os.Remove(pidPath) 3002 if err != nil { 3003 return fmt.Errorf("Could not remove pid file: %s", err) 3004 } 3005 return nil 3006 } 3007 3008 // loadServices will load service definitions from configuration and persisted 3009 // definitions on disk, and load them into the local agent. 
3010 func (a *Agent) loadServices(conf *config.RuntimeConfig) error { 3011 // Register the services from config 3012 for _, service := range conf.Services { 3013 ns := service.NodeService() 3014 chkTypes, err := service.CheckTypes() 3015 if err != nil { 3016 return fmt.Errorf("Failed to validate checks for service %q: %v", service.Name, err) 3017 } 3018 3019 // Grab and validate sidecar if there is one too 3020 sidecar, sidecarChecks, sidecarToken, err := a.sidecarServiceFromNodeService(ns, service.Token) 3021 if err != nil { 3022 return fmt.Errorf("Failed to validate sidecar for service %q: %v", service.Name, err) 3023 } 3024 3025 // Remove sidecar from NodeService now it's done it's job it's just a config 3026 // syntax sugar and shouldn't be persisted in local or server state. 3027 ns.Connect.SidecarService = nil 3028 3029 if err := a.addServiceLocked(ns, chkTypes, false, service.Token, ConfigSourceLocal); err != nil { 3030 return fmt.Errorf("Failed to register service %q: %v", service.Name, err) 3031 } 3032 3033 // If there is a sidecar service, register that too. 
3034 if sidecar != nil { 3035 if err := a.addServiceLocked(sidecar, sidecarChecks, false, sidecarToken, ConfigSourceLocal); err != nil { 3036 return fmt.Errorf("Failed to register sidecar for service %q: %v", service.Name, err) 3037 } 3038 } 3039 } 3040 3041 // Load any persisted services 3042 svcDir := filepath.Join(a.config.DataDir, servicesDir) 3043 files, err := ioutil.ReadDir(svcDir) 3044 if err != nil { 3045 if os.IsNotExist(err) { 3046 return nil 3047 } 3048 return fmt.Errorf("Failed reading services dir %q: %s", svcDir, err) 3049 } 3050 for _, fi := range files { 3051 // Skip all dirs 3052 if fi.IsDir() { 3053 continue 3054 } 3055 3056 // Skip all partially written temporary files 3057 if strings.HasSuffix(fi.Name(), "tmp") { 3058 a.logger.Printf("[WARN] agent: Ignoring temporary service file %v", fi.Name()) 3059 continue 3060 } 3061 3062 // Open the file for reading 3063 file := filepath.Join(svcDir, fi.Name()) 3064 fh, err := os.Open(file) 3065 if err != nil { 3066 return fmt.Errorf("failed opening service file %q: %s", file, err) 3067 } 3068 3069 // Read the contents into a buffer 3070 buf, err := ioutil.ReadAll(fh) 3071 fh.Close() 3072 if err != nil { 3073 return fmt.Errorf("failed reading service file %q: %s", file, err) 3074 } 3075 3076 // Try decoding the service definition 3077 var p persistedService 3078 if err := json.Unmarshal(buf, &p); err != nil { 3079 // Backwards-compatibility for pre-0.5.1 persisted services 3080 if err := json.Unmarshal(buf, &p.Service); err != nil { 3081 a.logger.Printf("[ERR] agent: Failed decoding service file %q: %s", file, err) 3082 continue 3083 } 3084 } 3085 serviceID := p.Service.ID 3086 3087 if a.State.Service(serviceID) != nil { 3088 // Purge previously persisted service. This allows config to be 3089 // preferred over services persisted from the API. 
3090 a.logger.Printf("[DEBUG] agent: service %q exists, not restoring from %q", 3091 serviceID, file) 3092 if err := a.purgeService(serviceID); err != nil { 3093 return fmt.Errorf("failed purging service %q: %s", serviceID, err) 3094 } 3095 } else { 3096 a.logger.Printf("[DEBUG] agent: restored service definition %q from %q", 3097 serviceID, file) 3098 if err := a.addServiceLocked(p.Service, nil, false, p.Token, ConfigSourceLocal); err != nil { 3099 return fmt.Errorf("failed adding service %q: %s", serviceID, err) 3100 } 3101 } 3102 } 3103 3104 return nil 3105 } 3106 3107 // unloadServices will deregister all services. 3108 func (a *Agent) unloadServices() error { 3109 for id := range a.State.Services() { 3110 if err := a.removeServiceLocked(id, false); err != nil { 3111 return fmt.Errorf("Failed deregistering service '%s': %v", id, err) 3112 } 3113 } 3114 return nil 3115 } 3116 3117 // loadChecks loads check definitions and/or persisted check definitions from 3118 // disk and re-registers them with the local agent. 
3119 func (a *Agent) loadChecks(conf *config.RuntimeConfig) error { 3120 // Register the checks from config 3121 for _, check := range conf.Checks { 3122 health := check.HealthCheck(conf.NodeName) 3123 chkType := check.CheckType() 3124 if err := a.addCheckLocked(health, chkType, false, check.Token, ConfigSourceLocal); err != nil { 3125 return fmt.Errorf("Failed to register check '%s': %v %v", check.Name, err, check) 3126 } 3127 } 3128 3129 // Load any persisted checks 3130 checkDir := filepath.Join(a.config.DataDir, checksDir) 3131 files, err := ioutil.ReadDir(checkDir) 3132 if err != nil { 3133 if os.IsNotExist(err) { 3134 return nil 3135 } 3136 return fmt.Errorf("Failed reading checks dir %q: %s", checkDir, err) 3137 } 3138 for _, fi := range files { 3139 // Ignore dirs - we only care about the check definition files 3140 if fi.IsDir() { 3141 continue 3142 } 3143 3144 // Open the file for reading 3145 file := filepath.Join(checkDir, fi.Name()) 3146 fh, err := os.Open(file) 3147 if err != nil { 3148 return fmt.Errorf("Failed opening check file %q: %s", file, err) 3149 } 3150 3151 // Read the contents into a buffer 3152 buf, err := ioutil.ReadAll(fh) 3153 fh.Close() 3154 if err != nil { 3155 return fmt.Errorf("failed reading check file %q: %s", file, err) 3156 } 3157 3158 // Decode the check 3159 var p persistedCheck 3160 if err := json.Unmarshal(buf, &p); err != nil { 3161 a.logger.Printf("[ERR] agent: Failed decoding check file %q: %s", file, err) 3162 continue 3163 } 3164 checkID := p.Check.CheckID 3165 3166 if a.State.Check(checkID) != nil { 3167 // Purge previously persisted check. This allows config to be 3168 // preferred over persisted checks from the API. 
3169 a.logger.Printf("[DEBUG] agent: check %q exists, not restoring from %q", 3170 checkID, file) 3171 if err := a.purgeCheck(checkID); err != nil { 3172 return fmt.Errorf("Failed purging check %q: %s", checkID, err) 3173 } 3174 } else { 3175 // Default check to critical to avoid placing potentially unhealthy 3176 // services into the active pool 3177 p.Check.Status = api.HealthCritical 3178 3179 if err := a.addCheckLocked(p.Check, p.ChkType, false, p.Token, ConfigSourceLocal); err != nil { 3180 // Purge the check if it is unable to be restored. 3181 a.logger.Printf("[WARN] agent: Failed to restore check %q: %s", 3182 checkID, err) 3183 if err := a.purgeCheck(checkID); err != nil { 3184 return fmt.Errorf("Failed purging check %q: %s", checkID, err) 3185 } 3186 } 3187 a.logger.Printf("[DEBUG] agent: restored health check %q from %q", 3188 p.Check.CheckID, file) 3189 } 3190 } 3191 3192 return nil 3193 } 3194 3195 // unloadChecks will deregister all checks known to the local agent. 3196 func (a *Agent) unloadChecks() error { 3197 for id := range a.State.Checks() { 3198 if err := a.removeCheckLocked(id, false); err != nil { 3199 return fmt.Errorf("Failed deregistering check '%s': %s", id, err) 3200 } 3201 } 3202 return nil 3203 } 3204 3205 // loadPersistedProxies will load connect proxy definitions from their 3206 // persisted state on disk and return a slice of them 3207 // 3208 // This does not add them to the local 3209 func (a *Agent) loadPersistedProxies() (map[string]persistedProxy, error) { 3210 persistedProxies := make(map[string]persistedProxy) 3211 3212 proxyDir := filepath.Join(a.config.DataDir, proxyDir) 3213 files, err := ioutil.ReadDir(proxyDir) 3214 if err != nil { 3215 if !os.IsNotExist(err) { 3216 return nil, fmt.Errorf("Failed reading proxies dir %q: %s", proxyDir, err) 3217 } 3218 } 3219 3220 for _, fi := range files { 3221 // Skip all dirs 3222 if fi.IsDir() { 3223 continue 3224 } 3225 3226 // Skip all partially written temporary files 3227 if 
strings.HasSuffix(fi.Name(), "tmp") { 3228 return nil, fmt.Errorf("Ignoring temporary proxy file %v", fi.Name()) 3229 } 3230 3231 // Open the file for reading 3232 file := filepath.Join(proxyDir, fi.Name()) 3233 fh, err := os.Open(file) 3234 if err != nil { 3235 return nil, fmt.Errorf("failed opening proxy file %q: %s", file, err) 3236 } 3237 3238 // Read the contents into a buffer 3239 buf, err := ioutil.ReadAll(fh) 3240 fh.Close() 3241 if err != nil { 3242 return nil, fmt.Errorf("failed reading proxy file %q: %s", file, err) 3243 } 3244 3245 // Try decoding the proxy definition 3246 var p persistedProxy 3247 if err := json.Unmarshal(buf, &p); err != nil { 3248 return nil, fmt.Errorf("Failed decoding proxy file %q: %s", file, err) 3249 } 3250 svcID := p.Proxy.TargetServiceID 3251 3252 persistedProxies[svcID] = p 3253 } 3254 3255 return persistedProxies, nil 3256 } 3257 3258 // loadProxies will load connect proxy definitions from configuration and 3259 // persisted definitions on disk, and load them into the local agent. 
3260 func (a *Agent) loadProxies(conf *config.RuntimeConfig) error { 3261 persistedProxies, persistenceErr := a.loadPersistedProxies() 3262 3263 for _, svc := range conf.Services { 3264 if svc.Connect != nil { 3265 proxy, err := svc.ConnectManagedProxy() 3266 if err != nil { 3267 return fmt.Errorf("failed adding proxy: %s", err) 3268 } 3269 if proxy == nil { 3270 continue 3271 } 3272 restoredToken := "" 3273 if persisted, ok := persistedProxies[proxy.TargetServiceID]; ok { 3274 restoredToken = persisted.ProxyToken 3275 } 3276 3277 if err := a.addProxyLocked(proxy, true, true, restoredToken, ConfigSourceLocal); err != nil { 3278 return fmt.Errorf("failed adding proxy: %s", err) 3279 } 3280 } 3281 } 3282 3283 for _, persisted := range persistedProxies { 3284 proxyID := persisted.Proxy.ProxyService.ID 3285 if persisted.FromFile && a.State.Proxy(proxyID) == nil { 3286 // Purge proxies that were configured previously but are no longer in the config 3287 a.logger.Printf("[DEBUG] agent: purging stale persisted proxy %q", proxyID) 3288 if err := a.purgeProxy(proxyID); err != nil { 3289 return fmt.Errorf("failed purging proxy %q: %v", proxyID, err) 3290 } 3291 } else if !persisted.FromFile { 3292 if a.State.Proxy(proxyID) == nil { 3293 a.logger.Printf("[DEBUG] agent: restored proxy definition %q", proxyID) 3294 if err := a.addProxyLocked(persisted.Proxy, false, false, persisted.ProxyToken, ConfigSourceLocal); err != nil { 3295 return fmt.Errorf("failed adding proxy %q: %v", proxyID, err) 3296 } 3297 } else { 3298 a.logger.Printf("[WARN] agent: proxy definition %q was overwritten by a proxy definition within a config file", proxyID) 3299 } 3300 } 3301 } 3302 3303 return persistenceErr 3304 } 3305 3306 type persistedTokens struct { 3307 Replication string `json:"replication,omitempty"` 3308 AgentMaster string `json:"agent_master,omitempty"` 3309 Default string `json:"default,omitempty"` 3310 Agent string `json:"agent,omitempty"` 3311 } 3312 3313 func (a *Agent) 
getPersistedTokens() (*persistedTokens, error) { 3314 persistedTokens := &persistedTokens{} 3315 if !a.config.ACLEnableTokenPersistence { 3316 return persistedTokens, nil 3317 } 3318 3319 a.persistedTokensLock.RLock() 3320 defer a.persistedTokensLock.RUnlock() 3321 3322 tokensFullPath := filepath.Join(a.config.DataDir, tokensPath) 3323 3324 buf, err := ioutil.ReadFile(tokensFullPath) 3325 if err != nil { 3326 if os.IsNotExist(err) { 3327 // non-existence is not an error we care about 3328 return persistedTokens, nil 3329 } 3330 return persistedTokens, fmt.Errorf("failed reading tokens file %q: %s", tokensFullPath, err) 3331 } 3332 3333 if err := json.Unmarshal(buf, persistedTokens); err != nil { 3334 return persistedTokens, fmt.Errorf("failed to decode tokens file %q: %s", tokensFullPath, err) 3335 } 3336 3337 return persistedTokens, nil 3338 } 3339 3340 func (a *Agent) loadTokens(conf *config.RuntimeConfig) error { 3341 persistedTokens, persistenceErr := a.getPersistedTokens() 3342 3343 if persistenceErr != nil { 3344 a.logger.Printf("[WARN] unable to load persisted tokens: %v", persistenceErr) 3345 } 3346 3347 if persistedTokens.Default != "" { 3348 a.tokens.UpdateUserToken(persistedTokens.Default, token.TokenSourceAPI) 3349 3350 if conf.ACLToken != "" { 3351 a.logger.Printf("[WARN] \"default\" token present in both the configuration and persisted token store, using the persisted token") 3352 } 3353 } else { 3354 a.tokens.UpdateUserToken(conf.ACLToken, token.TokenSourceConfig) 3355 } 3356 3357 if persistedTokens.Agent != "" { 3358 a.tokens.UpdateAgentToken(persistedTokens.Agent, token.TokenSourceAPI) 3359 3360 if conf.ACLAgentToken != "" { 3361 a.logger.Printf("[WARN] \"agent\" token present in both the configuration and persisted token store, using the persisted token") 3362 } 3363 } else { 3364 a.tokens.UpdateAgentToken(conf.ACLAgentToken, token.TokenSourceConfig) 3365 } 3366 3367 if persistedTokens.AgentMaster != "" { 3368 
a.tokens.UpdateAgentMasterToken(persistedTokens.AgentMaster, token.TokenSourceAPI) 3369 3370 if conf.ACLAgentMasterToken != "" { 3371 a.logger.Printf("[WARN] \"agent_master\" token present in both the configuration and persisted token store, using the persisted token") 3372 } 3373 } else { 3374 a.tokens.UpdateAgentMasterToken(conf.ACLAgentMasterToken, token.TokenSourceConfig) 3375 } 3376 3377 if persistedTokens.Replication != "" { 3378 a.tokens.UpdateReplicationToken(persistedTokens.Replication, token.TokenSourceAPI) 3379 3380 if conf.ACLReplicationToken != "" { 3381 a.logger.Printf("[WARN] \"replication\" token present in both the configuration and persisted token store, using the persisted token") 3382 } 3383 } else { 3384 a.tokens.UpdateReplicationToken(conf.ACLReplicationToken, token.TokenSourceConfig) 3385 } 3386 3387 return persistenceErr 3388 } 3389 3390 // unloadProxies will deregister all proxies known to the local agent. 3391 func (a *Agent) unloadProxies() error { 3392 for id := range a.State.Proxies() { 3393 if err := a.removeProxyLocked(id, false); err != nil { 3394 return fmt.Errorf("Failed deregistering proxy '%s': %s", id, err) 3395 } 3396 } 3397 return nil 3398 } 3399 3400 // snapshotCheckState is used to snapshot the current state of the health 3401 // checks. This is done before we reload our checks, so that we can properly 3402 // restore into the same state. 3403 func (a *Agent) snapshotCheckState() map[types.CheckID]*structs.HealthCheck { 3404 return a.State.Checks() 3405 } 3406 3407 // restoreCheckState is used to reset the health state based on a snapshot. 3408 // This is done after we finish the reload to avoid any unnecessary flaps 3409 // in health state and potential session invalidations. 
3410 func (a *Agent) restoreCheckState(snap map[types.CheckID]*structs.HealthCheck) { 3411 for id, check := range snap { 3412 a.State.UpdateCheck(id, check.Status, check.Output) 3413 } 3414 } 3415 3416 // loadMetadata loads node metadata fields from the agent config and 3417 // updates them on the local agent. 3418 func (a *Agent) loadMetadata(conf *config.RuntimeConfig) error { 3419 meta := map[string]string{} 3420 for k, v := range conf.NodeMeta { 3421 meta[k] = v 3422 } 3423 meta[structs.MetaSegmentKey] = conf.SegmentName 3424 return a.State.LoadMetadata(meta) 3425 } 3426 3427 // unloadMetadata resets the local metadata state 3428 func (a *Agent) unloadMetadata() { 3429 a.State.UnloadMetadata() 3430 } 3431 3432 // serviceMaintCheckID returns the ID of a given service's maintenance check 3433 func serviceMaintCheckID(serviceID string) types.CheckID { 3434 return types.CheckID(structs.ServiceMaintPrefix + serviceID) 3435 } 3436 3437 // EnableServiceMaintenance will register a false health check against the given 3438 // service ID with critical status. This will exclude the service from queries. 
3439 func (a *Agent) EnableServiceMaintenance(serviceID, reason, token string) error { 3440 service, ok := a.State.Services()[serviceID] 3441 if !ok { 3442 return fmt.Errorf("No service registered with ID %q", serviceID) 3443 } 3444 3445 // Check if maintenance mode is not already enabled 3446 checkID := serviceMaintCheckID(serviceID) 3447 if _, ok := a.State.Checks()[checkID]; ok { 3448 return nil 3449 } 3450 3451 // Use default notes if no reason provided 3452 if reason == "" { 3453 reason = defaultServiceMaintReason 3454 } 3455 3456 // Create and register the critical health check 3457 check := &structs.HealthCheck{ 3458 Node: a.config.NodeName, 3459 CheckID: checkID, 3460 Name: "Service Maintenance Mode", 3461 Notes: reason, 3462 ServiceID: service.ID, 3463 ServiceName: service.Service, 3464 Status: api.HealthCritical, 3465 } 3466 a.AddCheck(check, nil, true, token, ConfigSourceLocal) 3467 a.logger.Printf("[INFO] agent: Service %q entered maintenance mode", serviceID) 3468 3469 return nil 3470 } 3471 3472 // DisableServiceMaintenance will deregister the fake maintenance mode check 3473 // if the service has been marked as in maintenance. 3474 func (a *Agent) DisableServiceMaintenance(serviceID string) error { 3475 if _, ok := a.State.Services()[serviceID]; !ok { 3476 return fmt.Errorf("No service registered with ID %q", serviceID) 3477 } 3478 3479 // Check if maintenance mode is enabled 3480 checkID := serviceMaintCheckID(serviceID) 3481 if _, ok := a.State.Checks()[checkID]; !ok { 3482 return nil 3483 } 3484 3485 // Deregister the maintenance check 3486 a.RemoveCheck(checkID, true) 3487 a.logger.Printf("[INFO] agent: Service %q left maintenance mode", serviceID) 3488 3489 return nil 3490 } 3491 3492 // EnableNodeMaintenance places a node into maintenance mode. 
3493 func (a *Agent) EnableNodeMaintenance(reason, token string) { 3494 // Ensure node maintenance is not already enabled 3495 if _, ok := a.State.Checks()[structs.NodeMaint]; ok { 3496 return 3497 } 3498 3499 // Use a default notes value 3500 if reason == "" { 3501 reason = defaultNodeMaintReason 3502 } 3503 3504 // Create and register the node maintenance check 3505 check := &structs.HealthCheck{ 3506 Node: a.config.NodeName, 3507 CheckID: structs.NodeMaint, 3508 Name: "Node Maintenance Mode", 3509 Notes: reason, 3510 Status: api.HealthCritical, 3511 } 3512 a.AddCheck(check, nil, true, token, ConfigSourceLocal) 3513 a.logger.Printf("[INFO] agent: Node entered maintenance mode") 3514 } 3515 3516 // DisableNodeMaintenance removes a node from maintenance mode 3517 func (a *Agent) DisableNodeMaintenance() { 3518 if _, ok := a.State.Checks()[structs.NodeMaint]; !ok { 3519 return 3520 } 3521 a.RemoveCheck(structs.NodeMaint, true) 3522 a.logger.Printf("[INFO] agent: Node left maintenance mode") 3523 } 3524 3525 func (a *Agent) loadLimits(conf *config.RuntimeConfig) { 3526 a.config.RPCRateLimit = conf.RPCRateLimit 3527 a.config.RPCMaxBurst = conf.RPCMaxBurst 3528 } 3529 3530 func (a *Agent) ReloadConfig(newCfg *config.RuntimeConfig) error { 3531 // Bulk update the services and checks 3532 a.PauseSync() 3533 defer a.ResumeSync() 3534 3535 a.stateLock.Lock() 3536 defer a.stateLock.Unlock() 3537 3538 // Snapshot the current state, and restore it afterwards 3539 snap := a.snapshotCheckState() 3540 defer a.restoreCheckState(snap) 3541 3542 // First unload all checks, services, and metadata. This lets us begin the reload 3543 // with a clean slate. 
3544 if err := a.unloadProxies(); err != nil { 3545 return fmt.Errorf("Failed unloading proxies: %s", err) 3546 } 3547 if err := a.unloadServices(); err != nil { 3548 return fmt.Errorf("Failed unloading services: %s", err) 3549 } 3550 if err := a.unloadChecks(); err != nil { 3551 return fmt.Errorf("Failed unloading checks: %s", err) 3552 } 3553 a.unloadMetadata() 3554 3555 // Reload tokens - should be done before all the other loading 3556 // to ensure the correct tokens are available for attaching to 3557 // the checks and service registrations. 3558 a.loadTokens(newCfg) 3559 3560 if err := a.tlsConfigurator.Update(newCfg.ToTLSUtilConfig()); err != nil { 3561 return fmt.Errorf("Failed reloading tls configuration: %s", err) 3562 } 3563 3564 // Reload service/check definitions and metadata. 3565 if err := a.loadServices(newCfg); err != nil { 3566 return fmt.Errorf("Failed reloading services: %s", err) 3567 } 3568 if err := a.loadProxies(newCfg); err != nil { 3569 return fmt.Errorf("Failed reloading proxies: %s", err) 3570 } 3571 if err := a.loadChecks(newCfg); err != nil { 3572 return fmt.Errorf("Failed reloading checks: %s", err) 3573 } 3574 if err := a.loadMetadata(newCfg); err != nil { 3575 return fmt.Errorf("Failed reloading metadata: %s", err) 3576 } 3577 3578 if err := a.reloadWatches(newCfg); err != nil { 3579 return fmt.Errorf("Failed reloading watches: %v", err) 3580 } 3581 3582 a.loadLimits(newCfg) 3583 3584 // create the config for the rpc server/client 3585 consulCfg, err := a.consulConfig() 3586 if err != nil { 3587 return err 3588 } 3589 3590 if err := a.delegate.ReloadConfig(consulCfg); err != nil { 3591 return err 3592 } 3593 3594 // Update filtered metrics 3595 metrics.UpdateFilter(newCfg.Telemetry.AllowedPrefixes, 3596 newCfg.Telemetry.BlockedPrefixes) 3597 3598 a.State.SetDiscardCheckOutput(newCfg.DiscardCheckOutput) 3599 3600 return nil 3601 } 3602 3603 // registerCache configures the cache and registers all the supported 3604 // types onto the 
cache. This is NOT safe to call multiple times so 3605 // care should be taken to call this exactly once after the cache 3606 // field has been initialized. 3607 func (a *Agent) registerCache() { 3608 // Note that you should register the _agent_ as the RPC implementation and not 3609 // the a.delegate directly, otherwise tests that rely on overriding RPC 3610 // routing via a.registerEndpoint will not work. 3611 3612 a.cache.RegisterType(cachetype.ConnectCARootName, &cachetype.ConnectCARoot{ 3613 RPC: a, 3614 }, &cache.RegisterOptions{ 3615 // Maintain a blocking query, retry dropped connections quickly 3616 Refresh: true, 3617 RefreshTimer: 0 * time.Second, 3618 RefreshTimeout: 10 * time.Minute, 3619 }) 3620 3621 a.cache.RegisterType(cachetype.ConnectCALeafName, &cachetype.ConnectCALeaf{ 3622 RPC: a, 3623 Cache: a.cache, 3624 Datacenter: a.config.Datacenter, 3625 TestOverrideCAChangeInitialDelay: a.config.ConnectTestCALeafRootChangeSpread, 3626 }, &cache.RegisterOptions{ 3627 // Maintain a blocking query, retry dropped connections quickly 3628 Refresh: true, 3629 RefreshTimer: 0 * time.Second, 3630 RefreshTimeout: 10 * time.Minute, 3631 }) 3632 3633 a.cache.RegisterType(cachetype.IntentionMatchName, &cachetype.IntentionMatch{ 3634 RPC: a, 3635 }, &cache.RegisterOptions{ 3636 // Maintain a blocking query, retry dropped connections quickly 3637 Refresh: true, 3638 RefreshTimer: 0 * time.Second, 3639 RefreshTimeout: 10 * time.Minute, 3640 }) 3641 3642 a.cache.RegisterType(cachetype.CatalogServicesName, &cachetype.CatalogServices{ 3643 RPC: a, 3644 }, &cache.RegisterOptions{ 3645 // Maintain a blocking query, retry dropped connections quickly 3646 Refresh: true, 3647 RefreshTimer: 0 * time.Second, 3648 RefreshTimeout: 10 * time.Minute, 3649 }) 3650 3651 a.cache.RegisterType(cachetype.HealthServicesName, &cachetype.HealthServices{ 3652 RPC: a, 3653 }, &cache.RegisterOptions{ 3654 // Maintain a blocking query, retry dropped connections quickly 3655 Refresh: true, 3656 
RefreshTimer: 0 * time.Second, 3657 RefreshTimeout: 10 * time.Minute, 3658 }) 3659 3660 a.cache.RegisterType(cachetype.PreparedQueryName, &cachetype.PreparedQuery{ 3661 RPC: a, 3662 }, &cache.RegisterOptions{ 3663 // Prepared queries don't support blocking 3664 Refresh: false, 3665 }) 3666 3667 a.cache.RegisterType(cachetype.NodeServicesName, &cachetype.NodeServices{ 3668 RPC: a, 3669 }, &cache.RegisterOptions{ 3670 // Maintain a blocking query, retry dropped connections quickly 3671 Refresh: true, 3672 RefreshTimer: 0 * time.Second, 3673 RefreshTimeout: 10 * time.Minute, 3674 }) 3675 } 3676 3677 // defaultProxyCommand returns the default Connect managed proxy command. 3678 func defaultProxyCommand(agentCfg *config.RuntimeConfig) ([]string, error) { 3679 // Get the path to the current executable. This is cached once by the 3680 // library so this is effectively just a variable read. 3681 execPath, err := os.Executable() 3682 if err != nil { 3683 return nil, err 3684 } 3685 3686 // "consul connect proxy" default value for managed daemon proxy 3687 cmd := []string{execPath, "connect", "proxy"} 3688 3689 if agentCfg != nil && agentCfg.LogLevel != "INFO" { 3690 cmd = append(cmd, "-log-level", agentCfg.LogLevel) 3691 } 3692 return cmd, nil 3693 }