github.com/smithx10/nomad@v0.9.1-rc1/command/agent/consul/client.go

package consul

import (
	"context"
	"fmt"
	"net"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	metrics "github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/drivers"
)

const (
	// nomadServicePrefix is the prefix that scopes all Nomad registered
	// services (both agent and task entries).
	nomadServicePrefix = "_nomad"

	// nomadTaskPrefix is the prefix that scopes Nomad registered services
	// for tasks.
	nomadTaskPrefix = nomadServicePrefix + "-task-"

	// defaultRetryInterval is how quickly to retry syncing services and
	// checks to Consul when an error occurs. Will back off up to a max.
	defaultRetryInterval = time.Second

	// defaultMaxRetryInterval is the default max retry interval.
	defaultMaxRetryInterval = 30 * time.Second

	// defaultPeriodicInterval is the interval at which the service
	// client reconciles state between the desired services and checks and
	// what's actually registered in Consul. This is done at an interval,
	// rather than being purely edge triggered, to handle the case that the
	// Consul agent's state may change underneath us.
	defaultPeriodicInterval = 30 * time.Second

	// ttlCheckBuffer is the time interval that Nomad can take to report
	// the check result to Consul.
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services
	ServiceTagSerf = "serf"
)

// CatalogAPI is the consul/api.Catalog API used by Nomad.
type CatalogAPI interface {
	Datacenters() ([]string, error)
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}

// AgentAPI is the consul/api.Agent API used by Nomad.
type AgentAPI interface {
	Services() (map[string]*api.AgentService, error)
	Checks() (map[string]*api.AgentCheck, error)
	CheckRegister(check *api.AgentCheckRegistration) error
	CheckDeregister(checkID string) error
	Self() (map[string]map[string]interface{}, error)
	ServiceRegister(service *api.AgentServiceRegistration) error
	ServiceDeregister(serviceID string) error
	UpdateTTL(id, output, status string) error
}

// operations are submitted to the main loop via commit() for synchronizing
// with Consul.
type operations struct {
	regServices []*api.AgentServiceRegistration
	regChecks   []*api.AgentCheckRegistration
	scripts     []*scriptCheck

	deregServices []string
	deregChecks   []string
}
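
// Illustrative sketch only: callers accumulate registrations and
// deregistrations in an operations value and hand it to the main loop via
// commit(). Here c is a *ServiceClient, and serviceReg and checkReg are
// hypothetical placeholders built elsewhere.
//
//	ops := &operations{
//		regServices:   []*api.AgentServiceRegistration{serviceReg},
//		regChecks:     []*api.AgentCheckRegistration{checkReg},
//		deregServices: []string{"some-old-service-id"},
//	}
//	c.commit(ops)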

// AllocRegistration holds the status of services registered for a particular
// allocation, keyed by task.
type AllocRegistration struct {
	// Tasks maps the name of a task to its registered services and checks
	Tasks map[string]*TaskRegistration
}

func (a *AllocRegistration) copy() *AllocRegistration {
	c := &AllocRegistration{
		Tasks: make(map[string]*TaskRegistration, len(a.Tasks)),
	}

	for k, v := range a.Tasks {
		c.Tasks[k] = v.copy()
	}

	return c
}

// NumServices returns the number of registered services
func (a *AllocRegistration) NumServices() int {
	if a == nil {
		return 0
	}

	total := 0
	for _, treg := range a.Tasks {
		for _, sreg := range treg.Services {
			if sreg.Service != nil {
				total++
			}
		}
	}

	return total
}

// NumChecks returns the number of registered checks
func (a *AllocRegistration) NumChecks() int {
	if a == nil {
		return 0
	}

	total := 0
	for _, treg := range a.Tasks {
		for _, sreg := range treg.Services {
			total += len(sreg.Checks)
		}
	}

	return total
}

// TaskRegistration holds the status of services registered for a particular
// task.
type TaskRegistration struct {
	Services map[string]*ServiceRegistration
}

func (t *TaskRegistration) copy() *TaskRegistration {
	c := &TaskRegistration{
		Services: make(map[string]*ServiceRegistration, len(t.Services)),
	}

	for k, v := range t.Services {
		c.Services[k] = v.copy()
	}

	return c
}

// ServiceRegistration holds the status of a registered Consul Service and its
// Checks.
type ServiceRegistration struct {
	// serviceID and checkIDs are internal fields that track just the IDs of
	// the services/checks registered in Consul. They are used to materialize
	// the other fields when queried.
	serviceID string
	checkIDs  map[string]struct{}

	// Service is the AgentService registered in Consul.
	Service *api.AgentService

	// Checks is the status of the registered checks.
	Checks []*api.AgentCheck
}

func (s *ServiceRegistration) copy() *ServiceRegistration {
	// copy does not copy the external fields, only the internal ones. This
	// keeps callers of AllocRegistrations from accessing internal state;
	// that method uses the internal IDs to populate the external fields.
	return &ServiceRegistration{
		serviceID: s.serviceID,
		checkIDs:  helper.CopyMapStringStruct(s.checkIDs),
	}
}

// ServiceClient handles task and agent service registration with Consul.
type ServiceClient struct {
	client           AgentAPI
	logger           log.Logger
	retryInterval    time.Duration
	maxRetryInterval time.Duration
	periodicInterval time.Duration

	// exitCh is closed when the main Run loop exits
	exitCh chan struct{}

	// shutdownCh is closed when the client should shutdown
	shutdownCh chan struct{}

	// shutdownWait is how long Shutdown() blocks waiting for the final
	// sync() to finish. Defaults to defaultShutdownWait
	shutdownWait time.Duration

	opCh chan *operations

	services       map[string]*api.AgentServiceRegistration
	checks         map[string]*api.AgentCheckRegistration
	scripts        map[string]*scriptCheck
	runningScripts map[string]*scriptHandle

	// allocRegistrations stores the services and checks that are registered
	// with Consul by allocation ID.
	allocRegistrations     map[string]*AllocRegistration
	allocRegistrationsLock sync.RWMutex

	// agent services and checks record entries for the agent itself which
	// should be removed on shutdown
	agentServices map[string]struct{}
	agentChecks   map[string]struct{}
	agentLock     sync.Mutex

	// seen is 1 if Consul has ever been seen; otherwise 0. Accessed with
	// atomics.
	seen int32

	// checkWatcher restarts checks that are unhealthy.
	checkWatcher *checkWatcher

	// isClientAgent specifies whether this Consul client is being used
	// by a Nomad client.
	isClientAgent bool
}

// NewServiceClient creates a new Consul ServiceClient from an existing Consul
// API client and logger, plus a flag indicating whether it is used by a Nomad
// client agent. When used by a Nomad client, this Consul client reconciles all
// services and checks created by Nomad on behalf of running tasks.
func NewServiceClient(consulClient AgentAPI, logger log.Logger, isNomadClient bool) *ServiceClient {
	logger = logger.ResetNamed("consul.sync")
	return &ServiceClient{
		client:             consulClient,
		logger:             logger,
		retryInterval:      defaultRetryInterval,
		maxRetryInterval:   defaultMaxRetryInterval,
		periodicInterval:   defaultPeriodicInterval,
		exitCh:             make(chan struct{}),
		shutdownCh:         make(chan struct{}),
		shutdownWait:       defaultShutdownWait,
		opCh:               make(chan *operations, 8),
		services:           make(map[string]*api.AgentServiceRegistration),
		checks:             make(map[string]*api.AgentCheckRegistration),
		scripts:            make(map[string]*scriptCheck),
		runningScripts:     make(map[string]*scriptHandle),
		allocRegistrations: make(map[string]*AllocRegistration),
		agentServices:      make(map[string]struct{}),
		agentChecks:        make(map[string]struct{}),
		checkWatcher:       newCheckWatcher(logger, consulClient),
		isClientAgent:      isNomadClient,
	}
}

// seen is used by markSeen and hasSeen
const seen = 1

// markSeen marks Consul as having been seen (meaning at least one operation
// has succeeded).
func (c *ServiceClient) markSeen() {
	atomic.StoreInt32(&c.seen, seen)
}

// hasSeen returns true if any Consul operation has ever succeeded. Useful to
// squelch errors if Consul isn't running.
func (c *ServiceClient) hasSeen() bool {
	return atomic.LoadInt32(&c.seen) == seen
}

// Run the Consul main loop which retries operations against Consul. It should
// be called exactly once.
func (c *ServiceClient) Run() {
	defer close(c.exitCh)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// init will be closed when Consul has been contacted
	init := make(chan struct{})
	go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init)

	// Process operations while waiting for initial contact with Consul but
	// do not sync until contact has been made.
INIT:
	for {
		select {
		case <-init:
			c.markSeen()
			break INIT
		case <-c.shutdownCh:
			return
		case ops := <-c.opCh:
			c.merge(ops)
		}
	}
	c.logger.Trace("able to contact Consul")

	// Contact with Consul has been established; start the checkWatcher
	go c.checkWatcher.Run(ctx)

	// Always immediately sync to reconcile Nomad and Consul's state
	retryTimer := time.NewTimer(0)

	failures := 0
	for {
		select {
		case <-retryTimer.C:
		case <-c.shutdownCh:
			// Cancel check watcher but sync one last time
			cancel()
		case ops := <-c.opCh:
			c.merge(ops)
		}

		if err := c.sync(); err != nil {
			if failures == 0 {
				// Log on the first failure
				c.logger.Warn("failed to update services in Consul", "error", err)
			} else if failures%10 == 0 {
				// Log every 10th consecutive failure
				c.logger.Error("still unable to update services in Consul", "failures", failures, "error", err)
			}

			failures++
			if !retryTimer.Stop() {
				// Timer already expired; since the timer may
				// or may not have been read in the select{}
				// above, conditionally receive on it
				select {
				case <-retryTimer.C:
				default:
				}
			}
			backoff := c.retryInterval * time.Duration(failures)
			if backoff > c.maxRetryInterval {
				backoff = c.maxRetryInterval
			}
			retryTimer.Reset(backoff)
		} else {
			if failures > 0 {
				c.logger.Info("successfully updated services in Consul")
				failures = 0
			}

			// Reset timer to periodic interval to periodically
			// reconcile with Consul
			if !retryTimer.Stop() {
				select {
				case <-retryTimer.C:
				default:
				}
			}
			retryTimer.Reset(c.periodicInterval)
		}

		select {
		case <-c.shutdownCh:
			// Exit only after sync'ing all outstanding operations
			if len(c.opCh) > 0 {
				for len(c.opCh) > 0 {
					c.merge(<-c.opCh)
				}
				continue
			}
			return
		default:
		}
	}
}

// commit operations unless already shutting down.
func (c *ServiceClient) commit(ops *operations) {
	select {
	case c.opCh <- ops:
	case <-c.shutdownCh:
	}
}

// merge registrations into state map prior to sync'ing with Consul
func (c *ServiceClient) merge(ops *operations) {
	for _, s := range ops.regServices {
		c.services[s.ID] = s
	}
	for _, check := range ops.regChecks {
		c.checks[check.ID] = check
	}
	for _, s := range ops.scripts {
		c.scripts[s.id] = s
	}
	for _, sid := range ops.deregServices {
		delete(c.services, sid)
	}
	for _, cid := range ops.deregChecks {
		if script, ok := c.runningScripts[cid]; ok {
			script.cancel()
			delete(c.scripts, cid)
			delete(c.runningScripts, cid)
		}
		delete(c.checks, cid)
	}
	metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services)))
	metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks)))
	metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts)))
}
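
// Note on retry behavior: sync is driven by the Run loop above. With the
// defaults, consecutive sync failures back off linearly (1s, 2s, 3s, ...),
// capped at defaultMaxRetryInterval (30s). After a successful sync the timer
// resets to defaultPeriodicInterval (30s) so state is still reconciled
// periodically.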

// sync enqueued operations.
func (c *ServiceClient) sync() error {
	sreg, creg, sdereg, cdereg := 0, 0, 0, 0

	consulServices, err := c.client.Services()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul services: %v", err)
	}

	consulChecks, err := c.client.Checks()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul checks: %v", err)
	}

	// Remove Nomad services in Consul but unknown locally
	for id := range consulServices {
		if _, ok := c.services[id]; ok {
			// Known service, skip
			continue
		}

		// Ignore if this is not a Nomad managed service. Also ignore
		// Nomad managed services if this is not a client agent.
		// This is to prevent server agents from removing services
		// registered by client agents
		if !isNomadService(id) || !c.isClientAgent {
			// Not managed by Nomad, skip
			continue
		}

		// Unknown Nomad managed service; kill
		if err := c.client.ServiceDeregister(id); err != nil {
			if isOldNomadService(id) {
				// Don't hard-fail on old entries. See #3620
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		sdereg++
		metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
	}

	// Add Nomad services missing from Consul
	for id, locals := range c.services {
		if _, ok := consulServices[id]; !ok {
			if err = c.client.ServiceRegister(locals); err != nil {
				metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
				return err
			}
			sreg++
			metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
		}
	}

	// Remove Nomad checks in Consul but unknown locally
	for id, check := range consulChecks {
		if _, ok := c.checks[id]; ok {
			// Known check, leave it
			continue
		}

		// Ignore if this is not a Nomad managed check. Also ignore
		// Nomad managed checks if this is not a client agent.
		// This is to prevent server agents from removing checks
		// registered by client agents
		if !isNomadService(check.ServiceID) || !c.isClientAgent {
			// Service not managed by Nomad, skip
			continue
		}

		// Unknown Nomad managed check; remove
		if err := c.client.CheckDeregister(id); err != nil {
			if isOldNomadService(check.ServiceID) {
				// Don't hard-fail on old entries.
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		cdereg++
		metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
	}

	// Add Nomad checks missing from Consul
	for id, check := range c.checks {
		if _, ok := consulChecks[id]; ok {
			// Already in Consul; skipping
			continue
		}

		if err := c.client.CheckRegister(check); err != nil {
			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		creg++
		metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)

		// Handle starting scripts
		if script, ok := c.scripts[id]; ok {
			// If it's already running, cancel and replace
			if oldScript, running := c.runningScripts[id]; running {
				oldScript.cancel()
			}
			// Start and store the handle
			c.runningScripts[id] = script.run()
		}
	}

	// Only log if something was actually synced
	if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 {
		c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg,
			"registered_checks", creg, "deregistered_checks", cdereg)
	}
	return nil
}

// RegisterAgent registers Nomad agents (client or server). The
// Service.PortLabel should be a literal port to be parsed with SplitHostPort.
// Script checks are not supported and will return an error. Registration is
// asynchronous.
//
// Agents will be deregistered when Shutdown is called.
func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
	ops := operations{}

	for _, service := range services {
		id := makeAgentServiceID(role, service)

		// Unlike tasks, agents don't use port labels. Agent ports are
		// stored directly in the PortLabel.
		host, rawport, err := net.SplitHostPort(service.PortLabel)
		if err != nil {
			return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
		}
		port, err := strconv.Atoi(rawport)
		if err != nil {
			return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
		}
		serviceReg := &api.AgentServiceRegistration{
			ID:      id,
			Name:    service.Name,
			Tags:    service.Tags,
			Address: host,
			Port:    port,
			// This enables the consul UI to show that Nomad registered this service
			Meta: map[string]string{
				"external-source": "nomad",
			},
		}
		ops.regServices = append(ops.regServices, serviceReg)

		for _, check := range service.Checks {
			checkID := makeCheckID(id, check)
			if check.Type == structs.ServiceCheckScript {
				return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
			}
			checkHost, checkPort := serviceReg.Address, serviceReg.Port
			if check.PortLabel != "" {
				// Unlike tasks, agents don't use port labels. Agent ports are
				// stored directly in the PortLabel.
				host, rawport, err := net.SplitHostPort(check.PortLabel)
				if err != nil {
					return fmt.Errorf("error parsing port label %q from check %q: %v", check.PortLabel, check.Name, err)
				}
				port, err := strconv.Atoi(rawport)
				if err != nil {
					return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err)
				}
				checkHost, checkPort = host, port
			}
			checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort)
			if err != nil {
				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
			}
			ops.regChecks = append(ops.regChecks, checkReg)
		}
	}

	// Don't bother committing agent checks if we're already shutting down
	c.agentLock.Lock()
	defer c.agentLock.Unlock()
	select {
	case <-c.shutdownCh:
		return nil
	default:
	}

	// Now add them to the registration queue
	c.commit(&ops)

	// Record IDs for deregistering on shutdown
	for _, id := range ops.regServices {
		c.agentServices[id.ID] = struct{}{}
	}
	for _, id := range ops.regChecks {
		c.agentChecks[id.ID] = struct{}{}
	}
	return nil
}

// serviceRegs creates service registrations, check registrations, and script
// checks from a service. It returns a service registration object with the
// service and check IDs populated.
func (c *ServiceClient) serviceRegs(ops *operations, service *structs.Service, task *TaskServices) (
	*ServiceRegistration, error) {

	// Get the service's ID
	id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
	sreg := &ServiceRegistration{
		serviceID: id,
		checkIDs:  make(map[string]struct{}, len(service.Checks)),
	}

	// Service address modes default to auto
	addrMode := service.AddressMode
	if addrMode == "" {
		addrMode = structs.AddressModeAuto
	}

	// Determine the address to advertise based on the mode
	ip, port, err := getAddress(addrMode, service.PortLabel, task.Networks, task.DriverNetwork)
	if err != nil {
		return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err)
	}

	// Determine whether to use tags or canary_tags
	var tags []string
	if task.Canary && len(service.CanaryTags) > 0 {
		tags = make([]string, len(service.CanaryTags))
		copy(tags, service.CanaryTags)
	} else {
		tags = make([]string, len(service.Tags))
		copy(tags, service.Tags)
	}

	// Build the Consul Service registration request
	serviceReg := &api.AgentServiceRegistration{
		ID:      id,
		Name:    service.Name,
		Tags:    tags,
		Address: ip,
		Port:    port,
		// This enables the consul UI to show that Nomad registered this service
		Meta: map[string]string{
			"external-source": "nomad",
		},
	}
	ops.regServices = append(ops.regServices, serviceReg)

	// Build the check registrations
	checkIDs, err := c.checkRegs(ops, id, service, task)
	if err != nil {
		return nil, err
	}
	for _, cid := range checkIDs {
		sreg.checkIDs[cid] = struct{}{}
	}
	return sreg, nil
}
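
// Note: makeTaskServiceID (below) hashes every structs.Service field except
// Checks, and includes the canary flag, so any change to a service definition
// (or a canary flip) yields a new service ID and a fresh registration, while
// check-only edits keep the same service ID and are handled by checkRegs.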

// checkRegs registers the checks for the given service and returns the
// registered check ids.
func (c *ServiceClient) checkRegs(ops *operations, serviceID string, service *structs.Service,
	task *TaskServices) ([]string, error) {

	// Fast path
	numChecks := len(service.Checks)
	if numChecks == 0 {
		return nil, nil
	}

	checkIDs := make([]string, 0, numChecks)
	for _, check := range service.Checks {
		checkID := makeCheckID(serviceID, check)
		checkIDs = append(checkIDs, checkID)
		if check.Type == structs.ServiceCheckScript {
			if task.DriverExec == nil {
				return nil, fmt.Errorf("driver doesn't support script checks")
			}

			sc := newScriptCheck(task.AllocID, task.Name, checkID, check, task.DriverExec,
				c.client, c.logger, c.shutdownCh)
			ops.scripts = append(ops.scripts, sc)

			// Skip getAddress for script checks
			checkReg, err := createCheckReg(serviceID, checkID, check, "", 0)
			if err != nil {
				return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err)
			}
			ops.regChecks = append(ops.regChecks, checkReg)
			continue
		}

		// Default to the service's port but allow check to override
		portLabel := check.PortLabel
		if portLabel == "" {
			// Default to the service's port label
			portLabel = service.PortLabel
		}

		// Checks address mode defaults to host for pre-#3380 backward compat
		addrMode := check.AddressMode
		if addrMode == "" {
			addrMode = structs.AddressModeHost
		}

		ip, port, err := getAddress(addrMode, portLabel, task.Networks, task.DriverNetwork)
		if err != nil {
			return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err)
		}

		checkReg, err := createCheckReg(serviceID, checkID, check, ip, port)
		if err != nil {
			return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err)
		}
		ops.regChecks = append(ops.regChecks, checkReg)
	}
	return checkIDs, nil
}

// RegisterTask with Consul. Adds all service entries and checks to Consul. If
// exec is nil and a script check exists, an error is returned.
//
// If the service IP is set it is used as the address in the service
// registration. Checks will always use the IP from the Task struct (host's
// IP).
//
// Actual communication with Consul is done asynchronously (see Run).
func (c *ServiceClient) RegisterTask(task *TaskServices) error {
	// Fast path
	numServices := len(task.Services)
	if numServices == 0 {
		return nil
	}

	t := new(TaskRegistration)
	t.Services = make(map[string]*ServiceRegistration, numServices)

	ops := &operations{}
	for _, service := range task.Services {
		sreg, err := c.serviceRegs(ops, service, task)
		if err != nil {
			return err
		}
		t.Services[sreg.serviceID] = sreg
	}

	// Add the task to the allocation's registration
	c.addTaskRegistration(task.AllocID, task.Name, t)

	c.commit(ops)

	// Start watching checks. Done after service registrations are built
	// since an error building them could leak watches.
	for _, service := range task.Services {
		serviceID := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
		for _, check := range service.Checks {
			if check.TriggersRestarts() {
				checkID := makeCheckID(serviceID, check)
				c.checkWatcher.Watch(task.AllocID, task.Name, checkID, check, task.Restarter)
			}
		}
	}
	return nil
}
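
// Illustrative usage sketch: c is a *ServiceClient and task is a hypothetical
// *TaskServices built by the caller; error handling elided.
//
//	if err := c.RegisterTask(task); err != nil {
//		// handle registration error
//	}
//	// ... later, when the task stops:
//	c.RemoveTask(task)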

// UpdateTask in Consul. Does not alter the service if only checks have
// changed.
//
// DriverNetwork must not change between invocations for the same allocation.
func (c *ServiceClient) UpdateTask(old, newTask *TaskServices) error {
	ops := &operations{}

	taskReg := new(TaskRegistration)
	taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services))

	existingIDs := make(map[string]*structs.Service, len(old.Services))
	for _, s := range old.Services {
		existingIDs[makeTaskServiceID(old.AllocID, old.Name, s, old.Canary)] = s
	}
	newIDs := make(map[string]*structs.Service, len(newTask.Services))
	for _, s := range newTask.Services {
		newIDs[makeTaskServiceID(newTask.AllocID, newTask.Name, s, newTask.Canary)] = s
	}

	// Loop over existing Service IDs to see if they have been removed or
	// updated.
	for existingID, existingSvc := range existingIDs {
		newSvc, ok := newIDs[existingID]
		if !ok {
			// Existing service entry removed
			ops.deregServices = append(ops.deregServices, existingID)
			for _, check := range existingSvc.Checks {
				cid := makeCheckID(existingID, check)
				ops.deregChecks = append(ops.deregChecks, cid)

				// Unwatch watched checks
				if check.TriggersRestarts() {
					c.checkWatcher.Unwatch(cid)
				}
			}
			continue
		}

		// Service exists and hasn't changed, don't re-add it later
		delete(newIDs, existingID)

		// Service still exists so add it to the task's registration
		sreg := &ServiceRegistration{
			serviceID: existingID,
			checkIDs:  make(map[string]struct{}, len(newSvc.Checks)),
		}
		taskReg.Services[existingID] = sreg

		// See if any checks were updated
		existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
		for _, check := range existingSvc.Checks {
			existingChecks[makeCheckID(existingID, check)] = check
		}

		// Register new checks
		for _, check := range newSvc.Checks {
			checkID := makeCheckID(existingID, check)
			if _, exists := existingChecks[checkID]; exists {
				// Check exists, so don't remove it
				delete(existingChecks, checkID)
				sreg.checkIDs[checkID] = struct{}{}
			}

			// New check on an unchanged service; add them now
			newCheckIDs, err := c.checkRegs(ops, existingID, newSvc, newTask)
			if err != nil {
				return err
			}

			for _, checkID := range newCheckIDs {
				sreg.checkIDs[checkID] = struct{}{}
			}

			// Update all watched checks as CheckRestart fields aren't part of ID
			if check.TriggersRestarts() {
				c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter)
			}
		}

		// Remove existing checks not in updated service
		for cid, check := range existingChecks {
			ops.deregChecks = append(ops.deregChecks, cid)

			// Unwatch checks
			if check.TriggersRestarts() {
				c.checkWatcher.Unwatch(cid)
			}
		}
	}

	// Any remaining services should just be enqueued directly
	for _, newSvc := range newIDs {
		sreg, err := c.serviceRegs(ops, newSvc, newTask)
		if err != nil {
			return err
		}

		taskReg.Services[sreg.serviceID] = sreg
	}

	// Add the task to the allocation's registration
	c.addTaskRegistration(newTask.AllocID, newTask.Name, taskReg)

	c.commit(ops)

	// Start watching checks. Done after service registrations are built
	// since an error building them could leak watches.
	for _, service := range newIDs {
		serviceID := makeTaskServiceID(newTask.AllocID, newTask.Name, service, newTask.Canary)
		for _, check := range service.Checks {
			if check.TriggersRestarts() {
				checkID := makeCheckID(serviceID, check)
				c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter)
			}
		}
	}
	return nil
}

// RemoveTask from Consul. Removes all service entries and checks.
//
// Actual communication with Consul is done asynchronously (see Run).
func (c *ServiceClient) RemoveTask(task *TaskServices) {
	ops := operations{}

	for _, service := range task.Services {
		id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
		ops.deregServices = append(ops.deregServices, id)

		for _, check := range service.Checks {
			cid := makeCheckID(id, check)
			ops.deregChecks = append(ops.deregChecks, cid)

			if check.TriggersRestarts() {
				c.checkWatcher.Unwatch(cid)
			}
		}
	}

	// Remove the task from the alloc's registrations
	c.removeTaskRegistration(task.AllocID, task.Name)

	// Now add them to the deregistration fields; main Run loop will update
	c.commit(&ops)
}

// AllocRegistrations returns the registrations for the given allocation. If
// the allocation has no registrations, the response is a nil object.
func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) {
	// Get the internal struct using the lock
	c.allocRegistrationsLock.RLock()
	regInternal, ok := c.allocRegistrations[allocID]
	if !ok {
		c.allocRegistrationsLock.RUnlock()
		return nil, nil
	}

	// Copy so we don't expose internal structs
	reg := regInternal.copy()
	c.allocRegistrationsLock.RUnlock()

	// Query the services and checks to populate the allocation registrations.
	services, err := c.client.Services()
	if err != nil {
		return nil, err
	}

	checks, err := c.client.Checks()
	if err != nil {
		return nil, err
	}

	// Populate the object
	for _, treg := range reg.Tasks {
		for serviceID, sreg := range treg.Services {
			sreg.Service = services[serviceID]
			for checkID := range sreg.checkIDs {
				if check, ok := checks[checkID]; ok {
					sreg.Checks = append(sreg.Checks, check)
				}
			}
		}
	}

	return reg, nil
}

// Shutdown the Consul client. Update running task registrations and deregister
// agent from Consul. On first call blocks up to shutdownWait before giving up
// on syncing operations.
func (c *ServiceClient) Shutdown() error {
	// Serialize Shutdown calls with RegisterAgent to prevent leaking agent
	// entries.
	c.agentLock.Lock()
	defer c.agentLock.Unlock()
	select {
	case <-c.shutdownCh:
		return nil
	default:
		close(c.shutdownCh)
	}

	// Give run loop time to sync, but don't block indefinitely
	deadline := time.After(c.shutdownWait)

	// Wait for Run to finish any outstanding operations and exit
	select {
	case <-c.exitCh:
	case <-deadline:
		// Don't wait forever though
	}

	// If Consul was never seen nothing could be written so exit early
	if !c.hasSeen() {
		return nil
	}

	// Always attempt to deregister Nomad agent Consul entries, even if
	// deadline was reached
	for id := range c.agentServices {
		if err := c.client.ServiceDeregister(id); err != nil {
			c.logger.Error("failed deregistering agent service", "service_id", id, "error", err)
		}
	}
	for id := range c.agentChecks {
		if err := c.client.CheckDeregister(id); err != nil {
			c.logger.Error("failed deregistering agent check", "check_id", id, "error", err)
		}
	}

	// Give script checks time to exit (no need to lock as Run() has exited)
	for _, h := range c.runningScripts {
		select {
		case <-h.wait():
		case <-deadline:
			return fmt.Errorf("timed out waiting for script checks to run")
		}
	}
	return nil
}

// addTaskRegistration adds the task registration for the given allocation.
func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) {
	c.allocRegistrationsLock.Lock()
	defer c.allocRegistrationsLock.Unlock()

	alloc, ok := c.allocRegistrations[allocID]
	if !ok {
		alloc = &AllocRegistration{
			Tasks: make(map[string]*TaskRegistration),
		}
		c.allocRegistrations[allocID] = alloc
	}
	alloc.Tasks[taskName] = reg
}

// removeTaskRegistration removes the task registration for the given allocation.
func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) {
	c.allocRegistrationsLock.Lock()
	defer c.allocRegistrationsLock.Unlock()

	alloc, ok := c.allocRegistrations[allocID]
	if !ok {
		return
	}

	// Delete the task and if it is the last one also delete the alloc's
	// registration
	delete(alloc.Tasks, taskName)
	if len(alloc.Tasks) == 0 {
		delete(c.allocRegistrations, allocID)
	}
}

// makeAgentServiceID creates a unique ID for identifying an agent service in
// Consul.
//
// Agent service IDs are of the form:
//
//	{nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}))
//
//	Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4
//	Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l
//
func makeAgentServiceID(role string, service *structs.Service) string {
	return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "", false))
}

// makeTaskServiceID creates a unique ID for identifying a task service in
// Consul. All structs.Service fields are included in the ID's hash except
// Checks. This allows updates to merely compare IDs.
//
//	Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH
func makeTaskServiceID(allocID, taskName string, service *structs.Service, canary bool) string {
	return nomadTaskPrefix + service.Hash(allocID, taskName, canary)
}

// makeCheckID creates a unique ID for a check.
func makeCheckID(serviceID string, check *structs.ServiceCheck) string {
	return check.Hash(serviceID)
}

// createCheckReg creates a Check that can be registered with Consul.
//
// Script checks simply have a TTL set and the caller is responsible for
// running the script and heartbeating.
func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) {
	chkReg := api.AgentCheckRegistration{
		ID:        checkID,
		Name:      check.Name,
		ServiceID: serviceID,
	}
	chkReg.Status = check.InitialStatus
	chkReg.Timeout = check.Timeout.String()
	chkReg.Interval = check.Interval.String()

	// Require an address for http or tcp checks
	if port == 0 && check.RequiresPort() {
		return nil, fmt.Errorf("%s checks require an address", check.Type)
	}

	switch check.Type {
	case structs.ServiceCheckHTTP:
		proto := check.Protocol
		if proto == "" {
			proto = "http"
		}
		if check.TLSSkipVerify {
			chkReg.TLSSkipVerify = true
		}
		base := url.URL{
			Scheme: proto,
			Host:   net.JoinHostPort(host, strconv.Itoa(port)),
		}
		relative, err := url.Parse(check.Path)
		if err != nil {
			return nil, err
		}
		url := base.ResolveReference(relative)
		chkReg.HTTP = url.String()
		chkReg.Method = check.Method
		chkReg.Header = check.Header

	case structs.ServiceCheckTCP:
		chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))

	case structs.ServiceCheckScript:
		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
		// As of Consul 1.0.0 setting TTL and Interval is a 400
		chkReg.Interval = ""

	case structs.ServiceCheckGRPC:
		chkReg.GRPC = fmt.Sprintf("%s/%s", net.JoinHostPort(host, strconv.Itoa(port)), check.GRPCService)
		chkReg.GRPCUseTLS = check.GRPCUseTLS
		if check.TLSSkipVerify {
			chkReg.TLSSkipVerify = true
		}

	default:
		return nil, fmt.Errorf("check type %+q not valid", check.Type)
	}
	return &chkReg, nil
}

// isNomadService returns true if the ID matches the pattern of a Nomad managed
// service (new or old formats). Agent services return false as independent
// client and server agents may be running on the same machine. #2827
func isNomadService(id string) bool {
	return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
}

// isOldNomadService returns true if the ID matches an old pattern managed by
// Nomad.
//
// Pre-0.7.1 task service IDs are of the form:
//
//	{nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
//
//	Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
//
func isOldNomadService(id string) bool {
	const prefix = nomadServicePrefix + "-executor"
	return strings.HasPrefix(id, prefix)
}
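
// Summary of address resolution (see getAddress below): "auto" resolves to
// "driver" when the driver network advertises an address and to "host"
// otherwise; "host" looks the port label up in the task's networks; "driver"
// uses the driver network's IP with either the driver's port map entry or a
// literal numeric port. An empty port label yields a zero port.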

// getAddress returns the IP and port to use for a service or check. If no port
// label is specified (an empty value), zero values are returned because no
// address could be resolved.
func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *drivers.DriverNetwork) (string, int, error) {
	switch addrMode {
	case structs.AddressModeAuto:
		if driverNet.Advertise() {
			addrMode = structs.AddressModeDriver
		} else {
			addrMode = structs.AddressModeHost
		}
		return getAddress(addrMode, portLabel, networks, driverNet)
	case structs.AddressModeHost:
		if portLabel == "" {
			if len(networks) != 1 {
				// If no networks are specified return zero
				// values. Consul will advertise the host IP
				// with no port. This is the pre-0.7.1 behavior
				// some people rely on.
				return "", 0, nil
			}

			return networks[0].IP, 0, nil
		}

		// Default path: use host ip:port
		ip, port := networks.Port(portLabel)
		if ip == "" && port <= 0 {
			return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel)
		}
		return ip, port, nil

	case structs.AddressModeDriver:
		// Require a driver network if driver address mode is used
		if driverNet == nil {
			return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`)
		}

		// If no port label is specified just return the IP
		if portLabel == "" {
			return driverNet.IP, 0, nil
		}

		// If the port is a label, use the driver's port (not the host's)
		if port, ok := driverNet.PortMap[portLabel]; ok {
			return driverNet.IP, port, nil
		}

		// If port isn't a label, try to parse it as a literal port number
		port, err := strconv.Atoi(portLabel)
		if err != nil {
			// Don't include the Atoi error message as the user likely
			// never intended it to be numeric and it creates a
			// confusing error message
			return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel)
		}
		if port <= 0 {
			return "", 0, fmt.Errorf("invalid port %q: port must be >0", portLabel)
		}

		return driverNet.IP, port, nil

	default:
		// Shouldn't happen due to validation, but enforce invariants
		return "", 0, fmt.Errorf("invalid address mode %q", addrMode)
	}
}
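
// Illustrative end-to-end sketch: consulClient, logger, and task are
// hypothetical values supplied by the caller; error handling elided.
//
//	sc := NewServiceClient(consulClient, logger, true)
//	go sc.Run()
//	_ = sc.RegisterTask(task)
//	// ... task runs ...
//	sc.RemoveTask(task)
//	_ = sc.Shutdown()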