github.com/smintz/nomad@v0.8.3/command/agent/consul/client.go (about) 1 package consul 2 3 import ( 4 "context" 5 "fmt" 6 "log" 7 "net" 8 "net/url" 9 "strconv" 10 "strings" 11 "sync" 12 "sync/atomic" 13 "time" 14 15 metrics "github.com/armon/go-metrics" 16 "github.com/hashicorp/consul/api" 17 "github.com/hashicorp/nomad/client/driver" 18 cstructs "github.com/hashicorp/nomad/client/structs" 19 "github.com/hashicorp/nomad/helper" 20 "github.com/hashicorp/nomad/nomad/structs" 21 ) 22 23 const ( 24 // nomadServicePrefix is the prefix that scopes all Nomad registered 25 // services (both agent and task entries). 26 nomadServicePrefix = "_nomad" 27 28 // nomadTaskPrefix is the prefix that scopes Nomad registered services 29 // for tasks. 30 nomadTaskPrefix = nomadServicePrefix + "-task-" 31 32 // defaultRetryInterval is how quickly to retry syncing services and 33 // checks to Consul when an error occurs. Will backoff up to a max. 34 defaultRetryInterval = time.Second 35 36 // defaultMaxRetryInterval is the default max retry interval. 37 defaultMaxRetryInterval = 30 * time.Second 38 39 // defaultPeriodicalInterval is the interval at which the service 40 // client reconciles state between the desired services and checks and 41 // what's actually registered in Consul. This is done at an interval, 42 // rather than being purely edge triggered, to handle the case that the 43 // Consul agent's state may change underneath us 44 defaultPeriodicInterval = 30 * time.Second 45 46 // ttlCheckBuffer is the time interval that Nomad can take to report Consul 47 // the check result 48 ttlCheckBuffer = 31 * time.Second 49 50 // defaultShutdownWait is how long Shutdown() should block waiting for 51 // enqueued operations to sync to Consul by default. 52 defaultShutdownWait = time.Minute 53 54 // DefaultQueryWaitDuration is the max duration the Consul Agent will 55 // spend waiting for a response from a Consul Query. 56 DefaultQueryWaitDuration = 2 * time.Second 57 58 // ServiceTagHTTP is the tag assigned to HTTP services 59 ServiceTagHTTP = "http" 60 61 // ServiceTagRPC is the tag assigned to RPC services 62 ServiceTagRPC = "rpc" 63 64 // ServiceTagSerf is the tag assigned to Serf services 65 ServiceTagSerf = "serf" 66 ) 67 68 // CatalogAPI is the consul/api.Catalog API used by Nomad. 69 type CatalogAPI interface { 70 Datacenters() ([]string, error) 71 Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error) 72 } 73 74 // AgentAPI is the consul/api.Agent API used by Nomad. 75 type AgentAPI interface { 76 Services() (map[string]*api.AgentService, error) 77 Checks() (map[string]*api.AgentCheck, error) 78 CheckRegister(check *api.AgentCheckRegistration) error 79 CheckDeregister(checkID string) error 80 Self() (map[string]map[string]interface{}, error) 81 ServiceRegister(service *api.AgentServiceRegistration) error 82 ServiceDeregister(serviceID string) error 83 UpdateTTL(id, output, status string) error 84 } 85 86 // operations are submitted to the main loop via commit() for synchronizing 87 // with Consul. 88 type operations struct { 89 regServices []*api.AgentServiceRegistration 90 regChecks []*api.AgentCheckRegistration 91 scripts []*scriptCheck 92 93 deregServices []string 94 deregChecks []string 95 } 96 97 // AllocRegistration holds the status of services registered for a particular 98 // allocations by task. 99 type AllocRegistration struct { 100 // Tasks maps the name of a task to its registered services and checks 101 Tasks map[string]*TaskRegistration 102 } 103 104 func (a *AllocRegistration) copy() *AllocRegistration { 105 c := &AllocRegistration{ 106 Tasks: make(map[string]*TaskRegistration, len(a.Tasks)), 107 } 108 109 for k, v := range a.Tasks { 110 c.Tasks[k] = v.copy() 111 } 112 113 return c 114 } 115 116 // NumServices returns the number of registered services 117 func (a *AllocRegistration) NumServices() int { 118 if a == nil { 119 return 0 120 } 121 122 total := 0 123 for _, treg := range a.Tasks { 124 for _, sreg := range treg.Services { 125 if sreg.Service != nil { 126 total++ 127 } 128 } 129 } 130 131 return total 132 } 133 134 // NumChecks returns the number of registered checks 135 func (a *AllocRegistration) NumChecks() int { 136 if a == nil { 137 return 0 138 } 139 140 total := 0 141 for _, treg := range a.Tasks { 142 for _, sreg := range treg.Services { 143 total += len(sreg.Checks) 144 } 145 } 146 147 return total 148 } 149 150 // TaskRegistration holds the status of services registered for a particular 151 // task. 152 type TaskRegistration struct { 153 Services map[string]*ServiceRegistration 154 } 155 156 func (t *TaskRegistration) copy() *TaskRegistration { 157 c := &TaskRegistration{ 158 Services: make(map[string]*ServiceRegistration, len(t.Services)), 159 } 160 161 for k, v := range t.Services { 162 c.Services[k] = v.copy() 163 } 164 165 return c 166 } 167 168 // ServiceRegistration holds the status of a registered Consul Service and its 169 // Checks. 170 type ServiceRegistration struct { 171 // serviceID and checkIDs are internal fields that track just the IDs of the 172 // services/checks registered in Consul. It is used to materialize the other 173 // fields when queried. 174 serviceID string 175 checkIDs map[string]struct{} 176 177 // Service is the AgentService registered in Consul. 178 Service *api.AgentService 179 180 // Checks is the status of the registered checks. 181 Checks []*api.AgentCheck 182 } 183 184 func (s *ServiceRegistration) copy() *ServiceRegistration { 185 // Copy does not copy the external fields but only the internal fields. This 186 // is so that the caller of AllocRegistrations can not access the internal 187 // fields and that method uses these fields to populate the external fields. 188 return &ServiceRegistration{ 189 serviceID: s.serviceID, 190 checkIDs: helper.CopyMapStringStruct(s.checkIDs), 191 } 192 } 193 194 // ServiceClient handles task and agent service registration with Consul. 195 type ServiceClient struct { 196 client AgentAPI 197 logger *log.Logger 198 retryInterval time.Duration 199 maxRetryInterval time.Duration 200 periodicInterval time.Duration 201 202 // exitCh is closed when the main Run loop exits 203 exitCh chan struct{} 204 205 // shutdownCh is closed when the client should shutdown 206 shutdownCh chan struct{} 207 208 // shutdownWait is how long Shutdown() blocks waiting for the final 209 // sync() to finish. Defaults to defaultShutdownWait 210 shutdownWait time.Duration 211 212 opCh chan *operations 213 214 services map[string]*api.AgentServiceRegistration 215 checks map[string]*api.AgentCheckRegistration 216 scripts map[string]*scriptCheck 217 runningScripts map[string]*scriptHandle 218 219 // allocRegistrations stores the services and checks that are registered 220 // with Consul by allocation ID. 221 allocRegistrations map[string]*AllocRegistration 222 allocRegistrationsLock sync.RWMutex 223 224 // agent services and checks record entries for the agent itself which 225 // should be removed on shutdown 226 agentServices map[string]struct{} 227 agentChecks map[string]struct{} 228 agentLock sync.Mutex 229 230 // seen is 1 if Consul has ever been seen; otherwise 0. Accessed with 231 // atomics. 232 seen int32 233 234 // checkWatcher restarts checks that are unhealthy. 235 checkWatcher *checkWatcher 236 } 237 238 // NewServiceClient creates a new Consul ServiceClient from an existing Consul API 239 // Client and logger. 240 func NewServiceClient(consulClient AgentAPI, logger *log.Logger) *ServiceClient { 241 return &ServiceClient{ 242 client: consulClient, 243 logger: logger, 244 retryInterval: defaultRetryInterval, 245 maxRetryInterval: defaultMaxRetryInterval, 246 periodicInterval: defaultPeriodicInterval, 247 exitCh: make(chan struct{}), 248 shutdownCh: make(chan struct{}), 249 shutdownWait: defaultShutdownWait, 250 opCh: make(chan *operations, 8), 251 services: make(map[string]*api.AgentServiceRegistration), 252 checks: make(map[string]*api.AgentCheckRegistration), 253 scripts: make(map[string]*scriptCheck), 254 runningScripts: make(map[string]*scriptHandle), 255 allocRegistrations: make(map[string]*AllocRegistration), 256 agentServices: make(map[string]struct{}), 257 agentChecks: make(map[string]struct{}), 258 checkWatcher: newCheckWatcher(logger, consulClient), 259 } 260 } 261 262 // seen is used by markSeen and hasSeen 263 const seen = 1 264 265 // markSeen marks Consul as having been seen (meaning at least one operation 266 // has succeeded). 267 func (c *ServiceClient) markSeen() { 268 atomic.StoreInt32(&c.seen, seen) 269 } 270 271 // hasSeen returns true if any Consul operation has ever succeeded. Useful to 272 // squelch errors if Consul isn't running. 273 func (c *ServiceClient) hasSeen() bool { 274 return atomic.LoadInt32(&c.seen) == seen 275 } 276 277 // Run the Consul main loop which retries operations against Consul. It should 278 // be called exactly once. 279 func (c *ServiceClient) Run() { 280 defer close(c.exitCh) 281 282 ctx, cancel := context.WithCancel(context.Background()) 283 defer cancel() 284 285 // init will be closed when Consul has been contacted 286 init := make(chan struct{}) 287 go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init) 288 289 // Process operations while waiting for initial contact with Consul but 290 // do not sync until contact has been made. 291 INIT: 292 for { 293 select { 294 case <-init: 295 c.markSeen() 296 break INIT 297 case <-c.shutdownCh: 298 return 299 case ops := <-c.opCh: 300 c.merge(ops) 301 } 302 } 303 c.logger.Printf("[TRACE] consul.sync: able to contact Consul") 304 305 // Block until contact with Consul has been established 306 // Start checkWatcher 307 go c.checkWatcher.Run(ctx) 308 309 // Always immediately sync to reconcile Nomad and Consul's state 310 retryTimer := time.NewTimer(0) 311 312 failures := 0 313 for { 314 select { 315 case <-retryTimer.C: 316 case <-c.shutdownCh: 317 // Cancel check watcher but sync one last time 318 cancel() 319 case ops := <-c.opCh: 320 c.merge(ops) 321 } 322 323 if err := c.sync(); err != nil { 324 if failures == 0 { 325 // Log on the first failure 326 c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err) 327 } else if failures%10 == 0 { 328 // Log every 10th consecutive failure 329 c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err) 330 } 331 332 failures++ 333 if !retryTimer.Stop() { 334 // Timer already expired, since the timer may 335 // or may not have been read in the select{} 336 // above, conditionally receive on it 337 select { 338 case <-retryTimer.C: 339 default: 340 } 341 } 342 backoff := c.retryInterval * time.Duration(failures) 343 if backoff > c.maxRetryInterval { 344 backoff = c.maxRetryInterval 345 } 346 retryTimer.Reset(backoff) 347 } else { 348 if failures > 0 { 349 c.logger.Printf("[INFO] consul.sync: successfully updated services in Consul") 350 failures = 0 351 } 352 353 // Reset timer to periodic interval to periodically 354 // reconile with Consul 355 if !retryTimer.Stop() { 356 select { 357 case <-retryTimer.C: 358 default: 359 } 360 } 361 retryTimer.Reset(c.periodicInterval) 362 } 363 364 select { 365 case <-c.shutdownCh: 366 // Exit only after sync'ing all outstanding operations 367 if len(c.opCh) > 0 { 368 for len(c.opCh) > 0 { 369 c.merge(<-c.opCh) 370 } 371 continue 372 } 373 return 374 default: 375 } 376 377 } 378 } 379 380 // commit operations unless already shutting down. 381 func (c *ServiceClient) commit(ops *operations) { 382 select { 383 case c.opCh <- ops: 384 case <-c.shutdownCh: 385 } 386 } 387 388 // merge registrations into state map prior to sync'ing with Consul 389 func (c *ServiceClient) merge(ops *operations) { 390 for _, s := range ops.regServices { 391 c.services[s.ID] = s 392 } 393 for _, check := range ops.regChecks { 394 c.checks[check.ID] = check 395 } 396 for _, s := range ops.scripts { 397 c.scripts[s.id] = s 398 } 399 for _, sid := range ops.deregServices { 400 delete(c.services, sid) 401 } 402 for _, cid := range ops.deregChecks { 403 if script, ok := c.runningScripts[cid]; ok { 404 script.cancel() 405 delete(c.scripts, cid) 406 delete(c.runningScripts, cid) 407 } 408 delete(c.checks, cid) 409 } 410 metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services))) 411 metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks))) 412 metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts))) 413 } 414 415 // sync enqueued operations. 416 func (c *ServiceClient) sync() error { 417 sreg, creg, sdereg, cdereg := 0, 0, 0, 0 418 419 consulServices, err := c.client.Services() 420 if err != nil { 421 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 422 return fmt.Errorf("error querying Consul services: %v", err) 423 } 424 425 consulChecks, err := c.client.Checks() 426 if err != nil { 427 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 428 return fmt.Errorf("error querying Consul checks: %v", err) 429 } 430 431 // Remove Nomad services in Consul but unknown locally 432 for id := range consulServices { 433 if _, ok := c.services[id]; ok { 434 // Known service, skip 435 continue 436 } 437 if !isNomadService(id) { 438 // Not managed by Nomad, skip 439 continue 440 } 441 442 // Unknown Nomad managed service; kill 443 if err := c.client.ServiceDeregister(id); err != nil { 444 if isOldNomadService(id) { 445 // Don't hard-fail on old entries. See #3620 446 continue 447 } 448 449 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 450 return err 451 } 452 sdereg++ 453 metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1) 454 } 455 456 // Add Nomad services missing from Consul 457 for id, locals := range c.services { 458 if _, ok := consulServices[id]; !ok { 459 if err = c.client.ServiceRegister(locals); err != nil { 460 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 461 return err 462 } 463 sreg++ 464 metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1) 465 } 466 } 467 468 // Remove Nomad checks in Consul but unknown locally 469 for id, check := range consulChecks { 470 if _, ok := c.checks[id]; ok { 471 // Known check, leave it 472 continue 473 } 474 if !isNomadService(check.ServiceID) { 475 // Service not managed by Nomad, skip 476 continue 477 } 478 479 // Unknown Nomad managed check; remove 480 if err := c.client.CheckDeregister(id); err != nil { 481 if isOldNomadService(check.ServiceID) { 482 // Don't hard-fail on old entries. 483 continue 484 } 485 486 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 487 return err 488 } 489 cdereg++ 490 metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1) 491 } 492 493 // Add Nomad checks missing from Consul 494 for id, check := range c.checks { 495 if _, ok := consulChecks[id]; ok { 496 // Already in Consul; skipping 497 continue 498 } 499 500 if err := c.client.CheckRegister(check); err != nil { 501 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 502 return err 503 } 504 creg++ 505 metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1) 506 507 // Handle starting scripts 508 if script, ok := c.scripts[id]; ok { 509 // If it's already running, cancel and replace 510 if oldScript, running := c.runningScripts[id]; running { 511 oldScript.cancel() 512 } 513 // Start and store the handle 514 c.runningScripts[id] = script.run() 515 } 516 } 517 518 c.logger.Printf("[DEBUG] consul.sync: registered %d services, %d checks; deregistered %d services, %d checks", 519 sreg, creg, sdereg, cdereg) 520 return nil 521 } 522 523 // RegisterAgent registers Nomad agents (client or server). The 524 // Service.PortLabel should be a literal port to be parsed with SplitHostPort. 525 // Script checks are not supported and will return an error. Registration is 526 // asynchronous. 527 // 528 // Agents will be deregistered when Shutdown is called. 529 func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error { 530 ops := operations{} 531 532 for _, service := range services { 533 id := makeAgentServiceID(role, service) 534 535 // Unlike tasks, agents don't use port labels. Agent ports are 536 // stored directly in the PortLabel. 537 host, rawport, err := net.SplitHostPort(service.PortLabel) 538 if err != nil { 539 return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err) 540 } 541 port, err := strconv.Atoi(rawport) 542 if err != nil { 543 return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err) 544 } 545 serviceReg := &api.AgentServiceRegistration{ 546 ID: id, 547 Name: service.Name, 548 Tags: service.Tags, 549 Address: host, 550 Port: port, 551 } 552 ops.regServices = append(ops.regServices, serviceReg) 553 554 for _, check := range service.Checks { 555 checkID := makeCheckID(id, check) 556 if check.Type == structs.ServiceCheckScript { 557 return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name) 558 } 559 checkHost, checkPort := serviceReg.Address, serviceReg.Port 560 if check.PortLabel != "" { 561 // Unlike tasks, agents don't use port labels. Agent ports are 562 // stored directly in the PortLabel. 563 host, rawport, err := net.SplitHostPort(check.PortLabel) 564 if err != nil { 565 return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err) 566 } 567 port, err := strconv.Atoi(rawport) 568 if err != nil { 569 return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err) 570 } 571 checkHost, checkPort = host, port 572 } 573 checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort) 574 if err != nil { 575 return fmt.Errorf("failed to add check %q: %v", check.Name, err) 576 } 577 ops.regChecks = append(ops.regChecks, checkReg) 578 } 579 } 580 581 // Don't bother committing agent checks if we're already shutting down 582 c.agentLock.Lock() 583 defer c.agentLock.Unlock() 584 select { 585 case <-c.shutdownCh: 586 return nil 587 default: 588 } 589 590 // Now add them to the registration queue 591 c.commit(&ops) 592 593 // Record IDs for deregistering on shutdown 594 for _, id := range ops.regServices { 595 c.agentServices[id.ID] = struct{}{} 596 } 597 for _, id := range ops.regChecks { 598 c.agentChecks[id.ID] = struct{}{} 599 } 600 return nil 601 } 602 603 // serviceRegs creates service registrations, check registrations, and script 604 // checks from a service. It returns a service registration object with the 605 // service and check IDs populated. 606 func (c *ServiceClient) serviceRegs(ops *operations, allocID string, service *structs.Service, 607 task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) (*ServiceRegistration, error) { 608 609 // Get the services ID 610 id := makeTaskServiceID(allocID, task.Name, service) 611 sreg := &ServiceRegistration{ 612 serviceID: id, 613 checkIDs: make(map[string]struct{}, len(service.Checks)), 614 } 615 616 // Service address modes default to auto 617 addrMode := service.AddressMode 618 if addrMode == "" { 619 addrMode = structs.AddressModeAuto 620 } 621 622 // Determine the address to advertise based on the mode 623 ip, port, err := getAddress(addrMode, service.PortLabel, task.Resources.Networks, net) 624 if err != nil { 625 return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err) 626 } 627 628 // Build the Consul Service registration request 629 serviceReg := &api.AgentServiceRegistration{ 630 ID: id, 631 Name: service.Name, 632 Tags: make([]string, len(service.Tags)), 633 Address: ip, 634 Port: port, 635 } 636 // copy isn't strictly necessary but can avoid bugs especially 637 // with tests that may reuse Tasks 638 copy(serviceReg.Tags, service.Tags) 639 ops.regServices = append(ops.regServices, serviceReg) 640 641 // Build the check registrations 642 checkIDs, err := c.checkRegs(ops, allocID, id, service, task, exec, net) 643 if err != nil { 644 return nil, err 645 } 646 for _, cid := range checkIDs { 647 sreg.checkIDs[cid] = struct{}{} 648 } 649 return sreg, nil 650 } 651 652 // checkRegs registers the checks for the given service and returns the 653 // registered check ids. 654 func (c *ServiceClient) checkRegs(ops *operations, allocID, serviceID string, service *structs.Service, 655 task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) ([]string, error) { 656 657 // Fast path 658 numChecks := len(service.Checks) 659 if numChecks == 0 { 660 return nil, nil 661 } 662 663 checkIDs := make([]string, 0, numChecks) 664 for _, check := range service.Checks { 665 checkID := makeCheckID(serviceID, check) 666 checkIDs = append(checkIDs, checkID) 667 if check.Type == structs.ServiceCheckScript { 668 if exec == nil { 669 return nil, fmt.Errorf("driver doesn't support script checks") 670 } 671 ops.scripts = append(ops.scripts, newScriptCheck( 672 allocID, task.Name, checkID, check, exec, c.client, c.logger, c.shutdownCh)) 673 674 // Skip getAddress for script checks 675 checkReg, err := createCheckReg(serviceID, checkID, check, "", 0) 676 if err != nil { 677 return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err) 678 } 679 ops.regChecks = append(ops.regChecks, checkReg) 680 continue 681 } 682 683 // Default to the service's port but allow check to override 684 portLabel := check.PortLabel 685 if portLabel == "" { 686 // Default to the service's port label 687 portLabel = service.PortLabel 688 } 689 690 // Checks address mode defaults to host for pre-#3380 backward compat 691 addrMode := check.AddressMode 692 if addrMode == "" { 693 addrMode = structs.AddressModeHost 694 } 695 696 ip, port, err := getAddress(addrMode, portLabel, task.Resources.Networks, net) 697 if err != nil { 698 return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err) 699 } 700 701 checkReg, err := createCheckReg(serviceID, checkID, check, ip, port) 702 if err != nil { 703 return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err) 704 } 705 ops.regChecks = append(ops.regChecks, checkReg) 706 } 707 return checkIDs, nil 708 } 709 710 // RegisterTask with Consul. Adds all service entries and checks to Consul. If 711 // exec is nil and a script check exists an error is returned. 712 // 713 // If the service IP is set it used as the address in the service registration. 714 // Checks will always use the IP from the Task struct (host's IP). 715 // 716 // Actual communication with Consul is done asynchronously (see Run). 717 func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { 718 // Fast path 719 numServices := len(task.Services) 720 if numServices == 0 { 721 return nil 722 } 723 724 t := new(TaskRegistration) 725 t.Services = make(map[string]*ServiceRegistration, numServices) 726 727 ops := &operations{} 728 for _, service := range task.Services { 729 sreg, err := c.serviceRegs(ops, allocID, service, task, exec, net) 730 if err != nil { 731 return err 732 } 733 t.Services[sreg.serviceID] = sreg 734 } 735 736 // Add the task to the allocation's registration 737 c.addTaskRegistration(allocID, task.Name, t) 738 739 c.commit(ops) 740 741 // Start watching checks. Done after service registrations are built 742 // since an error building them could leak watches. 743 for _, service := range task.Services { 744 serviceID := makeTaskServiceID(allocID, task.Name, service) 745 for _, check := range service.Checks { 746 if check.TriggersRestarts() { 747 checkID := makeCheckID(serviceID, check) 748 c.checkWatcher.Watch(allocID, task.Name, checkID, check, restarter) 749 } 750 } 751 } 752 return nil 753 } 754 755 // UpdateTask in Consul. Does not alter the service if only checks have 756 // changed. 757 // 758 // DriverNetwork must not change between invocations for the same allocation. 759 func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { 760 ops := &operations{} 761 762 taskReg := new(TaskRegistration) 763 taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services)) 764 765 existingIDs := make(map[string]*structs.Service, len(existing.Services)) 766 for _, s := range existing.Services { 767 existingIDs[makeTaskServiceID(allocID, existing.Name, s)] = s 768 } 769 newIDs := make(map[string]*structs.Service, len(newTask.Services)) 770 for _, s := range newTask.Services { 771 newIDs[makeTaskServiceID(allocID, newTask.Name, s)] = s 772 } 773 774 // Loop over existing Service IDs to see if they have been removed or 775 // updated. 776 for existingID, existingSvc := range existingIDs { 777 newSvc, ok := newIDs[existingID] 778 if !ok { 779 // Existing service entry removed 780 ops.deregServices = append(ops.deregServices, existingID) 781 for _, check := range existingSvc.Checks { 782 cid := makeCheckID(existingID, check) 783 ops.deregChecks = append(ops.deregChecks, cid) 784 785 // Unwatch watched checks 786 if check.TriggersRestarts() { 787 c.checkWatcher.Unwatch(cid) 788 } 789 } 790 continue 791 } 792 793 // Service exists and hasn't changed, don't re-add it later 794 delete(newIDs, existingID) 795 796 // Service still exists so add it to the task's registration 797 sreg := &ServiceRegistration{ 798 serviceID: existingID, 799 checkIDs: make(map[string]struct{}, len(newSvc.Checks)), 800 } 801 taskReg.Services[existingID] = sreg 802 803 // See if any checks were updated 804 existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks)) 805 for _, check := range existingSvc.Checks { 806 existingChecks[makeCheckID(existingID, check)] = check 807 } 808 809 // Register new checks 810 for _, check := range newSvc.Checks { 811 checkID := makeCheckID(existingID, check) 812 if _, exists := existingChecks[checkID]; exists { 813 // Check exists, so don't remove it 814 delete(existingChecks, checkID) 815 sreg.checkIDs[checkID] = struct{}{} 816 } 817 818 // New check on an unchanged service; add them now 819 newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net) 820 if err != nil { 821 return err 822 } 823 824 for _, checkID := range newCheckIDs { 825 sreg.checkIDs[checkID] = struct{}{} 826 827 } 828 829 // Update all watched checks as CheckRestart fields aren't part of ID 830 if check.TriggersRestarts() { 831 c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter) 832 } 833 } 834 835 // Remove existing checks not in updated service 836 for cid, check := range existingChecks { 837 ops.deregChecks = append(ops.deregChecks, cid) 838 839 // Unwatch checks 840 if check.TriggersRestarts() { 841 c.checkWatcher.Unwatch(cid) 842 } 843 } 844 } 845 846 // Any remaining services should just be enqueued directly 847 for _, newSvc := range newIDs { 848 sreg, err := c.serviceRegs(ops, allocID, newSvc, newTask, exec, net) 849 if err != nil { 850 return err 851 } 852 853 taskReg.Services[sreg.serviceID] = sreg 854 } 855 856 // Add the task to the allocation's registration 857 c.addTaskRegistration(allocID, newTask.Name, taskReg) 858 859 c.commit(ops) 860 861 // Start watching checks. Done after service registrations are built 862 // since an error building them could leak watches. 863 for _, service := range newIDs { 864 serviceID := makeTaskServiceID(allocID, newTask.Name, service) 865 for _, check := range service.Checks { 866 if check.TriggersRestarts() { 867 checkID := makeCheckID(serviceID, check) 868 c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter) 869 } 870 } 871 } 872 return nil 873 } 874 875 // RemoveTask from Consul. Removes all service entries and checks. 876 // 877 // Actual communication with Consul is done asynchronously (see Run). 878 func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) { 879 ops := operations{} 880 881 for _, service := range task.Services { 882 id := makeTaskServiceID(allocID, task.Name, service) 883 ops.deregServices = append(ops.deregServices, id) 884 885 for _, check := range service.Checks { 886 cid := makeCheckID(id, check) 887 ops.deregChecks = append(ops.deregChecks, cid) 888 889 if check.TriggersRestarts() { 890 c.checkWatcher.Unwatch(cid) 891 } 892 } 893 } 894 895 // Remove the task from the alloc's registrations 896 c.removeTaskRegistration(allocID, task.Name) 897 898 // Now add them to the deregistration fields; main Run loop will update 899 c.commit(&ops) 900 } 901 902 // AllocRegistrations returns the registrations for the given allocation. If the 903 // allocation has no reservations, the response is a nil object. 904 func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) { 905 // Get the internal struct using the lock 906 c.allocRegistrationsLock.RLock() 907 regInternal, ok := c.allocRegistrations[allocID] 908 if !ok { 909 c.allocRegistrationsLock.RUnlock() 910 return nil, nil 911 } 912 913 // Copy so we don't expose internal structs 914 reg := regInternal.copy() 915 c.allocRegistrationsLock.RUnlock() 916 917 // Query the services and checks to populate the allocation registrations. 918 services, err := c.client.Services() 919 if err != nil { 920 return nil, err 921 } 922 923 checks, err := c.client.Checks() 924 if err != nil { 925 return nil, err 926 } 927 928 // Populate the object 929 for _, treg := range reg.Tasks { 930 for serviceID, sreg := range treg.Services { 931 sreg.Service = services[serviceID] 932 for checkID := range sreg.checkIDs { 933 if check, ok := checks[checkID]; ok { 934 sreg.Checks = append(sreg.Checks, check) 935 } 936 } 937 } 938 } 939 940 return reg, nil 941 } 942 943 // Shutdown the Consul client. Update running task registrations and deregister 944 // agent from Consul. On first call blocks up to shutdownWait before giving up 945 // on syncing operations. 946 func (c *ServiceClient) Shutdown() error { 947 // Serialize Shutdown calls with RegisterAgent to prevent leaking agent 948 // entries. 949 c.agentLock.Lock() 950 defer c.agentLock.Unlock() 951 select { 952 case <-c.shutdownCh: 953 return nil 954 default: 955 close(c.shutdownCh) 956 } 957 958 // Give run loop time to sync, but don't block indefinitely 959 deadline := time.After(c.shutdownWait) 960 961 // Wait for Run to finish any outstanding operations and exit 962 select { 963 case <-c.exitCh: 964 case <-deadline: 965 // Don't wait forever though 966 } 967 968 // If Consul was never seen nothing could be written so exit early 969 if !c.hasSeen() { 970 return nil 971 } 972 973 // Always attempt to deregister Nomad agent Consul entries, even if 974 // deadline was reached 975 for id := range c.agentServices { 976 if err := c.client.ServiceDeregister(id); err != nil { 977 c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err) 978 } 979 } 980 for id := range c.agentChecks { 981 if err := c.client.CheckDeregister(id); err != nil { 982 c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err) 983 } 984 } 985 986 // Give script checks time to exit (no need to lock as Run() has exited) 987 for _, h := range c.runningScripts { 988 select { 989 case <-h.wait(): 990 case <-deadline: 991 return fmt.Errorf("timed out waiting for script checks to run") 992 } 993 } 994 return nil 995 } 996 997 // addTaskRegistration adds the task registration for the given allocation. 998 func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) { 999 c.allocRegistrationsLock.Lock() 1000 defer c.allocRegistrationsLock.Unlock() 1001 1002 alloc, ok := c.allocRegistrations[allocID] 1003 if !ok { 1004 alloc = &AllocRegistration{ 1005 Tasks: make(map[string]*TaskRegistration), 1006 } 1007 c.allocRegistrations[allocID] = alloc 1008 } 1009 alloc.Tasks[taskName] = reg 1010 } 1011 1012 // removeTaskRegistration removes the task registration for the given allocation. 1013 func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) { 1014 c.allocRegistrationsLock.Lock() 1015 defer c.allocRegistrationsLock.Unlock() 1016 1017 alloc, ok := c.allocRegistrations[allocID] 1018 if !ok { 1019 return 1020 } 1021 1022 // Delete the task and if it is the last one also delete the alloc's 1023 // registration 1024 delete(alloc.Tasks, taskName) 1025 if len(alloc.Tasks) == 0 { 1026 delete(c.allocRegistrations, allocID) 1027 } 1028 } 1029 1030 // makeAgentServiceID creates a unique ID for identifying an agent service in 1031 // Consul. 1032 // 1033 // Agent service IDs are of the form: 1034 // 1035 // {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}) 1036 // Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4 1037 // Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l 1038 // 1039 func makeAgentServiceID(role string, service *structs.Service) string { 1040 return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "")) 1041 } 1042 1043 // makeTaskServiceID creates a unique ID for identifying a task service in 1044 // Consul. All structs.Service fields are included in the ID's hash except 1045 // Checks. This allows updates to merely compare IDs. 1046 // 1047 // Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH 1048 func makeTaskServiceID(allocID, taskName string, service *structs.Service) string { 1049 return nomadTaskPrefix + service.Hash(allocID, taskName) 1050 } 1051 1052 // makeCheckID creates a unique ID for a check. 1053 func makeCheckID(serviceID string, check *structs.ServiceCheck) string { 1054 return check.Hash(serviceID) 1055 } 1056 1057 // createCheckReg creates a Check that can be registered with Consul. 1058 // 1059 // Script checks simply have a TTL set and the caller is responsible for 1060 // running the script and heartbeating. 1061 func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) { 1062 chkReg := api.AgentCheckRegistration{ 1063 ID: checkID, 1064 Name: check.Name, 1065 ServiceID: serviceID, 1066 } 1067 chkReg.Status = check.InitialStatus 1068 chkReg.Timeout = check.Timeout.String() 1069 chkReg.Interval = check.Interval.String() 1070 1071 // Require an address for http or tcp checks 1072 if port == 0 && check.RequiresPort() { 1073 return nil, fmt.Errorf("%s checks require an address", check.Type) 1074 } 1075 1076 switch check.Type { 1077 case structs.ServiceCheckHTTP: 1078 proto := check.Protocol 1079 if proto == "" { 1080 proto = "http" 1081 } 1082 if check.TLSSkipVerify { 1083 chkReg.TLSSkipVerify = true 1084 } 1085 base := url.URL{ 1086 Scheme: proto, 1087 Host: net.JoinHostPort(host, strconv.Itoa(port)), 1088 } 1089 relative, err := url.Parse(check.Path) 1090 if err != nil { 1091 return nil, err 1092 } 1093 url := base.ResolveReference(relative) 1094 chkReg.HTTP = url.String() 1095 chkReg.Method = check.Method 1096 chkReg.Header = check.Header 1097 case structs.ServiceCheckTCP: 1098 chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port)) 1099 case structs.ServiceCheckScript: 1100 chkReg.TTL = (check.Interval + ttlCheckBuffer).String() 1101 // As of Consul 1.0.0 setting TTL and Interval is a 400 1102 chkReg.Interval = "" 1103 default: 1104 return nil, fmt.Errorf("check type %+q not valid", check.Type) 1105 } 1106 return &chkReg, nil 1107 } 1108 1109 // isNomadService returns true if the ID matches the pattern of a Nomad managed 1110 // service (new or old formats). Agent services return false as independent 1111 // client and server agents may be running on the same machine. #2827 1112 func isNomadService(id string) bool { 1113 return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id) 1114 } 1115 1116 // isOldNomadService returns true if the ID matches an old pattern managed by 1117 // Nomad. 1118 // 1119 // Pre-0.7.1 task service IDs are of the form: 1120 // 1121 // {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} 1122 // Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 1123 // 1124 func isOldNomadService(id string) bool { 1125 const prefix = nomadServicePrefix + "-executor" 1126 return strings.HasPrefix(id, prefix) 1127 } 1128 1129 // getAddress returns the IP and port to use for a service or check. If no port 1130 // label is specified (an empty value), zero values are returned because no 1131 // address could be resolved. 1132 func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *cstructs.DriverNetwork) (string, int, error) { 1133 switch addrMode { 1134 case structs.AddressModeAuto: 1135 if driverNet.Advertise() { 1136 addrMode = structs.AddressModeDriver 1137 } else { 1138 addrMode = structs.AddressModeHost 1139 } 1140 return getAddress(addrMode, portLabel, networks, driverNet) 1141 case structs.AddressModeHost: 1142 if portLabel == "" { 1143 if len(networks) != 1 { 1144 // If no networks are specified return zero 1145 // values. Consul will advertise the host IP 1146 // with no port. This is the pre-0.7.1 behavior 1147 // some people rely on. 1148 return "", 0, nil 1149 } 1150 1151 return networks[0].IP, 0, nil 1152 } 1153 1154 // Default path: use host ip:port 1155 ip, port := networks.Port(portLabel) 1156 if ip == "" && port <= 0 { 1157 return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel) 1158 } 1159 return ip, port, nil 1160 1161 case structs.AddressModeDriver: 1162 // Require a driver network if driver address mode is used 1163 if driverNet == nil { 1164 return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`) 1165 } 1166 1167 // If no port label is specified just return the IP 1168 if portLabel == "" { 1169 return driverNet.IP, 0, nil 1170 } 1171 1172 // If the port is a label, use the driver's port (not the host's) 1173 if port, ok := driverNet.PortMap[portLabel]; ok { 1174 return driverNet.IP, port, nil 1175 } 1176 1177 // If port isn't a label, try to parse it as a literal port number 1178 port, err := strconv.Atoi(portLabel) 1179 if err != nil { 1180 // Don't include Atoi error message as user likely 1181 // never intended it to be a numeric and it creates a 1182 // confusing error message 1183 return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel) 1184 } 1185 if port <= 0 { 1186 return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel) 1187 } 1188 1189 return driverNet.IP, port, nil 1190 1191 default: 1192 // Shouldn't happen due to validation, but enforce invariants 1193 return "", 0, fmt.Errorf("invalid address mode %q", addrMode) 1194 } 1195 }