// Source: github.com/nir0s/nomad@v0.8.7-rc1/command/agent/consul/client.go

package consul

import (
	"context"
	"fmt"
	"log"
	"net"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	metrics "github.com/armon/go-metrics"
	"github.com/hashicorp/consul/api"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// nomadServicePrefix is the prefix that scopes all Nomad registered
	// services (both agent and task entries).
	nomadServicePrefix = "_nomad"

	// nomadTaskPrefix is the prefix that scopes Nomad registered services
	// for tasks.
	nomadTaskPrefix = nomadServicePrefix + "-task-"

	// defaultRetryInterval is how quickly to retry syncing services and
	// checks to Consul when an error occurs. Will backoff up to a max.
	defaultRetryInterval = time.Second

	// defaultMaxRetryInterval is the default max retry interval.
	defaultMaxRetryInterval = 30 * time.Second

	// defaultPeriodicInterval is the interval at which the service
	// client reconciles state between the desired services and checks and
	// what's actually registered in Consul. This is done at an interval,
	// rather than being purely edge triggered, to handle the case that the
	// Consul agent's state may change underneath us.
	defaultPeriodicInterval = 30 * time.Second

	// ttlCheckBuffer is the extra time, on top of a script check's
	// interval, that Nomad is given to report the check result to Consul.
	// It is added to the interval to form the TTL of script checks.
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services
	ServiceTagSerf = "serf"
)

// CatalogAPI is the consul/api.Catalog API used by Nomad.
type CatalogAPI interface {
	Datacenters() ([]string, error)
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}

// AgentAPI is the consul/api.Agent API used by Nomad.
type AgentAPI interface {
	Services() (map[string]*api.AgentService, error)
	Checks() (map[string]*api.AgentCheck, error)
	CheckRegister(check *api.AgentCheckRegistration) error
	CheckDeregister(checkID string) error
	Self() (map[string]map[string]interface{}, error)
	ServiceRegister(service *api.AgentServiceRegistration) error
	ServiceDeregister(serviceID string) error
	UpdateTTL(id, output, status string) error
}

// operations are submitted to the main loop via commit() for synchronizing
// with Consul.
type operations struct {
	// Registrations to add to the desired state.
	regServices []*api.AgentServiceRegistration
	regChecks   []*api.AgentCheckRegistration
	scripts     []*scriptCheck

	// IDs of services and checks to drop from the desired state.
	deregServices []string
	deregChecks   []string
}

// AllocRegistration holds the status of services registered for a particular
// allocation by task.
98 type AllocRegistration struct { 99 // Tasks maps the name of a task to its registered services and checks 100 Tasks map[string]*TaskRegistration 101 } 102 103 func (a *AllocRegistration) copy() *AllocRegistration { 104 c := &AllocRegistration{ 105 Tasks: make(map[string]*TaskRegistration, len(a.Tasks)), 106 } 107 108 for k, v := range a.Tasks { 109 c.Tasks[k] = v.copy() 110 } 111 112 return c 113 } 114 115 // NumServices returns the number of registered services 116 func (a *AllocRegistration) NumServices() int { 117 if a == nil { 118 return 0 119 } 120 121 total := 0 122 for _, treg := range a.Tasks { 123 for _, sreg := range treg.Services { 124 if sreg.Service != nil { 125 total++ 126 } 127 } 128 } 129 130 return total 131 } 132 133 // NumChecks returns the number of registered checks 134 func (a *AllocRegistration) NumChecks() int { 135 if a == nil { 136 return 0 137 } 138 139 total := 0 140 for _, treg := range a.Tasks { 141 for _, sreg := range treg.Services { 142 total += len(sreg.Checks) 143 } 144 } 145 146 return total 147 } 148 149 // TaskRegistration holds the status of services registered for a particular 150 // task. 151 type TaskRegistration struct { 152 Services map[string]*ServiceRegistration 153 } 154 155 func (t *TaskRegistration) copy() *TaskRegistration { 156 c := &TaskRegistration{ 157 Services: make(map[string]*ServiceRegistration, len(t.Services)), 158 } 159 160 for k, v := range t.Services { 161 c.Services[k] = v.copy() 162 } 163 164 return c 165 } 166 167 // ServiceRegistration holds the status of a registered Consul Service and its 168 // Checks. 169 type ServiceRegistration struct { 170 // serviceID and checkIDs are internal fields that track just the IDs of the 171 // services/checks registered in Consul. It is used to materialize the other 172 // fields when queried. 173 serviceID string 174 checkIDs map[string]struct{} 175 176 // Service is the AgentService registered in Consul. 
177 Service *api.AgentService 178 179 // Checks is the status of the registered checks. 180 Checks []*api.AgentCheck 181 } 182 183 func (s *ServiceRegistration) copy() *ServiceRegistration { 184 // Copy does not copy the external fields but only the internal fields. This 185 // is so that the caller of AllocRegistrations can not access the internal 186 // fields and that method uses these fields to populate the external fields. 187 return &ServiceRegistration{ 188 serviceID: s.serviceID, 189 checkIDs: helper.CopyMapStringStruct(s.checkIDs), 190 } 191 } 192 193 // ServiceClient handles task and agent service registration with Consul. 194 type ServiceClient struct { 195 client AgentAPI 196 logger *log.Logger 197 retryInterval time.Duration 198 maxRetryInterval time.Duration 199 periodicInterval time.Duration 200 201 // exitCh is closed when the main Run loop exits 202 exitCh chan struct{} 203 204 // shutdownCh is closed when the client should shutdown 205 shutdownCh chan struct{} 206 207 // shutdownWait is how long Shutdown() blocks waiting for the final 208 // sync() to finish. Defaults to defaultShutdownWait 209 shutdownWait time.Duration 210 211 opCh chan *operations 212 213 services map[string]*api.AgentServiceRegistration 214 checks map[string]*api.AgentCheckRegistration 215 scripts map[string]*scriptCheck 216 runningScripts map[string]*scriptHandle 217 218 // allocRegistrations stores the services and checks that are registered 219 // with Consul by allocation ID. 220 allocRegistrations map[string]*AllocRegistration 221 allocRegistrationsLock sync.RWMutex 222 223 // agent services and checks record entries for the agent itself which 224 // should be removed on shutdown 225 agentServices map[string]struct{} 226 agentChecks map[string]struct{} 227 agentLock sync.Mutex 228 229 // seen is 1 if Consul has ever been seen; otherwise 0. Accessed with 230 // atomics. 231 seen int32 232 233 // checkWatcher restarts checks that are unhealthy. 
234 checkWatcher *checkWatcher 235 236 // isClientAgent specifies whether this Consul client is being used 237 // by a Nomad client. 238 isClientAgent bool 239 } 240 241 // NewServiceClient creates a new Consul ServiceClient from an existing Consul API 242 // Client, logger and takes whether the client is being used by a Nomad Client agent. 243 // When being used by a Nomad client, this Consul client reconciles all services and 244 // checks created by Nomad on behalf of running tasks. 245 func NewServiceClient(consulClient AgentAPI, logger *log.Logger, isNomadClient bool) *ServiceClient { 246 return &ServiceClient{ 247 client: consulClient, 248 logger: logger, 249 retryInterval: defaultRetryInterval, 250 maxRetryInterval: defaultMaxRetryInterval, 251 periodicInterval: defaultPeriodicInterval, 252 exitCh: make(chan struct{}), 253 shutdownCh: make(chan struct{}), 254 shutdownWait: defaultShutdownWait, 255 opCh: make(chan *operations, 8), 256 services: make(map[string]*api.AgentServiceRegistration), 257 checks: make(map[string]*api.AgentCheckRegistration), 258 scripts: make(map[string]*scriptCheck), 259 runningScripts: make(map[string]*scriptHandle), 260 allocRegistrations: make(map[string]*AllocRegistration), 261 agentServices: make(map[string]struct{}), 262 agentChecks: make(map[string]struct{}), 263 checkWatcher: newCheckWatcher(logger, consulClient), 264 isClientAgent: isNomadClient, 265 } 266 } 267 268 // seen is used by markSeen and hasSeen 269 const seen = 1 270 271 // markSeen marks Consul as having been seen (meaning at least one operation 272 // has succeeded). 273 func (c *ServiceClient) markSeen() { 274 atomic.StoreInt32(&c.seen, seen) 275 } 276 277 // hasSeen returns true if any Consul operation has ever succeeded. Useful to 278 // squelch errors if Consul isn't running. 279 func (c *ServiceClient) hasSeen() bool { 280 return atomic.LoadInt32(&c.seen) == seen 281 } 282 283 // Run the Consul main loop which retries operations against Consul. 
It should 284 // be called exactly once. 285 func (c *ServiceClient) Run() { 286 defer close(c.exitCh) 287 288 ctx, cancel := context.WithCancel(context.Background()) 289 defer cancel() 290 291 // init will be closed when Consul has been contacted 292 init := make(chan struct{}) 293 go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init) 294 295 // Process operations while waiting for initial contact with Consul but 296 // do not sync until contact has been made. 297 INIT: 298 for { 299 select { 300 case <-init: 301 c.markSeen() 302 break INIT 303 case <-c.shutdownCh: 304 return 305 case ops := <-c.opCh: 306 c.merge(ops) 307 } 308 } 309 c.logger.Printf("[TRACE] consul.sync: able to contact Consul") 310 311 // Block until contact with Consul has been established 312 // Start checkWatcher 313 go c.checkWatcher.Run(ctx) 314 315 // Always immediately sync to reconcile Nomad and Consul's state 316 retryTimer := time.NewTimer(0) 317 318 failures := 0 319 for { 320 select { 321 case <-retryTimer.C: 322 case <-c.shutdownCh: 323 // Cancel check watcher but sync one last time 324 cancel() 325 case ops := <-c.opCh: 326 c.merge(ops) 327 } 328 329 if err := c.sync(); err != nil { 330 if failures == 0 { 331 // Log on the first failure 332 c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err) 333 } else if failures%10 == 0 { 334 // Log every 10th consecutive failure 335 c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err) 336 } 337 338 failures++ 339 if !retryTimer.Stop() { 340 // Timer already expired, since the timer may 341 // or may not have been read in the select{} 342 // above, conditionally receive on it 343 select { 344 case <-retryTimer.C: 345 default: 346 } 347 } 348 backoff := c.retryInterval * time.Duration(failures) 349 if backoff > c.maxRetryInterval { 350 backoff = c.maxRetryInterval 351 } 352 retryTimer.Reset(backoff) 353 } else { 354 if failures > 0 { 
355 c.logger.Printf("[INFO] consul.sync: successfully updated services in Consul") 356 failures = 0 357 } 358 359 // Reset timer to periodic interval to periodically 360 // reconile with Consul 361 if !retryTimer.Stop() { 362 select { 363 case <-retryTimer.C: 364 default: 365 } 366 } 367 retryTimer.Reset(c.periodicInterval) 368 } 369 370 select { 371 case <-c.shutdownCh: 372 // Exit only after sync'ing all outstanding operations 373 if len(c.opCh) > 0 { 374 for len(c.opCh) > 0 { 375 c.merge(<-c.opCh) 376 } 377 continue 378 } 379 return 380 default: 381 } 382 383 } 384 } 385 386 // commit operations unless already shutting down. 387 func (c *ServiceClient) commit(ops *operations) { 388 select { 389 case c.opCh <- ops: 390 case <-c.shutdownCh: 391 } 392 } 393 394 // merge registrations into state map prior to sync'ing with Consul 395 func (c *ServiceClient) merge(ops *operations) { 396 for _, s := range ops.regServices { 397 c.services[s.ID] = s 398 } 399 for _, check := range ops.regChecks { 400 c.checks[check.ID] = check 401 } 402 for _, s := range ops.scripts { 403 c.scripts[s.id] = s 404 } 405 for _, sid := range ops.deregServices { 406 delete(c.services, sid) 407 } 408 for _, cid := range ops.deregChecks { 409 if script, ok := c.runningScripts[cid]; ok { 410 script.cancel() 411 delete(c.scripts, cid) 412 delete(c.runningScripts, cid) 413 } 414 delete(c.checks, cid) 415 } 416 metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services))) 417 metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks))) 418 metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts))) 419 } 420 421 // sync enqueued operations. 
422 func (c *ServiceClient) sync() error { 423 sreg, creg, sdereg, cdereg := 0, 0, 0, 0 424 425 consulServices, err := c.client.Services() 426 if err != nil { 427 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 428 return fmt.Errorf("error querying Consul services: %v", err) 429 } 430 431 consulChecks, err := c.client.Checks() 432 if err != nil { 433 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 434 return fmt.Errorf("error querying Consul checks: %v", err) 435 } 436 437 // Remove Nomad services in Consul but unknown locally 438 for id := range consulServices { 439 if _, ok := c.services[id]; ok { 440 // Known service, skip 441 continue 442 } 443 444 // Ignore if this is not a Nomad managed service. Also ignore 445 // Nomad managed services if this is not a client agent. 446 // This is to prevent server agents from removing services 447 // registered by client agents 448 if !isNomadService(id) || !c.isClientAgent { 449 // Not managed by Nomad, skip 450 continue 451 } 452 453 // Unknown Nomad managed service; kill 454 if err := c.client.ServiceDeregister(id); err != nil { 455 if isOldNomadService(id) { 456 // Don't hard-fail on old entries. 
See #3620 457 continue 458 } 459 460 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 461 return err 462 } 463 sdereg++ 464 metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1) 465 } 466 467 // Add Nomad services missing from Consul 468 for id, locals := range c.services { 469 if _, ok := consulServices[id]; !ok { 470 if err = c.client.ServiceRegister(locals); err != nil { 471 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 472 return err 473 } 474 sreg++ 475 metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1) 476 } 477 } 478 479 // Remove Nomad checks in Consul but unknown locally 480 for id, check := range consulChecks { 481 if _, ok := c.checks[id]; ok { 482 // Known check, leave it 483 continue 484 } 485 486 // Ignore if this is not a Nomad managed check. Also ignore 487 // Nomad managed checks if this is not a client agent. 488 // This is to prevent server agents from removing checks 489 // registered by client agents 490 if !isNomadService(check.ServiceID) || !c.isClientAgent { 491 // Service not managed by Nomad, skip 492 continue 493 } 494 495 // Unknown Nomad managed check; remove 496 if err := c.client.CheckDeregister(id); err != nil { 497 if isOldNomadService(check.ServiceID) { 498 // Don't hard-fail on old entries. 
499 continue 500 } 501 502 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 503 return err 504 } 505 cdereg++ 506 metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1) 507 } 508 509 // Add Nomad checks missing from Consul 510 for id, check := range c.checks { 511 if _, ok := consulChecks[id]; ok { 512 // Already in Consul; skipping 513 continue 514 } 515 516 if err := c.client.CheckRegister(check); err != nil { 517 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 518 return err 519 } 520 creg++ 521 metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1) 522 523 // Handle starting scripts 524 if script, ok := c.scripts[id]; ok { 525 // If it's already running, cancel and replace 526 if oldScript, running := c.runningScripts[id]; running { 527 oldScript.cancel() 528 } 529 // Start and store the handle 530 c.runningScripts[id] = script.run() 531 } 532 } 533 534 c.logger.Printf("[DEBUG] consul.sync: registered %d services, %d checks; deregistered %d services, %d checks", 535 sreg, creg, sdereg, cdereg) 536 return nil 537 } 538 539 // RegisterAgent registers Nomad agents (client or server). The 540 // Service.PortLabel should be a literal port to be parsed with SplitHostPort. 541 // Script checks are not supported and will return an error. Registration is 542 // asynchronous. 543 // 544 // Agents will be deregistered when Shutdown is called. 545 func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error { 546 ops := operations{} 547 548 for _, service := range services { 549 id := makeAgentServiceID(role, service) 550 551 // Unlike tasks, agents don't use port labels. Agent ports are 552 // stored directly in the PortLabel. 
553 host, rawport, err := net.SplitHostPort(service.PortLabel) 554 if err != nil { 555 return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err) 556 } 557 port, err := strconv.Atoi(rawport) 558 if err != nil { 559 return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err) 560 } 561 serviceReg := &api.AgentServiceRegistration{ 562 ID: id, 563 Name: service.Name, 564 Tags: service.Tags, 565 Address: host, 566 Port: port, 567 } 568 ops.regServices = append(ops.regServices, serviceReg) 569 570 for _, check := range service.Checks { 571 checkID := makeCheckID(id, check) 572 if check.Type == structs.ServiceCheckScript { 573 return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name) 574 } 575 checkHost, checkPort := serviceReg.Address, serviceReg.Port 576 if check.PortLabel != "" { 577 // Unlike tasks, agents don't use port labels. Agent ports are 578 // stored directly in the PortLabel. 
579 host, rawport, err := net.SplitHostPort(check.PortLabel) 580 if err != nil { 581 return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err) 582 } 583 port, err := strconv.Atoi(rawport) 584 if err != nil { 585 return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err) 586 } 587 checkHost, checkPort = host, port 588 } 589 checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort) 590 if err != nil { 591 return fmt.Errorf("failed to add check %q: %v", check.Name, err) 592 } 593 ops.regChecks = append(ops.regChecks, checkReg) 594 } 595 } 596 597 // Don't bother committing agent checks if we're already shutting down 598 c.agentLock.Lock() 599 defer c.agentLock.Unlock() 600 select { 601 case <-c.shutdownCh: 602 return nil 603 default: 604 } 605 606 // Now add them to the registration queue 607 c.commit(&ops) 608 609 // Record IDs for deregistering on shutdown 610 for _, id := range ops.regServices { 611 c.agentServices[id.ID] = struct{}{} 612 } 613 for _, id := range ops.regChecks { 614 c.agentChecks[id.ID] = struct{}{} 615 } 616 return nil 617 } 618 619 // serviceRegs creates service registrations, check registrations, and script 620 // checks from a service. It returns a service registration object with the 621 // service and check IDs populated. 
622 func (c *ServiceClient) serviceRegs(ops *operations, service *structs.Service, task *TaskServices) ( 623 *ServiceRegistration, error) { 624 625 // Get the services ID 626 id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary) 627 sreg := &ServiceRegistration{ 628 serviceID: id, 629 checkIDs: make(map[string]struct{}, len(service.Checks)), 630 } 631 632 // Service address modes default to auto 633 addrMode := service.AddressMode 634 if addrMode == "" { 635 addrMode = structs.AddressModeAuto 636 } 637 638 // Determine the address to advertise based on the mode 639 ip, port, err := getAddress(addrMode, service.PortLabel, task.Networks, task.DriverNetwork) 640 if err != nil { 641 return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err) 642 } 643 644 // Determine whether to use tags or canary_tags 645 var tags []string 646 if task.Canary && len(service.CanaryTags) > 0 { 647 tags = make([]string, len(service.CanaryTags)) 648 copy(tags, service.CanaryTags) 649 } else { 650 tags = make([]string, len(service.Tags)) 651 copy(tags, service.Tags) 652 } 653 654 // Build the Consul Service registration request 655 serviceReg := &api.AgentServiceRegistration{ 656 ID: id, 657 Name: service.Name, 658 Tags: tags, 659 Address: ip, 660 Port: port, 661 } 662 ops.regServices = append(ops.regServices, serviceReg) 663 664 // Build the check registrations 665 checkIDs, err := c.checkRegs(ops, id, service, task) 666 if err != nil { 667 return nil, err 668 } 669 for _, cid := range checkIDs { 670 sreg.checkIDs[cid] = struct{}{} 671 } 672 return sreg, nil 673 } 674 675 // checkRegs registers the checks for the given service and returns the 676 // registered check ids. 
677 func (c *ServiceClient) checkRegs(ops *operations, serviceID string, service *structs.Service, 678 task *TaskServices) ([]string, error) { 679 680 // Fast path 681 numChecks := len(service.Checks) 682 if numChecks == 0 { 683 return nil, nil 684 } 685 686 checkIDs := make([]string, 0, numChecks) 687 for _, check := range service.Checks { 688 checkID := makeCheckID(serviceID, check) 689 checkIDs = append(checkIDs, checkID) 690 if check.Type == structs.ServiceCheckScript { 691 if task.DriverExec == nil { 692 return nil, fmt.Errorf("driver doesn't support script checks") 693 } 694 695 sc := newScriptCheck(task.AllocID, task.Name, checkID, check, task.DriverExec, 696 c.client, c.logger, c.shutdownCh) 697 ops.scripts = append(ops.scripts, sc) 698 699 // Skip getAddress for script checks 700 checkReg, err := createCheckReg(serviceID, checkID, check, "", 0) 701 if err != nil { 702 return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err) 703 } 704 ops.regChecks = append(ops.regChecks, checkReg) 705 continue 706 } 707 708 // Default to the service's port but allow check to override 709 portLabel := check.PortLabel 710 if portLabel == "" { 711 // Default to the service's port label 712 portLabel = service.PortLabel 713 } 714 715 // Checks address mode defaults to host for pre-#3380 backward compat 716 addrMode := check.AddressMode 717 if addrMode == "" { 718 addrMode = structs.AddressModeHost 719 } 720 721 ip, port, err := getAddress(addrMode, portLabel, task.Networks, task.DriverNetwork) 722 if err != nil { 723 return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err) 724 } 725 726 checkReg, err := createCheckReg(serviceID, checkID, check, ip, port) 727 if err != nil { 728 return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err) 729 } 730 ops.regChecks = append(ops.regChecks, checkReg) 731 } 732 return checkIDs, nil 733 } 734 735 // RegisterTask with Consul. Adds all service entries and checks to Consul. 
If 736 // exec is nil and a script check exists an error is returned. 737 // 738 // If the service IP is set it used as the address in the service registration. 739 // Checks will always use the IP from the Task struct (host's IP). 740 // 741 // Actual communication with Consul is done asynchronously (see Run). 742 func (c *ServiceClient) RegisterTask(task *TaskServices) error { 743 // Fast path 744 numServices := len(task.Services) 745 if numServices == 0 { 746 return nil 747 } 748 749 t := new(TaskRegistration) 750 t.Services = make(map[string]*ServiceRegistration, numServices) 751 752 ops := &operations{} 753 for _, service := range task.Services { 754 sreg, err := c.serviceRegs(ops, service, task) 755 if err != nil { 756 return err 757 } 758 t.Services[sreg.serviceID] = sreg 759 } 760 761 // Add the task to the allocation's registration 762 c.addTaskRegistration(task.AllocID, task.Name, t) 763 764 c.commit(ops) 765 766 // Start watching checks. Done after service registrations are built 767 // since an error building them could leak watches. 768 for _, service := range task.Services { 769 serviceID := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary) 770 for _, check := range service.Checks { 771 if check.TriggersRestarts() { 772 checkID := makeCheckID(serviceID, check) 773 c.checkWatcher.Watch(task.AllocID, task.Name, checkID, check, task.Restarter) 774 } 775 } 776 } 777 return nil 778 } 779 780 // UpdateTask in Consul. Does not alter the service if only checks have 781 // changed. 782 // 783 // DriverNetwork must not change between invocations for the same allocation. 
784 func (c *ServiceClient) UpdateTask(old, newTask *TaskServices) error { 785 ops := &operations{} 786 787 taskReg := new(TaskRegistration) 788 taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services)) 789 790 existingIDs := make(map[string]*structs.Service, len(old.Services)) 791 for _, s := range old.Services { 792 existingIDs[makeTaskServiceID(old.AllocID, old.Name, s, old.Canary)] = s 793 } 794 newIDs := make(map[string]*structs.Service, len(newTask.Services)) 795 for _, s := range newTask.Services { 796 newIDs[makeTaskServiceID(newTask.AllocID, newTask.Name, s, newTask.Canary)] = s 797 } 798 799 // Loop over existing Service IDs to see if they have been removed or 800 // updated. 801 for existingID, existingSvc := range existingIDs { 802 newSvc, ok := newIDs[existingID] 803 if !ok { 804 // Existing service entry removed 805 ops.deregServices = append(ops.deregServices, existingID) 806 for _, check := range existingSvc.Checks { 807 cid := makeCheckID(existingID, check) 808 ops.deregChecks = append(ops.deregChecks, cid) 809 810 // Unwatch watched checks 811 if check.TriggersRestarts() { 812 c.checkWatcher.Unwatch(cid) 813 } 814 } 815 continue 816 } 817 818 // Service exists and hasn't changed, don't re-add it later 819 delete(newIDs, existingID) 820 821 // Service still exists so add it to the task's registration 822 sreg := &ServiceRegistration{ 823 serviceID: existingID, 824 checkIDs: make(map[string]struct{}, len(newSvc.Checks)), 825 } 826 taskReg.Services[existingID] = sreg 827 828 // See if any checks were updated 829 existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks)) 830 for _, check := range existingSvc.Checks { 831 existingChecks[makeCheckID(existingID, check)] = check 832 } 833 834 // Register new checks 835 for _, check := range newSvc.Checks { 836 checkID := makeCheckID(existingID, check) 837 if _, exists := existingChecks[checkID]; exists { 838 // Check exists, so don't remove it 839 
delete(existingChecks, checkID) 840 sreg.checkIDs[checkID] = struct{}{} 841 } 842 843 // New check on an unchanged service; add them now 844 newCheckIDs, err := c.checkRegs(ops, existingID, newSvc, newTask) 845 if err != nil { 846 return err 847 } 848 849 for _, checkID := range newCheckIDs { 850 sreg.checkIDs[checkID] = struct{}{} 851 852 } 853 854 // Update all watched checks as CheckRestart fields aren't part of ID 855 if check.TriggersRestarts() { 856 c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter) 857 } 858 } 859 860 // Remove existing checks not in updated service 861 for cid, check := range existingChecks { 862 ops.deregChecks = append(ops.deregChecks, cid) 863 864 // Unwatch checks 865 if check.TriggersRestarts() { 866 c.checkWatcher.Unwatch(cid) 867 } 868 } 869 } 870 871 // Any remaining services should just be enqueued directly 872 for _, newSvc := range newIDs { 873 sreg, err := c.serviceRegs(ops, newSvc, newTask) 874 if err != nil { 875 return err 876 } 877 878 taskReg.Services[sreg.serviceID] = sreg 879 } 880 881 // Add the task to the allocation's registration 882 c.addTaskRegistration(newTask.AllocID, newTask.Name, taskReg) 883 884 c.commit(ops) 885 886 // Start watching checks. Done after service registrations are built 887 // since an error building them could leak watches. 888 for _, service := range newIDs { 889 serviceID := makeTaskServiceID(newTask.AllocID, newTask.Name, service, newTask.Canary) 890 for _, check := range service.Checks { 891 if check.TriggersRestarts() { 892 checkID := makeCheckID(serviceID, check) 893 c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter) 894 } 895 } 896 } 897 return nil 898 } 899 900 // RemoveTask from Consul. Removes all service entries and checks. 901 // 902 // Actual communication with Consul is done asynchronously (see Run). 
903 func (c *ServiceClient) RemoveTask(task *TaskServices) { 904 ops := operations{} 905 906 for _, service := range task.Services { 907 id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary) 908 ops.deregServices = append(ops.deregServices, id) 909 910 for _, check := range service.Checks { 911 cid := makeCheckID(id, check) 912 ops.deregChecks = append(ops.deregChecks, cid) 913 914 if check.TriggersRestarts() { 915 c.checkWatcher.Unwatch(cid) 916 } 917 } 918 } 919 920 // Remove the task from the alloc's registrations 921 c.removeTaskRegistration(task.AllocID, task.Name) 922 923 // Now add them to the deregistration fields; main Run loop will update 924 c.commit(&ops) 925 } 926 927 // AllocRegistrations returns the registrations for the given allocation. If the 928 // allocation has no reservations, the response is a nil object. 929 func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) { 930 // Get the internal struct using the lock 931 c.allocRegistrationsLock.RLock() 932 regInternal, ok := c.allocRegistrations[allocID] 933 if !ok { 934 c.allocRegistrationsLock.RUnlock() 935 return nil, nil 936 } 937 938 // Copy so we don't expose internal structs 939 reg := regInternal.copy() 940 c.allocRegistrationsLock.RUnlock() 941 942 // Query the services and checks to populate the allocation registrations. 943 services, err := c.client.Services() 944 if err != nil { 945 return nil, err 946 } 947 948 checks, err := c.client.Checks() 949 if err != nil { 950 return nil, err 951 } 952 953 // Populate the object 954 for _, treg := range reg.Tasks { 955 for serviceID, sreg := range treg.Services { 956 sreg.Service = services[serviceID] 957 for checkID := range sreg.checkIDs { 958 if check, ok := checks[checkID]; ok { 959 sreg.Checks = append(sreg.Checks, check) 960 } 961 } 962 } 963 } 964 965 return reg, nil 966 } 967 968 // Shutdown the Consul client. Update running task registrations and deregister 969 // agent from Consul. 
On first call blocks up to shutdownWait before giving up 970 // on syncing operations. 971 func (c *ServiceClient) Shutdown() error { 972 // Serialize Shutdown calls with RegisterAgent to prevent leaking agent 973 // entries. 974 c.agentLock.Lock() 975 defer c.agentLock.Unlock() 976 select { 977 case <-c.shutdownCh: 978 return nil 979 default: 980 close(c.shutdownCh) 981 } 982 983 // Give run loop time to sync, but don't block indefinitely 984 deadline := time.After(c.shutdownWait) 985 986 // Wait for Run to finish any outstanding operations and exit 987 select { 988 case <-c.exitCh: 989 case <-deadline: 990 // Don't wait forever though 991 } 992 993 // If Consul was never seen nothing could be written so exit early 994 if !c.hasSeen() { 995 return nil 996 } 997 998 // Always attempt to deregister Nomad agent Consul entries, even if 999 // deadline was reached 1000 for id := range c.agentServices { 1001 if err := c.client.ServiceDeregister(id); err != nil { 1002 c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err) 1003 } 1004 } 1005 for id := range c.agentChecks { 1006 if err := c.client.CheckDeregister(id); err != nil { 1007 c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err) 1008 } 1009 } 1010 1011 // Give script checks time to exit (no need to lock as Run() has exited) 1012 for _, h := range c.runningScripts { 1013 select { 1014 case <-h.wait(): 1015 case <-deadline: 1016 return fmt.Errorf("timed out waiting for script checks to run") 1017 } 1018 } 1019 return nil 1020 } 1021 1022 // addTaskRegistration adds the task registration for the given allocation. 
1023 func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) { 1024 c.allocRegistrationsLock.Lock() 1025 defer c.allocRegistrationsLock.Unlock() 1026 1027 alloc, ok := c.allocRegistrations[allocID] 1028 if !ok { 1029 alloc = &AllocRegistration{ 1030 Tasks: make(map[string]*TaskRegistration), 1031 } 1032 c.allocRegistrations[allocID] = alloc 1033 } 1034 alloc.Tasks[taskName] = reg 1035 } 1036 1037 // removeTaskRegistration removes the task registration for the given allocation. 1038 func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) { 1039 c.allocRegistrationsLock.Lock() 1040 defer c.allocRegistrationsLock.Unlock() 1041 1042 alloc, ok := c.allocRegistrations[allocID] 1043 if !ok { 1044 return 1045 } 1046 1047 // Delete the task and if it is the last one also delete the alloc's 1048 // registration 1049 delete(alloc.Tasks, taskName) 1050 if len(alloc.Tasks) == 0 { 1051 delete(c.allocRegistrations, allocID) 1052 } 1053 } 1054 1055 // makeAgentServiceID creates a unique ID for identifying an agent service in 1056 // Consul. 1057 // 1058 // Agent service IDs are of the form: 1059 // 1060 // {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}) 1061 // Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4 1062 // Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l 1063 // 1064 func makeAgentServiceID(role string, service *structs.Service) string { 1065 return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "", false)) 1066 } 1067 1068 // makeTaskServiceID creates a unique ID for identifying a task service in 1069 // Consul. All structs.Service fields are included in the ID's hash except 1070 // Checks. This allows updates to merely compare IDs. 
1071 // 1072 // Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH 1073 func makeTaskServiceID(allocID, taskName string, service *structs.Service, canary bool) string { 1074 return nomadTaskPrefix + service.Hash(allocID, taskName, canary) 1075 } 1076 1077 // makeCheckID creates a unique ID for a check. 1078 func makeCheckID(serviceID string, check *structs.ServiceCheck) string { 1079 return check.Hash(serviceID) 1080 } 1081 1082 // createCheckReg creates a Check that can be registered with Consul. 1083 // 1084 // Script checks simply have a TTL set and the caller is responsible for 1085 // running the script and heartbeating. 1086 func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) { 1087 chkReg := api.AgentCheckRegistration{ 1088 ID: checkID, 1089 Name: check.Name, 1090 ServiceID: serviceID, 1091 } 1092 chkReg.Status = check.InitialStatus 1093 chkReg.Timeout = check.Timeout.String() 1094 chkReg.Interval = check.Interval.String() 1095 1096 // Require an address for http or tcp checks 1097 if port == 0 && check.RequiresPort() { 1098 return nil, fmt.Errorf("%s checks require an address", check.Type) 1099 } 1100 1101 switch check.Type { 1102 case structs.ServiceCheckHTTP: 1103 proto := check.Protocol 1104 if proto == "" { 1105 proto = "http" 1106 } 1107 if check.TLSSkipVerify { 1108 chkReg.TLSSkipVerify = true 1109 } 1110 base := url.URL{ 1111 Scheme: proto, 1112 Host: net.JoinHostPort(host, strconv.Itoa(port)), 1113 } 1114 relative, err := url.Parse(check.Path) 1115 if err != nil { 1116 return nil, err 1117 } 1118 url := base.ResolveReference(relative) 1119 chkReg.HTTP = url.String() 1120 chkReg.Method = check.Method 1121 chkReg.Header = check.Header 1122 1123 case structs.ServiceCheckTCP: 1124 chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port)) 1125 1126 case structs.ServiceCheckScript: 1127 chkReg.TTL = (check.Interval + ttlCheckBuffer).String() 1128 // As of 
Consul 1.0.0 setting TTL and Interval is a 400 1129 chkReg.Interval = "" 1130 1131 case structs.ServiceCheckGRPC: 1132 chkReg.GRPC = fmt.Sprintf("%s/%s", net.JoinHostPort(host, strconv.Itoa(port)), check.GRPCService) 1133 chkReg.GRPCUseTLS = check.GRPCUseTLS 1134 if check.TLSSkipVerify { 1135 chkReg.TLSSkipVerify = true 1136 } 1137 1138 default: 1139 return nil, fmt.Errorf("check type %+q not valid", check.Type) 1140 } 1141 return &chkReg, nil 1142 } 1143 1144 // isNomadService returns true if the ID matches the pattern of a Nomad managed 1145 // service (new or old formats). Agent services return false as independent 1146 // client and server agents may be running on the same machine. #2827 1147 func isNomadService(id string) bool { 1148 return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id) 1149 } 1150 1151 // isOldNomadService returns true if the ID matches an old pattern managed by 1152 // Nomad. 1153 // 1154 // Pre-0.7.1 task service IDs are of the form: 1155 // 1156 // {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} 1157 // Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 1158 // 1159 func isOldNomadService(id string) bool { 1160 const prefix = nomadServicePrefix + "-executor" 1161 return strings.HasPrefix(id, prefix) 1162 } 1163 1164 // getAddress returns the IP and port to use for a service or check. If no port 1165 // label is specified (an empty value), zero values are returned because no 1166 // address could be resolved. 
1167 func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *cstructs.DriverNetwork) (string, int, error) { 1168 switch addrMode { 1169 case structs.AddressModeAuto: 1170 if driverNet.Advertise() { 1171 addrMode = structs.AddressModeDriver 1172 } else { 1173 addrMode = structs.AddressModeHost 1174 } 1175 return getAddress(addrMode, portLabel, networks, driverNet) 1176 case structs.AddressModeHost: 1177 if portLabel == "" { 1178 if len(networks) != 1 { 1179 // If no networks are specified return zero 1180 // values. Consul will advertise the host IP 1181 // with no port. This is the pre-0.7.1 behavior 1182 // some people rely on. 1183 return "", 0, nil 1184 } 1185 1186 return networks[0].IP, 0, nil 1187 } 1188 1189 // Default path: use host ip:port 1190 ip, port := networks.Port(portLabel) 1191 if ip == "" && port <= 0 { 1192 return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel) 1193 } 1194 return ip, port, nil 1195 1196 case structs.AddressModeDriver: 1197 // Require a driver network if driver address mode is used 1198 if driverNet == nil { 1199 return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`) 1200 } 1201 1202 // If no port label is specified just return the IP 1203 if portLabel == "" { 1204 return driverNet.IP, 0, nil 1205 } 1206 1207 // If the port is a label, use the driver's port (not the host's) 1208 if port, ok := driverNet.PortMap[portLabel]; ok { 1209 return driverNet.IP, port, nil 1210 } 1211 1212 // If port isn't a label, try to parse it as a literal port number 1213 port, err := strconv.Atoi(portLabel) 1214 if err != nil { 1215 // Don't include Atoi error message as user likely 1216 // never intended it to be a numeric and it creates a 1217 // confusing error message 1218 return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel) 1219 } 1220 if port <= 0 { 1221 return "", 0, 
fmt.Errorf("invalid port: %q: port must be >0", portLabel) 1222 } 1223 1224 return driverNet.IP, port, nil 1225 1226 default: 1227 // Shouldn't happen due to validation, but enforce invariants 1228 return "", 0, fmt.Errorf("invalid address mode %q", addrMode) 1229 } 1230 }