github.com/anuvu/nomad@v0.8.7-atom1/command/agent/consul/client.go

package consul

import (
	"context"
	"fmt"
	"log"
	"net"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	metrics "github.com/armon/go-metrics"
	"github.com/hashicorp/consul/api"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// nomadServicePrefix is the prefix that scopes all Nomad registered
	// services (both agent and task entries).
	nomadServicePrefix = "_nomad"

	// nomadTaskPrefix is the prefix that scopes Nomad registered services
	// for tasks.
	nomadTaskPrefix = nomadServicePrefix + "-task-"

	// defaultRetryInterval is how quickly to retry syncing services and
	// checks to Consul when an error occurs. Will backoff up to a max.
	defaultRetryInterval = time.Second

	// defaultMaxRetryInterval is the default max retry interval.
	defaultMaxRetryInterval = 30 * time.Second

	// defaultPeriodicInterval is the interval at which the service
	// client reconciles state between the desired services and checks and
	// what's actually registered in Consul. This is done at an interval,
	// rather than being purely edge triggered, to handle the case that the
	// Consul agent's state may change underneath us.
	defaultPeriodicInterval = 30 * time.Second

	// ttlCheckBuffer is the time interval that Nomad can take to report
	// the check result to Consul before the check's TTL expires.
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services
	ServiceTagSerf = "serf"
)

// CatalogAPI is the consul/api.Catalog API used by Nomad.
type CatalogAPI interface {
	Datacenters() ([]string, error)
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}

// AgentAPI is the consul/api.Agent API used by Nomad.
type AgentAPI interface {
	Services() (map[string]*api.AgentService, error)
	Checks() (map[string]*api.AgentCheck, error)
	CheckRegister(check *api.AgentCheckRegistration) error
	CheckDeregister(checkID string) error
	Self() (map[string]map[string]interface{}, error)
	ServiceRegister(service *api.AgentServiceRegistration) error
	ServiceDeregister(serviceID string) error
	UpdateTTL(id, output, status string) error
}

// operations are submitted to the main loop via commit() for synchronizing
// with Consul.
type operations struct {
	regServices []*api.AgentServiceRegistration
	regChecks   []*api.AgentCheckRegistration
	scripts     []*scriptCheck

	deregServices []string
	deregChecks   []string
}

// AllocRegistration holds the status of services registered for a particular
// allocation, by task.
type AllocRegistration struct {
	// Tasks maps the name of a task to its registered services and checks
	Tasks map[string]*TaskRegistration
}

func (a *AllocRegistration) copy() *AllocRegistration {
	c := &AllocRegistration{
		Tasks: make(map[string]*TaskRegistration, len(a.Tasks)),
	}

	for k, v := range a.Tasks {
		c.Tasks[k] = v.copy()
	}

	return c
}

// NumServices returns the number of registered services
func (a *AllocRegistration) NumServices() int {
	if a == nil {
		return 0
	}

	total := 0
	for _, treg := range a.Tasks {
		for _, sreg := range treg.Services {
			if sreg.Service != nil {
				total++
			}
		}
	}

	return total
}

// NumChecks returns the number of registered checks
func (a *AllocRegistration) NumChecks() int {
	if a == nil {
		return 0
	}

	total := 0
	for _, treg := range a.Tasks {
		for _, sreg := range treg.Services {
			total += len(sreg.Checks)
		}
	}

	return total
}

// TaskRegistration holds the status of services registered for a particular
// task.
type TaskRegistration struct {
	Services map[string]*ServiceRegistration
}

func (t *TaskRegistration) copy() *TaskRegistration {
	c := &TaskRegistration{
		Services: make(map[string]*ServiceRegistration, len(t.Services)),
	}

	for k, v := range t.Services {
		c.Services[k] = v.copy()
	}

	return c
}

// ServiceRegistration holds the status of a registered Consul Service and its
// Checks.
type ServiceRegistration struct {
	// serviceID and checkIDs are internal fields that track just the IDs of
	// the services/checks registered in Consul. They are used to materialize
	// the other fields when queried.
	serviceID string
	checkIDs  map[string]struct{}

	// Service is the AgentService registered in Consul.
	Service *api.AgentService

	// Checks is the status of the registered checks.
	Checks []*api.AgentCheck
}

func (s *ServiceRegistration) copy() *ServiceRegistration {
	// copy intentionally copies only the internal fields, not the external
	// ones. Callers of AllocRegistrations therefore cannot access the
	// internal fields; that method uses them to populate the external
	// Service and Checks fields.
	return &ServiceRegistration{
		serviceID: s.serviceID,
		checkIDs:  helper.CopyMapStringStruct(s.checkIDs),
	}
}

// ServiceClient handles task and agent service registration with Consul.
type ServiceClient struct {
	client           AgentAPI
	logger           *log.Logger
	retryInterval    time.Duration
	maxRetryInterval time.Duration
	periodicInterval time.Duration

	// exitCh is closed when the main Run loop exits
	exitCh chan struct{}

	// shutdownCh is closed when the client should shutdown
	shutdownCh chan struct{}

	// shutdownWait is how long Shutdown() blocks waiting for the final
	// sync() to finish. Defaults to defaultShutdownWait
	shutdownWait time.Duration

	opCh chan *operations

	services       map[string]*api.AgentServiceRegistration
	checks         map[string]*api.AgentCheckRegistration
	scripts        map[string]*scriptCheck
	runningScripts map[string]*scriptHandle

	// allocRegistrations stores the services and checks that are registered
	// with Consul by allocation ID.
	allocRegistrations     map[string]*AllocRegistration
	allocRegistrationsLock sync.RWMutex

	// agentServices and agentChecks record entries for the agent itself
	// which should be removed on shutdown
	agentServices map[string]struct{}
	agentChecks   map[string]struct{}
	agentLock     sync.Mutex

	// seen is 1 if Consul has ever been seen; otherwise 0. Accessed with
	// atomics.
	seen int32

	// checkWatcher restarts checks that are unhealthy.
	checkWatcher *checkWatcher

	// isClientAgent specifies whether this Consul client is being used
	// by a Nomad client.
	isClientAgent bool
}

// NewServiceClient creates a new Consul ServiceClient from an existing Consul
// API client and logger, plus a flag indicating whether the client is being
// used by a Nomad client agent. When used by a Nomad client, this Consul
// client reconciles all services and checks created by Nomad on behalf of
// running tasks.
func NewServiceClient(consulClient AgentAPI, logger *log.Logger, isNomadClient bool) *ServiceClient {
	return &ServiceClient{
		client:             consulClient,
		logger:             logger,
		retryInterval:      defaultRetryInterval,
		maxRetryInterval:   defaultMaxRetryInterval,
		periodicInterval:   defaultPeriodicInterval,
		exitCh:             make(chan struct{}),
		shutdownCh:         make(chan struct{}),
		shutdownWait:       defaultShutdownWait,
		opCh:               make(chan *operations, 8),
		services:           make(map[string]*api.AgentServiceRegistration),
		checks:             make(map[string]*api.AgentCheckRegistration),
		scripts:            make(map[string]*scriptCheck),
		runningScripts:     make(map[string]*scriptHandle),
		allocRegistrations: make(map[string]*AllocRegistration),
		agentServices:      make(map[string]struct{}),
		agentChecks:        make(map[string]struct{}),
		checkWatcher:       newCheckWatcher(logger, consulClient),
		isClientAgent:      isNomadClient,
	}
}

// seen is used by markSeen and hasSeen
const seen = 1

// markSeen marks Consul as having been seen (meaning at least one operation
// has succeeded).
func (c *ServiceClient) markSeen() {
	atomic.StoreInt32(&c.seen, seen)
}

// hasSeen returns true if any Consul operation has ever succeeded. Useful to
// squelch errors if Consul isn't running.
func (c *ServiceClient) hasSeen() bool {
	return atomic.LoadInt32(&c.seen) == seen
}
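
// Example (illustrative sketch, not part of the original file): how a Nomad
// client agent might wire up a ServiceClient. The consulConfig and logger
// arguments are assumptions for illustration only; the real agent does this
// with its own configuration plumbing.
func exampleNewServiceClient(consulConfig *api.Config, logger *log.Logger) (*ServiceClient, error) {
	consulAPIClient, err := api.NewClient(consulConfig)
	if err != nil {
		return nil, fmt.Errorf("error creating Consul API client: %v", err)
	}

	// *api.Agent satisfies the AgentAPI interface used throughout this file.
	sc := NewServiceClient(consulAPIClient.Agent(), logger, true)

	// Run must be started exactly once; it retries operations against Consul
	// until Shutdown is called.
	go sc.Run()
	return sc, nil
}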

// Run the Consul main loop which retries operations against Consul. It should
// be called exactly once.
func (c *ServiceClient) Run() {
	defer close(c.exitCh)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// init will be closed when Consul has been contacted
	init := make(chan struct{})
	go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init)

	// Process operations while waiting for initial contact with Consul but
	// do not sync until contact has been made.
INIT:
	for {
		select {
		case <-init:
			c.markSeen()
			break INIT
		case <-c.shutdownCh:
			return
		case ops := <-c.opCh:
			c.merge(ops)
		}
	}
	c.logger.Printf("[TRACE] consul.sync: able to contact Consul")

	// Contact with Consul has been established; start the checkWatcher
	go c.checkWatcher.Run(ctx)

	// Always immediately sync to reconcile Nomad and Consul's state
	retryTimer := time.NewTimer(0)

	failures := 0
	for {
		select {
		case <-retryTimer.C:
		case <-c.shutdownCh:
			// Cancel check watcher but sync one last time
			cancel()
		case ops := <-c.opCh:
			c.merge(ops)
		}

		if err := c.sync(); err != nil {
			if failures == 0 {
				// Log on the first failure
				c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err)
			} else if failures%10 == 0 {
				// Log every 10th consecutive failure
				c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err)
			}

			failures++
			if !retryTimer.Stop() {
				// Timer already expired; since it may or may
				// not have been read in the select above,
				// conditionally receive on it
				select {
				case <-retryTimer.C:
				default:
				}
			}
			backoff := c.retryInterval * time.Duration(failures)
			if backoff > c.maxRetryInterval {
				backoff = c.maxRetryInterval
			}
			retryTimer.Reset(backoff)
		} else {
			if failures > 0 {
				c.logger.Printf("[INFO] consul.sync: successfully updated services in Consul")
				failures = 0
			}

			// Reset timer to periodic interval to periodically
			// reconcile with Consul
			if !retryTimer.Stop() {
				select {
				case <-retryTimer.C:
				default:
				}
			}
			retryTimer.Reset(c.periodicInterval)
		}

		select {
		case <-c.shutdownCh:
			// Exit only after sync'ing all outstanding operations
			if len(c.opCh) > 0 {
				for len(c.opCh) > 0 {
					c.merge(<-c.opCh)
				}
				continue
			}
			return
		default:
		}

	}
}

// commit operations unless already shutting down.
func (c *ServiceClient) commit(ops *operations) {
	select {
	case c.opCh <- ops:
	case <-c.shutdownCh:
	}
}

// merge registrations into state map prior to sync'ing with Consul
func (c *ServiceClient) merge(ops *operations) {
	for _, s := range ops.regServices {
		c.services[s.ID] = s
	}
	for _, check := range ops.regChecks {
		c.checks[check.ID] = check
	}
	for _, s := range ops.scripts {
		c.scripts[s.id] = s
	}
	for _, sid := range ops.deregServices {
		delete(c.services, sid)
	}
	for _, cid := range ops.deregChecks {
		if script, ok := c.runningScripts[cid]; ok {
			script.cancel()
			delete(c.scripts, cid)
			delete(c.runningScripts, cid)
		}
		delete(c.checks, cid)
	}
	metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services)))
	metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks)))
	metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts)))
}
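
// Illustrative sketch (not part of the original file): how internal callers
// hand work to the Run loop. Registration and deregistration never talk to
// Consul directly; they enqueue an *operations batch via commit(), merge()
// folds it into the desired-state maps, and sync() reconciles those maps
// against the Consul agent. The serviceID/checkIDs values here are assumed.
func exampleEnqueueDeregistration(sc *ServiceClient, serviceID string, checkIDs []string) {
	ops := &operations{
		deregServices: []string{serviceID},
		deregChecks:   checkIDs,
	}
	sc.commit(ops)
}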

// sync enqueued operations.
func (c *ServiceClient) sync() error {
	sreg, creg, sdereg, cdereg := 0, 0, 0, 0

	consulServices, err := c.client.Services()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul services: %v", err)
	}

	consulChecks, err := c.client.Checks()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul checks: %v", err)
	}

	// Remove Nomad services in Consul but unknown locally
	for id := range consulServices {
		if _, ok := c.services[id]; ok {
			// Known service, skip
			continue
		}

		// Ignore if this is not a Nomad managed service. Also ignore
		// Nomad managed services if this is not a client agent.
		// This is to prevent server agents from removing services
		// registered by client agents
		if !isNomadService(id) || !c.isClientAgent {
			// Not managed by Nomad, skip
			continue
		}

		// Unknown Nomad managed service; kill
		if err := c.client.ServiceDeregister(id); err != nil {
			if isOldNomadService(id) {
				// Don't hard-fail on old entries. See #3620
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		sdereg++
		metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
	}

	// Add Nomad services missing from Consul
	for id, locals := range c.services {
		if _, ok := consulServices[id]; !ok {
			if err = c.client.ServiceRegister(locals); err != nil {
				metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
				return err
			}
			sreg++
			metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
		}
	}

	// Remove Nomad checks in Consul but unknown locally
	for id, check := range consulChecks {
		if _, ok := c.checks[id]; ok {
			// Known check, leave it
			continue
		}

		// Ignore if this is not a Nomad managed check. Also ignore
		// Nomad managed checks if this is not a client agent.
		// This is to prevent server agents from removing checks
		// registered by client agents
		if !isNomadService(check.ServiceID) || !c.isClientAgent {
			// Service not managed by Nomad, skip
			continue
		}

		// Unknown Nomad managed check; remove
		if err := c.client.CheckDeregister(id); err != nil {
			if isOldNomadService(check.ServiceID) {
				// Don't hard-fail on old entries.
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		cdereg++
		metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
	}

	// Add Nomad checks missing from Consul
	for id, check := range c.checks {
		if _, ok := consulChecks[id]; ok {
			// Already in Consul; skipping
			continue
		}

		if err := c.client.CheckRegister(check); err != nil {
			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		creg++
		metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)

		// Handle starting scripts
		if script, ok := c.scripts[id]; ok {
			// If it's already running, cancel and replace
			if oldScript, running := c.runningScripts[id]; running {
				oldScript.cancel()
			}
			// Start and store the handle
			c.runningScripts[id] = script.run()
		}
	}

	c.logger.Printf("[DEBUG] consul.sync: registered %d services, %d checks; deregistered %d services, %d checks",
		sreg, creg, sdereg, cdereg)
	return nil
}

// RegisterAgent registers Nomad agents (client or server). The
// Service.PortLabel should be a literal port to be parsed with SplitHostPort.
// Script checks are not supported and will return an error. Registration is
// asynchronous.
//
// Agents will be deregistered when Shutdown is called.
func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
	ops := operations{}

	for _, service := range services {
		id := makeAgentServiceID(role, service)

		// Unlike tasks, agents don't use port labels. Agent ports are
		// stored directly in the PortLabel.
		host, rawport, err := net.SplitHostPort(service.PortLabel)
		if err != nil {
			return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
		}
		port, err := strconv.Atoi(rawport)
		if err != nil {
			return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
		}
		serviceReg := &api.AgentServiceRegistration{
			ID:      id,
			Name:    service.Name,
			Tags:    service.Tags,
			Address: host,
			Port:    port,
		}
		ops.regServices = append(ops.regServices, serviceReg)

		for _, check := range service.Checks {
			checkID := makeCheckID(id, check)
			if check.Type == structs.ServiceCheckScript {
				return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
			}
			checkHost, checkPort := serviceReg.Address, serviceReg.Port
			if check.PortLabel != "" {
				// Unlike tasks, agents don't use port labels. Agent ports are
				// stored directly in the PortLabel.
				host, rawport, err := net.SplitHostPort(check.PortLabel)
				if err != nil {
					return fmt.Errorf("error parsing port label %q from check %q: %v", check.PortLabel, check.Name, err)
				}
				port, err := strconv.Atoi(rawport)
				if err != nil {
					return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err)
				}
				checkHost, checkPort = host, port
			}
			checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort)
			if err != nil {
				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
			}
			ops.regChecks = append(ops.regChecks, checkReg)
		}
	}

	// Don't bother committing agent checks if we're already shutting down
	c.agentLock.Lock()
	defer c.agentLock.Unlock()
	select {
	case <-c.shutdownCh:
		return nil
	default:
	}

	// Now add them to the registration queue
	c.commit(&ops)

	// Record IDs for deregistering on shutdown
	for _, id := range ops.regServices {
		c.agentServices[id.ID] = struct{}{}
	}
	for _, id := range ops.regChecks {
		c.agentChecks[id.ID] = struct{}{}
	}
	return nil
}
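
// Illustrative sketch (not part of the original file): agent services pass a
// literal "host:port" in PortLabel rather than a task port label, as the
// RegisterAgent doc comment above describes. The service definition below is
// an assumption for illustration only.
func exampleRegisterAgent(sc *ServiceClient) error {
	httpService := &structs.Service{
		Name:      "nomad-client",
		Tags:      []string{ServiceTagHTTP},
		PortLabel: "127.0.0.1:4646", // parsed with net.SplitHostPort, not looked up as a label
	}
	return sc.RegisterAgent("client", []*structs.Service{httpService})
}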

// serviceRegs creates service registrations, check registrations, and script
// checks from a service. It returns a service registration object with the
// service and check IDs populated.
func (c *ServiceClient) serviceRegs(ops *operations, service *structs.Service, task *TaskServices) (
	*ServiceRegistration, error) {

	// Get the service's ID
	id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
	sreg := &ServiceRegistration{
		serviceID: id,
		checkIDs:  make(map[string]struct{}, len(service.Checks)),
	}

	// Service address modes default to auto
	addrMode := service.AddressMode
	if addrMode == "" {
		addrMode = structs.AddressModeAuto
	}

	// Determine the address to advertise based on the mode
	ip, port, err := getAddress(addrMode, service.PortLabel, task.Networks, task.DriverNetwork)
	if err != nil {
		return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err)
	}

	// Determine whether to use tags or canary_tags
	var tags []string
	if task.Canary && len(service.CanaryTags) > 0 {
		tags = make([]string, len(service.CanaryTags))
		copy(tags, service.CanaryTags)
	} else {
		tags = make([]string, len(service.Tags))
		copy(tags, service.Tags)
	}

	// Append an extra tag of the form {task name}{alloc index}
	newtag := task.Name + strconv.Itoa(task.AllocIndex)
	tags = append(tags, newtag)

	// Build the Consul Service registration request
	serviceReg := &api.AgentServiceRegistration{
		ID:      id,
		Name:    service.Name,
		Tags:    tags,
		Address: ip,
		Port:    port,
	}
	ops.regServices = append(ops.regServices, serviceReg)

	// Build the check registrations
	checkIDs, err := c.checkRegs(ops, id, service, task)
	if err != nil {
		return nil, err
	}
	for _, cid := range checkIDs {
		sreg.checkIDs[cid] = struct{}{}
	}
	return sreg, nil
}

// checkRegs registers the checks for the given service and returns the
// registered check ids.
func (c *ServiceClient) checkRegs(ops *operations, serviceID string, service *structs.Service,
	task *TaskServices) ([]string, error) {

	// Fast path
	numChecks := len(service.Checks)
	if numChecks == 0 {
		return nil, nil
	}

	checkIDs := make([]string, 0, numChecks)
	for _, check := range service.Checks {
		checkID := makeCheckID(serviceID, check)
		checkIDs = append(checkIDs, checkID)
		if check.Type == structs.ServiceCheckScript {
			if task.DriverExec == nil {
				return nil, fmt.Errorf("driver doesn't support script checks")
			}

			sc := newScriptCheck(task.AllocID, task.Name, checkID, check, task.DriverExec,
				c.client, c.logger, c.shutdownCh)
			ops.scripts = append(ops.scripts, sc)

			// Skip getAddress for script checks
			checkReg, err := createCheckReg(serviceID, checkID, check, "", 0)
			if err != nil {
				return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err)
			}
			ops.regChecks = append(ops.regChecks, checkReg)
			continue
		}

		// Default to the service's port but allow check to override
		portLabel := check.PortLabel
		if portLabel == "" {
			// Default to the service's port label
			portLabel = service.PortLabel
		}

		// Checks address mode defaults to host for pre-#3380 backward compat
		addrMode := check.AddressMode
		if addrMode == "" {
			addrMode = structs.AddressModeHost
		}

		ip, port, err := getAddress(addrMode, portLabel, task.Networks, task.DriverNetwork)
		if err != nil {
			return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err)
		}

		checkReg, err := createCheckReg(serviceID, checkID, check, ip, port)
		if err != nil {
			return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err)
		}
		ops.regChecks = append(ops.regChecks, checkReg)
	}
	return checkIDs, nil
}

// RegisterTask with Consul. Adds all service entries and checks to Consul. If
// exec is nil and a script check exists an error is returned.
//
// If the service IP is set it is used as the address in the service
// registration. Checks will always use the IP from the Task struct (host's IP).
//
// Actual communication with Consul is done asynchronously (see Run).
func (c *ServiceClient) RegisterTask(task *TaskServices) error {
	// Fast path
	numServices := len(task.Services)
	if numServices == 0 {
		return nil
	}

	t := new(TaskRegistration)
	t.Services = make(map[string]*ServiceRegistration, numServices)

	ops := &operations{}
	for _, service := range task.Services {
		sreg, err := c.serviceRegs(ops, service, task)
		if err != nil {
			return err
		}
		t.Services[sreg.serviceID] = sreg
	}

	// Add the task to the allocation's registration
	c.addTaskRegistration(task.AllocID, task.Name, t)

	c.commit(ops)

	// Start watching checks. Done after service registrations are built
	// since an error building them could leak watches.
	for _, service := range task.Services {
		serviceID := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
		for _, check := range service.Checks {
			if check.TriggersRestarts() {
				checkID := makeCheckID(serviceID, check)
				c.checkWatcher.Watch(task.AllocID, task.Name, checkID, check, task.Restarter)
			}
		}
	}
	return nil
}
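
// Illustrative sketch (not part of the original file): the task lifecycle as
// seen by this client. Service and check IDs are recomputed from the task and
// service definitions on every call (makeTaskServiceID/makeCheckID), so no
// extra bookkeeping is needed between calls. The task values are assumed to
// come from the caller (the Nomad client's task runner).
func exampleTaskLifecycle(sc *ServiceClient, task, updated *TaskServices) error {
	// Register services and checks; the actual writes happen asynchronously
	// in the Run loop.
	if err := sc.RegisterTask(task); err != nil {
		return err
	}

	// Apply an in-place update; unchanged services keep their IDs and are not
	// re-registered, only changed checks and new services are enqueued.
	if err := sc.UpdateTask(task, updated); err != nil {
		return err
	}

	// Deregister everything for the task when it stops.
	sc.RemoveTask(updated)
	return nil
}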

// UpdateTask in Consul. Does not alter the service if only checks have
// changed.
//
// DriverNetwork must not change between invocations for the same allocation.
func (c *ServiceClient) UpdateTask(old, newTask *TaskServices) error {
	ops := &operations{}

	taskReg := new(TaskRegistration)
	taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services))

	existingIDs := make(map[string]*structs.Service, len(old.Services))
	for _, s := range old.Services {
		existingIDs[makeTaskServiceID(old.AllocID, old.Name, s, old.Canary)] = s
	}
	newIDs := make(map[string]*structs.Service, len(newTask.Services))
	for _, s := range newTask.Services {
		newIDs[makeTaskServiceID(newTask.AllocID, newTask.Name, s, newTask.Canary)] = s
	}

	// Loop over existing Service IDs to see if they have been removed or
	// updated.
	for existingID, existingSvc := range existingIDs {
		newSvc, ok := newIDs[existingID]
		if !ok {
			// Existing service entry removed
			ops.deregServices = append(ops.deregServices, existingID)
			for _, check := range existingSvc.Checks {
				cid := makeCheckID(existingID, check)
				ops.deregChecks = append(ops.deregChecks, cid)

				// Unwatch watched checks
				if check.TriggersRestarts() {
					c.checkWatcher.Unwatch(cid)
				}
			}
			continue
		}

		// Service exists and hasn't changed, don't re-add it later
		delete(newIDs, existingID)

		// Service still exists so add it to the task's registration
		sreg := &ServiceRegistration{
			serviceID: existingID,
			checkIDs:  make(map[string]struct{}, len(newSvc.Checks)),
		}
		taskReg.Services[existingID] = sreg

		// See if any checks were updated
		existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
		for _, check := range existingSvc.Checks {
			existingChecks[makeCheckID(existingID, check)] = check
		}

		// Register new checks
		for _, check := range newSvc.Checks {
			checkID := makeCheckID(existingID, check)
			if _, exists := existingChecks[checkID]; exists {
				// Check exists, so don't remove it
				delete(existingChecks, checkID)
				sreg.checkIDs[checkID] = struct{}{}
			}

			// New check on an unchanged service; add them now
			newCheckIDs, err := c.checkRegs(ops, existingID, newSvc, newTask)
			if err != nil {
				return err
			}

			for _, checkID := range newCheckIDs {
				sreg.checkIDs[checkID] = struct{}{}
			}

			// Update all watched checks as CheckRestart fields aren't part of ID
			if check.TriggersRestarts() {
				c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter)
			}
		}

		// Remove existing checks not in updated service
		for cid, check := range existingChecks {
			ops.deregChecks = append(ops.deregChecks, cid)

			// Unwatch checks
			if check.TriggersRestarts() {
				c.checkWatcher.Unwatch(cid)
			}
		}
	}

	// Any remaining services should just be enqueued directly
	for _, newSvc := range newIDs {
		sreg, err := c.serviceRegs(ops, newSvc, newTask)
		if err != nil {
			return err
		}

		taskReg.Services[sreg.serviceID] = sreg
	}

	// Add the task to the allocation's registration
	c.addTaskRegistration(newTask.AllocID, newTask.Name, taskReg)

	c.commit(ops)

	// Start watching checks. Done after service registrations are built
	// since an error building them could leak watches.
	for _, service := range newIDs {
		serviceID := makeTaskServiceID(newTask.AllocID, newTask.Name, service, newTask.Canary)
		for _, check := range service.Checks {
			if check.TriggersRestarts() {
				checkID := makeCheckID(serviceID, check)
				c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter)
			}
		}
	}
	return nil
}

// RemoveTask from Consul. Removes all service entries and checks.
//
// Actual communication with Consul is done asynchronously (see Run).
func (c *ServiceClient) RemoveTask(task *TaskServices) {
	ops := operations{}

	for _, service := range task.Services {
		id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
		ops.deregServices = append(ops.deregServices, id)

		for _, check := range service.Checks {
			cid := makeCheckID(id, check)
			ops.deregChecks = append(ops.deregChecks, cid)

			if check.TriggersRestarts() {
				c.checkWatcher.Unwatch(cid)
			}
		}
	}

	// Remove the task from the alloc's registrations
	c.removeTaskRegistration(task.AllocID, task.Name)

	// Now add them to the deregistration fields; main Run loop will update
	c.commit(&ops)
}

// AllocRegistrations returns the registrations for the given allocation. If the
// allocation has no registrations, the response is a nil object.
func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) {
	// Get the internal struct using the lock
	c.allocRegistrationsLock.RLock()
	regInternal, ok := c.allocRegistrations[allocID]
	if !ok {
		c.allocRegistrationsLock.RUnlock()
		return nil, nil
	}

	// Copy so we don't expose internal structs
	reg := regInternal.copy()
	c.allocRegistrationsLock.RUnlock()

	// Query the services and checks to populate the allocation registrations.
	services, err := c.client.Services()
	if err != nil {
		return nil, err
	}

	checks, err := c.client.Checks()
	if err != nil {
		return nil, err
	}

	// Populate the object
	for _, treg := range reg.Tasks {
		for serviceID, sreg := range treg.Services {
			sreg.Service = services[serviceID]
			for checkID := range sreg.checkIDs {
				if check, ok := checks[checkID]; ok {
					sreg.Checks = append(sreg.Checks, check)
				}
			}
		}
	}

	return reg, nil
}
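
// Illustrative sketch (not part of the original file): callers such as the
// allocation health watcher read registrations through AllocRegistrations and
// the nil-safe NumServices/NumChecks helpers. The allocID value here is an
// assumption for illustration only.
func exampleCountRegistrations(sc *ServiceClient, allocID string) (int, int, error) {
	reg, err := sc.AllocRegistrations(allocID)
	if err != nil {
		return 0, 0, err
	}
	// NumServices and NumChecks handle a nil registration, so a missing
	// allocation simply counts as zero services and zero checks.
	return reg.NumServices(), reg.NumChecks(), nil
}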

// Shutdown the Consul client. Update running task registrations and deregister
// agent from Consul. On first call blocks up to shutdownWait before giving up
// on syncing operations.
func (c *ServiceClient) Shutdown() error {
	// Serialize Shutdown calls with RegisterAgent to prevent leaking agent
	// entries.
	c.agentLock.Lock()
	defer c.agentLock.Unlock()
	select {
	case <-c.shutdownCh:
		return nil
	default:
		close(c.shutdownCh)
	}

	// Give run loop time to sync, but don't block indefinitely
	deadline := time.After(c.shutdownWait)

	// Wait for Run to finish any outstanding operations and exit
	select {
	case <-c.exitCh:
	case <-deadline:
		// Don't wait forever though
	}

	// If Consul was never seen nothing could be written so exit early
	if !c.hasSeen() {
		return nil
	}

	// Always attempt to deregister Nomad agent Consul entries, even if
	// deadline was reached
	for id := range c.agentServices {
		if err := c.client.ServiceDeregister(id); err != nil {
			c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err)
		}
	}
	for id := range c.agentChecks {
		if err := c.client.CheckDeregister(id); err != nil {
			c.logger.Printf("[ERR] consul.sync: error deregistering agent check (id: %q): %v", id, err)
		}
	}

	// Give script checks time to exit (no need to lock as Run() has exited)
	for _, h := range c.runningScripts {
		select {
		case <-h.wait():
		case <-deadline:
			return fmt.Errorf("timed out waiting for script checks to run")
		}
	}
	return nil
}

// addTaskRegistration adds the task registration for the given allocation.
func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) {
	c.allocRegistrationsLock.Lock()
	defer c.allocRegistrationsLock.Unlock()

	alloc, ok := c.allocRegistrations[allocID]
	if !ok {
		alloc = &AllocRegistration{
			Tasks: make(map[string]*TaskRegistration),
		}
		c.allocRegistrations[allocID] = alloc
	}
	alloc.Tasks[taskName] = reg
}

// removeTaskRegistration removes the task registration for the given allocation.
func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) {
	c.allocRegistrationsLock.Lock()
	defer c.allocRegistrationsLock.Unlock()

	alloc, ok := c.allocRegistrations[allocID]
	if !ok {
		return
	}

	// Delete the task and if it is the last one also delete the alloc's
	// registration
	delete(alloc.Tasks, taskName)
	if len(alloc.Tasks) == 0 {
		delete(c.allocRegistrations, allocID)
	}
}

// makeAgentServiceID creates a unique ID for identifying an agent service in
// Consul.
//
// Agent service IDs are of the form:
//
//	{nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}))
//	Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4
//	Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l
//
func makeAgentServiceID(role string, service *structs.Service) string {
	return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "", false))
}

// makeTaskServiceID creates a unique ID for identifying a task service in
// Consul. All structs.Service fields are included in the ID's hash except
// Checks. This allows updates to merely compare IDs.
//
//	Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH
func makeTaskServiceID(allocID, taskName string, service *structs.Service, canary bool) string {
	return nomadTaskPrefix + service.Hash(allocID, taskName, canary)
}

// makeCheckID creates a unique ID for a check.
func makeCheckID(serviceID string, check *structs.ServiceCheck) string {
	return check.Hash(serviceID)
}

// createCheckReg creates a Check that can be registered with Consul.
//
// Script checks simply have a TTL set and the caller is responsible for
// running the script and heartbeating.
func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) {
	chkReg := api.AgentCheckRegistration{
		ID:        checkID,
		Name:      check.Name,
		ServiceID: serviceID,
	}
	chkReg.Status = check.InitialStatus
	chkReg.Timeout = check.Timeout.String()
	chkReg.Interval = check.Interval.String()

	// Require an address for http or tcp checks
	if port == 0 && check.RequiresPort() {
		return nil, fmt.Errorf("%s checks require an address", check.Type)
	}

	switch check.Type {
	case structs.ServiceCheckHTTP:
		proto := check.Protocol
		if proto == "" {
			proto = "http"
		}
		if check.TLSSkipVerify {
			chkReg.TLSSkipVerify = true
		}
		base := url.URL{
			Scheme: proto,
			Host:   net.JoinHostPort(host, strconv.Itoa(port)),
		}
		relative, err := url.Parse(check.Path)
		if err != nil {
			return nil, err
		}
		url := base.ResolveReference(relative)
		chkReg.HTTP = url.String()
		chkReg.Method = check.Method
		chkReg.Header = check.Header

	case structs.ServiceCheckTCP:
		chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))

	case structs.ServiceCheckScript:
		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
		// As of Consul 1.0.0 setting TTL and Interval is a 400
		chkReg.Interval = ""

	case structs.ServiceCheckGRPC:
		chkReg.GRPC = fmt.Sprintf("%s/%s", net.JoinHostPort(host, strconv.Itoa(port)), check.GRPCService)
		chkReg.GRPCUseTLS = check.GRPCUseTLS
		if check.TLSSkipVerify {
			chkReg.TLSSkipVerify = true
		}

	default:
		return nil, fmt.Errorf("check type %+q not valid", check.Type)
	}
	return &chkReg, nil
}

// isNomadService returns true if the ID matches the pattern of a Nomad managed
// service (new or old formats). Agent services return false as independent
// client and server agents may be running on the same machine. #2827
func isNomadService(id string) bool {
	return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
}

// isOldNomadService returns true if the ID matches an old pattern managed by
// Nomad.
//
// Pre-0.7.1 task service IDs are of the form:
//
//	{nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
//	Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
//
func isOldNomadService(id string) bool {
	const prefix = nomadServicePrefix + "-executor"
	return strings.HasPrefix(id, prefix)
}

// getAddress returns the IP and port to use for a service or check. If no port
// label is specified (an empty value), zero values are returned because no
// address could be resolved.
func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *cstructs.DriverNetwork) (string, int, error) {
	switch addrMode {
	case structs.AddressModeAuto:
		if driverNet.Advertise() {
			addrMode = structs.AddressModeDriver
		} else {
			addrMode = structs.AddressModeHost
		}
		return getAddress(addrMode, portLabel, networks, driverNet)
	case structs.AddressModeHost:
		if portLabel == "" {
			if len(networks) != 1 {
				// If no networks are specified return zero
				// values. Consul will advertise the host IP
				// with no port. This is the pre-0.7.1 behavior
				// some people rely on.
				return "", 0, nil
			}

			return networks[0].IP, 0, nil
		}

		// Default path: use host ip:port
		ip, port := networks.Port(portLabel)
		if ip == "" && port <= 0 {
			return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel)
		}
		return ip, port, nil

	case structs.AddressModeDriver:
		// Require a driver network if driver address mode is used
		if driverNet == nil {
			return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`)
		}

		// If no port label is specified just return the IP
		if portLabel == "" {
			return driverNet.IP, 0, nil
		}

		// If the port is a label, use the driver's port (not the host's)
		if port, ok := driverNet.PortMap[portLabel]; ok {
			return driverNet.IP, port, nil
		}

		// If port isn't a label, try to parse it as a literal port number
		port, err := strconv.Atoi(portLabel)
		if err != nil {
			// Don't include the Atoi error message as the user
			// likely never intended the label to be numeric and it
			// creates a confusing error message
			return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel)
		}
		if port <= 0 {
			return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
		}

		return driverNet.IP, port, nil

	default:
		// Shouldn't happen due to validation, but enforce invariants
		return "", 0, fmt.Errorf("invalid address mode %q", addrMode)
	}
}
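
// Illustrative sketch (not part of the original file): how address_mode
// changes the advertised address returned by getAddress. The "http" port
// label and the network/driver network values are assumptions for
// illustration only.
func exampleAddressModes(networks structs.Networks, driverNet *cstructs.DriverNetwork) error {
	// Host mode resolves the port label against the host's reserved and
	// dynamic ports.
	hostIP, hostPort, err := getAddress(structs.AddressModeHost, "http", networks, driverNet)
	if err != nil {
		return err
	}

	// Driver mode advertises the driver/container address and consults the
	// driver's port map (or a literal numeric port) instead.
	driverIP, driverPort, err := getAddress(structs.AddressModeDriver, "http", networks, driverNet)
	if err != nil {
		return err
	}

	fmt.Printf("host: %s:%d driver: %s:%d\n", hostIP, hostPort, driverIP, driverPort)
	return nil
}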