github.com/ferranbt/nomad@v0.9.3-0.20190607002617-85c449b7667c/command/agent/consul/client.go (about) 1 package consul 2 3 import ( 4 "context" 5 "fmt" 6 "net" 7 "net/url" 8 "reflect" 9 "strconv" 10 "strings" 11 "sync" 12 "sync/atomic" 13 "time" 14 15 metrics "github.com/armon/go-metrics" 16 log "github.com/hashicorp/go-hclog" 17 18 "github.com/hashicorp/consul/api" 19 "github.com/hashicorp/nomad/helper" 20 "github.com/hashicorp/nomad/nomad/structs" 21 "github.com/hashicorp/nomad/plugins/drivers" 22 ) 23 24 const ( 25 // nomadServicePrefix is the prefix that scopes all Nomad registered 26 // services (both agent and task entries). 27 nomadServicePrefix = "_nomad" 28 29 // nomadTaskPrefix is the prefix that scopes Nomad registered services 30 // for tasks. 31 nomadTaskPrefix = nomadServicePrefix + "-task-" 32 33 // nomadCheckPrefix is the prefix that scopes Nomad registered checks for 34 // services. 35 nomadCheckPrefix = nomadServicePrefix + "-check-" 36 37 // defaultRetryInterval is how quickly to retry syncing services and 38 // checks to Consul when an error occurs. Will backoff up to a max. 39 defaultRetryInterval = time.Second 40 41 // defaultMaxRetryInterval is the default max retry interval. 42 defaultMaxRetryInterval = 30 * time.Second 43 44 // defaultPeriodicalInterval is the interval at which the service 45 // client reconciles state between the desired services and checks and 46 // what's actually registered in Consul. This is done at an interval, 47 // rather than being purely edge triggered, to handle the case that the 48 // Consul agent's state may change underneath us 49 defaultPeriodicInterval = 30 * time.Second 50 51 // ttlCheckBuffer is the time interval that Nomad can take to report Consul 52 // the check result 53 ttlCheckBuffer = 31 * time.Second 54 55 // defaultShutdownWait is how long Shutdown() should block waiting for 56 // enqueued operations to sync to Consul by default. 57 defaultShutdownWait = time.Minute 58 59 // DefaultQueryWaitDuration is the max duration the Consul Agent will 60 // spend waiting for a response from a Consul Query. 61 DefaultQueryWaitDuration = 2 * time.Second 62 63 // ServiceTagHTTP is the tag assigned to HTTP services 64 ServiceTagHTTP = "http" 65 66 // ServiceTagRPC is the tag assigned to RPC services 67 ServiceTagRPC = "rpc" 68 69 // ServiceTagSerf is the tag assigned to Serf services 70 ServiceTagSerf = "serf" 71 ) 72 73 // CatalogAPI is the consul/api.Catalog API used by Nomad. 74 type CatalogAPI interface { 75 Datacenters() ([]string, error) 76 Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error) 77 } 78 79 // AgentAPI is the consul/api.Agent API used by Nomad. 80 type AgentAPI interface { 81 Services() (map[string]*api.AgentService, error) 82 Checks() (map[string]*api.AgentCheck, error) 83 CheckRegister(check *api.AgentCheckRegistration) error 84 CheckDeregister(checkID string) error 85 Self() (map[string]map[string]interface{}, error) 86 ServiceRegister(service *api.AgentServiceRegistration) error 87 ServiceDeregister(serviceID string) error 88 UpdateTTL(id, output, status string) error 89 } 90 91 func agentServiceUpdateRequired(reg *api.AgentServiceRegistration, svc *api.AgentService) bool { 92 return !(reg.Kind == svc.Kind && 93 reg.ID == svc.ID && 94 reg.Port == svc.Port && 95 reg.Address == svc.Address && 96 reg.Name == svc.Service && 97 reflect.DeepEqual(reg.Tags, svc.Tags)) 98 } 99 100 // operations are submitted to the main loop via commit() for synchronizing 101 // with Consul. 102 type operations struct { 103 regServices []*api.AgentServiceRegistration 104 regChecks []*api.AgentCheckRegistration 105 scripts []*scriptCheck 106 107 deregServices []string 108 deregChecks []string 109 } 110 111 // AllocRegistration holds the status of services registered for a particular 112 // allocations by task. 113 type AllocRegistration struct { 114 // Tasks maps the name of a task to its registered services and checks 115 Tasks map[string]*TaskRegistration 116 } 117 118 func (a *AllocRegistration) copy() *AllocRegistration { 119 c := &AllocRegistration{ 120 Tasks: make(map[string]*TaskRegistration, len(a.Tasks)), 121 } 122 123 for k, v := range a.Tasks { 124 c.Tasks[k] = v.copy() 125 } 126 127 return c 128 } 129 130 // NumServices returns the number of registered services 131 func (a *AllocRegistration) NumServices() int { 132 if a == nil { 133 return 0 134 } 135 136 total := 0 137 for _, treg := range a.Tasks { 138 for _, sreg := range treg.Services { 139 if sreg.Service != nil { 140 total++ 141 } 142 } 143 } 144 145 return total 146 } 147 148 // NumChecks returns the number of registered checks 149 func (a *AllocRegistration) NumChecks() int { 150 if a == nil { 151 return 0 152 } 153 154 total := 0 155 for _, treg := range a.Tasks { 156 for _, sreg := range treg.Services { 157 total += len(sreg.Checks) 158 } 159 } 160 161 return total 162 } 163 164 // TaskRegistration holds the status of services registered for a particular 165 // task. 166 type TaskRegistration struct { 167 Services map[string]*ServiceRegistration 168 } 169 170 func (t *TaskRegistration) copy() *TaskRegistration { 171 c := &TaskRegistration{ 172 Services: make(map[string]*ServiceRegistration, len(t.Services)), 173 } 174 175 for k, v := range t.Services { 176 c.Services[k] = v.copy() 177 } 178 179 return c 180 } 181 182 // ServiceRegistration holds the status of a registered Consul Service and its 183 // Checks. 184 type ServiceRegistration struct { 185 // serviceID and checkIDs are internal fields that track just the IDs of the 186 // services/checks registered in Consul. It is used to materialize the other 187 // fields when queried. 188 serviceID string 189 checkIDs map[string]struct{} 190 191 // Service is the AgentService registered in Consul. 192 Service *api.AgentService 193 194 // Checks is the status of the registered checks. 195 Checks []*api.AgentCheck 196 } 197 198 func (s *ServiceRegistration) copy() *ServiceRegistration { 199 // Copy does not copy the external fields but only the internal fields. This 200 // is so that the caller of AllocRegistrations can not access the internal 201 // fields and that method uses these fields to populate the external fields. 202 return &ServiceRegistration{ 203 serviceID: s.serviceID, 204 checkIDs: helper.CopyMapStringStruct(s.checkIDs), 205 } 206 } 207 208 // ServiceClient handles task and agent service registration with Consul. 209 type ServiceClient struct { 210 client AgentAPI 211 logger log.Logger 212 retryInterval time.Duration 213 maxRetryInterval time.Duration 214 periodicInterval time.Duration 215 216 // exitCh is closed when the main Run loop exits 217 exitCh chan struct{} 218 219 // shutdownCh is closed when the client should shutdown 220 shutdownCh chan struct{} 221 222 // shutdownWait is how long Shutdown() blocks waiting for the final 223 // sync() to finish. Defaults to defaultShutdownWait 224 shutdownWait time.Duration 225 226 opCh chan *operations 227 228 services map[string]*api.AgentServiceRegistration 229 checks map[string]*api.AgentCheckRegistration 230 scripts map[string]*scriptCheck 231 runningScripts map[string]*scriptHandle 232 233 // allocRegistrations stores the services and checks that are registered 234 // with Consul by allocation ID. 235 allocRegistrations map[string]*AllocRegistration 236 allocRegistrationsLock sync.RWMutex 237 238 // agent services and checks record entries for the agent itself which 239 // should be removed on shutdown 240 agentServices map[string]struct{} 241 agentChecks map[string]struct{} 242 agentLock sync.Mutex 243 244 // seen is 1 if Consul has ever been seen; otherwise 0. Accessed with 245 // atomics. 246 seen int32 247 248 // checkWatcher restarts checks that are unhealthy. 249 checkWatcher *checkWatcher 250 251 // isClientAgent specifies whether this Consul client is being used 252 // by a Nomad client. 253 isClientAgent bool 254 } 255 256 // NewServiceClient creates a new Consul ServiceClient from an existing Consul API 257 // Client, logger and takes whether the client is being used by a Nomad Client agent. 258 // When being used by a Nomad client, this Consul client reconciles all services and 259 // checks created by Nomad on behalf of running tasks. 260 func NewServiceClient(consulClient AgentAPI, logger log.Logger, isNomadClient bool) *ServiceClient { 261 logger = logger.ResetNamed("consul.sync") 262 return &ServiceClient{ 263 client: consulClient, 264 logger: logger, 265 retryInterval: defaultRetryInterval, 266 maxRetryInterval: defaultMaxRetryInterval, 267 periodicInterval: defaultPeriodicInterval, 268 exitCh: make(chan struct{}), 269 shutdownCh: make(chan struct{}), 270 shutdownWait: defaultShutdownWait, 271 opCh: make(chan *operations, 8), 272 services: make(map[string]*api.AgentServiceRegistration), 273 checks: make(map[string]*api.AgentCheckRegistration), 274 scripts: make(map[string]*scriptCheck), 275 runningScripts: make(map[string]*scriptHandle), 276 allocRegistrations: make(map[string]*AllocRegistration), 277 agentServices: make(map[string]struct{}), 278 agentChecks: make(map[string]struct{}), 279 checkWatcher: newCheckWatcher(logger, consulClient), 280 isClientAgent: isNomadClient, 281 } 282 } 283 284 // seen is used by markSeen and hasSeen 285 const seen = 1 286 287 // markSeen marks Consul as having been seen (meaning at least one operation 288 // has succeeded). 289 func (c *ServiceClient) markSeen() { 290 atomic.StoreInt32(&c.seen, seen) 291 } 292 293 // hasSeen returns true if any Consul operation has ever succeeded. Useful to 294 // squelch errors if Consul isn't running. 295 func (c *ServiceClient) hasSeen() bool { 296 return atomic.LoadInt32(&c.seen) == seen 297 } 298 299 // Run the Consul main loop which retries operations against Consul. It should 300 // be called exactly once. 301 func (c *ServiceClient) Run() { 302 defer close(c.exitCh) 303 304 ctx, cancel := context.WithCancel(context.Background()) 305 defer cancel() 306 307 // init will be closed when Consul has been contacted 308 init := make(chan struct{}) 309 go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init) 310 311 // Process operations while waiting for initial contact with Consul but 312 // do not sync until contact has been made. 313 INIT: 314 for { 315 select { 316 case <-init: 317 c.markSeen() 318 break INIT 319 case <-c.shutdownCh: 320 return 321 case ops := <-c.opCh: 322 c.merge(ops) 323 } 324 } 325 c.logger.Trace("able to contact Consul") 326 327 // Block until contact with Consul has been established 328 // Start checkWatcher 329 go c.checkWatcher.Run(ctx) 330 331 // Always immediately sync to reconcile Nomad and Consul's state 332 retryTimer := time.NewTimer(0) 333 334 failures := 0 335 for { 336 select { 337 case <-retryTimer.C: 338 case <-c.shutdownCh: 339 // Cancel check watcher but sync one last time 340 cancel() 341 case ops := <-c.opCh: 342 c.merge(ops) 343 } 344 345 if err := c.sync(); err != nil { 346 if failures == 0 { 347 // Log on the first failure 348 c.logger.Warn("failed to update services in Consul", "error", err) 349 } else if failures%10 == 0 { 350 // Log every 10th consecutive failure 351 c.logger.Error("still unable to update services in Consul", "failures", failures, "error", err) 352 } 353 354 failures++ 355 if !retryTimer.Stop() { 356 // Timer already expired, since the timer may 357 // or may not have been read in the select{} 358 // above, conditionally receive on it 359 select { 360 case <-retryTimer.C: 361 default: 362 } 363 } 364 backoff := c.retryInterval * time.Duration(failures) 365 if backoff > c.maxRetryInterval { 366 backoff = c.maxRetryInterval 367 } 368 retryTimer.Reset(backoff) 369 } else { 370 if failures > 0 { 371 c.logger.Info("successfully updated services in Consul") 372 failures = 0 373 } 374 375 // Reset timer to periodic interval to periodically 376 // reconile with Consul 377 if !retryTimer.Stop() { 378 select { 379 case <-retryTimer.C: 380 default: 381 } 382 } 383 retryTimer.Reset(c.periodicInterval) 384 } 385 386 select { 387 case <-c.shutdownCh: 388 // Exit only after sync'ing all outstanding operations 389 if len(c.opCh) > 0 { 390 for len(c.opCh) > 0 { 391 c.merge(<-c.opCh) 392 } 393 continue 394 } 395 return 396 default: 397 } 398 399 } 400 } 401 402 // commit operations unless already shutting down. 403 func (c *ServiceClient) commit(ops *operations) { 404 select { 405 case c.opCh <- ops: 406 case <-c.shutdownCh: 407 } 408 } 409 410 // merge registrations into state map prior to sync'ing with Consul 411 func (c *ServiceClient) merge(ops *operations) { 412 for _, s := range ops.regServices { 413 c.services[s.ID] = s 414 } 415 for _, check := range ops.regChecks { 416 c.checks[check.ID] = check 417 } 418 for _, s := range ops.scripts { 419 c.scripts[s.id] = s 420 } 421 for _, sid := range ops.deregServices { 422 delete(c.services, sid) 423 } 424 for _, cid := range ops.deregChecks { 425 if script, ok := c.runningScripts[cid]; ok { 426 script.cancel() 427 delete(c.scripts, cid) 428 delete(c.runningScripts, cid) 429 } 430 delete(c.checks, cid) 431 } 432 metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services))) 433 metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks))) 434 metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts))) 435 } 436 437 // sync enqueued operations. 438 func (c *ServiceClient) sync() error { 439 sreg, creg, sdereg, cdereg := 0, 0, 0, 0 440 441 consulServices, err := c.client.Services() 442 if err != nil { 443 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 444 return fmt.Errorf("error querying Consul services: %v", err) 445 } 446 447 consulChecks, err := c.client.Checks() 448 if err != nil { 449 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 450 return fmt.Errorf("error querying Consul checks: %v", err) 451 } 452 453 // Remove Nomad services in Consul but unknown locally 454 for id := range consulServices { 455 if _, ok := c.services[id]; ok { 456 // Known service, skip 457 continue 458 } 459 460 // Ignore if this is not a Nomad managed service. Also ignore 461 // Nomad managed services if this is not a client agent. 462 // This is to prevent server agents from removing services 463 // registered by client agents 464 if !isNomadService(id) || !c.isClientAgent { 465 // Not managed by Nomad, skip 466 continue 467 } 468 469 // Unknown Nomad managed service; kill 470 if err := c.client.ServiceDeregister(id); err != nil { 471 if isOldNomadService(id) { 472 // Don't hard-fail on old entries. See #3620 473 continue 474 } 475 476 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 477 return err 478 } 479 sdereg++ 480 metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1) 481 } 482 483 // Add Nomad services missing from Consul, or where the service has been updated. 484 for id, locals := range c.services { 485 existingSvc, ok := consulServices[id] 486 487 if ok { 488 // There is an existing registration of this service in Consul, so here 489 // we validate to see if the service has been invalidated to see if it 490 // should be updated. 491 if !agentServiceUpdateRequired(locals, existingSvc) { 492 // No Need to update services that have not changed 493 continue 494 } 495 } 496 497 if err = c.client.ServiceRegister(locals); err != nil { 498 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 499 return err 500 } 501 sreg++ 502 metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1) 503 } 504 505 // Remove Nomad checks in Consul but unknown locally 506 for id, check := range consulChecks { 507 if _, ok := c.checks[id]; ok { 508 // Known check, leave it 509 continue 510 } 511 512 // Ignore if this is not a Nomad managed check. Also ignore 513 // Nomad managed checks if this is not a client agent. 514 // This is to prevent server agents from removing checks 515 // registered by client agents 516 if !isNomadService(check.ServiceID) || !c.isClientAgent || !isNomadCheck(check.CheckID) { 517 // Service not managed by Nomad, skip 518 continue 519 } 520 521 // Unknown Nomad managed check; remove 522 if err := c.client.CheckDeregister(id); err != nil { 523 if isOldNomadService(check.ServiceID) { 524 // Don't hard-fail on old entries. 525 continue 526 } 527 528 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 529 return err 530 } 531 cdereg++ 532 metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1) 533 } 534 535 // Add Nomad checks missing from Consul 536 for id, check := range c.checks { 537 if _, ok := consulChecks[id]; ok { 538 // Already in Consul; skipping 539 continue 540 } 541 542 if err := c.client.CheckRegister(check); err != nil { 543 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 544 return err 545 } 546 creg++ 547 metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1) 548 549 // Handle starting scripts 550 if script, ok := c.scripts[id]; ok { 551 // If it's already running, cancel and replace 552 if oldScript, running := c.runningScripts[id]; running { 553 oldScript.cancel() 554 } 555 // Start and store the handle 556 c.runningScripts[id] = script.run() 557 } 558 } 559 560 // Only log if something was actually synced 561 if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 { 562 c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg, 563 "registered_checks", creg, "deregistered_checks", cdereg) 564 } 565 return nil 566 } 567 568 // RegisterAgent registers Nomad agents (client or server). The 569 // Service.PortLabel should be a literal port to be parsed with SplitHostPort. 570 // Script checks are not supported and will return an error. Registration is 571 // asynchronous. 572 // 573 // Agents will be deregistered when Shutdown is called. 574 func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error { 575 ops := operations{} 576 577 for _, service := range services { 578 id := makeAgentServiceID(role, service) 579 580 // Unlike tasks, agents don't use port labels. Agent ports are 581 // stored directly in the PortLabel. 582 host, rawport, err := net.SplitHostPort(service.PortLabel) 583 if err != nil { 584 return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err) 585 } 586 port, err := strconv.Atoi(rawport) 587 if err != nil { 588 return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err) 589 } 590 serviceReg := &api.AgentServiceRegistration{ 591 ID: id, 592 Name: service.Name, 593 Tags: service.Tags, 594 Address: host, 595 Port: port, 596 // This enables the consul UI to show that Nomad registered this service 597 Meta: map[string]string{ 598 "external-source": "nomad", 599 }, 600 } 601 ops.regServices = append(ops.regServices, serviceReg) 602 603 for _, check := range service.Checks { 604 checkID := makeCheckID(id, check) 605 if check.Type == structs.ServiceCheckScript { 606 return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name) 607 } 608 checkHost, checkPort := serviceReg.Address, serviceReg.Port 609 if check.PortLabel != "" { 610 // Unlike tasks, agents don't use port labels. Agent ports are 611 // stored directly in the PortLabel. 612 host, rawport, err := net.SplitHostPort(check.PortLabel) 613 if err != nil { 614 return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err) 615 } 616 port, err := strconv.Atoi(rawport) 617 if err != nil { 618 return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err) 619 } 620 checkHost, checkPort = host, port 621 } 622 checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort) 623 if err != nil { 624 return fmt.Errorf("failed to add check %q: %v", check.Name, err) 625 } 626 ops.regChecks = append(ops.regChecks, checkReg) 627 } 628 } 629 630 // Don't bother committing agent checks if we're already shutting down 631 c.agentLock.Lock() 632 defer c.agentLock.Unlock() 633 select { 634 case <-c.shutdownCh: 635 return nil 636 default: 637 } 638 639 // Now add them to the registration queue 640 c.commit(&ops) 641 642 // Record IDs for deregistering on shutdown 643 for _, id := range ops.regServices { 644 c.agentServices[id.ID] = struct{}{} 645 } 646 for _, id := range ops.regChecks { 647 c.agentChecks[id.ID] = struct{}{} 648 } 649 return nil 650 } 651 652 // serviceRegs creates service registrations, check registrations, and script 653 // checks from a service. It returns a service registration object with the 654 // service and check IDs populated. 655 func (c *ServiceClient) serviceRegs(ops *operations, service *structs.Service, task *TaskServices) ( 656 *ServiceRegistration, error) { 657 658 // Get the services ID 659 id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary) 660 sreg := &ServiceRegistration{ 661 serviceID: id, 662 checkIDs: make(map[string]struct{}, len(service.Checks)), 663 } 664 665 // Service address modes default to auto 666 addrMode := service.AddressMode 667 if addrMode == "" { 668 addrMode = structs.AddressModeAuto 669 } 670 671 // Determine the address to advertise based on the mode 672 ip, port, err := getAddress(addrMode, service.PortLabel, task.Networks, task.DriverNetwork) 673 if err != nil { 674 return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err) 675 } 676 677 // Determine whether to use tags or canary_tags 678 var tags []string 679 if task.Canary && len(service.CanaryTags) > 0 { 680 tags = make([]string, len(service.CanaryTags)) 681 copy(tags, service.CanaryTags) 682 } else { 683 tags = make([]string, len(service.Tags)) 684 copy(tags, service.Tags) 685 } 686 687 // Build the Consul Service registration request 688 serviceReg := &api.AgentServiceRegistration{ 689 ID: id, 690 Name: service.Name, 691 Tags: tags, 692 Address: ip, 693 Port: port, 694 // This enables the consul UI to show that Nomad registered this service 695 Meta: map[string]string{ 696 "external-source": "nomad", 697 }, 698 } 699 ops.regServices = append(ops.regServices, serviceReg) 700 701 // Build the check registrations 702 checkIDs, err := c.checkRegs(ops, id, service, task) 703 if err != nil { 704 return nil, err 705 } 706 for _, cid := range checkIDs { 707 sreg.checkIDs[cid] = struct{}{} 708 } 709 return sreg, nil 710 } 711 712 // checkRegs registers the checks for the given service and returns the 713 // registered check ids. 714 func (c *ServiceClient) checkRegs(ops *operations, serviceID string, service *structs.Service, 715 task *TaskServices) ([]string, error) { 716 717 // Fast path 718 numChecks := len(service.Checks) 719 if numChecks == 0 { 720 return nil, nil 721 } 722 723 checkIDs := make([]string, 0, numChecks) 724 for _, check := range service.Checks { 725 checkID := makeCheckID(serviceID, check) 726 checkIDs = append(checkIDs, checkID) 727 if check.Type == structs.ServiceCheckScript { 728 if task.DriverExec == nil { 729 return nil, fmt.Errorf("driver doesn't support script checks") 730 } 731 732 sc := newScriptCheck(task.AllocID, task.Name, checkID, check, task.DriverExec, 733 c.client, c.logger, c.shutdownCh) 734 ops.scripts = append(ops.scripts, sc) 735 736 // Skip getAddress for script checks 737 checkReg, err := createCheckReg(serviceID, checkID, check, "", 0) 738 if err != nil { 739 return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err) 740 } 741 ops.regChecks = append(ops.regChecks, checkReg) 742 continue 743 } 744 745 // Default to the service's port but allow check to override 746 portLabel := check.PortLabel 747 if portLabel == "" { 748 // Default to the service's port label 749 portLabel = service.PortLabel 750 } 751 752 // Checks address mode defaults to host for pre-#3380 backward compat 753 addrMode := check.AddressMode 754 if addrMode == "" { 755 addrMode = structs.AddressModeHost 756 } 757 758 ip, port, err := getAddress(addrMode, portLabel, task.Networks, task.DriverNetwork) 759 if err != nil { 760 return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err) 761 } 762 763 checkReg, err := createCheckReg(serviceID, checkID, check, ip, port) 764 if err != nil { 765 return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err) 766 } 767 ops.regChecks = append(ops.regChecks, checkReg) 768 } 769 return checkIDs, nil 770 } 771 772 // RegisterTask with Consul. Adds all service entries and checks to Consul. If 773 // exec is nil and a script check exists an error is returned. 774 // 775 // If the service IP is set it used as the address in the service registration. 776 // Checks will always use the IP from the Task struct (host's IP). 777 // 778 // Actual communication with Consul is done asynchronously (see Run). 779 func (c *ServiceClient) RegisterTask(task *TaskServices) error { 780 // Fast path 781 numServices := len(task.Services) 782 if numServices == 0 { 783 return nil 784 } 785 786 t := new(TaskRegistration) 787 t.Services = make(map[string]*ServiceRegistration, numServices) 788 789 ops := &operations{} 790 for _, service := range task.Services { 791 sreg, err := c.serviceRegs(ops, service, task) 792 if err != nil { 793 return err 794 } 795 t.Services[sreg.serviceID] = sreg 796 } 797 798 // Add the task to the allocation's registration 799 c.addTaskRegistration(task.AllocID, task.Name, t) 800 801 c.commit(ops) 802 803 // Start watching checks. Done after service registrations are built 804 // since an error building them could leak watches. 805 for _, service := range task.Services { 806 serviceID := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary) 807 for _, check := range service.Checks { 808 if check.TriggersRestarts() { 809 checkID := makeCheckID(serviceID, check) 810 c.checkWatcher.Watch(task.AllocID, task.Name, checkID, check, task.Restarter) 811 } 812 } 813 } 814 return nil 815 } 816 817 // UpdateTask in Consul. Does not alter the service if only checks have 818 // changed. 819 // 820 // DriverNetwork must not change between invocations for the same allocation. 821 func (c *ServiceClient) UpdateTask(old, newTask *TaskServices) error { 822 ops := &operations{} 823 824 taskReg := new(TaskRegistration) 825 taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services)) 826 827 existingIDs := make(map[string]*structs.Service, len(old.Services)) 828 for _, s := range old.Services { 829 existingIDs[makeTaskServiceID(old.AllocID, old.Name, s, old.Canary)] = s 830 } 831 newIDs := make(map[string]*structs.Service, len(newTask.Services)) 832 for _, s := range newTask.Services { 833 newIDs[makeTaskServiceID(newTask.AllocID, newTask.Name, s, newTask.Canary)] = s 834 } 835 836 // Loop over existing Service IDs to see if they have been removed 837 for existingID, existingSvc := range existingIDs { 838 newSvc, ok := newIDs[existingID] 839 840 if !ok { 841 // Existing service entry removed 842 ops.deregServices = append(ops.deregServices, existingID) 843 for _, check := range existingSvc.Checks { 844 cid := makeCheckID(existingID, check) 845 ops.deregChecks = append(ops.deregChecks, cid) 846 847 // Unwatch watched checks 848 if check.TriggersRestarts() { 849 c.checkWatcher.Unwatch(cid) 850 } 851 } 852 continue 853 } 854 855 oldHash := existingSvc.Hash(old.AllocID, old.Name, old.Canary) 856 newHash := newSvc.Hash(newTask.AllocID, newTask.Name, newTask.Canary) 857 if oldHash == newHash { 858 // Service exists and hasn't changed, don't re-add it later 859 delete(newIDs, existingID) 860 } 861 862 // Service still exists so add it to the task's registration 863 sreg := &ServiceRegistration{ 864 serviceID: existingID, 865 checkIDs: make(map[string]struct{}, len(newSvc.Checks)), 866 } 867 taskReg.Services[existingID] = sreg 868 869 // See if any checks were updated 870 existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks)) 871 for _, check := range existingSvc.Checks { 872 existingChecks[makeCheckID(existingID, check)] = check 873 } 874 875 // Register new checks 876 for _, check := range newSvc.Checks { 877 checkID := makeCheckID(existingID, check) 878 if _, exists := existingChecks[checkID]; exists { 879 // Check is still required. Remove it from the map so it doesn't get 880 // deleted later. 881 delete(existingChecks, checkID) 882 sreg.checkIDs[checkID] = struct{}{} 883 } 884 885 // New check on an unchanged service; add them now 886 newCheckIDs, err := c.checkRegs(ops, existingID, newSvc, newTask) 887 if err != nil { 888 return err 889 } 890 891 for _, checkID := range newCheckIDs { 892 sreg.checkIDs[checkID] = struct{}{} 893 } 894 895 // Update all watched checks as CheckRestart fields aren't part of ID 896 if check.TriggersRestarts() { 897 c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter) 898 } 899 } 900 901 // Remove existing checks not in updated service 902 for cid, check := range existingChecks { 903 ops.deregChecks = append(ops.deregChecks, cid) 904 905 // Unwatch checks 906 if check.TriggersRestarts() { 907 c.checkWatcher.Unwatch(cid) 908 } 909 } 910 } 911 912 // Any remaining services should just be enqueued directly 913 for _, newSvc := range newIDs { 914 sreg, err := c.serviceRegs(ops, newSvc, newTask) 915 if err != nil { 916 return err 917 } 918 919 taskReg.Services[sreg.serviceID] = sreg 920 } 921 922 // Add the task to the allocation's registration 923 c.addTaskRegistration(newTask.AllocID, newTask.Name, taskReg) 924 925 c.commit(ops) 926 927 // Start watching checks. Done after service registrations are built 928 // since an error building them could leak watches. 929 for _, service := range newIDs { 930 serviceID := makeTaskServiceID(newTask.AllocID, newTask.Name, service, newTask.Canary) 931 for _, check := range service.Checks { 932 if check.TriggersRestarts() { 933 checkID := makeCheckID(serviceID, check) 934 c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter) 935 } 936 } 937 } 938 return nil 939 } 940 941 // RemoveTask from Consul. Removes all service entries and checks. 942 // 943 // Actual communication with Consul is done asynchronously (see Run). 944 func (c *ServiceClient) RemoveTask(task *TaskServices) { 945 ops := operations{} 946 947 for _, service := range task.Services { 948 id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary) 949 ops.deregServices = append(ops.deregServices, id) 950 951 for _, check := range service.Checks { 952 cid := makeCheckID(id, check) 953 ops.deregChecks = append(ops.deregChecks, cid) 954 955 if check.TriggersRestarts() { 956 c.checkWatcher.Unwatch(cid) 957 } 958 } 959 } 960 961 // Remove the task from the alloc's registrations 962 c.removeTaskRegistration(task.AllocID, task.Name) 963 964 // Now add them to the deregistration fields; main Run loop will update 965 c.commit(&ops) 966 } 967 968 // AllocRegistrations returns the registrations for the given allocation. If the 969 // allocation has no reservations, the response is a nil object. 970 func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) { 971 // Get the internal struct using the lock 972 c.allocRegistrationsLock.RLock() 973 regInternal, ok := c.allocRegistrations[allocID] 974 if !ok { 975 c.allocRegistrationsLock.RUnlock() 976 return nil, nil 977 } 978 979 // Copy so we don't expose internal structs 980 reg := regInternal.copy() 981 c.allocRegistrationsLock.RUnlock() 982 983 // Query the services and checks to populate the allocation registrations. 984 services, err := c.client.Services() 985 if err != nil { 986 return nil, err 987 } 988 989 checks, err := c.client.Checks() 990 if err != nil { 991 return nil, err 992 } 993 994 // Populate the object 995 for _, treg := range reg.Tasks { 996 for serviceID, sreg := range treg.Services { 997 sreg.Service = services[serviceID] 998 for checkID := range sreg.checkIDs { 999 if check, ok := checks[checkID]; ok { 1000 sreg.Checks = append(sreg.Checks, check) 1001 } 1002 } 1003 } 1004 } 1005 1006 return reg, nil 1007 } 1008 1009 // Shutdown the Consul client. Update running task registrations and deregister 1010 // agent from Consul. On first call blocks up to shutdownWait before giving up 1011 // on syncing operations. 1012 func (c *ServiceClient) Shutdown() error { 1013 // Serialize Shutdown calls with RegisterAgent to prevent leaking agent 1014 // entries. 1015 c.agentLock.Lock() 1016 defer c.agentLock.Unlock() 1017 select { 1018 case <-c.shutdownCh: 1019 return nil 1020 default: 1021 close(c.shutdownCh) 1022 } 1023 1024 // Give run loop time to sync, but don't block indefinitely 1025 deadline := time.After(c.shutdownWait) 1026 1027 // Wait for Run to finish any outstanding operations and exit 1028 select { 1029 case <-c.exitCh: 1030 case <-deadline: 1031 // Don't wait forever though 1032 } 1033 1034 // If Consul was never seen nothing could be written so exit early 1035 if !c.hasSeen() { 1036 return nil 1037 } 1038 1039 // Always attempt to deregister Nomad agent Consul entries, even if 1040 // deadline was reached 1041 for id := range c.agentServices { 1042 if err := c.client.ServiceDeregister(id); err != nil { 1043 c.logger.Error("failed deregistering agent service", "service_id", id, "error", err) 1044 } 1045 } 1046 for id := range c.agentChecks { 1047 if err := c.client.CheckDeregister(id); err != nil { 1048 c.logger.Error("failed deregistering agent check", "check_id", id, "error", err) 1049 } 1050 } 1051 1052 // Give script checks time to exit (no need to lock as Run() has exited) 1053 for _, h := range c.runningScripts { 1054 select { 1055 case <-h.wait(): 1056 case <-deadline: 1057 return fmt.Errorf("timed out waiting for script checks to run") 1058 } 1059 } 1060 return nil 1061 } 1062 1063 // addTaskRegistration adds the task registration for the given allocation. 1064 func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) { 1065 c.allocRegistrationsLock.Lock() 1066 defer c.allocRegistrationsLock.Unlock() 1067 1068 alloc, ok := c.allocRegistrations[allocID] 1069 if !ok { 1070 alloc = &AllocRegistration{ 1071 Tasks: make(map[string]*TaskRegistration), 1072 } 1073 c.allocRegistrations[allocID] = alloc 1074 } 1075 alloc.Tasks[taskName] = reg 1076 } 1077 1078 // removeTaskRegistration removes the task registration for the given allocation. 1079 func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) { 1080 c.allocRegistrationsLock.Lock() 1081 defer c.allocRegistrationsLock.Unlock() 1082 1083 alloc, ok := c.allocRegistrations[allocID] 1084 if !ok { 1085 return 1086 } 1087 1088 // Delete the task and if it is the last one also delete the alloc's 1089 // registration 1090 delete(alloc.Tasks, taskName) 1091 if len(alloc.Tasks) == 0 { 1092 delete(c.allocRegistrations, allocID) 1093 } 1094 } 1095 1096 // makeAgentServiceID creates a unique ID for identifying an agent service in 1097 // Consul. 1098 // 1099 // Agent service IDs are of the form: 1100 // 1101 // {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}) 1102 // Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4 1103 // Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l 1104 // 1105 func makeAgentServiceID(role string, service *structs.Service) string { 1106 return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "", false)) 1107 } 1108 1109 // makeTaskServiceID creates a unique ID for identifying a task service in 1110 // Consul. All structs.Service fields are included in the ID's hash except 1111 // Checks. This allows updates to merely compare IDs. 1112 // 1113 // Example Service ID: _nomad-task-b4e61df9-b095-d64e-f241-23860da1375f-redis-http 1114 func makeTaskServiceID(allocID, taskName string, service *structs.Service, canary bool) string { 1115 return fmt.Sprintf("%s%s-%s-%s", nomadTaskPrefix, allocID, taskName, service.Name) 1116 } 1117 1118 // makeCheckID creates a unique ID for a check. 1119 // 1120 // Example Check ID: _nomad-check-434ae42f9a57c5705344974ac38de2aee0ee089d 1121 func makeCheckID(serviceID string, check *structs.ServiceCheck) string { 1122 return fmt.Sprintf("%s%s", nomadCheckPrefix, check.Hash(serviceID)) 1123 } 1124 1125 // createCheckReg creates a Check that can be registered with Consul. 1126 // 1127 // Script checks simply have a TTL set and the caller is responsible for 1128 // running the script and heartbeating. 1129 func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) { 1130 chkReg := api.AgentCheckRegistration{ 1131 ID: checkID, 1132 Name: check.Name, 1133 ServiceID: serviceID, 1134 } 1135 chkReg.Status = check.InitialStatus 1136 chkReg.Timeout = check.Timeout.String() 1137 chkReg.Interval = check.Interval.String() 1138 1139 // Require an address for http or tcp checks 1140 if port == 0 && check.RequiresPort() { 1141 return nil, fmt.Errorf("%s checks require an address", check.Type) 1142 } 1143 1144 switch check.Type { 1145 case structs.ServiceCheckHTTP: 1146 proto := check.Protocol 1147 if proto == "" { 1148 proto = "http" 1149 } 1150 if check.TLSSkipVerify { 1151 chkReg.TLSSkipVerify = true 1152 } 1153 base := url.URL{ 1154 Scheme: proto, 1155 Host: net.JoinHostPort(host, strconv.Itoa(port)), 1156 } 1157 relative, err := url.Parse(check.Path) 1158 if err != nil { 1159 return nil, err 1160 } 1161 url := base.ResolveReference(relative) 1162 chkReg.HTTP = url.String() 1163 chkReg.Method = check.Method 1164 chkReg.Header = check.Header 1165 1166 case structs.ServiceCheckTCP: 1167 chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port)) 1168 1169 case structs.ServiceCheckScript: 1170 chkReg.TTL = (check.Interval + ttlCheckBuffer).String() 1171 // As of Consul 1.0.0 setting TTL and Interval is a 400 1172 chkReg.Interval = "" 1173 1174 case structs.ServiceCheckGRPC: 1175 chkReg.GRPC = fmt.Sprintf("%s/%s", net.JoinHostPort(host, strconv.Itoa(port)), check.GRPCService) 1176 chkReg.GRPCUseTLS = check.GRPCUseTLS 1177 if check.TLSSkipVerify { 1178 chkReg.TLSSkipVerify = true 1179 } 1180 1181 default: 1182 return nil, fmt.Errorf("check type %+q not valid", check.Type) 1183 } 1184 return &chkReg, nil 1185 } 1186 1187 // isNomadCheck returns true if the ID matches the pattern of a Nomad managed 1188 // check. 1189 func isNomadCheck(id string) bool { 1190 return strings.HasPrefix(id, nomadCheckPrefix) 1191 } 1192 1193 // isNomadService returns true if the ID matches the pattern of a Nomad managed 1194 // service (new or old formats). Agent services return false as independent 1195 // client and server agents may be running on the same machine. #2827 1196 func isNomadService(id string) bool { 1197 return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id) 1198 } 1199 1200 // isOldNomadService returns true if the ID matches an old pattern managed by 1201 // Nomad. 1202 // 1203 // Pre-0.7.1 task service IDs are of the form: 1204 // 1205 // {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} 1206 // Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 1207 // 1208 func isOldNomadService(id string) bool { 1209 const prefix = nomadServicePrefix + "-executor" 1210 return strings.HasPrefix(id, prefix) 1211 } 1212 1213 // getAddress returns the IP and port to use for a service or check. If no port 1214 // label is specified (an empty value), zero values are returned because no 1215 // address could be resolved. 1216 func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *drivers.DriverNetwork) (string, int, error) { 1217 switch addrMode { 1218 case structs.AddressModeAuto: 1219 if driverNet.Advertise() { 1220 addrMode = structs.AddressModeDriver 1221 } else { 1222 addrMode = structs.AddressModeHost 1223 } 1224 return getAddress(addrMode, portLabel, networks, driverNet) 1225 case structs.AddressModeHost: 1226 if portLabel == "" { 1227 if len(networks) != 1 { 1228 // If no networks are specified return zero 1229 // values. Consul will advertise the host IP 1230 // with no port. This is the pre-0.7.1 behavior 1231 // some people rely on. 1232 return "", 0, nil 1233 } 1234 1235 return networks[0].IP, 0, nil 1236 } 1237 1238 // Default path: use host ip:port 1239 ip, port := networks.Port(portLabel) 1240 if ip == "" && port <= 0 { 1241 return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel) 1242 } 1243 return ip, port, nil 1244 1245 case structs.AddressModeDriver: 1246 // Require a driver network if driver address mode is used 1247 if driverNet == nil { 1248 return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`) 1249 } 1250 1251 // If no port label is specified just return the IP 1252 if portLabel == "" { 1253 return driverNet.IP, 0, nil 1254 } 1255 1256 // If the port is a label, use the driver's port (not the host's) 1257 if port, ok := driverNet.PortMap[portLabel]; ok { 1258 return driverNet.IP, port, nil 1259 } 1260 1261 // If port isn't a label, try to parse it as a literal port number 1262 port, err := strconv.Atoi(portLabel) 1263 if err != nil { 1264 // Don't include Atoi error message as user likely 1265 // never intended it to be a numeric and it creates a 1266 // confusing error message 1267 return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel) 1268 } 1269 if port <= 0 { 1270 return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel) 1271 } 1272 1273 return driverNet.IP, port, nil 1274 1275 default: 1276 // Shouldn't happen due to validation, but enforce invariants 1277 return "", 0, fmt.Errorf("invalid address mode %q", addrMode) 1278 } 1279 }