// Scraped from:
// github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/command/agent/consul/client.go

package consul

import (
	"context"
	"fmt"
	"log"
	"net"
	"net/url"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	metrics "github.com/armon/go-metrics"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/client/driver"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// nomadServicePrefix is the prefix that scopes all Nomad registered
	// services (both agent and task entries).
	nomadServicePrefix = "_nomad"

	// nomadTaskPrefix is the prefix that scopes Nomad registered services
	// for tasks.
	nomadTaskPrefix = nomadServicePrefix + "-task-"

	// defaultRetryInterval is how quickly to retry syncing services and
	// checks to Consul when an error occurs. The Run loop multiplies it by
	// the number of consecutive failures, capped at the max retry interval.
	defaultRetryInterval = time.Second

	// defaultMaxRetryInterval is the default upper bound on the retry
	// backoff.
	defaultMaxRetryInterval = 30 * time.Second

	// ttlCheckBuffer is the slack Nomad is given to report a script check's
	// result to Consul: createCheckReg adds it to the check's interval to
	// form the TTL registered with Consul.
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services
	ServiceTagSerf = "serf"
)

// CatalogAPI is the consul/api.Catalog API used by Nomad.
type CatalogAPI interface {
	Datacenters() ([]string, error)
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}

// AgentAPI is the consul/api.Agent API used by Nomad. It is the interface the
// ServiceClient talks to, so any implementation of these methods can stand in
// for a real Consul agent.
type AgentAPI interface {
	// Services and Checks list what the Consul agent currently has
	// registered, keyed by ID; sync() diffs them against local state.
	Services() (map[string]*api.AgentService, error)
	Checks() (map[string]*api.AgentCheck, error)

	// Registration and deregistration calls issued by sync() and Shutdown().
	CheckRegister(check *api.AgentCheckRegistration) error
	CheckDeregister(checkID string) error
	ServiceRegister(service *api.AgentServiceRegistration) error
	ServiceDeregister(serviceID string) error

	// NOTE(review): UpdateTTL is not called in this part of the file;
	// presumably script checks use it to heartbeat their TTL -- confirm.
	UpdateTTL(id, output, status string) error
}

// operations are submitted to the main loop via commit() for synchronizing
// with Consul. merge() folds them into the ServiceClient's state maps.
type operations struct {
	// Entries to add (or overwrite), keyed by their ID when merged.
	regServices []*api.AgentServiceRegistration
	regChecks   []*api.AgentCheckRegistration
	scripts     []*scriptCheck

	// IDs of entries to drop from local state.
	deregServices []string
	deregChecks   []string
}

// AllocRegistration holds the status of services registered for a particular
// allocation, grouped by task.
91 type AllocRegistration struct { 92 // Tasks maps the name of a task to its registered services and checks 93 Tasks map[string]*TaskRegistration 94 } 95 96 func (a *AllocRegistration) copy() *AllocRegistration { 97 c := &AllocRegistration{ 98 Tasks: make(map[string]*TaskRegistration, len(a.Tasks)), 99 } 100 101 for k, v := range a.Tasks { 102 c.Tasks[k] = v.copy() 103 } 104 105 return c 106 } 107 108 // NumServices returns the number of registered services 109 func (a *AllocRegistration) NumServices() int { 110 if a == nil { 111 return 0 112 } 113 114 total := 0 115 for _, treg := range a.Tasks { 116 for _, sreg := range treg.Services { 117 if sreg.Service != nil { 118 total++ 119 } 120 } 121 } 122 123 return total 124 } 125 126 // NumChecks returns the number of registered checks 127 func (a *AllocRegistration) NumChecks() int { 128 if a == nil { 129 return 0 130 } 131 132 total := 0 133 for _, treg := range a.Tasks { 134 for _, sreg := range treg.Services { 135 total += len(sreg.Checks) 136 } 137 } 138 139 return total 140 } 141 142 // TaskRegistration holds the status of services registered for a particular 143 // task. 144 type TaskRegistration struct { 145 Services map[string]*ServiceRegistration 146 } 147 148 func (t *TaskRegistration) copy() *TaskRegistration { 149 c := &TaskRegistration{ 150 Services: make(map[string]*ServiceRegistration, len(t.Services)), 151 } 152 153 for k, v := range t.Services { 154 c.Services[k] = v.copy() 155 } 156 157 return c 158 } 159 160 // ServiceRegistration holds the status of a registered Consul Service and its 161 // Checks. 162 type ServiceRegistration struct { 163 // serviceID and checkIDs are internal fields that track just the IDs of the 164 // services/checks registered in Consul. It is used to materialize the other 165 // fields when queried. 166 serviceID string 167 checkIDs map[string]struct{} 168 169 // Service is the AgentService registered in Consul. 
170 Service *api.AgentService 171 172 // Checks is the status of the registered checks. 173 Checks []*api.AgentCheck 174 } 175 176 func (s *ServiceRegistration) copy() *ServiceRegistration { 177 // Copy does not copy the external fields but only the internal fields. This 178 // is so that the caller of AllocRegistrations can not access the internal 179 // fields and that method uses these fields to populate the external fields. 180 return &ServiceRegistration{ 181 serviceID: s.serviceID, 182 checkIDs: helper.CopyMapStringStruct(s.checkIDs), 183 } 184 } 185 186 // ServiceClient handles task and agent service registration with Consul. 187 type ServiceClient struct { 188 client AgentAPI 189 logger *log.Logger 190 retryInterval time.Duration 191 maxRetryInterval time.Duration 192 193 // skipVerifySupport is true if the local Consul agent suppots TLSSkipVerify 194 skipVerifySupport bool 195 196 // exitCh is closed when the main Run loop exits 197 exitCh chan struct{} 198 199 // shutdownCh is closed when the client should shutdown 200 shutdownCh chan struct{} 201 202 // shutdownWait is how long Shutdown() blocks waiting for the final 203 // sync() to finish. Defaults to defaultShutdownWait 204 shutdownWait time.Duration 205 206 opCh chan *operations 207 208 services map[string]*api.AgentServiceRegistration 209 checks map[string]*api.AgentCheckRegistration 210 scripts map[string]*scriptCheck 211 runningScripts map[string]*scriptHandle 212 213 // allocRegistrations stores the services and checks that are registered 214 // with Consul by allocation ID. 215 allocRegistrations map[string]*AllocRegistration 216 allocRegistrationsLock sync.RWMutex 217 218 // agent services and checks record entries for the agent itself which 219 // should be removed on shutdown 220 agentServices map[string]struct{} 221 agentChecks map[string]struct{} 222 agentLock sync.Mutex 223 224 // seen is 1 if Consul has ever been seen; otherise 0. Accessed with 225 // atomics. 
226 seen int32 227 228 // checkWatcher restarts checks that are unhealthy. 229 checkWatcher *checkWatcher 230 } 231 232 // NewServiceClient creates a new Consul ServiceClient from an existing Consul API 233 // Client and logger. 234 func NewServiceClient(consulClient AgentAPI, skipVerifySupport bool, logger *log.Logger) *ServiceClient { 235 return &ServiceClient{ 236 client: consulClient, 237 skipVerifySupport: skipVerifySupport, 238 logger: logger, 239 retryInterval: defaultRetryInterval, 240 maxRetryInterval: defaultMaxRetryInterval, 241 exitCh: make(chan struct{}), 242 shutdownCh: make(chan struct{}), 243 shutdownWait: defaultShutdownWait, 244 opCh: make(chan *operations, 8), 245 services: make(map[string]*api.AgentServiceRegistration), 246 checks: make(map[string]*api.AgentCheckRegistration), 247 scripts: make(map[string]*scriptCheck), 248 runningScripts: make(map[string]*scriptHandle), 249 allocRegistrations: make(map[string]*AllocRegistration), 250 agentServices: make(map[string]struct{}), 251 agentChecks: make(map[string]struct{}), 252 checkWatcher: newCheckWatcher(logger, consulClient), 253 } 254 } 255 256 // seen is used by markSeen and hasSeen 257 const seen = 1 258 259 // markSeen marks Consul as having been seen (meaning at least one operation 260 // has succeeded). 261 func (c *ServiceClient) markSeen() { 262 atomic.StoreInt32(&c.seen, seen) 263 } 264 265 // hasSeen returns true if any Consul operation has ever succeeded. Useful to 266 // squelch errors if Consul isn't running. 267 func (c *ServiceClient) hasSeen() bool { 268 return atomic.LoadInt32(&c.seen) == seen 269 } 270 271 // Run the Consul main loop which retries operations against Consul. It should 272 // be called exactly once. 
273 func (c *ServiceClient) Run() { 274 defer close(c.exitCh) 275 276 // start checkWatcher 277 ctx, cancelWatcher := context.WithCancel(context.Background()) 278 defer cancelWatcher() 279 go c.checkWatcher.Run(ctx) 280 281 retryTimer := time.NewTimer(0) 282 <-retryTimer.C // disabled by default 283 failures := 0 284 for { 285 select { 286 case <-retryTimer.C: 287 case <-c.shutdownCh: 288 cancelWatcher() 289 case ops := <-c.opCh: 290 c.merge(ops) 291 } 292 293 if err := c.sync(); err != nil { 294 if failures == 0 { 295 // Log on the first failure 296 c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err) 297 } else if failures%10 == 0 { 298 // Log every 10th consecutive failure 299 c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err) 300 } 301 302 failures++ 303 if !retryTimer.Stop() { 304 // Timer already expired, since the timer may 305 // or may not have been read in the select{} 306 // above, conditionally receive on it 307 select { 308 case <-retryTimer.C: 309 default: 310 } 311 } 312 backoff := c.retryInterval * time.Duration(failures) 313 if backoff > c.maxRetryInterval { 314 backoff = c.maxRetryInterval 315 } 316 retryTimer.Reset(backoff) 317 } else { 318 if failures > 0 { 319 c.logger.Printf("[INFO] consul.sync: successfully updated services in Consul") 320 failures = 0 321 } 322 } 323 324 select { 325 case <-c.shutdownCh: 326 // Exit only after sync'ing all outstanding operations 327 if len(c.opCh) > 0 { 328 for len(c.opCh) > 0 { 329 c.merge(<-c.opCh) 330 } 331 continue 332 } 333 return 334 default: 335 } 336 337 } 338 } 339 340 // commit operations unless already shutting down. 
341 func (c *ServiceClient) commit(ops *operations) { 342 select { 343 case c.opCh <- ops: 344 case <-c.shutdownCh: 345 } 346 } 347 348 // merge registrations into state map prior to sync'ing with Consul 349 func (c *ServiceClient) merge(ops *operations) { 350 for _, s := range ops.regServices { 351 c.services[s.ID] = s 352 } 353 for _, check := range ops.regChecks { 354 c.checks[check.ID] = check 355 } 356 for _, s := range ops.scripts { 357 c.scripts[s.id] = s 358 } 359 for _, sid := range ops.deregServices { 360 delete(c.services, sid) 361 } 362 for _, cid := range ops.deregChecks { 363 if script, ok := c.runningScripts[cid]; ok { 364 script.cancel() 365 delete(c.scripts, cid) 366 delete(c.runningScripts, cid) 367 } 368 delete(c.checks, cid) 369 } 370 metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services))) 371 metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks))) 372 metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts))) 373 } 374 375 // sync enqueued operations. 
376 func (c *ServiceClient) sync() error { 377 sreg, creg, sdereg, cdereg := 0, 0, 0, 0 378 379 consulServices, err := c.client.Services() 380 if err != nil { 381 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 382 return fmt.Errorf("error querying Consul services: %v", err) 383 } 384 385 consulChecks, err := c.client.Checks() 386 if err != nil { 387 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 388 return fmt.Errorf("error querying Consul checks: %v", err) 389 } 390 391 // Remove Nomad services in Consul but unknown locally 392 for id := range consulServices { 393 if _, ok := c.services[id]; ok { 394 // Known service, skip 395 continue 396 } 397 if !isNomadService(id) { 398 // Not managed by Nomad, skip 399 continue 400 } 401 402 // Unknown Nomad managed service; kill 403 if err := c.client.ServiceDeregister(id); err != nil { 404 if isOldNomadService(id) { 405 // Don't hard-fail on old entries. See #3620 406 continue 407 } 408 409 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 410 return err 411 } 412 sdereg++ 413 metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1) 414 } 415 416 // Add Nomad services missing from Consul 417 for id, locals := range c.services { 418 if _, ok := consulServices[id]; !ok { 419 if err = c.client.ServiceRegister(locals); err != nil { 420 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 421 return err 422 } 423 sreg++ 424 metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1) 425 } 426 } 427 428 // Remove Nomad checks in Consul but unknown locally 429 for id, check := range consulChecks { 430 if _, ok := c.checks[id]; ok { 431 // Known check, leave it 432 continue 433 } 434 if !isNomadService(check.ServiceID) { 435 // Service not managed by Nomad, skip 436 continue 437 } 438 439 // Unknown Nomad managed check; remove 440 if err := c.client.CheckDeregister(id); err != nil { 441 if 
isOldNomadService(check.ServiceID) { 442 // Don't hard-fail on old entries. 443 continue 444 } 445 446 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 447 return err 448 } 449 cdereg++ 450 metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1) 451 } 452 453 // Add Nomad checks missing from Consul 454 for id, check := range c.checks { 455 if _, ok := consulChecks[id]; ok { 456 // Already in Consul; skipping 457 continue 458 } 459 460 if err := c.client.CheckRegister(check); err != nil { 461 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 462 return err 463 } 464 creg++ 465 metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1) 466 467 // Handle starting scripts 468 if script, ok := c.scripts[id]; ok { 469 // If it's already running, cancel and replace 470 if oldScript, running := c.runningScripts[id]; running { 471 oldScript.cancel() 472 } 473 // Start and store the handle 474 c.runningScripts[id] = script.run() 475 } 476 } 477 478 // A Consul operation has succeeded, mark Consul as having been seen 479 c.markSeen() 480 481 c.logger.Printf("[DEBUG] consul.sync: registered %d services, %d checks; deregistered %d services, %d checks", 482 sreg, creg, sdereg, cdereg) 483 return nil 484 } 485 486 // RegisterAgent registers Nomad agents (client or server). The 487 // Service.PortLabel should be a literal port to be parsed with SplitHostPort. 488 // Script checks are not supported and will return an error. Registration is 489 // asynchronous. 490 // 491 // Agents will be deregistered when Shutdown is called. 492 func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error { 493 ops := operations{} 494 495 for _, service := range services { 496 id := makeAgentServiceID(role, service) 497 498 // Unlike tasks, agents don't use port labels. Agent ports are 499 // stored directly in the PortLabel. 
500 host, rawport, err := net.SplitHostPort(service.PortLabel) 501 if err != nil { 502 return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err) 503 } 504 port, err := strconv.Atoi(rawport) 505 if err != nil { 506 return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err) 507 } 508 serviceReg := &api.AgentServiceRegistration{ 509 ID: id, 510 Name: service.Name, 511 Tags: service.Tags, 512 Address: host, 513 Port: port, 514 } 515 ops.regServices = append(ops.regServices, serviceReg) 516 517 for _, check := range service.Checks { 518 checkID := makeCheckID(id, check) 519 if check.Type == structs.ServiceCheckScript { 520 return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name) 521 } 522 checkHost, checkPort := serviceReg.Address, serviceReg.Port 523 if check.PortLabel != "" { 524 // Unlike tasks, agents don't use port labels. Agent ports are 525 // stored directly in the PortLabel. 
526 host, rawport, err := net.SplitHostPort(check.PortLabel) 527 if err != nil { 528 return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err) 529 } 530 port, err := strconv.Atoi(rawport) 531 if err != nil { 532 return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err) 533 } 534 checkHost, checkPort = host, port 535 } 536 checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort) 537 if err != nil { 538 return fmt.Errorf("failed to add check %q: %v", check.Name, err) 539 } 540 ops.regChecks = append(ops.regChecks, checkReg) 541 } 542 } 543 544 // Don't bother committing agent checks if we're already shutting down 545 c.agentLock.Lock() 546 defer c.agentLock.Unlock() 547 select { 548 case <-c.shutdownCh: 549 return nil 550 default: 551 } 552 553 // Now add them to the registration queue 554 c.commit(&ops) 555 556 // Record IDs for deregistering on shutdown 557 for _, id := range ops.regServices { 558 c.agentServices[id.ID] = struct{}{} 559 } 560 for _, id := range ops.regChecks { 561 c.agentChecks[id.ID] = struct{}{} 562 } 563 return nil 564 } 565 566 // serviceRegs creates service registrations, check registrations, and script 567 // checks from a service. It returns a service registration object with the 568 // service and check IDs populated. 
569 func (c *ServiceClient) serviceRegs(ops *operations, allocID string, service *structs.Service, 570 task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) (*ServiceRegistration, error) { 571 572 // Get the services ID 573 id := makeTaskServiceID(allocID, task.Name, service) 574 sreg := &ServiceRegistration{ 575 serviceID: id, 576 checkIDs: make(map[string]struct{}, len(service.Checks)), 577 } 578 579 // Service address modes default to auto 580 addrMode := service.AddressMode 581 if addrMode == "" { 582 addrMode = structs.AddressModeAuto 583 } 584 585 // Determine the address to advertise based on the mode 586 ip, port, err := getAddress(addrMode, service.PortLabel, task.Resources.Networks, net) 587 if err != nil { 588 return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err) 589 } 590 591 // Build the Consul Service registration request 592 serviceReg := &api.AgentServiceRegistration{ 593 ID: id, 594 Name: service.Name, 595 Tags: make([]string, len(service.Tags)), 596 Address: ip, 597 Port: port, 598 } 599 // copy isn't strictly necessary but can avoid bugs especially 600 // with tests that may reuse Tasks 601 copy(serviceReg.Tags, service.Tags) 602 ops.regServices = append(ops.regServices, serviceReg) 603 604 // Build the check registrations 605 checkIDs, err := c.checkRegs(ops, allocID, id, service, task, exec, net) 606 if err != nil { 607 return nil, err 608 } 609 for _, cid := range checkIDs { 610 sreg.checkIDs[cid] = struct{}{} 611 } 612 return sreg, nil 613 } 614 615 // checkRegs registers the checks for the given service and returns the 616 // registered check ids. 
617 func (c *ServiceClient) checkRegs(ops *operations, allocID, serviceID string, service *structs.Service, 618 task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) ([]string, error) { 619 620 // Fast path 621 numChecks := len(service.Checks) 622 if numChecks == 0 { 623 return nil, nil 624 } 625 626 checkIDs := make([]string, 0, numChecks) 627 for _, check := range service.Checks { 628 if check.TLSSkipVerify && !c.skipVerifySupport { 629 c.logger.Printf("[WARN] consul.sync: skipping check %q for task %q alloc %q because Consul doesn't support tls_skip_verify. Please upgrade to Consul >= 0.7.2.", 630 check.Name, task.Name, allocID) 631 continue 632 } 633 checkID := makeCheckID(serviceID, check) 634 checkIDs = append(checkIDs, checkID) 635 if check.Type == structs.ServiceCheckScript { 636 if exec == nil { 637 return nil, fmt.Errorf("driver doesn't support script checks") 638 } 639 ops.scripts = append(ops.scripts, newScriptCheck( 640 allocID, task.Name, checkID, check, exec, c.client, c.logger, c.shutdownCh)) 641 642 // Skip getAddress for script checks 643 checkReg, err := createCheckReg(serviceID, checkID, check, "", 0) 644 if err != nil { 645 return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err) 646 } 647 ops.regChecks = append(ops.regChecks, checkReg) 648 continue 649 } 650 651 // Default to the service's port but allow check to override 652 portLabel := check.PortLabel 653 if portLabel == "" { 654 // Default to the service's port label 655 portLabel = service.PortLabel 656 } 657 658 // Checks address mode defaults to host for pre-#3380 backward compat 659 addrMode := check.AddressMode 660 if addrMode == "" { 661 addrMode = structs.AddressModeHost 662 } 663 664 ip, port, err := getAddress(addrMode, portLabel, task.Resources.Networks, net) 665 if err != nil { 666 return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err) 667 } 668 669 checkReg, err := createCheckReg(serviceID, checkID, check, 
ip, port) 670 if err != nil { 671 return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err) 672 } 673 ops.regChecks = append(ops.regChecks, checkReg) 674 } 675 return checkIDs, nil 676 } 677 678 // RegisterTask with Consul. Adds all service entries and checks to Consul. If 679 // exec is nil and a script check exists an error is returned. 680 // 681 // If the service IP is set it used as the address in the service registration. 682 // Checks will always use the IP from the Task struct (host's IP). 683 // 684 // Actual communication with Consul is done asynchrously (see Run). 685 func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { 686 // Fast path 687 numServices := len(task.Services) 688 if numServices == 0 { 689 return nil 690 } 691 692 t := new(TaskRegistration) 693 t.Services = make(map[string]*ServiceRegistration, numServices) 694 695 ops := &operations{} 696 for _, service := range task.Services { 697 sreg, err := c.serviceRegs(ops, allocID, service, task, exec, net) 698 if err != nil { 699 return err 700 } 701 t.Services[sreg.serviceID] = sreg 702 } 703 704 // Add the task to the allocation's registration 705 c.addTaskRegistration(allocID, task.Name, t) 706 707 c.commit(ops) 708 709 // Start watching checks. Done after service registrations are built 710 // since an error building them could leak watches. 711 for _, service := range task.Services { 712 serviceID := makeTaskServiceID(allocID, task.Name, service) 713 for _, check := range service.Checks { 714 if check.TriggersRestarts() { 715 checkID := makeCheckID(serviceID, check) 716 c.checkWatcher.Watch(allocID, task.Name, checkID, check, restarter) 717 } 718 } 719 } 720 return nil 721 } 722 723 // UpdateTask in Consul. Does not alter the service if only checks have 724 // changed. 725 // 726 // DriverNetwork must not change between invocations for the same allocation. 
727 func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { 728 ops := &operations{} 729 730 taskReg := new(TaskRegistration) 731 taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services)) 732 733 existingIDs := make(map[string]*structs.Service, len(existing.Services)) 734 for _, s := range existing.Services { 735 existingIDs[makeTaskServiceID(allocID, existing.Name, s)] = s 736 } 737 newIDs := make(map[string]*structs.Service, len(newTask.Services)) 738 for _, s := range newTask.Services { 739 newIDs[makeTaskServiceID(allocID, newTask.Name, s)] = s 740 } 741 742 // Loop over existing Service IDs to see if they have been removed or 743 // updated. 744 for existingID, existingSvc := range existingIDs { 745 newSvc, ok := newIDs[existingID] 746 if !ok { 747 // Existing service entry removed 748 ops.deregServices = append(ops.deregServices, existingID) 749 for _, check := range existingSvc.Checks { 750 cid := makeCheckID(existingID, check) 751 ops.deregChecks = append(ops.deregChecks, cid) 752 753 // Unwatch watched checks 754 if check.TriggersRestarts() { 755 c.checkWatcher.Unwatch(cid) 756 } 757 } 758 continue 759 } 760 761 // Service exists and hasn't changed, don't re-add it later 762 delete(newIDs, existingID) 763 764 // Service still exists so add it to the task's registration 765 sreg := &ServiceRegistration{ 766 serviceID: existingID, 767 checkIDs: make(map[string]struct{}, len(newSvc.Checks)), 768 } 769 taskReg.Services[existingID] = sreg 770 771 // See if any checks were updated 772 existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks)) 773 for _, check := range existingSvc.Checks { 774 existingChecks[makeCheckID(existingID, check)] = check 775 } 776 777 // Register new checks 778 for _, check := range newSvc.Checks { 779 checkID := makeCheckID(existingID, check) 780 if _, exists := 
existingChecks[checkID]; exists { 781 // Check exists, so don't remove it 782 delete(existingChecks, checkID) 783 sreg.checkIDs[checkID] = struct{}{} 784 } 785 786 // New check on an unchanged service; add them now 787 newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net) 788 if err != nil { 789 return err 790 } 791 792 for _, checkID := range newCheckIDs { 793 sreg.checkIDs[checkID] = struct{}{} 794 795 } 796 797 // Update all watched checks as CheckRestart fields aren't part of ID 798 if check.TriggersRestarts() { 799 c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter) 800 } 801 } 802 803 // Remove existing checks not in updated service 804 for cid, check := range existingChecks { 805 ops.deregChecks = append(ops.deregChecks, cid) 806 807 // Unwatch checks 808 if check.TriggersRestarts() { 809 c.checkWatcher.Unwatch(cid) 810 } 811 } 812 } 813 814 // Any remaining services should just be enqueued directly 815 for _, newSvc := range newIDs { 816 sreg, err := c.serviceRegs(ops, allocID, newSvc, newTask, exec, net) 817 if err != nil { 818 return err 819 } 820 821 taskReg.Services[sreg.serviceID] = sreg 822 } 823 824 // Add the task to the allocation's registration 825 c.addTaskRegistration(allocID, newTask.Name, taskReg) 826 827 c.commit(ops) 828 829 // Start watching checks. Done after service registrations are built 830 // since an error building them could leak watches. 831 for _, service := range newIDs { 832 serviceID := makeTaskServiceID(allocID, newTask.Name, service) 833 for _, check := range service.Checks { 834 if check.TriggersRestarts() { 835 checkID := makeCheckID(serviceID, check) 836 c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter) 837 } 838 } 839 } 840 return nil 841 } 842 843 // RemoveTask from Consul. Removes all service entries and checks. 844 // 845 // Actual communication with Consul is done asynchrously (see Run). 
846 func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) { 847 ops := operations{} 848 849 for _, service := range task.Services { 850 id := makeTaskServiceID(allocID, task.Name, service) 851 ops.deregServices = append(ops.deregServices, id) 852 853 for _, check := range service.Checks { 854 cid := makeCheckID(id, check) 855 ops.deregChecks = append(ops.deregChecks, cid) 856 857 if check.TriggersRestarts() { 858 c.checkWatcher.Unwatch(cid) 859 } 860 } 861 } 862 863 // Remove the task from the alloc's registrations 864 c.removeTaskRegistration(allocID, task.Name) 865 866 // Now add them to the deregistration fields; main Run loop will update 867 c.commit(&ops) 868 } 869 870 // AllocRegistrations returns the registrations for the given allocation. If the 871 // allocation has no reservations, the response is a nil object. 872 func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) { 873 // Get the internal struct using the lock 874 c.allocRegistrationsLock.RLock() 875 regInternal, ok := c.allocRegistrations[allocID] 876 if !ok { 877 c.allocRegistrationsLock.RUnlock() 878 return nil, nil 879 } 880 881 // Copy so we don't expose internal structs 882 reg := regInternal.copy() 883 c.allocRegistrationsLock.RUnlock() 884 885 // Query the services and checks to populate the allocation registrations. 886 services, err := c.client.Services() 887 if err != nil { 888 return nil, err 889 } 890 891 checks, err := c.client.Checks() 892 if err != nil { 893 return nil, err 894 } 895 896 // Populate the object 897 for _, treg := range reg.Tasks { 898 for serviceID, sreg := range treg.Services { 899 sreg.Service = services[serviceID] 900 for checkID := range sreg.checkIDs { 901 if check, ok := checks[checkID]; ok { 902 sreg.Checks = append(sreg.Checks, check) 903 } 904 } 905 } 906 } 907 908 return reg, nil 909 } 910 911 // Shutdown the Consul client. Update running task registations and deregister 912 // agent from Consul. 
On first call blocks up to shutdownWait before giving up 913 // on syncing operations. 914 func (c *ServiceClient) Shutdown() error { 915 // Serialize Shutdown calls with RegisterAgent to prevent leaking agent 916 // entries. 917 c.agentLock.Lock() 918 defer c.agentLock.Unlock() 919 select { 920 case <-c.shutdownCh: 921 return nil 922 default: 923 close(c.shutdownCh) 924 } 925 926 // Give run loop time to sync, but don't block indefinitely 927 deadline := time.After(c.shutdownWait) 928 929 // Wait for Run to finish any outstanding operations and exit 930 select { 931 case <-c.exitCh: 932 case <-deadline: 933 // Don't wait forever though 934 } 935 936 // If Consul was never seen nothing could be written so exit early 937 if !c.hasSeen() { 938 return nil 939 } 940 941 // Always attempt to deregister Nomad agent Consul entries, even if 942 // deadline was reached 943 for id := range c.agentServices { 944 if err := c.client.ServiceDeregister(id); err != nil { 945 c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err) 946 } 947 } 948 for id := range c.agentChecks { 949 if err := c.client.CheckDeregister(id); err != nil { 950 c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err) 951 } 952 } 953 954 // Give script checks time to exit (no need to lock as Run() has exited) 955 for _, h := range c.runningScripts { 956 select { 957 case <-h.wait(): 958 case <-deadline: 959 return fmt.Errorf("timed out waiting for script checks to run") 960 } 961 } 962 return nil 963 } 964 965 // addTaskRegistration adds the task registration for the given allocation. 
966 func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) { 967 c.allocRegistrationsLock.Lock() 968 defer c.allocRegistrationsLock.Unlock() 969 970 alloc, ok := c.allocRegistrations[allocID] 971 if !ok { 972 alloc = &AllocRegistration{ 973 Tasks: make(map[string]*TaskRegistration), 974 } 975 c.allocRegistrations[allocID] = alloc 976 } 977 alloc.Tasks[taskName] = reg 978 } 979 980 // removeTaskRegistration removes the task registration for the given allocation. 981 func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) { 982 c.allocRegistrationsLock.Lock() 983 defer c.allocRegistrationsLock.Unlock() 984 985 alloc, ok := c.allocRegistrations[allocID] 986 if !ok { 987 return 988 } 989 990 // Delete the task and if it is the last one also delete the alloc's 991 // registration 992 delete(alloc.Tasks, taskName) 993 if len(alloc.Tasks) == 0 { 994 delete(c.allocRegistrations, allocID) 995 } 996 } 997 998 // makeAgentServiceID creates a unique ID for identifying an agent service in 999 // Consul. 1000 // 1001 // Agent service IDs are of the form: 1002 // 1003 // {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}) 1004 // Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4 1005 // Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l 1006 // 1007 func makeAgentServiceID(role string, service *structs.Service) string { 1008 return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "")) 1009 } 1010 1011 // makeTaskServiceID creates a unique ID for identifying a task service in 1012 // Consul. All structs.Service fields are included in the ID's hash except 1013 // Checks. This allows updates to merely compare IDs. 
1014 // 1015 // Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH 1016 func makeTaskServiceID(allocID, taskName string, service *structs.Service) string { 1017 return nomadTaskPrefix + service.Hash(allocID, taskName) 1018 } 1019 1020 // makeCheckID creates a unique ID for a check. 1021 func makeCheckID(serviceID string, check *structs.ServiceCheck) string { 1022 return check.Hash(serviceID) 1023 } 1024 1025 // createCheckReg creates a Check that can be registered with Consul. 1026 // 1027 // Script checks simply have a TTL set and the caller is responsible for 1028 // running the script and heartbeating. 1029 func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) { 1030 chkReg := api.AgentCheckRegistration{ 1031 ID: checkID, 1032 Name: check.Name, 1033 ServiceID: serviceID, 1034 } 1035 chkReg.Status = check.InitialStatus 1036 chkReg.Timeout = check.Timeout.String() 1037 chkReg.Interval = check.Interval.String() 1038 1039 // Require an address for http or tcp checks 1040 if port == 0 && check.RequiresPort() { 1041 return nil, fmt.Errorf("%s checks require an address", check.Type) 1042 } 1043 1044 switch check.Type { 1045 case structs.ServiceCheckHTTP: 1046 proto := check.Protocol 1047 if proto == "" { 1048 proto = "http" 1049 } 1050 if check.TLSSkipVerify { 1051 chkReg.TLSSkipVerify = true 1052 } 1053 base := url.URL{ 1054 Scheme: proto, 1055 Host: net.JoinHostPort(host, strconv.Itoa(port)), 1056 } 1057 relative, err := url.Parse(check.Path) 1058 if err != nil { 1059 return nil, err 1060 } 1061 url := base.ResolveReference(relative) 1062 chkReg.HTTP = url.String() 1063 chkReg.Method = check.Method 1064 chkReg.Header = check.Header 1065 case structs.ServiceCheckTCP: 1066 chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port)) 1067 case structs.ServiceCheckScript: 1068 chkReg.TTL = (check.Interval + ttlCheckBuffer).String() 1069 // As of Consul 1.0.0 setting TTL and 
Interval is a 400 1070 chkReg.Interval = "" 1071 default: 1072 return nil, fmt.Errorf("check type %+q not valid", check.Type) 1073 } 1074 return &chkReg, nil 1075 } 1076 1077 // isNomadService returns true if the ID matches the pattern of a Nomad managed 1078 // service (new or old formats). Agent services return false as independent 1079 // client and server agents may be running on the same machine. #2827 1080 func isNomadService(id string) bool { 1081 return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id) 1082 } 1083 1084 // isOldNomadService returns true if the ID matches an old pattern managed by 1085 // Nomad. 1086 // 1087 // Pre-0.7.1 task service IDs are of the form: 1088 // 1089 // {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} 1090 // Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 1091 // 1092 func isOldNomadService(id string) bool { 1093 const prefix = nomadServicePrefix + "-executor" 1094 return strings.HasPrefix(id, prefix) 1095 } 1096 1097 // getAddress returns the IP and port to use for a service or check. If no port 1098 // label is specified (an empty value), zero values are returned because no 1099 // address could be resolved. 1100 func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *cstructs.DriverNetwork) (string, int, error) { 1101 switch addrMode { 1102 case structs.AddressModeAuto: 1103 if driverNet.Advertise() { 1104 addrMode = structs.AddressModeDriver 1105 } else { 1106 addrMode = structs.AddressModeHost 1107 } 1108 return getAddress(addrMode, portLabel, networks, driverNet) 1109 case structs.AddressModeHost: 1110 if portLabel == "" { 1111 if len(networks) != 1 { 1112 // If no networks are specified return zero 1113 // values. Consul will advertise the host IP 1114 // with no port. This is the pre-0.7.1 behavior 1115 // some people rely on. 
1116 return "", 0, nil 1117 } 1118 1119 return networks[0].IP, 0, nil 1120 } 1121 1122 // Default path: use host ip:port 1123 ip, port := networks.Port(portLabel) 1124 if ip == "" && port <= 0 { 1125 return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel) 1126 } 1127 return ip, port, nil 1128 1129 case structs.AddressModeDriver: 1130 // Require a driver network if driver address mode is used 1131 if driverNet == nil { 1132 return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`) 1133 } 1134 1135 // If no port label is specified just return the IP 1136 if portLabel == "" { 1137 return driverNet.IP, 0, nil 1138 } 1139 1140 // If the port is a label, use the driver's port (not the host's) 1141 if port, ok := driverNet.PortMap[portLabel]; ok { 1142 return driverNet.IP, port, nil 1143 } 1144 1145 // If port isn't a label, try to parse it as a literal port number 1146 port, err := strconv.Atoi(portLabel) 1147 if err != nil { 1148 // Don't include Atoi error message as user likely 1149 // never intended it to be a numeric and it creates a 1150 // confusing error message 1151 return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel) 1152 } 1153 if port <= 0 { 1154 return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel) 1155 } 1156 1157 return driverNet.IP, port, nil 1158 1159 default: 1160 // Shouldn't happen due to validation, but enforce invariants 1161 return "", 0, fmt.Errorf("invalid address mode %q", addrMode) 1162 } 1163 }