github.com/djenriquez/nomad-1@v0.8.1/command/agent/consul/client.go (about) 1 package consul 2 3 import ( 4 "context" 5 "fmt" 6 "log" 7 "net" 8 "net/url" 9 "strconv" 10 "strings" 11 "sync" 12 "sync/atomic" 13 "time" 14 15 metrics "github.com/armon/go-metrics" 16 "github.com/hashicorp/consul/api" 17 "github.com/hashicorp/nomad/client/driver" 18 cstructs "github.com/hashicorp/nomad/client/structs" 19 "github.com/hashicorp/nomad/helper" 20 "github.com/hashicorp/nomad/nomad/structs" 21 ) 22 23 const ( 24 // nomadServicePrefix is the prefix that scopes all Nomad registered 25 // services (both agent and task entries). 26 nomadServicePrefix = "_nomad" 27 28 // nomadTaskPrefix is the prefix that scopes Nomad registered services 29 // for tasks. 30 nomadTaskPrefix = nomadServicePrefix + "-task-" 31 32 // defaultRetryInterval is how quickly to retry syncing services and 33 // checks to Consul when an error occurs. Will backoff up to a max. 34 defaultRetryInterval = time.Second 35 36 // defaultMaxRetryInterval is the default max retry interval. 37 defaultMaxRetryInterval = 30 * time.Second 38 39 // ttlCheckBuffer is the time interval that Nomad can take to report Consul 40 // the check result 41 ttlCheckBuffer = 31 * time.Second 42 43 // defaultShutdownWait is how long Shutdown() should block waiting for 44 // enqueued operations to sync to Consul by default. 45 defaultShutdownWait = time.Minute 46 47 // DefaultQueryWaitDuration is the max duration the Consul Agent will 48 // spend waiting for a response from a Consul Query. 49 DefaultQueryWaitDuration = 2 * time.Second 50 51 // ServiceTagHTTP is the tag assigned to HTTP services 52 ServiceTagHTTP = "http" 53 54 // ServiceTagRPC is the tag assigned to RPC services 55 ServiceTagRPC = "rpc" 56 57 // ServiceTagSerf is the tag assigned to Serf services 58 ServiceTagSerf = "serf" 59 ) 60 61 // CatalogAPI is the consul/api.Catalog API used by Nomad. 62 type CatalogAPI interface { 63 Datacenters() ([]string, error) 64 Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error) 65 } 66 67 // AgentAPI is the consul/api.Agent API used by Nomad. 68 type AgentAPI interface { 69 Services() (map[string]*api.AgentService, error) 70 Checks() (map[string]*api.AgentCheck, error) 71 CheckRegister(check *api.AgentCheckRegistration) error 72 CheckDeregister(checkID string) error 73 Self() (map[string]map[string]interface{}, error) 74 ServiceRegister(service *api.AgentServiceRegistration) error 75 ServiceDeregister(serviceID string) error 76 UpdateTTL(id, output, status string) error 77 } 78 79 // operations are submitted to the main loop via commit() for synchronizing 80 // with Consul. 81 type operations struct { 82 regServices []*api.AgentServiceRegistration 83 regChecks []*api.AgentCheckRegistration 84 scripts []*scriptCheck 85 86 deregServices []string 87 deregChecks []string 88 } 89 90 // AllocRegistration holds the status of services registered for a particular 91 // allocations by task. 92 type AllocRegistration struct { 93 // Tasks maps the name of a task to its registered services and checks 94 Tasks map[string]*TaskRegistration 95 } 96 97 func (a *AllocRegistration) copy() *AllocRegistration { 98 c := &AllocRegistration{ 99 Tasks: make(map[string]*TaskRegistration, len(a.Tasks)), 100 } 101 102 for k, v := range a.Tasks { 103 c.Tasks[k] = v.copy() 104 } 105 106 return c 107 } 108 109 // NumServices returns the number of registered services 110 func (a *AllocRegistration) NumServices() int { 111 if a == nil { 112 return 0 113 } 114 115 total := 0 116 for _, treg := range a.Tasks { 117 for _, sreg := range treg.Services { 118 if sreg.Service != nil { 119 total++ 120 } 121 } 122 } 123 124 return total 125 } 126 127 // NumChecks returns the number of registered checks 128 func (a *AllocRegistration) NumChecks() int { 129 if a == nil { 130 return 0 131 } 132 133 total := 0 134 for _, treg := range a.Tasks { 135 for _, sreg := range treg.Services { 136 total += len(sreg.Checks) 137 } 138 } 139 140 return total 141 } 142 143 // TaskRegistration holds the status of services registered for a particular 144 // task. 145 type TaskRegistration struct { 146 Services map[string]*ServiceRegistration 147 } 148 149 func (t *TaskRegistration) copy() *TaskRegistration { 150 c := &TaskRegistration{ 151 Services: make(map[string]*ServiceRegistration, len(t.Services)), 152 } 153 154 for k, v := range t.Services { 155 c.Services[k] = v.copy() 156 } 157 158 return c 159 } 160 161 // ServiceRegistration holds the status of a registered Consul Service and its 162 // Checks. 163 type ServiceRegistration struct { 164 // serviceID and checkIDs are internal fields that track just the IDs of the 165 // services/checks registered in Consul. It is used to materialize the other 166 // fields when queried. 167 serviceID string 168 checkIDs map[string]struct{} 169 170 // Service is the AgentService registered in Consul. 171 Service *api.AgentService 172 173 // Checks is the status of the registered checks. 174 Checks []*api.AgentCheck 175 } 176 177 func (s *ServiceRegistration) copy() *ServiceRegistration { 178 // Copy does not copy the external fields but only the internal fields. This 179 // is so that the caller of AllocRegistrations can not access the internal 180 // fields and that method uses these fields to populate the external fields. 181 return &ServiceRegistration{ 182 serviceID: s.serviceID, 183 checkIDs: helper.CopyMapStringStruct(s.checkIDs), 184 } 185 } 186 187 // ServiceClient handles task and agent service registration with Consul. 188 type ServiceClient struct { 189 client AgentAPI 190 logger *log.Logger 191 retryInterval time.Duration 192 maxRetryInterval time.Duration 193 194 // exitCh is closed when the main Run loop exits 195 exitCh chan struct{} 196 197 // shutdownCh is closed when the client should shutdown 198 shutdownCh chan struct{} 199 200 // shutdownWait is how long Shutdown() blocks waiting for the final 201 // sync() to finish. Defaults to defaultShutdownWait 202 shutdownWait time.Duration 203 204 opCh chan *operations 205 206 services map[string]*api.AgentServiceRegistration 207 checks map[string]*api.AgentCheckRegistration 208 scripts map[string]*scriptCheck 209 runningScripts map[string]*scriptHandle 210 211 // allocRegistrations stores the services and checks that are registered 212 // with Consul by allocation ID. 213 allocRegistrations map[string]*AllocRegistration 214 allocRegistrationsLock sync.RWMutex 215 216 // agent services and checks record entries for the agent itself which 217 // should be removed on shutdown 218 agentServices map[string]struct{} 219 agentChecks map[string]struct{} 220 agentLock sync.Mutex 221 222 // seen is 1 if Consul has ever been seen; otherwise 0. Accessed with 223 // atomics. 224 seen int32 225 226 // checkWatcher restarts checks that are unhealthy. 227 checkWatcher *checkWatcher 228 } 229 230 // NewServiceClient creates a new Consul ServiceClient from an existing Consul API 231 // Client and logger. 232 func NewServiceClient(consulClient AgentAPI, logger *log.Logger) *ServiceClient { 233 return &ServiceClient{ 234 client: consulClient, 235 logger: logger, 236 retryInterval: defaultRetryInterval, 237 maxRetryInterval: defaultMaxRetryInterval, 238 exitCh: make(chan struct{}), 239 shutdownCh: make(chan struct{}), 240 shutdownWait: defaultShutdownWait, 241 opCh: make(chan *operations, 8), 242 services: make(map[string]*api.AgentServiceRegistration), 243 checks: make(map[string]*api.AgentCheckRegistration), 244 scripts: make(map[string]*scriptCheck), 245 runningScripts: make(map[string]*scriptHandle), 246 allocRegistrations: make(map[string]*AllocRegistration), 247 agentServices: make(map[string]struct{}), 248 agentChecks: make(map[string]struct{}), 249 checkWatcher: newCheckWatcher(logger, consulClient), 250 } 251 } 252 253 // seen is used by markSeen and hasSeen 254 const seen = 1 255 256 // markSeen marks Consul as having been seen (meaning at least one operation 257 // has succeeded). 258 func (c *ServiceClient) markSeen() { 259 atomic.StoreInt32(&c.seen, seen) 260 } 261 262 // hasSeen returns true if any Consul operation has ever succeeded. Useful to 263 // squelch errors if Consul isn't running. 264 func (c *ServiceClient) hasSeen() bool { 265 return atomic.LoadInt32(&c.seen) == seen 266 } 267 268 // Run the Consul main loop which retries operations against Consul. It should 269 // be called exactly once. 270 func (c *ServiceClient) Run() { 271 defer close(c.exitCh) 272 273 ctx, cancel := context.WithCancel(context.Background()) 274 defer cancel() 275 276 // init will be closed when Consul has been contacted 277 init := make(chan struct{}) 278 go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init) 279 280 // Process operations while waiting for initial contact with Consul but 281 // do not sync until contact has been made. 282 hasOps := false 283 INIT: 284 for { 285 select { 286 case <-init: 287 c.markSeen() 288 break INIT 289 case <-c.shutdownCh: 290 return 291 case ops := <-c.opCh: 292 hasOps = true 293 c.merge(ops) 294 } 295 } 296 c.logger.Printf("[TRACE] consul.sync: able to contact Consul") 297 298 // Block until contact with Consul has been established 299 // Start checkWatcher 300 go c.checkWatcher.Run(ctx) 301 302 retryTimer := time.NewTimer(0) 303 if !hasOps { 304 // No pending operations so don't immediately sync 305 <-retryTimer.C 306 } 307 308 failures := 0 309 for { 310 select { 311 case <-retryTimer.C: 312 case <-c.shutdownCh: 313 // Cancel check watcher but sync one last time 314 cancel() 315 case ops := <-c.opCh: 316 c.merge(ops) 317 } 318 319 if err := c.sync(); err != nil { 320 if failures == 0 { 321 // Log on the first failure 322 c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err) 323 } else if failures%10 == 0 { 324 // Log every 10th consecutive failure 325 c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err) 326 } 327 328 failures++ 329 if !retryTimer.Stop() { 330 // Timer already expired, since the timer may 331 // or may not have been read in the select{} 332 // above, conditionally receive on it 333 select { 334 case <-retryTimer.C: 335 default: 336 } 337 } 338 backoff := c.retryInterval * time.Duration(failures) 339 if backoff > c.maxRetryInterval { 340 backoff = c.maxRetryInterval 341 } 342 retryTimer.Reset(backoff) 343 } else { 344 if failures > 0 { 345 c.logger.Printf("[INFO] consul.sync: successfully updated services in Consul") 346 failures = 0 347 } 348 } 349 350 select { 351 case <-c.shutdownCh: 352 // Exit only after sync'ing all outstanding operations 353 if len(c.opCh) > 0 { 354 for len(c.opCh) > 0 { 355 c.merge(<-c.opCh) 356 } 357 continue 358 } 359 return 360 default: 361 } 362 363 } 364 } 365 366 // commit operations unless already shutting down. 367 func (c *ServiceClient) commit(ops *operations) { 368 select { 369 case c.opCh <- ops: 370 case <-c.shutdownCh: 371 } 372 } 373 374 // merge registrations into state map prior to sync'ing with Consul 375 func (c *ServiceClient) merge(ops *operations) { 376 for _, s := range ops.regServices { 377 c.services[s.ID] = s 378 } 379 for _, check := range ops.regChecks { 380 c.checks[check.ID] = check 381 } 382 for _, s := range ops.scripts { 383 c.scripts[s.id] = s 384 } 385 for _, sid := range ops.deregServices { 386 delete(c.services, sid) 387 } 388 for _, cid := range ops.deregChecks { 389 if script, ok := c.runningScripts[cid]; ok { 390 script.cancel() 391 delete(c.scripts, cid) 392 delete(c.runningScripts, cid) 393 } 394 delete(c.checks, cid) 395 } 396 metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services))) 397 metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks))) 398 metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts))) 399 } 400 401 // sync enqueued operations. 402 func (c *ServiceClient) sync() error { 403 sreg, creg, sdereg, cdereg := 0, 0, 0, 0 404 405 consulServices, err := c.client.Services() 406 if err != nil { 407 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 408 return fmt.Errorf("error querying Consul services: %v", err) 409 } 410 411 consulChecks, err := c.client.Checks() 412 if err != nil { 413 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 414 return fmt.Errorf("error querying Consul checks: %v", err) 415 } 416 417 // Remove Nomad services in Consul but unknown locally 418 for id := range consulServices { 419 if _, ok := c.services[id]; ok { 420 // Known service, skip 421 continue 422 } 423 if !isNomadService(id) { 424 // Not managed by Nomad, skip 425 continue 426 } 427 428 // Unknown Nomad managed service; kill 429 if err := c.client.ServiceDeregister(id); err != nil { 430 if isOldNomadService(id) { 431 // Don't hard-fail on old entries. See #3620 432 continue 433 } 434 435 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 436 return err 437 } 438 sdereg++ 439 metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1) 440 } 441 442 // Add Nomad services missing from Consul 443 for id, locals := range c.services { 444 if _, ok := consulServices[id]; !ok { 445 if err = c.client.ServiceRegister(locals); err != nil { 446 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 447 return err 448 } 449 sreg++ 450 metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1) 451 } 452 } 453 454 // Remove Nomad checks in Consul but unknown locally 455 for id, check := range consulChecks { 456 if _, ok := c.checks[id]; ok { 457 // Known check, leave it 458 continue 459 } 460 if !isNomadService(check.ServiceID) { 461 // Service not managed by Nomad, skip 462 continue 463 } 464 465 // Unknown Nomad managed check; remove 466 if err := c.client.CheckDeregister(id); err != nil { 467 if isOldNomadService(check.ServiceID) { 468 // Don't hard-fail on old entries. 469 continue 470 } 471 472 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 473 return err 474 } 475 cdereg++ 476 metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1) 477 } 478 479 // Add Nomad checks missing from Consul 480 for id, check := range c.checks { 481 if _, ok := consulChecks[id]; ok { 482 // Already in Consul; skipping 483 continue 484 } 485 486 if err := c.client.CheckRegister(check); err != nil { 487 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 488 return err 489 } 490 creg++ 491 metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1) 492 493 // Handle starting scripts 494 if script, ok := c.scripts[id]; ok { 495 // If it's already running, cancel and replace 496 if oldScript, running := c.runningScripts[id]; running { 497 oldScript.cancel() 498 } 499 // Start and store the handle 500 c.runningScripts[id] = script.run() 501 } 502 } 503 504 c.logger.Printf("[DEBUG] consul.sync: registered %d services, %d checks; deregistered %d services, %d checks", 505 sreg, creg, sdereg, cdereg) 506 return nil 507 } 508 509 // RegisterAgent registers Nomad agents (client or server). The 510 // Service.PortLabel should be a literal port to be parsed with SplitHostPort. 511 // Script checks are not supported and will return an error. Registration is 512 // asynchronous. 513 // 514 // Agents will be deregistered when Shutdown is called. 515 func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error { 516 ops := operations{} 517 518 for _, service := range services { 519 id := makeAgentServiceID(role, service) 520 521 // Unlike tasks, agents don't use port labels. Agent ports are 522 // stored directly in the PortLabel. 523 host, rawport, err := net.SplitHostPort(service.PortLabel) 524 if err != nil { 525 return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err) 526 } 527 port, err := strconv.Atoi(rawport) 528 if err != nil { 529 return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err) 530 } 531 serviceReg := &api.AgentServiceRegistration{ 532 ID: id, 533 Name: service.Name, 534 Tags: service.Tags, 535 Address: host, 536 Port: port, 537 } 538 ops.regServices = append(ops.regServices, serviceReg) 539 540 for _, check := range service.Checks { 541 checkID := makeCheckID(id, check) 542 if check.Type == structs.ServiceCheckScript { 543 return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name) 544 } 545 checkHost, checkPort := serviceReg.Address, serviceReg.Port 546 if check.PortLabel != "" { 547 // Unlike tasks, agents don't use port labels. Agent ports are 548 // stored directly in the PortLabel. 549 host, rawport, err := net.SplitHostPort(check.PortLabel) 550 if err != nil { 551 return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err) 552 } 553 port, err := strconv.Atoi(rawport) 554 if err != nil { 555 return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err) 556 } 557 checkHost, checkPort = host, port 558 } 559 checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort) 560 if err != nil { 561 return fmt.Errorf("failed to add check %q: %v", check.Name, err) 562 } 563 ops.regChecks = append(ops.regChecks, checkReg) 564 } 565 } 566 567 // Don't bother committing agent checks if we're already shutting down 568 c.agentLock.Lock() 569 defer c.agentLock.Unlock() 570 select { 571 case <-c.shutdownCh: 572 return nil 573 default: 574 } 575 576 // Now add them to the registration queue 577 c.commit(&ops) 578 579 // Record IDs for deregistering on shutdown 580 for _, id := range ops.regServices { 581 c.agentServices[id.ID] = struct{}{} 582 } 583 for _, id := range ops.regChecks { 584 c.agentChecks[id.ID] = struct{}{} 585 } 586 return nil 587 } 588 589 // serviceRegs creates service registrations, check registrations, and script 590 // checks from a service. It returns a service registration object with the 591 // service and check IDs populated. 592 func (c *ServiceClient) serviceRegs(ops *operations, allocID string, service *structs.Service, 593 task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) (*ServiceRegistration, error) { 594 595 // Get the services ID 596 id := makeTaskServiceID(allocID, task.Name, service) 597 sreg := &ServiceRegistration{ 598 serviceID: id, 599 checkIDs: make(map[string]struct{}, len(service.Checks)), 600 } 601 602 // Service address modes default to auto 603 addrMode := service.AddressMode 604 if addrMode == "" { 605 addrMode = structs.AddressModeAuto 606 } 607 608 // Determine the address to advertise based on the mode 609 ip, port, err := getAddress(addrMode, service.PortLabel, task.Resources.Networks, net) 610 if err != nil { 611 return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err) 612 } 613 614 // Build the Consul Service registration request 615 serviceReg := &api.AgentServiceRegistration{ 616 ID: id, 617 Name: service.Name, 618 Tags: make([]string, len(service.Tags)), 619 Address: ip, 620 Port: port, 621 } 622 // copy isn't strictly necessary but can avoid bugs especially 623 // with tests that may reuse Tasks 624 copy(serviceReg.Tags, service.Tags) 625 ops.regServices = append(ops.regServices, serviceReg) 626 627 // Build the check registrations 628 checkIDs, err := c.checkRegs(ops, allocID, id, service, task, exec, net) 629 if err != nil { 630 return nil, err 631 } 632 for _, cid := range checkIDs { 633 sreg.checkIDs[cid] = struct{}{} 634 } 635 return sreg, nil 636 } 637 638 // checkRegs registers the checks for the given service and returns the 639 // registered check ids. 640 func (c *ServiceClient) checkRegs(ops *operations, allocID, serviceID string, service *structs.Service, 641 task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) ([]string, error) { 642 643 // Fast path 644 numChecks := len(service.Checks) 645 if numChecks == 0 { 646 return nil, nil 647 } 648 649 checkIDs := make([]string, 0, numChecks) 650 for _, check := range service.Checks { 651 checkID := makeCheckID(serviceID, check) 652 checkIDs = append(checkIDs, checkID) 653 if check.Type == structs.ServiceCheckScript { 654 if exec == nil { 655 return nil, fmt.Errorf("driver doesn't support script checks") 656 } 657 ops.scripts = append(ops.scripts, newScriptCheck( 658 allocID, task.Name, checkID, check, exec, c.client, c.logger, c.shutdownCh)) 659 660 // Skip getAddress for script checks 661 checkReg, err := createCheckReg(serviceID, checkID, check, "", 0) 662 if err != nil { 663 return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err) 664 } 665 ops.regChecks = append(ops.regChecks, checkReg) 666 continue 667 } 668 669 // Default to the service's port but allow check to override 670 portLabel := check.PortLabel 671 if portLabel == "" { 672 // Default to the service's port label 673 portLabel = service.PortLabel 674 } 675 676 // Checks address mode defaults to host for pre-#3380 backward compat 677 addrMode := check.AddressMode 678 if addrMode == "" { 679 addrMode = structs.AddressModeHost 680 } 681 682 ip, port, err := getAddress(addrMode, portLabel, task.Resources.Networks, net) 683 if err != nil { 684 return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err) 685 } 686 687 checkReg, err := createCheckReg(serviceID, checkID, check, ip, port) 688 if err != nil { 689 return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err) 690 } 691 ops.regChecks = append(ops.regChecks, checkReg) 692 } 693 return checkIDs, nil 694 } 695 696 // RegisterTask with Consul. Adds all service entries and checks to Consul. If 697 // exec is nil and a script check exists an error is returned. 698 // 699 // If the service IP is set it used as the address in the service registration. 700 // Checks will always use the IP from the Task struct (host's IP). 701 // 702 // Actual communication with Consul is done asynchronously (see Run). 703 func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { 704 // Fast path 705 numServices := len(task.Services) 706 if numServices == 0 { 707 return nil 708 } 709 710 t := new(TaskRegistration) 711 t.Services = make(map[string]*ServiceRegistration, numServices) 712 713 ops := &operations{} 714 for _, service := range task.Services { 715 sreg, err := c.serviceRegs(ops, allocID, service, task, exec, net) 716 if err != nil { 717 return err 718 } 719 t.Services[sreg.serviceID] = sreg 720 } 721 722 // Add the task to the allocation's registration 723 c.addTaskRegistration(allocID, task.Name, t) 724 725 c.commit(ops) 726 727 // Start watching checks. Done after service registrations are built 728 // since an error building them could leak watches. 729 for _, service := range task.Services { 730 serviceID := makeTaskServiceID(allocID, task.Name, service) 731 for _, check := range service.Checks { 732 if check.TriggersRestarts() { 733 checkID := makeCheckID(serviceID, check) 734 c.checkWatcher.Watch(allocID, task.Name, checkID, check, restarter) 735 } 736 } 737 } 738 return nil 739 } 740 741 // UpdateTask in Consul. Does not alter the service if only checks have 742 // changed. 743 // 744 // DriverNetwork must not change between invocations for the same allocation. 745 func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { 746 ops := &operations{} 747 748 taskReg := new(TaskRegistration) 749 taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services)) 750 751 existingIDs := make(map[string]*structs.Service, len(existing.Services)) 752 for _, s := range existing.Services { 753 existingIDs[makeTaskServiceID(allocID, existing.Name, s)] = s 754 } 755 newIDs := make(map[string]*structs.Service, len(newTask.Services)) 756 for _, s := range newTask.Services { 757 newIDs[makeTaskServiceID(allocID, newTask.Name, s)] = s 758 } 759 760 // Loop over existing Service IDs to see if they have been removed or 761 // updated. 762 for existingID, existingSvc := range existingIDs { 763 newSvc, ok := newIDs[existingID] 764 if !ok { 765 // Existing service entry removed 766 ops.deregServices = append(ops.deregServices, existingID) 767 for _, check := range existingSvc.Checks { 768 cid := makeCheckID(existingID, check) 769 ops.deregChecks = append(ops.deregChecks, cid) 770 771 // Unwatch watched checks 772 if check.TriggersRestarts() { 773 c.checkWatcher.Unwatch(cid) 774 } 775 } 776 continue 777 } 778 779 // Service exists and hasn't changed, don't re-add it later 780 delete(newIDs, existingID) 781 782 // Service still exists so add it to the task's registration 783 sreg := &ServiceRegistration{ 784 serviceID: existingID, 785 checkIDs: make(map[string]struct{}, len(newSvc.Checks)), 786 } 787 taskReg.Services[existingID] = sreg 788 789 // See if any checks were updated 790 existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks)) 791 for _, check := range existingSvc.Checks { 792 existingChecks[makeCheckID(existingID, check)] = check 793 } 794 795 // Register new checks 796 for _, check := range newSvc.Checks { 797 checkID := makeCheckID(existingID, check) 798 if _, exists := existingChecks[checkID]; exists { 799 // Check exists, so don't remove it 800 delete(existingChecks, checkID) 801 sreg.checkIDs[checkID] = struct{}{} 802 } 803 804 // New check on an unchanged service; add them now 805 newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net) 806 if err != nil { 807 return err 808 } 809 810 for _, checkID := range newCheckIDs { 811 sreg.checkIDs[checkID] = struct{}{} 812 813 } 814 815 // Update all watched checks as CheckRestart fields aren't part of ID 816 if check.TriggersRestarts() { 817 c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter) 818 } 819 } 820 821 // Remove existing checks not in updated service 822 for cid, check := range existingChecks { 823 ops.deregChecks = append(ops.deregChecks, cid) 824 825 // Unwatch checks 826 if check.TriggersRestarts() { 827 c.checkWatcher.Unwatch(cid) 828 } 829 } 830 } 831 832 // Any remaining services should just be enqueued directly 833 for _, newSvc := range newIDs { 834 sreg, err := c.serviceRegs(ops, allocID, newSvc, newTask, exec, net) 835 if err != nil { 836 return err 837 } 838 839 taskReg.Services[sreg.serviceID] = sreg 840 } 841 842 // Add the task to the allocation's registration 843 c.addTaskRegistration(allocID, newTask.Name, taskReg) 844 845 c.commit(ops) 846 847 // Start watching checks. Done after service registrations are built 848 // since an error building them could leak watches. 849 for _, service := range newIDs { 850 serviceID := makeTaskServiceID(allocID, newTask.Name, service) 851 for _, check := range service.Checks { 852 if check.TriggersRestarts() { 853 checkID := makeCheckID(serviceID, check) 854 c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter) 855 } 856 } 857 } 858 return nil 859 } 860 861 // RemoveTask from Consul. Removes all service entries and checks. 862 // 863 // Actual communication with Consul is done asynchronously (see Run). 864 func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) { 865 ops := operations{} 866 867 for _, service := range task.Services { 868 id := makeTaskServiceID(allocID, task.Name, service) 869 ops.deregServices = append(ops.deregServices, id) 870 871 for _, check := range service.Checks { 872 cid := makeCheckID(id, check) 873 ops.deregChecks = append(ops.deregChecks, cid) 874 875 if check.TriggersRestarts() { 876 c.checkWatcher.Unwatch(cid) 877 } 878 } 879 } 880 881 // Remove the task from the alloc's registrations 882 c.removeTaskRegistration(allocID, task.Name) 883 884 // Now add them to the deregistration fields; main Run loop will update 885 c.commit(&ops) 886 } 887 888 // AllocRegistrations returns the registrations for the given allocation. If the 889 // allocation has no reservations, the response is a nil object. 890 func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) { 891 // Get the internal struct using the lock 892 c.allocRegistrationsLock.RLock() 893 regInternal, ok := c.allocRegistrations[allocID] 894 if !ok { 895 c.allocRegistrationsLock.RUnlock() 896 return nil, nil 897 } 898 899 // Copy so we don't expose internal structs 900 reg := regInternal.copy() 901 c.allocRegistrationsLock.RUnlock() 902 903 // Query the services and checks to populate the allocation registrations. 904 services, err := c.client.Services() 905 if err != nil { 906 return nil, err 907 } 908 909 checks, err := c.client.Checks() 910 if err != nil { 911 return nil, err 912 } 913 914 // Populate the object 915 for _, treg := range reg.Tasks { 916 for serviceID, sreg := range treg.Services { 917 sreg.Service = services[serviceID] 918 for checkID := range sreg.checkIDs { 919 if check, ok := checks[checkID]; ok { 920 sreg.Checks = append(sreg.Checks, check) 921 } 922 } 923 } 924 } 925 926 return reg, nil 927 } 928 929 // Shutdown the Consul client. Update running task registrations and deregister 930 // agent from Consul. On first call blocks up to shutdownWait before giving up 931 // on syncing operations. 932 func (c *ServiceClient) Shutdown() error { 933 // Serialize Shutdown calls with RegisterAgent to prevent leaking agent 934 // entries. 935 c.agentLock.Lock() 936 defer c.agentLock.Unlock() 937 select { 938 case <-c.shutdownCh: 939 return nil 940 default: 941 close(c.shutdownCh) 942 } 943 944 // Give run loop time to sync, but don't block indefinitely 945 deadline := time.After(c.shutdownWait) 946 947 // Wait for Run to finish any outstanding operations and exit 948 select { 949 case <-c.exitCh: 950 case <-deadline: 951 // Don't wait forever though 952 } 953 954 // If Consul was never seen nothing could be written so exit early 955 if !c.hasSeen() { 956 return nil 957 } 958 959 // Always attempt to deregister Nomad agent Consul entries, even if 960 // deadline was reached 961 for id := range c.agentServices { 962 if err := c.client.ServiceDeregister(id); err != nil { 963 c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err) 964 } 965 } 966 for id := range c.agentChecks { 967 if err := c.client.CheckDeregister(id); err != nil { 968 c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err) 969 } 970 } 971 972 // Give script checks time to exit (no need to lock as Run() has exited) 973 for _, h := range c.runningScripts { 974 select { 975 case <-h.wait(): 976 case <-deadline: 977 return fmt.Errorf("timed out waiting for script checks to run") 978 } 979 } 980 return nil 981 } 982 983 // addTaskRegistration adds the task registration for the given allocation. 984 func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) { 985 c.allocRegistrationsLock.Lock() 986 defer c.allocRegistrationsLock.Unlock() 987 988 alloc, ok := c.allocRegistrations[allocID] 989 if !ok { 990 alloc = &AllocRegistration{ 991 Tasks: make(map[string]*TaskRegistration), 992 } 993 c.allocRegistrations[allocID] = alloc 994 } 995 alloc.Tasks[taskName] = reg 996 } 997 998 // removeTaskRegistration removes the task registration for the given allocation. 999 func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) { 1000 c.allocRegistrationsLock.Lock() 1001 defer c.allocRegistrationsLock.Unlock() 1002 1003 alloc, ok := c.allocRegistrations[allocID] 1004 if !ok { 1005 return 1006 } 1007 1008 // Delete the task and if it is the last one also delete the alloc's 1009 // registration 1010 delete(alloc.Tasks, taskName) 1011 if len(alloc.Tasks) == 0 { 1012 delete(c.allocRegistrations, allocID) 1013 } 1014 } 1015 1016 // makeAgentServiceID creates a unique ID for identifying an agent service in 1017 // Consul. 1018 // 1019 // Agent service IDs are of the form: 1020 // 1021 // {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}) 1022 // Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4 1023 // Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l 1024 // 1025 func makeAgentServiceID(role string, service *structs.Service) string { 1026 return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "")) 1027 } 1028 1029 // makeTaskServiceID creates a unique ID for identifying a task service in 1030 // Consul. All structs.Service fields are included in the ID's hash except 1031 // Checks. This allows updates to merely compare IDs. 1032 // 1033 // Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH 1034 func makeTaskServiceID(allocID, taskName string, service *structs.Service) string { 1035 return nomadTaskPrefix + service.Hash(allocID, taskName) 1036 } 1037 1038 // makeCheckID creates a unique ID for a check. 1039 func makeCheckID(serviceID string, check *structs.ServiceCheck) string { 1040 return check.Hash(serviceID) 1041 } 1042 1043 // createCheckReg creates a Check that can be registered with Consul. 1044 // 1045 // Script checks simply have a TTL set and the caller is responsible for 1046 // running the script and heartbeating. 1047 func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) { 1048 chkReg := api.AgentCheckRegistration{ 1049 ID: checkID, 1050 Name: check.Name, 1051 ServiceID: serviceID, 1052 } 1053 chkReg.Status = check.InitialStatus 1054 chkReg.Timeout = check.Timeout.String() 1055 chkReg.Interval = check.Interval.String() 1056 1057 // Require an address for http or tcp checks 1058 if port == 0 && check.RequiresPort() { 1059 return nil, fmt.Errorf("%s checks require an address", check.Type) 1060 } 1061 1062 switch check.Type { 1063 case structs.ServiceCheckHTTP: 1064 proto := check.Protocol 1065 if proto == "" { 1066 proto = "http" 1067 } 1068 if check.TLSSkipVerify { 1069 chkReg.TLSSkipVerify = true 1070 } 1071 base := url.URL{ 1072 Scheme: proto, 1073 Host: net.JoinHostPort(host, strconv.Itoa(port)), 1074 } 1075 relative, err := url.Parse(check.Path) 1076 if err != nil { 1077 return nil, err 1078 } 1079 url := base.ResolveReference(relative) 1080 chkReg.HTTP = url.String() 1081 chkReg.Method = check.Method 1082 chkReg.Header = check.Header 1083 case structs.ServiceCheckTCP: 1084 chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port)) 1085 case structs.ServiceCheckScript: 1086 chkReg.TTL = (check.Interval + ttlCheckBuffer).String() 1087 // As of Consul 1.0.0 setting TTL and Interval is a 400 1088 chkReg.Interval = "" 1089 default: 1090 return nil, fmt.Errorf("check type %+q not valid", check.Type) 1091 } 1092 return &chkReg, nil 1093 } 1094 1095 // isNomadService returns true if the ID matches the pattern of a Nomad managed 1096 // service (new or old formats). Agent services return false as independent 1097 // client and server agents may be running on the same machine. #2827 1098 func isNomadService(id string) bool { 1099 return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id) 1100 } 1101 1102 // isOldNomadService returns true if the ID matches an old pattern managed by 1103 // Nomad. 1104 // 1105 // Pre-0.7.1 task service IDs are of the form: 1106 // 1107 // {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} 1108 // Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 1109 // 1110 func isOldNomadService(id string) bool { 1111 const prefix = nomadServicePrefix + "-executor" 1112 return strings.HasPrefix(id, prefix) 1113 } 1114 1115 // getAddress returns the IP and port to use for a service or check. If no port 1116 // label is specified (an empty value), zero values are returned because no 1117 // address could be resolved. 1118 func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *cstructs.DriverNetwork) (string, int, error) { 1119 switch addrMode { 1120 case structs.AddressModeAuto: 1121 if driverNet.Advertise() { 1122 addrMode = structs.AddressModeDriver 1123 } else { 1124 addrMode = structs.AddressModeHost 1125 } 1126 return getAddress(addrMode, portLabel, networks, driverNet) 1127 case structs.AddressModeHost: 1128 if portLabel == "" { 1129 if len(networks) != 1 { 1130 // If no networks are specified return zero 1131 // values. Consul will advertise the host IP 1132 // with no port. This is the pre-0.7.1 behavior 1133 // some people rely on. 1134 return "", 0, nil 1135 } 1136 1137 return networks[0].IP, 0, nil 1138 } 1139 1140 // Default path: use host ip:port 1141 ip, port := networks.Port(portLabel) 1142 if ip == "" && port <= 0 { 1143 return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel) 1144 } 1145 return ip, port, nil 1146 1147 case structs.AddressModeDriver: 1148 // Require a driver network if driver address mode is used 1149 if driverNet == nil { 1150 return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`) 1151 } 1152 1153 // If no port label is specified just return the IP 1154 if portLabel == "" { 1155 return driverNet.IP, 0, nil 1156 } 1157 1158 // If the port is a label, use the driver's port (not the host's) 1159 if port, ok := driverNet.PortMap[portLabel]; ok { 1160 return driverNet.IP, port, nil 1161 } 1162 1163 // If port isn't a label, try to parse it as a literal port number 1164 port, err := strconv.Atoi(portLabel) 1165 if err != nil { 1166 // Don't include Atoi error message as user likely 1167 // never intended it to be a numeric and it creates a 1168 // confusing error message 1169 return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel) 1170 } 1171 if port <= 0 { 1172 return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel) 1173 } 1174 1175 return driverNet.IP, port, nil 1176 1177 default: 1178 // Shouldn't happen due to validation, but enforce invariants 1179 return "", 0, fmt.Errorf("invalid address mode %q", addrMode) 1180 } 1181 }