github.com/smintz/nomad@v0.8.3/command/agent/consul/client.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"net"
     8  	"net/url"
     9  	"strconv"
    10  	"strings"
    11  	"sync"
    12  	"sync/atomic"
    13  	"time"
    14  
    15  	metrics "github.com/armon/go-metrics"
    16  	"github.com/hashicorp/consul/api"
    17  	"github.com/hashicorp/nomad/client/driver"
    18  	cstructs "github.com/hashicorp/nomad/client/structs"
    19  	"github.com/hashicorp/nomad/helper"
    20  	"github.com/hashicorp/nomad/nomad/structs"
    21  )
    22  
    23  const (
    24  	// nomadServicePrefix is the prefix that scopes all Nomad registered
    25  	// services (both agent and task entries).
    26  	nomadServicePrefix = "_nomad"
    27  
    28  	// nomadTaskPrefix is the prefix that scopes Nomad registered services
    29  	// for tasks.
    30  	nomadTaskPrefix = nomadServicePrefix + "-task-"
    31  
    32  	// defaultRetryInterval is how quickly to retry syncing services and
    33  	// checks to Consul when an error occurs. Will backoff up to a max.
    34  	defaultRetryInterval = time.Second
    35  
    36  	// defaultMaxRetryInterval is the default max retry interval.
    37  	defaultMaxRetryInterval = 30 * time.Second
    38  
    39  	// defaultPeriodicalInterval is the interval at which the service
    40  	// client reconciles state between the desired services and checks and
    41  	// what's actually registered in Consul. This is done at an interval,
    42  	// rather than being purely edge triggered, to handle the case that the
    43  	// Consul agent's state may change underneath us
    44  	defaultPeriodicInterval = 30 * time.Second
    45  
    46  	// ttlCheckBuffer is the time interval that Nomad can take to report Consul
    47  	// the check result
    48  	ttlCheckBuffer = 31 * time.Second
    49  
    50  	// defaultShutdownWait is how long Shutdown() should block waiting for
    51  	// enqueued operations to sync to Consul by default.
    52  	defaultShutdownWait = time.Minute
    53  
    54  	// DefaultQueryWaitDuration is the max duration the Consul Agent will
    55  	// spend waiting for a response from a Consul Query.
    56  	DefaultQueryWaitDuration = 2 * time.Second
    57  
    58  	// ServiceTagHTTP is the tag assigned to HTTP services
    59  	ServiceTagHTTP = "http"
    60  
    61  	// ServiceTagRPC is the tag assigned to RPC services
    62  	ServiceTagRPC = "rpc"
    63  
    64  	// ServiceTagSerf is the tag assigned to Serf services
    65  	ServiceTagSerf = "serf"
    66  )
    67  
    68  // CatalogAPI is the consul/api.Catalog API used by Nomad.
    69  type CatalogAPI interface {
    70  	Datacenters() ([]string, error)
    71  	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
    72  }
    73  
    74  // AgentAPI is the consul/api.Agent API used by Nomad.
    75  type AgentAPI interface {
    76  	Services() (map[string]*api.AgentService, error)
    77  	Checks() (map[string]*api.AgentCheck, error)
    78  	CheckRegister(check *api.AgentCheckRegistration) error
    79  	CheckDeregister(checkID string) error
    80  	Self() (map[string]map[string]interface{}, error)
    81  	ServiceRegister(service *api.AgentServiceRegistration) error
    82  	ServiceDeregister(serviceID string) error
    83  	UpdateTTL(id, output, status string) error
    84  }
    85  
    86  // operations are submitted to the main loop via commit() for synchronizing
    87  // with Consul.
    88  type operations struct {
    89  	regServices []*api.AgentServiceRegistration
    90  	regChecks   []*api.AgentCheckRegistration
    91  	scripts     []*scriptCheck
    92  
    93  	deregServices []string
    94  	deregChecks   []string
    95  }
    96  
    97  // AllocRegistration holds the status of services registered for a particular
    98  // allocations by task.
    99  type AllocRegistration struct {
   100  	// Tasks maps the name of a task to its registered services and checks
   101  	Tasks map[string]*TaskRegistration
   102  }
   103  
   104  func (a *AllocRegistration) copy() *AllocRegistration {
   105  	c := &AllocRegistration{
   106  		Tasks: make(map[string]*TaskRegistration, len(a.Tasks)),
   107  	}
   108  
   109  	for k, v := range a.Tasks {
   110  		c.Tasks[k] = v.copy()
   111  	}
   112  
   113  	return c
   114  }
   115  
   116  // NumServices returns the number of registered services
   117  func (a *AllocRegistration) NumServices() int {
   118  	if a == nil {
   119  		return 0
   120  	}
   121  
   122  	total := 0
   123  	for _, treg := range a.Tasks {
   124  		for _, sreg := range treg.Services {
   125  			if sreg.Service != nil {
   126  				total++
   127  			}
   128  		}
   129  	}
   130  
   131  	return total
   132  }
   133  
   134  // NumChecks returns the number of registered checks
   135  func (a *AllocRegistration) NumChecks() int {
   136  	if a == nil {
   137  		return 0
   138  	}
   139  
   140  	total := 0
   141  	for _, treg := range a.Tasks {
   142  		for _, sreg := range treg.Services {
   143  			total += len(sreg.Checks)
   144  		}
   145  	}
   146  
   147  	return total
   148  }
   149  
   150  // TaskRegistration holds the status of services registered for a particular
   151  // task.
   152  type TaskRegistration struct {
   153  	Services map[string]*ServiceRegistration
   154  }
   155  
   156  func (t *TaskRegistration) copy() *TaskRegistration {
   157  	c := &TaskRegistration{
   158  		Services: make(map[string]*ServiceRegistration, len(t.Services)),
   159  	}
   160  
   161  	for k, v := range t.Services {
   162  		c.Services[k] = v.copy()
   163  	}
   164  
   165  	return c
   166  }
   167  
   168  // ServiceRegistration holds the status of a registered Consul Service and its
   169  // Checks.
   170  type ServiceRegistration struct {
   171  	// serviceID and checkIDs are internal fields that track just the IDs of the
   172  	// services/checks registered in Consul. It is used to materialize the other
   173  	// fields when queried.
   174  	serviceID string
   175  	checkIDs  map[string]struct{}
   176  
   177  	// Service is the AgentService registered in Consul.
   178  	Service *api.AgentService
   179  
   180  	// Checks is the status of the registered checks.
   181  	Checks []*api.AgentCheck
   182  }
   183  
   184  func (s *ServiceRegistration) copy() *ServiceRegistration {
   185  	// Copy does not copy the external fields but only the internal fields. This
   186  	// is so that the caller of AllocRegistrations can not access the internal
   187  	// fields and that method uses these fields to populate the external fields.
   188  	return &ServiceRegistration{
   189  		serviceID: s.serviceID,
   190  		checkIDs:  helper.CopyMapStringStruct(s.checkIDs),
   191  	}
   192  }
   193  
   194  // ServiceClient handles task and agent service registration with Consul.
   195  type ServiceClient struct {
   196  	client           AgentAPI
   197  	logger           *log.Logger
   198  	retryInterval    time.Duration
   199  	maxRetryInterval time.Duration
   200  	periodicInterval time.Duration
   201  
   202  	// exitCh is closed when the main Run loop exits
   203  	exitCh chan struct{}
   204  
   205  	// shutdownCh is closed when the client should shutdown
   206  	shutdownCh chan struct{}
   207  
   208  	// shutdownWait is how long Shutdown() blocks waiting for the final
   209  	// sync() to finish. Defaults to defaultShutdownWait
   210  	shutdownWait time.Duration
   211  
   212  	opCh chan *operations
   213  
   214  	services       map[string]*api.AgentServiceRegistration
   215  	checks         map[string]*api.AgentCheckRegistration
   216  	scripts        map[string]*scriptCheck
   217  	runningScripts map[string]*scriptHandle
   218  
   219  	// allocRegistrations stores the services and checks that are registered
   220  	// with Consul by allocation ID.
   221  	allocRegistrations     map[string]*AllocRegistration
   222  	allocRegistrationsLock sync.RWMutex
   223  
   224  	// agent services and checks record entries for the agent itself which
   225  	// should be removed on shutdown
   226  	agentServices map[string]struct{}
   227  	agentChecks   map[string]struct{}
   228  	agentLock     sync.Mutex
   229  
   230  	// seen is 1 if Consul has ever been seen; otherwise 0. Accessed with
   231  	// atomics.
   232  	seen int32
   233  
   234  	// checkWatcher restarts checks that are unhealthy.
   235  	checkWatcher *checkWatcher
   236  }
   237  
   238  // NewServiceClient creates a new Consul ServiceClient from an existing Consul API
   239  // Client and logger.
   240  func NewServiceClient(consulClient AgentAPI, logger *log.Logger) *ServiceClient {
   241  	return &ServiceClient{
   242  		client:             consulClient,
   243  		logger:             logger,
   244  		retryInterval:      defaultRetryInterval,
   245  		maxRetryInterval:   defaultMaxRetryInterval,
   246  		periodicInterval:   defaultPeriodicInterval,
   247  		exitCh:             make(chan struct{}),
   248  		shutdownCh:         make(chan struct{}),
   249  		shutdownWait:       defaultShutdownWait,
   250  		opCh:               make(chan *operations, 8),
   251  		services:           make(map[string]*api.AgentServiceRegistration),
   252  		checks:             make(map[string]*api.AgentCheckRegistration),
   253  		scripts:            make(map[string]*scriptCheck),
   254  		runningScripts:     make(map[string]*scriptHandle),
   255  		allocRegistrations: make(map[string]*AllocRegistration),
   256  		agentServices:      make(map[string]struct{}),
   257  		agentChecks:        make(map[string]struct{}),
   258  		checkWatcher:       newCheckWatcher(logger, consulClient),
   259  	}
   260  }
   261  
   262  // seen is used by markSeen and hasSeen
   263  const seen = 1
   264  
   265  // markSeen marks Consul as having been seen (meaning at least one operation
   266  // has succeeded).
   267  func (c *ServiceClient) markSeen() {
   268  	atomic.StoreInt32(&c.seen, seen)
   269  }
   270  
   271  // hasSeen returns true if any Consul operation has ever succeeded. Useful to
   272  // squelch errors if Consul isn't running.
   273  func (c *ServiceClient) hasSeen() bool {
   274  	return atomic.LoadInt32(&c.seen) == seen
   275  }
   276  
   277  // Run the Consul main loop which retries operations against Consul. It should
   278  // be called exactly once.
   279  func (c *ServiceClient) Run() {
   280  	defer close(c.exitCh)
   281  
   282  	ctx, cancel := context.WithCancel(context.Background())
   283  	defer cancel()
   284  
   285  	// init will be closed when Consul has been contacted
   286  	init := make(chan struct{})
   287  	go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init)
   288  
   289  	// Process operations while waiting for initial contact with Consul but
   290  	// do not sync until contact has been made.
   291  INIT:
   292  	for {
   293  		select {
   294  		case <-init:
   295  			c.markSeen()
   296  			break INIT
   297  		case <-c.shutdownCh:
   298  			return
   299  		case ops := <-c.opCh:
   300  			c.merge(ops)
   301  		}
   302  	}
   303  	c.logger.Printf("[TRACE] consul.sync: able to contact Consul")
   304  
   305  	// Block until contact with Consul has been established
   306  	// Start checkWatcher
   307  	go c.checkWatcher.Run(ctx)
   308  
   309  	// Always immediately sync to reconcile Nomad and Consul's state
   310  	retryTimer := time.NewTimer(0)
   311  
   312  	failures := 0
   313  	for {
   314  		select {
   315  		case <-retryTimer.C:
   316  		case <-c.shutdownCh:
   317  			// Cancel check watcher but sync one last time
   318  			cancel()
   319  		case ops := <-c.opCh:
   320  			c.merge(ops)
   321  		}
   322  
   323  		if err := c.sync(); err != nil {
   324  			if failures == 0 {
   325  				// Log on the first failure
   326  				c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err)
   327  			} else if failures%10 == 0 {
   328  				// Log every 10th consecutive failure
   329  				c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err)
   330  			}
   331  
   332  			failures++
   333  			if !retryTimer.Stop() {
   334  				// Timer already expired, since the timer may
   335  				// or may not have been read in the select{}
   336  				// above, conditionally receive on it
   337  				select {
   338  				case <-retryTimer.C:
   339  				default:
   340  				}
   341  			}
   342  			backoff := c.retryInterval * time.Duration(failures)
   343  			if backoff > c.maxRetryInterval {
   344  				backoff = c.maxRetryInterval
   345  			}
   346  			retryTimer.Reset(backoff)
   347  		} else {
   348  			if failures > 0 {
   349  				c.logger.Printf("[INFO] consul.sync: successfully updated services in Consul")
   350  				failures = 0
   351  			}
   352  
   353  			// Reset timer to periodic interval to periodically
   354  			// reconile with Consul
   355  			if !retryTimer.Stop() {
   356  				select {
   357  				case <-retryTimer.C:
   358  				default:
   359  				}
   360  			}
   361  			retryTimer.Reset(c.periodicInterval)
   362  		}
   363  
   364  		select {
   365  		case <-c.shutdownCh:
   366  			// Exit only after sync'ing all outstanding operations
   367  			if len(c.opCh) > 0 {
   368  				for len(c.opCh) > 0 {
   369  					c.merge(<-c.opCh)
   370  				}
   371  				continue
   372  			}
   373  			return
   374  		default:
   375  		}
   376  
   377  	}
   378  }
   379  
   380  // commit operations unless already shutting down.
   381  func (c *ServiceClient) commit(ops *operations) {
   382  	select {
   383  	case c.opCh <- ops:
   384  	case <-c.shutdownCh:
   385  	}
   386  }
   387  
   388  // merge registrations into state map prior to sync'ing with Consul
   389  func (c *ServiceClient) merge(ops *operations) {
   390  	for _, s := range ops.regServices {
   391  		c.services[s.ID] = s
   392  	}
   393  	for _, check := range ops.regChecks {
   394  		c.checks[check.ID] = check
   395  	}
   396  	for _, s := range ops.scripts {
   397  		c.scripts[s.id] = s
   398  	}
   399  	for _, sid := range ops.deregServices {
   400  		delete(c.services, sid)
   401  	}
   402  	for _, cid := range ops.deregChecks {
   403  		if script, ok := c.runningScripts[cid]; ok {
   404  			script.cancel()
   405  			delete(c.scripts, cid)
   406  			delete(c.runningScripts, cid)
   407  		}
   408  		delete(c.checks, cid)
   409  	}
   410  	metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services)))
   411  	metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks)))
   412  	metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts)))
   413  }
   414  
   415  // sync enqueued operations.
   416  func (c *ServiceClient) sync() error {
   417  	sreg, creg, sdereg, cdereg := 0, 0, 0, 0
   418  
   419  	consulServices, err := c.client.Services()
   420  	if err != nil {
   421  		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   422  		return fmt.Errorf("error querying Consul services: %v", err)
   423  	}
   424  
   425  	consulChecks, err := c.client.Checks()
   426  	if err != nil {
   427  		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   428  		return fmt.Errorf("error querying Consul checks: %v", err)
   429  	}
   430  
   431  	// Remove Nomad services in Consul but unknown locally
   432  	for id := range consulServices {
   433  		if _, ok := c.services[id]; ok {
   434  			// Known service, skip
   435  			continue
   436  		}
   437  		if !isNomadService(id) {
   438  			// Not managed by Nomad, skip
   439  			continue
   440  		}
   441  
   442  		// Unknown Nomad managed service; kill
   443  		if err := c.client.ServiceDeregister(id); err != nil {
   444  			if isOldNomadService(id) {
   445  				// Don't hard-fail on old entries. See #3620
   446  				continue
   447  			}
   448  
   449  			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   450  			return err
   451  		}
   452  		sdereg++
   453  		metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
   454  	}
   455  
   456  	// Add Nomad services missing from Consul
   457  	for id, locals := range c.services {
   458  		if _, ok := consulServices[id]; !ok {
   459  			if err = c.client.ServiceRegister(locals); err != nil {
   460  				metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   461  				return err
   462  			}
   463  			sreg++
   464  			metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
   465  		}
   466  	}
   467  
   468  	// Remove Nomad checks in Consul but unknown locally
   469  	for id, check := range consulChecks {
   470  		if _, ok := c.checks[id]; ok {
   471  			// Known check, leave it
   472  			continue
   473  		}
   474  		if !isNomadService(check.ServiceID) {
   475  			// Service not managed by Nomad, skip
   476  			continue
   477  		}
   478  
   479  		// Unknown Nomad managed check; remove
   480  		if err := c.client.CheckDeregister(id); err != nil {
   481  			if isOldNomadService(check.ServiceID) {
   482  				// Don't hard-fail on old entries.
   483  				continue
   484  			}
   485  
   486  			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   487  			return err
   488  		}
   489  		cdereg++
   490  		metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
   491  	}
   492  
   493  	// Add Nomad checks missing from Consul
   494  	for id, check := range c.checks {
   495  		if _, ok := consulChecks[id]; ok {
   496  			// Already in Consul; skipping
   497  			continue
   498  		}
   499  
   500  		if err := c.client.CheckRegister(check); err != nil {
   501  			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   502  			return err
   503  		}
   504  		creg++
   505  		metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)
   506  
   507  		// Handle starting scripts
   508  		if script, ok := c.scripts[id]; ok {
   509  			// If it's already running, cancel and replace
   510  			if oldScript, running := c.runningScripts[id]; running {
   511  				oldScript.cancel()
   512  			}
   513  			// Start and store the handle
   514  			c.runningScripts[id] = script.run()
   515  		}
   516  	}
   517  
   518  	c.logger.Printf("[DEBUG] consul.sync: registered %d services, %d checks; deregistered %d services, %d checks",
   519  		sreg, creg, sdereg, cdereg)
   520  	return nil
   521  }
   522  
   523  // RegisterAgent registers Nomad agents (client or server). The
   524  // Service.PortLabel should be a literal port to be parsed with SplitHostPort.
   525  // Script checks are not supported and will return an error. Registration is
   526  // asynchronous.
   527  //
   528  // Agents will be deregistered when Shutdown is called.
   529  func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
   530  	ops := operations{}
   531  
   532  	for _, service := range services {
   533  		id := makeAgentServiceID(role, service)
   534  
   535  		// Unlike tasks, agents don't use port labels. Agent ports are
   536  		// stored directly in the PortLabel.
   537  		host, rawport, err := net.SplitHostPort(service.PortLabel)
   538  		if err != nil {
   539  			return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
   540  		}
   541  		port, err := strconv.Atoi(rawport)
   542  		if err != nil {
   543  			return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
   544  		}
   545  		serviceReg := &api.AgentServiceRegistration{
   546  			ID:      id,
   547  			Name:    service.Name,
   548  			Tags:    service.Tags,
   549  			Address: host,
   550  			Port:    port,
   551  		}
   552  		ops.regServices = append(ops.regServices, serviceReg)
   553  
   554  		for _, check := range service.Checks {
   555  			checkID := makeCheckID(id, check)
   556  			if check.Type == structs.ServiceCheckScript {
   557  				return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
   558  			}
   559  			checkHost, checkPort := serviceReg.Address, serviceReg.Port
   560  			if check.PortLabel != "" {
   561  				// Unlike tasks, agents don't use port labels. Agent ports are
   562  				// stored directly in the PortLabel.
   563  				host, rawport, err := net.SplitHostPort(check.PortLabel)
   564  				if err != nil {
   565  					return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err)
   566  				}
   567  				port, err := strconv.Atoi(rawport)
   568  				if err != nil {
   569  					return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err)
   570  				}
   571  				checkHost, checkPort = host, port
   572  			}
   573  			checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort)
   574  			if err != nil {
   575  				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
   576  			}
   577  			ops.regChecks = append(ops.regChecks, checkReg)
   578  		}
   579  	}
   580  
   581  	// Don't bother committing agent checks if we're already shutting down
   582  	c.agentLock.Lock()
   583  	defer c.agentLock.Unlock()
   584  	select {
   585  	case <-c.shutdownCh:
   586  		return nil
   587  	default:
   588  	}
   589  
   590  	// Now add them to the registration queue
   591  	c.commit(&ops)
   592  
   593  	// Record IDs for deregistering on shutdown
   594  	for _, id := range ops.regServices {
   595  		c.agentServices[id.ID] = struct{}{}
   596  	}
   597  	for _, id := range ops.regChecks {
   598  		c.agentChecks[id.ID] = struct{}{}
   599  	}
   600  	return nil
   601  }
   602  
   603  // serviceRegs creates service registrations, check registrations, and script
   604  // checks from a service. It returns a service registration object with the
   605  // service and check IDs populated.
   606  func (c *ServiceClient) serviceRegs(ops *operations, allocID string, service *structs.Service,
   607  	task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) (*ServiceRegistration, error) {
   608  
   609  	// Get the services ID
   610  	id := makeTaskServiceID(allocID, task.Name, service)
   611  	sreg := &ServiceRegistration{
   612  		serviceID: id,
   613  		checkIDs:  make(map[string]struct{}, len(service.Checks)),
   614  	}
   615  
   616  	// Service address modes default to auto
   617  	addrMode := service.AddressMode
   618  	if addrMode == "" {
   619  		addrMode = structs.AddressModeAuto
   620  	}
   621  
   622  	// Determine the address to advertise based on the mode
   623  	ip, port, err := getAddress(addrMode, service.PortLabel, task.Resources.Networks, net)
   624  	if err != nil {
   625  		return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err)
   626  	}
   627  
   628  	// Build the Consul Service registration request
   629  	serviceReg := &api.AgentServiceRegistration{
   630  		ID:      id,
   631  		Name:    service.Name,
   632  		Tags:    make([]string, len(service.Tags)),
   633  		Address: ip,
   634  		Port:    port,
   635  	}
   636  	// copy isn't strictly necessary but can avoid bugs especially
   637  	// with tests that may reuse Tasks
   638  	copy(serviceReg.Tags, service.Tags)
   639  	ops.regServices = append(ops.regServices, serviceReg)
   640  
   641  	// Build the check registrations
   642  	checkIDs, err := c.checkRegs(ops, allocID, id, service, task, exec, net)
   643  	if err != nil {
   644  		return nil, err
   645  	}
   646  	for _, cid := range checkIDs {
   647  		sreg.checkIDs[cid] = struct{}{}
   648  	}
   649  	return sreg, nil
   650  }
   651  
   652  // checkRegs registers the checks for the given service and returns the
   653  // registered check ids.
   654  func (c *ServiceClient) checkRegs(ops *operations, allocID, serviceID string, service *structs.Service,
   655  	task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) ([]string, error) {
   656  
   657  	// Fast path
   658  	numChecks := len(service.Checks)
   659  	if numChecks == 0 {
   660  		return nil, nil
   661  	}
   662  
   663  	checkIDs := make([]string, 0, numChecks)
   664  	for _, check := range service.Checks {
   665  		checkID := makeCheckID(serviceID, check)
   666  		checkIDs = append(checkIDs, checkID)
   667  		if check.Type == structs.ServiceCheckScript {
   668  			if exec == nil {
   669  				return nil, fmt.Errorf("driver doesn't support script checks")
   670  			}
   671  			ops.scripts = append(ops.scripts, newScriptCheck(
   672  				allocID, task.Name, checkID, check, exec, c.client, c.logger, c.shutdownCh))
   673  
   674  			// Skip getAddress for script checks
   675  			checkReg, err := createCheckReg(serviceID, checkID, check, "", 0)
   676  			if err != nil {
   677  				return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err)
   678  			}
   679  			ops.regChecks = append(ops.regChecks, checkReg)
   680  			continue
   681  		}
   682  
   683  		// Default to the service's port but allow check to override
   684  		portLabel := check.PortLabel
   685  		if portLabel == "" {
   686  			// Default to the service's port label
   687  			portLabel = service.PortLabel
   688  		}
   689  
   690  		// Checks address mode defaults to host for pre-#3380 backward compat
   691  		addrMode := check.AddressMode
   692  		if addrMode == "" {
   693  			addrMode = structs.AddressModeHost
   694  		}
   695  
   696  		ip, port, err := getAddress(addrMode, portLabel, task.Resources.Networks, net)
   697  		if err != nil {
   698  			return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err)
   699  		}
   700  
   701  		checkReg, err := createCheckReg(serviceID, checkID, check, ip, port)
   702  		if err != nil {
   703  			return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err)
   704  		}
   705  		ops.regChecks = append(ops.regChecks, checkReg)
   706  	}
   707  	return checkIDs, nil
   708  }
   709  
   710  // RegisterTask with Consul. Adds all service entries and checks to Consul. If
   711  // exec is nil and a script check exists an error is returned.
   712  //
   713  // If the service IP is set it used as the address in the service registration.
   714  // Checks will always use the IP from the Task struct (host's IP).
   715  //
   716  // Actual communication with Consul is done asynchronously (see Run).
   717  func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error {
   718  	// Fast path
   719  	numServices := len(task.Services)
   720  	if numServices == 0 {
   721  		return nil
   722  	}
   723  
   724  	t := new(TaskRegistration)
   725  	t.Services = make(map[string]*ServiceRegistration, numServices)
   726  
   727  	ops := &operations{}
   728  	for _, service := range task.Services {
   729  		sreg, err := c.serviceRegs(ops, allocID, service, task, exec, net)
   730  		if err != nil {
   731  			return err
   732  		}
   733  		t.Services[sreg.serviceID] = sreg
   734  	}
   735  
   736  	// Add the task to the allocation's registration
   737  	c.addTaskRegistration(allocID, task.Name, t)
   738  
   739  	c.commit(ops)
   740  
   741  	// Start watching checks. Done after service registrations are built
   742  	// since an error building them could leak watches.
   743  	for _, service := range task.Services {
   744  		serviceID := makeTaskServiceID(allocID, task.Name, service)
   745  		for _, check := range service.Checks {
   746  			if check.TriggersRestarts() {
   747  				checkID := makeCheckID(serviceID, check)
   748  				c.checkWatcher.Watch(allocID, task.Name, checkID, check, restarter)
   749  			}
   750  		}
   751  	}
   752  	return nil
   753  }
   754  
   755  // UpdateTask in Consul. Does not alter the service if only checks have
   756  // changed.
   757  //
   758  // DriverNetwork must not change between invocations for the same allocation.
   759  func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error {
   760  	ops := &operations{}
   761  
   762  	taskReg := new(TaskRegistration)
   763  	taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services))
   764  
   765  	existingIDs := make(map[string]*structs.Service, len(existing.Services))
   766  	for _, s := range existing.Services {
   767  		existingIDs[makeTaskServiceID(allocID, existing.Name, s)] = s
   768  	}
   769  	newIDs := make(map[string]*structs.Service, len(newTask.Services))
   770  	for _, s := range newTask.Services {
   771  		newIDs[makeTaskServiceID(allocID, newTask.Name, s)] = s
   772  	}
   773  
   774  	// Loop over existing Service IDs to see if they have been removed or
   775  	// updated.
   776  	for existingID, existingSvc := range existingIDs {
   777  		newSvc, ok := newIDs[existingID]
   778  		if !ok {
   779  			// Existing service entry removed
   780  			ops.deregServices = append(ops.deregServices, existingID)
   781  			for _, check := range existingSvc.Checks {
   782  				cid := makeCheckID(existingID, check)
   783  				ops.deregChecks = append(ops.deregChecks, cid)
   784  
   785  				// Unwatch watched checks
   786  				if check.TriggersRestarts() {
   787  					c.checkWatcher.Unwatch(cid)
   788  				}
   789  			}
   790  			continue
   791  		}
   792  
   793  		// Service exists and hasn't changed, don't re-add it later
   794  		delete(newIDs, existingID)
   795  
   796  		// Service still exists so add it to the task's registration
   797  		sreg := &ServiceRegistration{
   798  			serviceID: existingID,
   799  			checkIDs:  make(map[string]struct{}, len(newSvc.Checks)),
   800  		}
   801  		taskReg.Services[existingID] = sreg
   802  
   803  		// See if any checks were updated
   804  		existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
   805  		for _, check := range existingSvc.Checks {
   806  			existingChecks[makeCheckID(existingID, check)] = check
   807  		}
   808  
   809  		// Register new checks
   810  		for _, check := range newSvc.Checks {
   811  			checkID := makeCheckID(existingID, check)
   812  			if _, exists := existingChecks[checkID]; exists {
   813  				// Check exists, so don't remove it
   814  				delete(existingChecks, checkID)
   815  				sreg.checkIDs[checkID] = struct{}{}
   816  			}
   817  
   818  			// New check on an unchanged service; add them now
   819  			newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net)
   820  			if err != nil {
   821  				return err
   822  			}
   823  
   824  			for _, checkID := range newCheckIDs {
   825  				sreg.checkIDs[checkID] = struct{}{}
   826  
   827  			}
   828  
   829  			// Update all watched checks as CheckRestart fields aren't part of ID
   830  			if check.TriggersRestarts() {
   831  				c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter)
   832  			}
   833  		}
   834  
   835  		// Remove existing checks not in updated service
   836  		for cid, check := range existingChecks {
   837  			ops.deregChecks = append(ops.deregChecks, cid)
   838  
   839  			// Unwatch checks
   840  			if check.TriggersRestarts() {
   841  				c.checkWatcher.Unwatch(cid)
   842  			}
   843  		}
   844  	}
   845  
   846  	// Any remaining services should just be enqueued directly
   847  	for _, newSvc := range newIDs {
   848  		sreg, err := c.serviceRegs(ops, allocID, newSvc, newTask, exec, net)
   849  		if err != nil {
   850  			return err
   851  		}
   852  
   853  		taskReg.Services[sreg.serviceID] = sreg
   854  	}
   855  
   856  	// Add the task to the allocation's registration
   857  	c.addTaskRegistration(allocID, newTask.Name, taskReg)
   858  
   859  	c.commit(ops)
   860  
   861  	// Start watching checks. Done after service registrations are built
   862  	// since an error building them could leak watches.
   863  	for _, service := range newIDs {
   864  		serviceID := makeTaskServiceID(allocID, newTask.Name, service)
   865  		for _, check := range service.Checks {
   866  			if check.TriggersRestarts() {
   867  				checkID := makeCheckID(serviceID, check)
   868  				c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter)
   869  			}
   870  		}
   871  	}
   872  	return nil
   873  }
   874  
   875  // RemoveTask from Consul. Removes all service entries and checks.
   876  //
   877  // Actual communication with Consul is done asynchronously (see Run).
   878  func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) {
   879  	ops := operations{}
   880  
   881  	for _, service := range task.Services {
   882  		id := makeTaskServiceID(allocID, task.Name, service)
   883  		ops.deregServices = append(ops.deregServices, id)
   884  
   885  		for _, check := range service.Checks {
   886  			cid := makeCheckID(id, check)
   887  			ops.deregChecks = append(ops.deregChecks, cid)
   888  
   889  			if check.TriggersRestarts() {
   890  				c.checkWatcher.Unwatch(cid)
   891  			}
   892  		}
   893  	}
   894  
   895  	// Remove the task from the alloc's registrations
   896  	c.removeTaskRegistration(allocID, task.Name)
   897  
   898  	// Now add them to the deregistration fields; main Run loop will update
   899  	c.commit(&ops)
   900  }
   901  
   902  // AllocRegistrations returns the registrations for the given allocation. If the
   903  // allocation has no reservations, the response is a nil object.
   904  func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) {
   905  	// Get the internal struct using the lock
   906  	c.allocRegistrationsLock.RLock()
   907  	regInternal, ok := c.allocRegistrations[allocID]
   908  	if !ok {
   909  		c.allocRegistrationsLock.RUnlock()
   910  		return nil, nil
   911  	}
   912  
   913  	// Copy so we don't expose internal structs
   914  	reg := regInternal.copy()
   915  	c.allocRegistrationsLock.RUnlock()
   916  
   917  	// Query the services and checks to populate the allocation registrations.
   918  	services, err := c.client.Services()
   919  	if err != nil {
   920  		return nil, err
   921  	}
   922  
   923  	checks, err := c.client.Checks()
   924  	if err != nil {
   925  		return nil, err
   926  	}
   927  
   928  	// Populate the object
   929  	for _, treg := range reg.Tasks {
   930  		for serviceID, sreg := range treg.Services {
   931  			sreg.Service = services[serviceID]
   932  			for checkID := range sreg.checkIDs {
   933  				if check, ok := checks[checkID]; ok {
   934  					sreg.Checks = append(sreg.Checks, check)
   935  				}
   936  			}
   937  		}
   938  	}
   939  
   940  	return reg, nil
   941  }
   942  
   943  // Shutdown the Consul client. Update running task registrations and deregister
   944  // agent from Consul. On first call blocks up to shutdownWait before giving up
   945  // on syncing operations.
   946  func (c *ServiceClient) Shutdown() error {
   947  	// Serialize Shutdown calls with RegisterAgent to prevent leaking agent
   948  	// entries.
   949  	c.agentLock.Lock()
   950  	defer c.agentLock.Unlock()
   951  	select {
   952  	case <-c.shutdownCh:
   953  		return nil
   954  	default:
   955  		close(c.shutdownCh)
   956  	}
   957  
   958  	// Give run loop time to sync, but don't block indefinitely
   959  	deadline := time.After(c.shutdownWait)
   960  
   961  	// Wait for Run to finish any outstanding operations and exit
   962  	select {
   963  	case <-c.exitCh:
   964  	case <-deadline:
   965  		// Don't wait forever though
   966  	}
   967  
   968  	// If Consul was never seen nothing could be written so exit early
   969  	if !c.hasSeen() {
   970  		return nil
   971  	}
   972  
   973  	// Always attempt to deregister Nomad agent Consul entries, even if
   974  	// deadline was reached
   975  	for id := range c.agentServices {
   976  		if err := c.client.ServiceDeregister(id); err != nil {
   977  			c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err)
   978  		}
   979  	}
   980  	for id := range c.agentChecks {
   981  		if err := c.client.CheckDeregister(id); err != nil {
   982  			c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err)
   983  		}
   984  	}
   985  
   986  	// Give script checks time to exit (no need to lock as Run() has exited)
   987  	for _, h := range c.runningScripts {
   988  		select {
   989  		case <-h.wait():
   990  		case <-deadline:
   991  			return fmt.Errorf("timed out waiting for script checks to run")
   992  		}
   993  	}
   994  	return nil
   995  }
   996  
   997  // addTaskRegistration adds the task registration for the given allocation.
   998  func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) {
   999  	c.allocRegistrationsLock.Lock()
  1000  	defer c.allocRegistrationsLock.Unlock()
  1001  
  1002  	alloc, ok := c.allocRegistrations[allocID]
  1003  	if !ok {
  1004  		alloc = &AllocRegistration{
  1005  			Tasks: make(map[string]*TaskRegistration),
  1006  		}
  1007  		c.allocRegistrations[allocID] = alloc
  1008  	}
  1009  	alloc.Tasks[taskName] = reg
  1010  }
  1011  
  1012  // removeTaskRegistration removes the task registration for the given allocation.
  1013  func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) {
  1014  	c.allocRegistrationsLock.Lock()
  1015  	defer c.allocRegistrationsLock.Unlock()
  1016  
  1017  	alloc, ok := c.allocRegistrations[allocID]
  1018  	if !ok {
  1019  		return
  1020  	}
  1021  
  1022  	// Delete the task and if it is the last one also delete the alloc's
  1023  	// registration
  1024  	delete(alloc.Tasks, taskName)
  1025  	if len(alloc.Tasks) == 0 {
  1026  		delete(c.allocRegistrations, allocID)
  1027  	}
  1028  }
  1029  
  1030  // makeAgentServiceID creates a unique ID for identifying an agent service in
  1031  // Consul.
  1032  //
  1033  // Agent service IDs are of the form:
  1034  //
  1035  //	{nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...})
  1036  //	Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4
  1037  //	Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l
  1038  //
  1039  func makeAgentServiceID(role string, service *structs.Service) string {
  1040  	return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, ""))
  1041  }
  1042  
  1043  // makeTaskServiceID creates a unique ID for identifying a task service in
  1044  // Consul. All structs.Service fields are included in the ID's hash except
  1045  // Checks. This allows updates to merely compare IDs.
  1046  //
  1047  //	Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH
  1048  func makeTaskServiceID(allocID, taskName string, service *structs.Service) string {
  1049  	return nomadTaskPrefix + service.Hash(allocID, taskName)
  1050  }
  1051  
  1052  // makeCheckID creates a unique ID for a check.
  1053  func makeCheckID(serviceID string, check *structs.ServiceCheck) string {
  1054  	return check.Hash(serviceID)
  1055  }
  1056  
  1057  // createCheckReg creates a Check that can be registered with Consul.
  1058  //
  1059  // Script checks simply have a TTL set and the caller is responsible for
  1060  // running the script and heartbeating.
  1061  func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) {
  1062  	chkReg := api.AgentCheckRegistration{
  1063  		ID:        checkID,
  1064  		Name:      check.Name,
  1065  		ServiceID: serviceID,
  1066  	}
  1067  	chkReg.Status = check.InitialStatus
  1068  	chkReg.Timeout = check.Timeout.String()
  1069  	chkReg.Interval = check.Interval.String()
  1070  
  1071  	// Require an address for http or tcp checks
  1072  	if port == 0 && check.RequiresPort() {
  1073  		return nil, fmt.Errorf("%s checks require an address", check.Type)
  1074  	}
  1075  
  1076  	switch check.Type {
  1077  	case structs.ServiceCheckHTTP:
  1078  		proto := check.Protocol
  1079  		if proto == "" {
  1080  			proto = "http"
  1081  		}
  1082  		if check.TLSSkipVerify {
  1083  			chkReg.TLSSkipVerify = true
  1084  		}
  1085  		base := url.URL{
  1086  			Scheme: proto,
  1087  			Host:   net.JoinHostPort(host, strconv.Itoa(port)),
  1088  		}
  1089  		relative, err := url.Parse(check.Path)
  1090  		if err != nil {
  1091  			return nil, err
  1092  		}
  1093  		url := base.ResolveReference(relative)
  1094  		chkReg.HTTP = url.String()
  1095  		chkReg.Method = check.Method
  1096  		chkReg.Header = check.Header
  1097  	case structs.ServiceCheckTCP:
  1098  		chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))
  1099  	case structs.ServiceCheckScript:
  1100  		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
  1101  		// As of Consul 1.0.0 setting TTL and Interval is a 400
  1102  		chkReg.Interval = ""
  1103  	default:
  1104  		return nil, fmt.Errorf("check type %+q not valid", check.Type)
  1105  	}
  1106  	return &chkReg, nil
  1107  }
  1108  
  1109  // isNomadService returns true if the ID matches the pattern of a Nomad managed
  1110  // service (new or old formats). Agent services return false as independent
  1111  // client and server agents may be running on the same machine. #2827
  1112  func isNomadService(id string) bool {
  1113  	return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
  1114  }
  1115  
  1116  // isOldNomadService returns true if the ID matches an old pattern managed by
  1117  // Nomad.
  1118  //
  1119  // Pre-0.7.1 task service IDs are of the form:
  1120  //
  1121  //	{nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
  1122  //	Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
  1123  //
  1124  func isOldNomadService(id string) bool {
  1125  	const prefix = nomadServicePrefix + "-executor"
  1126  	return strings.HasPrefix(id, prefix)
  1127  }
  1128  
  1129  // getAddress returns the IP and port to use for a service or check. If no port
  1130  // label is specified (an empty value), zero values are returned because no
  1131  // address could be resolved.
  1132  func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *cstructs.DriverNetwork) (string, int, error) {
  1133  	switch addrMode {
  1134  	case structs.AddressModeAuto:
  1135  		if driverNet.Advertise() {
  1136  			addrMode = structs.AddressModeDriver
  1137  		} else {
  1138  			addrMode = structs.AddressModeHost
  1139  		}
  1140  		return getAddress(addrMode, portLabel, networks, driverNet)
  1141  	case structs.AddressModeHost:
  1142  		if portLabel == "" {
  1143  			if len(networks) != 1 {
  1144  				// If no networks are specified return zero
  1145  				// values. Consul will advertise the host IP
  1146  				// with no port. This is the pre-0.7.1 behavior
  1147  				// some people rely on.
  1148  				return "", 0, nil
  1149  			}
  1150  
  1151  			return networks[0].IP, 0, nil
  1152  		}
  1153  
  1154  		// Default path: use host ip:port
  1155  		ip, port := networks.Port(portLabel)
  1156  		if ip == "" && port <= 0 {
  1157  			return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel)
  1158  		}
  1159  		return ip, port, nil
  1160  
  1161  	case structs.AddressModeDriver:
  1162  		// Require a driver network if driver address mode is used
  1163  		if driverNet == nil {
  1164  			return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`)
  1165  		}
  1166  
  1167  		// If no port label is specified just return the IP
  1168  		if portLabel == "" {
  1169  			return driverNet.IP, 0, nil
  1170  		}
  1171  
  1172  		// If the port is a label, use the driver's port (not the host's)
  1173  		if port, ok := driverNet.PortMap[portLabel]; ok {
  1174  			return driverNet.IP, port, nil
  1175  		}
  1176  
  1177  		// If port isn't a label, try to parse it as a literal port number
  1178  		port, err := strconv.Atoi(portLabel)
  1179  		if err != nil {
  1180  			// Don't include Atoi error message as user likely
  1181  			// never intended it to be a numeric and it creates a
  1182  			// confusing error message
  1183  			return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel)
  1184  		}
  1185  		if port <= 0 {
  1186  			return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
  1187  		}
  1188  
  1189  		return driverNet.IP, port, nil
  1190  
  1191  	default:
  1192  		// Shouldn't happen due to validation, but enforce invariants
  1193  		return "", 0, fmt.Errorf("invalid address mode %q", addrMode)
  1194  	}
  1195  }