github.com/smithx10/nomad@v0.9.1-rc1/command/agent/consul/client.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"net"
     7  	"net/url"
     8  	"strconv"
     9  	"strings"
    10  	"sync"
    11  	"sync/atomic"
    12  	"time"
    13  
    14  	metrics "github.com/armon/go-metrics"
    15  	log "github.com/hashicorp/go-hclog"
    16  
    17  	"github.com/hashicorp/consul/api"
    18  	"github.com/hashicorp/nomad/helper"
    19  	"github.com/hashicorp/nomad/nomad/structs"
    20  	"github.com/hashicorp/nomad/plugins/drivers"
    21  )
    22  
const (
	// nomadServicePrefix is the prefix that scopes all Nomad registered
	// services (both agent and task entries).
	nomadServicePrefix = "_nomad"

	// nomadTaskPrefix is the prefix that scopes Nomad registered services
	// for tasks.
	nomadTaskPrefix = nomadServicePrefix + "-task-"

	// defaultRetryInterval is how quickly to retry syncing services and
	// checks to Consul when an error occurs. Will backoff up to a max.
	defaultRetryInterval = time.Second

	// defaultMaxRetryInterval is the default max retry interval.
	defaultMaxRetryInterval = 30 * time.Second

	// defaultPeriodicInterval is the interval at which the service
	// client reconciles state between the desired services and checks and
	// what's actually registered in Consul. This is done at an interval,
	// rather than being purely edge triggered, to handle the case that the
	// Consul agent's state may change underneath us.
	defaultPeriodicInterval = 30 * time.Second

	// ttlCheckBuffer is the extra time Nomad is allowed to take when
	// reporting a check result to Consul.
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services
	ServiceTagSerf = "serf"
)
    67  
// CatalogAPI is the subset of the consul/api.Catalog API used by Nomad.
// Only the methods Nomad actually calls are listed, so any type providing
// these two methods satisfies the interface.
type CatalogAPI interface {
	Datacenters() ([]string, error)
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}
    73  
// AgentAPI is the subset of the consul/api.Agent API used by Nomad. The
// ServiceClient talks to Consul exclusively through this interface: it lists
// and (de)registers services and checks during sync.
type AgentAPI interface {
	Services() (map[string]*api.AgentService, error)
	Checks() (map[string]*api.AgentCheck, error)
	CheckRegister(check *api.AgentCheckRegistration) error
	CheckDeregister(checkID string) error
	Self() (map[string]map[string]interface{}, error)
	ServiceRegister(service *api.AgentServiceRegistration) error
	ServiceDeregister(serviceID string) error
	UpdateTTL(id, output, status string) error
}
    85  
// operations are submitted to the main loop via commit() for synchronizing
// with Consul. A single value batches registrations and deregistrations so
// they are merged into the client's desired state together.
type operations struct {
	// Registrations to add to the desired state, keyed later by their ID.
	regServices []*api.AgentServiceRegistration
	regChecks   []*api.AgentCheckRegistration
	// Script checks to store alongside their check registration.
	scripts     []*scriptCheck

	// IDs of services and checks to drop from the desired state.
	deregServices []string
	deregChecks   []string
}
    96  
// AllocRegistration holds the status of services registered for a particular
// allocation, grouped by task.
type AllocRegistration struct {
	// Tasks maps the name of a task to its registered services and checks
	Tasks map[string]*TaskRegistration
}
   103  
   104  func (a *AllocRegistration) copy() *AllocRegistration {
   105  	c := &AllocRegistration{
   106  		Tasks: make(map[string]*TaskRegistration, len(a.Tasks)),
   107  	}
   108  
   109  	for k, v := range a.Tasks {
   110  		c.Tasks[k] = v.copy()
   111  	}
   112  
   113  	return c
   114  }
   115  
   116  // NumServices returns the number of registered services
   117  func (a *AllocRegistration) NumServices() int {
   118  	if a == nil {
   119  		return 0
   120  	}
   121  
   122  	total := 0
   123  	for _, treg := range a.Tasks {
   124  		for _, sreg := range treg.Services {
   125  			if sreg.Service != nil {
   126  				total++
   127  			}
   128  		}
   129  	}
   130  
   131  	return total
   132  }
   133  
   134  // NumChecks returns the number of registered checks
   135  func (a *AllocRegistration) NumChecks() int {
   136  	if a == nil {
   137  		return 0
   138  	}
   139  
   140  	total := 0
   141  	for _, treg := range a.Tasks {
   142  		for _, sreg := range treg.Services {
   143  			total += len(sreg.Checks)
   144  		}
   145  	}
   146  
   147  	return total
   148  }
   149  
// TaskRegistration holds the status of services registered for a particular
// task.
type TaskRegistration struct {
	// Services holds the task's service registrations. RegisterTask and
	// UpdateTask key this map by service ID.
	Services map[string]*ServiceRegistration
}
   155  
   156  func (t *TaskRegistration) copy() *TaskRegistration {
   157  	c := &TaskRegistration{
   158  		Services: make(map[string]*ServiceRegistration, len(t.Services)),
   159  	}
   160  
   161  	for k, v := range t.Services {
   162  		c.Services[k] = v.copy()
   163  	}
   164  
   165  	return c
   166  }
   167  
// ServiceRegistration holds the status of a registered Consul Service and its
// Checks.
type ServiceRegistration struct {
	// serviceID and checkIDs are internal fields that track just the IDs of the
	// services/checks registered in Consul. They are used to materialize the
	// other fields when queried.
	serviceID string
	checkIDs  map[string]struct{}

	// Service is the AgentService registered in Consul.
	Service *api.AgentService

	// Checks is the status of the registered checks.
	Checks []*api.AgentCheck
}
   183  
   184  func (s *ServiceRegistration) copy() *ServiceRegistration {
   185  	// Copy does not copy the external fields but only the internal fields. This
   186  	// is so that the caller of AllocRegistrations can not access the internal
   187  	// fields and that method uses these fields to populate the external fields.
   188  	return &ServiceRegistration{
   189  		serviceID: s.serviceID,
   190  		checkIDs:  helper.CopyMapStringStruct(s.checkIDs),
   191  	}
   192  }
   193  
// ServiceClient handles task and agent service registration with Consul.
type ServiceClient struct {
	// client is the Consul agent API used for all registrations and queries.
	client           AgentAPI
	logger           log.Logger
	// retryInterval is the base delay between sync retries after a failure;
	// Run multiplies it by the number of consecutive failures and caps the
	// result at maxRetryInterval.
	retryInterval    time.Duration
	maxRetryInterval time.Duration
	// periodicInterval is the delay until the next sync after a successful
	// one.
	periodicInterval time.Duration

	// exitCh is closed when the main Run loop exits
	exitCh chan struct{}

	// shutdownCh is closed when the client should shutdown
	shutdownCh chan struct{}

	// shutdownWait is how long Shutdown() blocks waiting for the final
	// sync() to finish. Defaults to defaultShutdownWait
	shutdownWait time.Duration

	// opCh carries batches of operations from commit() to the Run loop.
	opCh chan *operations

	// services, checks and scripts are the desired state, keyed by ID, as
	// accumulated by merge(). runningScripts holds handles for the script
	// checks that sync() has started.
	services       map[string]*api.AgentServiceRegistration
	checks         map[string]*api.AgentCheckRegistration
	scripts        map[string]*scriptCheck
	runningScripts map[string]*scriptHandle

	// allocRegistrations stores the services and checks that are registered
	// with Consul by allocation ID.
	allocRegistrations     map[string]*AllocRegistration
	allocRegistrationsLock sync.RWMutex

	// agent services and checks record entries for the agent itself which
	// should be removed on shutdown
	agentServices map[string]struct{}
	agentChecks   map[string]struct{}
	agentLock     sync.Mutex

	// seen is 1 if Consul has ever been seen; otherwise 0. Accessed with
	// atomics.
	seen int32

	// checkWatcher restarts checks that are unhealthy.
	checkWatcher *checkWatcher

	// isClientAgent specifies whether this Consul client is being used
	// by a Nomad client.
	isClientAgent bool
}
   241  
   242  // NewServiceClient creates a new Consul ServiceClient from an existing Consul API
   243  // Client, logger and takes whether the client is being used by a Nomad Client agent.
   244  // When being used by a Nomad client, this Consul client reconciles all services and
   245  // checks created by Nomad on behalf of running tasks.
   246  func NewServiceClient(consulClient AgentAPI, logger log.Logger, isNomadClient bool) *ServiceClient {
   247  	logger = logger.ResetNamed("consul.sync")
   248  	return &ServiceClient{
   249  		client:             consulClient,
   250  		logger:             logger,
   251  		retryInterval:      defaultRetryInterval,
   252  		maxRetryInterval:   defaultMaxRetryInterval,
   253  		periodicInterval:   defaultPeriodicInterval,
   254  		exitCh:             make(chan struct{}),
   255  		shutdownCh:         make(chan struct{}),
   256  		shutdownWait:       defaultShutdownWait,
   257  		opCh:               make(chan *operations, 8),
   258  		services:           make(map[string]*api.AgentServiceRegistration),
   259  		checks:             make(map[string]*api.AgentCheckRegistration),
   260  		scripts:            make(map[string]*scriptCheck),
   261  		runningScripts:     make(map[string]*scriptHandle),
   262  		allocRegistrations: make(map[string]*AllocRegistration),
   263  		agentServices:      make(map[string]struct{}),
   264  		agentChecks:        make(map[string]struct{}),
   265  		checkWatcher:       newCheckWatcher(logger, consulClient),
   266  		isClientAgent:      isNomadClient,
   267  	}
   268  }
   269  
// seen is the value markSeen stores in, and hasSeen compares against, the
// ServiceClient.seen flag.
const seen = 1
   272  
   273  // markSeen marks Consul as having been seen (meaning at least one operation
   274  // has succeeded).
   275  func (c *ServiceClient) markSeen() {
   276  	atomic.StoreInt32(&c.seen, seen)
   277  }
   278  
   279  // hasSeen returns true if any Consul operation has ever succeeded. Useful to
   280  // squelch errors if Consul isn't running.
   281  func (c *ServiceClient) hasSeen() bool {
   282  	return atomic.LoadInt32(&c.seen) == seen
   283  }
   284  
// Run the Consul main loop which retries operations against Consul. It should
// be called exactly once.
//
// The loop has two phases. Until initial contact with Consul is signalled it
// only merges submitted operations into the desired state. Afterwards it
// syncs on every submitted operation, on a timer (periodic after a success,
// backing off after a failure), and one final time on shutdown. exitCh is
// closed when Run returns.
func (c *ServiceClient) Run() {
	defer close(c.exitCh)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// init will be closed when Consul has been contacted
	init := make(chan struct{})
	go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init)

	// Process operations while waiting for initial contact with Consul but
	// do not sync until contact has been made.
INIT:
	for {
		select {
		case <-init:
			c.markSeen()
			break INIT
		case <-c.shutdownCh:
			return
		case ops := <-c.opCh:
			c.merge(ops)
		}
	}
	c.logger.Trace("able to contact Consul")

	// Contact with Consul has been established, so start the checkWatcher
	go c.checkWatcher.Run(ctx)

	// Always immediately sync to reconcile Nomad and Consul's state
	retryTimer := time.NewTimer(0)

	// failures counts consecutive failed syncs; it drives both log
	// throttling and the retry backoff.
	failures := 0
	for {
		select {
		case <-retryTimer.C:
		case <-c.shutdownCh:
			// Cancel check watcher but sync one last time
			cancel()
		case ops := <-c.opCh:
			c.merge(ops)
		}

		if err := c.sync(); err != nil {
			if failures == 0 {
				// Log on the first failure
				c.logger.Warn("failed to update services in Consul", "error", err)
			} else if failures%10 == 0 {
				// Log every 10th consecutive failure
				c.logger.Error("still unable to update services in Consul", "failures", failures, "error", err)
			}

			failures++
			if !retryTimer.Stop() {
				// Timer already expired, since the timer may
				// or may not have been read in the select{}
				// above, conditionally receive on it
				select {
				case <-retryTimer.C:
				default:
				}
			}
			// Linear backoff, capped at maxRetryInterval
			backoff := c.retryInterval * time.Duration(failures)
			if backoff > c.maxRetryInterval {
				backoff = c.maxRetryInterval
			}
			retryTimer.Reset(backoff)
		} else {
			if failures > 0 {
				c.logger.Info("successfully updated services in Consul")
				failures = 0
			}

			// Reset timer to periodic interval to periodically
			// reconcile with Consul
			if !retryTimer.Stop() {
				select {
				case <-retryTimer.C:
				default:
				}
			}
			retryTimer.Reset(c.periodicInterval)
		}

		select {
		case <-c.shutdownCh:
			// Exit only after sync'ing all outstanding operations
			if len(c.opCh) > 0 {
				for len(c.opCh) > 0 {
					c.merge(<-c.opCh)
				}
				continue
			}
			return
		default:
		}

	}
}
   387  
   388  // commit operations unless already shutting down.
   389  func (c *ServiceClient) commit(ops *operations) {
   390  	select {
   391  	case c.opCh <- ops:
   392  	case <-c.shutdownCh:
   393  	}
   394  }
   395  
   396  // merge registrations into state map prior to sync'ing with Consul
   397  func (c *ServiceClient) merge(ops *operations) {
   398  	for _, s := range ops.regServices {
   399  		c.services[s.ID] = s
   400  	}
   401  	for _, check := range ops.regChecks {
   402  		c.checks[check.ID] = check
   403  	}
   404  	for _, s := range ops.scripts {
   405  		c.scripts[s.id] = s
   406  	}
   407  	for _, sid := range ops.deregServices {
   408  		delete(c.services, sid)
   409  	}
   410  	for _, cid := range ops.deregChecks {
   411  		if script, ok := c.runningScripts[cid]; ok {
   412  			script.cancel()
   413  			delete(c.scripts, cid)
   414  			delete(c.runningScripts, cid)
   415  		}
   416  		delete(c.checks, cid)
   417  	}
   418  	metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services)))
   419  	metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks)))
   420  	metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts)))
   421  }
   422  
// sync reconciles the desired state held by the client with what the Consul
// agent currently reports. It runs four passes in order: deregister unknown
// Nomad services, register missing services, deregister unknown Nomad checks,
// register missing checks (starting any associated script check).
//
// It returns the first error encountered, after incrementing the
// sync_failure counter; passes after the failing call are skipped for this
// invocation.
func (c *ServiceClient) sync() error {
	// Counters used only for the summary log line at the end
	sreg, creg, sdereg, cdereg := 0, 0, 0, 0

	consulServices, err := c.client.Services()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul services: %v", err)
	}

	consulChecks, err := c.client.Checks()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul checks: %v", err)
	}

	// Remove Nomad services in Consul but unknown locally
	for id := range consulServices {
		if _, ok := c.services[id]; ok {
			// Known service, skip
			continue
		}

		// Ignore if this is not a Nomad managed service. Also ignore
		// Nomad managed services if this is not a client agent.
		// This is to prevent server agents from removing services
		// registered by client agents
		if !isNomadService(id) || !c.isClientAgent {
			// Not managed by Nomad, skip
			continue
		}

		// Unknown Nomad managed service; kill
		if err := c.client.ServiceDeregister(id); err != nil {
			if isOldNomadService(id) {
				// Don't hard-fail on old entries. See #3620
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		sdereg++
		metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
	}

	// Add Nomad services missing from Consul
	for id, locals := range c.services {
		if _, ok := consulServices[id]; !ok {
			if err = c.client.ServiceRegister(locals); err != nil {
				metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
				return err
			}
			sreg++
			metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
		}
	}

	// Remove Nomad checks in Consul but unknown locally
	for id, check := range consulChecks {
		if _, ok := c.checks[id]; ok {
			// Known check, leave it
			continue
		}

		// Ignore if this is not a Nomad managed check. Also ignore
		// Nomad managed checks if this is not a client agent.
		// This is to prevent server agents from removing checks
		// registered by client agents
		if !isNomadService(check.ServiceID) || !c.isClientAgent {
			// Service not managed by Nomad, skip
			continue
		}

		// Unknown Nomad managed check; remove
		if err := c.client.CheckDeregister(id); err != nil {
			if isOldNomadService(check.ServiceID) {
				// Don't hard-fail on old entries.
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		cdereg++
		metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
	}

	// Add Nomad checks missing from Consul
	for id, check := range c.checks {
		if _, ok := consulChecks[id]; ok {
			// Already in Consul; skipping
			continue
		}

		if err := c.client.CheckRegister(check); err != nil {
			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		creg++
		metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)

		// Handle starting scripts
		if script, ok := c.scripts[id]; ok {
			// If it's already running, cancel and replace
			if oldScript, running := c.runningScripts[id]; running {
				oldScript.cancel()
			}
			// Start and store the handle
			c.runningScripts[id] = script.run()
		}
	}

	// Only log if something was actually synced
	if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 {
		c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg,
			"registered_checks", creg, "deregistered_checks", cdereg)
	}
	return nil
}
   543  
   544  // RegisterAgent registers Nomad agents (client or server). The
   545  // Service.PortLabel should be a literal port to be parsed with SplitHostPort.
   546  // Script checks are not supported and will return an error. Registration is
   547  // asynchronous.
   548  //
   549  // Agents will be deregistered when Shutdown is called.
   550  func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
   551  	ops := operations{}
   552  
   553  	for _, service := range services {
   554  		id := makeAgentServiceID(role, service)
   555  
   556  		// Unlike tasks, agents don't use port labels. Agent ports are
   557  		// stored directly in the PortLabel.
   558  		host, rawport, err := net.SplitHostPort(service.PortLabel)
   559  		if err != nil {
   560  			return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
   561  		}
   562  		port, err := strconv.Atoi(rawport)
   563  		if err != nil {
   564  			return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
   565  		}
   566  		serviceReg := &api.AgentServiceRegistration{
   567  			ID:      id,
   568  			Name:    service.Name,
   569  			Tags:    service.Tags,
   570  			Address: host,
   571  			Port:    port,
   572  			// This enables the consul UI to show that Nomad registered this service
   573  			Meta: map[string]string{
   574  				"external-source": "nomad",
   575  			},
   576  		}
   577  		ops.regServices = append(ops.regServices, serviceReg)
   578  
   579  		for _, check := range service.Checks {
   580  			checkID := makeCheckID(id, check)
   581  			if check.Type == structs.ServiceCheckScript {
   582  				return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
   583  			}
   584  			checkHost, checkPort := serviceReg.Address, serviceReg.Port
   585  			if check.PortLabel != "" {
   586  				// Unlike tasks, agents don't use port labels. Agent ports are
   587  				// stored directly in the PortLabel.
   588  				host, rawport, err := net.SplitHostPort(check.PortLabel)
   589  				if err != nil {
   590  					return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err)
   591  				}
   592  				port, err := strconv.Atoi(rawport)
   593  				if err != nil {
   594  					return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err)
   595  				}
   596  				checkHost, checkPort = host, port
   597  			}
   598  			checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort)
   599  			if err != nil {
   600  				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
   601  			}
   602  			ops.regChecks = append(ops.regChecks, checkReg)
   603  		}
   604  	}
   605  
   606  	// Don't bother committing agent checks if we're already shutting down
   607  	c.agentLock.Lock()
   608  	defer c.agentLock.Unlock()
   609  	select {
   610  	case <-c.shutdownCh:
   611  		return nil
   612  	default:
   613  	}
   614  
   615  	// Now add them to the registration queue
   616  	c.commit(&ops)
   617  
   618  	// Record IDs for deregistering on shutdown
   619  	for _, id := range ops.regServices {
   620  		c.agentServices[id.ID] = struct{}{}
   621  	}
   622  	for _, id := range ops.regChecks {
   623  		c.agentChecks[id.ID] = struct{}{}
   624  	}
   625  	return nil
   626  }
   627  
   628  // serviceRegs creates service registrations, check registrations, and script
   629  // checks from a service. It returns a service registration object with the
   630  // service and check IDs populated.
   631  func (c *ServiceClient) serviceRegs(ops *operations, service *structs.Service, task *TaskServices) (
   632  	*ServiceRegistration, error) {
   633  
   634  	// Get the services ID
   635  	id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
   636  	sreg := &ServiceRegistration{
   637  		serviceID: id,
   638  		checkIDs:  make(map[string]struct{}, len(service.Checks)),
   639  	}
   640  
   641  	// Service address modes default to auto
   642  	addrMode := service.AddressMode
   643  	if addrMode == "" {
   644  		addrMode = structs.AddressModeAuto
   645  	}
   646  
   647  	// Determine the address to advertise based on the mode
   648  	ip, port, err := getAddress(addrMode, service.PortLabel, task.Networks, task.DriverNetwork)
   649  	if err != nil {
   650  		return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err)
   651  	}
   652  
   653  	// Determine whether to use tags or canary_tags
   654  	var tags []string
   655  	if task.Canary && len(service.CanaryTags) > 0 {
   656  		tags = make([]string, len(service.CanaryTags))
   657  		copy(tags, service.CanaryTags)
   658  	} else {
   659  		tags = make([]string, len(service.Tags))
   660  		copy(tags, service.Tags)
   661  	}
   662  
   663  	// Build the Consul Service registration request
   664  	serviceReg := &api.AgentServiceRegistration{
   665  		ID:      id,
   666  		Name:    service.Name,
   667  		Tags:    tags,
   668  		Address: ip,
   669  		Port:    port,
   670  		// This enables the consul UI to show that Nomad registered this service
   671  		Meta: map[string]string{
   672  			"external-source": "nomad",
   673  		},
   674  	}
   675  	ops.regServices = append(ops.regServices, serviceReg)
   676  
   677  	// Build the check registrations
   678  	checkIDs, err := c.checkRegs(ops, id, service, task)
   679  	if err != nil {
   680  		return nil, err
   681  	}
   682  	for _, cid := range checkIDs {
   683  		sreg.checkIDs[cid] = struct{}{}
   684  	}
   685  	return sreg, nil
   686  }
   687  
   688  // checkRegs registers the checks for the given service and returns the
   689  // registered check ids.
   690  func (c *ServiceClient) checkRegs(ops *operations, serviceID string, service *structs.Service,
   691  	task *TaskServices) ([]string, error) {
   692  
   693  	// Fast path
   694  	numChecks := len(service.Checks)
   695  	if numChecks == 0 {
   696  		return nil, nil
   697  	}
   698  
   699  	checkIDs := make([]string, 0, numChecks)
   700  	for _, check := range service.Checks {
   701  		checkID := makeCheckID(serviceID, check)
   702  		checkIDs = append(checkIDs, checkID)
   703  		if check.Type == structs.ServiceCheckScript {
   704  			if task.DriverExec == nil {
   705  				return nil, fmt.Errorf("driver doesn't support script checks")
   706  			}
   707  
   708  			sc := newScriptCheck(task.AllocID, task.Name, checkID, check, task.DriverExec,
   709  				c.client, c.logger, c.shutdownCh)
   710  			ops.scripts = append(ops.scripts, sc)
   711  
   712  			// Skip getAddress for script checks
   713  			checkReg, err := createCheckReg(serviceID, checkID, check, "", 0)
   714  			if err != nil {
   715  				return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err)
   716  			}
   717  			ops.regChecks = append(ops.regChecks, checkReg)
   718  			continue
   719  		}
   720  
   721  		// Default to the service's port but allow check to override
   722  		portLabel := check.PortLabel
   723  		if portLabel == "" {
   724  			// Default to the service's port label
   725  			portLabel = service.PortLabel
   726  		}
   727  
   728  		// Checks address mode defaults to host for pre-#3380 backward compat
   729  		addrMode := check.AddressMode
   730  		if addrMode == "" {
   731  			addrMode = structs.AddressModeHost
   732  		}
   733  
   734  		ip, port, err := getAddress(addrMode, portLabel, task.Networks, task.DriverNetwork)
   735  		if err != nil {
   736  			return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err)
   737  		}
   738  
   739  		checkReg, err := createCheckReg(serviceID, checkID, check, ip, port)
   740  		if err != nil {
   741  			return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err)
   742  		}
   743  		ops.regChecks = append(ops.regChecks, checkReg)
   744  	}
   745  	return checkIDs, nil
   746  }
   747  
   748  // RegisterTask with Consul. Adds all service entries and checks to Consul. If
   749  // exec is nil and a script check exists an error is returned.
   750  //
   751  // If the service IP is set it used as the address in the service registration.
   752  // Checks will always use the IP from the Task struct (host's IP).
   753  //
   754  // Actual communication with Consul is done asynchronously (see Run).
   755  func (c *ServiceClient) RegisterTask(task *TaskServices) error {
   756  	// Fast path
   757  	numServices := len(task.Services)
   758  	if numServices == 0 {
   759  		return nil
   760  	}
   761  
   762  	t := new(TaskRegistration)
   763  	t.Services = make(map[string]*ServiceRegistration, numServices)
   764  
   765  	ops := &operations{}
   766  	for _, service := range task.Services {
   767  		sreg, err := c.serviceRegs(ops, service, task)
   768  		if err != nil {
   769  			return err
   770  		}
   771  		t.Services[sreg.serviceID] = sreg
   772  	}
   773  
   774  	// Add the task to the allocation's registration
   775  	c.addTaskRegistration(task.AllocID, task.Name, t)
   776  
   777  	c.commit(ops)
   778  
   779  	// Start watching checks. Done after service registrations are built
   780  	// since an error building them could leak watches.
   781  	for _, service := range task.Services {
   782  		serviceID := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
   783  		for _, check := range service.Checks {
   784  			if check.TriggersRestarts() {
   785  				checkID := makeCheckID(serviceID, check)
   786  				c.checkWatcher.Watch(task.AllocID, task.Name, checkID, check, task.Restarter)
   787  			}
   788  		}
   789  	}
   790  	return nil
   791  }
   792  
   793  // UpdateTask in Consul. Does not alter the service if only checks have
   794  // changed.
   795  //
   796  // DriverNetwork must not change between invocations for the same allocation.
   797  func (c *ServiceClient) UpdateTask(old, newTask *TaskServices) error {
   798  	ops := &operations{}
   799  
   800  	taskReg := new(TaskRegistration)
   801  	taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services))
   802  
   803  	existingIDs := make(map[string]*structs.Service, len(old.Services))
   804  	for _, s := range old.Services {
   805  		existingIDs[makeTaskServiceID(old.AllocID, old.Name, s, old.Canary)] = s
   806  	}
   807  	newIDs := make(map[string]*structs.Service, len(newTask.Services))
   808  	for _, s := range newTask.Services {
   809  		newIDs[makeTaskServiceID(newTask.AllocID, newTask.Name, s, newTask.Canary)] = s
   810  	}
   811  
   812  	// Loop over existing Service IDs to see if they have been removed or
   813  	// updated.
   814  	for existingID, existingSvc := range existingIDs {
   815  		newSvc, ok := newIDs[existingID]
   816  		if !ok {
   817  			// Existing service entry removed
   818  			ops.deregServices = append(ops.deregServices, existingID)
   819  			for _, check := range existingSvc.Checks {
   820  				cid := makeCheckID(existingID, check)
   821  				ops.deregChecks = append(ops.deregChecks, cid)
   822  
   823  				// Unwatch watched checks
   824  				if check.TriggersRestarts() {
   825  					c.checkWatcher.Unwatch(cid)
   826  				}
   827  			}
   828  			continue
   829  		}
   830  
   831  		// Service exists and hasn't changed, don't re-add it later
   832  		delete(newIDs, existingID)
   833  
   834  		// Service still exists so add it to the task's registration
   835  		sreg := &ServiceRegistration{
   836  			serviceID: existingID,
   837  			checkIDs:  make(map[string]struct{}, len(newSvc.Checks)),
   838  		}
   839  		taskReg.Services[existingID] = sreg
   840  
   841  		// See if any checks were updated
   842  		existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
   843  		for _, check := range existingSvc.Checks {
   844  			existingChecks[makeCheckID(existingID, check)] = check
   845  		}
   846  
   847  		// Register new checks
   848  		for _, check := range newSvc.Checks {
   849  			checkID := makeCheckID(existingID, check)
   850  			if _, exists := existingChecks[checkID]; exists {
   851  				// Check exists, so don't remove it
   852  				delete(existingChecks, checkID)
   853  				sreg.checkIDs[checkID] = struct{}{}
   854  			}
   855  
   856  			// New check on an unchanged service; add them now
   857  			newCheckIDs, err := c.checkRegs(ops, existingID, newSvc, newTask)
   858  			if err != nil {
   859  				return err
   860  			}
   861  
   862  			for _, checkID := range newCheckIDs {
   863  				sreg.checkIDs[checkID] = struct{}{}
   864  
   865  			}
   866  
   867  			// Update all watched checks as CheckRestart fields aren't part of ID
   868  			if check.TriggersRestarts() {
   869  				c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter)
   870  			}
   871  		}
   872  
   873  		// Remove existing checks not in updated service
   874  		for cid, check := range existingChecks {
   875  			ops.deregChecks = append(ops.deregChecks, cid)
   876  
   877  			// Unwatch checks
   878  			if check.TriggersRestarts() {
   879  				c.checkWatcher.Unwatch(cid)
   880  			}
   881  		}
   882  	}
   883  
   884  	// Any remaining services should just be enqueued directly
   885  	for _, newSvc := range newIDs {
   886  		sreg, err := c.serviceRegs(ops, newSvc, newTask)
   887  		if err != nil {
   888  			return err
   889  		}
   890  
   891  		taskReg.Services[sreg.serviceID] = sreg
   892  	}
   893  
   894  	// Add the task to the allocation's registration
   895  	c.addTaskRegistration(newTask.AllocID, newTask.Name, taskReg)
   896  
   897  	c.commit(ops)
   898  
   899  	// Start watching checks. Done after service registrations are built
   900  	// since an error building them could leak watches.
   901  	for _, service := range newIDs {
   902  		serviceID := makeTaskServiceID(newTask.AllocID, newTask.Name, service, newTask.Canary)
   903  		for _, check := range service.Checks {
   904  			if check.TriggersRestarts() {
   905  				checkID := makeCheckID(serviceID, check)
   906  				c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter)
   907  			}
   908  		}
   909  	}
   910  	return nil
   911  }
   912  
   913  // RemoveTask from Consul. Removes all service entries and checks.
   914  //
   915  // Actual communication with Consul is done asynchronously (see Run).
   916  func (c *ServiceClient) RemoveTask(task *TaskServices) {
   917  	ops := operations{}
   918  
   919  	for _, service := range task.Services {
   920  		id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
   921  		ops.deregServices = append(ops.deregServices, id)
   922  
   923  		for _, check := range service.Checks {
   924  			cid := makeCheckID(id, check)
   925  			ops.deregChecks = append(ops.deregChecks, cid)
   926  
   927  			if check.TriggersRestarts() {
   928  				c.checkWatcher.Unwatch(cid)
   929  			}
   930  		}
   931  	}
   932  
   933  	// Remove the task from the alloc's registrations
   934  	c.removeTaskRegistration(task.AllocID, task.Name)
   935  
   936  	// Now add them to the deregistration fields; main Run loop will update
   937  	c.commit(&ops)
   938  }
   939  
   940  // AllocRegistrations returns the registrations for the given allocation. If the
   941  // allocation has no reservations, the response is a nil object.
   942  func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) {
   943  	// Get the internal struct using the lock
   944  	c.allocRegistrationsLock.RLock()
   945  	regInternal, ok := c.allocRegistrations[allocID]
   946  	if !ok {
   947  		c.allocRegistrationsLock.RUnlock()
   948  		return nil, nil
   949  	}
   950  
   951  	// Copy so we don't expose internal structs
   952  	reg := regInternal.copy()
   953  	c.allocRegistrationsLock.RUnlock()
   954  
   955  	// Query the services and checks to populate the allocation registrations.
   956  	services, err := c.client.Services()
   957  	if err != nil {
   958  		return nil, err
   959  	}
   960  
   961  	checks, err := c.client.Checks()
   962  	if err != nil {
   963  		return nil, err
   964  	}
   965  
   966  	// Populate the object
   967  	for _, treg := range reg.Tasks {
   968  		for serviceID, sreg := range treg.Services {
   969  			sreg.Service = services[serviceID]
   970  			for checkID := range sreg.checkIDs {
   971  				if check, ok := checks[checkID]; ok {
   972  					sreg.Checks = append(sreg.Checks, check)
   973  				}
   974  			}
   975  		}
   976  	}
   977  
   978  	return reg, nil
   979  }
   980  
   981  // Shutdown the Consul client. Update running task registrations and deregister
   982  // agent from Consul. On first call blocks up to shutdownWait before giving up
   983  // on syncing operations.
   984  func (c *ServiceClient) Shutdown() error {
   985  	// Serialize Shutdown calls with RegisterAgent to prevent leaking agent
   986  	// entries.
   987  	c.agentLock.Lock()
   988  	defer c.agentLock.Unlock()
   989  	select {
   990  	case <-c.shutdownCh:
   991  		return nil
   992  	default:
   993  		close(c.shutdownCh)
   994  	}
   995  
   996  	// Give run loop time to sync, but don't block indefinitely
   997  	deadline := time.After(c.shutdownWait)
   998  
   999  	// Wait for Run to finish any outstanding operations and exit
  1000  	select {
  1001  	case <-c.exitCh:
  1002  	case <-deadline:
  1003  		// Don't wait forever though
  1004  	}
  1005  
  1006  	// If Consul was never seen nothing could be written so exit early
  1007  	if !c.hasSeen() {
  1008  		return nil
  1009  	}
  1010  
  1011  	// Always attempt to deregister Nomad agent Consul entries, even if
  1012  	// deadline was reached
  1013  	for id := range c.agentServices {
  1014  		if err := c.client.ServiceDeregister(id); err != nil {
  1015  			c.logger.Error("failed deregistering agent service", "service_id", id, "error", err)
  1016  		}
  1017  	}
  1018  	for id := range c.agentChecks {
  1019  		if err := c.client.CheckDeregister(id); err != nil {
  1020  			c.logger.Error("failed deregistering agent check", "check_id", id, "error", err)
  1021  		}
  1022  	}
  1023  
  1024  	// Give script checks time to exit (no need to lock as Run() has exited)
  1025  	for _, h := range c.runningScripts {
  1026  		select {
  1027  		case <-h.wait():
  1028  		case <-deadline:
  1029  			return fmt.Errorf("timed out waiting for script checks to run")
  1030  		}
  1031  	}
  1032  	return nil
  1033  }
  1034  
  1035  // addTaskRegistration adds the task registration for the given allocation.
  1036  func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) {
  1037  	c.allocRegistrationsLock.Lock()
  1038  	defer c.allocRegistrationsLock.Unlock()
  1039  
  1040  	alloc, ok := c.allocRegistrations[allocID]
  1041  	if !ok {
  1042  		alloc = &AllocRegistration{
  1043  			Tasks: make(map[string]*TaskRegistration),
  1044  		}
  1045  		c.allocRegistrations[allocID] = alloc
  1046  	}
  1047  	alloc.Tasks[taskName] = reg
  1048  }
  1049  
  1050  // removeTaskRegistration removes the task registration for the given allocation.
  1051  func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) {
  1052  	c.allocRegistrationsLock.Lock()
  1053  	defer c.allocRegistrationsLock.Unlock()
  1054  
  1055  	alloc, ok := c.allocRegistrations[allocID]
  1056  	if !ok {
  1057  		return
  1058  	}
  1059  
  1060  	// Delete the task and if it is the last one also delete the alloc's
  1061  	// registration
  1062  	delete(alloc.Tasks, taskName)
  1063  	if len(alloc.Tasks) == 0 {
  1064  		delete(c.allocRegistrations, allocID)
  1065  	}
  1066  }
  1067  
  1068  // makeAgentServiceID creates a unique ID for identifying an agent service in
  1069  // Consul.
  1070  //
  1071  // Agent service IDs are of the form:
  1072  //
  1073  //	{nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...})
  1074  //	Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4
  1075  //	Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l
  1076  //
  1077  func makeAgentServiceID(role string, service *structs.Service) string {
  1078  	return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "", false))
  1079  }
  1080  
  1081  // makeTaskServiceID creates a unique ID for identifying a task service in
  1082  // Consul. All structs.Service fields are included in the ID's hash except
  1083  // Checks. This allows updates to merely compare IDs.
  1084  //
  1085  //	Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH
  1086  func makeTaskServiceID(allocID, taskName string, service *structs.Service, canary bool) string {
  1087  	return nomadTaskPrefix + service.Hash(allocID, taskName, canary)
  1088  }
  1089  
  1090  // makeCheckID creates a unique ID for a check.
  1091  func makeCheckID(serviceID string, check *structs.ServiceCheck) string {
  1092  	return check.Hash(serviceID)
  1093  }
  1094  
  1095  // createCheckReg creates a Check that can be registered with Consul.
  1096  //
  1097  // Script checks simply have a TTL set and the caller is responsible for
  1098  // running the script and heartbeating.
  1099  func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) {
  1100  	chkReg := api.AgentCheckRegistration{
  1101  		ID:        checkID,
  1102  		Name:      check.Name,
  1103  		ServiceID: serviceID,
  1104  	}
  1105  	chkReg.Status = check.InitialStatus
  1106  	chkReg.Timeout = check.Timeout.String()
  1107  	chkReg.Interval = check.Interval.String()
  1108  
  1109  	// Require an address for http or tcp checks
  1110  	if port == 0 && check.RequiresPort() {
  1111  		return nil, fmt.Errorf("%s checks require an address", check.Type)
  1112  	}
  1113  
  1114  	switch check.Type {
  1115  	case structs.ServiceCheckHTTP:
  1116  		proto := check.Protocol
  1117  		if proto == "" {
  1118  			proto = "http"
  1119  		}
  1120  		if check.TLSSkipVerify {
  1121  			chkReg.TLSSkipVerify = true
  1122  		}
  1123  		base := url.URL{
  1124  			Scheme: proto,
  1125  			Host:   net.JoinHostPort(host, strconv.Itoa(port)),
  1126  		}
  1127  		relative, err := url.Parse(check.Path)
  1128  		if err != nil {
  1129  			return nil, err
  1130  		}
  1131  		url := base.ResolveReference(relative)
  1132  		chkReg.HTTP = url.String()
  1133  		chkReg.Method = check.Method
  1134  		chkReg.Header = check.Header
  1135  
  1136  	case structs.ServiceCheckTCP:
  1137  		chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))
  1138  
  1139  	case structs.ServiceCheckScript:
  1140  		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
  1141  		// As of Consul 1.0.0 setting TTL and Interval is a 400
  1142  		chkReg.Interval = ""
  1143  
  1144  	case structs.ServiceCheckGRPC:
  1145  		chkReg.GRPC = fmt.Sprintf("%s/%s", net.JoinHostPort(host, strconv.Itoa(port)), check.GRPCService)
  1146  		chkReg.GRPCUseTLS = check.GRPCUseTLS
  1147  		if check.TLSSkipVerify {
  1148  			chkReg.TLSSkipVerify = true
  1149  		}
  1150  
  1151  	default:
  1152  		return nil, fmt.Errorf("check type %+q not valid", check.Type)
  1153  	}
  1154  	return &chkReg, nil
  1155  }
  1156  
  1157  // isNomadService returns true if the ID matches the pattern of a Nomad managed
  1158  // service (new or old formats). Agent services return false as independent
  1159  // client and server agents may be running on the same machine. #2827
  1160  func isNomadService(id string) bool {
  1161  	return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
  1162  }
  1163  
  1164  // isOldNomadService returns true if the ID matches an old pattern managed by
  1165  // Nomad.
  1166  //
  1167  // Pre-0.7.1 task service IDs are of the form:
  1168  //
  1169  //	{nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
  1170  //	Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
  1171  //
  1172  func isOldNomadService(id string) bool {
  1173  	const prefix = nomadServicePrefix + "-executor"
  1174  	return strings.HasPrefix(id, prefix)
  1175  }
  1176  
  1177  // getAddress returns the IP and port to use for a service or check. If no port
  1178  // label is specified (an empty value), zero values are returned because no
  1179  // address could be resolved.
  1180  func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *drivers.DriverNetwork) (string, int, error) {
  1181  	switch addrMode {
  1182  	case structs.AddressModeAuto:
  1183  		if driverNet.Advertise() {
  1184  			addrMode = structs.AddressModeDriver
  1185  		} else {
  1186  			addrMode = structs.AddressModeHost
  1187  		}
  1188  		return getAddress(addrMode, portLabel, networks, driverNet)
  1189  	case structs.AddressModeHost:
  1190  		if portLabel == "" {
  1191  			if len(networks) != 1 {
  1192  				// If no networks are specified return zero
  1193  				// values. Consul will advertise the host IP
  1194  				// with no port. This is the pre-0.7.1 behavior
  1195  				// some people rely on.
  1196  				return "", 0, nil
  1197  			}
  1198  
  1199  			return networks[0].IP, 0, nil
  1200  		}
  1201  
  1202  		// Default path: use host ip:port
  1203  		ip, port := networks.Port(portLabel)
  1204  		if ip == "" && port <= 0 {
  1205  			return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel)
  1206  		}
  1207  		return ip, port, nil
  1208  
  1209  	case structs.AddressModeDriver:
  1210  		// Require a driver network if driver address mode is used
  1211  		if driverNet == nil {
  1212  			return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`)
  1213  		}
  1214  
  1215  		// If no port label is specified just return the IP
  1216  		if portLabel == "" {
  1217  			return driverNet.IP, 0, nil
  1218  		}
  1219  
  1220  		// If the port is a label, use the driver's port (not the host's)
  1221  		if port, ok := driverNet.PortMap[portLabel]; ok {
  1222  			return driverNet.IP, port, nil
  1223  		}
  1224  
  1225  		// If port isn't a label, try to parse it as a literal port number
  1226  		port, err := strconv.Atoi(portLabel)
  1227  		if err != nil {
  1228  			// Don't include Atoi error message as user likely
  1229  			// never intended it to be a numeric and it creates a
  1230  			// confusing error message
  1231  			return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel)
  1232  		}
  1233  		if port <= 0 {
  1234  			return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
  1235  		}
  1236  
  1237  		return driverNet.IP, port, nil
  1238  
  1239  	default:
  1240  		// Shouldn't happen due to validation, but enforce invariants
  1241  		return "", 0, fmt.Errorf("invalid address mode %q", addrMode)
  1242  	}
  1243  }