github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/command/agent/consul/service_client.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"net"
     7  	"net/url"
     8  	"reflect"
     9  	"strconv"
    10  	"strings"
    11  	"sync"
    12  	"sync/atomic"
    13  	"time"
    14  
    15  	metrics "github.com/armon/go-metrics"
    16  	log "github.com/hashicorp/go-hclog"
    17  
    18  	"github.com/hashicorp/consul/api"
    19  	"github.com/hashicorp/nomad/helper"
    20  	"github.com/hashicorp/nomad/nomad/structs"
    21  	"github.com/hashicorp/nomad/plugins/drivers"
    22  )
    23  
const (
	// nomadServicePrefix is the prefix that scopes all Nomad registered
	// services (both agent and task entries).
	nomadServicePrefix = "_nomad"

	// nomadTaskPrefix is the prefix that scopes Nomad registered services
	// for tasks.
	nomadTaskPrefix = nomadServicePrefix + "-task-"

	// nomadCheckPrefix is the prefix that scopes Nomad registered checks for
	// services.
	nomadCheckPrefix = nomadServicePrefix + "-check-"

	// defaultRetryInterval is how quickly to retry syncing services and
	// checks to Consul when an error occurs. The retry delay backs off up to
	// a maximum (see defaultMaxRetryInterval).
	defaultRetryInterval = time.Second

	// defaultMaxRetryInterval is the default max retry interval.
	defaultMaxRetryInterval = 30 * time.Second

	// defaultPeriodicInterval is the interval at which the service
	// client reconciles state between the desired services and checks and
	// what's actually registered in Consul. This is done at an interval,
	// rather than being purely edge triggered, to handle the case that the
	// Consul agent's state may change underneath us.
	defaultPeriodicInterval = 30 * time.Second

	// ttlCheckBuffer is the time interval that Nomad can take to report the
	// check result to Consul.
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services.
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services.
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services.
	ServiceTagSerf = "serf"

	// deregisterProbationPeriod is the initialization period during which
	// services registered in Consul but not known to Nomad do not get
	// deregistered, to allow Nomad time to restore tasks.
	deregisterProbationPeriod = time.Minute
)
    77  
    78  // Additional Consul ACLs required
    79  // - Consul Template: key:read
    80  //   Used in tasks with template stanza that use Consul keys.
    81  
// CatalogAPI is the consul/api.Catalog API used by Nomad.
//
// ACL requirements
// - node:read (listing datacenters)
// - service:read
type CatalogAPI interface {
	// Datacenters lists datacenters (requires node:read, per the ACL notes
	// above).
	Datacenters() ([]string, error)

	// Service queries the catalog for a service (requires service:read).
	// NOTE(review): presumably looks up entries by service name, optionally
	// filtered by tag; confirm against consul/api.Catalog.Service.
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}
    91  
// AgentAPI is the consul/api.Agent API used by Nomad.
//
// The ServiceClient sync loop in this file reads the agent's current state via
// Services and Checks, and reconciles it using the ServiceRegister,
// ServiceDeregister, CheckRegister and CheckDeregister methods.
//
// ACL requirements
// - agent:read
// - service:write
type AgentAPI interface {
	Services() (map[string]*api.AgentService, error)
	Checks() (map[string]*api.AgentCheck, error)
	CheckRegister(check *api.AgentCheckRegistration) error
	CheckDeregister(checkID string) error
	Self() (map[string]map[string]interface{}, error)
	ServiceRegister(service *api.AgentServiceRegistration) error
	ServiceDeregister(serviceID string) error
	UpdateTTL(id, output, status string) error
}
   107  
// ConfigAPI is the consul/api.ConfigEntries API subset used by Nomad Server.
//
// ACL requirements
// - operator:write (server only)
type ConfigAPI interface {
	Set(entry api.ConfigEntry, w *api.WriteOptions) (bool, *api.WriteMeta, error)
	// Delete is deliberately left out of this interface because it is not used:
	// Delete(kind, name string, w *api.WriteOptions) (*api.WriteMeta, error)
}
   116  
// ACLsAPI is the consul/api.ACL API subset used by Nomad Server.
//
// ACL requirements
// - acl:write (server only)
type ACLsAPI interface {
	// TokenReadSelf is used because tokens are looked up by their [operator
	// token] SecretID. The normal TokenRead cannot be used for this, as it can
	// only look up tokens by their AccessorID.
	TokenReadSelf(q *api.QueryOptions) (*api.ACLToken, *api.QueryMeta, error)
	PolicyRead(policyID string, q *api.QueryOptions) (*api.ACLPolicy, *api.QueryMeta, error)
	RoleRead(roleID string, q *api.QueryOptions) (*api.ACLRole, *api.QueryMeta, error)
	TokenCreate(partial *api.ACLToken, q *api.WriteOptions) (*api.ACLToken, *api.WriteMeta, error)
	TokenDelete(accessorID string, q *api.WriteOptions) (*api.WriteMeta, error)
	TokenList(q *api.QueryOptions) ([]*api.ACLTokenListEntry, *api.QueryMeta, error)
}
   132  
   133  // agentServiceUpdateRequired checks if any critical fields in Nomad's version
   134  // of a service definition are different from the existing service definition as
   135  // known by Consul.
   136  //
   137  //  reason - The syncReason that triggered this synchronization with the consul
   138  //           agent API.
   139  //  wanted - Nomad's view of what the service definition is intended to be.
   140  //           Not nil.
   141  //  existing - Consul's view (agent, not catalog) of the actual service definition.
   142  //           Not nil.
   143  //  sidecar - Consul's view (agent, not catalog) of the service definition of the sidecar
   144  //           associated with existing that may or may not exist.
   145  //           May be nil.
   146  func agentServiceUpdateRequired(reason syncReason, wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) bool {
   147  	switch reason {
   148  	case syncPeriodic:
   149  		// In a periodic sync with Consul, we need to respect the value of
   150  		// the enable_tag_override field so that we maintain the illusion that the
   151  		// user is in control of the Consul tags, as they may be externally edited
   152  		// via the Consul catalog API (e.g. a user manually sets them).
   153  		//
   154  		// As Consul does by disabling anti-entropy for the tags field, Nomad will
   155  		// ignore differences in the tags field during the periodic syncs with
   156  		// the Consul agent API.
   157  		//
   158  		// We do so by over-writing the nomad service registration by the value
   159  		// of the tags that Consul contains, if enable_tag_override = true.
   160  		maybeTweakTags(wanted, existing, sidecar)
   161  		return different(wanted, existing, sidecar)
   162  
   163  	default:
   164  		// A non-periodic sync with Consul indicates an operation has been set
   165  		// on the queue. This happens when service has been added / removed / modified
   166  		// and implies the Consul agent should be sync'd with nomad, because
   167  		// nomad is the ultimate source of truth for the service definition.
   168  		return different(wanted, existing, sidecar)
   169  	}
   170  }
   171  
   172  // maybeTweakTags will override wanted.Tags with a copy of existing.Tags only if
   173  // EnableTagOverride is true. Otherwise the wanted service registration is left
   174  // unchanged.
   175  func maybeTweakTags(wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) {
   176  	if wanted.EnableTagOverride {
   177  		wanted.Tags = helper.CopySliceString(existing.Tags)
   178  		// If the service registration also defines a sidecar service, use the ETO
   179  		// setting for the parent service to also apply to the sidecar.
   180  		if wanted.Connect != nil && wanted.Connect.SidecarService != nil {
   181  			if sidecar != nil {
   182  				wanted.Connect.SidecarService.Tags = helper.CopySliceString(sidecar.Tags)
   183  			}
   184  		}
   185  	}
   186  }
   187  
   188  // different compares the wanted state of the service registration with the actual
   189  // (cached) state of the service registration reported by Consul. If any of the
   190  // critical fields are not deeply equal, they considered different.
   191  func different(wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) bool {
   192  	switch {
   193  	case wanted.Kind != existing.Kind:
   194  		return true
   195  	case wanted.ID != existing.ID:
   196  		return true
   197  	case wanted.Port != existing.Port:
   198  		return true
   199  	case wanted.Address != existing.Address:
   200  		return true
   201  	case wanted.Name != existing.Service:
   202  		return true
   203  	case wanted.EnableTagOverride != existing.EnableTagOverride:
   204  		return true
   205  	case !reflect.DeepEqual(wanted.Meta, existing.Meta):
   206  		return true
   207  	case tagsDifferent(wanted.Tags, existing.Tags):
   208  		return true
   209  	case connectSidecarDifferent(wanted, sidecar):
   210  		return true
   211  	}
   212  	return false
   213  }
   214  
   215  func tagsDifferent(a, b []string) bool {
   216  	if len(a) != len(b) {
   217  		return true
   218  	}
   219  
   220  	for i, valueA := range a {
   221  		if b[i] != valueA {
   222  			return true
   223  		}
   224  	}
   225  
   226  	return false
   227  }
   228  
   229  // sidecarTagsDifferent includes the special logic for comparing sidecar tags
   230  // from Nomad vs. Consul perspective. Because Consul forces the sidecar tags
   231  // to inherit the parent service tags if the sidecar tags are unset, we need to
   232  // take that into consideration when Nomad's sidecar tags are unset by instead
   233  // comparing them to the parent service tags.
   234  func sidecarTagsDifferent(parent, wanted, sidecar []string) bool {
   235  	if len(wanted) == 0 {
   236  		return tagsDifferent(parent, sidecar)
   237  	}
   238  	return tagsDifferent(wanted, sidecar)
   239  }
   240  
   241  // connectSidecarDifferent returns true if Nomad expects there to be a sidecar
   242  // hanging off the desired parent service definition on the Consul side, and does
   243  // not match with what Consul has.
   244  func connectSidecarDifferent(wanted *api.AgentServiceRegistration, sidecar *api.AgentService) bool {
   245  	if wanted.Connect != nil && wanted.Connect.SidecarService != nil {
   246  		if sidecar == nil {
   247  			// consul lost our sidecar (?)
   248  			return true
   249  		}
   250  
   251  		if sidecarTagsDifferent(wanted.Tags, wanted.Connect.SidecarService.Tags, sidecar.Tags) {
   252  			// tags on the nomad definition have been modified
   253  			return true
   254  		}
   255  	}
   256  
   257  	// Either Nomad does not expect there to be a sidecar_service, or there is
   258  	// no actionable difference from the Consul sidecar_service definition.
   259  	return false
   260  }
   261  
// operations are submitted to the main loop via commit() for synchronizing
// with Consul. The main loop folds them into the desired state with merge().
type operations struct {
	// regServices and regChecks are registrations to add to (or replace in)
	// the desired state, keyed by their ID.
	regServices   []*api.AgentServiceRegistration
	regChecks     []*api.AgentCheckRegistration
	// deregServices and deregChecks are IDs to remove from the desired state;
	// merge() also records them as explicitly deregistered.
	deregServices []string
	deregChecks   []string
}
   270  
// AllocRegistration holds the status of the services registered for a
// particular allocation, grouped by task.
type AllocRegistration struct {
	// Tasks maps the name of a task to its registered services and checks
	Tasks map[string]*ServiceRegistrations
}
   277  
   278  func (a *AllocRegistration) copy() *AllocRegistration {
   279  	c := &AllocRegistration{
   280  		Tasks: make(map[string]*ServiceRegistrations, len(a.Tasks)),
   281  	}
   282  
   283  	for k, v := range a.Tasks {
   284  		c.Tasks[k] = v.copy()
   285  	}
   286  
   287  	return c
   288  }
   289  
   290  // NumServices returns the number of registered services
   291  func (a *AllocRegistration) NumServices() int {
   292  	if a == nil {
   293  		return 0
   294  	}
   295  
   296  	total := 0
   297  	for _, treg := range a.Tasks {
   298  		for _, sreg := range treg.Services {
   299  			if sreg.Service != nil {
   300  				total++
   301  			}
   302  		}
   303  	}
   304  
   305  	return total
   306  }
   307  
   308  // NumChecks returns the number of registered checks
   309  func (a *AllocRegistration) NumChecks() int {
   310  	if a == nil {
   311  		return 0
   312  	}
   313  
   314  	total := 0
   315  	for _, treg := range a.Tasks {
   316  		for _, sreg := range treg.Services {
   317  			total += len(sreg.Checks)
   318  		}
   319  	}
   320  
   321  	return total
   322  }
   323  
// ServiceRegistrations holds the status of services registered for a particular
// task or task group.
type ServiceRegistrations struct {
	// Services holds one ServiceRegistration per registered service.
	// NOTE(review): the map is presumably keyed by Consul service ID — confirm
	// against the code that populates it.
	Services map[string]*ServiceRegistration
}
   329  
   330  func (t *ServiceRegistrations) copy() *ServiceRegistrations {
   331  	c := &ServiceRegistrations{
   332  		Services: make(map[string]*ServiceRegistration, len(t.Services)),
   333  	}
   334  
   335  	for k, v := range t.Services {
   336  		c.Services[k] = v.copy()
   337  	}
   338  
   339  	return c
   340  }
   341  
// ServiceRegistration holds the status of a registered Consul Service and its
// Checks.
type ServiceRegistration struct {
	// serviceID and checkIDs are internal fields that track just the IDs of the
	// services/checks registered in Consul. They are used to materialize the
	// other fields when queried.
	serviceID string
	checkIDs  map[string]struct{}

	// CheckOnUpdate is a map of checkIDs to the associated OnUpdate value
	// from the ServiceCheck. It is used to determine how a reported check's
	// status should be evaluated.
	CheckOnUpdate map[string]string

	// Service is the AgentService registered in Consul.
	Service *api.AgentService

	// Checks is the status of the registered checks.
	Checks []*api.AgentCheck
}
   362  
   363  func (s *ServiceRegistration) copy() *ServiceRegistration {
   364  	// Copy does not copy the external fields but only the internal fields. This
   365  	// is so that the caller of AllocRegistrations can not access the internal
   366  	// fields and that method uses these fields to populate the external fields.
   367  	return &ServiceRegistration{
   368  		serviceID:     s.serviceID,
   369  		checkIDs:      helper.CopyMapStringStruct(s.checkIDs),
   370  		CheckOnUpdate: helper.CopyMapStringString(s.CheckOnUpdate),
   371  	}
   372  }
   373  
// ServiceClient handles task and agent service registration with Consul.
type ServiceClient struct {
	// client is the Consul agent API used to query and modify registrations.
	client           AgentAPI
	logger           log.Logger
	// retryInterval is multiplied by the number of consecutive sync failures
	// to compute the retry backoff, which is capped at maxRetryInterval.
	retryInterval    time.Duration
	maxRetryInterval time.Duration
	// periodicInterval is the delay before the next sync after a successful
	// one.
	periodicInterval time.Duration

	// exitCh is closed when the main Run loop exits
	exitCh chan struct{}

	// shutdownCh is closed when the client should shutdown
	shutdownCh chan struct{}

	// shutdownWait is how long Shutdown() blocks waiting for the final
	// sync() to finish. Defaults to defaultShutdownWait
	shutdownWait time.Duration

	// opCh carries operations submitted by commit() to the Run loop.
	opCh chan *operations

	// services and checks are the desired registrations, keyed by ID. They
	// are updated by merge() and reconciled against Consul by sync().
	services map[string]*api.AgentServiceRegistration
	checks   map[string]*api.AgentCheckRegistration

	// explicitlyDeregisteredServices and explicitlyDeregisteredChecks record
	// IDs removed through operations. sync() deregisters these from Consul
	// even during the probation period; both maps are reset after each
	// successful sync.
	explicitlyDeregisteredServices map[string]bool
	explicitlyDeregisteredChecks   map[string]bool

	// allocRegistrations stores the services and checks that are registered
	// with Consul by allocation ID.
	allocRegistrations     map[string]*AllocRegistration
	allocRegistrationsLock sync.RWMutex

	// agent services and checks record entries for the agent itself which
	// should be removed on shutdown
	agentServices map[string]struct{}
	agentChecks   map[string]struct{}
	agentLock     sync.Mutex

	// seen is 1 if Consul has ever been seen; otherwise 0. Accessed with
	// atomics.
	seen int32

	// deregisterProbationExpiry is the time before which consul sync shouldn't deregister
	// unknown services.
	// Used to mitigate risk of deleting restored services upon client restart.
	deregisterProbationExpiry time.Time

	// checkWatcher restarts checks that are unhealthy.
	checkWatcher *checkWatcher

	// isClientAgent specifies whether this Consul client is being used
	// by a Nomad client.
	isClientAgent bool
}
   427  
   428  // NewServiceClient creates a new Consul ServiceClient from an existing Consul API
   429  // Client, logger and takes whether the client is being used by a Nomad Client agent.
   430  // When being used by a Nomad client, this Consul client reconciles all services and
   431  // checks created by Nomad on behalf of running tasks.
   432  func NewServiceClient(consulClient AgentAPI, logger log.Logger, isNomadClient bool) *ServiceClient {
   433  	logger = logger.ResetNamed("consul.sync")
   434  	return &ServiceClient{
   435  		client:                         consulClient,
   436  		logger:                         logger,
   437  		retryInterval:                  defaultRetryInterval,
   438  		maxRetryInterval:               defaultMaxRetryInterval,
   439  		periodicInterval:               defaultPeriodicInterval,
   440  		exitCh:                         make(chan struct{}),
   441  		shutdownCh:                     make(chan struct{}),
   442  		shutdownWait:                   defaultShutdownWait,
   443  		opCh:                           make(chan *operations, 8),
   444  		services:                       make(map[string]*api.AgentServiceRegistration),
   445  		checks:                         make(map[string]*api.AgentCheckRegistration),
   446  		explicitlyDeregisteredServices: make(map[string]bool),
   447  		explicitlyDeregisteredChecks:   make(map[string]bool),
   448  		allocRegistrations:             make(map[string]*AllocRegistration),
   449  		agentServices:                  make(map[string]struct{}),
   450  		agentChecks:                    make(map[string]struct{}),
   451  		checkWatcher:                   newCheckWatcher(logger, consulClient),
   452  		isClientAgent:                  isNomadClient,
   453  		deregisterProbationExpiry:      time.Now().Add(deregisterProbationPeriod),
   454  	}
   455  }
   456  
// seen is the value stored in ServiceClient.seen once Consul has been
// contacted. It is written by markSeen and compared against by hasSeen.
const seen = 1
   459  
// markSeen marks Consul as having been seen (meaning at least one operation
// has succeeded). The flag is written atomically.
func (c *ServiceClient) markSeen() {
	atomic.StoreInt32(&c.seen, seen)
}
   465  
   466  // hasSeen returns true if any Consul operation has ever succeeded. Useful to
   467  // squelch errors if Consul isn't running.
   468  func (c *ServiceClient) hasSeen() bool {
   469  	return atomic.LoadInt32(&c.seen) == seen
   470  }
   471  
   472  // syncReason indicates why a sync operation with consul is about to happen.
   473  //
   474  // The trigger for a sync may have implications on the behavior of the sync itself.
   475  // In particular if a service is defined with enable_tag_override=true, the sync
   476  // should ignore changes to the service's Tags field.
   477  type syncReason byte
   478  
   479  const (
   480  	syncPeriodic = iota
   481  	syncShutdown
   482  	syncNewOps
   483  )
   484  
// Run the Consul main loop which retries operations against Consul. It should
// be called exactly once.
//
// Run first waits for initial contact with Consul, merging (but not syncing)
// any operations that arrive in the meantime. It then syncs immediately and
// afterwards whenever the timer fires, new operations arrive, or shutdown is
// requested. A failed sync is retried with a linear backoff (retryInterval
// multiplied by the number of consecutive failures, capped at
// maxRetryInterval); a successful sync schedules the next one after
// periodicInterval. exitCh is closed when Run returns.
func (c *ServiceClient) Run() {
	defer close(c.exitCh)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// init will be closed when Consul has been contacted
	init := make(chan struct{})
	go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init)

	// Process operations while waiting for initial contact with Consul but
	// do not sync until contact has been made.
INIT:
	for {
		select {
		case <-init:
			c.markSeen()
			break INIT
		case <-c.shutdownCh:
			// Shutdown was requested before Consul was ever contacted; exit
			// without syncing.
			return
		case ops := <-c.opCh:
			c.merge(ops)
		}
	}
	c.logger.Trace("able to contact Consul")

	// Contact with Consul has been established, so start the checkWatcher.
	go c.checkWatcher.Run(ctx)

	// Always immediately sync to reconcile Nomad and Consul's state
	retryTimer := time.NewTimer(0)

	// failures counts consecutive failed syncs; it is reset on success.
	failures := 0
	for {
		// On every iteration take note of what the trigger for the next sync
		// was, so that it may be referenced during the sync itself.
		var reasonForSync syncReason

		select {
		case <-retryTimer.C:
			// The timer fires both for periodic reconciliation and for
			// retries after a failed sync.
			reasonForSync = syncPeriodic
		case <-c.shutdownCh:
			reasonForSync = syncShutdown
			// Cancel check watcher but sync one last time
			cancel()
		case ops := <-c.opCh:
			reasonForSync = syncNewOps
			c.merge(ops)
		}

		if err := c.sync(reasonForSync); err != nil {
			if failures == 0 {
				// Log on the first failure
				c.logger.Warn("failed to update services in Consul", "error", err)
			} else if failures%10 == 0 {
				// Log every 10th consecutive failure
				c.logger.Error("still unable to update services in Consul", "failures", failures, "error", err)
			}

			failures++
			if !retryTimer.Stop() {
				// Timer already expired, since the timer may
				// or may not have been read in the select{}
				// above, conditionally receive on it
				select {
				case <-retryTimer.C:
				default:
				}
			}
			// Linear backoff, capped at the maximum retry interval.
			backoff := c.retryInterval * time.Duration(failures)
			if backoff > c.maxRetryInterval {
				backoff = c.maxRetryInterval
			}
			retryTimer.Reset(backoff)
		} else {
			if failures > 0 {
				c.logger.Info("successfully updated services in Consul")
				failures = 0
			}

			// on successful sync, clear deregistered consul entities
			c.clearExplicitlyDeregistered()

			// Reset timer to periodic interval to periodically
			// reconcile with Consul
			if !retryTimer.Stop() {
				// Drain the channel if the timer already fired and was not
				// consumed by the select above.
				select {
				case <-retryTimer.C:
				default:
				}
			}
			retryTimer.Reset(c.periodicInterval)
		}

		select {
		case <-c.shutdownCh:
			// Exit only after sync'ing all outstanding operations
			if len(c.opCh) > 0 {
				for len(c.opCh) > 0 {
					c.merge(<-c.opCh)
				}
				continue
			}
			return
		default:
		}

	}
}
   597  
// commit submits operations to the main loop via opCh. It blocks until the
// operations are accepted or shutdownCh is closed, whichever happens first;
// in the latter case the operations are dropped.
func (c *ServiceClient) commit(ops *operations) {
	select {
	case c.opCh <- ops:
	case <-c.shutdownCh:
	}
}
   605  
   606  func (c *ServiceClient) clearExplicitlyDeregistered() {
   607  	c.explicitlyDeregisteredServices = map[string]bool{}
   608  	c.explicitlyDeregisteredChecks = map[string]bool{}
   609  }
   610  
   611  // merge registrations into state map prior to sync'ing with Consul
   612  func (c *ServiceClient) merge(ops *operations) {
   613  	for _, s := range ops.regServices {
   614  		c.services[s.ID] = s
   615  	}
   616  	for _, check := range ops.regChecks {
   617  		c.checks[check.ID] = check
   618  	}
   619  	for _, sid := range ops.deregServices {
   620  		delete(c.services, sid)
   621  		c.explicitlyDeregisteredServices[sid] = true
   622  	}
   623  	for _, cid := range ops.deregChecks {
   624  		delete(c.checks, cid)
   625  		c.explicitlyDeregisteredChecks[cid] = true
   626  	}
   627  	metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services)))
   628  	metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks)))
   629  }
   630  
// sync reconciles the desired services and checks (c.services / c.checks)
// with what the Consul agent currently has registered. In order, it:
//
//  1. deregisters Nomad-managed services present in Consul but unknown locally,
//  2. registers services that are missing from Consul or need an update,
//  3. deregisters Nomad-managed checks present in Consul but unknown locally,
//  4. registers checks that are missing from Consul (existing checks are not
//     compared for changes, only for presence).
//
// reason is forwarded to agentServiceUpdateRequired to decide how existing
// services are compared. sync returns on the first error it encounters,
// incrementing the sync_failure counter, so a failed sync may have applied
// only part of the changes.
func (c *ServiceClient) sync(reason syncReason) error {
	// Counters for the summary log line at the end.
	sreg, creg, sdereg, cdereg := 0, 0, 0, 0

	consulServices, err := c.client.Services()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul services: %v", err)
	}

	// During probation, unknown entries are only removed if they were
	// explicitly deregistered.
	inProbation := time.Now().Before(c.deregisterProbationExpiry)

	// Remove Nomad services in Consul but unknown locally
	for id := range consulServices {
		if _, ok := c.services[id]; ok {
			// Known service, skip
			continue
		}

		// Ignore if this is not a Nomad managed service. Also ignore
		// Nomad managed services if this is not a client agent.
		// This is to prevent server agents from removing services
		// registered by client agents
		if !isNomadService(id) || !c.isClientAgent {
			// Not managed by Nomad, skip
			continue
		}

		// Ignore unknown services during probation
		if inProbation && !c.explicitlyDeregisteredServices[id] {
			continue
		}

		// Ignore if this is a service for a Nomad managed sidecar proxy.
		if isNomadSidecar(id, c.services) {
			continue
		}

		// Unknown Nomad managed service; kill
		if err := c.client.ServiceDeregister(id); err != nil {
			if isOldNomadService(id) {
				// Don't hard-fail on old entries. See #3620
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		sdereg++
		metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
	}

	// Add Nomad services missing from Consul, or where the service has been updated.
	for id, serviceInNomad := range c.services {

		serviceInConsul, exists := consulServices[id]
		sidecarInConsul := getNomadSidecar(id, consulServices)

		// The update check is only evaluated when the service exists, so
		// serviceInConsul is non-nil whenever it is inspected.
		if !exists || agentServiceUpdateRequired(reason, serviceInNomad, serviceInConsul, sidecarInConsul) {
			if err = c.client.ServiceRegister(serviceInNomad); err != nil {
				metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
				return err
			}
			sreg++
			metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
		}

	}

	consulChecks, err := c.client.Checks()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul checks: %v", err)
	}

	// Remove Nomad checks in Consul but unknown locally
	for id, check := range consulChecks {
		if _, ok := c.checks[id]; ok {
			// Known check, leave it
			continue
		}

		// Ignore if this is not a Nomad managed check. Also ignore
		// Nomad managed checks if this is not a client agent.
		// This is to prevent server agents from removing checks
		// registered by client agents
		if !isNomadService(check.ServiceID) || !c.isClientAgent || !isNomadCheck(check.CheckID) {
			// Service not managed by Nomad, skip
			continue
		}

		// Ignore unknown checks during probation
		if inProbation && !c.explicitlyDeregisteredChecks[id] {
			continue
		}

		// Ignore if this is a check for a Nomad managed sidecar proxy.
		if isNomadSidecar(check.ServiceID, c.services) {
			continue
		}

		// Unknown Nomad managed check; remove
		if err := c.client.CheckDeregister(id); err != nil {
			if isOldNomadService(check.ServiceID) {
				// Don't hard-fail on old entries.
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		cdereg++
		metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
	}

	// Add Nomad checks missing from Consul
	for id, check := range c.checks {
		if _, ok := consulChecks[id]; ok {
			// Already in Consul; skipping
			continue
		}

		if err := c.client.CheckRegister(check); err != nil {
			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		creg++
		metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)
	}

	// Only log if something was actually synced
	if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 {
		c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg,
			"registered_checks", creg, "deregistered_checks", cdereg)
	}
	return nil
}
   768  
   769  // RegisterAgent registers Nomad agents (client or server). The
   770  // Service.PortLabel should be a literal port to be parsed with SplitHostPort.
   771  // Script checks are not supported and will return an error. Registration is
   772  // asynchronous.
   773  //
   774  // Agents will be deregistered when Shutdown is called.
   775  func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
   776  	ops := operations{}
   777  
   778  	for _, service := range services {
   779  		id := makeAgentServiceID(role, service)
   780  
   781  		// Unlike tasks, agents don't use port labels. Agent ports are
   782  		// stored directly in the PortLabel.
   783  		host, rawport, err := net.SplitHostPort(service.PortLabel)
   784  		if err != nil {
   785  			return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
   786  		}
   787  		port, err := strconv.Atoi(rawport)
   788  		if err != nil {
   789  			return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
   790  		}
   791  		serviceReg := &api.AgentServiceRegistration{
   792  			ID:      id,
   793  			Name:    service.Name,
   794  			Tags:    service.Tags,
   795  			Address: host,
   796  			Port:    port,
   797  			// This enables the consul UI to show that Nomad registered this service
   798  			Meta: map[string]string{
   799  				"external-source": "nomad",
   800  			},
   801  		}
   802  		ops.regServices = append(ops.regServices, serviceReg)
   803  
   804  		for _, check := range service.Checks {
   805  			checkID := MakeCheckID(id, check)
   806  			if check.Type == structs.ServiceCheckScript {
   807  				return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
   808  			}
   809  			checkHost, checkPort := serviceReg.Address, serviceReg.Port
   810  			if check.PortLabel != "" {
   811  				// Unlike tasks, agents don't use port labels. Agent ports are
   812  				// stored directly in the PortLabel.
   813  				host, rawport, err := net.SplitHostPort(check.PortLabel)
   814  				if err != nil {
   815  					return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err)
   816  				}
   817  				port, err := strconv.Atoi(rawport)
   818  				if err != nil {
   819  					return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err)
   820  				}
   821  				checkHost, checkPort = host, port
   822  			}
   823  			checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort)
   824  			if err != nil {
   825  				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
   826  			}
   827  			ops.regChecks = append(ops.regChecks, checkReg)
   828  		}
   829  	}
   830  
   831  	// Don't bother committing agent checks if we're already shutting down
   832  	c.agentLock.Lock()
   833  	defer c.agentLock.Unlock()
   834  	select {
   835  	case <-c.shutdownCh:
   836  		return nil
   837  	default:
   838  	}
   839  
   840  	// Now add them to the registration queue
   841  	c.commit(&ops)
   842  
   843  	// Record IDs for deregistering on shutdown
   844  	for _, id := range ops.regServices {
   845  		c.agentServices[id.ID] = struct{}{}
   846  	}
   847  	for _, id := range ops.regChecks {
   848  		c.agentChecks[id.ID] = struct{}{}
   849  	}
   850  	return nil
   851  }
   852  
   853  // serviceRegs creates service registrations, check registrations, and script
   854  // checks from a service. It returns a service registration object with the
   855  // service and check IDs populated.
   856  func (c *ServiceClient) serviceRegs(ops *operations, service *structs.Service, workload *WorkloadServices) (
   857  	*ServiceRegistration, error) {
   858  
   859  	// Get the services ID
   860  	id := MakeAllocServiceID(workload.AllocID, workload.Name(), service)
   861  	sreg := &ServiceRegistration{
   862  		serviceID:     id,
   863  		checkIDs:      make(map[string]struct{}, len(service.Checks)),
   864  		CheckOnUpdate: make(map[string]string, len(service.Checks)),
   865  	}
   866  
   867  	// Service address modes default to auto
   868  	addrMode := service.AddressMode
   869  	if addrMode == "" {
   870  		addrMode = structs.AddressModeAuto
   871  	}
   872  
   873  	// Determine the address to advertise based on the mode
   874  	ip, port, err := getAddress(addrMode, service.PortLabel, workload.Networks, workload.DriverNetwork, workload.Ports, workload.NetworkStatus)
   875  	if err != nil {
   876  		return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err)
   877  	}
   878  
   879  	// Determine whether to use tags or canary_tags
   880  	var tags []string
   881  	if workload.Canary && len(service.CanaryTags) > 0 {
   882  		tags = make([]string, len(service.CanaryTags))
   883  		copy(tags, service.CanaryTags)
   884  	} else {
   885  		tags = make([]string, len(service.Tags))
   886  		copy(tags, service.Tags)
   887  	}
   888  
   889  	// newConnect returns (nil, nil) if there's no Connect-enabled service.
   890  	connect, err := newConnect(id, service.Name, service.Connect, workload.Networks, workload.Ports)
   891  	if err != nil {
   892  		return nil, fmt.Errorf("invalid Consul Connect configuration for service %q: %v", service.Name, err)
   893  	}
   894  
   895  	// newConnectGateway returns nil if there's no Connect gateway.
   896  	gateway := newConnectGateway(service.Name, service.Connect)
   897  
   898  	// Determine whether to use meta or canary_meta
   899  	var meta map[string]string
   900  	if workload.Canary && len(service.CanaryMeta) > 0 {
   901  		meta = make(map[string]string, len(service.CanaryMeta)+1)
   902  		for k, v := range service.CanaryMeta {
   903  			meta[k] = v
   904  		}
   905  	} else {
   906  		meta = make(map[string]string, len(service.Meta)+1)
   907  		for k, v := range service.Meta {
   908  			meta[k] = v
   909  		}
   910  	}
   911  
   912  	// This enables the consul UI to show that Nomad registered this service
   913  	meta["external-source"] = "nomad"
   914  
   915  	// Explicitly set the Consul service Kind in case this service represents
   916  	// one of the Connect gateway types.
   917  	kind := api.ServiceKindTypical
   918  	switch {
   919  	case service.Connect.IsIngress():
   920  		kind = api.ServiceKindIngressGateway
   921  	case service.Connect.IsTerminating():
   922  		kind = api.ServiceKindTerminatingGateway
   923  		// set the default port if bridge / default listener set
   924  		if defaultBind, exists := service.Connect.Gateway.Proxy.EnvoyGatewayBindAddresses["default"]; exists {
   925  			portLabel := fmt.Sprintf("%s-%s", structs.ConnectTerminatingPrefix, service.Name)
   926  			if dynPort, ok := workload.Ports.Get(portLabel); ok {
   927  				defaultBind.Port = dynPort.Value
   928  			}
   929  		}
   930  	}
   931  
   932  	// Build the Consul Service registration request
   933  	serviceReg := &api.AgentServiceRegistration{
   934  		Kind:              kind,
   935  		ID:                id,
   936  		Name:              service.Name,
   937  		Tags:              tags,
   938  		EnableTagOverride: service.EnableTagOverride,
   939  		Address:           ip,
   940  		Port:              port,
   941  		Meta:              meta,
   942  		Connect:           connect, // will be nil if no Connect stanza
   943  		Proxy:             gateway, // will be nil if no Connect Gateway stanza
   944  	}
   945  	ops.regServices = append(ops.regServices, serviceReg)
   946  
   947  	// Build the check registrations
   948  	checkRegs, err := c.checkRegs(id, service, workload, sreg)
   949  	if err != nil {
   950  		return nil, err
   951  	}
   952  	for _, registration := range checkRegs {
   953  		sreg.checkIDs[registration.ID] = struct{}{}
   954  		ops.regChecks = append(ops.regChecks, registration)
   955  	}
   956  
   957  	return sreg, nil
   958  }
   959  
   960  // checkRegs creates check registrations for the given service
   961  func (c *ServiceClient) checkRegs(serviceID string, service *structs.Service,
   962  	workload *WorkloadServices, sreg *ServiceRegistration) ([]*api.AgentCheckRegistration, error) {
   963  
   964  	registrations := make([]*api.AgentCheckRegistration, 0, len(service.Checks))
   965  	for _, check := range service.Checks {
   966  		var ip string
   967  		var port int
   968  
   969  		if check.Type != structs.ServiceCheckScript {
   970  			portLabel := check.PortLabel
   971  			if portLabel == "" {
   972  				portLabel = service.PortLabel
   973  			}
   974  
   975  			addrMode := check.AddressMode
   976  			if addrMode == "" {
   977  				// pre-#3380 compat
   978  				addrMode = structs.AddressModeHost
   979  			}
   980  
   981  			var err error
   982  			ip, port, err = getAddress(addrMode, portLabel, workload.Networks, workload.DriverNetwork, workload.Ports, workload.NetworkStatus)
   983  			if err != nil {
   984  				return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err)
   985  			}
   986  		}
   987  
   988  		checkID := MakeCheckID(serviceID, check)
   989  		registration, err := createCheckReg(serviceID, checkID, check, ip, port)
   990  		if err != nil {
   991  			return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err)
   992  		}
   993  		sreg.CheckOnUpdate[checkID] = check.OnUpdate
   994  
   995  		registrations = append(registrations, registration)
   996  	}
   997  
   998  	return registrations, nil
   999  }
  1000  
  1001  // RegisterWorkload with Consul. Adds all service entries and checks to Consul.
  1002  //
  1003  // If the service IP is set it used as the address in the service registration.
  1004  // Checks will always use the IP from the Task struct (host's IP).
  1005  //
  1006  // Actual communication with Consul is done asynchronously (see Run).
  1007  func (c *ServiceClient) RegisterWorkload(workload *WorkloadServices) error {
  1008  	// Fast path
  1009  	numServices := len(workload.Services)
  1010  	if numServices == 0 {
  1011  		return nil
  1012  	}
  1013  
  1014  	t := new(ServiceRegistrations)
  1015  	t.Services = make(map[string]*ServiceRegistration, numServices)
  1016  
  1017  	ops := &operations{}
  1018  	for _, service := range workload.Services {
  1019  		sreg, err := c.serviceRegs(ops, service, workload)
  1020  		if err != nil {
  1021  			return err
  1022  		}
  1023  		t.Services[sreg.serviceID] = sreg
  1024  	}
  1025  
  1026  	// Add the workload to the allocation's registration
  1027  	c.addRegistrations(workload.AllocID, workload.Name(), t)
  1028  
  1029  	c.commit(ops)
  1030  
  1031  	// Start watching checks. Done after service registrations are built
  1032  	// since an error building them could leak watches.
  1033  	for _, service := range workload.Services {
  1034  		serviceID := MakeAllocServiceID(workload.AllocID, workload.Name(), service)
  1035  		for _, check := range service.Checks {
  1036  			if check.TriggersRestarts() {
  1037  				checkID := MakeCheckID(serviceID, check)
  1038  				c.checkWatcher.Watch(workload.AllocID, workload.Name(), checkID, check, workload.Restarter)
  1039  			}
  1040  		}
  1041  	}
  1042  	return nil
  1043  }
  1044  
  1045  // UpdateWorkload in Consul. Does not alter the service if only checks have
  1046  // changed.
  1047  //
  1048  // DriverNetwork must not change between invocations for the same allocation.
  1049  func (c *ServiceClient) UpdateWorkload(old, newWorkload *WorkloadServices) error {
  1050  	ops := new(operations)
  1051  	regs := new(ServiceRegistrations)
  1052  	regs.Services = make(map[string]*ServiceRegistration, len(newWorkload.Services))
  1053  
  1054  	newIDs := make(map[string]*structs.Service, len(newWorkload.Services))
  1055  	for _, s := range newWorkload.Services {
  1056  		newIDs[MakeAllocServiceID(newWorkload.AllocID, newWorkload.Name(), s)] = s
  1057  	}
  1058  
  1059  	// Loop over existing Services to see if they have been removed
  1060  	for _, existingSvc := range old.Services {
  1061  		existingID := MakeAllocServiceID(old.AllocID, old.Name(), existingSvc)
  1062  		newSvc, ok := newIDs[existingID]
  1063  
  1064  		if !ok {
  1065  			// Existing service entry removed
  1066  			ops.deregServices = append(ops.deregServices, existingID)
  1067  			for _, check := range existingSvc.Checks {
  1068  				cid := MakeCheckID(existingID, check)
  1069  				ops.deregChecks = append(ops.deregChecks, cid)
  1070  
  1071  				// Unwatch watched checks
  1072  				if check.TriggersRestarts() {
  1073  					c.checkWatcher.Unwatch(cid)
  1074  				}
  1075  			}
  1076  			continue
  1077  		}
  1078  
  1079  		oldHash := existingSvc.Hash(old.AllocID, old.Name(), old.Canary)
  1080  		newHash := newSvc.Hash(newWorkload.AllocID, newWorkload.Name(), newWorkload.Canary)
  1081  		if oldHash == newHash {
  1082  			// Service exists and hasn't changed, don't re-add it later
  1083  			delete(newIDs, existingID)
  1084  		}
  1085  
  1086  		// Service still exists so add it to the task's registration
  1087  		sreg := &ServiceRegistration{
  1088  			serviceID:     existingID,
  1089  			checkIDs:      make(map[string]struct{}, len(newSvc.Checks)),
  1090  			CheckOnUpdate: make(map[string]string, len(newSvc.Checks)),
  1091  		}
  1092  		regs.Services[existingID] = sreg
  1093  
  1094  		// See if any checks were updated
  1095  		existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
  1096  		for _, check := range existingSvc.Checks {
  1097  			existingChecks[MakeCheckID(existingID, check)] = check
  1098  		}
  1099  
  1100  		// Register new checks
  1101  		for _, check := range newSvc.Checks {
  1102  			checkID := MakeCheckID(existingID, check)
  1103  			if _, exists := existingChecks[checkID]; exists {
  1104  				// Check is still required. Remove it from the map so it doesn't get
  1105  				// deleted later.
  1106  				delete(existingChecks, checkID)
  1107  				sreg.checkIDs[checkID] = struct{}{}
  1108  				sreg.CheckOnUpdate[checkID] = check.OnUpdate
  1109  			}
  1110  
  1111  			// New check on an unchanged service; add them now
  1112  			checkRegs, err := c.checkRegs(existingID, newSvc, newWorkload, sreg)
  1113  			if err != nil {
  1114  				return err
  1115  			}
  1116  
  1117  			for _, registration := range checkRegs {
  1118  				sreg.checkIDs[registration.ID] = struct{}{}
  1119  				sreg.CheckOnUpdate[registration.ID] = check.OnUpdate
  1120  				ops.regChecks = append(ops.regChecks, registration)
  1121  			}
  1122  
  1123  			// Update all watched checks as CheckRestart fields aren't part of ID
  1124  			if check.TriggersRestarts() {
  1125  				c.checkWatcher.Watch(newWorkload.AllocID, newWorkload.Name(), checkID, check, newWorkload.Restarter)
  1126  			}
  1127  		}
  1128  
  1129  		// Remove existing checks not in updated service
  1130  		for cid, check := range existingChecks {
  1131  			ops.deregChecks = append(ops.deregChecks, cid)
  1132  
  1133  			// Unwatch checks
  1134  			if check.TriggersRestarts() {
  1135  				c.checkWatcher.Unwatch(cid)
  1136  			}
  1137  		}
  1138  	}
  1139  
  1140  	// Any remaining services should just be enqueued directly
  1141  	for _, newSvc := range newIDs {
  1142  		sreg, err := c.serviceRegs(ops, newSvc, newWorkload)
  1143  		if err != nil {
  1144  			return err
  1145  		}
  1146  
  1147  		regs.Services[sreg.serviceID] = sreg
  1148  	}
  1149  
  1150  	// Add the task to the allocation's registration
  1151  	c.addRegistrations(newWorkload.AllocID, newWorkload.Name(), regs)
  1152  
  1153  	c.commit(ops)
  1154  
  1155  	// Start watching checks. Done after service registrations are built
  1156  	// since an error building them could leak watches.
  1157  	for serviceID, service := range newIDs {
  1158  		for _, check := range service.Checks {
  1159  			if check.TriggersRestarts() {
  1160  				checkID := MakeCheckID(serviceID, check)
  1161  				c.checkWatcher.Watch(newWorkload.AllocID, newWorkload.Name(), checkID, check, newWorkload.Restarter)
  1162  			}
  1163  		}
  1164  	}
  1165  
  1166  	return nil
  1167  }
  1168  
  1169  // RemoveWorkload from Consul. Removes all service entries and checks.
  1170  //
  1171  // Actual communication with Consul is done asynchronously (see Run).
  1172  func (c *ServiceClient) RemoveWorkload(workload *WorkloadServices) {
  1173  	ops := operations{}
  1174  
  1175  	for _, service := range workload.Services {
  1176  		id := MakeAllocServiceID(workload.AllocID, workload.Name(), service)
  1177  		ops.deregServices = append(ops.deregServices, id)
  1178  
  1179  		for _, check := range service.Checks {
  1180  			cid := MakeCheckID(id, check)
  1181  			ops.deregChecks = append(ops.deregChecks, cid)
  1182  
  1183  			if check.TriggersRestarts() {
  1184  				c.checkWatcher.Unwatch(cid)
  1185  			}
  1186  		}
  1187  	}
  1188  
  1189  	// Remove the workload from the alloc's registrations
  1190  	c.removeRegistration(workload.AllocID, workload.Name())
  1191  
  1192  	// Now add them to the deregistration fields; main Run loop will update
  1193  	c.commit(&ops)
  1194  }
  1195  
  1196  // AllocRegistrations returns the registrations for the given allocation. If the
  1197  // allocation has no reservations, the response is a nil object.
  1198  func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) {
  1199  	// Get the internal struct using the lock
  1200  	c.allocRegistrationsLock.RLock()
  1201  	regInternal, ok := c.allocRegistrations[allocID]
  1202  	if !ok {
  1203  		c.allocRegistrationsLock.RUnlock()
  1204  		return nil, nil
  1205  	}
  1206  
  1207  	// Copy so we don't expose internal structs
  1208  	reg := regInternal.copy()
  1209  	c.allocRegistrationsLock.RUnlock()
  1210  
  1211  	// Query the services and checks to populate the allocation registrations.
  1212  	services, err := c.client.Services()
  1213  	if err != nil {
  1214  		return nil, err
  1215  	}
  1216  
  1217  	checks, err := c.client.Checks()
  1218  	if err != nil {
  1219  		return nil, err
  1220  	}
  1221  
  1222  	// Populate the object
  1223  	for _, treg := range reg.Tasks {
  1224  		for serviceID, sreg := range treg.Services {
  1225  			sreg.Service = services[serviceID]
  1226  			for checkID := range sreg.checkIDs {
  1227  				if check, ok := checks[checkID]; ok {
  1228  					sreg.Checks = append(sreg.Checks, check)
  1229  				}
  1230  			}
  1231  		}
  1232  	}
  1233  
  1234  	return reg, nil
  1235  }
  1236  
// UpdateTTL is used to update the TTL of a check. Typically this will only be
// called to heartbeat script checks.
//
// The call is forwarded unchanged to the underlying Consul client and its
// error, if any, is returned as-is.
func (c *ServiceClient) UpdateTTL(id, output, status string) error {
	return c.client.UpdateTTL(id, output, status)
}
  1242  
  1243  // Shutdown the Consul client. Update running task registrations and deregister
  1244  // agent from Consul. On first call blocks up to shutdownWait before giving up
  1245  // on syncing operations.
  1246  func (c *ServiceClient) Shutdown() error {
  1247  	// Serialize Shutdown calls with RegisterAgent to prevent leaking agent
  1248  	// entries.
  1249  	c.agentLock.Lock()
  1250  	defer c.agentLock.Unlock()
  1251  	select {
  1252  	case <-c.shutdownCh:
  1253  		return nil
  1254  	default:
  1255  		close(c.shutdownCh)
  1256  	}
  1257  
  1258  	// Give run loop time to sync, but don't block indefinitely
  1259  	deadline := time.After(c.shutdownWait)
  1260  
  1261  	// Wait for Run to finish any outstanding operations and exit
  1262  	select {
  1263  	case <-c.exitCh:
  1264  	case <-deadline:
  1265  		// Don't wait forever though
  1266  	}
  1267  
  1268  	// If Consul was never seen nothing could be written so exit early
  1269  	if !c.hasSeen() {
  1270  		return nil
  1271  	}
  1272  
  1273  	// Always attempt to deregister Nomad agent Consul entries, even if
  1274  	// deadline was reached
  1275  	for id := range c.agentServices {
  1276  		if err := c.client.ServiceDeregister(id); err != nil {
  1277  			c.logger.Error("failed deregistering agent service", "service_id", id, "error", err)
  1278  		}
  1279  	}
  1280  
  1281  	remainingChecks, err := c.client.Checks()
  1282  	if err != nil {
  1283  		c.logger.Error("failed listing remaining checks after deregistering services", "error", err)
  1284  	}
  1285  
  1286  	checkRemains := func(id string) bool {
  1287  		for _, c := range remainingChecks {
  1288  			if c.CheckID == id {
  1289  				return true
  1290  			}
  1291  		}
  1292  		return false
  1293  	}
  1294  
  1295  	for id := range c.agentChecks {
  1296  		// if we couldn't populate remainingChecks it is unlikely that CheckDeregister will work, but try anyway
  1297  		// if we could list the remaining checks, verify that the check we store still exists before removing it.
  1298  		if remainingChecks == nil || checkRemains(id) {
  1299  			if err := c.client.CheckDeregister(id); err != nil {
  1300  				c.logger.Error("failed deregistering agent check", "check_id", id, "error", err)
  1301  			}
  1302  		}
  1303  	}
  1304  
  1305  	return nil
  1306  }
  1307  
  1308  // addRegistration adds the service registrations for the given allocation.
  1309  func (c *ServiceClient) addRegistrations(allocID, taskName string, reg *ServiceRegistrations) {
  1310  	c.allocRegistrationsLock.Lock()
  1311  	defer c.allocRegistrationsLock.Unlock()
  1312  
  1313  	alloc, ok := c.allocRegistrations[allocID]
  1314  	if !ok {
  1315  		alloc = &AllocRegistration{
  1316  			Tasks: make(map[string]*ServiceRegistrations),
  1317  		}
  1318  		c.allocRegistrations[allocID] = alloc
  1319  	}
  1320  	alloc.Tasks[taskName] = reg
  1321  }
  1322  
  1323  // removeRegistrations removes the registration for the given allocation.
  1324  func (c *ServiceClient) removeRegistration(allocID, taskName string) {
  1325  	c.allocRegistrationsLock.Lock()
  1326  	defer c.allocRegistrationsLock.Unlock()
  1327  
  1328  	alloc, ok := c.allocRegistrations[allocID]
  1329  	if !ok {
  1330  		return
  1331  	}
  1332  
  1333  	// Delete the task and if it is the last one also delete the alloc's
  1334  	// registration
  1335  	delete(alloc.Tasks, taskName)
  1336  	if len(alloc.Tasks) == 0 {
  1337  		delete(c.allocRegistrations, allocID)
  1338  	}
  1339  }
  1340  
  1341  // makeAgentServiceID creates a unique ID for identifying an agent service in
  1342  // Consul.
  1343  //
  1344  // Agent service IDs are of the form:
  1345  //
  1346  //	{nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...})
  1347  //	Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4
  1348  //	Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l
  1349  //
  1350  func makeAgentServiceID(role string, service *structs.Service) string {
  1351  	return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "", false))
  1352  }
  1353  
  1354  // MakeAllocServiceID creates a unique ID for identifying an alloc service in
  1355  // Consul.
  1356  //
  1357  //	Example Service ID: _nomad-task-b4e61df9-b095-d64e-f241-23860da1375f-redis-http-http
  1358  func MakeAllocServiceID(allocID, taskName string, service *structs.Service) string {
  1359  	return fmt.Sprintf("%s%s-%s-%s-%s", nomadTaskPrefix, allocID, taskName, service.Name, service.PortLabel)
  1360  }
  1361  
  1362  // MakeCheckID creates a unique ID for a check.
  1363  //
  1364  //  Example Check ID: _nomad-check-434ae42f9a57c5705344974ac38de2aee0ee089d
  1365  func MakeCheckID(serviceID string, check *structs.ServiceCheck) string {
  1366  	return fmt.Sprintf("%s%s", nomadCheckPrefix, check.Hash(serviceID))
  1367  }
  1368  
  1369  // createCheckReg creates a Check that can be registered with Consul.
  1370  //
  1371  // Script checks simply have a TTL set and the caller is responsible for
  1372  // running the script and heart-beating.
  1373  func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) {
  1374  	chkReg := api.AgentCheckRegistration{
  1375  		ID:        checkID,
  1376  		Name:      check.Name,
  1377  		ServiceID: serviceID,
  1378  	}
  1379  	chkReg.Status = check.InitialStatus
  1380  	chkReg.Timeout = check.Timeout.String()
  1381  	chkReg.Interval = check.Interval.String()
  1382  	chkReg.SuccessBeforePassing = check.SuccessBeforePassing
  1383  	chkReg.FailuresBeforeCritical = check.FailuresBeforeCritical
  1384  
  1385  	// Require an address for http or tcp checks
  1386  	if port == 0 && check.RequiresPort() {
  1387  		return nil, fmt.Errorf("%s checks require an address", check.Type)
  1388  	}
  1389  
  1390  	switch check.Type {
  1391  	case structs.ServiceCheckHTTP:
  1392  		proto := check.Protocol
  1393  		if proto == "" {
  1394  			proto = "http"
  1395  		}
  1396  		if check.TLSSkipVerify {
  1397  			chkReg.TLSSkipVerify = true
  1398  		}
  1399  		base := url.URL{
  1400  			Scheme: proto,
  1401  			Host:   net.JoinHostPort(host, strconv.Itoa(port)),
  1402  		}
  1403  		relative, err := url.Parse(check.Path)
  1404  		if err != nil {
  1405  			return nil, err
  1406  		}
  1407  		checkURL := base.ResolveReference(relative)
  1408  		chkReg.HTTP = checkURL.String()
  1409  		chkReg.Method = check.Method
  1410  		chkReg.Header = check.Header
  1411  
  1412  	case structs.ServiceCheckTCP:
  1413  		chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))
  1414  
  1415  	case structs.ServiceCheckScript:
  1416  		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
  1417  		// As of Consul 1.0.0 setting TTL and Interval is a 400
  1418  		chkReg.Interval = ""
  1419  
  1420  	case structs.ServiceCheckGRPC:
  1421  		chkReg.GRPC = fmt.Sprintf("%s/%s", net.JoinHostPort(host, strconv.Itoa(port)), check.GRPCService)
  1422  		chkReg.GRPCUseTLS = check.GRPCUseTLS
  1423  		if check.TLSSkipVerify {
  1424  			chkReg.TLSSkipVerify = true
  1425  		}
  1426  
  1427  	default:
  1428  		return nil, fmt.Errorf("check type %+q not valid", check.Type)
  1429  	}
  1430  	return &chkReg, nil
  1431  }
  1432  
  1433  // isNomadCheck returns true if the ID matches the pattern of a Nomad managed
  1434  // check.
  1435  func isNomadCheck(id string) bool {
  1436  	return strings.HasPrefix(id, nomadCheckPrefix)
  1437  }
  1438  
  1439  // isNomadService returns true if the ID matches the pattern of a Nomad managed
  1440  // service (new or old formats). Agent services return false as independent
  1441  // client and server agents may be running on the same machine. #2827
  1442  func isNomadService(id string) bool {
  1443  	return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
  1444  }
  1445  
  1446  // isOldNomadService returns true if the ID matches an old pattern managed by
  1447  // Nomad.
  1448  //
  1449  // Pre-0.7.1 task service IDs are of the form:
  1450  //
  1451  //	{nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
  1452  //	Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
  1453  //
  1454  func isOldNomadService(id string) bool {
  1455  	const prefix = nomadServicePrefix + "-executor"
  1456  	return strings.HasPrefix(id, prefix)
  1457  }
  1458  
const (
	// sidecarSuffix is the suffix that, appended to the ID of a managed
	// service, forms the ID of that service's sidecar proxy.
	sidecarSuffix = "-sidecar-proxy"
)
  1462  
  1463  // isNomadSidecar returns true if the ID matches a sidecar proxy for a Nomad
  1464  // managed service.
  1465  //
  1466  // For example if you have a Connect enabled service with the ID:
  1467  //
  1468  //	_nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db
  1469  //
  1470  // Consul will create a service for the sidecar proxy with the ID:
  1471  //
  1472  //	_nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db-sidecar-proxy
  1473  //
  1474  func isNomadSidecar(id string, services map[string]*api.AgentServiceRegistration) bool {
  1475  	if !strings.HasSuffix(id, sidecarSuffix) {
  1476  		return false
  1477  	}
  1478  
  1479  	// Make sure the Nomad managed service for this proxy still exists.
  1480  	_, ok := services[id[:len(id)-len(sidecarSuffix)]]
  1481  	return ok
  1482  }
  1483  
  1484  // getNomadSidecar returns the service registration of the sidecar for the managed
  1485  // service with the specified id.
  1486  //
  1487  // If the managed service of the specified id does not exist, or the service does
  1488  // not have a sidecar proxy, nil is returned.
  1489  func getNomadSidecar(id string, services map[string]*api.AgentService) *api.AgentService {
  1490  	if _, exists := services[id]; !exists {
  1491  		return nil
  1492  	}
  1493  
  1494  	sidecarID := id + sidecarSuffix
  1495  	return services[sidecarID]
  1496  }
  1497  
  1498  // getAddress returns the IP and port to use for a service or check. If no port
  1499  // label is specified (an empty value), zero values are returned because no
  1500  // address could be resolved.
  1501  func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *drivers.DriverNetwork, ports structs.AllocatedPorts, netStatus *structs.AllocNetworkStatus) (string, int, error) {
  1502  	switch addrMode {
  1503  	case structs.AddressModeAuto:
  1504  		if driverNet.Advertise() {
  1505  			addrMode = structs.AddressModeDriver
  1506  		} else {
  1507  			addrMode = structs.AddressModeHost
  1508  		}
  1509  		return getAddress(addrMode, portLabel, networks, driverNet, ports, netStatus)
  1510  	case structs.AddressModeHost:
  1511  		if portLabel == "" {
  1512  			if len(networks) != 1 {
  1513  				// If no networks are specified return zero
  1514  				// values. Consul will advertise the host IP
  1515  				// with no port. This is the pre-0.7.1 behavior
  1516  				// some people rely on.
  1517  				return "", 0, nil
  1518  			}
  1519  
  1520  			return networks[0].IP, 0, nil
  1521  		}
  1522  
  1523  		// Default path: use host ip:port
  1524  		// Try finding port in the AllocatedPorts struct first
  1525  		// Check in Networks struct for backwards compatibility if not found
  1526  		mapping, ok := ports.Get(portLabel)
  1527  		if !ok {
  1528  			mapping = networks.Port(portLabel)
  1529  			if mapping.Value > 0 {
  1530  				return mapping.HostIP, mapping.Value, nil
  1531  			}
  1532  
  1533  			// If port isn't a label, try to parse it as a literal port number
  1534  			port, err := strconv.Atoi(portLabel)
  1535  			if err != nil {
  1536  				// Don't include Atoi error message as user likely
  1537  				// never intended it to be a numeric and it creates a
  1538  				// confusing error message
  1539  				return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel)
  1540  			}
  1541  			if port <= 0 {
  1542  				return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
  1543  			}
  1544  
  1545  			// A number was given which will use the Consul agent's address and the given port
  1546  			// Returning a blank string as an address will use the Consul agent's address
  1547  			return "", port, nil
  1548  		}
  1549  		return mapping.HostIP, mapping.Value, nil
  1550  
  1551  	case structs.AddressModeDriver:
  1552  		// Require a driver network if driver address mode is used
  1553  		if driverNet == nil {
  1554  			return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`)
  1555  		}
  1556  
  1557  		// If no port label is specified just return the IP
  1558  		if portLabel == "" {
  1559  			return driverNet.IP, 0, nil
  1560  		}
  1561  
  1562  		// If the port is a label, use the driver's port (not the host's)
  1563  		if port, ok := ports.Get(portLabel); ok {
  1564  			return driverNet.IP, port.To, nil
  1565  		}
  1566  
  1567  		// Check if old style driver portmap is used
  1568  		if port, ok := driverNet.PortMap[portLabel]; ok {
  1569  			return driverNet.IP, port, nil
  1570  		}
  1571  
  1572  		// If port isn't a label, try to parse it as a literal port number
  1573  		port, err := strconv.Atoi(portLabel)
  1574  		if err != nil {
  1575  			// Don't include Atoi error message as user likely
  1576  			// never intended it to be a numeric and it creates a
  1577  			// confusing error message
  1578  			return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel)
  1579  		}
  1580  		if port <= 0 {
  1581  			return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
  1582  		}
  1583  
  1584  		return driverNet.IP, port, nil
  1585  
  1586  	case structs.AddressModeAlloc:
  1587  		if netStatus == nil {
  1588  			return "", 0, fmt.Errorf(`cannot use address_mode="alloc": no allocation network status reported`)
  1589  		}
  1590  
  1591  		// If no port label is specified just return the IP
  1592  		if portLabel == "" {
  1593  			return netStatus.Address, 0, nil
  1594  		}
  1595  
  1596  		// If port is a label and is found then return it
  1597  		if port, ok := ports.Get(portLabel); ok {
  1598  			// Use port.To value unless not set
  1599  			if port.To > 0 {
  1600  				return netStatus.Address, port.To, nil
  1601  			}
  1602  			return netStatus.Address, port.Value, nil
  1603  		}
  1604  
  1605  		// Check if port is a literal number
  1606  		port, err := strconv.Atoi(portLabel)
  1607  		if err != nil {
  1608  			// User likely specified wrong port label here
  1609  			return "", 0, fmt.Errorf("invalid port %q: port label not found or is not numeric", portLabel)
  1610  		}
  1611  		if port <= 0 {
  1612  			return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
  1613  		}
  1614  		return netStatus.Address, port, nil
  1615  
  1616  	default:
  1617  		// Shouldn't happen due to validation, but enforce invariants
  1618  		return "", 0, fmt.Errorf("invalid address mode %q", addrMode)
  1619  	}
  1620  }