github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/command/agent/consul/client.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"net"
     8  	"net/url"
     9  	"strconv"
    10  	"strings"
    11  	"sync"
    12  	"sync/atomic"
    13  	"time"
    14  
    15  	metrics "github.com/armon/go-metrics"
    16  	"github.com/hashicorp/consul/api"
    17  	"github.com/hashicorp/nomad/client/driver"
    18  	cstructs "github.com/hashicorp/nomad/client/structs"
    19  	"github.com/hashicorp/nomad/helper"
    20  	"github.com/hashicorp/nomad/nomad/structs"
    21  )
    22  
const (
	// nomadServicePrefix is the prefix that scopes all Nomad registered
	// services (both agent and task entries).
	nomadServicePrefix = "_nomad"

	// nomadTaskPrefix is the prefix that scopes Nomad registered services
	// for tasks. It is built on top of nomadServicePrefix.
	nomadTaskPrefix = nomadServicePrefix + "-task-"

	// defaultRetryInterval is how quickly to retry syncing services and
	// checks to Consul when an error occurs. The retry interval backs off
	// up to a maximum (see defaultMaxRetryInterval).
	defaultRetryInterval = time.Second

	// defaultMaxRetryInterval is the default max retry interval.
	defaultMaxRetryInterval = 30 * time.Second

	// ttlCheckBuffer is the time interval that Nomad can take to report
	// the check result to Consul.
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services
	ServiceTagSerf = "serf"
)
    60  
// CatalogAPI is the consul/api.Catalog API used by Nomad. Declaring it as an
// interface lets callers depend only on the methods listed here.
type CatalogAPI interface {
	Datacenters() ([]string, error)
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}
    66  
// AgentAPI is the consul/api.Agent API used by Nomad. ServiceClient holds an
// AgentAPI and uses it to list, register, and deregister services and checks.
type AgentAPI interface {
	// Services and Checks return the agent's entries keyed by ID.
	Services() (map[string]*api.AgentService, error)
	Checks() (map[string]*api.AgentCheck, error)
	CheckRegister(check *api.AgentCheckRegistration) error
	CheckDeregister(checkID string) error
	ServiceRegister(service *api.AgentServiceRegistration) error
	ServiceDeregister(serviceID string) error
	UpdateTTL(id, output, status string) error
}
    77  
// operations are submitted to the main loop via commit() for synchronizing
// with Consul. The Run loop folds each batch into the client's state maps
// with merge() before calling sync().
type operations struct {
	// Registrations to add to the desired state, keyed by their IDs when
	// merged.
	regServices []*api.AgentServiceRegistration
	regChecks   []*api.AgentCheckRegistration
	scripts     []*scriptCheck

	// IDs of services and checks to drop from the desired state.
	deregServices []string
	deregChecks   []string
}
    88  
// AllocRegistration holds the status of services registered for a particular
// allocation, grouped by task.
type AllocRegistration struct {
	// Tasks maps the name of a task to its registered services and checks
	Tasks map[string]*TaskRegistration
}
    95  
    96  func (a *AllocRegistration) copy() *AllocRegistration {
    97  	c := &AllocRegistration{
    98  		Tasks: make(map[string]*TaskRegistration, len(a.Tasks)),
    99  	}
   100  
   101  	for k, v := range a.Tasks {
   102  		c.Tasks[k] = v.copy()
   103  	}
   104  
   105  	return c
   106  }
   107  
   108  // NumServices returns the number of registered services
   109  func (a *AllocRegistration) NumServices() int {
   110  	if a == nil {
   111  		return 0
   112  	}
   113  
   114  	total := 0
   115  	for _, treg := range a.Tasks {
   116  		for _, sreg := range treg.Services {
   117  			if sreg.Service != nil {
   118  				total++
   119  			}
   120  		}
   121  	}
   122  
   123  	return total
   124  }
   125  
   126  // NumChecks returns the number of registered checks
   127  func (a *AllocRegistration) NumChecks() int {
   128  	if a == nil {
   129  		return 0
   130  	}
   131  
   132  	total := 0
   133  	for _, treg := range a.Tasks {
   134  		for _, sreg := range treg.Services {
   135  			total += len(sreg.Checks)
   136  		}
   137  	}
   138  
   139  	return total
   140  }
   141  
// TaskRegistration holds the status of services registered for a particular
// task.
type TaskRegistration struct {
	// Services maps a service ID to that service's registration. The
	// register/update paths in this file key it by ServiceRegistration.serviceID.
	Services map[string]*ServiceRegistration
}
   147  
   148  func (t *TaskRegistration) copy() *TaskRegistration {
   149  	c := &TaskRegistration{
   150  		Services: make(map[string]*ServiceRegistration, len(t.Services)),
   151  	}
   152  
   153  	for k, v := range t.Services {
   154  		c.Services[k] = v.copy()
   155  	}
   156  
   157  	return c
   158  }
   159  
// ServiceRegistration holds the status of a registered Consul Service and its
// Checks.
type ServiceRegistration struct {
	// serviceID and checkIDs are internal fields that track just the IDs of the
	// services/checks registered in Consul. They are used to materialize the
	// other fields when queried.
	serviceID string
	checkIDs  map[string]struct{}

	// Service is the AgentService registered in Consul.
	Service *api.AgentService

	// Checks is the status of the registered checks.
	Checks []*api.AgentCheck
}
   175  
   176  func (s *ServiceRegistration) copy() *ServiceRegistration {
   177  	// Copy does not copy the external fields but only the internal fields. This
   178  	// is so that the caller of AllocRegistrations can not access the internal
   179  	// fields and that method uses these fields to populate the external fields.
   180  	return &ServiceRegistration{
   181  		serviceID: s.serviceID,
   182  		checkIDs:  helper.CopyMapStringStruct(s.checkIDs),
   183  	}
   184  }
   185  
// ServiceClient handles task and agent service registration with Consul.
type ServiceClient struct {
	// client is the Consul agent API; retryInterval and maxRetryInterval
	// bound the backoff used by Run when sync fails.
	client           AgentAPI
	logger           *log.Logger
	retryInterval    time.Duration
	maxRetryInterval time.Duration

	// skipVerifySupport is true if the local Consul agent supports TLSSkipVerify
	skipVerifySupport bool

	// exitCh is closed when the main Run loop exits
	exitCh chan struct{}

	// shutdownCh is closed when the client should shutdown
	shutdownCh chan struct{}

	// shutdownWait is how long Shutdown() blocks waiting for the final
	// sync() to finish. Defaults to defaultShutdownWait
	shutdownWait time.Duration

	// opCh carries batches of operations from commit() to the Run loop.
	opCh chan *operations

	// services, checks, and scripts hold the desired state keyed by ID, as
	// built up by merge(). runningScripts holds handles for script checks
	// that sync() has started.
	services       map[string]*api.AgentServiceRegistration
	checks         map[string]*api.AgentCheckRegistration
	scripts        map[string]*scriptCheck
	runningScripts map[string]*scriptHandle

	// allocRegistrations stores the services and checks that are registered
	// with Consul by allocation ID.
	allocRegistrations     map[string]*AllocRegistration
	allocRegistrationsLock sync.RWMutex

	// agent services and checks record entries for the agent itself which
	// should be removed on shutdown
	agentServices map[string]struct{}
	agentChecks   map[string]struct{}
	agentLock     sync.Mutex

	// seen is 1 if Consul has ever been seen; otherwise 0. Accessed with
	// atomics.
	seen int32

	// checkWatcher restarts checks that are unhealthy.
	checkWatcher *checkWatcher
}
   231  
   232  // NewServiceClient creates a new Consul ServiceClient from an existing Consul API
   233  // Client and logger.
   234  func NewServiceClient(consulClient AgentAPI, skipVerifySupport bool, logger *log.Logger) *ServiceClient {
   235  	return &ServiceClient{
   236  		client:             consulClient,
   237  		skipVerifySupport:  skipVerifySupport,
   238  		logger:             logger,
   239  		retryInterval:      defaultRetryInterval,
   240  		maxRetryInterval:   defaultMaxRetryInterval,
   241  		exitCh:             make(chan struct{}),
   242  		shutdownCh:         make(chan struct{}),
   243  		shutdownWait:       defaultShutdownWait,
   244  		opCh:               make(chan *operations, 8),
   245  		services:           make(map[string]*api.AgentServiceRegistration),
   246  		checks:             make(map[string]*api.AgentCheckRegistration),
   247  		scripts:            make(map[string]*scriptCheck),
   248  		runningScripts:     make(map[string]*scriptHandle),
   249  		allocRegistrations: make(map[string]*AllocRegistration),
   250  		agentServices:      make(map[string]struct{}),
   251  		agentChecks:        make(map[string]struct{}),
   252  		checkWatcher:       newCheckWatcher(logger, consulClient),
   253  	}
   254  }
   255  
// seen is used by markSeen and hasSeen. It is the value stored in
// ServiceClient.seen once Consul has been seen.
const seen = 1
   258  
   259  // markSeen marks Consul as having been seen (meaning at least one operation
   260  // has succeeded).
   261  func (c *ServiceClient) markSeen() {
   262  	atomic.StoreInt32(&c.seen, seen)
   263  }
   264  
   265  // hasSeen returns true if any Consul operation has ever succeeded. Useful to
   266  // squelch errors if Consul isn't running.
   267  func (c *ServiceClient) hasSeen() bool {
   268  	return atomic.LoadInt32(&c.seen) == seen
   269  }
   270  
// Run the Consul main loop which retries operations against Consul. It should
// be called exactly once.
//
// Each iteration waits for a retry timer tick, the shutdown signal, or a
// batch of operations from commit(), and then calls sync(). A failed sync
// arms the retry timer with retryInterval multiplied by the number of
// consecutive failures, capped at maxRetryInterval. Once shutdownCh is closed
// the loop keeps merging and syncing while operations remain buffered in
// opCh, and returns when the buffer is empty (whether or not the last sync
// succeeded). exitCh is closed on return.
func (c *ServiceClient) Run() {
	defer close(c.exitCh)

	// start checkWatcher
	ctx, cancelWatcher := context.WithCancel(context.Background())
	defer cancelWatcher()
	go c.checkWatcher.Run(ctx)

	// Create the timer already fired and drain it so it stays idle until
	// the first sync failure resets it.
	retryTimer := time.NewTimer(0)
	<-retryTimer.C // disabled by default
	failures := 0
	for {
		// Block until there is a reason to sync.
		select {
		case <-retryTimer.C:
		case <-c.shutdownCh:
			cancelWatcher()
		case ops := <-c.opCh:
			c.merge(ops)
		}

		if err := c.sync(); err != nil {
			if failures == 0 {
				// Log on the first failure
				c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err)
			} else if failures%10 == 0 {
				// Log every 10th consecutive failure
				c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err)
			}

			failures++
			if !retryTimer.Stop() {
				// Timer already expired, since the timer may
				// or may not have been read in the select{}
				// above, conditionally receive on it
				select {
				case <-retryTimer.C:
				default:
				}
			}
			// Linear backoff, capped at the configured maximum.
			backoff := c.retryInterval * time.Duration(failures)
			if backoff > c.maxRetryInterval {
				backoff = c.maxRetryInterval
			}
			retryTimer.Reset(backoff)
		} else {
			if failures > 0 {
				c.logger.Printf("[INFO] consul.sync: successfully updated services in Consul")
				failures = 0
			}
		}

		// Non-blocking shutdown check after every sync attempt.
		select {
		case <-c.shutdownCh:
			// Exit only after sync'ing all outstanding operations
			if len(c.opCh) > 0 {
				for len(c.opCh) > 0 {
					c.merge(<-c.opCh)
				}
				continue
			}
			return
		default:
		}

	}
}
   339  
   340  // commit operations unless already shutting down.
   341  func (c *ServiceClient) commit(ops *operations) {
   342  	select {
   343  	case c.opCh <- ops:
   344  	case <-c.shutdownCh:
   345  	}
   346  }
   347  
   348  // merge registrations into state map prior to sync'ing with Consul
   349  func (c *ServiceClient) merge(ops *operations) {
   350  	for _, s := range ops.regServices {
   351  		c.services[s.ID] = s
   352  	}
   353  	for _, check := range ops.regChecks {
   354  		c.checks[check.ID] = check
   355  	}
   356  	for _, s := range ops.scripts {
   357  		c.scripts[s.id] = s
   358  	}
   359  	for _, sid := range ops.deregServices {
   360  		delete(c.services, sid)
   361  	}
   362  	for _, cid := range ops.deregChecks {
   363  		if script, ok := c.runningScripts[cid]; ok {
   364  			script.cancel()
   365  			delete(c.scripts, cid)
   366  			delete(c.runningScripts, cid)
   367  		}
   368  		delete(c.checks, cid)
   369  	}
   370  	metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services)))
   371  	metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks)))
   372  	metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts)))
   373  }
   374  
// sync enqueued operations.
//
// sync reconciles the desired state in c.services and c.checks with what the
// Consul agent reports, in four passes: deregister Nomad services unknown
// locally, register local services missing from Consul, deregister Nomad
// checks unknown locally, and register local checks missing from Consul
// (starting the associated script check, if any). Entries for which
// isNomadService is false are never touched.
//
// It returns on the first error, incrementing the sync_failure counter;
// deregistration failures are skipped rather than returned when
// isOldNomadService reports true for the entry. On success Consul is marked
// as seen and a summary is logged.
func (c *ServiceClient) sync() error {
	// Counters for the summary log line at the end.
	sreg, creg, sdereg, cdereg := 0, 0, 0, 0

	consulServices, err := c.client.Services()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul services: %v", err)
	}

	consulChecks, err := c.client.Checks()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul checks: %v", err)
	}

	// Remove Nomad services in Consul but unknown locally
	for id := range consulServices {
		if _, ok := c.services[id]; ok {
			// Known service, skip
			continue
		}
		if !isNomadService(id) {
			// Not managed by Nomad, skip
			continue
		}

		// Unknown Nomad managed service; kill
		if err := c.client.ServiceDeregister(id); err != nil {
			if isOldNomadService(id) {
				// Don't hard-fail on old entries. See #3620
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		sdereg++
		metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
	}

	// Add Nomad services missing from Consul
	for id, locals := range c.services {
		if _, ok := consulServices[id]; !ok {
			if err = c.client.ServiceRegister(locals); err != nil {
				metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
				return err
			}
			sreg++
			metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
		}
	}

	// Remove Nomad checks in Consul but unknown locally
	for id, check := range consulChecks {
		if _, ok := c.checks[id]; ok {
			// Known check, leave it
			continue
		}
		// Ownership of a check is decided by the service it belongs to.
		if !isNomadService(check.ServiceID) {
			// Service not managed by Nomad, skip
			continue
		}

		// Unknown Nomad managed check; remove
		if err := c.client.CheckDeregister(id); err != nil {
			if isOldNomadService(check.ServiceID) {
				// Don't hard-fail on old entries.
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		cdereg++
		metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
	}

	// Add Nomad checks missing from Consul
	for id, check := range c.checks {
		if _, ok := consulChecks[id]; ok {
			// Already in Consul; skipping
			continue
		}

		if err := c.client.CheckRegister(check); err != nil {
			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		creg++
		metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)

		// Handle starting scripts. Scripts are only (re)started here, i.e.
		// when their check had to be registered in this pass.
		if script, ok := c.scripts[id]; ok {
			// If it's already running, cancel and replace
			if oldScript, running := c.runningScripts[id]; running {
				oldScript.cancel()
			}
			// Start and store the handle
			c.runningScripts[id] = script.run()
		}
	}

	// A Consul operation has succeeded, mark Consul as having been seen
	c.markSeen()

	c.logger.Printf("[DEBUG] consul.sync: registered %d services, %d checks; deregistered %d services, %d checks",
		sreg, creg, sdereg, cdereg)
	return nil
}
   485  
   486  // RegisterAgent registers Nomad agents (client or server). The
   487  // Service.PortLabel should be a literal port to be parsed with SplitHostPort.
   488  // Script checks are not supported and will return an error. Registration is
   489  // asynchronous.
   490  //
   491  // Agents will be deregistered when Shutdown is called.
   492  func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
   493  	ops := operations{}
   494  
   495  	for _, service := range services {
   496  		id := makeAgentServiceID(role, service)
   497  
   498  		// Unlike tasks, agents don't use port labels. Agent ports are
   499  		// stored directly in the PortLabel.
   500  		host, rawport, err := net.SplitHostPort(service.PortLabel)
   501  		if err != nil {
   502  			return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
   503  		}
   504  		port, err := strconv.Atoi(rawport)
   505  		if err != nil {
   506  			return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
   507  		}
   508  		serviceReg := &api.AgentServiceRegistration{
   509  			ID:      id,
   510  			Name:    service.Name,
   511  			Tags:    service.Tags,
   512  			Address: host,
   513  			Port:    port,
   514  		}
   515  		ops.regServices = append(ops.regServices, serviceReg)
   516  
   517  		for _, check := range service.Checks {
   518  			checkID := makeCheckID(id, check)
   519  			if check.Type == structs.ServiceCheckScript {
   520  				return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
   521  			}
   522  			checkHost, checkPort := serviceReg.Address, serviceReg.Port
   523  			if check.PortLabel != "" {
   524  				// Unlike tasks, agents don't use port labels. Agent ports are
   525  				// stored directly in the PortLabel.
   526  				host, rawport, err := net.SplitHostPort(check.PortLabel)
   527  				if err != nil {
   528  					return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err)
   529  				}
   530  				port, err := strconv.Atoi(rawport)
   531  				if err != nil {
   532  					return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err)
   533  				}
   534  				checkHost, checkPort = host, port
   535  			}
   536  			checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort)
   537  			if err != nil {
   538  				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
   539  			}
   540  			ops.regChecks = append(ops.regChecks, checkReg)
   541  		}
   542  	}
   543  
   544  	// Don't bother committing agent checks if we're already shutting down
   545  	c.agentLock.Lock()
   546  	defer c.agentLock.Unlock()
   547  	select {
   548  	case <-c.shutdownCh:
   549  		return nil
   550  	default:
   551  	}
   552  
   553  	// Now add them to the registration queue
   554  	c.commit(&ops)
   555  
   556  	// Record IDs for deregistering on shutdown
   557  	for _, id := range ops.regServices {
   558  		c.agentServices[id.ID] = struct{}{}
   559  	}
   560  	for _, id := range ops.regChecks {
   561  		c.agentChecks[id.ID] = struct{}{}
   562  	}
   563  	return nil
   564  }
   565  
   566  // serviceRegs creates service registrations, check registrations, and script
   567  // checks from a service. It returns a service registration object with the
   568  // service and check IDs populated.
   569  func (c *ServiceClient) serviceRegs(ops *operations, allocID string, service *structs.Service,
   570  	task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) (*ServiceRegistration, error) {
   571  
   572  	// Get the services ID
   573  	id := makeTaskServiceID(allocID, task.Name, service)
   574  	sreg := &ServiceRegistration{
   575  		serviceID: id,
   576  		checkIDs:  make(map[string]struct{}, len(service.Checks)),
   577  	}
   578  
   579  	// Service address modes default to auto
   580  	addrMode := service.AddressMode
   581  	if addrMode == "" {
   582  		addrMode = structs.AddressModeAuto
   583  	}
   584  
   585  	// Determine the address to advertise based on the mode
   586  	ip, port, err := getAddress(addrMode, service.PortLabel, task.Resources.Networks, net)
   587  	if err != nil {
   588  		return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err)
   589  	}
   590  
   591  	// Build the Consul Service registration request
   592  	serviceReg := &api.AgentServiceRegistration{
   593  		ID:      id,
   594  		Name:    service.Name,
   595  		Tags:    make([]string, len(service.Tags)),
   596  		Address: ip,
   597  		Port:    port,
   598  	}
   599  	// copy isn't strictly necessary but can avoid bugs especially
   600  	// with tests that may reuse Tasks
   601  	copy(serviceReg.Tags, service.Tags)
   602  	ops.regServices = append(ops.regServices, serviceReg)
   603  
   604  	// Build the check registrations
   605  	checkIDs, err := c.checkRegs(ops, allocID, id, service, task, exec, net)
   606  	if err != nil {
   607  		return nil, err
   608  	}
   609  	for _, cid := range checkIDs {
   610  		sreg.checkIDs[cid] = struct{}{}
   611  	}
   612  	return sreg, nil
   613  }
   614  
   615  // checkRegs registers the checks for the given service and returns the
   616  // registered check ids.
   617  func (c *ServiceClient) checkRegs(ops *operations, allocID, serviceID string, service *structs.Service,
   618  	task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) ([]string, error) {
   619  
   620  	// Fast path
   621  	numChecks := len(service.Checks)
   622  	if numChecks == 0 {
   623  		return nil, nil
   624  	}
   625  
   626  	checkIDs := make([]string, 0, numChecks)
   627  	for _, check := range service.Checks {
   628  		if check.TLSSkipVerify && !c.skipVerifySupport {
   629  			c.logger.Printf("[WARN] consul.sync: skipping check %q for task %q alloc %q because Consul doesn't support tls_skip_verify. Please upgrade to Consul >= 0.7.2.",
   630  				check.Name, task.Name, allocID)
   631  			continue
   632  		}
   633  		checkID := makeCheckID(serviceID, check)
   634  		checkIDs = append(checkIDs, checkID)
   635  		if check.Type == structs.ServiceCheckScript {
   636  			if exec == nil {
   637  				return nil, fmt.Errorf("driver doesn't support script checks")
   638  			}
   639  			ops.scripts = append(ops.scripts, newScriptCheck(
   640  				allocID, task.Name, checkID, check, exec, c.client, c.logger, c.shutdownCh))
   641  
   642  			// Skip getAddress for script checks
   643  			checkReg, err := createCheckReg(serviceID, checkID, check, "", 0)
   644  			if err != nil {
   645  				return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err)
   646  			}
   647  			ops.regChecks = append(ops.regChecks, checkReg)
   648  			continue
   649  		}
   650  
   651  		// Default to the service's port but allow check to override
   652  		portLabel := check.PortLabel
   653  		if portLabel == "" {
   654  			// Default to the service's port label
   655  			portLabel = service.PortLabel
   656  		}
   657  
   658  		// Checks address mode defaults to host for pre-#3380 backward compat
   659  		addrMode := check.AddressMode
   660  		if addrMode == "" {
   661  			addrMode = structs.AddressModeHost
   662  		}
   663  
   664  		ip, port, err := getAddress(addrMode, portLabel, task.Resources.Networks, net)
   665  		if err != nil {
   666  			return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err)
   667  		}
   668  
   669  		checkReg, err := createCheckReg(serviceID, checkID, check, ip, port)
   670  		if err != nil {
   671  			return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err)
   672  		}
   673  		ops.regChecks = append(ops.regChecks, checkReg)
   674  	}
   675  	return checkIDs, nil
   676  }
   677  
   678  // RegisterTask with Consul. Adds all service entries and checks to Consul. If
   679  // exec is nil and a script check exists an error is returned.
   680  //
   681  // If the service IP is set it used as the address in the service registration.
   682  // Checks will always use the IP from the Task struct (host's IP).
   683  //
   684  // Actual communication with Consul is done asynchrously (see Run).
   685  func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error {
   686  	// Fast path
   687  	numServices := len(task.Services)
   688  	if numServices == 0 {
   689  		return nil
   690  	}
   691  
   692  	t := new(TaskRegistration)
   693  	t.Services = make(map[string]*ServiceRegistration, numServices)
   694  
   695  	ops := &operations{}
   696  	for _, service := range task.Services {
   697  		sreg, err := c.serviceRegs(ops, allocID, service, task, exec, net)
   698  		if err != nil {
   699  			return err
   700  		}
   701  		t.Services[sreg.serviceID] = sreg
   702  	}
   703  
   704  	// Add the task to the allocation's registration
   705  	c.addTaskRegistration(allocID, task.Name, t)
   706  
   707  	c.commit(ops)
   708  
   709  	// Start watching checks. Done after service registrations are built
   710  	// since an error building them could leak watches.
   711  	for _, service := range task.Services {
   712  		serviceID := makeTaskServiceID(allocID, task.Name, service)
   713  		for _, check := range service.Checks {
   714  			if check.TriggersRestarts() {
   715  				checkID := makeCheckID(serviceID, check)
   716  				c.checkWatcher.Watch(allocID, task.Name, checkID, check, restarter)
   717  			}
   718  		}
   719  	}
   720  	return nil
   721  }
   722  
   723  // UpdateTask in Consul. Does not alter the service if only checks have
   724  // changed.
   725  //
   726  // DriverNetwork must not change between invocations for the same allocation.
   727  func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error {
   728  	ops := &operations{}
   729  
   730  	taskReg := new(TaskRegistration)
   731  	taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services))
   732  
   733  	existingIDs := make(map[string]*structs.Service, len(existing.Services))
   734  	for _, s := range existing.Services {
   735  		existingIDs[makeTaskServiceID(allocID, existing.Name, s)] = s
   736  	}
   737  	newIDs := make(map[string]*structs.Service, len(newTask.Services))
   738  	for _, s := range newTask.Services {
   739  		newIDs[makeTaskServiceID(allocID, newTask.Name, s)] = s
   740  	}
   741  
   742  	// Loop over existing Service IDs to see if they have been removed or
   743  	// updated.
   744  	for existingID, existingSvc := range existingIDs {
   745  		newSvc, ok := newIDs[existingID]
   746  		if !ok {
   747  			// Existing service entry removed
   748  			ops.deregServices = append(ops.deregServices, existingID)
   749  			for _, check := range existingSvc.Checks {
   750  				cid := makeCheckID(existingID, check)
   751  				ops.deregChecks = append(ops.deregChecks, cid)
   752  
   753  				// Unwatch watched checks
   754  				if check.TriggersRestarts() {
   755  					c.checkWatcher.Unwatch(cid)
   756  				}
   757  			}
   758  			continue
   759  		}
   760  
   761  		// Service exists and hasn't changed, don't re-add it later
   762  		delete(newIDs, existingID)
   763  
   764  		// Service still exists so add it to the task's registration
   765  		sreg := &ServiceRegistration{
   766  			serviceID: existingID,
   767  			checkIDs:  make(map[string]struct{}, len(newSvc.Checks)),
   768  		}
   769  		taskReg.Services[existingID] = sreg
   770  
   771  		// See if any checks were updated
   772  		existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
   773  		for _, check := range existingSvc.Checks {
   774  			existingChecks[makeCheckID(existingID, check)] = check
   775  		}
   776  
   777  		// Register new checks
   778  		for _, check := range newSvc.Checks {
   779  			checkID := makeCheckID(existingID, check)
   780  			if _, exists := existingChecks[checkID]; exists {
   781  				// Check exists, so don't remove it
   782  				delete(existingChecks, checkID)
   783  				sreg.checkIDs[checkID] = struct{}{}
   784  			}
   785  
   786  			// New check on an unchanged service; add them now
   787  			newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net)
   788  			if err != nil {
   789  				return err
   790  			}
   791  
   792  			for _, checkID := range newCheckIDs {
   793  				sreg.checkIDs[checkID] = struct{}{}
   794  
   795  			}
   796  
   797  			// Update all watched checks as CheckRestart fields aren't part of ID
   798  			if check.TriggersRestarts() {
   799  				c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter)
   800  			}
   801  		}
   802  
   803  		// Remove existing checks not in updated service
   804  		for cid, check := range existingChecks {
   805  			ops.deregChecks = append(ops.deregChecks, cid)
   806  
   807  			// Unwatch checks
   808  			if check.TriggersRestarts() {
   809  				c.checkWatcher.Unwatch(cid)
   810  			}
   811  		}
   812  	}
   813  
   814  	// Any remaining services should just be enqueued directly
   815  	for _, newSvc := range newIDs {
   816  		sreg, err := c.serviceRegs(ops, allocID, newSvc, newTask, exec, net)
   817  		if err != nil {
   818  			return err
   819  		}
   820  
   821  		taskReg.Services[sreg.serviceID] = sreg
   822  	}
   823  
   824  	// Add the task to the allocation's registration
   825  	c.addTaskRegistration(allocID, newTask.Name, taskReg)
   826  
   827  	c.commit(ops)
   828  
   829  	// Start watching checks. Done after service registrations are built
   830  	// since an error building them could leak watches.
   831  	for _, service := range newIDs {
   832  		serviceID := makeTaskServiceID(allocID, newTask.Name, service)
   833  		for _, check := range service.Checks {
   834  			if check.TriggersRestarts() {
   835  				checkID := makeCheckID(serviceID, check)
   836  				c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter)
   837  			}
   838  		}
   839  	}
   840  	return nil
   841  }
   842  
   843  // RemoveTask from Consul. Removes all service entries and checks.
   844  //
   845  // Actual communication with Consul is done asynchrously (see Run).
   846  func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) {
   847  	ops := operations{}
   848  
   849  	for _, service := range task.Services {
   850  		id := makeTaskServiceID(allocID, task.Name, service)
   851  		ops.deregServices = append(ops.deregServices, id)
   852  
   853  		for _, check := range service.Checks {
   854  			cid := makeCheckID(id, check)
   855  			ops.deregChecks = append(ops.deregChecks, cid)
   856  
   857  			if check.TriggersRestarts() {
   858  				c.checkWatcher.Unwatch(cid)
   859  			}
   860  		}
   861  	}
   862  
   863  	// Remove the task from the alloc's registrations
   864  	c.removeTaskRegistration(allocID, task.Name)
   865  
   866  	// Now add them to the deregistration fields; main Run loop will update
   867  	c.commit(&ops)
   868  }
   869  
   870  // AllocRegistrations returns the registrations for the given allocation. If the
   871  // allocation has no reservations, the response is a nil object.
   872  func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) {
   873  	// Get the internal struct using the lock
   874  	c.allocRegistrationsLock.RLock()
   875  	regInternal, ok := c.allocRegistrations[allocID]
   876  	if !ok {
   877  		c.allocRegistrationsLock.RUnlock()
   878  		return nil, nil
   879  	}
   880  
   881  	// Copy so we don't expose internal structs
   882  	reg := regInternal.copy()
   883  	c.allocRegistrationsLock.RUnlock()
   884  
   885  	// Query the services and checks to populate the allocation registrations.
   886  	services, err := c.client.Services()
   887  	if err != nil {
   888  		return nil, err
   889  	}
   890  
   891  	checks, err := c.client.Checks()
   892  	if err != nil {
   893  		return nil, err
   894  	}
   895  
   896  	// Populate the object
   897  	for _, treg := range reg.Tasks {
   898  		for serviceID, sreg := range treg.Services {
   899  			sreg.Service = services[serviceID]
   900  			for checkID := range sreg.checkIDs {
   901  				if check, ok := checks[checkID]; ok {
   902  					sreg.Checks = append(sreg.Checks, check)
   903  				}
   904  			}
   905  		}
   906  	}
   907  
   908  	return reg, nil
   909  }
   910  
   911  // Shutdown the Consul client. Update running task registations and deregister
   912  // agent from Consul. On first call blocks up to shutdownWait before giving up
   913  // on syncing operations.
   914  func (c *ServiceClient) Shutdown() error {
   915  	// Serialize Shutdown calls with RegisterAgent to prevent leaking agent
   916  	// entries.
   917  	c.agentLock.Lock()
   918  	defer c.agentLock.Unlock()
   919  	select {
   920  	case <-c.shutdownCh:
   921  		return nil
   922  	default:
   923  		close(c.shutdownCh)
   924  	}
   925  
   926  	// Give run loop time to sync, but don't block indefinitely
   927  	deadline := time.After(c.shutdownWait)
   928  
   929  	// Wait for Run to finish any outstanding operations and exit
   930  	select {
   931  	case <-c.exitCh:
   932  	case <-deadline:
   933  		// Don't wait forever though
   934  	}
   935  
   936  	// If Consul was never seen nothing could be written so exit early
   937  	if !c.hasSeen() {
   938  		return nil
   939  	}
   940  
   941  	// Always attempt to deregister Nomad agent Consul entries, even if
   942  	// deadline was reached
   943  	for id := range c.agentServices {
   944  		if err := c.client.ServiceDeregister(id); err != nil {
   945  			c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err)
   946  		}
   947  	}
   948  	for id := range c.agentChecks {
   949  		if err := c.client.CheckDeregister(id); err != nil {
   950  			c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err)
   951  		}
   952  	}
   953  
   954  	// Give script checks time to exit (no need to lock as Run() has exited)
   955  	for _, h := range c.runningScripts {
   956  		select {
   957  		case <-h.wait():
   958  		case <-deadline:
   959  			return fmt.Errorf("timed out waiting for script checks to run")
   960  		}
   961  	}
   962  	return nil
   963  }
   964  
   965  // addTaskRegistration adds the task registration for the given allocation.
   966  func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) {
   967  	c.allocRegistrationsLock.Lock()
   968  	defer c.allocRegistrationsLock.Unlock()
   969  
   970  	alloc, ok := c.allocRegistrations[allocID]
   971  	if !ok {
   972  		alloc = &AllocRegistration{
   973  			Tasks: make(map[string]*TaskRegistration),
   974  		}
   975  		c.allocRegistrations[allocID] = alloc
   976  	}
   977  	alloc.Tasks[taskName] = reg
   978  }
   979  
   980  // removeTaskRegistration removes the task registration for the given allocation.
   981  func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) {
   982  	c.allocRegistrationsLock.Lock()
   983  	defer c.allocRegistrationsLock.Unlock()
   984  
   985  	alloc, ok := c.allocRegistrations[allocID]
   986  	if !ok {
   987  		return
   988  	}
   989  
   990  	// Delete the task and if it is the last one also delete the alloc's
   991  	// registration
   992  	delete(alloc.Tasks, taskName)
   993  	if len(alloc.Tasks) == 0 {
   994  		delete(c.allocRegistrations, allocID)
   995  	}
   996  }
   997  
   998  // makeAgentServiceID creates a unique ID for identifying an agent service in
   999  // Consul.
  1000  //
  1001  // Agent service IDs are of the form:
  1002  //
  1003  //	{nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...})
  1004  //	Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4
  1005  //	Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l
  1006  //
  1007  func makeAgentServiceID(role string, service *structs.Service) string {
  1008  	return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, ""))
  1009  }
  1010  
  1011  // makeTaskServiceID creates a unique ID for identifying a task service in
  1012  // Consul. All structs.Service fields are included in the ID's hash except
  1013  // Checks. This allows updates to merely compare IDs.
  1014  //
  1015  //	Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH
  1016  func makeTaskServiceID(allocID, taskName string, service *structs.Service) string {
  1017  	return nomadTaskPrefix + service.Hash(allocID, taskName)
  1018  }
  1019  
  1020  // makeCheckID creates a unique ID for a check.
  1021  func makeCheckID(serviceID string, check *structs.ServiceCheck) string {
  1022  	return check.Hash(serviceID)
  1023  }
  1024  
  1025  // createCheckReg creates a Check that can be registered with Consul.
  1026  //
  1027  // Script checks simply have a TTL set and the caller is responsible for
  1028  // running the script and heartbeating.
  1029  func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) {
  1030  	chkReg := api.AgentCheckRegistration{
  1031  		ID:        checkID,
  1032  		Name:      check.Name,
  1033  		ServiceID: serviceID,
  1034  	}
  1035  	chkReg.Status = check.InitialStatus
  1036  	chkReg.Timeout = check.Timeout.String()
  1037  	chkReg.Interval = check.Interval.String()
  1038  
  1039  	// Require an address for http or tcp checks
  1040  	if port == 0 && check.RequiresPort() {
  1041  		return nil, fmt.Errorf("%s checks require an address", check.Type)
  1042  	}
  1043  
  1044  	switch check.Type {
  1045  	case structs.ServiceCheckHTTP:
  1046  		proto := check.Protocol
  1047  		if proto == "" {
  1048  			proto = "http"
  1049  		}
  1050  		if check.TLSSkipVerify {
  1051  			chkReg.TLSSkipVerify = true
  1052  		}
  1053  		base := url.URL{
  1054  			Scheme: proto,
  1055  			Host:   net.JoinHostPort(host, strconv.Itoa(port)),
  1056  		}
  1057  		relative, err := url.Parse(check.Path)
  1058  		if err != nil {
  1059  			return nil, err
  1060  		}
  1061  		url := base.ResolveReference(relative)
  1062  		chkReg.HTTP = url.String()
  1063  		chkReg.Method = check.Method
  1064  		chkReg.Header = check.Header
  1065  	case structs.ServiceCheckTCP:
  1066  		chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))
  1067  	case structs.ServiceCheckScript:
  1068  		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
  1069  		// As of Consul 1.0.0 setting TTL and Interval is a 400
  1070  		chkReg.Interval = ""
  1071  	default:
  1072  		return nil, fmt.Errorf("check type %+q not valid", check.Type)
  1073  	}
  1074  	return &chkReg, nil
  1075  }
  1076  
  1077  // isNomadService returns true if the ID matches the pattern of a Nomad managed
  1078  // service (new or old formats). Agent services return false as independent
  1079  // client and server agents may be running on the same machine. #2827
  1080  func isNomadService(id string) bool {
  1081  	return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
  1082  }
  1083  
  1084  // isOldNomadService returns true if the ID matches an old pattern managed by
  1085  // Nomad.
  1086  //
  1087  // Pre-0.7.1 task service IDs are of the form:
  1088  //
  1089  //	{nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
  1090  //	Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
  1091  //
  1092  func isOldNomadService(id string) bool {
  1093  	const prefix = nomadServicePrefix + "-executor"
  1094  	return strings.HasPrefix(id, prefix)
  1095  }
  1096  
  1097  // getAddress returns the IP and port to use for a service or check. If no port
  1098  // label is specified (an empty value), zero values are returned because no
  1099  // address could be resolved.
  1100  func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *cstructs.DriverNetwork) (string, int, error) {
  1101  	// No port label specified, no address can be assembled
  1102  	if portLabel == "" {
  1103  		return "", 0, nil
  1104  	}
  1105  
  1106  	switch addrMode {
  1107  	case structs.AddressModeAuto:
  1108  		if driverNet.Advertise() {
  1109  			addrMode = structs.AddressModeDriver
  1110  		} else {
  1111  			addrMode = structs.AddressModeHost
  1112  		}
  1113  		return getAddress(addrMode, portLabel, networks, driverNet)
  1114  	case structs.AddressModeHost:
  1115  		// Default path: use host ip:port
  1116  		ip, port := networks.Port(portLabel)
  1117  		if ip == "" && port <= 0 {
  1118  			return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel)
  1119  		}
  1120  		return ip, port, nil
  1121  
  1122  	case structs.AddressModeDriver:
  1123  		// Require a driver network if driver address mode is used
  1124  		if driverNet == nil {
  1125  			return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`)
  1126  		}
  1127  
  1128  		// If the port is a label, use the driver's port (not the host's)
  1129  		if port, ok := driverNet.PortMap[portLabel]; ok {
  1130  			return driverNet.IP, port, nil
  1131  		}
  1132  
  1133  		// If port isn't a label, try to parse it as a literal port number
  1134  		port, err := strconv.Atoi(portLabel)
  1135  		if err != nil {
  1136  			return "", 0, fmt.Errorf("invalid port %q: %v", portLabel, err)
  1137  		}
  1138  		if port <= 0 {
  1139  			return "", 0, fmt.Errorf("invalid port: %q: port 0 is invalid", portLabel)
  1140  		}
  1141  
  1142  		return driverNet.IP, port, nil
  1143  
  1144  	default:
  1145  		// Shouldn't happen due to validation, but enforce invariants
  1146  		return "", 0, fmt.Errorf("invalid address mode %q", addrMode)
  1147  	}
  1148  }