github.com/djenriquez/nomad-1@v0.8.1/command/agent/consul/client.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"net"
     8  	"net/url"
     9  	"strconv"
    10  	"strings"
    11  	"sync"
    12  	"sync/atomic"
    13  	"time"
    14  
    15  	metrics "github.com/armon/go-metrics"
    16  	"github.com/hashicorp/consul/api"
    17  	"github.com/hashicorp/nomad/client/driver"
    18  	cstructs "github.com/hashicorp/nomad/client/structs"
    19  	"github.com/hashicorp/nomad/helper"
    20  	"github.com/hashicorp/nomad/nomad/structs"
    21  )
    22  
const (
	// nomadServicePrefix is the prefix that scopes all Nomad registered
	// services (both agent and task entries).
	nomadServicePrefix = "_nomad"

	// nomadTaskPrefix is the prefix that scopes Nomad registered services
	// for tasks. It is built on top of nomadServicePrefix.
	nomadTaskPrefix = nomadServicePrefix + "-task-"

	// defaultRetryInterval is how quickly to retry syncing services and
	// checks to Consul when an error occurs. Retries back off up to
	// defaultMaxRetryInterval.
	defaultRetryInterval = time.Second

	// defaultMaxRetryInterval is the default upper bound on the retry
	// interval used when backing off after sync failures.
	defaultMaxRetryInterval = 30 * time.Second

	// ttlCheckBuffer is the amount of time Nomad is allowed to take to
	// report a check result to Consul.
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services.
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services.
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services.
	ServiceTagSerf = "serf"
)
    60  
// CatalogAPI is the consul/api.Catalog API used by Nomad. It is declared as an
// interface so that only the subset of catalog calls Nomad needs is exposed.
type CatalogAPI interface {
	// Datacenters returns a list of strings or an error.
	// NOTE(review): presumably the known Consul datacenter names — confirm
	// against the consul/api.Catalog documentation.
	Datacenters() ([]string, error)

	// Service looks up catalog entries for the given service name and tag
	// using the supplied query options.
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}
    66  
// AgentAPI is the consul/api.Agent API used by Nomad. ServiceClient.sync uses
// Services and Checks to read what the agent currently has registered and the
// Register/Deregister methods to reconcile that with Nomad's desired state.
type AgentAPI interface {
	// Services returns the services known to the agent, keyed by a string
	// that sync treats as the service ID.
	Services() (map[string]*api.AgentService, error)

	// Checks returns the checks known to the agent, keyed by a string that
	// sync treats as the check ID.
	Checks() (map[string]*api.AgentCheck, error)

	// CheckRegister registers the given check with the agent.
	CheckRegister(check *api.AgentCheckRegistration) error

	// CheckDeregister removes the check with the given ID from the agent.
	CheckDeregister(checkID string) error

	// Self returns a nested map of information.
	// NOTE(review): presumably the agent's own configuration and member
	// data — confirm against the consul/api.Agent documentation.
	Self() (map[string]map[string]interface{}, error)

	// ServiceRegister registers the given service with the agent.
	ServiceRegister(service *api.AgentServiceRegistration) error

	// ServiceDeregister removes the service with the given ID from the agent.
	ServiceDeregister(serviceID string) error

	// UpdateTTL takes a check ID, an output string and a status string.
	// NOTE(review): presumably reports the result of a TTL check — confirm
	// against the consul/api.Agent documentation.
	UpdateTTL(id, output, status string) error
}
    78  
// operations are submitted to the main loop via commit() for synchronizing
// with Consul. Each instance is a batch of registrations and deregistrations
// that merge() folds into the ServiceClient's desired state.
type operations struct {
	// regServices are service registrations to add, keyed by their ID when
	// merged.
	regServices []*api.AgentServiceRegistration
	// regChecks are check registrations to add, keyed by their ID when merged.
	regChecks []*api.AgentCheckRegistration
	// scripts are script checks to track, keyed by their id when merged.
	scripts []*scriptCheck

	// deregServices are the IDs of services to drop from the desired state.
	deregServices []string
	// deregChecks are the IDs of checks to drop from the desired state; any
	// running script with the same ID is cancelled when merged.
	deregChecks []string
}
    89  
// AllocRegistration holds the status of services registered for a particular
// allocation, grouped by task.
type AllocRegistration struct {
	// Tasks maps the name of a task to its registered services and checks
	Tasks map[string]*TaskRegistration
}
    96  
    97  func (a *AllocRegistration) copy() *AllocRegistration {
    98  	c := &AllocRegistration{
    99  		Tasks: make(map[string]*TaskRegistration, len(a.Tasks)),
   100  	}
   101  
   102  	for k, v := range a.Tasks {
   103  		c.Tasks[k] = v.copy()
   104  	}
   105  
   106  	return c
   107  }
   108  
   109  // NumServices returns the number of registered services
   110  func (a *AllocRegistration) NumServices() int {
   111  	if a == nil {
   112  		return 0
   113  	}
   114  
   115  	total := 0
   116  	for _, treg := range a.Tasks {
   117  		for _, sreg := range treg.Services {
   118  			if sreg.Service != nil {
   119  				total++
   120  			}
   121  		}
   122  	}
   123  
   124  	return total
   125  }
   126  
   127  // NumChecks returns the number of registered checks
   128  func (a *AllocRegistration) NumChecks() int {
   129  	if a == nil {
   130  		return 0
   131  	}
   132  
   133  	total := 0
   134  	for _, treg := range a.Tasks {
   135  		for _, sreg := range treg.Services {
   136  			total += len(sreg.Checks)
   137  		}
   138  	}
   139  
   140  	return total
   141  }
   142  
// TaskRegistration holds the status of services registered for a particular
// task.
type TaskRegistration struct {
	// Services maps a service ID to that service's registration.
	Services map[string]*ServiceRegistration
}
   148  
   149  func (t *TaskRegistration) copy() *TaskRegistration {
   150  	c := &TaskRegistration{
   151  		Services: make(map[string]*ServiceRegistration, len(t.Services)),
   152  	}
   153  
   154  	for k, v := range t.Services {
   155  		c.Services[k] = v.copy()
   156  	}
   157  
   158  	return c
   159  }
   160  
// ServiceRegistration holds the status of a registered Consul Service and its
// Checks.
type ServiceRegistration struct {
	// serviceID and checkIDs are internal fields that track just the IDs of the
	// services/checks registered in Consul. They are used to materialize the
	// other fields when queried.
	serviceID string
	checkIDs  map[string]struct{}

	// Service is the AgentService registered in Consul.
	Service *api.AgentService

	// Checks is the status of the registered checks.
	Checks []*api.AgentCheck
}
   176  
   177  func (s *ServiceRegistration) copy() *ServiceRegistration {
   178  	// Copy does not copy the external fields but only the internal fields. This
   179  	// is so that the caller of AllocRegistrations can not access the internal
   180  	// fields and that method uses these fields to populate the external fields.
   181  	return &ServiceRegistration{
   182  		serviceID: s.serviceID,
   183  		checkIDs:  helper.CopyMapStringStruct(s.checkIDs),
   184  	}
   185  }
   186  
// ServiceClient handles task and agent service registration with Consul.
// Callers enqueue operations which the Run loop merges into the desired state
// and then reconciles against the Consul agent.
type ServiceClient struct {
	// client is the Consul agent API used to query and update registrations
	client AgentAPI
	logger *log.Logger

	// retryInterval is the base delay between sync retries after a failure;
	// Run multiplies it by the number of consecutive failures and caps the
	// result at maxRetryInterval
	retryInterval    time.Duration
	maxRetryInterval time.Duration

	// exitCh is closed when the main Run loop exits
	exitCh chan struct{}

	// shutdownCh is closed when the client should shutdown
	shutdownCh chan struct{}

	// shutdownWait is how long Shutdown() blocks waiting for the final
	// sync() to finish. Defaults to defaultShutdownWait
	shutdownWait time.Duration

	// opCh carries batches of operations from commit() to the Run loop
	opCh chan *operations

	// services, checks and scripts are the desired state built up by
	// merge(), keyed by ID. runningScripts holds the handles of script
	// checks that sync() has started.
	services       map[string]*api.AgentServiceRegistration
	checks         map[string]*api.AgentCheckRegistration
	scripts        map[string]*scriptCheck
	runningScripts map[string]*scriptHandle

	// allocRegistrations stores the services and checks that are registered
	// with Consul by allocation ID.
	allocRegistrations     map[string]*AllocRegistration
	allocRegistrationsLock sync.RWMutex

	// agent services and checks record entries for the agent itself which
	// should be removed on shutdown
	agentServices map[string]struct{}
	agentChecks   map[string]struct{}
	agentLock     sync.Mutex

	// seen is 1 if Consul has ever been seen; otherwise 0. Accessed with
	// atomics.
	seen int32

	// checkWatcher restarts checks that are unhealthy.
	checkWatcher *checkWatcher
}
   229  
   230  // NewServiceClient creates a new Consul ServiceClient from an existing Consul API
   231  // Client and logger.
   232  func NewServiceClient(consulClient AgentAPI, logger *log.Logger) *ServiceClient {
   233  	return &ServiceClient{
   234  		client:             consulClient,
   235  		logger:             logger,
   236  		retryInterval:      defaultRetryInterval,
   237  		maxRetryInterval:   defaultMaxRetryInterval,
   238  		exitCh:             make(chan struct{}),
   239  		shutdownCh:         make(chan struct{}),
   240  		shutdownWait:       defaultShutdownWait,
   241  		opCh:               make(chan *operations, 8),
   242  		services:           make(map[string]*api.AgentServiceRegistration),
   243  		checks:             make(map[string]*api.AgentCheckRegistration),
   244  		scripts:            make(map[string]*scriptCheck),
   245  		runningScripts:     make(map[string]*scriptHandle),
   246  		allocRegistrations: make(map[string]*AllocRegistration),
   247  		agentServices:      make(map[string]struct{}),
   248  		agentChecks:        make(map[string]struct{}),
   249  		checkWatcher:       newCheckWatcher(logger, consulClient),
   250  	}
   251  }
   252  
// seen is the value markSeen stores in, and hasSeen compares against,
// ServiceClient.seen once Consul has been contacted.
const seen = 1
   255  
   256  // markSeen marks Consul as having been seen (meaning at least one operation
   257  // has succeeded).
   258  func (c *ServiceClient) markSeen() {
   259  	atomic.StoreInt32(&c.seen, seen)
   260  }
   261  
   262  // hasSeen returns true if any Consul operation has ever succeeded. Useful to
   263  // squelch errors if Consul isn't running.
   264  func (c *ServiceClient) hasSeen() bool {
   265  	return atomic.LoadInt32(&c.seen) == seen
   266  }
   267  
// Run the Consul main loop which retries operations against Consul. It should
// be called exactly once.
//
// Run first waits for initial contact with Consul, merging (but not syncing)
// any operations that arrive in the meantime. It then syncs whenever new
// operations arrive, the retry timer fires, or shutdown is requested. Failed
// syncs are retried with a linearly growing backoff capped at
// maxRetryInterval. exitCh is closed when Run returns.
func (c *ServiceClient) Run() {
	defer close(c.exitCh)

	// ctx is handed to the background goroutines started below so they can
	// be cancelled on shutdown or when Run returns.
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// init will be closed when Consul has been contacted
	init := make(chan struct{})
	go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init)

	// Process operations while waiting for initial contact with Consul but
	// do not sync until contact has been made.
	hasOps := false
INIT:
	for {
		select {
		case <-init:
			c.markSeen()
			break INIT
		case <-c.shutdownCh:
			// Shutdown before Consul was ever contacted; nothing to sync
			return
		case ops := <-c.opCh:
			hasOps = true
			c.merge(ops)
		}
	}
	c.logger.Printf("[TRACE] consul.sync: able to contact Consul")

	// Contact with Consul has been established, so start the checkWatcher
	go c.checkWatcher.Run(ctx)

	// A zero-duration timer fires immediately, triggering a first sync for
	// any operations merged during the INIT loop.
	retryTimer := time.NewTimer(0)
	if !hasOps {
		// No pending operations so don't immediately sync
		<-retryTimer.C
	}

	// failures counts consecutive sync errors; it drives both the log
	// throttling and the backoff below.
	failures := 0
	for {
		select {
		case <-retryTimer.C:
		case <-c.shutdownCh:
			// Cancel check watcher but sync one last time
			cancel()
		case ops := <-c.opCh:
			c.merge(ops)
		}

		if err := c.sync(); err != nil {
			if failures == 0 {
				// Log on the first failure
				c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err)
			} else if failures%10 == 0 {
				// Log every 10th consecutive failure
				c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err)
			}

			failures++
			if !retryTimer.Stop() {
				// Timer already expired, since the timer may
				// or may not have been read in the select{}
				// above, conditionally receive on it
				select {
				case <-retryTimer.C:
				default:
				}
			}
			// Linear backoff: retryInterval times the number of
			// consecutive failures, capped at maxRetryInterval
			backoff := c.retryInterval * time.Duration(failures)
			if backoff > c.maxRetryInterval {
				backoff = c.maxRetryInterval
			}
			retryTimer.Reset(backoff)
		} else {
			if failures > 0 {
				c.logger.Printf("[INFO] consul.sync: successfully updated services in Consul")
				failures = 0
			}
		}

		// Non-blocking shutdown check after every sync attempt
		select {
		case <-c.shutdownCh:
			// Exit only after sync'ing all outstanding operations
			if len(c.opCh) > 0 {
				for len(c.opCh) > 0 {
					c.merge(<-c.opCh)
				}
				continue
			}
			return
		default:
		}

	}
}
   365  
   366  // commit operations unless already shutting down.
   367  func (c *ServiceClient) commit(ops *operations) {
   368  	select {
   369  	case c.opCh <- ops:
   370  	case <-c.shutdownCh:
   371  	}
   372  }
   373  
   374  // merge registrations into state map prior to sync'ing with Consul
   375  func (c *ServiceClient) merge(ops *operations) {
   376  	for _, s := range ops.regServices {
   377  		c.services[s.ID] = s
   378  	}
   379  	for _, check := range ops.regChecks {
   380  		c.checks[check.ID] = check
   381  	}
   382  	for _, s := range ops.scripts {
   383  		c.scripts[s.id] = s
   384  	}
   385  	for _, sid := range ops.deregServices {
   386  		delete(c.services, sid)
   387  	}
   388  	for _, cid := range ops.deregChecks {
   389  		if script, ok := c.runningScripts[cid]; ok {
   390  			script.cancel()
   391  			delete(c.scripts, cid)
   392  			delete(c.runningScripts, cid)
   393  		}
   394  		delete(c.checks, cid)
   395  	}
   396  	metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services)))
   397  	metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks)))
   398  	metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts)))
   399  }
   400  
// sync reconciles the desired state built by merge() with what the Consul
// agent currently has registered.
//
// It runs four passes in order: deregister Nomad services Consul has but we
// don't, register services we have but Consul doesn't, deregister Nomad checks
// Consul has but we don't, and register checks we have but Consul doesn't
// (starting any associated script check). The first error aborts the sync and
// is returned so the caller can retry; failures to deregister entries using
// the old Nomad ID format are skipped instead. Every returned failure
// increments the sync_failure counter.
func (c *ServiceClient) sync() error {
	// Counters for the summary log line at the end
	sreg, creg, sdereg, cdereg := 0, 0, 0, 0

	consulServices, err := c.client.Services()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul services: %v", err)
	}

	consulChecks, err := c.client.Checks()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("error querying Consul checks: %v", err)
	}

	// Remove Nomad services in Consul but unknown locally
	for id := range consulServices {
		if _, ok := c.services[id]; ok {
			// Known service, skip
			continue
		}
		if !isNomadService(id) {
			// Not managed by Nomad, skip
			continue
		}

		// Unknown Nomad managed service; kill
		if err := c.client.ServiceDeregister(id); err != nil {
			if isOldNomadService(id) {
				// Don't hard-fail on old entries. See #3620
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		sdereg++
		metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
	}

	// Add Nomad services missing from Consul
	for id, locals := range c.services {
		if _, ok := consulServices[id]; !ok {
			if err = c.client.ServiceRegister(locals); err != nil {
				metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
				return err
			}
			sreg++
			metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
		}
	}

	// Remove Nomad checks in Consul but unknown locally
	for id, check := range consulChecks {
		if _, ok := c.checks[id]; ok {
			// Known check, leave it
			continue
		}
		// Ownership is decided by the ID of the service the check belongs to
		if !isNomadService(check.ServiceID) {
			// Service not managed by Nomad, skip
			continue
		}

		// Unknown Nomad managed check; remove
		if err := c.client.CheckDeregister(id); err != nil {
			if isOldNomadService(check.ServiceID) {
				// Don't hard-fail on old entries.
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		cdereg++
		metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
	}

	// Add Nomad checks missing from Consul
	for id, check := range c.checks {
		if _, ok := consulChecks[id]; ok {
			// Already in Consul; skipping
			continue
		}

		if err := c.client.CheckRegister(check); err != nil {
			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		creg++
		metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)

		// Handle starting scripts. Note this only happens for checks that
		// were just registered above.
		if script, ok := c.scripts[id]; ok {
			// If it's already running, cancel and replace
			if oldScript, running := c.runningScripts[id]; running {
				oldScript.cancel()
			}
			// Start and store the handle
			c.runningScripts[id] = script.run()
		}
	}

	c.logger.Printf("[DEBUG] consul.sync: registered %d services, %d checks; deregistered %d services, %d checks",
		sreg, creg, sdereg, cdereg)
	return nil
}
   508  
   509  // RegisterAgent registers Nomad agents (client or server). The
   510  // Service.PortLabel should be a literal port to be parsed with SplitHostPort.
   511  // Script checks are not supported and will return an error. Registration is
   512  // asynchronous.
   513  //
   514  // Agents will be deregistered when Shutdown is called.
   515  func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
   516  	ops := operations{}
   517  
   518  	for _, service := range services {
   519  		id := makeAgentServiceID(role, service)
   520  
   521  		// Unlike tasks, agents don't use port labels. Agent ports are
   522  		// stored directly in the PortLabel.
   523  		host, rawport, err := net.SplitHostPort(service.PortLabel)
   524  		if err != nil {
   525  			return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
   526  		}
   527  		port, err := strconv.Atoi(rawport)
   528  		if err != nil {
   529  			return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
   530  		}
   531  		serviceReg := &api.AgentServiceRegistration{
   532  			ID:      id,
   533  			Name:    service.Name,
   534  			Tags:    service.Tags,
   535  			Address: host,
   536  			Port:    port,
   537  		}
   538  		ops.regServices = append(ops.regServices, serviceReg)
   539  
   540  		for _, check := range service.Checks {
   541  			checkID := makeCheckID(id, check)
   542  			if check.Type == structs.ServiceCheckScript {
   543  				return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
   544  			}
   545  			checkHost, checkPort := serviceReg.Address, serviceReg.Port
   546  			if check.PortLabel != "" {
   547  				// Unlike tasks, agents don't use port labels. Agent ports are
   548  				// stored directly in the PortLabel.
   549  				host, rawport, err := net.SplitHostPort(check.PortLabel)
   550  				if err != nil {
   551  					return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err)
   552  				}
   553  				port, err := strconv.Atoi(rawport)
   554  				if err != nil {
   555  					return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err)
   556  				}
   557  				checkHost, checkPort = host, port
   558  			}
   559  			checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort)
   560  			if err != nil {
   561  				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
   562  			}
   563  			ops.regChecks = append(ops.regChecks, checkReg)
   564  		}
   565  	}
   566  
   567  	// Don't bother committing agent checks if we're already shutting down
   568  	c.agentLock.Lock()
   569  	defer c.agentLock.Unlock()
   570  	select {
   571  	case <-c.shutdownCh:
   572  		return nil
   573  	default:
   574  	}
   575  
   576  	// Now add them to the registration queue
   577  	c.commit(&ops)
   578  
   579  	// Record IDs for deregistering on shutdown
   580  	for _, id := range ops.regServices {
   581  		c.agentServices[id.ID] = struct{}{}
   582  	}
   583  	for _, id := range ops.regChecks {
   584  		c.agentChecks[id.ID] = struct{}{}
   585  	}
   586  	return nil
   587  }
   588  
   589  // serviceRegs creates service registrations, check registrations, and script
   590  // checks from a service. It returns a service registration object with the
   591  // service and check IDs populated.
   592  func (c *ServiceClient) serviceRegs(ops *operations, allocID string, service *structs.Service,
   593  	task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) (*ServiceRegistration, error) {
   594  
   595  	// Get the services ID
   596  	id := makeTaskServiceID(allocID, task.Name, service)
   597  	sreg := &ServiceRegistration{
   598  		serviceID: id,
   599  		checkIDs:  make(map[string]struct{}, len(service.Checks)),
   600  	}
   601  
   602  	// Service address modes default to auto
   603  	addrMode := service.AddressMode
   604  	if addrMode == "" {
   605  		addrMode = structs.AddressModeAuto
   606  	}
   607  
   608  	// Determine the address to advertise based on the mode
   609  	ip, port, err := getAddress(addrMode, service.PortLabel, task.Resources.Networks, net)
   610  	if err != nil {
   611  		return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err)
   612  	}
   613  
   614  	// Build the Consul Service registration request
   615  	serviceReg := &api.AgentServiceRegistration{
   616  		ID:      id,
   617  		Name:    service.Name,
   618  		Tags:    make([]string, len(service.Tags)),
   619  		Address: ip,
   620  		Port:    port,
   621  	}
   622  	// copy isn't strictly necessary but can avoid bugs especially
   623  	// with tests that may reuse Tasks
   624  	copy(serviceReg.Tags, service.Tags)
   625  	ops.regServices = append(ops.regServices, serviceReg)
   626  
   627  	// Build the check registrations
   628  	checkIDs, err := c.checkRegs(ops, allocID, id, service, task, exec, net)
   629  	if err != nil {
   630  		return nil, err
   631  	}
   632  	for _, cid := range checkIDs {
   633  		sreg.checkIDs[cid] = struct{}{}
   634  	}
   635  	return sreg, nil
   636  }
   637  
   638  // checkRegs registers the checks for the given service and returns the
   639  // registered check ids.
   640  func (c *ServiceClient) checkRegs(ops *operations, allocID, serviceID string, service *structs.Service,
   641  	task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) ([]string, error) {
   642  
   643  	// Fast path
   644  	numChecks := len(service.Checks)
   645  	if numChecks == 0 {
   646  		return nil, nil
   647  	}
   648  
   649  	checkIDs := make([]string, 0, numChecks)
   650  	for _, check := range service.Checks {
   651  		checkID := makeCheckID(serviceID, check)
   652  		checkIDs = append(checkIDs, checkID)
   653  		if check.Type == structs.ServiceCheckScript {
   654  			if exec == nil {
   655  				return nil, fmt.Errorf("driver doesn't support script checks")
   656  			}
   657  			ops.scripts = append(ops.scripts, newScriptCheck(
   658  				allocID, task.Name, checkID, check, exec, c.client, c.logger, c.shutdownCh))
   659  
   660  			// Skip getAddress for script checks
   661  			checkReg, err := createCheckReg(serviceID, checkID, check, "", 0)
   662  			if err != nil {
   663  				return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err)
   664  			}
   665  			ops.regChecks = append(ops.regChecks, checkReg)
   666  			continue
   667  		}
   668  
   669  		// Default to the service's port but allow check to override
   670  		portLabel := check.PortLabel
   671  		if portLabel == "" {
   672  			// Default to the service's port label
   673  			portLabel = service.PortLabel
   674  		}
   675  
   676  		// Checks address mode defaults to host for pre-#3380 backward compat
   677  		addrMode := check.AddressMode
   678  		if addrMode == "" {
   679  			addrMode = structs.AddressModeHost
   680  		}
   681  
   682  		ip, port, err := getAddress(addrMode, portLabel, task.Resources.Networks, net)
   683  		if err != nil {
   684  			return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err)
   685  		}
   686  
   687  		checkReg, err := createCheckReg(serviceID, checkID, check, ip, port)
   688  		if err != nil {
   689  			return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err)
   690  		}
   691  		ops.regChecks = append(ops.regChecks, checkReg)
   692  	}
   693  	return checkIDs, nil
   694  }
   695  
   696  // RegisterTask with Consul. Adds all service entries and checks to Consul. If
   697  // exec is nil and a script check exists an error is returned.
   698  //
   699  // If the service IP is set it used as the address in the service registration.
   700  // Checks will always use the IP from the Task struct (host's IP).
   701  //
   702  // Actual communication with Consul is done asynchronously (see Run).
   703  func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error {
   704  	// Fast path
   705  	numServices := len(task.Services)
   706  	if numServices == 0 {
   707  		return nil
   708  	}
   709  
   710  	t := new(TaskRegistration)
   711  	t.Services = make(map[string]*ServiceRegistration, numServices)
   712  
   713  	ops := &operations{}
   714  	for _, service := range task.Services {
   715  		sreg, err := c.serviceRegs(ops, allocID, service, task, exec, net)
   716  		if err != nil {
   717  			return err
   718  		}
   719  		t.Services[sreg.serviceID] = sreg
   720  	}
   721  
   722  	// Add the task to the allocation's registration
   723  	c.addTaskRegistration(allocID, task.Name, t)
   724  
   725  	c.commit(ops)
   726  
   727  	// Start watching checks. Done after service registrations are built
   728  	// since an error building them could leak watches.
   729  	for _, service := range task.Services {
   730  		serviceID := makeTaskServiceID(allocID, task.Name, service)
   731  		for _, check := range service.Checks {
   732  			if check.TriggersRestarts() {
   733  				checkID := makeCheckID(serviceID, check)
   734  				c.checkWatcher.Watch(allocID, task.Name, checkID, check, restarter)
   735  			}
   736  		}
   737  	}
   738  	return nil
   739  }
   740  
   741  // UpdateTask in Consul. Does not alter the service if only checks have
   742  // changed.
   743  //
   744  // DriverNetwork must not change between invocations for the same allocation.
   745  func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error {
   746  	ops := &operations{}
   747  
   748  	taskReg := new(TaskRegistration)
   749  	taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services))
   750  
   751  	existingIDs := make(map[string]*structs.Service, len(existing.Services))
   752  	for _, s := range existing.Services {
   753  		existingIDs[makeTaskServiceID(allocID, existing.Name, s)] = s
   754  	}
   755  	newIDs := make(map[string]*structs.Service, len(newTask.Services))
   756  	for _, s := range newTask.Services {
   757  		newIDs[makeTaskServiceID(allocID, newTask.Name, s)] = s
   758  	}
   759  
   760  	// Loop over existing Service IDs to see if they have been removed or
   761  	// updated.
   762  	for existingID, existingSvc := range existingIDs {
   763  		newSvc, ok := newIDs[existingID]
   764  		if !ok {
   765  			// Existing service entry removed
   766  			ops.deregServices = append(ops.deregServices, existingID)
   767  			for _, check := range existingSvc.Checks {
   768  				cid := makeCheckID(existingID, check)
   769  				ops.deregChecks = append(ops.deregChecks, cid)
   770  
   771  				// Unwatch watched checks
   772  				if check.TriggersRestarts() {
   773  					c.checkWatcher.Unwatch(cid)
   774  				}
   775  			}
   776  			continue
   777  		}
   778  
   779  		// Service exists and hasn't changed, don't re-add it later
   780  		delete(newIDs, existingID)
   781  
   782  		// Service still exists so add it to the task's registration
   783  		sreg := &ServiceRegistration{
   784  			serviceID: existingID,
   785  			checkIDs:  make(map[string]struct{}, len(newSvc.Checks)),
   786  		}
   787  		taskReg.Services[existingID] = sreg
   788  
   789  		// See if any checks were updated
   790  		existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
   791  		for _, check := range existingSvc.Checks {
   792  			existingChecks[makeCheckID(existingID, check)] = check
   793  		}
   794  
   795  		// Register new checks
   796  		for _, check := range newSvc.Checks {
   797  			checkID := makeCheckID(existingID, check)
   798  			if _, exists := existingChecks[checkID]; exists {
   799  				// Check exists, so don't remove it
   800  				delete(existingChecks, checkID)
   801  				sreg.checkIDs[checkID] = struct{}{}
   802  			}
   803  
   804  			// New check on an unchanged service; add them now
   805  			newCheckIDs, err := c.checkRegs(ops, allocID, existingID, newSvc, newTask, exec, net)
   806  			if err != nil {
   807  				return err
   808  			}
   809  
   810  			for _, checkID := range newCheckIDs {
   811  				sreg.checkIDs[checkID] = struct{}{}
   812  
   813  			}
   814  
   815  			// Update all watched checks as CheckRestart fields aren't part of ID
   816  			if check.TriggersRestarts() {
   817  				c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter)
   818  			}
   819  		}
   820  
   821  		// Remove existing checks not in updated service
   822  		for cid, check := range existingChecks {
   823  			ops.deregChecks = append(ops.deregChecks, cid)
   824  
   825  			// Unwatch checks
   826  			if check.TriggersRestarts() {
   827  				c.checkWatcher.Unwatch(cid)
   828  			}
   829  		}
   830  	}
   831  
   832  	// Any remaining services should just be enqueued directly
   833  	for _, newSvc := range newIDs {
   834  		sreg, err := c.serviceRegs(ops, allocID, newSvc, newTask, exec, net)
   835  		if err != nil {
   836  			return err
   837  		}
   838  
   839  		taskReg.Services[sreg.serviceID] = sreg
   840  	}
   841  
   842  	// Add the task to the allocation's registration
   843  	c.addTaskRegistration(allocID, newTask.Name, taskReg)
   844  
   845  	c.commit(ops)
   846  
   847  	// Start watching checks. Done after service registrations are built
   848  	// since an error building them could leak watches.
   849  	for _, service := range newIDs {
   850  		serviceID := makeTaskServiceID(allocID, newTask.Name, service)
   851  		for _, check := range service.Checks {
   852  			if check.TriggersRestarts() {
   853  				checkID := makeCheckID(serviceID, check)
   854  				c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter)
   855  			}
   856  		}
   857  	}
   858  	return nil
   859  }
   860  
   861  // RemoveTask from Consul. Removes all service entries and checks.
   862  //
   863  // Actual communication with Consul is done asynchronously (see Run).
   864  func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) {
   865  	ops := operations{}
   866  
   867  	for _, service := range task.Services {
   868  		id := makeTaskServiceID(allocID, task.Name, service)
   869  		ops.deregServices = append(ops.deregServices, id)
   870  
   871  		for _, check := range service.Checks {
   872  			cid := makeCheckID(id, check)
   873  			ops.deregChecks = append(ops.deregChecks, cid)
   874  
   875  			if check.TriggersRestarts() {
   876  				c.checkWatcher.Unwatch(cid)
   877  			}
   878  		}
   879  	}
   880  
   881  	// Remove the task from the alloc's registrations
   882  	c.removeTaskRegistration(allocID, task.Name)
   883  
   884  	// Now add them to the deregistration fields; main Run loop will update
   885  	c.commit(&ops)
   886  }
   887  
   888  // AllocRegistrations returns the registrations for the given allocation. If the
   889  // allocation has no reservations, the response is a nil object.
   890  func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) {
   891  	// Get the internal struct using the lock
   892  	c.allocRegistrationsLock.RLock()
   893  	regInternal, ok := c.allocRegistrations[allocID]
   894  	if !ok {
   895  		c.allocRegistrationsLock.RUnlock()
   896  		return nil, nil
   897  	}
   898  
   899  	// Copy so we don't expose internal structs
   900  	reg := regInternal.copy()
   901  	c.allocRegistrationsLock.RUnlock()
   902  
   903  	// Query the services and checks to populate the allocation registrations.
   904  	services, err := c.client.Services()
   905  	if err != nil {
   906  		return nil, err
   907  	}
   908  
   909  	checks, err := c.client.Checks()
   910  	if err != nil {
   911  		return nil, err
   912  	}
   913  
   914  	// Populate the object
   915  	for _, treg := range reg.Tasks {
   916  		for serviceID, sreg := range treg.Services {
   917  			sreg.Service = services[serviceID]
   918  			for checkID := range sreg.checkIDs {
   919  				if check, ok := checks[checkID]; ok {
   920  					sreg.Checks = append(sreg.Checks, check)
   921  				}
   922  			}
   923  		}
   924  	}
   925  
   926  	return reg, nil
   927  }
   928  
   929  // Shutdown the Consul client. Update running task registrations and deregister
   930  // agent from Consul. On first call blocks up to shutdownWait before giving up
   931  // on syncing operations.
   932  func (c *ServiceClient) Shutdown() error {
   933  	// Serialize Shutdown calls with RegisterAgent to prevent leaking agent
   934  	// entries.
   935  	c.agentLock.Lock()
   936  	defer c.agentLock.Unlock()
   937  	select {
   938  	case <-c.shutdownCh:
   939  		return nil
   940  	default:
   941  		close(c.shutdownCh)
   942  	}
   943  
   944  	// Give run loop time to sync, but don't block indefinitely
   945  	deadline := time.After(c.shutdownWait)
   946  
   947  	// Wait for Run to finish any outstanding operations and exit
   948  	select {
   949  	case <-c.exitCh:
   950  	case <-deadline:
   951  		// Don't wait forever though
   952  	}
   953  
   954  	// If Consul was never seen nothing could be written so exit early
   955  	if !c.hasSeen() {
   956  		return nil
   957  	}
   958  
   959  	// Always attempt to deregister Nomad agent Consul entries, even if
   960  	// deadline was reached
   961  	for id := range c.agentServices {
   962  		if err := c.client.ServiceDeregister(id); err != nil {
   963  			c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err)
   964  		}
   965  	}
   966  	for id := range c.agentChecks {
   967  		if err := c.client.CheckDeregister(id); err != nil {
   968  			c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err)
   969  		}
   970  	}
   971  
   972  	// Give script checks time to exit (no need to lock as Run() has exited)
   973  	for _, h := range c.runningScripts {
   974  		select {
   975  		case <-h.wait():
   976  		case <-deadline:
   977  			return fmt.Errorf("timed out waiting for script checks to run")
   978  		}
   979  	}
   980  	return nil
   981  }
   982  
   983  // addTaskRegistration adds the task registration for the given allocation.
   984  func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) {
   985  	c.allocRegistrationsLock.Lock()
   986  	defer c.allocRegistrationsLock.Unlock()
   987  
   988  	alloc, ok := c.allocRegistrations[allocID]
   989  	if !ok {
   990  		alloc = &AllocRegistration{
   991  			Tasks: make(map[string]*TaskRegistration),
   992  		}
   993  		c.allocRegistrations[allocID] = alloc
   994  	}
   995  	alloc.Tasks[taskName] = reg
   996  }
   997  
   998  // removeTaskRegistration removes the task registration for the given allocation.
   999  func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) {
  1000  	c.allocRegistrationsLock.Lock()
  1001  	defer c.allocRegistrationsLock.Unlock()
  1002  
  1003  	alloc, ok := c.allocRegistrations[allocID]
  1004  	if !ok {
  1005  		return
  1006  	}
  1007  
  1008  	// Delete the task and if it is the last one also delete the alloc's
  1009  	// registration
  1010  	delete(alloc.Tasks, taskName)
  1011  	if len(alloc.Tasks) == 0 {
  1012  		delete(c.allocRegistrations, allocID)
  1013  	}
  1014  }
  1015  
  1016  // makeAgentServiceID creates a unique ID for identifying an agent service in
  1017  // Consul.
  1018  //
  1019  // Agent service IDs are of the form:
  1020  //
  1021  //	{nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...})
  1022  //	Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4
  1023  //	Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l
  1024  //
  1025  func makeAgentServiceID(role string, service *structs.Service) string {
  1026  	return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, ""))
  1027  }
  1028  
  1029  // makeTaskServiceID creates a unique ID for identifying a task service in
  1030  // Consul. All structs.Service fields are included in the ID's hash except
  1031  // Checks. This allows updates to merely compare IDs.
  1032  //
  1033  //	Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH
  1034  func makeTaskServiceID(allocID, taskName string, service *structs.Service) string {
  1035  	return nomadTaskPrefix + service.Hash(allocID, taskName)
  1036  }
  1037  
  1038  // makeCheckID creates a unique ID for a check.
  1039  func makeCheckID(serviceID string, check *structs.ServiceCheck) string {
  1040  	return check.Hash(serviceID)
  1041  }
  1042  
  1043  // createCheckReg creates a Check that can be registered with Consul.
  1044  //
  1045  // Script checks simply have a TTL set and the caller is responsible for
  1046  // running the script and heartbeating.
  1047  func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) {
  1048  	chkReg := api.AgentCheckRegistration{
  1049  		ID:        checkID,
  1050  		Name:      check.Name,
  1051  		ServiceID: serviceID,
  1052  	}
  1053  	chkReg.Status = check.InitialStatus
  1054  	chkReg.Timeout = check.Timeout.String()
  1055  	chkReg.Interval = check.Interval.String()
  1056  
  1057  	// Require an address for http or tcp checks
  1058  	if port == 0 && check.RequiresPort() {
  1059  		return nil, fmt.Errorf("%s checks require an address", check.Type)
  1060  	}
  1061  
  1062  	switch check.Type {
  1063  	case structs.ServiceCheckHTTP:
  1064  		proto := check.Protocol
  1065  		if proto == "" {
  1066  			proto = "http"
  1067  		}
  1068  		if check.TLSSkipVerify {
  1069  			chkReg.TLSSkipVerify = true
  1070  		}
  1071  		base := url.URL{
  1072  			Scheme: proto,
  1073  			Host:   net.JoinHostPort(host, strconv.Itoa(port)),
  1074  		}
  1075  		relative, err := url.Parse(check.Path)
  1076  		if err != nil {
  1077  			return nil, err
  1078  		}
  1079  		url := base.ResolveReference(relative)
  1080  		chkReg.HTTP = url.String()
  1081  		chkReg.Method = check.Method
  1082  		chkReg.Header = check.Header
  1083  	case structs.ServiceCheckTCP:
  1084  		chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))
  1085  	case structs.ServiceCheckScript:
  1086  		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
  1087  		// As of Consul 1.0.0 setting TTL and Interval is a 400
  1088  		chkReg.Interval = ""
  1089  	default:
  1090  		return nil, fmt.Errorf("check type %+q not valid", check.Type)
  1091  	}
  1092  	return &chkReg, nil
  1093  }
  1094  
  1095  // isNomadService returns true if the ID matches the pattern of a Nomad managed
  1096  // service (new or old formats). Agent services return false as independent
  1097  // client and server agents may be running on the same machine. #2827
  1098  func isNomadService(id string) bool {
  1099  	return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
  1100  }
  1101  
  1102  // isOldNomadService returns true if the ID matches an old pattern managed by
  1103  // Nomad.
  1104  //
  1105  // Pre-0.7.1 task service IDs are of the form:
  1106  //
  1107  //	{nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
  1108  //	Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
  1109  //
  1110  func isOldNomadService(id string) bool {
  1111  	const prefix = nomadServicePrefix + "-executor"
  1112  	return strings.HasPrefix(id, prefix)
  1113  }
  1114  
  1115  // getAddress returns the IP and port to use for a service or check. If no port
  1116  // label is specified (an empty value), zero values are returned because no
  1117  // address could be resolved.
  1118  func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *cstructs.DriverNetwork) (string, int, error) {
  1119  	switch addrMode {
  1120  	case structs.AddressModeAuto:
  1121  		if driverNet.Advertise() {
  1122  			addrMode = structs.AddressModeDriver
  1123  		} else {
  1124  			addrMode = structs.AddressModeHost
  1125  		}
  1126  		return getAddress(addrMode, portLabel, networks, driverNet)
  1127  	case structs.AddressModeHost:
  1128  		if portLabel == "" {
  1129  			if len(networks) != 1 {
  1130  				// If no networks are specified return zero
  1131  				// values. Consul will advertise the host IP
  1132  				// with no port. This is the pre-0.7.1 behavior
  1133  				// some people rely on.
  1134  				return "", 0, nil
  1135  			}
  1136  
  1137  			return networks[0].IP, 0, nil
  1138  		}
  1139  
  1140  		// Default path: use host ip:port
  1141  		ip, port := networks.Port(portLabel)
  1142  		if ip == "" && port <= 0 {
  1143  			return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel)
  1144  		}
  1145  		return ip, port, nil
  1146  
  1147  	case structs.AddressModeDriver:
  1148  		// Require a driver network if driver address mode is used
  1149  		if driverNet == nil {
  1150  			return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`)
  1151  		}
  1152  
  1153  		// If no port label is specified just return the IP
  1154  		if portLabel == "" {
  1155  			return driverNet.IP, 0, nil
  1156  		}
  1157  
  1158  		// If the port is a label, use the driver's port (not the host's)
  1159  		if port, ok := driverNet.PortMap[portLabel]; ok {
  1160  			return driverNet.IP, port, nil
  1161  		}
  1162  
  1163  		// If port isn't a label, try to parse it as a literal port number
  1164  		port, err := strconv.Atoi(portLabel)
  1165  		if err != nil {
  1166  			// Don't include Atoi error message as user likely
  1167  			// never intended it to be a numeric and it creates a
  1168  			// confusing error message
  1169  			return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel)
  1170  		}
  1171  		if port <= 0 {
  1172  			return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
  1173  		}
  1174  
  1175  		return driverNet.IP, port, nil
  1176  
  1177  	default:
  1178  		// Shouldn't happen due to validation, but enforce invariants
  1179  		return "", 0, fmt.Errorf("invalid address mode %q", addrMode)
  1180  	}
  1181  }