github.com/anuvu/nomad@v0.8.7-atom1/command/agent/consul/client.go

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"net"
     8  	"net/url"
     9  	"strconv"
    10  	"strings"
    11  	"sync"
    12  	"sync/atomic"
    13  	"time"
    14  
    15  	metrics "github.com/armon/go-metrics"
    16  	"github.com/hashicorp/consul/api"
    17  	cstructs "github.com/hashicorp/nomad/client/structs"
    18  	"github.com/hashicorp/nomad/helper"
    19  	"github.com/hashicorp/nomad/nomad/structs"
    20  )
    21  
    22  const (
    23  	// nomadServicePrefix is the prefix that scopes all Nomad registered
    24  	// services (both agent and task entries).
    25  	nomadServicePrefix = "_nomad"
    26  
    27  	// nomadTaskPrefix is the prefix that scopes Nomad registered services
    28  	// for tasks.
    29  	nomadTaskPrefix = nomadServicePrefix + "-task-"
    30  
    31  	// defaultRetryInterval is how quickly to retry syncing services and
    32  	// checks to Consul when an error occurs. Will backoff up to a max.
    33  	defaultRetryInterval = time.Second
    34  
    35  	// defaultMaxRetryInterval is the default max retry interval.
    36  	defaultMaxRetryInterval = 30 * time.Second
    37  
     38  	// defaultPeriodicInterval is the interval at which the service
     39  	// client reconciles state between the desired services and checks and
     40  	// what's actually registered in Consul. This is done at an interval,
     41  	// rather than being purely edge triggered, to handle the case that the
     42  	// Consul agent's state may change underneath us.
    43  	defaultPeriodicInterval = 30 * time.Second
    44  
     45  	// ttlCheckBuffer is the additional time allotted for Nomad to report a
     46  	// check result to Consul before a script check's TTL expires
    47  	ttlCheckBuffer = 31 * time.Second
    48  
    49  	// defaultShutdownWait is how long Shutdown() should block waiting for
    50  	// enqueued operations to sync to Consul by default.
    51  	defaultShutdownWait = time.Minute
    52  
    53  	// DefaultQueryWaitDuration is the max duration the Consul Agent will
    54  	// spend waiting for a response from a Consul Query.
    55  	DefaultQueryWaitDuration = 2 * time.Second
    56  
    57  	// ServiceTagHTTP is the tag assigned to HTTP services
    58  	ServiceTagHTTP = "http"
    59  
    60  	// ServiceTagRPC is the tag assigned to RPC services
    61  	ServiceTagRPC = "rpc"
    62  
    63  	// ServiceTagSerf is the tag assigned to Serf services
    64  	ServiceTagSerf = "serf"
    65  )
    66  
    67  // CatalogAPI is the consul/api.Catalog API used by Nomad.
    68  type CatalogAPI interface {
    69  	Datacenters() ([]string, error)
    70  	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
    71  }
    72  
    73  // AgentAPI is the consul/api.Agent API used by Nomad.
    74  type AgentAPI interface {
    75  	Services() (map[string]*api.AgentService, error)
    76  	Checks() (map[string]*api.AgentCheck, error)
    77  	CheckRegister(check *api.AgentCheckRegistration) error
    78  	CheckDeregister(checkID string) error
    79  	Self() (map[string]map[string]interface{}, error)
    80  	ServiceRegister(service *api.AgentServiceRegistration) error
    81  	ServiceDeregister(serviceID string) error
    82  	UpdateTTL(id, output, status string) error
    83  }
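        
        // Both interfaces are subsets of the hashicorp/consul/api client:
        // (*api.Client).Catalog() satisfies CatalogAPI and (*api.Client).Agent()
        // satisfies AgentAPI, so tests can substitute in-memory fakes. A minimal
        // sketch (assuming a default local Consul agent):
        //
        //	consulClient, _ := api.NewClient(api.DefaultConfig())
        //	var agent AgentAPI = consulClient.Agent()
        //	var catalog CatalogAPI = consulClient.Catalog()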
    84  
    85  // operations are submitted to the main loop via commit() for synchronizing
    86  // with Consul.
    87  type operations struct {
    88  	regServices []*api.AgentServiceRegistration
    89  	regChecks   []*api.AgentCheckRegistration
    90  	scripts     []*scriptCheck
    91  
    92  	deregServices []string
    93  	deregChecks   []string
    94  }
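        
        // The flow implemented below: RegisterAgent, RegisterTask, UpdateTask and
        // RemoveTask build an operations value, commit() enqueues it on opCh, the
        // Run goroutine folds it into the local state maps via merge(), and sync()
        // reconciles those maps against what the Consul agent reports.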
    95  
     96  // AllocRegistration holds the status of services registered for a particular
     97  // allocation, keyed by task name.
    98  type AllocRegistration struct {
    99  	// Tasks maps the name of a task to its registered services and checks
   100  	Tasks map[string]*TaskRegistration
   101  }
   102  
   103  func (a *AllocRegistration) copy() *AllocRegistration {
   104  	c := &AllocRegistration{
   105  		Tasks: make(map[string]*TaskRegistration, len(a.Tasks)),
   106  	}
   107  
   108  	for k, v := range a.Tasks {
   109  		c.Tasks[k] = v.copy()
   110  	}
   111  
   112  	return c
   113  }
   114  
   115  // NumServices returns the number of registered services
   116  func (a *AllocRegistration) NumServices() int {
   117  	if a == nil {
   118  		return 0
   119  	}
   120  
   121  	total := 0
   122  	for _, treg := range a.Tasks {
   123  		for _, sreg := range treg.Services {
   124  			if sreg.Service != nil {
   125  				total++
   126  			}
   127  		}
   128  	}
   129  
   130  	return total
   131  }
   132  
   133  // NumChecks returns the number of registered checks
   134  func (a *AllocRegistration) NumChecks() int {
   135  	if a == nil {
   136  		return 0
   137  	}
   138  
   139  	total := 0
   140  	for _, treg := range a.Tasks {
   141  		for _, sreg := range treg.Services {
   142  			total += len(sreg.Checks)
   143  		}
   144  	}
   145  
   146  	return total
   147  }
   148  
   149  // TaskRegistration holds the status of services registered for a particular
   150  // task.
   151  type TaskRegistration struct {
   152  	Services map[string]*ServiceRegistration
   153  }
   154  
   155  func (t *TaskRegistration) copy() *TaskRegistration {
   156  	c := &TaskRegistration{
   157  		Services: make(map[string]*ServiceRegistration, len(t.Services)),
   158  	}
   159  
   160  	for k, v := range t.Services {
   161  		c.Services[k] = v.copy()
   162  	}
   163  
   164  	return c
   165  }
   166  
   167  // ServiceRegistration holds the status of a registered Consul Service and its
   168  // Checks.
   169  type ServiceRegistration struct {
    170  	// serviceID and checkIDs are internal fields that track just the IDs of the
    171  	// services/checks registered in Consul. They are used to materialize the
    172  	// other fields when queried.
   173  	serviceID string
   174  	checkIDs  map[string]struct{}
   175  
   176  	// Service is the AgentService registered in Consul.
   177  	Service *api.AgentService
   178  
   179  	// Checks is the status of the registered checks.
   180  	Checks []*api.AgentCheck
   181  }
   182  
   183  func (s *ServiceRegistration) copy() *ServiceRegistration {
    184  	// copy duplicates only the internal fields, not the external ones. This
    185  	// keeps callers of AllocRegistrations from reaching internal state; that
    186  	// method uses the internal fields to populate the external fields.
   187  	return &ServiceRegistration{
   188  		serviceID: s.serviceID,
   189  		checkIDs:  helper.CopyMapStringStruct(s.checkIDs),
   190  	}
   191  }
   192  
   193  // ServiceClient handles task and agent service registration with Consul.
   194  type ServiceClient struct {
   195  	client           AgentAPI
   196  	logger           *log.Logger
   197  	retryInterval    time.Duration
   198  	maxRetryInterval time.Duration
   199  	periodicInterval time.Duration
   200  
   201  	// exitCh is closed when the main Run loop exits
   202  	exitCh chan struct{}
   203  
   204  	// shutdownCh is closed when the client should shutdown
   205  	shutdownCh chan struct{}
   206  
   207  	// shutdownWait is how long Shutdown() blocks waiting for the final
   208  	// sync() to finish. Defaults to defaultShutdownWait
   209  	shutdownWait time.Duration
   210  
   211  	opCh chan *operations
   212  
   213  	services       map[string]*api.AgentServiceRegistration
   214  	checks         map[string]*api.AgentCheckRegistration
   215  	scripts        map[string]*scriptCheck
   216  	runningScripts map[string]*scriptHandle
   217  
   218  	// allocRegistrations stores the services and checks that are registered
   219  	// with Consul by allocation ID.
   220  	allocRegistrations     map[string]*AllocRegistration
   221  	allocRegistrationsLock sync.RWMutex
   222  
   223  	// agent services and checks record entries for the agent itself which
   224  	// should be removed on shutdown
   225  	agentServices map[string]struct{}
   226  	agentChecks   map[string]struct{}
   227  	agentLock     sync.Mutex
   228  
   229  	// seen is 1 if Consul has ever been seen; otherwise 0. Accessed with
   230  	// atomics.
   231  	seen int32
   232  
   233  	// checkWatcher restarts checks that are unhealthy.
   234  	checkWatcher *checkWatcher
   235  
   236  	// isClientAgent specifies whether this Consul client is being used
   237  	// by a Nomad client.
   238  	isClientAgent bool
   239  }
   240  
    241  // NewServiceClient creates a new Consul ServiceClient from an existing Consul API
    242  // client, a logger, and a flag indicating whether the client is used by a Nomad
    243  // client agent. When used by a Nomad client, this Consul client reconciles all
    244  // services and checks created by Nomad on behalf of running tasks.
   245  func NewServiceClient(consulClient AgentAPI, logger *log.Logger, isNomadClient bool) *ServiceClient {
   246  	return &ServiceClient{
   247  		client:             consulClient,
   248  		logger:             logger,
   249  		retryInterval:      defaultRetryInterval,
   250  		maxRetryInterval:   defaultMaxRetryInterval,
   251  		periodicInterval:   defaultPeriodicInterval,
   252  		exitCh:             make(chan struct{}),
   253  		shutdownCh:         make(chan struct{}),
   254  		shutdownWait:       defaultShutdownWait,
   255  		opCh:               make(chan *operations, 8),
   256  		services:           make(map[string]*api.AgentServiceRegistration),
   257  		checks:             make(map[string]*api.AgentCheckRegistration),
   258  		scripts:            make(map[string]*scriptCheck),
   259  		runningScripts:     make(map[string]*scriptHandle),
   260  		allocRegistrations: make(map[string]*AllocRegistration),
   261  		agentServices:      make(map[string]struct{}),
   262  		agentChecks:        make(map[string]struct{}),
   263  		checkWatcher:       newCheckWatcher(logger, consulClient),
   264  		isClientAgent:      isNomadClient,
   265  	}
   266  }
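        
        // A minimal usage sketch (illustrative only; the agent wires this up for
        // real deployments):
        //
        //	consulClient, _ := api.NewClient(api.DefaultConfig())
        //	logger := log.New(os.Stderr, "", log.LstdFlags)
        //	sc := NewServiceClient(consulClient.Agent(), logger, true)
        //	go sc.Run()
        //	defer sc.Shutdown()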
   267  
   268  // seen is used by markSeen and hasSeen
   269  const seen = 1
   270  
   271  // markSeen marks Consul as having been seen (meaning at least one operation
   272  // has succeeded).
   273  func (c *ServiceClient) markSeen() {
   274  	atomic.StoreInt32(&c.seen, seen)
   275  }
   276  
   277  // hasSeen returns true if any Consul operation has ever succeeded. Useful to
   278  // squelch errors if Consul isn't running.
   279  func (c *ServiceClient) hasSeen() bool {
   280  	return atomic.LoadInt32(&c.seen) == seen
   281  }
   282  
   283  // Run the Consul main loop which retries operations against Consul. It should
   284  // be called exactly once.
   285  func (c *ServiceClient) Run() {
   286  	defer close(c.exitCh)
   287  
   288  	ctx, cancel := context.WithCancel(context.Background())
   289  	defer cancel()
   290  
   291  	// init will be closed when Consul has been contacted
   292  	init := make(chan struct{})
   293  	go checkConsulTLSSkipVerify(ctx, c.logger, c.client, init)
   294  
   295  	// Process operations while waiting for initial contact with Consul but
   296  	// do not sync until contact has been made.
   297  INIT:
   298  	for {
   299  		select {
   300  		case <-init:
   301  			c.markSeen()
   302  			break INIT
   303  		case <-c.shutdownCh:
   304  			return
   305  		case ops := <-c.opCh:
   306  			c.merge(ops)
   307  		}
   308  	}
   309  	c.logger.Printf("[TRACE] consul.sync: able to contact Consul")
   310  
    311  	// Contact with Consul has been established, so start the
    312  	// checkWatcher
   313  	go c.checkWatcher.Run(ctx)
   314  
   315  	// Always immediately sync to reconcile Nomad and Consul's state
   316  	retryTimer := time.NewTimer(0)
   317  
   318  	failures := 0
   319  	for {
   320  		select {
   321  		case <-retryTimer.C:
   322  		case <-c.shutdownCh:
   323  			// Cancel check watcher but sync one last time
   324  			cancel()
   325  		case ops := <-c.opCh:
   326  			c.merge(ops)
   327  		}
   328  
   329  		if err := c.sync(); err != nil {
   330  			if failures == 0 {
   331  				// Log on the first failure
   332  				c.logger.Printf("[WARN] consul.sync: failed to update services in Consul: %v", err)
   333  			} else if failures%10 == 0 {
   334  				// Log every 10th consecutive failure
   335  				c.logger.Printf("[ERR] consul.sync: still unable to update services in Consul after %d failures; latest error: %v", failures, err)
   336  			}
   337  
   338  			failures++
   339  			if !retryTimer.Stop() {
    340  				// The timer already fired but may or may not
    341  				// have been drained by the select{} above, so
    342  				// conditionally receive on it before resetting
   343  				select {
   344  				case <-retryTimer.C:
   345  				default:
   346  				}
   347  			}
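        			// Back off linearly with consecutive failures: with
        			// the defaults this retries after 1s, 2s, 3s, ...
        			// capped at 30s.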
   348  			backoff := c.retryInterval * time.Duration(failures)
   349  			if backoff > c.maxRetryInterval {
   350  				backoff = c.maxRetryInterval
   351  			}
   352  			retryTimer.Reset(backoff)
   353  		} else {
   354  			if failures > 0 {
   355  				c.logger.Printf("[INFO] consul.sync: successfully updated services in Consul")
   356  				failures = 0
   357  			}
   358  
    359  			// Reset timer to the periodic interval to
    360  			// periodically reconcile with Consul
   361  			if !retryTimer.Stop() {
   362  				select {
   363  				case <-retryTimer.C:
   364  				default:
   365  				}
   366  			}
   367  			retryTimer.Reset(c.periodicInterval)
   368  		}
   369  
   370  		select {
   371  		case <-c.shutdownCh:
   372  			// Exit only after sync'ing all outstanding operations
   373  			if len(c.opCh) > 0 {
   374  				for len(c.opCh) > 0 {
   375  					c.merge(<-c.opCh)
   376  				}
   377  				continue
   378  			}
   379  			return
   380  		default:
   381  		}
   382  
   383  	}
   384  }
   385  
   386  // commit operations unless already shutting down.
   387  func (c *ServiceClient) commit(ops *operations) {
   388  	select {
   389  	case c.opCh <- ops:
   390  	case <-c.shutdownCh:
   391  	}
   392  }
   393  
   394  // merge registrations into state map prior to sync'ing with Consul
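        // merge is only ever called from the Run goroutine, so the state maps it
        // touches need no locking.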
   395  func (c *ServiceClient) merge(ops *operations) {
   396  	for _, s := range ops.regServices {
   397  		c.services[s.ID] = s
   398  	}
   399  	for _, check := range ops.regChecks {
   400  		c.checks[check.ID] = check
   401  	}
   402  	for _, s := range ops.scripts {
   403  		c.scripts[s.id] = s
   404  	}
   405  	for _, sid := range ops.deregServices {
   406  		delete(c.services, sid)
   407  	}
   408  	for _, cid := range ops.deregChecks {
   409  		if script, ok := c.runningScripts[cid]; ok {
   410  			script.cancel()
   411  			delete(c.scripts, cid)
   412  			delete(c.runningScripts, cid)
   413  		}
   414  		delete(c.checks, cid)
   415  	}
   416  	metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services)))
   417  	metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks)))
   418  	metrics.SetGauge([]string{"client", "consul", "script_checks"}, float32(len(c.runningScripts)))
   419  }
   420  
   421  // sync enqueued operations.
   422  func (c *ServiceClient) sync() error {
   423  	sreg, creg, sdereg, cdereg := 0, 0, 0, 0
   424  
   425  	consulServices, err := c.client.Services()
   426  	if err != nil {
   427  		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   428  		return fmt.Errorf("error querying Consul services: %v", err)
   429  	}
   430  
   431  	consulChecks, err := c.client.Checks()
   432  	if err != nil {
   433  		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   434  		return fmt.Errorf("error querying Consul checks: %v", err)
   435  	}
   436  
   437  	// Remove Nomad services in Consul but unknown locally
   438  	for id := range consulServices {
   439  		if _, ok := c.services[id]; ok {
   440  			// Known service, skip
   441  			continue
   442  		}
   443  
   444  		// Ignore if this is not a Nomad managed service. Also ignore
   445  		// Nomad managed services if this is not a client agent.
   446  		// This is to prevent server agents from removing services
   447  		// registered by client agents
   448  		if !isNomadService(id) || !c.isClientAgent {
   449  			// Not managed by Nomad, skip
   450  			continue
   451  		}
   452  
   453  		// Unknown Nomad managed service; kill
   454  		if err := c.client.ServiceDeregister(id); err != nil {
   455  			if isOldNomadService(id) {
   456  				// Don't hard-fail on old entries. See #3620
   457  				continue
   458  			}
   459  
   460  			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   461  			return err
   462  		}
   463  		sdereg++
   464  		metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
   465  	}
   466  
   467  	// Add Nomad services missing from Consul
   468  	for id, locals := range c.services {
   469  		if _, ok := consulServices[id]; !ok {
   470  			if err = c.client.ServiceRegister(locals); err != nil {
   471  				metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   472  				return err
   473  			}
   474  			sreg++
   475  			metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
   476  		}
   477  	}
   478  
   479  	// Remove Nomad checks in Consul but unknown locally
   480  	for id, check := range consulChecks {
   481  		if _, ok := c.checks[id]; ok {
   482  			// Known check, leave it
   483  			continue
   484  		}
   485  
   486  		// Ignore if this is not a Nomad managed check. Also ignore
   487  		// Nomad managed checks if this is not a client agent.
   488  		// This is to prevent server agents from removing checks
   489  		// registered by client agents
   490  		if !isNomadService(check.ServiceID) || !c.isClientAgent {
   491  			// Service not managed by Nomad, skip
   492  			continue
   493  		}
   494  
   495  		// Unknown Nomad managed check; remove
   496  		if err := c.client.CheckDeregister(id); err != nil {
   497  			if isOldNomadService(check.ServiceID) {
   498  				// Don't hard-fail on old entries.
   499  				continue
   500  			}
   501  
   502  			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   503  			return err
   504  		}
   505  		cdereg++
   506  		metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
   507  	}
   508  
   509  	// Add Nomad checks missing from Consul
   510  	for id, check := range c.checks {
   511  		if _, ok := consulChecks[id]; ok {
   512  			// Already in Consul; skipping
   513  			continue
   514  		}
   515  
   516  		if err := c.client.CheckRegister(check); err != nil {
   517  			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
   518  			return err
   519  		}
   520  		creg++
   521  		metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)
   522  
   523  		// Handle starting scripts
   524  		if script, ok := c.scripts[id]; ok {
   525  			// If it's already running, cancel and replace
   526  			if oldScript, running := c.runningScripts[id]; running {
   527  				oldScript.cancel()
   528  			}
   529  			// Start and store the handle
   530  			c.runningScripts[id] = script.run()
   531  		}
   532  	}
   533  
   534  	c.logger.Printf("[DEBUG] consul.sync: registered %d services, %d checks; deregistered %d services, %d checks",
   535  		sreg, creg, sdereg, cdereg)
   536  	return nil
   537  }
   538  
    539  // RegisterAgent registers Nomad agents (client or server). The
    540  // Service.PortLabel should be a literal host:port to be parsed with net.SplitHostPort.
   541  // Script checks are not supported and will return an error. Registration is
   542  // asynchronous.
   543  //
   544  // Agents will be deregistered when Shutdown is called.
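        //
        // A rough sketch of a call (field values are illustrative assumptions):
        //
        //	err := c.RegisterAgent("client", []*structs.Service{{
        //		Name:      "nomad-client",
        //		Tags:      []string{ServiceTagHTTP},
        //		PortLabel: "127.0.0.1:4646",
        //	}})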
   545  func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error {
   546  	ops := operations{}
   547  
   548  	for _, service := range services {
   549  		id := makeAgentServiceID(role, service)
   550  
   551  		// Unlike tasks, agents don't use port labels. Agent ports are
   552  		// stored directly in the PortLabel.
   553  		host, rawport, err := net.SplitHostPort(service.PortLabel)
   554  		if err != nil {
   555  			return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err)
   556  		}
   557  		port, err := strconv.Atoi(rawport)
   558  		if err != nil {
   559  			return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err)
   560  		}
   561  		serviceReg := &api.AgentServiceRegistration{
   562  			ID:      id,
   563  			Name:    service.Name,
   564  			Tags:    service.Tags,
   565  			Address: host,
   566  			Port:    port,
   567  		}
   568  		ops.regServices = append(ops.regServices, serviceReg)
   569  
   570  		for _, check := range service.Checks {
   571  			checkID := makeCheckID(id, check)
   572  			if check.Type == structs.ServiceCheckScript {
   573  				return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name)
   574  			}
   575  			checkHost, checkPort := serviceReg.Address, serviceReg.Port
   576  			if check.PortLabel != "" {
   577  				// Unlike tasks, agents don't use port labels. Agent ports are
   578  				// stored directly in the PortLabel.
   579  				host, rawport, err := net.SplitHostPort(check.PortLabel)
   580  				if err != nil {
    581  					return fmt.Errorf("error parsing port label %q from check %q: %v", check.PortLabel, check.Name, err)
   582  				}
   583  				port, err := strconv.Atoi(rawport)
   584  				if err != nil {
   585  					return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err)
   586  				}
   587  				checkHost, checkPort = host, port
   588  			}
   589  			checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort)
   590  			if err != nil {
   591  				return fmt.Errorf("failed to add check %q: %v", check.Name, err)
   592  			}
   593  			ops.regChecks = append(ops.regChecks, checkReg)
   594  		}
   595  	}
   596  
   597  	// Don't bother committing agent checks if we're already shutting down
   598  	c.agentLock.Lock()
   599  	defer c.agentLock.Unlock()
   600  	select {
   601  	case <-c.shutdownCh:
   602  		return nil
   603  	default:
   604  	}
   605  
   606  	// Now add them to the registration queue
   607  	c.commit(&ops)
   608  
   609  	// Record IDs for deregistering on shutdown
   610  	for _, id := range ops.regServices {
   611  		c.agentServices[id.ID] = struct{}{}
   612  	}
   613  	for _, id := range ops.regChecks {
   614  		c.agentChecks[id.ID] = struct{}{}
   615  	}
   616  	return nil
   617  }
   618  
   619  // serviceRegs creates service registrations, check registrations, and script
   620  // checks from a service. It returns a service registration object with the
   621  // service and check IDs populated.
   622  func (c *ServiceClient) serviceRegs(ops *operations, service *structs.Service, task *TaskServices) (
   623  	*ServiceRegistration, error) {
   624  
    625  	// Get the service's ID
   626  	id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
   627  	sreg := &ServiceRegistration{
   628  		serviceID: id,
   629  		checkIDs:  make(map[string]struct{}, len(service.Checks)),
   630  	}
   631  
   632  	// Service address modes default to auto
   633  	addrMode := service.AddressMode
   634  	if addrMode == "" {
   635  		addrMode = structs.AddressModeAuto
   636  	}
   637  
   638  	// Determine the address to advertise based on the mode
   639  	ip, port, err := getAddress(addrMode, service.PortLabel, task.Networks, task.DriverNetwork)
   640  	if err != nil {
   641  		return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err)
   642  	}
   643  
   644  	// Determine whether to use tags or canary_tags
   645  	var tags []string
   646  	if task.Canary && len(service.CanaryTags) > 0 {
   647  		tags = make([]string, len(service.CanaryTags))
   648  		copy(tags, service.CanaryTags)
   649  	} else {
   650  		tags = make([]string, len(service.Tags))
   651  		copy(tags, service.Tags)
   652  	}
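        	// Append an extra tag built from the task name and allocation index;
        	// this appears to be a fork-specific addition for telling instances
        	// apart in Consul.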
   653  	newtag := task.Name + strconv.Itoa(task.AllocIndex)
   654  	tags = append(tags, newtag)
   655  
   656  	// Build the Consul Service registration request
   657  	serviceReg := &api.AgentServiceRegistration{
   658  		ID:      id,
   659  		Name:    service.Name,
   660  		Tags:    tags,
   661  		Address: ip,
   662  		Port:    port,
   663  	}
   664  	ops.regServices = append(ops.regServices, serviceReg)
   665  
   666  	// Build the check registrations
   667  	checkIDs, err := c.checkRegs(ops, id, service, task)
   668  	if err != nil {
   669  		return nil, err
   670  	}
   671  	for _, cid := range checkIDs {
   672  		sreg.checkIDs[cid] = struct{}{}
   673  	}
   674  	return sreg, nil
   675  }
   676  
   677  // checkRegs registers the checks for the given service and returns the
   678  // registered check ids.
   679  func (c *ServiceClient) checkRegs(ops *operations, serviceID string, service *structs.Service,
   680  	task *TaskServices) ([]string, error) {
   681  
   682  	// Fast path
   683  	numChecks := len(service.Checks)
   684  	if numChecks == 0 {
   685  		return nil, nil
   686  	}
   687  
   688  	checkIDs := make([]string, 0, numChecks)
   689  	for _, check := range service.Checks {
   690  		checkID := makeCheckID(serviceID, check)
   691  		checkIDs = append(checkIDs, checkID)
   692  		if check.Type == structs.ServiceCheckScript {
   693  			if task.DriverExec == nil {
   694  				return nil, fmt.Errorf("driver doesn't support script checks")
   695  			}
   696  
   697  			sc := newScriptCheck(task.AllocID, task.Name, checkID, check, task.DriverExec,
   698  				c.client, c.logger, c.shutdownCh)
   699  			ops.scripts = append(ops.scripts, sc)
   700  
   701  			// Skip getAddress for script checks
   702  			checkReg, err := createCheckReg(serviceID, checkID, check, "", 0)
   703  			if err != nil {
   704  				return nil, fmt.Errorf("failed to add script check %q: %v", check.Name, err)
   705  			}
   706  			ops.regChecks = append(ops.regChecks, checkReg)
   707  			continue
   708  		}
   709  
   710  		// Default to the service's port but allow check to override
   711  		portLabel := check.PortLabel
   712  		if portLabel == "" {
   713  			// Default to the service's port label
   714  			portLabel = service.PortLabel
   715  		}
   716  
    717  		// Check address modes default to host for pre-#3380 backward compat
   718  		addrMode := check.AddressMode
   719  		if addrMode == "" {
   720  			addrMode = structs.AddressModeHost
   721  		}
   722  
   723  		ip, port, err := getAddress(addrMode, portLabel, task.Networks, task.DriverNetwork)
   724  		if err != nil {
   725  			return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err)
   726  		}
   727  
   728  		checkReg, err := createCheckReg(serviceID, checkID, check, ip, port)
   729  		if err != nil {
   730  			return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err)
   731  		}
   732  		ops.regChecks = append(ops.regChecks, checkReg)
   733  	}
   734  	return checkIDs, nil
   735  }
   736  
   737  // RegisterTask with Consul. Adds all service entries and checks to Consul. If
   738  // exec is nil and a script check exists an error is returned.
   739  //
    740  // If the service IP is set, it is used as the address in the service registration.
   741  // Checks will always use the IP from the Task struct (host's IP).
   742  //
   743  // Actual communication with Consul is done asynchronously (see Run).
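        //
        // Callers typically build a TaskServices value and hand it off, e.g. (a
        // sketch with assumed field values):
        //
        //	ts := &TaskServices{AllocID: alloc.ID, Name: "web", Services: task.Services}
        //	if err := client.RegisterTask(ts); err != nil {
        //		return err
        //	}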
   744  func (c *ServiceClient) RegisterTask(task *TaskServices) error {
   745  	// Fast path
   746  	numServices := len(task.Services)
   747  	if numServices == 0 {
   748  		return nil
   749  	}
   750  
   751  	t := new(TaskRegistration)
   752  	t.Services = make(map[string]*ServiceRegistration, numServices)
   753  
   754  	ops := &operations{}
   755  	for _, service := range task.Services {
   756  		sreg, err := c.serviceRegs(ops, service, task)
   757  		if err != nil {
   758  			return err
   759  		}
   760  		t.Services[sreg.serviceID] = sreg
   761  	}
   762  
   763  	// Add the task to the allocation's registration
   764  	c.addTaskRegistration(task.AllocID, task.Name, t)
   765  
   766  	c.commit(ops)
   767  
   768  	// Start watching checks. Done after service registrations are built
   769  	// since an error building them could leak watches.
   770  	for _, service := range task.Services {
   771  		serviceID := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
   772  		for _, check := range service.Checks {
   773  			if check.TriggersRestarts() {
   774  				checkID := makeCheckID(serviceID, check)
   775  				c.checkWatcher.Watch(task.AllocID, task.Name, checkID, check, task.Restarter)
   776  			}
   777  		}
   778  	}
   779  	return nil
   780  }
   781  
   782  // UpdateTask in Consul. Does not alter the service if only checks have
   783  // changed.
   784  //
   785  // DriverNetwork must not change between invocations for the same allocation.
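        //
        // Because makeTaskServiceID hashes every service field except Checks, a
        // changed service definition shows up here as a removed ID plus a new ID,
        // while check-only edits keep the same service ID and are reconciled in
        // place.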
   786  func (c *ServiceClient) UpdateTask(old, newTask *TaskServices) error {
   787  	ops := &operations{}
   788  
   789  	taskReg := new(TaskRegistration)
   790  	taskReg.Services = make(map[string]*ServiceRegistration, len(newTask.Services))
   791  
   792  	existingIDs := make(map[string]*structs.Service, len(old.Services))
   793  	for _, s := range old.Services {
   794  		existingIDs[makeTaskServiceID(old.AllocID, old.Name, s, old.Canary)] = s
   795  	}
   796  	newIDs := make(map[string]*structs.Service, len(newTask.Services))
   797  	for _, s := range newTask.Services {
   798  		newIDs[makeTaskServiceID(newTask.AllocID, newTask.Name, s, newTask.Canary)] = s
   799  	}
   800  
   801  	// Loop over existing Service IDs to see if they have been removed or
   802  	// updated.
   803  	for existingID, existingSvc := range existingIDs {
   804  		newSvc, ok := newIDs[existingID]
   805  		if !ok {
   806  			// Existing service entry removed
   807  			ops.deregServices = append(ops.deregServices, existingID)
   808  			for _, check := range existingSvc.Checks {
   809  				cid := makeCheckID(existingID, check)
   810  				ops.deregChecks = append(ops.deregChecks, cid)
   811  
   812  				// Unwatch watched checks
   813  				if check.TriggersRestarts() {
   814  					c.checkWatcher.Unwatch(cid)
   815  				}
   816  			}
   817  			continue
   818  		}
   819  
   820  		// Service exists and hasn't changed, don't re-add it later
   821  		delete(newIDs, existingID)
   822  
   823  		// Service still exists so add it to the task's registration
   824  		sreg := &ServiceRegistration{
   825  			serviceID: existingID,
   826  			checkIDs:  make(map[string]struct{}, len(newSvc.Checks)),
   827  		}
   828  		taskReg.Services[existingID] = sreg
   829  
   830  		// See if any checks were updated
   831  		existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks))
   832  		for _, check := range existingSvc.Checks {
   833  			existingChecks[makeCheckID(existingID, check)] = check
   834  		}
   835  
   836  		// Register new checks
   837  		for _, check := range newSvc.Checks {
   838  			checkID := makeCheckID(existingID, check)
   839  			if _, exists := existingChecks[checkID]; exists {
   840  				// Check exists, so don't remove it
   841  				delete(existingChecks, checkID)
   842  				sreg.checkIDs[checkID] = struct{}{}
   843  			}
   844  
    845  			// (Re)build registrations for the service's checks; merge() de-dupes by ID
   846  			newCheckIDs, err := c.checkRegs(ops, existingID, newSvc, newTask)
   847  			if err != nil {
   848  				return err
   849  			}
   850  
   851  			for _, checkID := range newCheckIDs {
   852  				sreg.checkIDs[checkID] = struct{}{}
   853  
   854  			}
   855  
   856  			// Update all watched checks as CheckRestart fields aren't part of ID
   857  			if check.TriggersRestarts() {
   858  				c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter)
   859  			}
   860  		}
   861  
   862  		// Remove existing checks not in updated service
   863  		for cid, check := range existingChecks {
   864  			ops.deregChecks = append(ops.deregChecks, cid)
   865  
   866  			// Unwatch checks
   867  			if check.TriggersRestarts() {
   868  				c.checkWatcher.Unwatch(cid)
   869  			}
   870  		}
   871  	}
   872  
   873  	// Any remaining services should just be enqueued directly
   874  	for _, newSvc := range newIDs {
   875  		sreg, err := c.serviceRegs(ops, newSvc, newTask)
   876  		if err != nil {
   877  			return err
   878  		}
   879  
   880  		taskReg.Services[sreg.serviceID] = sreg
   881  	}
   882  
   883  	// Add the task to the allocation's registration
   884  	c.addTaskRegistration(newTask.AllocID, newTask.Name, taskReg)
   885  
   886  	c.commit(ops)
   887  
   888  	// Start watching checks. Done after service registrations are built
   889  	// since an error building them could leak watches.
   890  	for _, service := range newIDs {
   891  		serviceID := makeTaskServiceID(newTask.AllocID, newTask.Name, service, newTask.Canary)
   892  		for _, check := range service.Checks {
   893  			if check.TriggersRestarts() {
   894  				checkID := makeCheckID(serviceID, check)
   895  				c.checkWatcher.Watch(newTask.AllocID, newTask.Name, checkID, check, newTask.Restarter)
   896  			}
   897  		}
   898  	}
   899  	return nil
   900  }
   901  
   902  // RemoveTask from Consul. Removes all service entries and checks.
   903  //
   904  // Actual communication with Consul is done asynchronously (see Run).
   905  func (c *ServiceClient) RemoveTask(task *TaskServices) {
   906  	ops := operations{}
   907  
   908  	for _, service := range task.Services {
   909  		id := makeTaskServiceID(task.AllocID, task.Name, service, task.Canary)
   910  		ops.deregServices = append(ops.deregServices, id)
   911  
   912  		for _, check := range service.Checks {
   913  			cid := makeCheckID(id, check)
   914  			ops.deregChecks = append(ops.deregChecks, cid)
   915  
   916  			if check.TriggersRestarts() {
   917  				c.checkWatcher.Unwatch(cid)
   918  			}
   919  		}
   920  	}
   921  
   922  	// Remove the task from the alloc's registrations
   923  	c.removeTaskRegistration(task.AllocID, task.Name)
   924  
   925  	// Now add them to the deregistration fields; main Run loop will update
   926  	c.commit(&ops)
   927  }
   928  
    929  // AllocRegistrations returns the registrations for the given allocation. If the
    930  // allocation has no registrations, the response is a nil object.
   931  func (c *ServiceClient) AllocRegistrations(allocID string) (*AllocRegistration, error) {
   932  	// Get the internal struct using the lock
   933  	c.allocRegistrationsLock.RLock()
   934  	regInternal, ok := c.allocRegistrations[allocID]
   935  	if !ok {
   936  		c.allocRegistrationsLock.RUnlock()
   937  		return nil, nil
   938  	}
   939  
   940  	// Copy so we don't expose internal structs
   941  	reg := regInternal.copy()
   942  	c.allocRegistrationsLock.RUnlock()
   943  
   944  	// Query the services and checks to populate the allocation registrations.
   945  	services, err := c.client.Services()
   946  	if err != nil {
   947  		return nil, err
   948  	}
   949  
   950  	checks, err := c.client.Checks()
   951  	if err != nil {
   952  		return nil, err
   953  	}
   954  
   955  	// Populate the object
   956  	for _, treg := range reg.Tasks {
   957  		for serviceID, sreg := range treg.Services {
   958  			sreg.Service = services[serviceID]
   959  			for checkID := range sreg.checkIDs {
   960  				if check, ok := checks[checkID]; ok {
   961  					sreg.Checks = append(sreg.Checks, check)
   962  				}
   963  			}
   964  		}
   965  	}
   966  
   967  	return reg, nil
   968  }
   969  
   970  // Shutdown the Consul client. Update running task registrations and deregister
   971  // agent from Consul. On first call blocks up to shutdownWait before giving up
   972  // on syncing operations.
   973  func (c *ServiceClient) Shutdown() error {
   974  	// Serialize Shutdown calls with RegisterAgent to prevent leaking agent
   975  	// entries.
   976  	c.agentLock.Lock()
   977  	defer c.agentLock.Unlock()
   978  	select {
   979  	case <-c.shutdownCh:
   980  		return nil
   981  	default:
   982  		close(c.shutdownCh)
   983  	}
   984  
   985  	// Give run loop time to sync, but don't block indefinitely
   986  	deadline := time.After(c.shutdownWait)
   987  
   988  	// Wait for Run to finish any outstanding operations and exit
   989  	select {
   990  	case <-c.exitCh:
   991  	case <-deadline:
   992  		// Don't wait forever though
   993  	}
   994  
   995  	// If Consul was never seen nothing could be written so exit early
   996  	if !c.hasSeen() {
   997  		return nil
   998  	}
   999  
  1000  	// Always attempt to deregister Nomad agent Consul entries, even if
  1001  	// deadline was reached
  1002  	for id := range c.agentServices {
  1003  		if err := c.client.ServiceDeregister(id); err != nil {
  1004  			c.logger.Printf("[ERR] consul.sync: error deregistering agent service (id: %q): %v", id, err)
  1005  		}
  1006  	}
  1007  	for id := range c.agentChecks {
  1008  		if err := c.client.CheckDeregister(id); err != nil {
   1009  			c.logger.Printf("[ERR] consul.sync: error deregistering agent check (id: %q): %v", id, err)
  1010  		}
  1011  	}
  1012  
  1013  	// Give script checks time to exit (no need to lock as Run() has exited)
  1014  	for _, h := range c.runningScripts {
  1015  		select {
  1016  		case <-h.wait():
  1017  		case <-deadline:
   1018  			return fmt.Errorf("timed out waiting for script checks to exit")
  1019  		}
  1020  	}
  1021  	return nil
  1022  }
  1023  
  1024  // addTaskRegistration adds the task registration for the given allocation.
  1025  func (c *ServiceClient) addTaskRegistration(allocID, taskName string, reg *TaskRegistration) {
  1026  	c.allocRegistrationsLock.Lock()
  1027  	defer c.allocRegistrationsLock.Unlock()
  1028  
  1029  	alloc, ok := c.allocRegistrations[allocID]
  1030  	if !ok {
  1031  		alloc = &AllocRegistration{
  1032  			Tasks: make(map[string]*TaskRegistration),
  1033  		}
  1034  		c.allocRegistrations[allocID] = alloc
  1035  	}
  1036  	alloc.Tasks[taskName] = reg
  1037  }
  1038  
  1039  // removeTaskRegistration removes the task registration for the given allocation.
  1040  func (c *ServiceClient) removeTaskRegistration(allocID, taskName string) {
  1041  	c.allocRegistrationsLock.Lock()
  1042  	defer c.allocRegistrationsLock.Unlock()
  1043  
  1044  	alloc, ok := c.allocRegistrations[allocID]
  1045  	if !ok {
  1046  		return
  1047  	}
  1048  
  1049  	// Delete the task and if it is the last one also delete the alloc's
  1050  	// registration
  1051  	delete(alloc.Tasks, taskName)
  1052  	if len(alloc.Tasks) == 0 {
  1053  		delete(c.allocRegistrations, allocID)
  1054  	}
  1055  }
  1056  
  1057  // makeAgentServiceID creates a unique ID for identifying an agent service in
  1058  // Consul.
  1059  //
  1060  // Agent service IDs are of the form:
  1061  //
   1062  //	{nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}))
  1063  //	Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4
  1064  //	Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l
  1065  //
  1066  func makeAgentServiceID(role string, service *structs.Service) string {
  1067  	return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "", false))
  1068  }
  1069  
  1070  // makeTaskServiceID creates a unique ID for identifying a task service in
  1071  // Consul. All structs.Service fields are included in the ID's hash except
  1072  // Checks. This allows updates to merely compare IDs.
  1073  //
  1074  //	Example Service ID: _nomad-task-TNM333JKJPM5AK4FAS3VXQLXFDWOF4VH
  1075  func makeTaskServiceID(allocID, taskName string, service *structs.Service, canary bool) string {
  1076  	return nomadTaskPrefix + service.Hash(allocID, taskName, canary)
  1077  }
  1078  
  1079  // makeCheckID creates a unique ID for a check.
  1080  func makeCheckID(serviceID string, check *structs.ServiceCheck) string {
  1081  	return check.Hash(serviceID)
  1082  }
  1083  
  1084  // createCheckReg creates a Check that can be registered with Consul.
  1085  //
  1086  // Script checks simply have a TTL set and the caller is responsible for
  1087  // running the script and heartbeating.
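        //
        // For example, an HTTP check with Path "/health" against 10.0.0.1:8080
        // yields HTTP = "http://10.0.0.1:8080/health", while a script check with a
        // 10s interval is registered as a TTL check of 41s (interval plus
        // ttlCheckBuffer). Addresses here are illustrative.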
  1088  func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int) (*api.AgentCheckRegistration, error) {
  1089  	chkReg := api.AgentCheckRegistration{
  1090  		ID:        checkID,
  1091  		Name:      check.Name,
  1092  		ServiceID: serviceID,
  1093  	}
  1094  	chkReg.Status = check.InitialStatus
  1095  	chkReg.Timeout = check.Timeout.String()
  1096  	chkReg.Interval = check.Interval.String()
  1097  
  1098  	// Require an address for http or tcp checks
  1099  	if port == 0 && check.RequiresPort() {
  1100  		return nil, fmt.Errorf("%s checks require an address", check.Type)
  1101  	}
  1102  
  1103  	switch check.Type {
  1104  	case structs.ServiceCheckHTTP:
  1105  		proto := check.Protocol
  1106  		if proto == "" {
  1107  			proto = "http"
  1108  		}
  1109  		if check.TLSSkipVerify {
  1110  			chkReg.TLSSkipVerify = true
  1111  		}
  1112  		base := url.URL{
  1113  			Scheme: proto,
  1114  			Host:   net.JoinHostPort(host, strconv.Itoa(port)),
  1115  		}
  1116  		relative, err := url.Parse(check.Path)
  1117  		if err != nil {
  1118  			return nil, err
  1119  		}
  1120  		url := base.ResolveReference(relative)
  1121  		chkReg.HTTP = url.String()
  1122  		chkReg.Method = check.Method
  1123  		chkReg.Header = check.Header
  1124  
  1125  	case structs.ServiceCheckTCP:
  1126  		chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port))
  1127  
  1128  	case structs.ServiceCheckScript:
  1129  		chkReg.TTL = (check.Interval + ttlCheckBuffer).String()
  1130  		// As of Consul 1.0.0 setting TTL and Interval is a 400
  1131  		chkReg.Interval = ""
  1132  
  1133  	case structs.ServiceCheckGRPC:
  1134  		chkReg.GRPC = fmt.Sprintf("%s/%s", net.JoinHostPort(host, strconv.Itoa(port)), check.GRPCService)
  1135  		chkReg.GRPCUseTLS = check.GRPCUseTLS
  1136  		if check.TLSSkipVerify {
  1137  			chkReg.TLSSkipVerify = true
  1138  		}
  1139  
  1140  	default:
  1141  		return nil, fmt.Errorf("check type %+q not valid", check.Type)
  1142  	}
  1143  	return &chkReg, nil
  1144  }
  1145  
  1146  // isNomadService returns true if the ID matches the pattern of a Nomad managed
  1147  // service (new or old formats). Agent services return false as independent
  1148  // client and server agents may be running on the same machine. #2827
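        //
        // For example, "_nomad-task-<hash>" and legacy "_nomad-executor-..." IDs
        // return true, while agent IDs such as "_nomad-server-<hash>" return false.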
  1149  func isNomadService(id string) bool {
  1150  	return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id)
  1151  }
  1152  
  1153  // isOldNomadService returns true if the ID matches an old pattern managed by
  1154  // Nomad.
  1155  //
  1156  // Pre-0.7.1 task service IDs are of the form:
  1157  //
  1158  //	{nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...}
  1159  //	Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3
  1160  //
  1161  func isOldNomadService(id string) bool {
  1162  	const prefix = nomadServicePrefix + "-executor"
  1163  	return strings.HasPrefix(id, prefix)
  1164  }
  1165  
   1166  // getAddress returns the IP and port to use for a service or check. If no port
   1167  // label is specified (an empty value), a zero port is returned along with the
   1168  // host or driver IP when one can be determined.
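        //
        // For example (illustrative values): in "host" mode a port label mapped to
        // host port 23456 on 10.0.0.1 yields ("10.0.0.1", 23456); in "driver" mode
        // the same label is looked up in the driver network's PortMap, so a mapping
        // of "http" -> 8080 yields the driver network's IP and 8080.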
  1169  func getAddress(addrMode, portLabel string, networks structs.Networks, driverNet *cstructs.DriverNetwork) (string, int, error) {
  1170  	switch addrMode {
  1171  	case structs.AddressModeAuto:
  1172  		if driverNet.Advertise() {
  1173  			addrMode = structs.AddressModeDriver
  1174  		} else {
  1175  			addrMode = structs.AddressModeHost
  1176  		}
  1177  		return getAddress(addrMode, portLabel, networks, driverNet)
  1178  	case structs.AddressModeHost:
  1179  		if portLabel == "" {
  1180  			if len(networks) != 1 {
  1181  				// If no networks are specified return zero
  1182  				// values. Consul will advertise the host IP
  1183  				// with no port. This is the pre-0.7.1 behavior
  1184  				// some people rely on.
  1185  				return "", 0, nil
  1186  			}
  1187  
  1188  			return networks[0].IP, 0, nil
  1189  		}
  1190  
  1191  		// Default path: use host ip:port
  1192  		ip, port := networks.Port(portLabel)
  1193  		if ip == "" && port <= 0 {
  1194  			return "", 0, fmt.Errorf("invalid port %q: port label not found", portLabel)
  1195  		}
  1196  		return ip, port, nil
  1197  
  1198  	case structs.AddressModeDriver:
  1199  		// Require a driver network if driver address mode is used
  1200  		if driverNet == nil {
  1201  			return "", 0, fmt.Errorf(`cannot use address_mode="driver": no driver network exists`)
  1202  		}
  1203  
  1204  		// If no port label is specified just return the IP
  1205  		if portLabel == "" {
  1206  			return driverNet.IP, 0, nil
  1207  		}
  1208  
  1209  		// If the port is a label, use the driver's port (not the host's)
  1210  		if port, ok := driverNet.PortMap[portLabel]; ok {
  1211  			return driverNet.IP, port, nil
  1212  		}
  1213  
  1214  		// If port isn't a label, try to parse it as a literal port number
  1215  		port, err := strconv.Atoi(portLabel)
  1216  		if err != nil {
   1217  			// Don't include the Atoi error message, as the
   1218  			// user likely never intended the label to be numeric
   1219  			// and it would make for a confusing error message
  1220  			return "", 0, fmt.Errorf("invalid port label %q: port labels in driver address_mode must be numeric or in the driver's port map", portLabel)
  1221  		}
  1222  		if port <= 0 {
  1223  			return "", 0, fmt.Errorf("invalid port: %q: port must be >0", portLabel)
  1224  		}
  1225  
  1226  		return driverNet.IP, port, nil
  1227  
  1228  	default:
  1229  		// Shouldn't happen due to validation, but enforce invariants
  1230  		return "", 0, fmt.Errorf("invalid address mode %q", addrMode)
  1231  	}
  1232  }