
     1  // Copyright 2015 The Prometheus Authors
     2  // Licensed under the Apache License, Version 2.0 (the "License");
     3  // you may not use this file except in compliance with the License.
     4  // You may obtain a copy of the License at
     5  //
// http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    14  package consul // revive:disable-line:import-shadowing package name is not referenced
    16  import (
    17  	"context"
    18  	"errors"
    19  	"fmt"
    20  	"net"
    21  	"strconv"
    22  	"strings"
    23  	"time"
    25  	consul ""
    26  	""
    27  	""
    28  	""
    29  	""
    30  	""
    31  	""
    32  	""
    33  )
const (
	// watchTimeout is the wait time passed to Consul blocking queries and the base for
	// the HTTP client timeouts; retryInterval is the pause taken after a failed Consul
	// call before the next attempt.
	watchTimeout  = 2 * time.Minute
	retryInterval = 15 * time.Second

	// addressLabel is the name for the label containing a target's address.
	addressLabel = model.MetaLabelPrefix + "consul_address"
	// nodeLabel is the name for the label containing a target's node name.
	nodeLabel = model.MetaLabelPrefix + "consul_node"
	// metaDataLabel is the prefix for the labels mapping to a target's metadata.
	metaDataLabel = model.MetaLabelPrefix + "consul_metadata_"
	// serviceMetaDataLabel is the prefix for the labels mapping to a target's service metadata.
	serviceMetaDataLabel = model.MetaLabelPrefix + "consul_service_metadata_"
	// tagsLabel is the name of the label containing the tags assigned to the target.
	tagsLabel = model.MetaLabelPrefix + "consul_tags"
	// serviceLabel is the name of the label containing the service name.
	serviceLabel = model.MetaLabelPrefix + "consul_service"
	// healthLabel is the name of the label containing the health of the service instance.
	healthLabel = model.MetaLabelPrefix + "consul_health"
	// serviceAddressLabel is the name of the label containing the (optional) service address.
	serviceAddressLabel = model.MetaLabelPrefix + "consul_service_address"
	// servicePortLabel is the name of the label containing the service port.
	servicePortLabel = model.MetaLabelPrefix + "consul_service_port"
	// datacenterLabel is the name of the label containing the datacenter ID.
	datacenterLabel = model.MetaLabelPrefix + "consul_dc"
	// namespaceLabel is the name of the label containing the namespace (Consul Enterprise only).
	namespaceLabel = model.MetaLabelPrefix + "consul_namespace"
	// partitionLabel is the name of the label containing the Admin Partition (Consul Enterprise only).
	partitionLabel = model.MetaLabelPrefix + "consul_partition"
	// taggedAddressesLabel is the prefix for the labels mapping to a target's tagged addresses.
	taggedAddressesLabel = model.MetaLabelPrefix + "consul_tagged_address_"
	// serviceIDLabel is the name of the label containing the service ID.
	serviceIDLabel = model.MetaLabelPrefix + "consul_service_id"

	// Constants for instrumentation.
	// namespace is used as the Namespace of the metrics declared in this file.
	namespace = "pyroscope"
)
var (
	// rpcFailuresCount counts failed Consul RPC calls; it is incremented by the
	// discovery code whenever a Consul API call returns an error.
	rpcFailuresCount = prometheus.NewCounter(
		prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "sd_consul_rpc_failures_total",
			Help:      "The number of Consul RPC call failures.",
		})
	// rpcDuration records how long Consul RPC calls take, partitioned by the
	// "endpoint" and "call" labels.
	rpcDuration = prometheus.NewSummaryVec(
		prometheus.SummaryOpts{
			Namespace:  namespace,
			Name:       "sd_consul_rpc_duration_seconds",
			Help:       "The duration of a Consul RPC call in seconds.",
			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
		},
		[]string{"endpoint", "call"},
	)

	// Initialize metric vectors.
	// servicesRPCDuration observes catalog "services" calls and serviceRPCDuration
	// observes per-service calls; both are children of rpcDuration.
	servicesRPCDuration = rpcDuration.WithLabelValues("catalog", "services")
	serviceRPCDuration  = rpcDuration.WithLabelValues("catalog", "service")

	// DefaultSDConfig is the default Consul SD configuration.
	DefaultSDConfig = SDConfig{
		TagSeparator:     ",",
		Scheme:           "http",
		Server:           "localhost:8500",
		AllowStale:       true,
		RefreshInterval:  model.Duration(30 * time.Second),
		HTTPClientConfig: config.DefaultHTTPClientConfig,
	}
)
   104  func init() {
   105  	discovery.RegisterConfig(&SDConfig{})
   106  	prometheus.MustRegister(rpcFailuresCount, rpcDuration)
   107  }
// SDConfig is the configuration for Consul service discovery.
//
// Defaults for omitted fields come from DefaultSDConfig, which UnmarshalYAML
// applies before decoding.
type SDConfig struct {
	Server       string        `yaml:"server,omitempty"`
	Token        config.Secret `yaml:"token,omitempty"`
	Datacenter   string        `yaml:"datacenter,omitempty"`
	Namespace    string        `yaml:"namespace,omitempty"`
	Partition    string        `yaml:"partition,omitempty"`
	TagSeparator string        `yaml:"tag-separator,omitempty"`
	Scheme       string        `yaml:"scheme,omitempty"`
	Username     string        `yaml:"username,omitempty"`
	Password     config.Secret `yaml:"password,omitempty"`

	// AllowStale enables stale reads on Consul queries (see the Consul documentation
	// on consistency modes). Stale reads are a lot cheaper and are a necessity if you
	// have >5k targets.
	AllowStale bool `yaml:"allow-stale"`
	// RefreshInterval is the minimum time between two consecutive queries. By default
	// blocking queries are used, but users may throttle updates if necessary. This can
	// be useful when Consul-side issues would otherwise cause an unnecessary amount of
	// requests on Consul.
	RefreshInterval model.Duration `yaml:"refresh-interval,omitempty"`

	// The list of services for which targets are discovered.
	// Defaults to all services if empty.
	Services []string `yaml:"services,omitempty"`
	// A list of tags used to filter instances inside a service. Services must contain all tags in the list.
	ServiceTags []string `yaml:"tags,omitempty"`
	// Desired node metadata.
	NodeMeta map[string]string `yaml:"node-meta,omitempty"`

	HTTPClientConfig config.HTTPClientConfig `yaml:",inline"`
}
   142  // Name returns the name of the Config.
   143  func (*SDConfig) Name() string { return "consul" }
   145  // NewDiscoverer returns a Discoverer for the Config.
   146  func (c *SDConfig) NewDiscoverer(opts discovery.DiscovererOptions) (discovery.Discoverer, error) {
   147  	return NewDiscovery(c, opts.Logger)
   148  }
   150  // SetDirectory joins any relative file paths with dir.
   151  func (c *SDConfig) SetDirectory(dir string) {
   152  	c.HTTPClientConfig.SetDirectory(dir)
   153  }
   155  // UnmarshalYAML implements the yaml.Unmarshaler interface.
   156  func (c *SDConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
   157  	*c = DefaultSDConfig
   158  	type plain SDConfig
   159  	err := unmarshal((*plain)(c))
   160  	if err != nil {
   161  		return err
   162  	}
   163  	if strings.TrimSpace(c.Server) == "" {
   164  		return errors.New("consul SD configuration requires a server address")
   165  	}
   166  	if c.Username != "" || c.Password != "" {
   167  		if c.HTTPClientConfig.BasicAuth != nil {
   168  			return errors.New("at most one of consul SD configuration username and password and basic auth can be configured")
   169  		}
   170  		c.HTTPClientConfig.BasicAuth = &config.BasicAuth{
   171  			Username: c.Username,
   172  			Password: c.Password,
   173  		}
   174  	}
   175  	if c.Token != "" && (c.HTTPClientConfig.Authorization != nil || c.HTTPClientConfig.OAuth2 != nil) {
   176  		return errors.New("at most one of consul SD token, authorization, or oauth2 can be configured")
   177  	}
   178  	return c.HTTPClientConfig.Validate()
   179  }
// Discovery retrieves target information from a Consul server
// and updates them via watches.
//
// Instances are built by NewDiscovery from an SDConfig. clientDatacenter starts
// as the configured datacenter and, when empty, is filled in from the local
// Consul agent by getDatacenter. refreshInterval is the period of the tickers
// that pace successive queries, and finalizer (if non-nil) is invoked when Run
// returns.
type Discovery struct {
	client           *consul.Client
	clientDatacenter string
	clientNamespace  string
	clientPartition  string
	tagSeparator     string
	watchedServices  []string // Set of services which will be discovered.
	watchedTags      []string // Tags used to filter instances of a service.
	watchedNodeMeta  map[string]string
	allowStale       bool
	refreshInterval  time.Duration
	finalizer        func()
	logger           logrus.FieldLogger
}
   198  // NewDiscovery returns a new Discovery for the given config.
   199  func NewDiscovery(conf *SDConfig, logger logrus.FieldLogger) (*Discovery, error) {
   200  	wrapper, err := config.NewClientFromConfig(conf.HTTPClientConfig, "consul_sd", config.WithIdleConnTimeout(2*watchTimeout))
   201  	if err != nil {
   202  		return nil, err
   203  	}
   204  	wrapper.Timeout = watchTimeout + 15*time.Second
   206  	clientConf := &consul.Config{
   207  		Address:    conf.Server,
   208  		Scheme:     conf.Scheme,
   209  		Datacenter: conf.Datacenter,
   210  		Namespace:  conf.Namespace,
   211  		Partition:  conf.Partition,
   212  		Token:      string(conf.Token),
   213  		HttpClient: wrapper,
   214  	}
   215  	client, err := consul.NewClient(clientConf)
   216  	if err != nil {
   217  		return nil, err
   218  	}
   219  	cd := &Discovery{
   220  		client:           client,
   221  		tagSeparator:     conf.TagSeparator,
   222  		watchedServices:  conf.Services,
   223  		watchedTags:      conf.ServiceTags,
   224  		watchedNodeMeta:  conf.NodeMeta,
   225  		allowStale:       conf.AllowStale,
   226  		refreshInterval:  time.Duration(conf.RefreshInterval),
   227  		clientDatacenter: conf.Datacenter,
   228  		clientNamespace:  conf.Namespace,
   229  		clientPartition:  conf.Partition,
   230  		finalizer:        wrapper.CloseIdleConnections,
   231  		logger:           logger,
   232  	}
   233  	return cd, nil
   234  }
   236  // shouldWatch returns whether the service of the given name should be watched.
   237  func (d *Discovery) shouldWatch(name string, tags []string) bool {
   238  	return d.shouldWatchFromName(name) && d.shouldWatchFromTags(tags)
   239  }
   241  // shouldWatch returns whether the service of the given name should be watched based on its name.
   242  func (d *Discovery) shouldWatchFromName(name string) bool {
   243  	// If there's no fixed set of watched services, we watch everything.
   244  	if len(d.watchedServices) == 0 {
   245  		return true
   246  	}
   248  	for _, sn := range d.watchedServices {
   249  		if sn == name {
   250  			return true
   251  		}
   252  	}
   253  	return false
   254  }
   256  // shouldWatch returns whether the service of the given name should be watched based on its tags.
   257  // This gets called when the user doesn't specify a list of services in order to avoid watching
   258  // *all* services. Details in
   259  func (d *Discovery) shouldWatchFromTags(tags []string) bool {
   260  	// If there's no fixed set of watched tags, we watch everything.
   261  	if len(d.watchedTags) == 0 {
   262  		return true
   263  	}
   265  tagOuter:
   266  	for _, wtag := range d.watchedTags {
   267  		for _, tag := range tags {
   268  			if wtag == tag {
   269  				continue tagOuter
   270  			}
   271  		}
   272  		return false
   273  	}
   274  	return true
   275  }
   277  // Get the local datacenter if not specified.
   278  func (d *Discovery) getDatacenter() error {
   279  	// If the datacenter was not set from clientConf, let's get it from the local Consul agent
   280  	// (Consul default is to use local node's datacenter if one isn't given for a query).
   281  	if d.clientDatacenter != "" {
   282  		return nil
   283  	}
   285  	info, err := d.client.Agent().Self()
   286  	if err != nil {
   287  		d.logger.WithError(err).Error("error retrieving datacenter name")
   288  		rpcFailuresCount.Inc()
   289  		return err
   290  	}
   292  	dc, ok := info["Config"]["Datacenter"].(string)
   293  	if !ok {
   294  		err := fmt.Errorf("invalid value '%v' for Config.Datacenter", info["Config"]["Datacenter"])
   295  		d.logger.WithError(err).Error("error retrieving datacenter name")
   296  		return err
   297  	}
   299  	d.clientDatacenter = dc
   300  	d.logger = logrus.WithField("datacenter", dc)
   301  	return nil
   302  }
   304  // Initialize the Discoverer run.
   305  func (d *Discovery) initialize(ctx context.Context) {
   306  	// Loop until we manage to get the local datacenter.
   307  	for {
   308  		// We have to check the context at least once. The checks during channel sends
   309  		// do not guarantee that.
   310  		select {
   311  		case <-ctx.Done():
   312  			return
   313  		default:
   314  		}
   316  		// Get the local datacenter first, if necessary.
   317  		err := d.getDatacenter()
   318  		if err != nil {
   319  			time.Sleep(retryInterval)
   320  			continue
   321  		}
   322  		// We are good to go.
   323  		return
   324  	}
   325  }
   327  // Run implements the Discoverer interface.
   328  func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
   329  	if d.finalizer != nil {
   330  		defer d.finalizer()
   331  	}
   332  	d.initialize(ctx)
   334  	if len(d.watchedServices) == 0 || len(d.watchedTags) != 0 {
   335  		// We need to watch the catalog.
   336  		ticker := time.NewTicker(d.refreshInterval)
   338  		// Watched services and their cancellation functions.
   339  		services := make(map[string]func())
   340  		var lastIndex uint64
   342  		for {
   343  			select {
   344  			case <-ctx.Done():
   345  				ticker.Stop()
   346  				return
   347  			default:
   348  				d.watchServices(ctx, ch, &lastIndex, services)
   349  				<-ticker.C
   350  			}
   351  		}
   352  	} else {
   353  		// We only have fully defined services.
   354  		for _, name := range d.watchedServices {
   355  			d.watchService(ctx, ch, name)
   356  		}
   357  		<-ctx.Done()
   358  	}
   359  }
   361  // Watch the catalog for new services we would like to watch. This is called only
   362  // when we don't know yet the names of the services and need to ask Consul the
   363  // entire list of services.
   364  func (d *Discovery) watchServices(ctx context.Context, ch chan<- []*targetgroup.Group, lastIndex *uint64, services map[string]func()) {
   365  	catalog := d.client.Catalog()
   366  	d.logger.WithField("tags", strings.Join(d.watchedTags, ",")).Debug("watching services")
   368  	opts := &consul.QueryOptions{
   369  		WaitIndex:  *lastIndex,
   370  		WaitTime:   watchTimeout,
   371  		AllowStale: d.allowStale,
   372  		NodeMeta:   d.watchedNodeMeta,
   373  	}
   374  	t0 := time.Now()
   375  	srvs, meta, err := catalog.Services(opts.WithContext(ctx))
   376  	elapsed := time.Since(t0)
   377  	servicesRPCDuration.Observe(elapsed.Seconds())
   379  	// Check the context before in order to exit early.
   380  	select {
   381  	case <-ctx.Done():
   382  		return
   383  	default:
   384  	}
   386  	if err != nil {
   387  		d.logger.WithError(err).Error("error refreshing service list")
   388  		rpcFailuresCount.Inc()
   389  		time.Sleep(retryInterval)
   390  		return
   391  	}
   392  	// If the index equals the previous one, the watch timed out with no update.
   393  	if meta.LastIndex == *lastIndex {
   394  		return
   395  	}
   396  	*lastIndex = meta.LastIndex
   398  	// Check for new services.
   399  	for name := range srvs {
   400  		// catalog.Service() returns a map of service name to tags, we can use that to watch
   401  		// only the services that have the tag we are looking for (if specified).
   402  		// In the future consul will also support server side for service metadata.
   403  		//
   404  		if !d.shouldWatch(name, srvs[name]) {
   405  			continue
   406  		}
   407  		if _, ok := services[name]; ok {
   408  			continue // We are already watching the service.
   409  		}
   411  		wctx, cancel := context.WithCancel(ctx)
   412  		d.watchService(wctx, ch, name)
   413  		services[name] = cancel
   414  	}
   416  	// Check for removed services.
   417  	for name, cancel := range services {
   418  		if _, ok := srvs[name]; !ok {
   419  			// Call the watch cancellation function.
   420  			cancel()
   421  			delete(services, name)
   423  			// Send clearing target group.
   424  			select {
   425  			case <-ctx.Done():
   426  				return
   427  			case ch <- []*targetgroup.Group{{Source: name}}:
   428  			}
   429  		}
   430  	}
   432  	// Send targetgroup with no targets if nothing was discovered.
   433  	if len(services) == 0 {
   434  		select {
   435  		case <-ctx.Done():
   436  			return
   437  		case ch <- []*targetgroup.Group{{}}:
   438  		}
   439  	}
   440  }
// consulService contains data belonging to the same service.
//
// One value is created per watched service by Discovery.watchService: name is
// the service name, tags are the discovery's watched tags, labels holds the
// service and datacenter labels attached to the emitted target group, and
// discovery, client, tagSeparator and logger are taken from the owning
// Discovery.
type consulService struct {
	name         string
	tags         []string
	labels       model.LabelSet
	discovery    *Discovery
	client       *consul.Client
	tagSeparator string
	logger       logrus.FieldLogger
}
   453  // Start watching a service.
   454  func (d *Discovery) watchService(ctx context.Context, ch chan<- []*targetgroup.Group, name string) {
   455  	srv := &consulService{
   456  		discovery: d,
   457  		client:    d.client,
   458  		name:      name,
   459  		tags:      d.watchedTags,
   460  		labels: model.LabelSet{
   461  			serviceLabel:    model.LabelValue(name),
   462  			datacenterLabel: model.LabelValue(d.clientDatacenter),
   463  		},
   464  		tagSeparator: d.tagSeparator,
   465  		logger:       d.logger,
   466  	}
   468  	go func() {
   469  		ticker := time.NewTicker(d.refreshInterval)
   470  		defer ticker.Stop()
   471  		var lastIndex uint64
   472  		health := srv.client.Health()
   473  		for {
   474  			select {
   475  			case <-ctx.Done():
   476  				return
   477  			default:
   478, ch, health, &lastIndex)
   479  				select {
   480  				case <-ticker.C:
   481  				case <-ctx.Done():
   482  					return
   483  				}
   484  			}
   485  		}
   486  	}()
   487  }
   489  // Get updates for a service.
   490  func (srv *consulService) watch(ctx context.Context, ch chan<- []*targetgroup.Group, health *consul.Health, lastIndex *uint64) {
   491  	srv.logger.WithField("service","tags", strings.Join(srv.tags, ",")).Debug("watching service")
   493  	opts := &consul.QueryOptions{
   494  		WaitIndex:  *lastIndex,
   495  		WaitTime:   watchTimeout,
   496  		AllowStale: srv.discovery.allowStale,
   497  		NodeMeta:   srv.discovery.watchedNodeMeta,
   498  	}
   500  	t0 := time.Now()
   501  	serviceNodes, meta, err := health.ServiceMultipleTags(, srv.tags, false, opts.WithContext(ctx))
   502  	elapsed := time.Since(t0)
   503  	serviceRPCDuration.Observe(elapsed.Seconds())
   505  	// Check the context before in order to exit early.
   506  	select {
   507  	case <-ctx.Done():
   508  		return
   509  	default:
   510  		// Continue.
   511  	}
   513  	if err != nil {
   514  		srv.logger.WithError(err).WithField("service","tags", strings.Join(srv.tags, ",")).Error("error refreshing service")
   515  		rpcFailuresCount.Inc()
   516  		time.Sleep(retryInterval)
   517  		return
   518  	}
   519  	// If the index equals the previous one, the watch timed out with no update.
   520  	if meta.LastIndex == *lastIndex {
   521  		return
   522  	}
   523  	*lastIndex = meta.LastIndex
   525  	tgroup := targetgroup.Group{
   526  		Source:,
   527  		Labels:  srv.labels,
   528  		Targets: make([]model.LabelSet, 0, len(serviceNodes)),
   529  	}
   531  	for _, serviceNode := range serviceNodes {
   532  		// We surround the separated list with the separator as well. This way regular expressions
   533  		// in relabeling rules don't have to consider tag positions.
   534  		tags := srv.tagSeparator + strings.Join(serviceNode.Service.Tags, srv.tagSeparator) + srv.tagSeparator
   536  		// If the service address is not empty it should be used instead of the node address
   537  		// since the service may be registered remotely through a different node.
   538  		var addr string
   539  		if serviceNode.Service.Address != "" {
   540  			addr = net.JoinHostPort(serviceNode.Service.Address, fmt.Sprintf("%d", serviceNode.Service.Port))
   541  		} else {
   542  			addr = net.JoinHostPort(serviceNode.Node.Address, fmt.Sprintf("%d", serviceNode.Service.Port))
   543  		}
   545  		labels := model.LabelSet{
   546  			model.AddressLabel:  model.LabelValue(addr),
   547  			addressLabel:        model.LabelValue(serviceNode.Node.Address),
   548  			nodeLabel:           model.LabelValue(serviceNode.Node.Node),
   549  			namespaceLabel:      model.LabelValue(serviceNode.Service.Namespace),
   550  			partitionLabel:      model.LabelValue(serviceNode.Service.Partition),
   551  			tagsLabel:           model.LabelValue(tags),
   552  			serviceAddressLabel: model.LabelValue(serviceNode.Service.Address),
   553  			servicePortLabel:    model.LabelValue(strconv.Itoa(serviceNode.Service.Port)),
   554  			serviceIDLabel:      model.LabelValue(serviceNode.Service.ID),
   555  			healthLabel:         model.LabelValue(serviceNode.Checks.AggregatedStatus()),
   556  		}
   558  		// Add all key/value pairs from the node's metadata as their own labels.
   559  		for k, v := range serviceNode.Node.Meta {
   560  			name := strutil.SanitizeLabelName(k)
   561  			labels[metaDataLabel+model.LabelName(name)] = model.LabelValue(v)
   562  		}
   564  		// Add all key/value pairs from the service's metadata as their own labels.
   565  		for k, v := range serviceNode.Service.Meta {
   566  			name := strutil.SanitizeLabelName(k)
   567  			labels[serviceMetaDataLabel+model.LabelName(name)] = model.LabelValue(v)
   568  		}
   570  		// Add all key/value pairs from the service's tagged addresses as their own labels.
   571  		for k, v := range serviceNode.Node.TaggedAddresses {
   572  			name := strutil.SanitizeLabelName(k)
   573  			labels[taggedAddressesLabel+model.LabelName(name)] = model.LabelValue(v)
   574  		}
   576  		tgroup.Targets = append(tgroup.Targets, labels)
   577  	}
   579  	select {
   580  	case <-ctx.Done():
   581  	case ch <- []*targetgroup.Group{&tgroup}:
   582  	}
   583  }