github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/clients/pkg/promtail/discovery/consulagent/consul.go (about)

     1  // This code was adapted from the consul service discovery
     2  // package in prometheus: https://github.com/prometheus/prometheus/blob/main/discovery/consul/consul.go
     3  // which is copyrighted: 2015 The Prometheus Authors
     4  // and licensed under the Apache License, Version 2.0 (the "License");
     5  
     6  package consulagent
     7  
     8  import (
     9  	"context"
    10  	"encoding/json"
    11  	"fmt"
    12  	"net"
    13  	"net/http"
    14  	"strconv"
    15  	"strings"
    16  	"time"
    17  
    18  	"github.com/go-kit/log"
    19  	"github.com/go-kit/log/level"
    20  	consul "github.com/hashicorp/consul/api"
    21  	conntrack "github.com/mwitkow/go-conntrack"
    22  	"github.com/pkg/errors"
    23  	"github.com/prometheus/client_golang/prometheus"
    24  	"github.com/prometheus/common/config"
    25  	"github.com/prometheus/common/model"
    26  
    27  	"github.com/prometheus/prometheus/discovery"
    28  	"github.com/prometheus/prometheus/discovery/targetgroup"
    29  	"github.com/prometheus/prometheus/util/strutil"
    30  )
    31  
    32  const (
        	// watchTimeout is the base value from which NewDiscovery derives the HTTP
        	// transport's idle-connection timeout and the HTTP client's request timeout.
        	// retryInterval is how long the discovery loops pause after a failed agent call.
    33  	watchTimeout  = 2 * time.Minute
    34  	retryInterval = 15 * time.Second
    35  
    36  	// addressLabel is the name for the label containing a target's address.
    37  	addressLabel = model.MetaLabelPrefix + "consulagent_address"
    38  	// nodeLabel is the name for the label containing a target's node name.
    39  	nodeLabel = model.MetaLabelPrefix + "consulagent_node"
    40  	// metaDataLabel is the prefix for the labels mapping to a target's metadata.
    41  	metaDataLabel = model.MetaLabelPrefix + "consulagent_metadata_"
    42  	// serviceMetaDataLabel is the prefix for the labels mapping to a target's service metadata.
    43  	serviceMetaDataLabel = model.MetaLabelPrefix + "consulagent_service_metadata_"
    44  	// tagsLabel is the name of the label containing the tags assigned to the target.
    45  	tagsLabel = model.MetaLabelPrefix + "consulagent_tags"
    46  	// serviceLabel is the name of the label containing the service name.
    47  	serviceLabel = model.MetaLabelPrefix + "consulagent_service"
    48  	// healthLabel is the name of the label containing the aggregated health status
    49  	// reported by the agent for the service.
    49  	healthLabel = model.MetaLabelPrefix + "consulagent_health"
    50  	// serviceAddressLabel is the name of the label containing the (optional) service address.
    51  	serviceAddressLabel = model.MetaLabelPrefix + "consulagent_service_address"
    52  	// servicePortLabel is the name of the label containing the service port.
    53  	servicePortLabel = model.MetaLabelPrefix + "consulagent_service_port"
    54  	// datacenterLabel is the name of the label containing the datacenter ID.
    55  	datacenterLabel = model.MetaLabelPrefix + "consulagent_dc"
    56  	// taggedAddressesLabel is the prefix for the labels mapping to a target's tagged addresses.
    57  	taggedAddressesLabel = model.MetaLabelPrefix + "consulagent_tagged_address_"
    58  	// serviceIDLabel is the name of the label containing the service ID.
    59  	serviceIDLabel = model.MetaLabelPrefix + "consulagent_service_id"
    60  
    61  	// Constants for instrumentation.
        	// namespace is used as the Namespace of every metric declared in this file.
    62  	namespace = "prometheus"
    63  )
    64  
    65  var (
        	// rpcFailuresCount counts failed calls to the Consul agent API.
    66  	rpcFailuresCount = prometheus.NewCounter(
    67  		prometheus.CounterOpts{
    68  			Namespace: namespace,
    69  			Name:      "sd_consulagent_rpc_failures_total",
    70  			Help:      "The number of Consul Agent RPC call failures.",
    71  		})
        	// rpcDuration records how long agent API calls take, partitioned by the
        	// "endpoint" and "call" labels.
    72  	rpcDuration = prometheus.NewSummaryVec(
    73  		prometheus.SummaryOpts{
    74  			Namespace:  namespace,
    75  			Name:       "sd_consulagent_rpc_duration_seconds",
    76  			Help:       "The duration of a Consul Agent RPC call in seconds.",
    77  			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
    78  		},
    79  		[]string{"endpoint", "call"},
    80  	)
    81  
    82  	// Initialize metric vectors.
        	// servicesRPCDuration observes the "list services" call made by watchServices;
        	// serviceRPCDuration observes the per-service health call made by consulService.watch.
    83  	servicesRPCDuration = rpcDuration.WithLabelValues("agent", "services")
    84  	serviceRPCDuration  = rpcDuration.WithLabelValues("agent", "service")
    85  
    86  	// DefaultSDConfig is the default Consul Agent SD configuration. UnmarshalYAML
        	// starts from a copy of it before applying user-supplied values.
    87  	DefaultSDConfig = SDConfig{
    88  		TagSeparator:    ",",
    89  		Scheme:          "http",
    90  		Server:          "localhost:8500",
    91  		AllowStale:      true,
    92  		RefreshInterval: model.Duration(30 * time.Second),
    93  	}
    94  )
    95  
    96  func init() {
    97  	discovery.RegisterConfig(&SDConfig{})
    98  	prometheus.MustRegister(rpcFailuresCount)
    99  	prometheus.MustRegister(rpcDuration)
   100  }
   101  
   102  // SDConfig is the configuration for Consul Agent service discovery.
   103  type SDConfig struct {
        	// Server is the address of the Consul agent; it must not be blank.
   104  	Server       string        `yaml:"server,omitempty"`
        	// Token is passed to the Consul client as its access token.
   105  	Token        config.Secret `yaml:"token,omitempty"`
        	// Datacenter is passed to the Consul client; when empty, the discovery
        	// asks the agent for its own datacenter at start-up.
   106  	Datacenter   string        `yaml:"datacenter,omitempty"`
        	// TagSeparator joins (and surrounds) a service's tags in the tags label.
   107  	TagSeparator string        `yaml:"tag_separator,omitempty"`
        	// Scheme is the URI scheme passed to the Consul client.
   108  	Scheme       string        `yaml:"scheme,omitempty"`
        	// Username and Password are passed to the Consul client as HTTP basic auth credentials.
   109  	Username     string        `yaml:"username,omitempty"`
   110  	Password     config.Secret `yaml:"password,omitempty"`
   111  
   112  	// See https://www.consul.io/docs/internals/consensus.html#consistency-modes,
   113  	// stale reads are a lot cheaper and are a necessity if you have >5k targets.
        	// NOTE(review): the value is copied into Discovery but no code in this file reads it.
   114  	AllowStale bool `yaml:"allow_stale"`
   115  	// By default use blocking queries (https://www.consul.io/api/index.html#blocking-queries)
   116  	// but allow users to throttle updates if necessary. This can be useful because of "bugs" like
   117  	// https://github.com/hashicorp/consul/issues/3712 which cause an un-necessary
   118  	// amount of requests on consul.
        	// In this package the value is the period of the tickers that drive polling.
   119  	RefreshInterval model.Duration `yaml:"refresh_interval,omitempty"`
   120  
   121  	// See https://www.consul.io/api/catalog.html#list-services
   122  	// The list of services for which targets are discovered.
   123  	// Defaults to all services if empty.
   124  	Services []string `yaml:"services,omitempty"`
   125  	// A list of tags used to filter instances inside a service. Services must contain all tags in the list.
   126  	ServiceTags []string `yaml:"tags,omitempty"`
   127  	// Desired node metadata.
        	// NOTE(review): the value is copied into Discovery but no code in this file reads it.
   128  	NodeMeta map[string]string `yaml:"node_meta,omitempty"`
   129  
        	// TLSConfig configures TLS for the HTTP client used to reach the agent.
   130  	TLSConfig config.TLSConfig `yaml:"tls_config,omitempty"`
   131  }
   132  
   133  // Name returns the name of the Config.
   134  func (*SDConfig) Name() string { return "consulagent" }
   135  
   136  // NewDiscoverer returns a Discoverer for the Config.
   137  func (c *SDConfig) NewDiscoverer(opts discovery.DiscovererOptions) (discovery.Discoverer, error) {
   138  	return NewDiscovery(c, opts.Logger)
   139  }
   140  
   141  // SetDirectory joins any relative file paths with dir.
   142  func (c *SDConfig) SetDirectory(dir string) {
   143  	c.TLSConfig.SetDirectory(dir)
   144  }
   145  
   146  // UnmarshalYAML implements the yaml.Unmarshaler interface.
   147  func (c *SDConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
   148  	*c = DefaultSDConfig
   149  	type plain SDConfig
   150  	err := unmarshal((*plain)(c))
   151  	if err != nil {
   152  		return err
   153  	}
   154  	if strings.TrimSpace(c.Server) == "" {
   155  		return errors.New("consulagent SD configuration requires a server address")
   156  	}
   157  	return nil
   158  }
   159  
   160  // Discovery retrieves target information from a Consul agent
   161  // and refreshes it periodically, polling every refreshInterval.
   162  type Discovery struct {
        	// client is the Consul API client used for all agent calls.
   163  	client           *consul.Client
        	// clientDatacenter is the configured datacenter, or the agent's own
        	// datacenter once getDatacenter has resolved it.
   164  	clientDatacenter string
        	// tagSeparator joins and surrounds service tags in the tags label.
   165  	tagSeparator     string
   166  	watchedServices  []string // Set of services which will be discovered.
   167  	watchedTags      []string // Tags used to filter instances of a service.
        	// NOTE(review): watchedNodeMeta and allowStale are set by NewDiscovery but
        	// are not read anywhere in this file.
   168  	watchedNodeMeta  map[string]string
   169  	allowStale       bool
        	// refreshInterval is the period of the polling tickers.
   170  	refreshInterval  time.Duration
        	// finalizer, when non-nil, is run when Run returns.
   171  	finalizer        func()
   172  	logger           log.Logger
   173  }
   174  
   175  // NewDiscovery returns a new Discovery for the given config.
   176  func NewDiscovery(conf *SDConfig, logger log.Logger) (*Discovery, error) {
   177  	if logger == nil {
   178  		logger = log.NewNopLogger()
   179  	}
   180  
   181  	tls, err := config.NewTLSConfig(&conf.TLSConfig)
   182  	if err != nil {
   183  		return nil, err
   184  	}
   185  	transport := &http.Transport{
   186  		IdleConnTimeout: 2 * watchTimeout,
   187  		TLSClientConfig: tls,
   188  		DialContext: conntrack.NewDialContextFunc(
   189  			conntrack.DialWithTracing(),
   190  			conntrack.DialWithName("consulagent_sd"),
   191  		),
   192  	}
   193  	wrapper := &http.Client{
   194  		Transport: transport,
   195  		Timeout:   watchTimeout + 15*time.Second,
   196  	}
   197  
   198  	clientConf := &consul.Config{
   199  		Address:    conf.Server,
   200  		Scheme:     conf.Scheme,
   201  		Datacenter: conf.Datacenter,
   202  		Token:      string(conf.Token),
   203  		HttpAuth: &consul.HttpBasicAuth{
   204  			Username: conf.Username,
   205  			Password: string(conf.Password),
   206  		},
   207  		HttpClient: wrapper,
   208  	}
   209  	client, err := consul.NewClient(clientConf)
   210  	if err != nil {
   211  		return nil, err
   212  	}
   213  	cd := &Discovery{
   214  		client:           client,
   215  		tagSeparator:     conf.TagSeparator,
   216  		watchedServices:  conf.Services,
   217  		watchedTags:      conf.ServiceTags,
   218  		watchedNodeMeta:  conf.NodeMeta,
   219  		allowStale:       conf.AllowStale,
   220  		refreshInterval:  time.Duration(conf.RefreshInterval),
   221  		clientDatacenter: conf.Datacenter,
   222  		finalizer:        transport.CloseIdleConnections,
   223  		logger:           logger,
   224  	}
   225  	return cd, nil
   226  }
   227  
   228  // shouldWatch returns whether the service of the given name should be watched.
   229  func (d *Discovery) shouldWatch(name string, tags []string) bool {
   230  	return d.shouldWatchFromName(name) && d.shouldWatchFromTags(tags)
   231  }
   232  
   233  // shouldWatch returns whether the service of the given name should be watched based on its name.
   234  func (d *Discovery) shouldWatchFromName(name string) bool {
   235  	// If there's no fixed set of watched services, we watch everything.
   236  	if len(d.watchedServices) == 0 {
   237  		return true
   238  	}
   239  
   240  	for _, sn := range d.watchedServices {
   241  		if sn == name {
   242  			return true
   243  		}
   244  	}
   245  	return false
   246  }
   247  
   248  // shouldWatch returns whether the service of the given name should be watched based on its tags.
   249  // This gets called when the user doesn't specify a list of services in order to avoid watching
   250  // *all* services. Details in https://github.com/prometheus/prometheus/pull/3814
   251  func (d *Discovery) shouldWatchFromTags(tags []string) bool {
   252  	// If there's no fixed set of watched tags, we watch everything.
   253  	if len(d.watchedTags) == 0 {
   254  		return true
   255  	}
   256  
   257  tagOuter:
   258  	for _, wtag := range d.watchedTags {
   259  		for _, tag := range tags {
   260  			if wtag == tag {
   261  				continue tagOuter
   262  			}
   263  		}
   264  		return false
   265  	}
   266  	return true
   267  }
   268  
   269  // Get the local datacenter if not specified.
   270  func (d *Discovery) getDatacenter() error {
   271  	// If the datacenter was not set from clientConf, let's get it from the local Consul agent
   272  	// (Consul default is to use local node's datacenter if one isn't given for a query).
   273  	if d.clientDatacenter != "" {
   274  		return nil
   275  	}
   276  	info, err := d.client.Agent().Self()
   277  	if err != nil {
   278  		level.Error(d.logger).Log("msg", "Error retrieving datacenter name", "err", err)
   279  		rpcFailuresCount.Inc()
   280  		return err
   281  	}
   282  
   283  	dc, ok := info["Config"]["Datacenter"].(string)
   284  	if !ok {
   285  		err := errors.Errorf("invalid value '%v' for Config.Datacenter", info["Config"]["Datacenter"])
   286  		level.Error(d.logger).Log("msg", "Error retrieving datacenter name", "err", err)
   287  		return err
   288  	}
   289  
   290  	d.clientDatacenter = dc
   291  	return nil
   292  }
   293  
   294  // Initialize the Discoverer run.
   295  func (d *Discovery) initialize(ctx context.Context) {
   296  	// Loop until we manage to get the local datacenter.
   297  	for {
   298  		// We have to check the context at least once. The checks during channel sends
   299  		// do not guarantee that.
   300  		select {
   301  		case <-ctx.Done():
   302  			return
   303  		default:
   304  		}
   305  
   306  		// Get the local datacenter first, if necessary.
   307  		err := d.getDatacenter()
   308  		if err != nil {
   309  			time.Sleep(retryInterval)
   310  			continue
   311  		}
   312  		// We are good to go.
   313  		return
   314  	}
   315  }
   316  
   317  // Run implements the Discoverer interface.
   318  func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
   319  	if d.finalizer != nil {
   320  		defer d.finalizer()
   321  	}
   322  	d.initialize(ctx)
   323  
   324  	if len(d.watchedServices) == 0 || len(d.watchedTags) != 0 {
   325  		// We need to watch the agent.
   326  		ticker := time.NewTicker(d.refreshInterval)
   327  
   328  		// Watched services and their cancellation functions.
   329  		services := make(map[string]func())
   330  
   331  		for {
   332  			select {
   333  			case <-ctx.Done():
   334  				ticker.Stop()
   335  				return
   336  			default:
   337  				d.watchServices(ctx, ch, services)
   338  				<-ticker.C
   339  			}
   340  		}
   341  	} else {
   342  		// We only have fully defined services.
   343  		for _, name := range d.watchedServices {
   344  			d.watchService(ctx, ch, name)
   345  		}
   346  		<-ctx.Done()
   347  	}
   348  }
   349  
   350  // Watch the catalog for new services we would like to watch. This is called only
   351  // when we don't know yet the names of the services and need to ask Consul the
   352  // entire list of services.
   353  func (d *Discovery) watchServices(ctx context.Context, ch chan<- []*targetgroup.Group, services map[string]func()) {
   354  	agent := d.client.Agent()
   355  	level.Debug(d.logger).Log("msg", "Watching services", "tags", strings.Join(d.watchedTags, ","))
   356  
   357  	t0 := time.Now()
   358  	srvs, err := agent.Services()
   359  	elapsed := time.Since(t0)
   360  	servicesRPCDuration.Observe(elapsed.Seconds())
   361  
   362  	// Check the context before in order to exit early.
   363  	select {
   364  	case <-ctx.Done():
   365  		return
   366  	default:
   367  	}
   368  
   369  	if err != nil {
   370  		level.Error(d.logger).Log("msg", "Error refreshing service list", "err", err)
   371  		rpcFailuresCount.Inc()
   372  		time.Sleep(retryInterval)
   373  		return
   374  	}
   375  
   376  	discoveredServices := make(map[string]*consul.AgentService)
   377  	for _, srv := range srvs {
   378  		name := srv.Service
   379  		discoveredServices[name] = srv
   380  
   381  		// use service name and tags to only watch
   382  		// the services that have the tag we are looking for (if specified).
   383  		// When no tags have been specified this will return true.
   384  		if !d.shouldWatch(name, srv.Tags) {
   385  			continue
   386  		}
   387  		if _, ok := services[name]; ok {
   388  			continue // We are already watching the service.
   389  		}
   390  
   391  		wctx, cancel := context.WithCancel(ctx)
   392  		d.watchService(wctx, ch, name)
   393  		services[name] = cancel
   394  	}
   395  
   396  	// Check for removed services.
   397  	for name, cancel := range services {
   398  		if _, ok := discoveredServices[name]; !ok {
   399  			level.Debug(d.logger).Log(
   400  				"msg", "removing service since consul no longer has a record of it",
   401  				"name", name)
   402  			// Call the watch cancellation function.
   403  			cancel()
   404  			delete(services, name)
   405  
   406  			// Send clearing target group.
   407  			select {
   408  			case <-ctx.Done():
   409  				return
   410  			case ch <- []*targetgroup.Group{{Source: name}}:
   411  			}
   412  		}
   413  	}
   414  
   415  	// Send targetgroup with no targets if nothing was discovered.
   416  	if len(services) == 0 {
   417  		select {
   418  		case <-ctx.Done():
   419  			return
   420  		case ch <- []*targetgroup.Group{{}}:
   421  		}
   422  	}
   423  }
   424  
   425  // consulService contains data belonging to the same service.
        // One value is created per watched service name by Discovery.watchService.
   426  type consulService struct {
        	// name is the service name queried on the agent and used as the group source.
   427  	name         string
        	// tags holds the discovery's configured tag filter; in this file it is only used in log output.
   428  	tags         []string
        	// labels are the group-level labels (service name and datacenter).
   429  	labels       model.LabelSet
        	// NOTE(review): discovery is set by watchService but not read in this file.
   430  	discovery    *Discovery
   431  	client       *consul.Client
        	// tagSeparator joins and surrounds the service's tags in the tags label.
   432  	tagSeparator string
   433  	logger       log.Logger
   434  }
   435  
   436  // Start watching a service.
   437  func (d *Discovery) watchService(ctx context.Context, ch chan<- []*targetgroup.Group, name string) {
   438  	srv := &consulService{
   439  		discovery: d,
   440  		client:    d.client,
   441  		name:      name,
   442  		tags:      d.watchedTags,
   443  		labels: model.LabelSet{
   444  			serviceLabel:    model.LabelValue(name),
   445  			datacenterLabel: model.LabelValue(d.clientDatacenter),
   446  		},
   447  		tagSeparator: d.tagSeparator,
   448  		logger:       d.logger,
   449  	}
   450  
   451  	go func() {
   452  		ticker := time.NewTicker(d.refreshInterval)
   453  		defer ticker.Stop()
   454  		agent := srv.client.Agent()
   455  		for {
   456  			select {
   457  			case <-ctx.Done():
   458  				return
   459  			default:
   460  				srv.watch(ctx, ch, agent)
   461  				select {
   462  				case <-ticker.C:
   463  				case <-ctx.Done():
   464  					return
   465  				}
   466  			}
   467  		}
   468  	}()
   469  }
   470  
   471  // Get updates for a service.
   472  func (srv *consulService) watch(ctx context.Context, ch chan<- []*targetgroup.Group, agent *consul.Agent) {
   473  	level.Debug(srv.logger).Log("msg", "Watching service", "service", srv.name, "tags", strings.Join(srv.tags, ","))
   474  
   475  	t0 := time.Now()
   476  	aggregatedStatus, serviceChecks, err := agent.AgentHealthServiceByName(srv.name)
   477  	elapsed := time.Since(t0)
   478  	serviceRPCDuration.Observe(elapsed.Seconds())
   479  
   480  	// Check the context before in order to exit early.
   481  	select {
   482  	case <-ctx.Done():
   483  		return
   484  	default:
   485  		// Continue.
   486  	}
   487  
   488  	if err != nil {
   489  		level.Error(srv.logger).Log("msg", "Error refreshing service", "service", srv.name, "tags", strings.Join(srv.tags, ","), "err", err)
   490  		rpcFailuresCount.Inc()
   491  		time.Sleep(retryInterval)
   492  		return
   493  	}
   494  
   495  	self, err := agent.Self()
   496  	if err != nil {
   497  		level.Error(srv.logger).Log("msg", "failed to get agent info from agent api", "err", err)
   498  		return
   499  	}
   500  	var member = consul.AgentMember{}
   501  	memberBytes, err := json.Marshal(self["Member"])
   502  	if err != nil {
   503  		level.Error(srv.logger).Log("msg", "failed to get member information from agent", "err", err)
   504  		return
   505  	}
   506  	err = json.Unmarshal(memberBytes, &member)
   507  	if err != nil {
   508  		level.Error(srv.logger).Log("msg", "failed to unmarshal member information from agent", "err", err)
   509  		return
   510  	}
   511  
   512  	nodeName := self["Config"]["NodeName"].(string)
   513  	meta := self["Meta"]
   514  
   515  	tgroup := targetgroup.Group{
   516  		Source:  srv.name,
   517  		Labels:  srv.labels,
   518  		Targets: make([]model.LabelSet, 0, len(serviceChecks)),
   519  	}
   520  
   521  	for _, srvCheck := range serviceChecks {
   522  		// We surround the separated list with the separator as well. This way regular expressions
   523  		// in relabeling rules don't have to consider tag positions.
   524  		var tags = srv.tagSeparator + strings.Join(srvCheck.Service.Tags, srv.tagSeparator) + srv.tagSeparator
   525  
   526  		// If the service address is not empty it should be used instead of the node address
   527  		// since the service may be registered remotely through a different node.
   528  		var addr string
   529  		if srvCheck.Service.Address != "" {
   530  			addr = net.JoinHostPort(srvCheck.Service.Address, fmt.Sprintf("%d", srvCheck.Service.Port))
   531  		} else {
   532  			addr = net.JoinHostPort(member.Addr, fmt.Sprintf("%d", srvCheck.Service.Port))
   533  		}
   534  
   535  		labels := model.LabelSet{
   536  			model.AddressLabel:  model.LabelValue(addr),
   537  			addressLabel:        model.LabelValue(member.Addr),
   538  			nodeLabel:           model.LabelValue(nodeName),
   539  			tagsLabel:           model.LabelValue(tags),
   540  			serviceAddressLabel: model.LabelValue(srvCheck.Service.Address),
   541  			servicePortLabel:    model.LabelValue(strconv.Itoa(srvCheck.Service.Port)),
   542  			serviceIDLabel:      model.LabelValue(srvCheck.Service.ID),
   543  			healthLabel:         model.LabelValue(aggregatedStatus),
   544  		}
   545  
   546  		// Add all key/value pairs from the node's metadata as their own labels.
   547  		for k, v := range meta {
   548  			if str, ok := v.(string); ok {
   549  				name := strutil.SanitizeLabelName(k)
   550  				labels[metaDataLabel+model.LabelName(name)] = model.LabelValue(str)
   551  			}
   552  		}
   553  
   554  		// Add all key/value pairs from the service's metadata as their own labels.
   555  		for k, v := range srvCheck.Service.Meta {
   556  			name := strutil.SanitizeLabelName(k)
   557  			labels[serviceMetaDataLabel+model.LabelName(name)] = model.LabelValue(v)
   558  		}
   559  
   560  		// Add all key/value pairs from the service's tagged addresses as their own labels.
   561  		for k, v := range srvCheck.Service.TaggedAddresses {
   562  			name := strutil.SanitizeLabelName(k)
   563  			address := fmt.Sprintf("%s:%d", v.Address, v.Port)
   564  			labels[taggedAddressesLabel+model.LabelName(name)] = model.LabelValue(address)
   565  		}
   566  
   567  		tgroup.Targets = append(tgroup.Targets, labels)
   568  	}
   569  
   570  	select {
   571  	case <-ctx.Done():
   572  	case ch <- []*targetgroup.Group{&tgroup}:
   573  	}
   574  }