github.com/yankunsam/loki/v2@v2.6.3-0.20220817130409-389df5235c27/clients/pkg/promtail/discovery/consulagent/consul.go (about) 1 // This code was adapted from the consul service discovery 2 // package in prometheus: https://github.com/prometheus/prometheus/blob/main/discovery/consul/consul.go 3 // which is copyrighted: 2015 The Prometheus Authors 4 // and licensed under the Apache License, Version 2.0 (the "License"); 5 6 package consulagent 7 8 import ( 9 "context" 10 "encoding/json" 11 "fmt" 12 "net" 13 "net/http" 14 "strconv" 15 "strings" 16 "time" 17 18 "github.com/go-kit/log" 19 "github.com/go-kit/log/level" 20 consul "github.com/hashicorp/consul/api" 21 conntrack "github.com/mwitkow/go-conntrack" 22 "github.com/pkg/errors" 23 "github.com/prometheus/client_golang/prometheus" 24 "github.com/prometheus/common/config" 25 "github.com/prometheus/common/model" 26 27 "github.com/prometheus/prometheus/discovery" 28 "github.com/prometheus/prometheus/discovery/targetgroup" 29 "github.com/prometheus/prometheus/util/strutil" 30 ) 31 32 const ( 33 watchTimeout = 2 * time.Minute 34 retryInterval = 15 * time.Second 35 36 // addressLabel is the name for the label containing a target's address. 37 addressLabel = model.MetaLabelPrefix + "consulagent_address" 38 // nodeLabel is the name for the label containing a target's node name. 39 nodeLabel = model.MetaLabelPrefix + "consulagent_node" 40 // metaDataLabel is the prefix for the labels mapping to a target's metadata. 41 metaDataLabel = model.MetaLabelPrefix + "consulagent_metadata_" 42 // serviceMetaDataLabel is the prefix for the labels mapping to a target's service metadata. 43 serviceMetaDataLabel = model.MetaLabelPrefix + "consulagent_service_metadata_" 44 // tagsLabel is the name of the label containing the tags assigned to the target. 45 tagsLabel = model.MetaLabelPrefix + "consulagent_tags" 46 // serviceLabel is the name of the label containing the service name. 47 serviceLabel = model.MetaLabelPrefix + "consulagent_service" 48 // healthLabel is the name of the label containing the health of the service instance 49 healthLabel = model.MetaLabelPrefix + "consulagent_health" 50 // serviceAddressLabel is the name of the label containing the (optional) service address. 51 serviceAddressLabel = model.MetaLabelPrefix + "consulagent_service_address" 52 //servicePortLabel is the name of the label containing the service port. 53 servicePortLabel = model.MetaLabelPrefix + "consulagent_service_port" 54 // datacenterLabel is the name of the label containing the datacenter ID. 55 datacenterLabel = model.MetaLabelPrefix + "consulagent_dc" 56 // taggedAddressesLabel is the prefix for the labels mapping to a target's tagged addresses. 57 taggedAddressesLabel = model.MetaLabelPrefix + "consulagent_tagged_address_" 58 // serviceIDLabel is the name of the label containing the service ID. 59 serviceIDLabel = model.MetaLabelPrefix + "consulagent_service_id" 60 61 // Constants for instrumentation. 62 namespace = "prometheus" 63 ) 64 65 var ( 66 rpcFailuresCount = prometheus.NewCounter( 67 prometheus.CounterOpts{ 68 Namespace: namespace, 69 Name: "sd_consulagent_rpc_failures_total", 70 Help: "The number of Consul Agent RPC call failures.", 71 }) 72 rpcDuration = prometheus.NewSummaryVec( 73 prometheus.SummaryOpts{ 74 Namespace: namespace, 75 Name: "sd_consulagent_rpc_duration_seconds", 76 Help: "The duration of a Consul Agent RPC call in seconds.", 77 Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001}, 78 }, 79 []string{"endpoint", "call"}, 80 ) 81 82 // Initialize metric vectors. 83 servicesRPCDuration = rpcDuration.WithLabelValues("agent", "services") 84 serviceRPCDuration = rpcDuration.WithLabelValues("agent", "service") 85 86 // DefaultSDConfig is the default Consul SD configuration. 87 DefaultSDConfig = SDConfig{ 88 TagSeparator: ",", 89 Scheme: "http", 90 Server: "localhost:8500", 91 AllowStale: true, 92 RefreshInterval: model.Duration(30 * time.Second), 93 } 94 ) 95 96 func init() { 97 discovery.RegisterConfig(&SDConfig{}) 98 prometheus.MustRegister(rpcFailuresCount) 99 prometheus.MustRegister(rpcDuration) 100 } 101 102 // SDConfig is the configuration for Consul service discovery. 103 type SDConfig struct { 104 Server string `yaml:"server,omitempty"` 105 Token config.Secret `yaml:"token,omitempty"` 106 Datacenter string `yaml:"datacenter,omitempty"` 107 TagSeparator string `yaml:"tag_separator,omitempty"` 108 Scheme string `yaml:"scheme,omitempty"` 109 Username string `yaml:"username,omitempty"` 110 Password config.Secret `yaml:"password,omitempty"` 111 112 // See https://www.consul.io/docs/internals/consensus.html#consistency-modes, 113 // stale reads are a lot cheaper and are a necessity if you have >5k targets. 114 AllowStale bool `yaml:"allow_stale"` 115 // By default use blocking queries (https://www.consul.io/api/index.html#blocking-queries) 116 // but allow users to throttle updates if necessary. This can be useful because of "bugs" like 117 // https://github.com/hashicorp/consul/issues/3712 which cause an un-necessary 118 // amount of requests on consul. 119 RefreshInterval model.Duration `yaml:"refresh_interval,omitempty"` 120 121 // See https://www.consul.io/api/catalog.html#list-services 122 // The list of services for which targets are discovered. 123 // Defaults to all services if empty. 124 Services []string `yaml:"services,omitempty"` 125 // A list of tags used to filter instances inside a service. Services must contain all tags in the list. 126 ServiceTags []string `yaml:"tags,omitempty"` 127 // Desired node metadata. 128 NodeMeta map[string]string `yaml:"node_meta,omitempty"` 129 130 TLSConfig config.TLSConfig `yaml:"tls_config,omitempty"` 131 } 132 133 // Name returns the name of the Config. 134 func (*SDConfig) Name() string { return "consulagent" } 135 136 // NewDiscoverer returns a Discoverer for the Config. 137 func (c *SDConfig) NewDiscoverer(opts discovery.DiscovererOptions) (discovery.Discoverer, error) { 138 return NewDiscovery(c, opts.Logger) 139 } 140 141 // SetDirectory joins any relative file paths with dir. 142 func (c *SDConfig) SetDirectory(dir string) { 143 c.TLSConfig.SetDirectory(dir) 144 } 145 146 // UnmarshalYAML implements the yaml.Unmarshaler interface. 147 func (c *SDConfig) UnmarshalYAML(unmarshal func(interface{}) error) error { 148 *c = DefaultSDConfig 149 type plain SDConfig 150 err := unmarshal((*plain)(c)) 151 if err != nil { 152 return err 153 } 154 if strings.TrimSpace(c.Server) == "" { 155 return errors.New("consulagent SD configuration requires a server address") 156 } 157 return nil 158 } 159 160 // Discovery retrieves target information from a Consul server 161 // and updates them via watches. 162 type Discovery struct { 163 client *consul.Client 164 clientDatacenter string 165 tagSeparator string 166 watchedServices []string // Set of services which will be discovered. 167 watchedTags []string // Tags used to filter instances of a service. 168 watchedNodeMeta map[string]string 169 allowStale bool 170 refreshInterval time.Duration 171 finalizer func() 172 logger log.Logger 173 } 174 175 // NewDiscovery returns a new Discovery for the given config. 176 func NewDiscovery(conf *SDConfig, logger log.Logger) (*Discovery, error) { 177 if logger == nil { 178 logger = log.NewNopLogger() 179 } 180 181 tls, err := config.NewTLSConfig(&conf.TLSConfig) 182 if err != nil { 183 return nil, err 184 } 185 transport := &http.Transport{ 186 IdleConnTimeout: 2 * watchTimeout, 187 TLSClientConfig: tls, 188 DialContext: conntrack.NewDialContextFunc( 189 conntrack.DialWithTracing(), 190 conntrack.DialWithName("consulagent_sd"), 191 ), 192 } 193 wrapper := &http.Client{ 194 Transport: transport, 195 Timeout: watchTimeout + 15*time.Second, 196 } 197 198 clientConf := &consul.Config{ 199 Address: conf.Server, 200 Scheme: conf.Scheme, 201 Datacenter: conf.Datacenter, 202 Token: string(conf.Token), 203 HttpAuth: &consul.HttpBasicAuth{ 204 Username: conf.Username, 205 Password: string(conf.Password), 206 }, 207 HttpClient: wrapper, 208 } 209 client, err := consul.NewClient(clientConf) 210 if err != nil { 211 return nil, err 212 } 213 cd := &Discovery{ 214 client: client, 215 tagSeparator: conf.TagSeparator, 216 watchedServices: conf.Services, 217 watchedTags: conf.ServiceTags, 218 watchedNodeMeta: conf.NodeMeta, 219 allowStale: conf.AllowStale, 220 refreshInterval: time.Duration(conf.RefreshInterval), 221 clientDatacenter: conf.Datacenter, 222 finalizer: transport.CloseIdleConnections, 223 logger: logger, 224 } 225 return cd, nil 226 } 227 228 // shouldWatch returns whether the service of the given name should be watched. 229 func (d *Discovery) shouldWatch(name string, tags []string) bool { 230 return d.shouldWatchFromName(name) && d.shouldWatchFromTags(tags) 231 } 232 233 // shouldWatch returns whether the service of the given name should be watched based on its name. 234 func (d *Discovery) shouldWatchFromName(name string) bool { 235 // If there's no fixed set of watched services, we watch everything. 236 if len(d.watchedServices) == 0 { 237 return true 238 } 239 240 for _, sn := range d.watchedServices { 241 if sn == name { 242 return true 243 } 244 } 245 return false 246 } 247 248 // shouldWatch returns whether the service of the given name should be watched based on its tags. 249 // This gets called when the user doesn't specify a list of services in order to avoid watching 250 // *all* services. Details in https://github.com/prometheus/prometheus/pull/3814 251 func (d *Discovery) shouldWatchFromTags(tags []string) bool { 252 // If there's no fixed set of watched tags, we watch everything. 253 if len(d.watchedTags) == 0 { 254 return true 255 } 256 257 tagOuter: 258 for _, wtag := range d.watchedTags { 259 for _, tag := range tags { 260 if wtag == tag { 261 continue tagOuter 262 } 263 } 264 return false 265 } 266 return true 267 } 268 269 // Get the local datacenter if not specified. 270 func (d *Discovery) getDatacenter() error { 271 // If the datacenter was not set from clientConf, let's get it from the local Consul agent 272 // (Consul default is to use local node's datacenter if one isn't given for a query). 273 if d.clientDatacenter != "" { 274 return nil 275 } 276 info, err := d.client.Agent().Self() 277 if err != nil { 278 level.Error(d.logger).Log("msg", "Error retrieving datacenter name", "err", err) 279 rpcFailuresCount.Inc() 280 return err 281 } 282 283 dc, ok := info["Config"]["Datacenter"].(string) 284 if !ok { 285 err := errors.Errorf("invalid value '%v' for Config.Datacenter", info["Config"]["Datacenter"]) 286 level.Error(d.logger).Log("msg", "Error retrieving datacenter name", "err", err) 287 return err 288 } 289 290 d.clientDatacenter = dc 291 return nil 292 } 293 294 // Initialize the Discoverer run. 295 func (d *Discovery) initialize(ctx context.Context) { 296 // Loop until we manage to get the local datacenter. 297 for { 298 // We have to check the context at least once. The checks during channel sends 299 // do not guarantee that. 300 select { 301 case <-ctx.Done(): 302 return 303 default: 304 } 305 306 // Get the local datacenter first, if necessary. 307 err := d.getDatacenter() 308 if err != nil { 309 time.Sleep(retryInterval) 310 continue 311 } 312 // We are good to go. 313 return 314 } 315 } 316 317 // Run implements the Discoverer interface. 318 func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) { 319 if d.finalizer != nil { 320 defer d.finalizer() 321 } 322 d.initialize(ctx) 323 324 if len(d.watchedServices) == 0 || len(d.watchedTags) != 0 { 325 // We need to watch the agent. 326 ticker := time.NewTicker(d.refreshInterval) 327 328 // Watched services and their cancellation functions. 329 services := make(map[string]func()) 330 331 for { 332 select { 333 case <-ctx.Done(): 334 ticker.Stop() 335 return 336 default: 337 d.watchServices(ctx, ch, services) 338 <-ticker.C 339 } 340 } 341 } else { 342 // We only have fully defined services. 343 for _, name := range d.watchedServices { 344 d.watchService(ctx, ch, name) 345 } 346 <-ctx.Done() 347 } 348 } 349 350 // Watch the catalog for new services we would like to watch. This is called only 351 // when we don't know yet the names of the services and need to ask Consul the 352 // entire list of services. 353 func (d *Discovery) watchServices(ctx context.Context, ch chan<- []*targetgroup.Group, services map[string]func()) { 354 agent := d.client.Agent() 355 level.Debug(d.logger).Log("msg", "Watching services", "tags", strings.Join(d.watchedTags, ",")) 356 357 t0 := time.Now() 358 srvs, err := agent.Services() 359 elapsed := time.Since(t0) 360 servicesRPCDuration.Observe(elapsed.Seconds()) 361 362 // Check the context before in order to exit early. 363 select { 364 case <-ctx.Done(): 365 return 366 default: 367 } 368 369 if err != nil { 370 level.Error(d.logger).Log("msg", "Error refreshing service list", "err", err) 371 rpcFailuresCount.Inc() 372 time.Sleep(retryInterval) 373 return 374 } 375 376 discoveredServices := make(map[string]*consul.AgentService) 377 for _, srv := range srvs { 378 name := srv.Service 379 discoveredServices[name] = srv 380 381 // use service name and tags to only watch 382 // the services that have the tag we are looking for (if specified). 383 // When no tags have been specified this will return true. 384 if !d.shouldWatch(name, srv.Tags) { 385 continue 386 } 387 if _, ok := services[name]; ok { 388 continue // We are already watching the service. 389 } 390 391 wctx, cancel := context.WithCancel(ctx) 392 d.watchService(wctx, ch, name) 393 services[name] = cancel 394 } 395 396 // Check for removed services. 397 for name, cancel := range services { 398 if _, ok := discoveredServices[name]; !ok { 399 level.Debug(d.logger).Log( 400 "msg", "removing service since consul no longer has a record of it", 401 "name", name) 402 // Call the watch cancellation function. 403 cancel() 404 delete(services, name) 405 406 // Send clearing target group. 407 select { 408 case <-ctx.Done(): 409 return 410 case ch <- []*targetgroup.Group{{Source: name}}: 411 } 412 } 413 } 414 415 // Send targetgroup with no targets if nothing was discovered. 416 if len(services) == 0 { 417 select { 418 case <-ctx.Done(): 419 return 420 case ch <- []*targetgroup.Group{{}}: 421 } 422 } 423 } 424 425 // consulService contains data belonging to the same service. 426 type consulService struct { 427 name string 428 tags []string 429 labels model.LabelSet 430 discovery *Discovery 431 client *consul.Client 432 tagSeparator string 433 logger log.Logger 434 } 435 436 // Start watching a service. 437 func (d *Discovery) watchService(ctx context.Context, ch chan<- []*targetgroup.Group, name string) { 438 srv := &consulService{ 439 discovery: d, 440 client: d.client, 441 name: name, 442 tags: d.watchedTags, 443 labels: model.LabelSet{ 444 serviceLabel: model.LabelValue(name), 445 datacenterLabel: model.LabelValue(d.clientDatacenter), 446 }, 447 tagSeparator: d.tagSeparator, 448 logger: d.logger, 449 } 450 451 go func() { 452 ticker := time.NewTicker(d.refreshInterval) 453 defer ticker.Stop() 454 agent := srv.client.Agent() 455 for { 456 select { 457 case <-ctx.Done(): 458 return 459 default: 460 srv.watch(ctx, ch, agent) 461 select { 462 case <-ticker.C: 463 case <-ctx.Done(): 464 return 465 } 466 } 467 } 468 }() 469 } 470 471 // Get updates for a service. 472 func (srv *consulService) watch(ctx context.Context, ch chan<- []*targetgroup.Group, agent *consul.Agent) { 473 level.Debug(srv.logger).Log("msg", "Watching service", "service", srv.name, "tags", strings.Join(srv.tags, ",")) 474 475 t0 := time.Now() 476 aggregatedStatus, serviceChecks, err := agent.AgentHealthServiceByName(srv.name) 477 elapsed := time.Since(t0) 478 serviceRPCDuration.Observe(elapsed.Seconds()) 479 480 // Check the context before in order to exit early. 481 select { 482 case <-ctx.Done(): 483 return 484 default: 485 // Continue. 486 } 487 488 if err != nil { 489 level.Error(srv.logger).Log("msg", "Error refreshing service", "service", srv.name, "tags", strings.Join(srv.tags, ","), "err", err) 490 rpcFailuresCount.Inc() 491 time.Sleep(retryInterval) 492 return 493 } 494 495 self, err := agent.Self() 496 if err != nil { 497 level.Error(srv.logger).Log("msg", "failed to get agent info from agent api", "err", err) 498 return 499 } 500 var member = consul.AgentMember{} 501 memberBytes, err := json.Marshal(self["Member"]) 502 if err != nil { 503 level.Error(srv.logger).Log("msg", "failed to get member information from agent", "err", err) 504 return 505 } 506 err = json.Unmarshal(memberBytes, &member) 507 if err != nil { 508 level.Error(srv.logger).Log("msg", "failed to unmarshal member information from agent", "err", err) 509 return 510 } 511 512 nodeName := self["Config"]["NodeName"].(string) 513 meta := self["Meta"] 514 515 tgroup := targetgroup.Group{ 516 Source: srv.name, 517 Labels: srv.labels, 518 Targets: make([]model.LabelSet, 0, len(serviceChecks)), 519 } 520 521 for _, srvCheck := range serviceChecks { 522 // We surround the separated list with the separator as well. This way regular expressions 523 // in relabeling rules don't have to consider tag positions. 524 var tags = srv.tagSeparator + strings.Join(srvCheck.Service.Tags, srv.tagSeparator) + srv.tagSeparator 525 526 // If the service address is not empty it should be used instead of the node address 527 // since the service may be registered remotely through a different node. 528 var addr string 529 if srvCheck.Service.Address != "" { 530 addr = net.JoinHostPort(srvCheck.Service.Address, fmt.Sprintf("%d", srvCheck.Service.Port)) 531 } else { 532 addr = net.JoinHostPort(member.Addr, fmt.Sprintf("%d", srvCheck.Service.Port)) 533 } 534 535 labels := model.LabelSet{ 536 model.AddressLabel: model.LabelValue(addr), 537 addressLabel: model.LabelValue(member.Addr), 538 nodeLabel: model.LabelValue(nodeName), 539 tagsLabel: model.LabelValue(tags), 540 serviceAddressLabel: model.LabelValue(srvCheck.Service.Address), 541 servicePortLabel: model.LabelValue(strconv.Itoa(srvCheck.Service.Port)), 542 serviceIDLabel: model.LabelValue(srvCheck.Service.ID), 543 healthLabel: model.LabelValue(aggregatedStatus), 544 } 545 546 // Add all key/value pairs from the node's metadata as their own labels. 547 for k, v := range meta { 548 if str, ok := v.(string); ok { 549 name := strutil.SanitizeLabelName(k) 550 labels[metaDataLabel+model.LabelName(name)] = model.LabelValue(str) 551 } 552 } 553 554 // Add all key/value pairs from the service's metadata as their own labels. 555 for k, v := range srvCheck.Service.Meta { 556 name := strutil.SanitizeLabelName(k) 557 labels[serviceMetaDataLabel+model.LabelName(name)] = model.LabelValue(v) 558 } 559 560 // Add all key/value pairs from the service's tagged addresses as their own labels. 561 for k, v := range srvCheck.Service.TaggedAddresses { 562 name := strutil.SanitizeLabelName(k) 563 address := fmt.Sprintf("%s:%d", v.Address, v.Port) 564 labels[taggedAddressesLabel+model.LabelName(name)] = model.LabelValue(address) 565 } 566 567 tgroup.Targets = append(tgroup.Targets, labels) 568 } 569 570 select { 571 case <-ctx.Done(): 572 case ch <- []*targetgroup.Group{&tgroup}: 573 } 574 }