github.com/pyroscope-io/pyroscope@v0.37.3-0.20230725203016-5f6947968bd0/pkg/scrape/discovery/consul/consul.go

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package consul // revive:disable-line:import-shadowing package name is not referenced

import (
	"context"
	"errors"
	"fmt"
	"net"
	"strconv"
	"strings"
	"time"

	consul "github.com/hashicorp/consul/api"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/pyroscope-io/pyroscope/pkg/scrape/config"
	"github.com/pyroscope-io/pyroscope/pkg/scrape/discovery"
	"github.com/pyroscope-io/pyroscope/pkg/scrape/discovery/targetgroup"
	"github.com/pyroscope-io/pyroscope/pkg/scrape/model"
	"github.com/pyroscope-io/pyroscope/pkg/util/strutil"
	"github.com/sirupsen/logrus"
)

const (
	watchTimeout  = 2 * time.Minute
	retryInterval = 15 * time.Second

	// addressLabel is the name for the label containing a target's address.
	addressLabel = model.MetaLabelPrefix + "consul_address"
	// nodeLabel is the name for the label containing a target's node name.
	nodeLabel = model.MetaLabelPrefix + "consul_node"
	// metaDataLabel is the prefix for the labels mapping to a target's metadata.
	metaDataLabel = model.MetaLabelPrefix + "consul_metadata_"
	// serviceMetaDataLabel is the prefix for the labels mapping to a target's service metadata.
	serviceMetaDataLabel = model.MetaLabelPrefix + "consul_service_metadata_"
	// tagsLabel is the name of the label containing the tags assigned to the target.
	tagsLabel = model.MetaLabelPrefix + "consul_tags"
	// serviceLabel is the name of the label containing the service name.
	serviceLabel = model.MetaLabelPrefix + "consul_service"
	// healthLabel is the name of the label containing the health of the service instance.
	healthLabel = model.MetaLabelPrefix + "consul_health"
	// serviceAddressLabel is the name of the label containing the (optional) service address.
	serviceAddressLabel = model.MetaLabelPrefix + "consul_service_address"
	// servicePortLabel is the name of the label containing the service port.
	servicePortLabel = model.MetaLabelPrefix + "consul_service_port"
	// datacenterLabel is the name of the label containing the datacenter ID.
	datacenterLabel = model.MetaLabelPrefix + "consul_dc"
	// namespaceLabel is the name of the label containing the namespace (Consul Enterprise only).
	namespaceLabel = model.MetaLabelPrefix + "consul_namespace"
	// partitionLabel is the name of the label containing the Admin Partition (Consul Enterprise only).
	partitionLabel = model.MetaLabelPrefix + "consul_partition"
	// taggedAddressesLabel is the prefix for the labels mapping to a target's tagged addresses.
	taggedAddressesLabel = model.MetaLabelPrefix + "consul_tagged_address_"
	// serviceIDLabel is the name of the label containing the service ID.
	serviceIDLabel = model.MetaLabelPrefix + "consul_service_id"

	// Constants for instrumentation.
	namespace = "pyroscope"
)

var (
	rpcFailuresCount = prometheus.NewCounter(
		prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "sd_consul_rpc_failures_total",
			Help:      "The number of Consul RPC call failures.",
		})
	rpcDuration = prometheus.NewSummaryVec(
		prometheus.SummaryOpts{
			Namespace:  namespace,
			Name:       "sd_consul_rpc_duration_seconds",
			Help:       "The duration of a Consul RPC call in seconds.",
			Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
		},
		[]string{"endpoint", "call"},
	)

	// Initialize metric vectors.
	servicesRPCDuration = rpcDuration.WithLabelValues("catalog", "services")
	serviceRPCDuration  = rpcDuration.WithLabelValues("catalog", "service")

	// DefaultSDConfig is the default Consul SD configuration.
	DefaultSDConfig = SDConfig{
		TagSeparator:     ",",
		Scheme:           "http",
		Server:           "localhost:8500",
		AllowStale:       true,
		RefreshInterval:  model.Duration(30 * time.Second),
		HTTPClientConfig: config.DefaultHTTPClientConfig,
	}
)

func init() {
	discovery.RegisterConfig(&SDConfig{})
	prometheus.MustRegister(rpcFailuresCount, rpcDuration)
}

// SDConfig is the configuration for Consul service discovery.
type SDConfig struct {
	Server       string        `yaml:"server,omitempty"`
	Token        config.Secret `yaml:"token,omitempty"`
	Datacenter   string        `yaml:"datacenter,omitempty"`
	Namespace    string        `yaml:"namespace,omitempty"`
	Partition    string        `yaml:"partition,omitempty"`
	TagSeparator string        `yaml:"tag-separator,omitempty"`
	Scheme       string        `yaml:"scheme,omitempty"`
	Username     string        `yaml:"username,omitempty"`
	Password     config.Secret `yaml:"password,omitempty"`

	// See https://www.consul.io/docs/internals/consensus.html#consistency-modes,
	// stale reads are a lot cheaper and are a necessity if you have >5k targets.
	AllowStale bool `yaml:"allow-stale"`
	// By default use blocking queries (https://www.consul.io/api/index.html#blocking-queries)
	// but allow users to throttle updates if necessary. This can be useful because of "bugs" like
	// https://github.com/hashicorp/consul/issues/3712 which cause an unnecessary
	// amount of requests on consul.
	RefreshInterval model.Duration `yaml:"refresh-interval,omitempty"`

	// See https://www.consul.io/api/catalog.html#list-services
	// The list of services for which targets are discovered.
	// Defaults to all services if empty.
	Services []string `yaml:"services,omitempty"`
	// A list of tags used to filter instances inside a service. Services must contain all tags in the list.
	ServiceTags []string `yaml:"tags,omitempty"`
	// Desired node metadata.
	NodeMeta map[string]string `yaml:"node-meta,omitempty"`

	HTTPClientConfig config.HTTPClientConfig `yaml:",inline"`
}

// Name returns the name of the Config.
func (*SDConfig) Name() string { return "consul" }

// NewDiscoverer returns a Discoverer for the Config.
func (c *SDConfig) NewDiscoverer(opts discovery.DiscovererOptions) (discovery.Discoverer, error) {
	return NewDiscovery(c, opts.Logger)
}

// SetDirectory joins any relative file paths with dir.
func (c *SDConfig) SetDirectory(dir string) {
	c.HTTPClientConfig.SetDirectory(dir)
}
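
// The YAML keys above come from the struct tags on SDConfig. As an
// illustrative sketch only (the enclosing scrape-config key is not shown
// here and all values are made-up assumptions, not taken from this file),
// a consul SD block might look like:
//
//	server: 'localhost:8500'
//	datacenter: 'dc1'
//	services: ['app', 'api']
//	tags: ['canary']
//	refresh-interval: 30s
//	allow-stale: true
//	node-meta:
//	  rack: 'r1'
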

// UnmarshalYAML implements the yaml.Unmarshaler interface.
func (c *SDConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
	*c = DefaultSDConfig
	type plain SDConfig
	err := unmarshal((*plain)(c))
	if err != nil {
		return err
	}
	if strings.TrimSpace(c.Server) == "" {
		return errors.New("consul SD configuration requires a server address")
	}
	if c.Username != "" || c.Password != "" {
		if c.HTTPClientConfig.BasicAuth != nil {
			return errors.New("at most one of consul SD configuration username and password and basic auth can be configured")
		}
		c.HTTPClientConfig.BasicAuth = &config.BasicAuth{
			Username: c.Username,
			Password: c.Password,
		}
	}
	if c.Token != "" && (c.HTTPClientConfig.Authorization != nil || c.HTTPClientConfig.OAuth2 != nil) {
		return errors.New("at most one of consul SD token, authorization, or oauth2 can be configured")
	}
	return c.HTTPClientConfig.Validate()
}

// Discovery retrieves target information from a Consul server
// and updates them via watches.
type Discovery struct {
	client           *consul.Client
	clientDatacenter string
	clientNamespace  string
	clientPartition  string
	tagSeparator     string
	watchedServices  []string // Set of services which will be discovered.
	watchedTags      []string // Tags used to filter instances of a service.
	watchedNodeMeta  map[string]string
	allowStale       bool
	refreshInterval  time.Duration
	finalizer        func()
	logger           logrus.FieldLogger
}

// NewDiscovery returns a new Discovery for the given config.
func NewDiscovery(conf *SDConfig, logger logrus.FieldLogger) (*Discovery, error) {
	wrapper, err := config.NewClientFromConfig(conf.HTTPClientConfig, "consul_sd", config.WithIdleConnTimeout(2*watchTimeout))
	if err != nil {
		return nil, err
	}
	wrapper.Timeout = watchTimeout + 15*time.Second

	clientConf := &consul.Config{
		Address:    conf.Server,
		Scheme:     conf.Scheme,
		Datacenter: conf.Datacenter,
		Namespace:  conf.Namespace,
		Partition:  conf.Partition,
		Token:      string(conf.Token),
		HttpClient: wrapper,
	}
	client, err := consul.NewClient(clientConf)
	if err != nil {
		return nil, err
	}
	cd := &Discovery{
		client:           client,
		tagSeparator:     conf.TagSeparator,
		watchedServices:  conf.Services,
		watchedTags:      conf.ServiceTags,
		watchedNodeMeta:  conf.NodeMeta,
		allowStale:       conf.AllowStale,
		refreshInterval:  time.Duration(conf.RefreshInterval),
		clientDatacenter: conf.Datacenter,
		clientNamespace:  conf.Namespace,
		clientPartition:  conf.Partition,
		finalizer:        wrapper.CloseIdleConnections,
		logger:           logger,
	}
	return cd, nil
}
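
// A minimal usage sketch (hypothetical caller, not part of this package;
// error handling elided). Run blocks until ctx is cancelled and sends
// target-group updates on the channel:
//
//	ctx := context.Background()
//	conf := DefaultSDConfig
//	conf.Services = []string{"app"}
//	d, _ := NewDiscovery(&conf, logrus.StandardLogger())
//	ch := make(chan []*targetgroup.Group)
//	go d.Run(ctx, ch)
//	for tgs := range ch {
//		// consume discovered target groups
//	}
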

// shouldWatch returns whether the service of the given name should be watched.
func (d *Discovery) shouldWatch(name string, tags []string) bool {
	return d.shouldWatchFromName(name) && d.shouldWatchFromTags(tags)
}

// shouldWatchFromName returns whether the service of the given name should be watched based on its name.
func (d *Discovery) shouldWatchFromName(name string) bool {
	// If there's no fixed set of watched services, we watch everything.
	if len(d.watchedServices) == 0 {
		return true
	}

	for _, sn := range d.watchedServices {
		if sn == name {
			return true
		}
	}
	return false
}

// shouldWatchFromTags returns whether the service of the given name should be watched based on its tags.
// This gets called when the user doesn't specify a list of services in order to avoid watching
// *all* services. Details in https://github.com/prometheus/prometheus/pull/3814
func (d *Discovery) shouldWatchFromTags(tags []string) bool {
	// If there's no fixed set of watched tags, we watch everything.
	if len(d.watchedTags) == 0 {
		return true
	}

tagOuter:
	for _, wtag := range d.watchedTags {
		for _, tag := range tags {
			if wtag == tag {
				continue tagOuter
			}
		}
		return false
	}
	return true
}

// getDatacenter gets the local datacenter if one was not explicitly configured.
func (d *Discovery) getDatacenter() error {
	// If the datacenter was not set from clientConf, let's get it from the local Consul agent
	// (Consul default is to use local node's datacenter if one isn't given for a query).
	if d.clientDatacenter != "" {
		return nil
	}

	info, err := d.client.Agent().Self()
	if err != nil {
		d.logger.WithError(err).Error("error retrieving datacenter name")
		rpcFailuresCount.Inc()
		return err
	}

	dc, ok := info["Config"]["Datacenter"].(string)
	if !ok {
		err := fmt.Errorf("invalid value '%v' for Config.Datacenter", info["Config"]["Datacenter"])
		d.logger.WithError(err).Error("error retrieving datacenter name")
		return err
	}

	d.clientDatacenter = dc
	d.logger = logrus.WithField("datacenter", dc)
	return nil
}

// initialize sets up the Discoverer run.
func (d *Discovery) initialize(ctx context.Context) {
	// Loop until we manage to get the local datacenter.
	for {
		// We have to check the context at least once. The checks during channel sends
		// do not guarantee that.
		select {
		case <-ctx.Done():
			return
		default:
		}

		// Get the local datacenter first, if necessary.
		err := d.getDatacenter()
		if err != nil {
			time.Sleep(retryInterval)
			continue
		}
		// We are good to go.
		return
	}
}

// Run implements the Discoverer interface.
func (d *Discovery) Run(ctx context.Context, ch chan<- []*targetgroup.Group) {
	if d.finalizer != nil {
		defer d.finalizer()
	}
	d.initialize(ctx)

	if len(d.watchedServices) == 0 || len(d.watchedTags) != 0 {
		// We need to watch the catalog.
		ticker := time.NewTicker(d.refreshInterval)

		// Watched services and their cancellation functions.
		services := make(map[string]func())
		var lastIndex uint64

		for {
			select {
			case <-ctx.Done():
				ticker.Stop()
				return
			default:
				d.watchServices(ctx, ch, &lastIndex, services)
				<-ticker.C
			}
		}
	} else {
		// We only have fully defined services.
		for _, name := range d.watchedServices {
			d.watchService(ctx, ch, name)
		}
		<-ctx.Done()
	}
}
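
// Illustrative only (config values are made up, summarizing the branch in Run
// rather than adding behavior): with `services: ['app']` and no `tags`, Run
// takes the second branch and starts a single long-lived watch for "app"
// without ever listing the catalog; with `tags: ['canary']` and no
// `services`, it takes the first branch, polling the catalog every
// refresh-interval and watching every service that carries the "canary" tag.
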

// Watch the catalog for new services we would like to watch. This is called only
// when we don't know yet the names of the services and need to ask Consul the
// entire list of services.
func (d *Discovery) watchServices(ctx context.Context, ch chan<- []*targetgroup.Group, lastIndex *uint64, services map[string]func()) {
	catalog := d.client.Catalog()
	d.logger.WithField("tags", strings.Join(d.watchedTags, ",")).Debug("watching services")

	opts := &consul.QueryOptions{
		WaitIndex:  *lastIndex,
		WaitTime:   watchTimeout,
		AllowStale: d.allowStale,
		NodeMeta:   d.watchedNodeMeta,
	}
	t0 := time.Now()
	srvs, meta, err := catalog.Services(opts.WithContext(ctx))
	elapsed := time.Since(t0)
	servicesRPCDuration.Observe(elapsed.Seconds())

	// Check the context before handling the error in order to exit early.
	select {
	case <-ctx.Done():
		return
	default:
	}

	if err != nil {
		d.logger.WithError(err).Error("error refreshing service list")
		rpcFailuresCount.Inc()
		time.Sleep(retryInterval)
		return
	}
	// If the index equals the previous one, the watch timed out with no update.
	if meta.LastIndex == *lastIndex {
		return
	}
	*lastIndex = meta.LastIndex

	// Check for new services.
	for name := range srvs {
		// catalog.Services() returns a map of service name to tags; we can use that to watch
		// only the services that have the tag we are looking for (if specified).
		// In the future Consul will also support server-side filtering on service metadata.
		// https://github.com/hashicorp/consul/issues/1107
		if !d.shouldWatch(name, srvs[name]) {
			continue
		}
		if _, ok := services[name]; ok {
			continue // We are already watching the service.
		}

		wctx, cancel := context.WithCancel(ctx)
		d.watchService(wctx, ch, name)
		services[name] = cancel
	}

	// Check for removed services.
	for name, cancel := range services {
		if _, ok := srvs[name]; !ok {
			// Call the watch cancellation function.
			cancel()
			delete(services, name)

			// Send clearing target group.
			select {
			case <-ctx.Done():
				return
			case ch <- []*targetgroup.Group{{Source: name}}:
			}
		}
	}

	// Send targetgroup with no targets if nothing was discovered.
	if len(services) == 0 {
		select {
		case <-ctx.Done():
			return
		case ch <- []*targetgroup.Group{{}}:
		}
	}
}

// consulService contains data belonging to the same service.
type consulService struct {
	name         string
	tags         []string
	labels       model.LabelSet
	discovery    *Discovery
	client       *consul.Client
	tagSeparator string
	logger       logrus.FieldLogger
}

// Start watching a service.
func (d *Discovery) watchService(ctx context.Context, ch chan<- []*targetgroup.Group, name string) {
	srv := &consulService{
		discovery: d,
		client:    d.client,
		name:      name,
		tags:      d.watchedTags,
		labels: model.LabelSet{
			serviceLabel:    model.LabelValue(name),
			datacenterLabel: model.LabelValue(d.clientDatacenter),
		},
		tagSeparator: d.tagSeparator,
		logger:       d.logger,
	}

	go func() {
		ticker := time.NewTicker(d.refreshInterval)
		defer ticker.Stop()
		var lastIndex uint64
		health := srv.client.Health()
		for {
			select {
			case <-ctx.Done():
				return
			default:
				srv.watch(ctx, ch, health, &lastIndex)
				select {
				case <-ticker.C:
				case <-ctx.Done():
					return
				}
			}
		}
	}()
}
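
// For illustration only (values are made up, and assuming model.MetaLabelPrefix
// is "__meta_" and model.AddressLabel is "__address__" as in Prometheus):
// a healthy service "app" registered on node "node-1" (10.0.0.1) with service
// address 10.0.0.2, port 8080 and tag "canary" would be exposed with labels
// roughly like the following, some set on the target and some on its group:
//
//	__address__                   = "10.0.0.2:8080"
//	__meta_consul_address         = "10.0.0.1"
//	__meta_consul_node            = "node-1"
//	__meta_consul_service         = "app"
//	__meta_consul_service_address = "10.0.0.2"
//	__meta_consul_service_port    = "8080"
//	__meta_consul_tags            = ",canary,"
//	__meta_consul_health          = "passing"
//	__meta_consul_dc              = "dc1"
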

// Get updates for a service.
func (srv *consulService) watch(ctx context.Context, ch chan<- []*targetgroup.Group, health *consul.Health, lastIndex *uint64) {
	srv.logger.WithField("service", srv.name).WithField("tags", strings.Join(srv.tags, ",")).Debug("watching service")

	opts := &consul.QueryOptions{
		WaitIndex:  *lastIndex,
		WaitTime:   watchTimeout,
		AllowStale: srv.discovery.allowStale,
		NodeMeta:   srv.discovery.watchedNodeMeta,
	}

	t0 := time.Now()
	serviceNodes, meta, err := health.ServiceMultipleTags(srv.name, srv.tags, false, opts.WithContext(ctx))
	elapsed := time.Since(t0)
	serviceRPCDuration.Observe(elapsed.Seconds())

	// Check the context before handling the error in order to exit early.
	select {
	case <-ctx.Done():
		return
	default:
		// Continue.
	}

	if err != nil {
		srv.logger.WithError(err).WithField("service", srv.name).WithField("tags", strings.Join(srv.tags, ",")).Error("error refreshing service")
		rpcFailuresCount.Inc()
		time.Sleep(retryInterval)
		return
	}
	// If the index equals the previous one, the watch timed out with no update.
	if meta.LastIndex == *lastIndex {
		return
	}
	*lastIndex = meta.LastIndex

	tgroup := targetgroup.Group{
		Source:  srv.name,
		Labels:  srv.labels,
		Targets: make([]model.LabelSet, 0, len(serviceNodes)),
	}

	for _, serviceNode := range serviceNodes {
		// We surround the separated list with the separator as well. This way regular expressions
		// in relabeling rules don't have to consider tag positions.
		tags := srv.tagSeparator + strings.Join(serviceNode.Service.Tags, srv.tagSeparator) + srv.tagSeparator

		// If the service address is not empty it should be used instead of the node address
		// since the service may be registered remotely through a different node.
		var addr string
		if serviceNode.Service.Address != "" {
			addr = net.JoinHostPort(serviceNode.Service.Address, fmt.Sprintf("%d", serviceNode.Service.Port))
		} else {
			addr = net.JoinHostPort(serviceNode.Node.Address, fmt.Sprintf("%d", serviceNode.Service.Port))
		}

		labels := model.LabelSet{
			model.AddressLabel:  model.LabelValue(addr),
			addressLabel:        model.LabelValue(serviceNode.Node.Address),
			nodeLabel:           model.LabelValue(serviceNode.Node.Node),
			namespaceLabel:      model.LabelValue(serviceNode.Service.Namespace),
			partitionLabel:      model.LabelValue(serviceNode.Service.Partition),
			tagsLabel:           model.LabelValue(tags),
			serviceAddressLabel: model.LabelValue(serviceNode.Service.Address),
			servicePortLabel:    model.LabelValue(strconv.Itoa(serviceNode.Service.Port)),
			serviceIDLabel:      model.LabelValue(serviceNode.Service.ID),
			healthLabel:         model.LabelValue(serviceNode.Checks.AggregatedStatus()),
		}

		// Add all key/value pairs from the node's metadata as their own labels.
		for k, v := range serviceNode.Node.Meta {
			name := strutil.SanitizeLabelName(k)
			labels[metaDataLabel+model.LabelName(name)] = model.LabelValue(v)
		}

		// Add all key/value pairs from the service's metadata as their own labels.
		for k, v := range serviceNode.Service.Meta {
			name := strutil.SanitizeLabelName(k)
			labels[serviceMetaDataLabel+model.LabelName(name)] = model.LabelValue(v)
		}

		// Add all key/value pairs from the node's tagged addresses as their own labels.
		for k, v := range serviceNode.Node.TaggedAddresses {
			name := strutil.SanitizeLabelName(k)
			labels[taggedAddressesLabel+model.LabelName(name)] = model.LabelValue(v)
		}

		tgroup.Targets = append(tgroup.Targets, labels)
	}

	select {
	case <-ctx.Done():
	case ch <- []*targetgroup.Group{&tgroup}:
	}
}