github.com/hernad/nomad@v1.6.112/command/agent/consul/service_client.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package consul 5 6 import ( 7 "context" 8 "fmt" 9 "net" 10 "net/url" 11 "reflect" 12 "regexp" 13 "strconv" 14 "strings" 15 "sync" 16 "sync/atomic" 17 "time" 18 19 "github.com/armon/go-metrics" 20 "github.com/hashicorp/consul/api" 21 "github.com/hashicorp/go-hclog" 22 "github.com/hashicorp/go-set" 23 "github.com/hernad/nomad/client/serviceregistration" 24 "github.com/hernad/nomad/helper" 25 "github.com/hernad/nomad/helper/envoy" 26 "github.com/hernad/nomad/nomad/structs" 27 "golang.org/x/exp/maps" 28 "golang.org/x/exp/slices" 29 ) 30 31 const ( 32 // nomadServicePrefix is the prefix that scopes all Nomad registered 33 // services (both agent and task entries). 34 nomadServicePrefix = "_nomad" 35 36 // nomadServerPrefix is the prefix that scopes Nomad registered Servers. 37 nomadServerPrefix = nomadServicePrefix + "-server-" 38 39 // nomadClientPrefix is the prefix that scopes Nomad registered Clients. 40 nomadClientPrefix = nomadServicePrefix + "-client-" 41 42 // nomadTaskPrefix is the prefix that scopes Nomad registered services 43 // for tasks. 44 nomadTaskPrefix = nomadServicePrefix + "-task-" 45 46 // nomadCheckPrefix is the prefix that scopes Nomad registered checks for 47 // services. 48 nomadCheckPrefix = nomadServicePrefix + "-check-" 49 50 // defaultRetryInterval is how quickly to retry syncing services and 51 // checks to Consul when an error occurs. Will backoff up to a max. 52 defaultRetryInterval = time.Second 53 54 // defaultMaxRetryInterval is the default max retry interval. 55 defaultMaxRetryInterval = 30 * time.Second 56 57 // defaultPeriodicalInterval is the interval at which the service 58 // client reconciles state between the desired services and checks and 59 // what's actually registered in Consul. This is done at an interval, 60 // rather than being purely edge triggered, to handle the case that the 61 // Consul agent's state may change underneath us 62 defaultPeriodicInterval = 30 * time.Second 63 64 // ttlCheckBuffer is the time interval that Nomad can take to report Consul 65 // the check result 66 ttlCheckBuffer = 31 * time.Second 67 68 // defaultShutdownWait is how long Shutdown() should block waiting for 69 // enqueued operations to sync to Consul by default. 70 defaultShutdownWait = time.Minute 71 72 // DefaultQueryWaitDuration is the max duration the Consul Agent will 73 // spend waiting for a response from a Consul Query. 74 DefaultQueryWaitDuration = 2 * time.Second 75 76 // ServiceTagHTTP is the tag assigned to HTTP services 77 ServiceTagHTTP = "http" 78 79 // ServiceTagRPC is the tag assigned to RPC services 80 ServiceTagRPC = "rpc" 81 82 // ServiceTagSerf is the tag assigned to Serf services 83 ServiceTagSerf = "serf" 84 85 // deregisterProbationPeriod is the initialization period where 86 // services registered in Consul but not in Nomad don't get deregistered, 87 // to allow for nomad restoring tasks 88 deregisterProbationPeriod = time.Minute 89 ) 90 91 // Additional Consul ACLs required 92 // - Consul Template: key:read 93 // Used in tasks with template block that use Consul keys. 94 95 // CatalogAPI is the consul/api.Catalog API used by Nomad. 96 // 97 // ACL requirements 98 // - node:read (listing datacenters) 99 // - service:read 100 type CatalogAPI interface { 101 Datacenters() ([]string, error) 102 Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error) 103 } 104 105 // NamespaceAPI is the consul/api.Namespace API used by Nomad. 106 // 107 // ACL requirements 108 // - operator:read OR namespace:*:read 109 type NamespaceAPI interface { 110 List(q *api.QueryOptions) ([]*api.Namespace, *api.QueryMeta, error) 111 } 112 113 // AgentAPI is the consul/api.Agent API used by Nomad. 114 // 115 // ACL requirements 116 // - agent:read 117 // - service:write 118 type AgentAPI interface { 119 CheckRegister(check *api.AgentCheckRegistration) error 120 CheckDeregisterOpts(checkID string, q *api.QueryOptions) error 121 ChecksWithFilterOpts(filter string, q *api.QueryOptions) (map[string]*api.AgentCheck, error) 122 UpdateTTLOpts(id, output, status string, q *api.QueryOptions) error 123 124 ServiceRegister(service *api.AgentServiceRegistration) error 125 ServiceDeregisterOpts(serviceID string, q *api.QueryOptions) error 126 ServicesWithFilterOpts(filter string, q *api.QueryOptions) (map[string]*api.AgentService, error) 127 128 Self() (map[string]map[string]interface{}, error) 129 } 130 131 // ConfigAPI is the consul/api.ConfigEntries API subset used by Nomad Server. 132 // 133 // ACL requirements 134 // - operator:write (server only) 135 type ConfigAPI interface { 136 Set(entry api.ConfigEntry, w *api.WriteOptions) (bool, *api.WriteMeta, error) 137 // Delete(kind, name string, w *api.WriteOptions) (*api.WriteMeta, error) (not used) 138 } 139 140 // ACLsAPI is the consul/api.ACL API subset used by Nomad Server. 141 // 142 // ACL requirements 143 // - acl:write (server only) 144 type ACLsAPI interface { 145 TokenReadSelf(q *api.QueryOptions) (*api.ACLToken, *api.QueryMeta, error) // for lookup via operator token 146 PolicyRead(policyID string, q *api.QueryOptions) (*api.ACLPolicy, *api.QueryMeta, error) 147 RoleRead(roleID string, q *api.QueryOptions) (*api.ACLRole, *api.QueryMeta, error) 148 TokenCreate(partial *api.ACLToken, q *api.WriteOptions) (*api.ACLToken, *api.WriteMeta, error) 149 TokenDelete(accessorID string, q *api.WriteOptions) (*api.WriteMeta, error) 150 TokenList(q *api.QueryOptions) ([]*api.ACLTokenListEntry, *api.QueryMeta, error) 151 } 152 153 // agentServiceUpdateRequired checks if any critical fields in Nomad's version 154 // of a service definition are different from the existing service definition as 155 // known by Consul. 156 // 157 // reason - The syncReason that triggered this synchronization with the consul 158 // agent API. 159 // wanted - Nomad's view of what the service definition is intended to be. 160 // Not nil. 161 // existing - Consul's view (agent, not catalog) of the actual service definition. 162 // Not nil. 163 // sidecar - Consul's view (agent, not catalog) of the service definition of the sidecar 164 // associated with existing that may or may not exist. 165 // May be nil. 166 func (s *ServiceClient) agentServiceUpdateRequired(reason syncReason, wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) bool { 167 switch reason { 168 case syncPeriodic: 169 // In a periodic sync with Consul, we need to respect the value of 170 // the enable_tag_override field so that we maintain the illusion that the 171 // user is in control of the Consul tags, as they may be externally edited 172 // via the Consul catalog API (e.g. a user manually sets them). 173 // 174 // As Consul does by disabling anti-entropy for the tags field, Nomad will 175 // ignore differences in the tags field during the periodic syncs with 176 // the Consul agent API. 177 // 178 // We do so by over-writing the nomad service registration by the value 179 // of the tags that Consul contains, if enable_tag_override = true. 180 maybeTweakTags(wanted, existing, sidecar) 181 182 // Also, purge tagged address fields of nomad agent services. 183 maybeTweakTaggedAddresses(wanted, existing) 184 185 // Okay now it is safe to compare. 186 return s.different(wanted, existing, sidecar) 187 188 default: 189 // A non-periodic sync with Consul indicates an operation has been set 190 // on the queue. This happens when service has been added / removed / modified 191 // and implies the Consul agent should be sync'd with nomad, because 192 // nomad is the ultimate source of truth for the service definition. 193 194 // But do purge tagged address fields of nomad agent services. 195 maybeTweakTaggedAddresses(wanted, existing) 196 197 // Okay now it is safe to compare. 198 return s.different(wanted, existing, sidecar) 199 } 200 } 201 202 // maybeTweakTags will override wanted.Tags with a copy of existing.Tags only if 203 // EnableTagOverride is true. Otherwise the wanted service registration is left 204 // unchanged. 205 func maybeTweakTags(wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) { 206 if wanted.EnableTagOverride { 207 wanted.Tags = slices.Clone(existing.Tags) 208 // If the service registration also defines a sidecar service, use the ETO 209 // setting for the parent service to also apply to the sidecar. 210 if wanted.Connect != nil && wanted.Connect.SidecarService != nil { 211 if sidecar != nil { 212 wanted.Connect.SidecarService.Tags = slices.Clone(sidecar.Tags) 213 } 214 } 215 } 216 } 217 218 // maybeTweakTaggedAddresses will remove the Consul-injected .TaggedAddresses fields 219 // from existing if wanted represents a Nomad agent (Client or Server) or Nomad managed 220 // service, which do not themselves configure those tagged addresses. We do this 221 // because Consul will magically set the .TaggedAddress to values Nomad does not 222 // know about if they are submitted as unset. 223 func maybeTweakTaggedAddresses(wanted *api.AgentServiceRegistration, existing *api.AgentService) { 224 if isNomadAgent(wanted.ID) || isNomadService(wanted.ID) { 225 if _, exists := wanted.TaggedAddresses["lan_ipv4"]; !exists { 226 delete(existing.TaggedAddresses, "lan_ipv4") 227 } 228 if _, exists := wanted.TaggedAddresses["wan_ipv4"]; !exists { 229 delete(existing.TaggedAddresses, "wan_ipv4") 230 } 231 if _, exists := wanted.TaggedAddresses["lan_ipv6"]; !exists { 232 delete(existing.TaggedAddresses, "lan_ipv6") 233 } 234 if _, exists := wanted.TaggedAddresses["wan_ipv6"]; !exists { 235 delete(existing.TaggedAddresses, "wan_ipv6") 236 } 237 } 238 } 239 240 // different compares the wanted state of the service registration with the actual 241 // (cached) state of the service registration reported by Consul. If any of the 242 // critical fields are not deeply equal, they considered different. 243 func (s *ServiceClient) different(wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) bool { 244 trace := func(field string, left, right any) { 245 s.logger.Trace("registrations different", "id", wanted.ID, 246 "field", field, "wanted", fmt.Sprintf("%#v", left), "existing", fmt.Sprintf("%#v", right), 247 ) 248 } 249 250 switch { 251 case wanted.Kind != existing.Kind: 252 trace("kind", wanted.Kind, existing.Kind) 253 return true 254 case wanted.ID != existing.ID: 255 trace("id", wanted.ID, existing.ID) 256 return true 257 case wanted.Port != existing.Port: 258 trace("port", wanted.Port, existing.Port) 259 return true 260 case wanted.Address != existing.Address: 261 trace("address", wanted.Address, existing.Address) 262 return true 263 case wanted.Name != existing.Service: 264 trace("service name", wanted.Name, existing.Service) 265 return true 266 case wanted.EnableTagOverride != existing.EnableTagOverride: 267 trace("enable_tag_override", wanted.EnableTagOverride, existing.EnableTagOverride) 268 return true 269 case !maps.Equal(wanted.Meta, existing.Meta): 270 trace("meta", wanted.Meta, existing.Meta) 271 return true 272 case !maps.Equal(wanted.TaggedAddresses, existing.TaggedAddresses): 273 trace("tagged_addresses", wanted.TaggedAddresses, existing.TaggedAddresses) 274 return true 275 case !helper.SliceSetEq(wanted.Tags, existing.Tags): 276 trace("tags", wanted.Tags, existing.Tags) 277 return true 278 case connectSidecarDifferent(wanted, sidecar): 279 trace("connect_sidecar", wanted.Name, existing.Service) 280 return true 281 } 282 return false 283 } 284 285 // sidecarTagsDifferent includes the special logic for comparing sidecar tags 286 // from Nomad vs. Consul perspective. Because Consul forces the sidecar tags 287 // to inherit the parent service tags if the sidecar tags are unset, we need to 288 // take that into consideration when Nomad's sidecar tags are unset by instead 289 // comparing them to the parent service tags. 290 func sidecarTagsDifferent(parent, wanted, sidecar []string) bool { 291 if len(wanted) == 0 { 292 return !helper.SliceSetEq(parent, sidecar) 293 } 294 return !helper.SliceSetEq(wanted, sidecar) 295 } 296 297 // proxyUpstreamsDifferent determines if the sidecar_service.proxy.upstreams 298 // configurations are different between the desired sidecar service state, and 299 // the actual sidecar service state currently registered in Consul. 300 func proxyUpstreamsDifferent(wanted *api.AgentServiceConnect, sidecar *api.AgentServiceConnectProxyConfig) bool { 301 // There is similar code that already does this in Nomad's API package, 302 // however here we are operating on Consul API package structs, and they do not 303 // provide such helper functions. 304 305 getProxyUpstreams := func(pc *api.AgentServiceConnectProxyConfig) []api.Upstream { 306 switch { 307 case pc == nil: 308 return nil 309 case len(pc.Upstreams) == 0: 310 return nil 311 default: 312 return pc.Upstreams 313 } 314 } 315 316 getConnectUpstreams := func(sc *api.AgentServiceConnect) []api.Upstream { 317 switch { 318 case sc.SidecarService.Proxy == nil: 319 return nil 320 case len(sc.SidecarService.Proxy.Upstreams) == 0: 321 return nil 322 default: 323 return sc.SidecarService.Proxy.Upstreams 324 } 325 } 326 327 upstreamsDifferent := func(a, b []api.Upstream) bool { 328 if len(a) != len(b) { 329 return true 330 } 331 332 for i := 0; i < len(a); i++ { 333 A := a[i] 334 B := b[i] 335 switch { 336 case A.Datacenter != B.Datacenter: 337 return true 338 case A.DestinationName != B.DestinationName: 339 return true 340 case A.LocalBindAddress != B.LocalBindAddress: 341 return true 342 case A.LocalBindPort != B.LocalBindPort: 343 return true 344 case A.MeshGateway.Mode != B.MeshGateway.Mode: 345 return true 346 case !reflect.DeepEqual(A.Config, B.Config): 347 return true 348 } 349 } 350 return false 351 } 352 353 return upstreamsDifferent( 354 getConnectUpstreams(wanted), 355 getProxyUpstreams(sidecar), 356 ) 357 } 358 359 // connectSidecarDifferent returns true if Nomad expects there to be a sidecar 360 // hanging off the desired parent service definition on the Consul side, and does 361 // not match with what Consul has. 362 // 363 // This is used to determine if the connect sidecar service registration should be 364 // updated - potentially (but not necessarily) in-place. 365 func connectSidecarDifferent(wanted *api.AgentServiceRegistration, sidecar *api.AgentService) bool { 366 if wanted.Connect != nil && wanted.Connect.SidecarService != nil { 367 if sidecar == nil { 368 // consul lost our sidecar (?) 369 return true 370 } 371 372 if sidecarTagsDifferent(wanted.Tags, wanted.Connect.SidecarService.Tags, sidecar.Tags) { 373 // tags on the nomad definition have been modified 374 return true 375 } 376 377 if proxyUpstreamsDifferent(wanted.Connect, sidecar.Proxy) { 378 // proxy upstreams on the nomad definition have been modified 379 return true 380 } 381 } 382 383 // Either Nomad does not expect there to be a sidecar_service, or there is 384 // no actionable difference from the Consul sidecar_service definition. 385 return false 386 } 387 388 // operations are submitted to the main loop via commit() for synchronizing 389 // with Consul. 390 type operations struct { 391 regServices []*api.AgentServiceRegistration 392 regChecks []*api.AgentCheckRegistration 393 deregServices []string 394 deregChecks []string 395 } 396 397 func (o *operations) empty() bool { 398 switch { 399 case o == nil: 400 return true 401 case len(o.regServices) > 0: 402 return false 403 case len(o.regChecks) > 0: 404 return false 405 case len(o.deregServices) > 0: 406 return false 407 case len(o.deregChecks) > 0: 408 return false 409 default: 410 return true 411 } 412 } 413 414 func (o *operations) String() string { 415 return fmt.Sprintf("<%d, %d, %d, %d>", len(o.regServices), len(o.regChecks), len(o.deregServices), len(o.deregChecks)) 416 } 417 418 // ServiceClient handles task and agent service registration with Consul. 419 type ServiceClient struct { 420 agentAPI AgentAPI 421 namespacesClient *NamespacesClient 422 423 logger hclog.Logger 424 retryInterval time.Duration 425 maxRetryInterval time.Duration 426 periodicInterval time.Duration 427 428 // exitCh is closed when the main Run loop exits 429 exitCh chan struct{} 430 431 // shutdownCh is closed when the client should shutdown 432 shutdownCh chan struct{} 433 434 // shutdownWait is how long Shutdown() blocks waiting for the final 435 // sync() to finish. Defaults to defaultShutdownWait 436 shutdownWait time.Duration 437 438 opCh chan *operations 439 440 services map[string]*api.AgentServiceRegistration 441 checks map[string]*api.AgentCheckRegistration 442 443 explicitlyDeregisteredServices *set.Set[string] 444 explicitlyDeregisteredChecks *set.Set[string] 445 446 // allocRegistrations stores the services and checks that are registered 447 // with Consul by allocation ID. 448 allocRegistrations map[string]*serviceregistration.AllocRegistration 449 allocRegistrationsLock sync.RWMutex 450 451 // Nomad agent services and checks that are recorded so they can be removed 452 // on shutdown. Defers to consul namespace specified in client consul config. 453 agentServices *set.Set[string] 454 agentChecks *set.Set[string] 455 agentLock sync.Mutex 456 457 // seen is 1 if Consul has ever been seen; otherwise 0. Accessed with 458 // atomics. 459 seen int32 460 461 // deregisterProbationExpiry is the time before which consul sync shouldn't deregister 462 // unknown services. 463 // Used to mitigate risk of deleting restored services upon client restart. 464 deregisterProbationExpiry time.Time 465 466 // checkWatcher restarts checks that are unhealthy. 467 checkWatcher *serviceregistration.UniversalCheckWatcher 468 469 // isClientAgent specifies whether this Consul client is being used 470 // by a Nomad client. 471 isClientAgent bool 472 } 473 474 // checkStatusGetter is the consul-specific implementation of serviceregistration.CheckStatusGetter 475 type checkStatusGetter struct { 476 agentAPI AgentAPI 477 namespacesClient *NamespacesClient 478 } 479 480 func (csg *checkStatusGetter) Get() (map[string]string, error) { 481 // Get the list of all namespaces so we can iterate them. 482 namespaces, err := csg.namespacesClient.List() 483 if err != nil { 484 return nil, err 485 } 486 487 results := make(map[string]string) 488 for _, namespace := range namespaces { 489 resultsInNamespace, err := csg.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}) 490 if err != nil { 491 return nil, err 492 } 493 494 for k, v := range resultsInNamespace { 495 results[k] = v.Status 496 } 497 } 498 return results, nil 499 } 500 501 // NewServiceClient creates a new Consul ServiceClient from an existing Consul API 502 // Client, logger and takes whether the client is being used by a Nomad Client agent. 503 // When being used by a Nomad client, this Consul client reconciles all services and 504 // checks created by Nomad on behalf of running tasks. 505 func NewServiceClient(agentAPI AgentAPI, namespacesClient *NamespacesClient, logger hclog.Logger, isNomadClient bool) *ServiceClient { 506 logger = logger.ResetNamed("consul.sync") 507 return &ServiceClient{ 508 agentAPI: agentAPI, 509 namespacesClient: namespacesClient, 510 logger: logger, 511 retryInterval: defaultRetryInterval, 512 maxRetryInterval: defaultMaxRetryInterval, 513 periodicInterval: defaultPeriodicInterval, 514 exitCh: make(chan struct{}), 515 shutdownCh: make(chan struct{}), 516 shutdownWait: defaultShutdownWait, 517 opCh: make(chan *operations, 8), 518 services: make(map[string]*api.AgentServiceRegistration), 519 checks: make(map[string]*api.AgentCheckRegistration), 520 explicitlyDeregisteredServices: set.New[string](0), 521 explicitlyDeregisteredChecks: set.New[string](0), 522 allocRegistrations: make(map[string]*serviceregistration.AllocRegistration), 523 agentServices: set.New[string](4), 524 agentChecks: set.New[string](0), 525 isClientAgent: isNomadClient, 526 deregisterProbationExpiry: time.Now().Add(deregisterProbationPeriod), 527 checkWatcher: serviceregistration.NewCheckWatcher(logger, &checkStatusGetter{ 528 agentAPI: agentAPI, 529 namespacesClient: namespacesClient, 530 }), 531 } 532 } 533 534 // seen is used by markSeen and hasSeen 535 const seen = 1 536 537 // markSeen marks Consul as having been seen (meaning at least one operation 538 // has succeeded). 539 func (c *ServiceClient) markSeen() { 540 atomic.StoreInt32(&c.seen, seen) 541 } 542 543 // hasSeen returns true if any Consul operation has ever succeeded. Useful to 544 // squelch errors if Consul isn't running. 545 func (c *ServiceClient) hasSeen() bool { 546 return atomic.LoadInt32(&c.seen) == seen 547 } 548 549 // syncReason indicates why a sync operation with consul is about to happen. 550 // 551 // The trigger for a sync may have implications on the behavior of the sync itself. 552 // In particular if a service is defined with enable_tag_override=true, the sync 553 // should ignore changes to the service's Tags field. 554 type syncReason byte 555 556 const ( 557 syncPeriodic syncReason = iota 558 syncShutdown 559 syncNewOps 560 ) 561 562 func (sr syncReason) String() string { 563 switch sr { 564 case syncPeriodic: 565 return "periodic" 566 case syncShutdown: 567 return "shutdown" 568 case syncNewOps: 569 return "operations" 570 default: 571 return "unexpected" 572 } 573 } 574 575 // Run the Consul main loop which retries operations against Consul. It should 576 // be called exactly once. 577 func (c *ServiceClient) Run() { 578 defer close(c.exitCh) 579 580 ctx, cancel := context.WithCancel(context.Background()) 581 defer cancel() 582 583 // init will be closed when Consul has been contacted 584 init := make(chan struct{}) 585 go checkConsulTLSSkipVerify(ctx, c.logger, c.agentAPI, init) 586 587 // Process operations while waiting for initial contact with Consul but 588 // do not sync until contact has been made. 589 INIT: 590 for { 591 select { 592 case <-init: 593 c.markSeen() 594 break INIT 595 case <-c.shutdownCh: 596 return 597 case ops := <-c.opCh: 598 c.merge(ops) 599 } 600 } 601 c.logger.Trace("able to contact Consul") 602 603 // Block until contact with Consul has been established 604 // Start checkWatcher 605 go c.checkWatcher.Run(ctx) 606 607 // Always immediately sync to reconcile Nomad and Consul's state 608 retryTimer := time.NewTimer(0) 609 610 failures := 0 611 for { 612 // On every iteration take note of what the trigger for the next sync 613 // was, so that it may be referenced during the sync itself. 614 var reasonForSync syncReason 615 616 select { 617 case <-retryTimer.C: 618 reasonForSync = syncPeriodic 619 case <-c.shutdownCh: 620 reasonForSync = syncShutdown 621 // Cancel check watcher but sync one last time 622 cancel() 623 case ops := <-c.opCh: 624 reasonForSync = syncNewOps 625 c.merge(ops) 626 } 627 628 if err := c.sync(reasonForSync); err != nil { 629 if failures == 0 { 630 // Log on the first failure 631 c.logger.Warn("failed to update services in Consul", "error", err) 632 } else if failures%10 == 0 { 633 // Log every 10th consecutive failure 634 c.logger.Error("still unable to update services in Consul", "failures", failures, "error", err) 635 } 636 637 failures++ 638 if !retryTimer.Stop() { 639 // Timer already expired, since the timer may 640 // or may not have been read in the select{} 641 // above, conditionally receive on it 642 select { 643 case <-retryTimer.C: 644 default: 645 } 646 } 647 backoff := c.retryInterval * time.Duration(failures) 648 if backoff > c.maxRetryInterval { 649 backoff = c.maxRetryInterval 650 } 651 retryTimer.Reset(backoff) 652 } else { 653 if failures > 0 { 654 c.logger.Info("successfully updated services in Consul") 655 failures = 0 656 } 657 658 // on successful sync, clear deregistered consul entities 659 c.clearExplicitlyDeregistered() 660 661 // Reset timer to periodic interval to periodically 662 // reconile with Consul 663 if !retryTimer.Stop() { 664 select { 665 case <-retryTimer.C: 666 default: 667 } 668 } 669 retryTimer.Reset(c.periodicInterval) 670 } 671 672 select { 673 case <-c.shutdownCh: 674 // Exit only after sync'ing all outstanding operations 675 if len(c.opCh) > 0 { 676 for len(c.opCh) > 0 { 677 c.merge(<-c.opCh) 678 } 679 continue 680 } 681 return 682 default: 683 } 684 685 } 686 } 687 688 // commit operations unless already shutting down. 689 func (c *ServiceClient) commit(ops *operations) { 690 c.logger.Trace("commit sync operations", "ops", ops) 691 692 // Ignore empty operations - ideally callers will optimize out syncs with 693 // nothing to do, but be defensive anyway. Sending an empty ops on the chan 694 // will trigger an unnecessary sync with Consul. 695 if ops.empty() { 696 return 697 } 698 699 // Prioritize doing nothing if we are being signaled to shutdown. 700 select { 701 case <-c.shutdownCh: 702 return 703 default: 704 } 705 706 // Send the ops down the ops chan, triggering a sync with Consul. Unless we 707 // receive a signal to shutdown. 708 select { 709 case c.opCh <- ops: 710 case <-c.shutdownCh: 711 } 712 } 713 714 func (c *ServiceClient) clearExplicitlyDeregistered() { 715 c.explicitlyDeregisteredServices = set.New[string](0) 716 c.explicitlyDeregisteredChecks = set.New[string](0) 717 } 718 719 // merge registrations into state map prior to sync'ing with Consul 720 func (c *ServiceClient) merge(ops *operations) { 721 for _, s := range ops.regServices { 722 c.services[s.ID] = s 723 } 724 for _, check := range ops.regChecks { 725 c.checks[check.ID] = check 726 } 727 for _, sid := range ops.deregServices { 728 delete(c.services, sid) 729 c.explicitlyDeregisteredServices.Insert(sid) 730 } 731 for _, cid := range ops.deregChecks { 732 delete(c.checks, cid) 733 c.explicitlyDeregisteredChecks.Insert(cid) 734 } 735 metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services))) 736 metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks))) 737 } 738 739 // sync enqueued operations. 740 func (c *ServiceClient) sync(reason syncReason) error { 741 c.logger.Trace("execute sync", "reason", reason) 742 743 sreg, creg, sdereg, cdereg := 0, 0, 0, 0 744 var err error 745 746 // Get the list of all namespaces created so we can iterate them. 747 namespaces, err := c.namespacesClient.List() 748 if err != nil { 749 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 750 return fmt.Errorf("failed to query Consul namespaces: %w", err) 751 } 752 753 // Accumulate all services in Consul across all namespaces. 754 servicesInConsul := make(map[string]*api.AgentService) 755 for _, namespace := range namespaces { 756 if nsServices, err := c.agentAPI.ServicesWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}); err != nil { 757 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 758 return fmt.Errorf("failed to query Consul services: %w", err) 759 } else { 760 for k, v := range nsServices { 761 servicesInConsul[k] = v 762 } 763 } 764 } 765 766 // Compute whether we are still in probation period where we will avoid 767 // de-registering services. 768 inProbation := time.Now().Before(c.deregisterProbationExpiry) 769 770 // Remove Nomad services in Consul but unknown to Nomad. 771 for id := range servicesInConsul { 772 if _, ok := c.services[id]; ok { 773 // Known service, skip 774 continue 775 } 776 777 // Ignore if this is not a Nomad managed service. Also ignore 778 // Nomad managed services if this is not a client agent. 779 // This is to prevent server agents from removing services 780 // registered by client agents 781 if !isNomadService(id) || !c.isClientAgent { 782 // Not managed by Nomad, skip 783 continue 784 } 785 786 // Ignore unknown services during probation 787 if inProbation && !c.explicitlyDeregisteredServices.Contains(id) { 788 continue 789 } 790 791 // Ignore if this is a service for a Nomad managed sidecar proxy. 792 if maybeConnectSidecar(id) { 793 continue 794 } 795 796 // Get the Consul namespace this service is in. 797 ns := servicesInConsul[id].Namespace 798 799 // If this service has a sidecar, we need to remove the sidecar first, 800 // otherwise Consul will produce a warning and an error when removing 801 // the parent service. 802 // 803 // The sidecar is not tracked on the Nomad side; it was registered 804 // implicitly through the parent service. 805 if sidecar := getNomadSidecar(id, servicesInConsul); sidecar != nil { 806 if err := c.agentAPI.ServiceDeregisterOpts(sidecar.ID, &api.QueryOptions{Namespace: ns}); err != nil { 807 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 808 return err 809 } 810 } 811 812 // Remove the unwanted service. 813 if err := c.agentAPI.ServiceDeregisterOpts(id, &api.QueryOptions{Namespace: ns}); err != nil { 814 if isOldNomadService(id) { 815 // Don't hard-fail on old entries. See #3620 816 continue 817 } 818 819 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 820 return err 821 } 822 sdereg++ 823 metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1) 824 } 825 826 // Add Nomad managed services missing in Consul, or updated via Nomad. 827 for id, serviceInNomad := range c.services { 828 serviceInConsul, exists := servicesInConsul[id] 829 sidecarInConsul := getNomadSidecar(id, servicesInConsul) 830 831 if !exists || c.agentServiceUpdateRequired(reason, serviceInNomad, serviceInConsul, sidecarInConsul) { 832 c.logger.Trace("must register service", "id", id, "exists", exists, "reason", reason) 833 if err = c.agentAPI.ServiceRegister(serviceInNomad); err != nil { 834 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 835 return err 836 } 837 sreg++ 838 metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1) 839 } 840 841 } 842 843 checksInConsul := make(map[string]*api.AgentCheck) 844 for _, namespace := range namespaces { 845 nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}) 846 if err != nil { 847 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 848 return fmt.Errorf("failed to query Consul checks: %w", err) 849 } 850 for k, v := range nsChecks { 851 checksInConsul[k] = v 852 } 853 } 854 855 // Remove Nomad checks in Consul but unknown locally 856 for id, check := range checksInConsul { 857 if _, ok := c.checks[id]; ok { 858 // Known check, leave it 859 continue 860 } 861 862 // Ignore if this is not a Nomad managed check. Also ignore 863 // Nomad managed checks if this is not a client agent. 864 // This is to prevent server agents from removing checks 865 // registered by client agents 866 if !isNomadService(check.ServiceID) || !c.isClientAgent || !isNomadCheck(check.CheckID) { 867 // Service not managed by Nomad, skip 868 continue 869 } 870 871 // Ignore unknown services during probation 872 if inProbation && !c.explicitlyDeregisteredChecks.Contains(id) { 873 continue 874 } 875 876 // Ignore if this is a check for a Nomad managed sidecar proxy. 877 if maybeSidecarProxyCheck(id) { 878 continue 879 } 880 881 // Unknown Nomad managed check; remove 882 if err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: check.Namespace}); err != nil { 883 if isOldNomadService(check.ServiceID) { 884 // Don't hard-fail on old entries. 885 continue 886 } 887 888 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 889 return err 890 } 891 cdereg++ 892 metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1) 893 } 894 895 // Add Nomad checks missing from Consul 896 for id, check := range c.checks { 897 if _, ok := checksInConsul[id]; ok { 898 // Already in Consul; skipping 899 continue 900 } 901 if err := c.agentAPI.CheckRegister(check); err != nil { 902 metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) 903 return err 904 } 905 creg++ 906 metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1) 907 } 908 909 // Only log if something was actually synced 910 if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 { 911 c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg, 912 "registered_checks", creg, "deregistered_checks", cdereg) 913 } 914 return nil 915 } 916 917 // RegisterAgent registers Nomad agents (client or server). The 918 // Service.PortLabel should be a literal port to be parsed with SplitHostPort. 919 // Script checks are not supported and will return an error. Registration is 920 // asynchronous. 921 // 922 // Agents will be deregistered when Shutdown is called. 923 // 924 // Note: no need to manually plumb Consul namespace into the agent service registration 925 // or its check registrations, because the Nomad Client's Consul Client will already 926 // have the Nomad Client's Consul Namespace set on startup. 927 func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error { 928 ops := operations{} 929 930 for _, service := range services { 931 id := makeAgentServiceID(role, service) 932 933 // Unlike tasks, agents don't use port labels. Agent ports are 934 // stored directly in the PortLabel. 935 host, rawport, err := net.SplitHostPort(service.PortLabel) 936 if err != nil { 937 return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err) 938 } 939 port, err := strconv.Atoi(rawport) 940 if err != nil { 941 return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err) 942 } 943 serviceReg := &api.AgentServiceRegistration{ 944 ID: id, 945 Name: service.Name, 946 Tags: service.Tags, 947 Address: host, 948 Port: port, 949 // This enables the consul UI to show that Nomad registered this service 950 Meta: map[string]string{ 951 "external-source": "nomad", 952 }, 953 } 954 ops.regServices = append(ops.regServices, serviceReg) 955 956 for _, check := range service.Checks { 957 checkID := MakeCheckID(id, check) 958 if check.Type == structs.ServiceCheckScript { 959 return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name) 960 } 961 checkHost, checkPort := serviceReg.Address, serviceReg.Port 962 if check.PortLabel != "" { 963 // Unlike tasks, agents don't use port labels. Agent ports are 964 // stored directly in the PortLabel. 965 host, rawport, err := net.SplitHostPort(check.PortLabel) 966 if err != nil { 967 return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err) 968 } 969 port, err := strconv.Atoi(rawport) 970 if err != nil { 971 return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err) 972 } 973 checkHost, checkPort = host, port 974 } 975 checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort, "") 976 if err != nil { 977 return fmt.Errorf("failed to add check %q: %v", check.Name, err) 978 } 979 ops.regChecks = append(ops.regChecks, checkReg) 980 } 981 } 982 983 // Don't bother committing agent checks if we're already shutting down 984 c.agentLock.Lock() 985 defer c.agentLock.Unlock() 986 select { 987 case <-c.shutdownCh: 988 return nil 989 default: 990 } 991 992 // Now add them to the registration queue 993 c.commit(&ops) 994 995 // Record IDs for deregistering on shutdown 996 for _, id := range ops.regServices { 997 c.agentServices.Insert(id.ID) 998 } 999 for _, id := range ops.regChecks { 1000 c.agentChecks.Insert(id.ID) 1001 } 1002 return nil 1003 } 1004 1005 // serviceRegs creates service registrations, check registrations, and script 1006 // checks from a service. It returns a service registration object with the 1007 // service and check IDs populated. 1008 func (c *ServiceClient) serviceRegs( 1009 ops *operations, 1010 service *structs.Service, 1011 workload *serviceregistration.WorkloadServices, 1012 ) (*serviceregistration.ServiceRegistration, error) { 1013 1014 // Get the services ID 1015 id := serviceregistration.MakeAllocServiceID(workload.AllocInfo.AllocID, workload.Name(), service) 1016 sreg := &serviceregistration.ServiceRegistration{ 1017 ServiceID: id, 1018 CheckIDs: make(map[string]struct{}, len(service.Checks)), 1019 CheckOnUpdate: make(map[string]string, len(service.Checks)), 1020 } 1021 1022 // Service address modes default to auto 1023 addrMode := service.AddressMode 1024 if addrMode == "" { 1025 addrMode = structs.AddressModeAuto 1026 } 1027 1028 // Determine the address to advertise based on the mode 1029 ip, port, err := serviceregistration.GetAddress( 1030 service.Address, addrMode, service.PortLabel, workload.Networks, workload.DriverNetwork, workload.Ports, workload.NetworkStatus) 1031 if err != nil { 1032 return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err) 1033 } 1034 1035 // Determine whether to use tags or canary_tags 1036 var tags []string 1037 if workload.Canary && len(service.CanaryTags) > 0 { 1038 tags = make([]string, len(service.CanaryTags)) 1039 copy(tags, service.CanaryTags) 1040 } else { 1041 tags = make([]string, len(service.Tags)) 1042 copy(tags, service.Tags) 1043 } 1044 1045 // newConnect returns (nil, nil) if there's no Connect-enabled service. 1046 connect, err := newConnect(id, workload.AllocInfo, service.Name, service.Connect, workload.Networks, workload.Ports) 1047 if err != nil { 1048 return nil, fmt.Errorf("invalid Consul Connect configuration for service %q: %v", service.Name, err) 1049 } 1050 1051 // newConnectGateway returns nil if there's no Connect gateway. 1052 gateway := newConnectGateway(service.Connect) 1053 1054 // Determine whether to use meta or canary_meta 1055 var meta map[string]string 1056 if workload.Canary && len(service.CanaryMeta) > 0 { 1057 meta = make(map[string]string, len(service.CanaryMeta)+1) 1058 for k, v := range service.CanaryMeta { 1059 meta[k] = v 1060 } 1061 } else { 1062 meta = make(map[string]string, len(service.Meta)+1) 1063 for k, v := range service.Meta { 1064 meta[k] = v 1065 } 1066 } 1067 1068 // This enables the consul UI to show that Nomad registered this service 1069 meta["external-source"] = "nomad" 1070 1071 // Explicitly set the Consul service Kind in case this service represents 1072 // one of the Connect gateway types. 1073 kind := api.ServiceKindTypical 1074 switch { 1075 case service.Connect.IsIngress(): 1076 kind = api.ServiceKindIngressGateway 1077 case service.Connect.IsTerminating(): 1078 kind = api.ServiceKindTerminatingGateway 1079 1080 if proxy := service.Connect.Gateway.Proxy; proxy != nil { 1081 // set the default port if bridge / default listener set 1082 if defaultBind, exists := proxy.EnvoyGatewayBindAddresses["default"]; exists { 1083 portLabel := envoy.PortLabel(structs.ConnectTerminatingPrefix, service.Name, "") 1084 if dynPort, ok := workload.Ports.Get(portLabel); ok { 1085 defaultBind.Port = dynPort.Value 1086 } 1087 } 1088 } 1089 case service.Connect.IsMesh(): 1090 kind = api.ServiceKindMeshGateway 1091 1092 if proxy := service.Connect.Gateway.Proxy; proxy != nil { 1093 // wan uses the service port label, which is typically on a discrete host_network 1094 if wanBind, exists := proxy.EnvoyGatewayBindAddresses["wan"]; exists { 1095 if wanPort, ok := workload.Ports.Get(service.PortLabel); ok { 1096 wanBind.Port = wanPort.Value 1097 } 1098 } 1099 // lan uses a nomad generated dynamic port on the default network 1100 if lanBind, exists := proxy.EnvoyGatewayBindAddresses["lan"]; exists { 1101 portLabel := envoy.PortLabel(structs.ConnectMeshPrefix, service.Name, "lan") 1102 if dynPort, ok := workload.Ports.Get(portLabel); ok { 1103 lanBind.Port = dynPort.Value 1104 } 1105 } 1106 } 1107 } 1108 1109 taggedAddresses, err := parseTaggedAddresses(service.TaggedAddresses, port) 1110 if err != nil { 1111 return nil, err 1112 } 1113 1114 // Build the Consul Service registration request 1115 serviceReg := &api.AgentServiceRegistration{ 1116 Kind: kind, 1117 ID: id, 1118 Name: service.Name, 1119 Namespace: workload.ProviderNamespace, 1120 Tags: tags, 1121 EnableTagOverride: service.EnableTagOverride, 1122 Address: ip, 1123 Port: port, 1124 Meta: meta, 1125 TaggedAddresses: taggedAddresses, 1126 Connect: connect, // will be nil if no Connect block 1127 Proxy: gateway, // will be nil if no Connect Gateway block 1128 Checks: make([]*api.AgentServiceCheck, 0, len(service.Checks)), 1129 } 1130 ops.regServices = append(ops.regServices, serviceReg) 1131 1132 // Build the check registrations 1133 checkRegs, err := c.checkRegs(id, service, workload, sreg) 1134 if err != nil { 1135 return nil, err 1136 } 1137 1138 for _, registration := range checkRegs { 1139 sreg.CheckIDs[registration.ID] = struct{}{} 1140 ops.regChecks = append(ops.regChecks, registration) 1141 serviceReg.Checks = append( 1142 serviceReg.Checks, 1143 apiCheckRegistrationToCheck(registration), 1144 ) 1145 } 1146 1147 return sreg, nil 1148 } 1149 1150 // apiCheckRegistrationToCheck converts a check registration to a check, so that 1151 // we can include them in the initial service registration. It is expected the 1152 // Nomad-conversion (e.g. turning script checks into ttl checks) has already been 1153 // applied. 1154 func apiCheckRegistrationToCheck(r *api.AgentCheckRegistration) *api.AgentServiceCheck { 1155 return &api.AgentServiceCheck{ 1156 CheckID: r.ID, 1157 Name: r.Name, 1158 Interval: r.Interval, 1159 Timeout: r.Timeout, 1160 TTL: r.TTL, 1161 HTTP: r.HTTP, 1162 Header: maps.Clone(r.Header), 1163 Method: r.Method, 1164 Body: r.Body, 1165 TCP: r.TCP, 1166 Status: r.Status, 1167 TLSServerName: r.TLSServerName, 1168 TLSSkipVerify: r.TLSSkipVerify, 1169 GRPC: r.GRPC, 1170 GRPCUseTLS: r.GRPCUseTLS, 1171 SuccessBeforePassing: r.SuccessBeforePassing, 1172 FailuresBeforeCritical: r.FailuresBeforeCritical, 1173 } 1174 } 1175 1176 // checkRegs creates check registrations for the given service 1177 func (c *ServiceClient) checkRegs( 1178 serviceID string, 1179 service *structs.Service, 1180 workload *serviceregistration.WorkloadServices, 1181 sreg *serviceregistration.ServiceRegistration, 1182 ) ([]*api.AgentCheckRegistration, error) { 1183 1184 registrations := make([]*api.AgentCheckRegistration, 0, len(service.Checks)) 1185 for _, check := range service.Checks { 1186 var ip string 1187 var port int 1188 1189 if check.Type != structs.ServiceCheckScript { 1190 portLabel := check.PortLabel 1191 if portLabel == "" { 1192 portLabel = service.PortLabel 1193 } 1194 1195 addrMode := check.AddressMode 1196 if addrMode == "" { 1197 if service.Address != "" { 1198 // if the service is using a custom address, enable the check 1199 // to use that address 1200 addrMode = structs.AddressModeAuto 1201 } else { 1202 // otherwise default to the host address 1203 addrMode = structs.AddressModeHost 1204 } 1205 } 1206 1207 var err error 1208 ip, port, err = serviceregistration.GetAddress( 1209 service.Address, addrMode, portLabel, workload.Networks, workload.DriverNetwork, workload.Ports, workload.NetworkStatus) 1210 if err != nil { 1211 return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err) 1212 } 1213 } 1214 1215 checkID := MakeCheckID(serviceID, check) 1216 registration, err := createCheckReg(serviceID, checkID, check, ip, port, workload.ProviderNamespace) 1217 if err != nil { 1218 return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err) 1219 } 1220 sreg.CheckOnUpdate[checkID] = check.OnUpdate 1221 registrations = append(registrations, registration) 1222 } 1223 1224 return registrations, nil 1225 } 1226 1227 // RegisterWorkload with Consul. Adds all service entries and checks to Consul. 1228 // 1229 // If the service IP is set it used as the address in the service registration. 1230 // Checks will always use the IP from the Task struct (host's IP). 1231 // 1232 // Actual communication with Consul is done asynchronously (see Run). 1233 func (c *ServiceClient) RegisterWorkload(workload *serviceregistration.WorkloadServices) error { 1234 // Fast path 1235 numServices := len(workload.Services) 1236 if numServices == 0 { 1237 return nil 1238 } 1239 1240 t := new(serviceregistration.ServiceRegistrations) 1241 t.Services = make(map[string]*serviceregistration.ServiceRegistration, numServices) 1242 1243 ops := &operations{} 1244 for _, service := range workload.Services { 1245 sreg, err := c.serviceRegs(ops, service, workload) 1246 if err != nil { 1247 return err 1248 } 1249 t.Services[sreg.ServiceID] = sreg 1250 } 1251 1252 // Add the workload to the allocation's registration 1253 c.addRegistrations(workload.AllocInfo.AllocID, workload.Name(), t) 1254 1255 c.commit(ops) 1256 1257 // Start watching checks. Done after service registrations are built 1258 // since an error building them could leak watches. 1259 for _, service := range workload.Services { 1260 serviceID := serviceregistration.MakeAllocServiceID(workload.AllocInfo.AllocID, workload.Name(), service) 1261 for _, check := range service.Checks { 1262 if check.TriggersRestarts() { 1263 checkID := MakeCheckID(serviceID, check) 1264 c.checkWatcher.Watch(workload.AllocInfo.AllocID, workload.Name(), checkID, check, workload.Restarter) 1265 } 1266 } 1267 } 1268 return nil 1269 } 1270 1271 // UpdateWorkload in Consul. Does not alter the service if only checks have 1272 // changed. 1273 // 1274 // DriverNetwork must not change between invocations for the same allocation. 1275 func (c *ServiceClient) UpdateWorkload(old, newWorkload *serviceregistration.WorkloadServices) error { 1276 ops := new(operations) 1277 regs := new(serviceregistration.ServiceRegistrations) 1278 regs.Services = make(map[string]*serviceregistration.ServiceRegistration, len(newWorkload.Services)) 1279 1280 newIDs := make(map[string]*structs.Service, len(newWorkload.Services)) 1281 for _, s := range newWorkload.Services { 1282 newIDs[serviceregistration.MakeAllocServiceID(newWorkload.AllocInfo.AllocID, newWorkload.Name(), s)] = s 1283 } 1284 1285 // Loop over existing Services to see if they have been removed 1286 for _, existingSvc := range old.Services { 1287 existingID := serviceregistration.MakeAllocServiceID(old.AllocInfo.AllocID, old.Name(), existingSvc) 1288 newSvc, ok := newIDs[existingID] 1289 1290 if !ok { 1291 // Existing service entry removed 1292 ops.deregServices = append(ops.deregServices, existingID) 1293 for _, check := range existingSvc.Checks { 1294 cid := MakeCheckID(existingID, check) 1295 ops.deregChecks = append(ops.deregChecks, cid) 1296 1297 // Unwatch watched checks 1298 if check.TriggersRestarts() { 1299 c.checkWatcher.Unwatch(cid) 1300 } 1301 } 1302 continue 1303 } 1304 1305 oldHash := existingSvc.Hash(old.AllocInfo.AllocID, old.Name(), old.Canary) 1306 newHash := newSvc.Hash(newWorkload.AllocInfo.AllocID, newWorkload.Name(), newWorkload.Canary) 1307 if oldHash == newHash { 1308 // Service exists and hasn't changed, don't re-add it later 1309 delete(newIDs, existingID) 1310 } 1311 1312 // Service still exists so add it to the task's registration 1313 sreg := &serviceregistration.ServiceRegistration{ 1314 ServiceID: existingID, 1315 CheckIDs: make(map[string]struct{}, len(newSvc.Checks)), 1316 CheckOnUpdate: make(map[string]string, len(newSvc.Checks)), 1317 } 1318 regs.Services[existingID] = sreg 1319 1320 // See if any checks were updated 1321 existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks)) 1322 for _, check := range existingSvc.Checks { 1323 existingChecks[MakeCheckID(existingID, check)] = check 1324 } 1325 1326 // Register new checks 1327 for _, check := range newSvc.Checks { 1328 checkID := MakeCheckID(existingID, check) 1329 if _, exists := existingChecks[checkID]; exists { 1330 // Check is still required. Remove it from the map so it doesn't get 1331 // deleted later. 1332 delete(existingChecks, checkID) 1333 sreg.CheckIDs[checkID] = struct{}{} 1334 sreg.CheckOnUpdate[checkID] = check.OnUpdate 1335 } 1336 1337 // New check on an unchanged service; add them now 1338 checkRegs, err := c.checkRegs(existingID, newSvc, newWorkload, sreg) 1339 if err != nil { 1340 return err 1341 } 1342 1343 for _, registration := range checkRegs { 1344 sreg.CheckIDs[registration.ID] = struct{}{} 1345 sreg.CheckOnUpdate[registration.ID] = check.OnUpdate 1346 ops.regChecks = append(ops.regChecks, registration) 1347 } 1348 1349 // Update all watched checks as CheckRestart fields aren't part of ID 1350 if check.TriggersRestarts() { 1351 c.checkWatcher.Watch(newWorkload.AllocInfo.AllocID, newWorkload.Name(), checkID, check, newWorkload.Restarter) 1352 } 1353 } 1354 1355 // Remove existing checks not in updated service 1356 for cid, check := range existingChecks { 1357 ops.deregChecks = append(ops.deregChecks, cid) 1358 1359 // Unwatch checks 1360 if check.TriggersRestarts() { 1361 c.checkWatcher.Unwatch(cid) 1362 } 1363 } 1364 } 1365 1366 // Any remaining services should just be enqueued directly 1367 for _, newSvc := range newIDs { 1368 sreg, err := c.serviceRegs(ops, newSvc, newWorkload) 1369 if err != nil { 1370 return err 1371 } 1372 1373 regs.Services[sreg.ServiceID] = sreg 1374 } 1375 1376 // Add the task to the allocation's registration 1377 c.addRegistrations(newWorkload.AllocInfo.AllocID, newWorkload.Name(), regs) 1378 1379 c.commit(ops) 1380 1381 // Start watching checks. Done after service registrations are built 1382 // since an error building them could leak watches. 1383 for serviceID, service := range newIDs { 1384 for _, check := range service.Checks { 1385 if check.TriggersRestarts() { 1386 checkID := MakeCheckID(serviceID, check) 1387 c.checkWatcher.Watch(newWorkload.AllocInfo.AllocID, newWorkload.Name(), checkID, check, newWorkload.Restarter) 1388 } 1389 } 1390 } 1391 1392 return nil 1393 } 1394 1395 // RemoveWorkload from Consul. Removes all service entries and checks. 1396 // 1397 // Actual communication with Consul is done asynchronously (see Run). 1398 func (c *ServiceClient) RemoveWorkload(workload *serviceregistration.WorkloadServices) { 1399 ops := operations{} 1400 1401 for _, service := range workload.Services { 1402 id := serviceregistration.MakeAllocServiceID(workload.AllocInfo.AllocID, workload.Name(), service) 1403 ops.deregServices = append(ops.deregServices, id) 1404 1405 for _, check := range service.Checks { 1406 cid := MakeCheckID(id, check) 1407 ops.deregChecks = append(ops.deregChecks, cid) 1408 1409 if check.TriggersRestarts() { 1410 c.checkWatcher.Unwatch(cid) 1411 } 1412 } 1413 } 1414 1415 // Remove the workload from the alloc's registrations 1416 c.removeRegistration(workload.AllocInfo.AllocID, workload.Name()) 1417 1418 // Now add them to the deregistration fields; main Run loop will update 1419 c.commit(&ops) 1420 } 1421 1422 // normalizeNamespace will turn the "default" namespace into the empty string, 1423 // so that Consul OSS will not produce an error setting something in the default 1424 // namespace. 1425 func normalizeNamespace(namespace string) string { 1426 if namespace == "default" { 1427 return "" 1428 } 1429 return namespace 1430 } 1431 1432 // AllocRegistrations returns the registrations for the given allocation. If the 1433 // allocation has no registrations, the response is a nil object. 1434 func (c *ServiceClient) AllocRegistrations(allocID string) (*serviceregistration.AllocRegistration, error) { 1435 // Get the internal struct using the lock 1436 c.allocRegistrationsLock.RLock() 1437 regInternal, ok := c.allocRegistrations[allocID] 1438 if !ok { 1439 c.allocRegistrationsLock.RUnlock() 1440 return nil, nil 1441 } 1442 1443 // Copy so we don't expose internal structs 1444 reg := regInternal.Copy() 1445 c.allocRegistrationsLock.RUnlock() 1446 1447 // Get the list of all namespaces created so we can iterate them. 1448 namespaces, err := c.namespacesClient.List() 1449 if err != nil { 1450 return nil, fmt.Errorf("failed to retrieve namespaces from consul: %w", err) 1451 } 1452 1453 services := make(map[string]*api.AgentService) 1454 checks := make(map[string]*api.AgentCheck) 1455 1456 // Query the services and checks to populate the allocation registrations. 1457 for _, namespace := range namespaces { 1458 nsServices, err := c.agentAPI.ServicesWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}) 1459 if err != nil { 1460 return nil, fmt.Errorf("failed to retrieve services from consul: %w", err) 1461 } 1462 for k, v := range nsServices { 1463 services[k] = v 1464 } 1465 1466 nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}) 1467 if err != nil { 1468 return nil, fmt.Errorf("failed to retrieve checks from consul: %w", err) 1469 } 1470 for k, v := range nsChecks { 1471 checks[k] = v 1472 } 1473 } 1474 1475 // Populate the object 1476 for _, treg := range reg.Tasks { 1477 for serviceID, sreg := range treg.Services { 1478 sreg.Service = services[serviceID] 1479 for checkID := range sreg.CheckIDs { 1480 if check, ok := checks[checkID]; ok { 1481 sreg.Checks = append(sreg.Checks, check) 1482 } 1483 } 1484 } 1485 } 1486 1487 return reg, nil 1488 } 1489 1490 // UpdateTTL is used to update the TTL of a check. Typically this will only be 1491 // called to heartbeat script checks. 1492 func (c *ServiceClient) UpdateTTL(id, namespace, output, status string) error { 1493 ns := normalizeNamespace(namespace) 1494 return c.agentAPI.UpdateTTLOpts(id, output, status, &api.QueryOptions{Namespace: ns}) 1495 } 1496 1497 // Shutdown the Consul client. Update running task registrations and deregister 1498 // agent from Consul. On first call blocks up to shutdownWait before giving up 1499 // on syncing operations. 1500 func (c *ServiceClient) Shutdown() error { 1501 // Serialize Shutdown calls with RegisterAgent to prevent leaking agent 1502 // entries. 1503 c.agentLock.Lock() 1504 defer c.agentLock.Unlock() 1505 select { 1506 case <-c.shutdownCh: 1507 return nil 1508 default: 1509 close(c.shutdownCh) 1510 } 1511 1512 // Give run loop time to sync, but don't block indefinitely 1513 deadline := time.After(c.shutdownWait) 1514 1515 // Wait for Run to finish any outstanding operations and exit 1516 select { 1517 case <-c.exitCh: 1518 case <-deadline: 1519 // Don't wait forever though 1520 } 1521 1522 // If Consul was never seen nothing could be written so exit early 1523 if !c.hasSeen() { 1524 return nil 1525 } 1526 1527 // Always attempt to deregister Nomad agent Consul entries, even if 1528 // deadline was reached 1529 for _, id := range c.agentServices.List() { 1530 if err := c.agentAPI.ServiceDeregisterOpts(id, nil); err != nil { 1531 c.logger.Error("failed deregistering agent service", "service_id", id, "error", err) 1532 } 1533 } 1534 1535 namespaces, err := c.namespacesClient.List() 1536 if err != nil { 1537 c.logger.Error("failed to retrieve namespaces from consul", "error", err) 1538 } 1539 1540 remainingChecks := make(map[string]*api.AgentCheck) 1541 for _, namespace := range namespaces { 1542 nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}) 1543 if err != nil { 1544 c.logger.Error("failed to retrieve checks from consul", "error", err) 1545 } 1546 for k, v := range nsChecks { 1547 remainingChecks[k] = v 1548 } 1549 } 1550 1551 checkRemains := func(id string) bool { 1552 for _, c := range remainingChecks { 1553 if c.CheckID == id { 1554 return true 1555 } 1556 } 1557 return false 1558 } 1559 1560 for _, id := range c.agentChecks.List() { 1561 // if we couldn't populate remainingChecks it is unlikely that CheckDeregister will work, but try anyway 1562 // if we could list the remaining checks, verify that the check we store still exists before removing it. 1563 if remainingChecks == nil || checkRemains(id) { 1564 ns := remainingChecks[id].Namespace 1565 if err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: ns}); err != nil { 1566 c.logger.Error("failed deregistering agent check", "check_id", id, "error", err) 1567 } 1568 } 1569 } 1570 1571 return nil 1572 } 1573 1574 // addRegistration adds the service registrations for the given allocation. 1575 func (c *ServiceClient) addRegistrations(allocID, taskName string, reg *serviceregistration.ServiceRegistrations) { 1576 c.allocRegistrationsLock.Lock() 1577 defer c.allocRegistrationsLock.Unlock() 1578 1579 alloc, ok := c.allocRegistrations[allocID] 1580 if !ok { 1581 alloc = &serviceregistration.AllocRegistration{ 1582 Tasks: make(map[string]*serviceregistration.ServiceRegistrations), 1583 } 1584 c.allocRegistrations[allocID] = alloc 1585 } 1586 alloc.Tasks[taskName] = reg 1587 } 1588 1589 // removeRegistrations removes the registration for the given allocation. 1590 func (c *ServiceClient) removeRegistration(allocID, taskName string) { 1591 c.allocRegistrationsLock.Lock() 1592 defer c.allocRegistrationsLock.Unlock() 1593 1594 alloc, ok := c.allocRegistrations[allocID] 1595 if !ok { 1596 return 1597 } 1598 1599 // Delete the task and if it is the last one also delete the alloc's 1600 // registration 1601 delete(alloc.Tasks, taskName) 1602 if len(alloc.Tasks) == 0 { 1603 delete(c.allocRegistrations, allocID) 1604 } 1605 } 1606 1607 // makeAgentServiceID creates a unique ID for identifying an agent service in 1608 // Consul. 1609 // 1610 // Agent service IDs are of the form: 1611 // 1612 // {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}) 1613 // Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4 1614 // Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l 1615 func makeAgentServiceID(role string, service *structs.Service) string { 1616 return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "", false)) 1617 } 1618 1619 // MakeCheckID creates a unique ID for a check. 1620 // 1621 // Example Check ID: _nomad-check-434ae42f9a57c5705344974ac38de2aee0ee089d 1622 func MakeCheckID(serviceID string, check *structs.ServiceCheck) string { 1623 return fmt.Sprintf("%s%s", nomadCheckPrefix, check.Hash(serviceID)) 1624 } 1625 1626 // createCheckReg creates a Check that can be registered with Consul. 1627 // 1628 // Script checks simply have a TTL set and the caller is responsible for 1629 // running the script and heart-beating. 1630 func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int, namespace string) (*api.AgentCheckRegistration, error) { 1631 chkReg := api.AgentCheckRegistration{ 1632 ID: checkID, 1633 Name: check.Name, 1634 ServiceID: serviceID, 1635 Namespace: normalizeNamespace(namespace), 1636 } 1637 chkReg.Status = check.InitialStatus 1638 chkReg.Timeout = check.Timeout.String() 1639 chkReg.Interval = check.Interval.String() 1640 chkReg.SuccessBeforePassing = check.SuccessBeforePassing 1641 chkReg.FailuresBeforeCritical = check.FailuresBeforeCritical 1642 1643 // Require an address for http or tcp checks 1644 if port == 0 && check.RequiresPort() { 1645 return nil, fmt.Errorf("%s checks require an address", check.Type) 1646 } 1647 1648 switch check.Type { 1649 case structs.ServiceCheckHTTP: 1650 proto := check.Protocol 1651 if proto == "" { 1652 proto = "http" 1653 } 1654 if check.TLSSkipVerify { 1655 chkReg.TLSSkipVerify = true 1656 } 1657 chkReg.TLSServerName = check.TLSServerName 1658 base := url.URL{ 1659 Scheme: proto, 1660 Host: net.JoinHostPort(host, strconv.Itoa(port)), 1661 } 1662 relative, err := url.Parse(check.Path) 1663 if err != nil { 1664 return nil, err 1665 } 1666 checkURL := base.ResolveReference(relative) 1667 chkReg.HTTP = checkURL.String() 1668 chkReg.Method = check.Method 1669 chkReg.Header = check.Header 1670 chkReg.Body = check.Body 1671 1672 case structs.ServiceCheckTCP: 1673 chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port)) 1674 1675 case structs.ServiceCheckScript: 1676 chkReg.TTL = (check.Interval + ttlCheckBuffer).String() 1677 // As of Consul 1.0.0 setting TTL and Interval is a 400 1678 chkReg.Interval = "" 1679 1680 case structs.ServiceCheckGRPC: 1681 chkReg.GRPC = fmt.Sprintf("%s/%s", net.JoinHostPort(host, strconv.Itoa(port)), check.GRPCService) 1682 chkReg.GRPCUseTLS = check.GRPCUseTLS 1683 if check.TLSSkipVerify { 1684 chkReg.TLSSkipVerify = true 1685 } 1686 chkReg.TLSServerName = check.TLSServerName 1687 1688 default: 1689 return nil, fmt.Errorf("check type %+q not valid", check.Type) 1690 } 1691 return &chkReg, nil 1692 } 1693 1694 // isNomadClient returns true if id represents a Nomad Client registration. 1695 func isNomadClient(id string) bool { 1696 return strings.HasPrefix(id, nomadClientPrefix) 1697 } 1698 1699 // isNomadServer returns true if id represents a Nomad Server registration. 1700 func isNomadServer(id string) bool { 1701 return strings.HasPrefix(id, nomadServerPrefix) 1702 } 1703 1704 // isNomadAgent returns true if id represents a Nomad Client or Server registration. 1705 func isNomadAgent(id string) bool { 1706 return isNomadClient(id) || isNomadServer(id) 1707 } 1708 1709 // isNomadService returns true if the ID matches the pattern of a Nomad managed 1710 // service (new or old formats). Agent services return false as independent 1711 // client and server agents may be running on the same machine. #2827 1712 func isNomadService(id string) bool { 1713 return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id) 1714 } 1715 1716 // isNomadCheck returns true if the ID matches the pattern of a Nomad managed 1717 // check. 1718 func isNomadCheck(id string) bool { 1719 return strings.HasPrefix(id, nomadCheckPrefix) 1720 } 1721 1722 // isOldNomadService returns true if the ID matches an old pattern managed by 1723 // Nomad. 1724 // 1725 // Pre-0.7.1 task service IDs are of the form: 1726 // 1727 // {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} 1728 // Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 1729 func isOldNomadService(id string) bool { 1730 const prefix = nomadServicePrefix + "-executor" 1731 return strings.HasPrefix(id, prefix) 1732 } 1733 1734 const ( 1735 sidecarSuffix = "-sidecar-proxy" 1736 ) 1737 1738 // maybeConnectSidecar returns true if the ID is likely of a Connect sidecar proxy. 1739 // This function should only be used to determine if Nomad should skip managing 1740 // service id; it could produce false negatives for non-Nomad managed services 1741 // (i.e. someone set the ID manually), but Nomad does not manage those anyway. 1742 // 1743 // It is important not to reference the parent service, which may or may not still 1744 // be tracked by Nomad internally. 1745 // 1746 // For example if you have a Connect enabled service with the ID: 1747 // 1748 // _nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db 1749 // 1750 // Consul will create a service for the sidecar proxy with the ID: 1751 // 1752 // _nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db-sidecar-proxy 1753 func maybeConnectSidecar(id string) bool { 1754 return strings.HasSuffix(id, sidecarSuffix) 1755 } 1756 1757 var ( 1758 sidecarProxyCheckRe = regexp.MustCompile(`^service:_nomad-.+-sidecar-proxy(:[\d]+)?$`) 1759 ) 1760 1761 // maybeSidecarProxyCheck returns true if the ID likely matches a Nomad generated 1762 // check ID used in the context of a Nomad managed Connect sidecar proxy. This function 1763 // should only be used to determine if Nomad should skip managing a check; it can 1764 // produce false negatives for non-Nomad managed Connect sidecar proxy checks (i.e. 1765 // someone set the ID manually), but Nomad does not manage those anyway. 1766 // 1767 // For example if you have a Connect enabled service with the ID: 1768 // 1769 // _nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db 1770 // 1771 // Nomad will create a Connect sidecar proxy of ID: 1772 // 1773 // _nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db-sidecar-proxy 1774 // 1775 // With default checks like: 1776 // 1777 // service:_nomad-task-2f5fb517-57d4-44ee-7780-dc1cb6e103cd-group-api-count-api-9001-sidecar-proxy:1 1778 // service:_nomad-task-2f5fb517-57d4-44ee-7780-dc1cb6e103cd-group-api-count-api-9001-sidecar-proxy:2 1779 // 1780 // Unless sidecar_service.disable_default_tcp_check is set, in which case the 1781 // default check is: 1782 // 1783 // service:_nomad-task-322616db-2680-35d8-0d10-b50a0a0aa4cd-group-api-count-api-9001-sidecar-proxy 1784 func maybeSidecarProxyCheck(id string) bool { 1785 return sidecarProxyCheckRe.MatchString(id) 1786 } 1787 1788 // getNomadSidecar returns the service registration of the sidecar for the managed 1789 // service with the specified id. 1790 // 1791 // If the managed service of the specified id does not exist, or the service does 1792 // not have a sidecar proxy, nil is returned. 1793 func getNomadSidecar(id string, services map[string]*api.AgentService) *api.AgentService { 1794 if _, exists := services[id]; !exists { 1795 return nil 1796 } 1797 1798 sidecarID := id + sidecarSuffix 1799 return services[sidecarID] 1800 } 1801 1802 func parseAddress(raw string, port int) (api.ServiceAddress, error) { 1803 result := api.ServiceAddress{} 1804 addr, portStr, err := net.SplitHostPort(raw) 1805 // Error message from Go's net/ipsock.go 1806 if err != nil { 1807 if !strings.Contains(err.Error(), "missing port in address") { 1808 return result, fmt.Errorf("error parsing address %q: %v", raw, err) 1809 } 1810 1811 // Use the whole input as the address if there wasn't a port. 1812 if ip := net.ParseIP(raw); ip == nil { 1813 return result, fmt.Errorf("error parsing address %q: not an IP address", raw) 1814 } 1815 addr = raw 1816 } 1817 1818 if portStr != "" { 1819 port, err = strconv.Atoi(portStr) 1820 if err != nil { 1821 return result, fmt.Errorf("error parsing port %q: %v", portStr, err) 1822 } 1823 } 1824 1825 result.Address = addr 1826 result.Port = port 1827 return result, nil 1828 } 1829 1830 // morph the tagged_addresses map into the structure consul api wants 1831 func parseTaggedAddresses(m map[string]string, port int) (map[string]api.ServiceAddress, error) { 1832 result := make(map[string]api.ServiceAddress, len(m)) 1833 for k, v := range m { 1834 sa, err := parseAddress(v, port) 1835 if err != nil { 1836 return nil, err 1837 } 1838 result[k] = sa 1839 } 1840 return result, nil 1841 }