// github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/command/agent/consul/service_client.go

package consul

import (
	"context"
	"fmt"
	"net"
	"net/url"
	"reflect"
	"regexp"
	"strconv"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"github.com/armon/go-metrics"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-set"
	"github.com/hashicorp/nomad/client/serviceregistration"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/envoy"
	"github.com/hashicorp/nomad/nomad/structs"
	"golang.org/x/exp/maps"
	"golang.org/x/exp/slices"
)

const (
	// nomadServicePrefix is the prefix that scopes all Nomad registered
	// services (both agent and task entries).
	nomadServicePrefix = "_nomad"

	// nomadServerPrefix is the prefix that scopes Nomad registered Servers.
	nomadServerPrefix = nomadServicePrefix + "-server-"

	// nomadClientPrefix is the prefix that scopes Nomad registered Clients.
	nomadClientPrefix = nomadServicePrefix + "-client-"

	// nomadTaskPrefix is the prefix that scopes Nomad registered services
	// for tasks.
	nomadTaskPrefix = nomadServicePrefix + "-task-"

	// nomadCheckPrefix is the prefix that scopes Nomad registered checks for
	// services.
	nomadCheckPrefix = nomadServicePrefix + "-check-"

	// defaultRetryInterval is how quickly to retry syncing services and
	// checks to Consul when an error occurs. Will backoff up to a max.
	defaultRetryInterval = time.Second

	// defaultMaxRetryInterval is the default max retry interval.
	defaultMaxRetryInterval = 30 * time.Second

	// defaultPeriodicInterval is the interval at which the service
	// client reconciles state between the desired services and checks and
	// what's actually registered in Consul. This is done at an interval,
	// rather than being purely edge triggered, to handle the case that the
	// Consul agent's state may change underneath us.
	defaultPeriodicInterval = 30 * time.Second

	// ttlCheckBuffer is the time interval that Nomad can take to report
	// the check result to Consul.
	ttlCheckBuffer = 31 * time.Second

	// defaultShutdownWait is how long Shutdown() should block waiting for
	// enqueued operations to sync to Consul by default.
	defaultShutdownWait = time.Minute

	// DefaultQueryWaitDuration is the max duration the Consul Agent will
	// spend waiting for a response from a Consul Query.
	DefaultQueryWaitDuration = 2 * time.Second

	// ServiceTagHTTP is the tag assigned to HTTP services
	ServiceTagHTTP = "http"

	// ServiceTagRPC is the tag assigned to RPC services
	ServiceTagRPC = "rpc"

	// ServiceTagSerf is the tag assigned to Serf services
	ServiceTagSerf = "serf"

	// deregisterProbationPeriod is the initialization period where
	// services registered in Consul but not in Nomad don't get deregistered,
	// to allow for nomad restoring tasks
	deregisterProbationPeriod = time.Minute
)

// Additional Consul ACLs required
// - Consul Template: key:read
//   Used in tasks with template stanza that use Consul keys.

// CatalogAPI is the consul/api.Catalog API used by Nomad.
//
// ACL requirements
// - node:read (listing datacenters)
// - service:read
type CatalogAPI interface {
	Datacenters() ([]string, error)
	Service(service, tag string, q *api.QueryOptions) ([]*api.CatalogService, *api.QueryMeta, error)
}

// NamespaceAPI is the consul/api.Namespace API used by Nomad.
//
// ACL requirements
// - operator:read OR namespace:*:read
type NamespaceAPI interface {
	List(q *api.QueryOptions) ([]*api.Namespace, *api.QueryMeta, error)
}

// AgentAPI is the consul/api.Agent API used by Nomad.
//
// ACL requirements
// - agent:read
// - service:write
type AgentAPI interface {
	// Methods operating on checks.
	CheckRegister(check *api.AgentCheckRegistration) error
	CheckDeregisterOpts(checkID string, q *api.QueryOptions) error
	ChecksWithFilterOpts(filter string, q *api.QueryOptions) (map[string]*api.AgentCheck, error)
	UpdateTTLOpts(id, output, status string, q *api.QueryOptions) error

	// Methods operating on services.
	ServiceRegister(service *api.AgentServiceRegistration) error
	ServiceDeregisterOpts(serviceID string, q *api.QueryOptions) error
	ServicesWithFilterOpts(filter string, q *api.QueryOptions) (map[string]*api.AgentService, error)

	Self() (map[string]map[string]interface{}, error)
}

// ConfigAPI is the consul/api.ConfigEntries API subset used by Nomad Server.
//
// ACL requirements
// - operator:write (server only)
type ConfigAPI interface {
	Set(entry api.ConfigEntry, w *api.WriteOptions) (bool, *api.WriteMeta, error)
	// Delete(kind, name string, w *api.WriteOptions) (*api.WriteMeta, error) (not used)
}

// ACLsAPI is the consul/api.ACL API subset used by Nomad Server.
//
// ACL requirements
// - acl:write (server only)
type ACLsAPI interface {
	TokenReadSelf(q *api.QueryOptions) (*api.ACLToken, *api.QueryMeta, error) // for lookup via operator token
	PolicyRead(policyID string, q *api.QueryOptions) (*api.ACLPolicy, *api.QueryMeta, error)
	RoleRead(roleID string, q *api.QueryOptions) (*api.ACLRole, *api.QueryMeta, error)
	TokenCreate(partial *api.ACLToken, q *api.WriteOptions) (*api.ACLToken, *api.WriteMeta, error)
	TokenDelete(accessorID string, q *api.WriteOptions) (*api.WriteMeta, error)
	TokenList(q *api.QueryOptions) ([]*api.ACLTokenListEntry, *api.QueryMeta, error)
}

// agentServiceUpdateRequired checks if any critical fields in Nomad's version
// of a service definition are different from the existing service definition as
// known by Consul.
153 // 154 // reason - The syncReason that triggered this synchronization with the consul 155 // agent API. 156 // wanted - Nomad's view of what the service definition is intended to be. 157 // Not nil. 158 // existing - Consul's view (agent, not catalog) of the actual service definition. 159 // Not nil. 160 // sidecar - Consul's view (agent, not catalog) of the service definition of the sidecar 161 // associated with existing that may or may not exist. 162 // May be nil. 163 func (s *ServiceClient) agentServiceUpdateRequired(reason syncReason, wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) bool { 164 switch reason { 165 case syncPeriodic: 166 // In a periodic sync with Consul, we need to respect the value of 167 // the enable_tag_override field so that we maintain the illusion that the 168 // user is in control of the Consul tags, as they may be externally edited 169 // via the Consul catalog API (e.g. a user manually sets them). 170 // 171 // As Consul does by disabling anti-entropy for the tags field, Nomad will 172 // ignore differences in the tags field during the periodic syncs with 173 // the Consul agent API. 174 // 175 // We do so by over-writing the nomad service registration by the value 176 // of the tags that Consul contains, if enable_tag_override = true. 177 maybeTweakTags(wanted, existing, sidecar) 178 179 // Also, purge tagged address fields of nomad agent services. 180 maybeTweakTaggedAddresses(wanted, existing) 181 182 // Okay now it is safe to compare. 183 return s.different(wanted, existing, sidecar) 184 185 default: 186 // A non-periodic sync with Consul indicates an operation has been set 187 // on the queue. This happens when service has been added / removed / modified 188 // and implies the Consul agent should be sync'd with nomad, because 189 // nomad is the ultimate source of truth for the service definition. 190 191 // But do purge tagged address fields of nomad agent services. 
192 maybeTweakTaggedAddresses(wanted, existing) 193 194 // Okay now it is safe to compare. 195 return s.different(wanted, existing, sidecar) 196 } 197 } 198 199 // maybeTweakTags will override wanted.Tags with a copy of existing.Tags only if 200 // EnableTagOverride is true. Otherwise the wanted service registration is left 201 // unchanged. 202 func maybeTweakTags(wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) { 203 if wanted.EnableTagOverride { 204 wanted.Tags = slices.Clone(existing.Tags) 205 // If the service registration also defines a sidecar service, use the ETO 206 // setting for the parent service to also apply to the sidecar. 207 if wanted.Connect != nil && wanted.Connect.SidecarService != nil { 208 if sidecar != nil { 209 wanted.Connect.SidecarService.Tags = slices.Clone(sidecar.Tags) 210 } 211 } 212 } 213 } 214 215 // maybeTweakTaggedAddresses will remove the Consul-injected .TaggedAddresses fields 216 // from existing if wanted represents a Nomad agent (Client or Server) or Nomad managed 217 // service, which do not themselves configure those tagged addresses. We do this 218 // because Consul will magically set the .TaggedAddress to values Nomad does not 219 // know about if they are submitted as unset. 
func maybeTweakTaggedAddresses(wanted *api.AgentServiceRegistration, existing *api.AgentService) {
	if isNomadAgent(wanted.ID) || isNomadService(wanted.ID) {
		// For each address key that Nomad did not set itself, drop the entry
		// from Consul's view so it does not register as a difference.
		if _, exists := wanted.TaggedAddresses["lan_ipv4"]; !exists {
			delete(existing.TaggedAddresses, "lan_ipv4")
		}
		if _, exists := wanted.TaggedAddresses["wan_ipv4"]; !exists {
			delete(existing.TaggedAddresses, "wan_ipv4")
		}
		if _, exists := wanted.TaggedAddresses["lan_ipv6"]; !exists {
			delete(existing.TaggedAddresses, "lan_ipv6")
		}
		if _, exists := wanted.TaggedAddresses["wan_ipv6"]; !exists {
			delete(existing.TaggedAddresses, "wan_ipv6")
		}
	}
}

// different compares the wanted state of the service registration with the actual
// (cached) state of the service registration reported by Consul. If any of the
// critical fields are not deeply equal, they are considered different.
//
// The first differing field is logged at trace level and true is returned.
func (s *ServiceClient) different(wanted *api.AgentServiceRegistration, existing *api.AgentService, sidecar *api.AgentService) bool {
	trace := func(field string, left, right any) {
		s.logger.Trace("registrations different", "id", wanted.ID,
			"field", field, "wanted", fmt.Sprintf("%#v", left), "existing", fmt.Sprintf("%#v", right),
		)
	}

	switch {
	case wanted.Kind != existing.Kind:
		trace("kind", wanted.Kind, existing.Kind)
		return true
	case wanted.ID != existing.ID:
		trace("id", wanted.ID, existing.ID)
		return true
	case wanted.Port != existing.Port:
		trace("port", wanted.Port, existing.Port)
		return true
	case wanted.Address != existing.Address:
		trace("address", wanted.Address, existing.Address)
		return true
	case wanted.Name != existing.Service:
		trace("service name", wanted.Name, existing.Service)
		return true
	case wanted.EnableTagOverride != existing.EnableTagOverride:
		trace("enable_tag_override", wanted.EnableTagOverride, existing.EnableTagOverride)
		return true
	case !maps.Equal(wanted.Meta, existing.Meta):
		trace("meta", wanted.Meta, existing.Meta)
		return true
	case !maps.Equal(wanted.TaggedAddresses, existing.TaggedAddresses):
		trace("tagged_addresses", wanted.TaggedAddresses, existing.TaggedAddresses)
		return true
	case !helper.SliceSetEq(wanted.Tags, existing.Tags):
		trace("tags", wanted.Tags, existing.Tags)
		return true
	case connectSidecarDifferent(wanted, sidecar):
		// Only the parent service names are logged here, not the sidecar
		// fields that actually differ.
		trace("connect_sidecar", wanted.Name, existing.Service)
		return true
	}
	return false
}

// sidecarTagsDifferent includes the special logic for comparing sidecar tags
// from Nomad vs. Consul perspective. Because Consul forces the sidecar tags
// to inherit the parent service tags if the sidecar tags are unset, we need to
// take that into consideration when Nomad's sidecar tags are unset by instead
// comparing them to the parent service tags.
func sidecarTagsDifferent(parent, wanted, sidecar []string) bool {
	if len(wanted) == 0 {
		return !helper.SliceSetEq(parent, sidecar)
	}
	return !helper.SliceSetEq(wanted, sidecar)
}

// proxyUpstreamsDifferent determines if the sidecar_service.proxy.upstreams
// configurations are different between the desired sidecar service state, and
// the actual sidecar service state currently registered in Consul.
//
// wanted.SidecarService is dereferenced without a nil check; the caller in
// this file (connectSidecarDifferent) checks it before calling.
func proxyUpstreamsDifferent(wanted *api.AgentServiceConnect, sidecar *api.AgentServiceConnectProxyConfig) bool {
	// There is similar code that already does this in Nomad's API package,
	// however here we are operating on Consul API package structs, and they do not
	// provide such helper functions.

	// getProxyUpstreams normalizes a missing proxy config or an empty
	// upstream list to nil.
	getProxyUpstreams := func(pc *api.AgentServiceConnectProxyConfig) []api.Upstream {
		switch {
		case pc == nil:
			return nil
		case len(pc.Upstreams) == 0:
			return nil
		default:
			return pc.Upstreams
		}
	}

	// getConnectUpstreams does the same normalization for the desired state.
	getConnectUpstreams := func(sc *api.AgentServiceConnect) []api.Upstream {
		switch {
		case sc.SidecarService.Proxy == nil:
			return nil
		case len(sc.SidecarService.Proxy.Upstreams) == 0:
			return nil
		default:
			return sc.SidecarService.Proxy.Upstreams
		}
	}

	// upstreamsDifferent compares the two lists element by element, so the
	// same upstreams in a different order count as different.
	upstreamsDifferent := func(a, b []api.Upstream) bool {
		if len(a) != len(b) {
			return true
		}

		for i := 0; i < len(a); i++ {
			A := a[i]
			B := b[i]
			switch {
			case A.Datacenter != B.Datacenter:
				return true
			case A.DestinationName != B.DestinationName:
				return true
			case A.LocalBindAddress != B.LocalBindAddress:
				return true
			case A.LocalBindPort != B.LocalBindPort:
				return true
			case A.MeshGateway.Mode != B.MeshGateway.Mode:
				return true
			case !reflect.DeepEqual(A.Config, B.Config):
				return true
			}
		}
		return false
	}

	return upstreamsDifferent(
		getConnectUpstreams(wanted),
		getProxyUpstreams(sidecar),
	)
}

// connectSidecarDifferent returns true if Nomad expects there to be a sidecar
// hanging off the desired parent service definition on the Consul side, and does
// not match with what Consul has.
//
// This is used to determine if the connect sidecar service registration should be
// updated - potentially (but not necessarily) in-place.
func connectSidecarDifferent(wanted *api.AgentServiceRegistration, sidecar *api.AgentService) bool {
	if wanted.Connect != nil && wanted.Connect.SidecarService != nil {
		if sidecar == nil {
			// consul lost our sidecar (?)
			return true
		}

		if sidecarTagsDifferent(wanted.Tags, wanted.Connect.SidecarService.Tags, sidecar.Tags) {
			// tags on the nomad definition have been modified
			return true
		}

		if proxyUpstreamsDifferent(wanted.Connect, sidecar.Proxy) {
			// proxy upstreams on the nomad definition have been modified
			return true
		}
	}

	// Either Nomad does not expect there to be a sidecar_service, or there is
	// no actionable difference from the Consul sidecar_service definition.
	return false
}

// operations are submitted to the main loop via commit() for synchronizing
// with Consul.
type operations struct {
	regServices   []*api.AgentServiceRegistration
	regChecks     []*api.AgentCheckRegistration
	deregServices []string
	deregChecks   []string
}

// empty reports whether o is nil or contains no registrations and no
// deregistrations.
func (o *operations) empty() bool {
	switch {
	case o == nil:
		return true
	case len(o.regServices) > 0:
		return false
	case len(o.regChecks) > 0:
		return false
	case len(o.deregServices) > 0:
		return false
	case len(o.deregChecks) > 0:
		return false
	default:
		return true
	}
}

// String renders the number of entries in each list, in the order service
// registrations, check registrations, service deregistrations, check
// deregistrations. Unlike empty, it dereferences o without a nil check.
func (o *operations) String() string {
	return fmt.Sprintf("<%d, %d, %d, %d>", len(o.regServices), len(o.regChecks), len(o.deregServices), len(o.deregChecks))
}

// ServiceClient handles task and agent service registration with Consul.
type ServiceClient struct {
	agentAPI         AgentAPI
	namespacesClient *NamespacesClient

	logger           hclog.Logger
	retryInterval    time.Duration
	maxRetryInterval time.Duration
	periodicInterval time.Duration

	// exitCh is closed when the main Run loop exits
	exitCh chan struct{}

	// shutdownCh is closed when the client should shutdown
	shutdownCh chan struct{}

	// shutdownWait is how long Shutdown() blocks waiting for the final
	// sync() to finish. Defaults to defaultShutdownWait
	shutdownWait time.Duration

	// opCh carries operations from commit() to the Run loop.
	opCh chan *operations

	// services and checks hold the desired registrations by ID; they are
	// updated by merge() and read by sync().
	services map[string]*api.AgentServiceRegistration
	checks   map[string]*api.AgentCheckRegistration

	// IDs deregistered through merge() since the last successful sync. sync()
	// lets these bypass the deregistration probation period.
	explicitlyDeregisteredServices *set.Set[string]
	explicitlyDeregisteredChecks   *set.Set[string]

	// allocRegistrations stores the services and checks that are registered
	// with Consul by allocation ID.
	allocRegistrations     map[string]*serviceregistration.AllocRegistration
	allocRegistrationsLock sync.RWMutex

	// Nomad agent services and checks that are recorded so they can be removed
	// on shutdown. Defers to consul namespace specified in client consul config.
	agentServices *set.Set[string]
	agentChecks   *set.Set[string]
	agentLock     sync.Mutex

	// seen is 1 if Consul has ever been seen; otherwise 0. Accessed with
	// atomics.
	seen int32

	// deregisterProbationExpiry is the time before which consul sync shouldn't deregister
	// unknown services.
	// Used to mitigate risk of deleting restored services upon client restart.
	deregisterProbationExpiry time.Time

	// checkWatcher restarts checks that are unhealthy.
	checkWatcher *serviceregistration.UniversalCheckWatcher

	// isClientAgent specifies whether this Consul client is being used
	// by a Nomad client.
	isClientAgent bool
}

// checkStatusGetter is the consul-specific implementation of serviceregistration.CheckStatusGetter
type checkStatusGetter struct {
	agentAPI         AgentAPI
	namespacesClient *NamespacesClient
}

// Get returns the status of every check the Consul agent reports, keyed by
// the map key Consul returns for the check, gathered across all namespaces.
// If two namespaces return the same key, the later one overwrites the earlier.
// The first error from listing namespaces or checks aborts the call.
func (csg *checkStatusGetter) Get() (map[string]string, error) {
	// Get the list of all namespaces so we can iterate them.
	namespaces, err := csg.namespacesClient.List()
	if err != nil {
		return nil, err
	}

	results := make(map[string]string)
	for _, namespace := range namespaces {
		resultsInNamespace, err := csg.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)})
		if err != nil {
			return nil, err
		}

		for k, v := range resultsInNamespace {
			results[k] = v.Status
		}
	}
	return results, nil
}

// NewServiceClient creates a new Consul ServiceClient from an existing Consul API
// Client, logger and takes whether the client is being used by a Nomad Client agent.
// When being used by a Nomad client, this Consul client reconciles all services and
// checks created by Nomad on behalf of running tasks.
//
// The deregistration probation period starts counting when this constructor
// is called.
func NewServiceClient(agentAPI AgentAPI, namespacesClient *NamespacesClient, logger hclog.Logger, isNomadClient bool) *ServiceClient {
	logger = logger.ResetNamed("consul.sync")
	return &ServiceClient{
		agentAPI:                       agentAPI,
		namespacesClient:               namespacesClient,
		logger:                         logger,
		retryInterval:                  defaultRetryInterval,
		maxRetryInterval:               defaultMaxRetryInterval,
		periodicInterval:               defaultPeriodicInterval,
		exitCh:                         make(chan struct{}),
		shutdownCh:                     make(chan struct{}),
		shutdownWait:                   defaultShutdownWait,
		opCh:                           make(chan *operations, 8),
		services:                       make(map[string]*api.AgentServiceRegistration),
		checks:                         make(map[string]*api.AgentCheckRegistration),
		explicitlyDeregisteredServices: set.New[string](0),
		explicitlyDeregisteredChecks:   set.New[string](0),
		allocRegistrations:             make(map[string]*serviceregistration.AllocRegistration),
		agentServices:                  set.New[string](4),
		agentChecks:                    set.New[string](0),
		isClientAgent:                  isNomadClient,
		deregisterProbationExpiry:      time.Now().Add(deregisterProbationPeriod),
		checkWatcher: serviceregistration.NewCheckWatcher(logger, &checkStatusGetter{
			agentAPI:         agentAPI,
			namespacesClient: namespacesClient,
		}),
	}
}

// seen is used by markSeen and hasSeen
const seen = 1

// markSeen marks Consul as having been seen (meaning at least one operation
// has succeeded).
func (c *ServiceClient) markSeen() {
	atomic.StoreInt32(&c.seen, seen)
}

// hasSeen returns true if any Consul operation has ever succeeded. Useful to
// squelch errors if Consul isn't running.
func (c *ServiceClient) hasSeen() bool {
	return atomic.LoadInt32(&c.seen) == seen
}

// syncReason indicates why a sync operation with consul is about to happen.
//
// The trigger for a sync may have implications on the behavior of the sync itself.
// In particular if a service is defined with enable_tag_override=true, the sync
// should ignore changes to the service's Tags field.
type syncReason byte

const (
	syncPeriodic syncReason = iota
	syncShutdown
	syncNewOps
)

// String returns a short label for the reason, or "unexpected" for a value
// outside the declared constants.
func (sr syncReason) String() string {
	switch sr {
	case syncPeriodic:
		return "periodic"
	case syncShutdown:
		return "shutdown"
	case syncNewOps:
		return "operations"
	default:
		return "unexpected"
	}
}

// Run the Consul main loop which retries operations against Consul. It should
// be called exactly once.
//
// exitCh is closed when Run returns.
func (c *ServiceClient) Run() {
	defer close(c.exitCh)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// init will be closed when Consul has been contacted
	init := make(chan struct{})
	go checkConsulTLSSkipVerify(ctx, c.logger, c.agentAPI, init)

	// Process operations while waiting for initial contact with Consul but
	// do not sync until contact has been made.
INIT:
	for {
		select {
		case <-init:
			c.markSeen()
			break INIT
		case <-c.shutdownCh:
			return
		case ops := <-c.opCh:
			c.merge(ops)
		}
	}
	c.logger.Trace("able to contact Consul")

	// Contact with Consul has been established, so start the checkWatcher.
	go c.checkWatcher.Run(ctx)

	// Always immediately sync to reconcile Nomad and Consul's state
	retryTimer := time.NewTimer(0)

	failures := 0
	for {
		// On every iteration take note of what the trigger for the next sync
		// was, so that it may be referenced during the sync itself.
		var reasonForSync syncReason

		select {
		case <-retryTimer.C:
			// A timer firing is always reported as a periodic sync, including
			// when the timer was armed as a retry backoff after a failure.
			reasonForSync = syncPeriodic
		case <-c.shutdownCh:
			reasonForSync = syncShutdown
			// Cancel check watcher but sync one last time
			cancel()
		case ops := <-c.opCh:
			reasonForSync = syncNewOps
			c.merge(ops)
		}

		if err := c.sync(reasonForSync); err != nil {
			if failures == 0 {
				// Log on the first failure
				c.logger.Warn("failed to update services in Consul", "error", err)
			} else if failures%10 == 0 {
				// Log every 10th consecutive failure
				c.logger.Error("still unable to update services in Consul", "failures", failures, "error", err)
			}

			failures++
			if !retryTimer.Stop() {
				// Timer already expired, since the timer may
				// or may not have been read in the select{}
				// above, conditionally receive on it
				select {
				case <-retryTimer.C:
				default:
				}
			}
			// Linear backoff: retryInterval per consecutive failure, capped
			// at maxRetryInterval.
			backoff := c.retryInterval * time.Duration(failures)
			if backoff > c.maxRetryInterval {
				backoff = c.maxRetryInterval
			}
			retryTimer.Reset(backoff)
		} else {
			if failures > 0 {
				c.logger.Info("successfully updated services in Consul")
				failures = 0
			}

			// on successful sync, clear deregistered consul entities
			c.clearExplicitlyDeregistered()

			// Reset timer to periodic interval to periodically
			// reconcile with Consul
			if !retryTimer.Stop() {
				select {
				case <-retryTimer.C:
				default:
				}
			}
			retryTimer.Reset(c.periodicInterval)
		}

		select {
		case <-c.shutdownCh:
			// Exit only after sync'ing all outstanding operations
			if len(c.opCh) > 0 {
				for len(c.opCh) > 0 {
					c.merge(<-c.opCh)
				}
				continue
			}
			return
		default:
		}
	}
}

// commit operations unless already shutting down.
func (c *ServiceClient) commit(ops *operations) {
	c.logger.Trace("commit sync operations", "ops", ops)

	// Ignore empty operations - ideally callers will optimize out syncs with
	// nothing to do, but be defensive anyway. Sending an empty ops on the chan
	// will trigger an unnecessary sync with Consul.
	if ops.empty() {
		return
	}

	// Prioritize doing nothing if we are being signaled to shutdown.
	select {
	case <-c.shutdownCh:
		return
	default:
	}

	// Send the ops down the ops chan, triggering a sync with Consul. Unless we
	// receive a signal to shutdown.
	select {
	case c.opCh <- ops:
	case <-c.shutdownCh:
	}
}

// clearExplicitlyDeregistered replaces both explicitly-deregistered ID sets
// with fresh empty sets. Run calls it after every successful sync.
func (c *ServiceClient) clearExplicitlyDeregistered() {
	c.explicitlyDeregisteredServices = set.New[string](0)
	c.explicitlyDeregisteredChecks = set.New[string](0)
}

// merge registrations into state map prior to sync'ing with Consul
//
// Deregistered IDs are removed from the state maps and remembered in the
// explicitly-deregistered sets. The service and check gauges are updated to
// the resulting map sizes.
func (c *ServiceClient) merge(ops *operations) {
	for _, s := range ops.regServices {
		c.services[s.ID] = s
	}
	for _, check := range ops.regChecks {
		c.checks[check.ID] = check
	}
	for _, sid := range ops.deregServices {
		delete(c.services, sid)
		c.explicitlyDeregisteredServices.Insert(sid)
	}
	for _, cid := range ops.deregChecks {
		delete(c.checks, cid)
		c.explicitlyDeregisteredChecks.Insert(cid)
	}
	metrics.SetGauge([]string{"client", "consul", "services"}, float32(len(c.services)))
	metrics.SetGauge([]string{"client", "consul", "checks"}, float32(len(c.checks)))
}

// sync enqueued operations.
//
// The steps run in this order: list namespaces, list services, deregister
// unknown Nomad services, register missing or changed services, list checks,
// deregister unknown Nomad checks, register missing checks. The first error
// aborts the sync and increments the sync_failure counter, except for
// deregistration errors on old-style Nomad service IDs, which are skipped.
func (c *ServiceClient) sync(reason syncReason) error {
	c.logger.Trace("execute sync", "reason", reason)

	sreg, creg, sdereg, cdereg := 0, 0, 0, 0
	var err error

	// Get the list of all namespaces created so we can iterate them.
	namespaces, err := c.namespacesClient.List()
	if err != nil {
		metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
		return fmt.Errorf("failed to query Consul namespaces: %w", err)
	}

	// Accumulate all services in Consul across all namespaces.
	servicesInConsul := make(map[string]*api.AgentService)
	for _, namespace := range namespaces {
		if nsServices, err := c.agentAPI.ServicesWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}); err != nil {
			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return fmt.Errorf("failed to query Consul services: %w", err)
		} else {
			for k, v := range nsServices {
				servicesInConsul[k] = v
			}
		}
	}

	// Compute whether we are still in probation period where we will avoid
	// de-registering services.
	inProbation := time.Now().Before(c.deregisterProbationExpiry)

	// Remove Nomad services in Consul but unknown to Nomad.
	for id := range servicesInConsul {
		if _, ok := c.services[id]; ok {
			// Known service, skip
			continue
		}

		// Ignore if this is not a Nomad managed service. Also ignore
		// Nomad managed services if this is not a client agent.
		// This is to prevent server agents from removing services
		// registered by client agents
		if !isNomadService(id) || !c.isClientAgent {
			// Not managed by Nomad, skip
			continue
		}

		// Ignore unknown services during probation
		if inProbation && !c.explicitlyDeregisteredServices.Contains(id) {
			continue
		}

		// Ignore if this is a service for a Nomad managed sidecar proxy.
		if maybeConnectSidecar(id) {
			continue
		}

		// Get the Consul namespace this service is in.
		ns := servicesInConsul[id].Namespace

		// If this service has a sidecar, we need to remove the sidecar first,
		// otherwise Consul will produce a warning and an error when removing
		// the parent service.
		//
		// The sidecar is not tracked on the Nomad side; it was registered
		// implicitly through the parent service.
		if sidecar := getNomadSidecar(id, servicesInConsul); sidecar != nil {
			if err := c.agentAPI.ServiceDeregisterOpts(sidecar.ID, &api.QueryOptions{Namespace: ns}); err != nil {
				metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
				return err
			}
		}

		// Remove the unwanted service.
		if err := c.agentAPI.ServiceDeregisterOpts(id, &api.QueryOptions{Namespace: ns}); err != nil {
			if isOldNomadService(id) {
				// Don't hard-fail on old entries. See #3620
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		sdereg++
		metrics.IncrCounter([]string{"client", "consul", "service_deregistrations"}, 1)
	}

	// Add Nomad managed services missing in Consul, or updated via Nomad.
	for id, serviceInNomad := range c.services {
		serviceInConsul, exists := servicesInConsul[id]
		sidecarInConsul := getNomadSidecar(id, servicesInConsul)

		// exists is tested first, so agentServiceUpdateRequired never
		// receives a nil serviceInConsul.
		if !exists || c.agentServiceUpdateRequired(reason, serviceInNomad, serviceInConsul, sidecarInConsul) {
			c.logger.Trace("must register service", "id", id, "exists", exists, "reason", reason)
			if err = c.agentAPI.ServiceRegister(serviceInNomad); err != nil {
				metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
				return err
			}
			sreg++
			metrics.IncrCounter([]string{"client", "consul", "service_registrations"}, 1)
		}
	}

	// Accumulate all checks in Consul across all namespaces.
	checksInConsul := make(map[string]*api.AgentCheck)
	for _, namespace := range namespaces {
		nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)})
		if err != nil {
			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return fmt.Errorf("failed to query Consul checks: %w", err)
		}
		for k, v := range nsChecks {
			checksInConsul[k] = v
		}
	}

	// Remove Nomad checks in Consul but unknown locally
	for id, check := range checksInConsul {
		if _, ok := c.checks[id]; ok {
			// Known check, leave it
			continue
		}

		// Ignore if this is not a Nomad managed check. Also ignore
		// Nomad managed checks if this is not a client agent.
		// This is to prevent server agents from removing checks
		// registered by client agents
		if !isNomadService(check.ServiceID) || !c.isClientAgent || !isNomadCheck(check.CheckID) {
			// Service not managed by Nomad, skip
			continue
		}

		// Ignore unknown checks during probation
		if inProbation && !c.explicitlyDeregisteredChecks.Contains(id) {
			continue
		}

		// Ignore if this is a check for a Nomad managed sidecar proxy.
		if maybeSidecarProxyCheck(id) {
			continue
		}

		// Unknown Nomad managed check; remove
		if err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: check.Namespace}); err != nil {
			if isOldNomadService(check.ServiceID) {
				// Don't hard-fail on old entries.
				continue
			}

			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		cdereg++
		metrics.IncrCounter([]string{"client", "consul", "check_deregistrations"}, 1)
	}

	// Add Nomad checks missing from Consul
	for id, check := range c.checks {
		if _, ok := checksInConsul[id]; ok {
			// Already in Consul; skipping
			continue
		}
		if err := c.agentAPI.CheckRegister(check); err != nil {
			metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
			return err
		}
		creg++
		metrics.IncrCounter([]string{"client", "consul", "check_registrations"}, 1)
	}

	// Only log if something was actually synced
	if sreg > 0 || sdereg > 0 || creg > 0 || cdereg > 0 {
		c.logger.Debug("sync complete", "registered_services", sreg, "deregistered_services", sdereg,
			"registered_checks", creg, "deregistered_checks", cdereg)
	}
	return nil
}

// RegisterAgent registers Nomad agents (client or server).
The 915 // Service.PortLabel should be a literal port to be parsed with SplitHostPort. 916 // Script checks are not supported and will return an error. Registration is 917 // asynchronous. 918 // 919 // Agents will be deregistered when Shutdown is called. 920 // 921 // Note: no need to manually plumb Consul namespace into the agent service registration 922 // or its check registrations, because the Nomad Client's Consul Client will already 923 // have the Nomad Client's Consul Namespace set on startup. 924 func (c *ServiceClient) RegisterAgent(role string, services []*structs.Service) error { 925 ops := operations{} 926 927 for _, service := range services { 928 id := makeAgentServiceID(role, service) 929 930 // Unlike tasks, agents don't use port labels. Agent ports are 931 // stored directly in the PortLabel. 932 host, rawport, err := net.SplitHostPort(service.PortLabel) 933 if err != nil { 934 return fmt.Errorf("error parsing port label %q from service %q: %v", service.PortLabel, service.Name, err) 935 } 936 port, err := strconv.Atoi(rawport) 937 if err != nil { 938 return fmt.Errorf("error parsing port %q from service %q: %v", rawport, service.Name, err) 939 } 940 serviceReg := &api.AgentServiceRegistration{ 941 ID: id, 942 Name: service.Name, 943 Tags: service.Tags, 944 Address: host, 945 Port: port, 946 // This enables the consul UI to show that Nomad registered this service 947 Meta: map[string]string{ 948 "external-source": "nomad", 949 }, 950 } 951 ops.regServices = append(ops.regServices, serviceReg) 952 953 for _, check := range service.Checks { 954 checkID := MakeCheckID(id, check) 955 if check.Type == structs.ServiceCheckScript { 956 return fmt.Errorf("service %q contains invalid check: agent checks do not support scripts", service.Name) 957 } 958 checkHost, checkPort := serviceReg.Address, serviceReg.Port 959 if check.PortLabel != "" { 960 // Unlike tasks, agents don't use port labels. Agent ports are 961 // stored directly in the PortLabel. 
962 host, rawport, err := net.SplitHostPort(check.PortLabel) 963 if err != nil { 964 return fmt.Errorf("error parsing port label %q from check %q: %v", service.PortLabel, check.Name, err) 965 } 966 port, err := strconv.Atoi(rawport) 967 if err != nil { 968 return fmt.Errorf("error parsing port %q from check %q: %v", rawport, check.Name, err) 969 } 970 checkHost, checkPort = host, port 971 } 972 checkReg, err := createCheckReg(id, checkID, check, checkHost, checkPort, "") 973 if err != nil { 974 return fmt.Errorf("failed to add check %q: %v", check.Name, err) 975 } 976 ops.regChecks = append(ops.regChecks, checkReg) 977 } 978 } 979 980 // Don't bother committing agent checks if we're already shutting down 981 c.agentLock.Lock() 982 defer c.agentLock.Unlock() 983 select { 984 case <-c.shutdownCh: 985 return nil 986 default: 987 } 988 989 // Now add them to the registration queue 990 c.commit(&ops) 991 992 // Record IDs for deregistering on shutdown 993 for _, id := range ops.regServices { 994 c.agentServices.Insert(id.ID) 995 } 996 for _, id := range ops.regChecks { 997 c.agentChecks.Insert(id.ID) 998 } 999 return nil 1000 } 1001 1002 // serviceRegs creates service registrations, check registrations, and script 1003 // checks from a service. It returns a service registration object with the 1004 // service and check IDs populated. 
1005 func (c *ServiceClient) serviceRegs( 1006 ops *operations, 1007 service *structs.Service, 1008 workload *serviceregistration.WorkloadServices, 1009 ) (*serviceregistration.ServiceRegistration, error) { 1010 1011 // Get the services ID 1012 id := serviceregistration.MakeAllocServiceID(workload.AllocInfo.AllocID, workload.Name(), service) 1013 sreg := &serviceregistration.ServiceRegistration{ 1014 ServiceID: id, 1015 CheckIDs: make(map[string]struct{}, len(service.Checks)), 1016 CheckOnUpdate: make(map[string]string, len(service.Checks)), 1017 } 1018 1019 // Service address modes default to auto 1020 addrMode := service.AddressMode 1021 if addrMode == "" { 1022 addrMode = structs.AddressModeAuto 1023 } 1024 1025 // Determine the address to advertise based on the mode 1026 ip, port, err := serviceregistration.GetAddress( 1027 service.Address, addrMode, service.PortLabel, workload.Networks, workload.DriverNetwork, workload.Ports, workload.NetworkStatus) 1028 if err != nil { 1029 return nil, fmt.Errorf("unable to get address for service %q: %v", service.Name, err) 1030 } 1031 1032 // Determine whether to use tags or canary_tags 1033 var tags []string 1034 if workload.Canary && len(service.CanaryTags) > 0 { 1035 tags = make([]string, len(service.CanaryTags)) 1036 copy(tags, service.CanaryTags) 1037 } else { 1038 tags = make([]string, len(service.Tags)) 1039 copy(tags, service.Tags) 1040 } 1041 1042 // newConnect returns (nil, nil) if there's no Connect-enabled service. 1043 connect, err := newConnect(id, workload.AllocInfo, service.Name, service.Connect, workload.Networks, workload.Ports) 1044 if err != nil { 1045 return nil, fmt.Errorf("invalid Consul Connect configuration for service %q: %v", service.Name, err) 1046 } 1047 1048 // newConnectGateway returns nil if there's no Connect gateway. 
1049 gateway := newConnectGateway(service.Connect) 1050 1051 // Determine whether to use meta or canary_meta 1052 var meta map[string]string 1053 if workload.Canary && len(service.CanaryMeta) > 0 { 1054 meta = make(map[string]string, len(service.CanaryMeta)+1) 1055 for k, v := range service.CanaryMeta { 1056 meta[k] = v 1057 } 1058 } else { 1059 meta = make(map[string]string, len(service.Meta)+1) 1060 for k, v := range service.Meta { 1061 meta[k] = v 1062 } 1063 } 1064 1065 // This enables the consul UI to show that Nomad registered this service 1066 meta["external-source"] = "nomad" 1067 1068 // Explicitly set the Consul service Kind in case this service represents 1069 // one of the Connect gateway types. 1070 kind := api.ServiceKindTypical 1071 switch { 1072 case service.Connect.IsIngress(): 1073 kind = api.ServiceKindIngressGateway 1074 case service.Connect.IsTerminating(): 1075 kind = api.ServiceKindTerminatingGateway 1076 1077 if proxy := service.Connect.Gateway.Proxy; proxy != nil { 1078 // set the default port if bridge / default listener set 1079 if defaultBind, exists := proxy.EnvoyGatewayBindAddresses["default"]; exists { 1080 portLabel := envoy.PortLabel(structs.ConnectTerminatingPrefix, service.Name, "") 1081 if dynPort, ok := workload.Ports.Get(portLabel); ok { 1082 defaultBind.Port = dynPort.Value 1083 } 1084 } 1085 } 1086 case service.Connect.IsMesh(): 1087 kind = api.ServiceKindMeshGateway 1088 1089 if proxy := service.Connect.Gateway.Proxy; proxy != nil { 1090 // wan uses the service port label, which is typically on a discrete host_network 1091 if wanBind, exists := proxy.EnvoyGatewayBindAddresses["wan"]; exists { 1092 if wanPort, ok := workload.Ports.Get(service.PortLabel); ok { 1093 wanBind.Port = wanPort.Value 1094 } 1095 } 1096 // lan uses a nomad generated dynamic port on the default network 1097 if lanBind, exists := proxy.EnvoyGatewayBindAddresses["lan"]; exists { 1098 portLabel := envoy.PortLabel(structs.ConnectMeshPrefix, service.Name, 
"lan") 1099 if dynPort, ok := workload.Ports.Get(portLabel); ok { 1100 lanBind.Port = dynPort.Value 1101 } 1102 } 1103 } 1104 } 1105 1106 taggedAddresses, err := parseTaggedAddresses(service.TaggedAddresses, port) 1107 if err != nil { 1108 return nil, err 1109 } 1110 1111 // Build the Consul Service registration request 1112 serviceReg := &api.AgentServiceRegistration{ 1113 Kind: kind, 1114 ID: id, 1115 Name: service.Name, 1116 Namespace: workload.ProviderNamespace, 1117 Tags: tags, 1118 EnableTagOverride: service.EnableTagOverride, 1119 Address: ip, 1120 Port: port, 1121 Meta: meta, 1122 TaggedAddresses: taggedAddresses, 1123 Connect: connect, // will be nil if no Connect stanza 1124 Proxy: gateway, // will be nil if no Connect Gateway stanza 1125 Checks: make([]*api.AgentServiceCheck, 0, len(service.Checks)), 1126 } 1127 ops.regServices = append(ops.regServices, serviceReg) 1128 1129 // Build the check registrations 1130 checkRegs, err := c.checkRegs(id, service, workload, sreg) 1131 if err != nil { 1132 return nil, err 1133 } 1134 1135 for _, registration := range checkRegs { 1136 sreg.CheckIDs[registration.ID] = struct{}{} 1137 ops.regChecks = append(ops.regChecks, registration) 1138 serviceReg.Checks = append( 1139 serviceReg.Checks, 1140 apiCheckRegistrationToCheck(registration), 1141 ) 1142 } 1143 1144 return sreg, nil 1145 } 1146 1147 // apiCheckRegistrationToCheck converts a check registration to a check, so that 1148 // we can include them in the initial service registration. It is expected the 1149 // Nomad-conversion (e.g. turning script checks into ttl checks) has already been 1150 // applied. 
1151 func apiCheckRegistrationToCheck(r *api.AgentCheckRegistration) *api.AgentServiceCheck { 1152 return &api.AgentServiceCheck{ 1153 CheckID: r.ID, 1154 Name: r.Name, 1155 Interval: r.Interval, 1156 Timeout: r.Timeout, 1157 TTL: r.TTL, 1158 HTTP: r.HTTP, 1159 Header: maps.Clone(r.Header), 1160 Method: r.Method, 1161 Body: r.Body, 1162 TCP: r.TCP, 1163 Status: r.Status, 1164 TLSSkipVerify: r.TLSSkipVerify, 1165 GRPC: r.GRPC, 1166 GRPCUseTLS: r.GRPCUseTLS, 1167 SuccessBeforePassing: r.SuccessBeforePassing, 1168 FailuresBeforeCritical: r.FailuresBeforeCritical, 1169 } 1170 } 1171 1172 // checkRegs creates check registrations for the given service 1173 func (c *ServiceClient) checkRegs( 1174 serviceID string, 1175 service *structs.Service, 1176 workload *serviceregistration.WorkloadServices, 1177 sreg *serviceregistration.ServiceRegistration, 1178 ) ([]*api.AgentCheckRegistration, error) { 1179 1180 registrations := make([]*api.AgentCheckRegistration, 0, len(service.Checks)) 1181 for _, check := range service.Checks { 1182 var ip string 1183 var port int 1184 1185 if check.Type != structs.ServiceCheckScript { 1186 portLabel := check.PortLabel 1187 if portLabel == "" { 1188 portLabel = service.PortLabel 1189 } 1190 1191 addrMode := check.AddressMode 1192 if addrMode == "" { 1193 if service.Address != "" { 1194 // if the service is using a custom address, enable the check 1195 // to use that address 1196 addrMode = structs.AddressModeAuto 1197 } else { 1198 // otherwise default to the host address 1199 addrMode = structs.AddressModeHost 1200 } 1201 } 1202 1203 var err error 1204 ip, port, err = serviceregistration.GetAddress( 1205 service.Address, addrMode, portLabel, workload.Networks, workload.DriverNetwork, workload.Ports, workload.NetworkStatus) 1206 if err != nil { 1207 return nil, fmt.Errorf("error getting address for check %q: %v", check.Name, err) 1208 } 1209 } 1210 1211 checkID := MakeCheckID(serviceID, check) 1212 registration, err := 
createCheckReg(serviceID, checkID, check, ip, port, workload.ProviderNamespace) 1213 if err != nil { 1214 return nil, fmt.Errorf("failed to add check %q: %v", check.Name, err) 1215 } 1216 sreg.CheckOnUpdate[checkID] = check.OnUpdate 1217 registrations = append(registrations, registration) 1218 } 1219 1220 return registrations, nil 1221 } 1222 1223 // RegisterWorkload with Consul. Adds all service entries and checks to Consul. 1224 // 1225 // If the service IP is set it used as the address in the service registration. 1226 // Checks will always use the IP from the Task struct (host's IP). 1227 // 1228 // Actual communication with Consul is done asynchronously (see Run). 1229 func (c *ServiceClient) RegisterWorkload(workload *serviceregistration.WorkloadServices) error { 1230 // Fast path 1231 numServices := len(workload.Services) 1232 if numServices == 0 { 1233 return nil 1234 } 1235 1236 t := new(serviceregistration.ServiceRegistrations) 1237 t.Services = make(map[string]*serviceregistration.ServiceRegistration, numServices) 1238 1239 ops := &operations{} 1240 for _, service := range workload.Services { 1241 sreg, err := c.serviceRegs(ops, service, workload) 1242 if err != nil { 1243 return err 1244 } 1245 t.Services[sreg.ServiceID] = sreg 1246 } 1247 1248 // Add the workload to the allocation's registration 1249 c.addRegistrations(workload.AllocInfo.AllocID, workload.Name(), t) 1250 1251 c.commit(ops) 1252 1253 // Start watching checks. Done after service registrations are built 1254 // since an error building them could leak watches. 
1255 for _, service := range workload.Services { 1256 serviceID := serviceregistration.MakeAllocServiceID(workload.AllocInfo.AllocID, workload.Name(), service) 1257 for _, check := range service.Checks { 1258 if check.TriggersRestarts() { 1259 checkID := MakeCheckID(serviceID, check) 1260 c.checkWatcher.Watch(workload.AllocInfo.AllocID, workload.Name(), checkID, check, workload.Restarter) 1261 } 1262 } 1263 } 1264 return nil 1265 } 1266 1267 // UpdateWorkload in Consul. Does not alter the service if only checks have 1268 // changed. 1269 // 1270 // DriverNetwork must not change between invocations for the same allocation. 1271 func (c *ServiceClient) UpdateWorkload(old, newWorkload *serviceregistration.WorkloadServices) error { 1272 ops := new(operations) 1273 regs := new(serviceregistration.ServiceRegistrations) 1274 regs.Services = make(map[string]*serviceregistration.ServiceRegistration, len(newWorkload.Services)) 1275 1276 newIDs := make(map[string]*structs.Service, len(newWorkload.Services)) 1277 for _, s := range newWorkload.Services { 1278 newIDs[serviceregistration.MakeAllocServiceID(newWorkload.AllocInfo.AllocID, newWorkload.Name(), s)] = s 1279 } 1280 1281 // Loop over existing Services to see if they have been removed 1282 for _, existingSvc := range old.Services { 1283 existingID := serviceregistration.MakeAllocServiceID(old.AllocInfo.AllocID, old.Name(), existingSvc) 1284 newSvc, ok := newIDs[existingID] 1285 1286 if !ok { 1287 // Existing service entry removed 1288 ops.deregServices = append(ops.deregServices, existingID) 1289 for _, check := range existingSvc.Checks { 1290 cid := MakeCheckID(existingID, check) 1291 ops.deregChecks = append(ops.deregChecks, cid) 1292 1293 // Unwatch watched checks 1294 if check.TriggersRestarts() { 1295 c.checkWatcher.Unwatch(cid) 1296 } 1297 } 1298 continue 1299 } 1300 1301 oldHash := existingSvc.Hash(old.AllocInfo.AllocID, old.Name(), old.Canary) 1302 newHash := newSvc.Hash(newWorkload.AllocInfo.AllocID, 
newWorkload.Name(), newWorkload.Canary) 1303 if oldHash == newHash { 1304 // Service exists and hasn't changed, don't re-add it later 1305 delete(newIDs, existingID) 1306 } 1307 1308 // Service still exists so add it to the task's registration 1309 sreg := &serviceregistration.ServiceRegistration{ 1310 ServiceID: existingID, 1311 CheckIDs: make(map[string]struct{}, len(newSvc.Checks)), 1312 CheckOnUpdate: make(map[string]string, len(newSvc.Checks)), 1313 } 1314 regs.Services[existingID] = sreg 1315 1316 // See if any checks were updated 1317 existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks)) 1318 for _, check := range existingSvc.Checks { 1319 existingChecks[MakeCheckID(existingID, check)] = check 1320 } 1321 1322 // Register new checks 1323 for _, check := range newSvc.Checks { 1324 checkID := MakeCheckID(existingID, check) 1325 if _, exists := existingChecks[checkID]; exists { 1326 // Check is still required. Remove it from the map so it doesn't get 1327 // deleted later. 
1328 delete(existingChecks, checkID) 1329 sreg.CheckIDs[checkID] = struct{}{} 1330 sreg.CheckOnUpdate[checkID] = check.OnUpdate 1331 } 1332 1333 // New check on an unchanged service; add them now 1334 checkRegs, err := c.checkRegs(existingID, newSvc, newWorkload, sreg) 1335 if err != nil { 1336 return err 1337 } 1338 1339 for _, registration := range checkRegs { 1340 sreg.CheckIDs[registration.ID] = struct{}{} 1341 sreg.CheckOnUpdate[registration.ID] = check.OnUpdate 1342 ops.regChecks = append(ops.regChecks, registration) 1343 } 1344 1345 // Update all watched checks as CheckRestart fields aren't part of ID 1346 if check.TriggersRestarts() { 1347 c.checkWatcher.Watch(newWorkload.AllocInfo.AllocID, newWorkload.Name(), checkID, check, newWorkload.Restarter) 1348 } 1349 } 1350 1351 // Remove existing checks not in updated service 1352 for cid, check := range existingChecks { 1353 ops.deregChecks = append(ops.deregChecks, cid) 1354 1355 // Unwatch checks 1356 if check.TriggersRestarts() { 1357 c.checkWatcher.Unwatch(cid) 1358 } 1359 } 1360 } 1361 1362 // Any remaining services should just be enqueued directly 1363 for _, newSvc := range newIDs { 1364 sreg, err := c.serviceRegs(ops, newSvc, newWorkload) 1365 if err != nil { 1366 return err 1367 } 1368 1369 regs.Services[sreg.ServiceID] = sreg 1370 } 1371 1372 // Add the task to the allocation's registration 1373 c.addRegistrations(newWorkload.AllocInfo.AllocID, newWorkload.Name(), regs) 1374 1375 c.commit(ops) 1376 1377 // Start watching checks. Done after service registrations are built 1378 // since an error building them could leak watches. 1379 for serviceID, service := range newIDs { 1380 for _, check := range service.Checks { 1381 if check.TriggersRestarts() { 1382 checkID := MakeCheckID(serviceID, check) 1383 c.checkWatcher.Watch(newWorkload.AllocInfo.AllocID, newWorkload.Name(), checkID, check, newWorkload.Restarter) 1384 } 1385 } 1386 } 1387 1388 return nil 1389 } 1390 1391 // RemoveWorkload from Consul. 
Removes all service entries and checks. 1392 // 1393 // Actual communication with Consul is done asynchronously (see Run). 1394 func (c *ServiceClient) RemoveWorkload(workload *serviceregistration.WorkloadServices) { 1395 ops := operations{} 1396 1397 for _, service := range workload.Services { 1398 id := serviceregistration.MakeAllocServiceID(workload.AllocInfo.AllocID, workload.Name(), service) 1399 ops.deregServices = append(ops.deregServices, id) 1400 1401 for _, check := range service.Checks { 1402 cid := MakeCheckID(id, check) 1403 ops.deregChecks = append(ops.deregChecks, cid) 1404 1405 if check.TriggersRestarts() { 1406 c.checkWatcher.Unwatch(cid) 1407 } 1408 } 1409 } 1410 1411 // Remove the workload from the alloc's registrations 1412 c.removeRegistration(workload.AllocInfo.AllocID, workload.Name()) 1413 1414 // Now add them to the deregistration fields; main Run loop will update 1415 c.commit(&ops) 1416 } 1417 1418 // normalizeNamespace will turn the "default" namespace into the empty string, 1419 // so that Consul OSS will not produce an error setting something in the default 1420 // namespace. 1421 func normalizeNamespace(namespace string) string { 1422 if namespace == "default" { 1423 return "" 1424 } 1425 return namespace 1426 } 1427 1428 // AllocRegistrations returns the registrations for the given allocation. If the 1429 // allocation has no registrations, the response is a nil object. 1430 func (c *ServiceClient) AllocRegistrations(allocID string) (*serviceregistration.AllocRegistration, error) { 1431 // Get the internal struct using the lock 1432 c.allocRegistrationsLock.RLock() 1433 regInternal, ok := c.allocRegistrations[allocID] 1434 if !ok { 1435 c.allocRegistrationsLock.RUnlock() 1436 return nil, nil 1437 } 1438 1439 // Copy so we don't expose internal structs 1440 reg := regInternal.Copy() 1441 c.allocRegistrationsLock.RUnlock() 1442 1443 // Get the list of all namespaces created so we can iterate them. 
1444 namespaces, err := c.namespacesClient.List() 1445 if err != nil { 1446 return nil, fmt.Errorf("failed to retrieve namespaces from consul: %w", err) 1447 } 1448 1449 services := make(map[string]*api.AgentService) 1450 checks := make(map[string]*api.AgentCheck) 1451 1452 // Query the services and checks to populate the allocation registrations. 1453 for _, namespace := range namespaces { 1454 nsServices, err := c.agentAPI.ServicesWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}) 1455 if err != nil { 1456 return nil, fmt.Errorf("failed to retrieve services from consul: %w", err) 1457 } 1458 for k, v := range nsServices { 1459 services[k] = v 1460 } 1461 1462 nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}) 1463 if err != nil { 1464 return nil, fmt.Errorf("failed to retrieve checks from consul: %w", err) 1465 } 1466 for k, v := range nsChecks { 1467 checks[k] = v 1468 } 1469 } 1470 1471 // Populate the object 1472 for _, treg := range reg.Tasks { 1473 for serviceID, sreg := range treg.Services { 1474 sreg.Service = services[serviceID] 1475 for checkID := range sreg.CheckIDs { 1476 if check, ok := checks[checkID]; ok { 1477 sreg.Checks = append(sreg.Checks, check) 1478 } 1479 } 1480 } 1481 } 1482 1483 return reg, nil 1484 } 1485 1486 // UpdateTTL is used to update the TTL of a check. Typically this will only be 1487 // called to heartbeat script checks. 1488 func (c *ServiceClient) UpdateTTL(id, namespace, output, status string) error { 1489 ns := normalizeNamespace(namespace) 1490 return c.agentAPI.UpdateTTLOpts(id, output, status, &api.QueryOptions{Namespace: ns}) 1491 } 1492 1493 // Shutdown the Consul client. Update running task registrations and deregister 1494 // agent from Consul. On first call blocks up to shutdownWait before giving up 1495 // on syncing operations. 
1496 func (c *ServiceClient) Shutdown() error { 1497 // Serialize Shutdown calls with RegisterAgent to prevent leaking agent 1498 // entries. 1499 c.agentLock.Lock() 1500 defer c.agentLock.Unlock() 1501 select { 1502 case <-c.shutdownCh: 1503 return nil 1504 default: 1505 close(c.shutdownCh) 1506 } 1507 1508 // Give run loop time to sync, but don't block indefinitely 1509 deadline := time.After(c.shutdownWait) 1510 1511 // Wait for Run to finish any outstanding operations and exit 1512 select { 1513 case <-c.exitCh: 1514 case <-deadline: 1515 // Don't wait forever though 1516 } 1517 1518 // If Consul was never seen nothing could be written so exit early 1519 if !c.hasSeen() { 1520 return nil 1521 } 1522 1523 // Always attempt to deregister Nomad agent Consul entries, even if 1524 // deadline was reached 1525 for _, id := range c.agentServices.List() { 1526 if err := c.agentAPI.ServiceDeregisterOpts(id, nil); err != nil { 1527 c.logger.Error("failed deregistering agent service", "service_id", id, "error", err) 1528 } 1529 } 1530 1531 namespaces, err := c.namespacesClient.List() 1532 if err != nil { 1533 c.logger.Error("failed to retrieve namespaces from consul", "error", err) 1534 } 1535 1536 remainingChecks := make(map[string]*api.AgentCheck) 1537 for _, namespace := range namespaces { 1538 nsChecks, err := c.agentAPI.ChecksWithFilterOpts("", &api.QueryOptions{Namespace: normalizeNamespace(namespace)}) 1539 if err != nil { 1540 c.logger.Error("failed to retrieve checks from consul", "error", err) 1541 } 1542 for k, v := range nsChecks { 1543 remainingChecks[k] = v 1544 } 1545 } 1546 1547 checkRemains := func(id string) bool { 1548 for _, c := range remainingChecks { 1549 if c.CheckID == id { 1550 return true 1551 } 1552 } 1553 return false 1554 } 1555 1556 for _, id := range c.agentChecks.List() { 1557 // if we couldn't populate remainingChecks it is unlikely that CheckDeregister will work, but try anyway 1558 // if we could list the remaining checks, verify that 
the check we store still exists before removing it. 1559 if remainingChecks == nil || checkRemains(id) { 1560 ns := remainingChecks[id].Namespace 1561 if err := c.agentAPI.CheckDeregisterOpts(id, &api.QueryOptions{Namespace: ns}); err != nil { 1562 c.logger.Error("failed deregistering agent check", "check_id", id, "error", err) 1563 } 1564 } 1565 } 1566 1567 return nil 1568 } 1569 1570 // addRegistration adds the service registrations for the given allocation. 1571 func (c *ServiceClient) addRegistrations(allocID, taskName string, reg *serviceregistration.ServiceRegistrations) { 1572 c.allocRegistrationsLock.Lock() 1573 defer c.allocRegistrationsLock.Unlock() 1574 1575 alloc, ok := c.allocRegistrations[allocID] 1576 if !ok { 1577 alloc = &serviceregistration.AllocRegistration{ 1578 Tasks: make(map[string]*serviceregistration.ServiceRegistrations), 1579 } 1580 c.allocRegistrations[allocID] = alloc 1581 } 1582 alloc.Tasks[taskName] = reg 1583 } 1584 1585 // removeRegistrations removes the registration for the given allocation. 1586 func (c *ServiceClient) removeRegistration(allocID, taskName string) { 1587 c.allocRegistrationsLock.Lock() 1588 defer c.allocRegistrationsLock.Unlock() 1589 1590 alloc, ok := c.allocRegistrations[allocID] 1591 if !ok { 1592 return 1593 } 1594 1595 // Delete the task and if it is the last one also delete the alloc's 1596 // registration 1597 delete(alloc.Tasks, taskName) 1598 if len(alloc.Tasks) == 0 { 1599 delete(c.allocRegistrations, allocID) 1600 } 1601 } 1602 1603 // makeAgentServiceID creates a unique ID for identifying an agent service in 1604 // Consul. 
1605 // 1606 // Agent service IDs are of the form: 1607 // 1608 // {nomadServicePrefix}-{ROLE}-b32(sha1({Service.Name}-{Service.Tags...}) 1609 // Example Server ID: _nomad-server-fbbk265qn4tmt25nd4ep42tjvmyj3hr4 1610 // Example Client ID: _nomad-client-ggnjpgl7yn7rgmvxzilmpvrzzvrszc7l 1611 func makeAgentServiceID(role string, service *structs.Service) string { 1612 return fmt.Sprintf("%s-%s-%s", nomadServicePrefix, role, service.Hash(role, "", false)) 1613 } 1614 1615 // MakeCheckID creates a unique ID for a check. 1616 // 1617 // Example Check ID: _nomad-check-434ae42f9a57c5705344974ac38de2aee0ee089d 1618 func MakeCheckID(serviceID string, check *structs.ServiceCheck) string { 1619 return fmt.Sprintf("%s%s", nomadCheckPrefix, check.Hash(serviceID)) 1620 } 1621 1622 // createCheckReg creates a Check that can be registered with Consul. 1623 // 1624 // Script checks simply have a TTL set and the caller is responsible for 1625 // running the script and heart-beating. 1626 func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host string, port int, namespace string) (*api.AgentCheckRegistration, error) { 1627 chkReg := api.AgentCheckRegistration{ 1628 ID: checkID, 1629 Name: check.Name, 1630 ServiceID: serviceID, 1631 Namespace: normalizeNamespace(namespace), 1632 } 1633 chkReg.Status = check.InitialStatus 1634 chkReg.Timeout = check.Timeout.String() 1635 chkReg.Interval = check.Interval.String() 1636 chkReg.SuccessBeforePassing = check.SuccessBeforePassing 1637 chkReg.FailuresBeforeCritical = check.FailuresBeforeCritical 1638 1639 // Require an address for http or tcp checks 1640 if port == 0 && check.RequiresPort() { 1641 return nil, fmt.Errorf("%s checks require an address", check.Type) 1642 } 1643 1644 switch check.Type { 1645 case structs.ServiceCheckHTTP: 1646 proto := check.Protocol 1647 if proto == "" { 1648 proto = "http" 1649 } 1650 if check.TLSSkipVerify { 1651 chkReg.TLSSkipVerify = true 1652 } 1653 base := url.URL{ 1654 Scheme: proto, 
1655 Host: net.JoinHostPort(host, strconv.Itoa(port)), 1656 } 1657 relative, err := url.Parse(check.Path) 1658 if err != nil { 1659 return nil, err 1660 } 1661 checkURL := base.ResolveReference(relative) 1662 chkReg.HTTP = checkURL.String() 1663 chkReg.Method = check.Method 1664 chkReg.Header = check.Header 1665 chkReg.Body = check.Body 1666 1667 case structs.ServiceCheckTCP: 1668 chkReg.TCP = net.JoinHostPort(host, strconv.Itoa(port)) 1669 1670 case structs.ServiceCheckScript: 1671 chkReg.TTL = (check.Interval + ttlCheckBuffer).String() 1672 // As of Consul 1.0.0 setting TTL and Interval is a 400 1673 chkReg.Interval = "" 1674 1675 case structs.ServiceCheckGRPC: 1676 chkReg.GRPC = fmt.Sprintf("%s/%s", net.JoinHostPort(host, strconv.Itoa(port)), check.GRPCService) 1677 chkReg.GRPCUseTLS = check.GRPCUseTLS 1678 if check.TLSSkipVerify { 1679 chkReg.TLSSkipVerify = true 1680 } 1681 1682 default: 1683 return nil, fmt.Errorf("check type %+q not valid", check.Type) 1684 } 1685 return &chkReg, nil 1686 } 1687 1688 // isNomadClient returns true if id represents a Nomad Client registration. 1689 func isNomadClient(id string) bool { 1690 return strings.HasPrefix(id, nomadClientPrefix) 1691 } 1692 1693 // isNomadServer returns true if id represents a Nomad Server registration. 1694 func isNomadServer(id string) bool { 1695 return strings.HasPrefix(id, nomadServerPrefix) 1696 } 1697 1698 // isNomadAgent returns true if id represents a Nomad Client or Server registration. 1699 func isNomadAgent(id string) bool { 1700 return isNomadClient(id) || isNomadServer(id) 1701 } 1702 1703 // isNomadService returns true if the ID matches the pattern of a Nomad managed 1704 // service (new or old formats). Agent services return false as independent 1705 // client and server agents may be running on the same machine. 
#2827 1706 func isNomadService(id string) bool { 1707 return strings.HasPrefix(id, nomadTaskPrefix) || isOldNomadService(id) 1708 } 1709 1710 // isNomadCheck returns true if the ID matches the pattern of a Nomad managed 1711 // check. 1712 func isNomadCheck(id string) bool { 1713 return strings.HasPrefix(id, nomadCheckPrefix) 1714 } 1715 1716 // isOldNomadService returns true if the ID matches an old pattern managed by 1717 // Nomad. 1718 // 1719 // Pre-0.7.1 task service IDs are of the form: 1720 // 1721 // {nomadServicePrefix}-executor-{ALLOC_ID}-{Service.Name}-{Service.Tags...} 1722 // Example Service ID: _nomad-executor-1234-echo-http-tag1-tag2-tag3 1723 func isOldNomadService(id string) bool { 1724 const prefix = nomadServicePrefix + "-executor" 1725 return strings.HasPrefix(id, prefix) 1726 } 1727 1728 const ( 1729 sidecarSuffix = "-sidecar-proxy" 1730 ) 1731 1732 // maybeConnectSidecar returns true if the ID is likely of a Connect sidecar proxy. 1733 // This function should only be used to determine if Nomad should skip managing 1734 // service id; it could produce false negatives for non-Nomad managed services 1735 // (i.e. someone set the ID manually), but Nomad does not manage those anyway. 1736 // 1737 // It is important not to reference the parent service, which may or may not still 1738 // be tracked by Nomad internally. 
1739 // 1740 // For example if you have a Connect enabled service with the ID: 1741 // 1742 // _nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db 1743 // 1744 // Consul will create a service for the sidecar proxy with the ID: 1745 // 1746 // _nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db-sidecar-proxy 1747 func maybeConnectSidecar(id string) bool { 1748 return strings.HasSuffix(id, sidecarSuffix) 1749 } 1750 1751 var ( 1752 sidecarProxyCheckRe = regexp.MustCompile(`^service:_nomad-.+-sidecar-proxy(:[\d]+)?$`) 1753 ) 1754 1755 // maybeSidecarProxyCheck returns true if the ID likely matches a Nomad generated 1756 // check ID used in the context of a Nomad managed Connect sidecar proxy. This function 1757 // should only be used to determine if Nomad should skip managing a check; it can 1758 // produce false negatives for non-Nomad managed Connect sidecar proxy checks (i.e. 1759 // someone set the ID manually), but Nomad does not manage those anyway. 
1760 // 1761 // For example if you have a Connect enabled service with the ID: 1762 // 1763 // _nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db 1764 // 1765 // Nomad will create a Connect sidecar proxy of ID: 1766 // 1767 // _nomad-task-5229c7f8-376b-3ccc-edd9-981e238f7033-cache-redis-cache-db-sidecar-proxy 1768 // 1769 // With default checks like: 1770 // 1771 // service:_nomad-task-2f5fb517-57d4-44ee-7780-dc1cb6e103cd-group-api-count-api-9001-sidecar-proxy:1 1772 // service:_nomad-task-2f5fb517-57d4-44ee-7780-dc1cb6e103cd-group-api-count-api-9001-sidecar-proxy:2 1773 // 1774 // Unless sidecar_service.disable_default_tcp_check is set, in which case the 1775 // default check is: 1776 // 1777 // service:_nomad-task-322616db-2680-35d8-0d10-b50a0a0aa4cd-group-api-count-api-9001-sidecar-proxy 1778 func maybeSidecarProxyCheck(id string) bool { 1779 return sidecarProxyCheckRe.MatchString(id) 1780 } 1781 1782 // getNomadSidecar returns the service registration of the sidecar for the managed 1783 // service with the specified id. 1784 // 1785 // If the managed service of the specified id does not exist, or the service does 1786 // not have a sidecar proxy, nil is returned. 1787 func getNomadSidecar(id string, services map[string]*api.AgentService) *api.AgentService { 1788 if _, exists := services[id]; !exists { 1789 return nil 1790 } 1791 1792 sidecarID := id + sidecarSuffix 1793 return services[sidecarID] 1794 } 1795 1796 func parseAddress(raw string, port int) (api.ServiceAddress, error) { 1797 result := api.ServiceAddress{} 1798 addr, portStr, err := net.SplitHostPort(raw) 1799 // Error message from Go's net/ipsock.go 1800 if err != nil { 1801 if !strings.Contains(err.Error(), "missing port in address") { 1802 return result, fmt.Errorf("error parsing address %q: %v", raw, err) 1803 } 1804 1805 // Use the whole input as the address if there wasn't a port. 
1806 if ip := net.ParseIP(raw); ip == nil { 1807 return result, fmt.Errorf("error parsing address %q: not an IP address", raw) 1808 } 1809 addr = raw 1810 } 1811 1812 if portStr != "" { 1813 port, err = strconv.Atoi(portStr) 1814 if err != nil { 1815 return result, fmt.Errorf("error parsing port %q: %v", portStr, err) 1816 } 1817 } 1818 1819 result.Address = addr 1820 result.Port = port 1821 return result, nil 1822 } 1823 1824 // morph the tagged_addresses map into the structure consul api wants 1825 func parseTaggedAddresses(m map[string]string, port int) (map[string]api.ServiceAddress, error) { 1826 result := make(map[string]api.ServiceAddress, len(m)) 1827 for k, v := range m { 1828 sa, err := parseAddress(v, port) 1829 if err != nil { 1830 return nil, err 1831 } 1832 result[k] = sa 1833 } 1834 return result, nil 1835 }