github.com/sl1pm4t/consul@v1.4.5-0.20190325224627-74c31c540f9c/agent/local/state.go (about) 1 package local 2 3 import ( 4 "fmt" 5 "log" 6 "math/rand" 7 "reflect" 8 "strconv" 9 "strings" 10 "sync" 11 "sync/atomic" 12 "time" 13 14 metrics "github.com/armon/go-metrics" 15 16 "github.com/hashicorp/consul/acl" 17 "github.com/hashicorp/consul/agent/structs" 18 "github.com/hashicorp/consul/agent/token" 19 "github.com/hashicorp/consul/api" 20 "github.com/hashicorp/consul/lib" 21 "github.com/hashicorp/consul/types" 22 uuid "github.com/hashicorp/go-uuid" 23 ) 24 25 // Config is the configuration for the State. 26 type Config struct { 27 AdvertiseAddr string 28 CheckUpdateInterval time.Duration 29 Datacenter string 30 DiscardCheckOutput bool 31 NodeID types.NodeID 32 NodeName string 33 TaggedAddresses map[string]string 34 ProxyBindMinPort int 35 ProxyBindMaxPort int 36 } 37 38 // ServiceState describes the state of a service record. 39 type ServiceState struct { 40 // Service is the local copy of the service record. 41 Service *structs.NodeService 42 43 // Token is the ACL to update or delete the service record on the 44 // server. 45 Token string 46 47 // InSync contains whether the local state of the service record 48 // is in sync with the remote state on the server. 49 InSync bool 50 51 // Deleted is true when the service record has been marked as deleted 52 // but has not been removed on the server yet. 53 Deleted bool 54 55 // WatchCh is closed when the service state changes suitable for use in a 56 // memdb.WatchSet when watching agent local changes with hash-based blocking. 57 WatchCh chan struct{} 58 } 59 60 // Clone returns a shallow copy of the object. The service record still points 61 // to the original service record and must not be modified. The WatchCh is also 62 // still pointing to the original so the clone will be update when the original 63 // is. 64 func (s *ServiceState) Clone() *ServiceState { 65 s2 := new(ServiceState) 66 *s2 = *s 67 return s2 68 } 69 70 // CheckState describes the state of a health check record. 71 type CheckState struct { 72 // Check is the local copy of the health check record. 73 Check *structs.HealthCheck 74 75 // Token is the ACL record to update or delete the health check 76 // record on the server. 77 Token string 78 79 // CriticalTime is the last time the health check status went 80 // from non-critical to critical. When the health check is not 81 // in critical state the value is the zero value. 82 CriticalTime time.Time 83 84 // DeferCheck is used to delay the sync of a health check when 85 // only the output has changed. This rate limits changes which 86 // do not affect the state of the node and/or service. 87 DeferCheck *time.Timer 88 89 // InSync contains whether the local state of the health check 90 // record is in sync with the remote state on the server. 91 InSync bool 92 93 // Deleted is true when the health check record has been marked as 94 // deleted but has not been removed on the server yet. 95 Deleted bool 96 } 97 98 // Clone returns a shallow copy of the object. The check record and the 99 // defer timer still point to the original values and must not be 100 // modified. 101 func (c *CheckState) Clone() *CheckState { 102 c2 := new(CheckState) 103 *c2 = *c 104 return c2 105 } 106 107 // Critical returns true when the health check is in critical state. 108 func (c *CheckState) Critical() bool { 109 return !c.CriticalTime.IsZero() 110 } 111 112 // CriticalFor returns the amount of time the service has been in critical 113 // state. Its value is undefined when the service is not in critical state. 114 func (c *CheckState) CriticalFor() time.Duration { 115 return time.Since(c.CriticalTime) 116 } 117 118 type rpc interface { 119 RPC(method string, args interface{}, reply interface{}) error 120 } 121 122 // ManagedProxy represents the local state for a registered proxy instance. 123 type ManagedProxy struct { 124 Proxy *structs.ConnectManagedProxy 125 126 // ProxyToken is a special local-only security token that grants the bearer 127 // access to the proxy's config as well as allowing it to request certificates 128 // on behalf of the target service. Certain connect endpoints will validate 129 // against this token and if it matches will then use the target service's 130 // registration token to actually authenticate the upstream RPC on behalf of 131 // the service. This token is passed securely to the proxy process via ENV 132 // vars and should never be exposed any other way. Unmanaged proxies will 133 // never see this and need to use service-scoped ACL tokens distributed 134 // externally. It is persisted in the local state to allow authenticating 135 // running proxies after the agent restarts. 136 // 137 // TODO(banks): In theory we only need to persist this at all to _validate_ 138 // which means we could keep only a hash in memory and on disk and only pass 139 // the actual token to the process on startup. That would require a bit of 140 // refactoring though to have the required interaction with the proxy manager. 141 ProxyToken string 142 143 // WatchCh is a close-only chan that is closed when the proxy is removed or 144 // updated. 145 WatchCh chan struct{} 146 } 147 148 // State is used to represent the node's services, 149 // and checks. We use it to perform anti-entropy with the 150 // catalog representation 151 type State struct { 152 sync.RWMutex 153 154 // Delegate the RPC interface to the consul server or agent. 155 // 156 // It is set after both the state and the consul server/agent have 157 // been created. 158 Delegate rpc 159 160 // TriggerSyncChanges is used to notify the state syncer that a 161 // partial sync should be performed. 162 // 163 // It is set after both the state and the state syncer have been 164 // created. 165 TriggerSyncChanges func() 166 167 logger *log.Logger 168 169 // Config is the agent config 170 config Config 171 172 // nodeInfoInSync tracks whether the server has our correct top-level 173 // node information in sync 174 nodeInfoInSync bool 175 176 // Services tracks the local services 177 services map[string]*ServiceState 178 179 // Checks tracks the local checks. checkAliases are aliased checks. 180 checks map[types.CheckID]*CheckState 181 checkAliases map[string]map[types.CheckID]chan<- struct{} 182 183 // metadata tracks the node metadata fields 184 metadata map[string]string 185 186 // discardCheckOutput stores whether the output of health checks 187 // is stored in the raft log. 188 discardCheckOutput atomic.Value // bool 189 190 // tokens contains the ACL tokens 191 tokens *token.Store 192 193 // notifyHandlers is a map of registered channel listeners that are sent 194 // messages whenever state changes occur. For now these events only include 195 // service registration and deregistration since that is all that is needed 196 // but the same mechanism could be used for other state changes. 197 // 198 // Note that we haven't refactored managedProxyHandlers into this mechanism 199 // yet because that is soon to be deprecated and removed so it's easier to 200 // just leave them separate until managed proxies are removed entirely. Any 201 // future notifications should re-use this mechanism though. 202 notifyHandlers map[chan<- struct{}]struct{} 203 204 // managedProxies is a map of all managed connect proxies registered locally on 205 // this agent. This is NOT kept in sync with servers since it's agent-local 206 // config only. Proxy instances have separate service registrations in the 207 // services map above which are kept in sync via anti-entropy. Un-managed 208 // proxies (that registered themselves separately from the service 209 // registration) do not appear here as the agent doesn't need to manage their 210 // process nor config. The _do_ still exist in services above though as 211 // services with Kind == connect-proxy. 212 // 213 // managedProxyHandlers is a map of registered channel listeners that 214 // are sent a message each time a proxy changes via Add or RemoveProxy. 215 managedProxies map[string]*ManagedProxy 216 managedProxyHandlers map[chan<- struct{}]struct{} 217 } 218 219 // NewState creates a new local state for the agent. 220 func NewState(c Config, lg *log.Logger, tokens *token.Store) *State { 221 l := &State{ 222 config: c, 223 logger: lg, 224 services: make(map[string]*ServiceState), 225 checks: make(map[types.CheckID]*CheckState), 226 checkAliases: make(map[string]map[types.CheckID]chan<- struct{}), 227 metadata: make(map[string]string), 228 tokens: tokens, 229 notifyHandlers: make(map[chan<- struct{}]struct{}), 230 managedProxies: make(map[string]*ManagedProxy), 231 managedProxyHandlers: make(map[chan<- struct{}]struct{}), 232 } 233 l.SetDiscardCheckOutput(c.DiscardCheckOutput) 234 return l 235 } 236 237 // SetDiscardCheckOutput configures whether the check output 238 // is discarded. This can be changed at runtime. 239 func (l *State) SetDiscardCheckOutput(b bool) { 240 l.discardCheckOutput.Store(b) 241 } 242 243 // ServiceToken returns the configured ACL token for the given 244 // service ID. If none is present, the agent's token is returned. 245 func (l *State) ServiceToken(id string) string { 246 l.RLock() 247 defer l.RUnlock() 248 return l.serviceToken(id) 249 } 250 251 // serviceToken returns an ACL token associated with a service. 252 // This method is not synchronized and the lock must already be held. 253 func (l *State) serviceToken(id string) string { 254 var token string 255 if s := l.services[id]; s != nil { 256 token = s.Token 257 } 258 if token == "" { 259 token = l.tokens.UserToken() 260 } 261 return token 262 } 263 264 // AddService is used to add a service entry to the local state. 265 // This entry is persistent and the agent will make a best effort to 266 // ensure it is registered 267 func (l *State) AddService(service *structs.NodeService, token string) error { 268 l.Lock() 269 defer l.Unlock() 270 return l.addServiceLocked(service, token) 271 } 272 273 func (l *State) addServiceLocked(service *structs.NodeService, token string) error { 274 if service == nil { 275 return fmt.Errorf("no service") 276 } 277 278 // use the service name as id if the id was omitted 279 if service.ID == "" { 280 service.ID = service.Service 281 } 282 283 l.setServiceStateLocked(&ServiceState{ 284 Service: service, 285 Token: token, 286 }) 287 return nil 288 } 289 290 // AddServiceWithChecks adds a service and its check tp the local state atomically 291 func (l *State) AddServiceWithChecks(service *structs.NodeService, checks []*structs.HealthCheck, token string) error { 292 l.Lock() 293 defer l.Unlock() 294 295 if err := l.addServiceLocked(service, token); err != nil { 296 return err 297 } 298 299 for _, check := range checks { 300 if err := l.addCheckLocked(check, token); err != nil { 301 return err 302 } 303 } 304 305 return nil 306 } 307 308 // RemoveService is used to remove a service entry from the local state. 309 // The agent will make a best effort to ensure it is deregistered. 310 func (l *State) RemoveService(id string) error { 311 l.Lock() 312 defer l.Unlock() 313 return l.removeServiceLocked(id) 314 } 315 316 // RemoveServiceWithChecks removes a service and its check from the local state atomically 317 func (l *State) RemoveServiceWithChecks(serviceID string, checkIDs []types.CheckID) error { 318 l.Lock() 319 defer l.Unlock() 320 321 if err := l.removeServiceLocked(serviceID); err != nil { 322 return err 323 } 324 325 for _, id := range checkIDs { 326 if err := l.removeCheckLocked(id); err != nil { 327 return err 328 } 329 } 330 331 return nil 332 } 333 334 func (l *State) removeServiceLocked(id string) error { 335 336 s := l.services[id] 337 if s == nil || s.Deleted { 338 return fmt.Errorf("Service %q does not exist", id) 339 } 340 341 // To remove the service on the server we need the token. 342 // Therefore, we mark the service as deleted and keep the 343 // entry around until it is actually removed. 344 s.InSync = false 345 s.Deleted = true 346 if s.WatchCh != nil { 347 close(s.WatchCh) 348 s.WatchCh = nil 349 } 350 l.TriggerSyncChanges() 351 l.broadcastUpdateLocked() 352 353 return nil 354 } 355 356 // Service returns the locally registered service that the 357 // agent is aware of and are being kept in sync with the server 358 func (l *State) Service(id string) *structs.NodeService { 359 l.RLock() 360 defer l.RUnlock() 361 362 s := l.services[id] 363 if s == nil || s.Deleted { 364 return nil 365 } 366 return s.Service 367 } 368 369 // Services returns the locally registered services that the 370 // agent is aware of and are being kept in sync with the server 371 func (l *State) Services() map[string]*structs.NodeService { 372 l.RLock() 373 defer l.RUnlock() 374 375 m := make(map[string]*structs.NodeService) 376 for id, s := range l.services { 377 if s.Deleted { 378 continue 379 } 380 m[id] = s.Service 381 } 382 return m 383 } 384 385 // ServiceState returns a shallow copy of the current service state record. The 386 // service record still points to the original service record and must not be 387 // modified. The WatchCh for the copy returned will also be closed when the 388 // actual service state is changed. 389 func (l *State) ServiceState(id string) *ServiceState { 390 l.RLock() 391 defer l.RUnlock() 392 393 s := l.services[id] 394 if s == nil || s.Deleted { 395 return nil 396 } 397 return s.Clone() 398 } 399 400 // SetServiceState is used to overwrite a raw service state with the given 401 // state. This method is safe to be called concurrently but should only be used 402 // during testing. You should most likely call AddService instead. 403 func (l *State) SetServiceState(s *ServiceState) { 404 l.Lock() 405 defer l.Unlock() 406 407 l.setServiceStateLocked(s) 408 } 409 410 func (l *State) setServiceStateLocked(s *ServiceState) { 411 s.WatchCh = make(chan struct{}) 412 413 old, hasOld := l.services[s.Service.ID] 414 l.services[s.Service.ID] = s 415 416 if hasOld && old.WatchCh != nil { 417 close(old.WatchCh) 418 } 419 420 l.TriggerSyncChanges() 421 l.broadcastUpdateLocked() 422 } 423 424 // ServiceStates returns a shallow copy of all service state records. 425 // The service record still points to the original service record and 426 // must not be modified. 427 func (l *State) ServiceStates() map[string]*ServiceState { 428 l.RLock() 429 defer l.RUnlock() 430 431 m := make(map[string]*ServiceState) 432 for id, s := range l.services { 433 if s.Deleted { 434 continue 435 } 436 m[id] = s.Clone() 437 } 438 return m 439 } 440 441 // CheckToken is used to return the configured health check token for a 442 // Check, or if none is configured, the default agent ACL token. 443 func (l *State) CheckToken(checkID types.CheckID) string { 444 l.RLock() 445 defer l.RUnlock() 446 return l.checkToken(checkID) 447 } 448 449 // checkToken returns an ACL token associated with a check. 450 // This method is not synchronized and the lock must already be held. 451 func (l *State) checkToken(id types.CheckID) string { 452 var token string 453 c := l.checks[id] 454 if c != nil { 455 token = c.Token 456 } 457 if token == "" { 458 token = l.tokens.UserToken() 459 } 460 return token 461 } 462 463 // AddCheck is used to add a health check to the local state. 464 // This entry is persistent and the agent will make a best effort to 465 // ensure it is registered 466 func (l *State) AddCheck(check *structs.HealthCheck, token string) error { 467 l.Lock() 468 defer l.Unlock() 469 470 return l.addCheckLocked(check, token) 471 } 472 473 func (l *State) addCheckLocked(check *structs.HealthCheck, token string) error { 474 if check == nil { 475 return fmt.Errorf("no check") 476 } 477 478 // clone the check since we will be modifying it. 479 check = check.Clone() 480 481 if l.discardCheckOutput.Load().(bool) { 482 check.Output = "" 483 } 484 485 // if there is a serviceID associated with the check, make sure it exists before adding it 486 // NOTE - This logic may be moved to be handled within the Agent's Addcheck method after a refactor 487 if _, ok := l.services[check.ServiceID]; check.ServiceID != "" && !ok { 488 return fmt.Errorf("Check %q refers to non-existent service %q", check.CheckID, check.ServiceID) 489 } 490 491 // hard-set the node name 492 check.Node = l.config.NodeName 493 494 l.setCheckStateLocked(&CheckState{ 495 Check: check, 496 Token: token, 497 }) 498 return nil 499 } 500 501 // AddAliasCheck creates an alias check. When any check for the srcServiceID is 502 // changed, checkID will reflect that using the same semantics as 503 // checks.CheckAlias. 504 // 505 // This is a local optimization so that the Alias check doesn't need to use 506 // blocking queries against the remote server for check updates for local 507 // services. 508 func (l *State) AddAliasCheck(checkID types.CheckID, srcServiceID string, notifyCh chan<- struct{}) error { 509 l.Lock() 510 defer l.Unlock() 511 512 m, ok := l.checkAliases[srcServiceID] 513 if !ok { 514 m = make(map[types.CheckID]chan<- struct{}) 515 l.checkAliases[srcServiceID] = m 516 } 517 m[checkID] = notifyCh 518 519 return nil 520 } 521 522 // RemoveAliasCheck removes the mapping for the alias check. 523 func (l *State) RemoveAliasCheck(checkID types.CheckID, srcServiceID string) { 524 l.Lock() 525 defer l.Unlock() 526 527 if m, ok := l.checkAliases[srcServiceID]; ok { 528 delete(m, checkID) 529 if len(m) == 0 { 530 delete(l.checkAliases, srcServiceID) 531 } 532 } 533 } 534 535 // RemoveCheck is used to remove a health check from the local state. 536 // The agent will make a best effort to ensure it is deregistered 537 // todo(fs): RemoveService returns an error for a non-existent service. RemoveCheck should as well. 538 // todo(fs): Check code that calls this to handle the error. 539 func (l *State) RemoveCheck(id types.CheckID) error { 540 l.Lock() 541 defer l.Unlock() 542 return l.removeCheckLocked(id) 543 } 544 545 func (l *State) removeCheckLocked(id types.CheckID) error { 546 c := l.checks[id] 547 if c == nil || c.Deleted { 548 return fmt.Errorf("Check %q does not exist", id) 549 } 550 551 // To remove the check on the server we need the token. 552 // Therefore, we mark the service as deleted and keep the 553 // entry around until it is actually removed. 554 c.InSync = false 555 c.Deleted = true 556 l.TriggerSyncChanges() 557 558 return nil 559 } 560 561 // UpdateCheck is used to update the status of a check 562 func (l *State) UpdateCheck(id types.CheckID, status, output string) { 563 l.Lock() 564 defer l.Unlock() 565 566 c := l.checks[id] 567 if c == nil || c.Deleted { 568 return 569 } 570 571 if l.discardCheckOutput.Load().(bool) { 572 output = "" 573 } 574 575 // Update the critical time tracking (this doesn't cause a server updates 576 // so we can always keep this up to date). 577 if status == api.HealthCritical { 578 if !c.Critical() { 579 c.CriticalTime = time.Now() 580 } 581 } else { 582 c.CriticalTime = time.Time{} 583 } 584 585 // Do nothing if update is idempotent 586 if c.Check.Status == status && c.Check.Output == output { 587 return 588 } 589 590 // Defer a sync if the output has changed. This is an optimization around 591 // frequent updates of output. Instead, we update the output internally, 592 // and periodically do a write-back to the servers. If there is a status 593 // change we do the write immediately. 594 if l.config.CheckUpdateInterval > 0 && c.Check.Status == status { 595 c.Check.Output = output 596 if c.DeferCheck == nil { 597 d := l.config.CheckUpdateInterval 598 intv := time.Duration(uint64(d)/2) + lib.RandomStagger(d) 599 c.DeferCheck = time.AfterFunc(intv, func() { 600 l.Lock() 601 defer l.Unlock() 602 603 c := l.checks[id] 604 if c == nil { 605 return 606 } 607 c.DeferCheck = nil 608 if c.Deleted { 609 return 610 } 611 c.InSync = false 612 l.TriggerSyncChanges() 613 }) 614 } 615 return 616 } 617 618 // If this is a check for an aliased service, then notify the waiters. 619 if aliases, ok := l.checkAliases[c.Check.ServiceID]; ok && len(aliases) > 0 { 620 for _, notifyCh := range aliases { 621 // Do not block. All notify channels should be buffered to at 622 // least 1 in which case not-blocking does not result in loss 623 // of data because a failed send means a notification is 624 // already queued. This must be called with the lock held. 625 select { 626 case notifyCh <- struct{}{}: 627 default: 628 } 629 } 630 } 631 632 // Update status and mark out of sync 633 c.Check.Status = status 634 c.Check.Output = output 635 c.InSync = false 636 l.TriggerSyncChanges() 637 } 638 639 // Check returns the locally registered check that the 640 // agent is aware of and are being kept in sync with the server 641 func (l *State) Check(id types.CheckID) *structs.HealthCheck { 642 l.RLock() 643 defer l.RUnlock() 644 645 c := l.checks[id] 646 if c == nil || c.Deleted { 647 return nil 648 } 649 return c.Check 650 } 651 652 // Checks returns the locally registered checks that the 653 // agent is aware of and are being kept in sync with the server 654 func (l *State) Checks() map[types.CheckID]*structs.HealthCheck { 655 m := make(map[types.CheckID]*structs.HealthCheck) 656 for id, c := range l.CheckStates() { 657 m[id] = c.Check 658 } 659 return m 660 } 661 662 // CheckState returns a shallow copy of the current health check state 663 // record. The health check record and the deferred check still point to 664 // the original values and must not be modified. 665 func (l *State) CheckState(id types.CheckID) *CheckState { 666 l.RLock() 667 defer l.RUnlock() 668 669 c := l.checks[id] 670 if c == nil || c.Deleted { 671 return nil 672 } 673 return c.Clone() 674 } 675 676 // SetCheckState is used to overwrite a raw check state with the given 677 // state. This method is safe to be called concurrently but should only be used 678 // during testing. You should most likely call AddCheck instead. 679 func (l *State) SetCheckState(c *CheckState) { 680 l.Lock() 681 defer l.Unlock() 682 683 l.setCheckStateLocked(c) 684 } 685 686 func (l *State) setCheckStateLocked(c *CheckState) { 687 l.checks[c.Check.CheckID] = c 688 l.TriggerSyncChanges() 689 } 690 691 // CheckStates returns a shallow copy of all health check state records. 692 // The health check records and the deferred checks still point to 693 // the original values and must not be modified. 694 func (l *State) CheckStates() map[types.CheckID]*CheckState { 695 l.RLock() 696 defer l.RUnlock() 697 698 m := make(map[types.CheckID]*CheckState) 699 for id, c := range l.checks { 700 if c.Deleted { 701 continue 702 } 703 m[id] = c.Clone() 704 } 705 return m 706 } 707 708 // CriticalCheckStates returns the locally registered checks that the 709 // agent is aware of and are being kept in sync with the server. 710 // The map contains a shallow copy of the current check states but 711 // references to the actual check definition which must not be 712 // modified. 713 func (l *State) CriticalCheckStates() map[types.CheckID]*CheckState { 714 l.RLock() 715 defer l.RUnlock() 716 717 m := make(map[types.CheckID]*CheckState) 718 for id, c := range l.checks { 719 if c.Deleted || !c.Critical() { 720 continue 721 } 722 m[id] = c.Clone() 723 } 724 return m 725 } 726 727 // AddProxy is used to add a connect proxy entry to the local state. This 728 // assumes the proxy's NodeService is already registered via Agent.AddService 729 // (since that has to do other book keeping). The token passed here is the ACL 730 // token the service used to register itself so must have write on service 731 // record. AddProxy returns the newly added proxy and an error. 732 // 733 // The restoredProxyToken argument should only be used when restoring proxy 734 // definitions from disk; new proxies must leave it blank to get a new token 735 // assigned. We need to restore from disk to enable to continue authenticating 736 // running proxies that already had that credential injected. 737 func (l *State) AddProxy(proxy *structs.ConnectManagedProxy, token, 738 restoredProxyToken string) (*ManagedProxy, error) { 739 if proxy == nil { 740 return nil, fmt.Errorf("no proxy") 741 } 742 743 // Lookup the local service 744 target := l.Service(proxy.TargetServiceID) 745 if target == nil { 746 return nil, fmt.Errorf("target service ID %s not registered", 747 proxy.TargetServiceID) 748 } 749 750 // Get bind info from config 751 cfg, err := proxy.ParseConfig() 752 if err != nil { 753 return nil, err 754 } 755 756 // Construct almost all of the NodeService that needs to be registered by the 757 // caller outside of the lock. 758 svc := &structs.NodeService{ 759 Kind: structs.ServiceKindConnectProxy, 760 ID: target.ID + "-proxy", 761 Service: target.Service + "-proxy", 762 Proxy: structs.ConnectProxyConfig{ 763 DestinationServiceName: target.Service, 764 LocalServiceAddress: cfg.LocalServiceAddress, 765 LocalServicePort: cfg.LocalServicePort, 766 }, 767 Address: cfg.BindAddress, 768 Port: cfg.BindPort, 769 } 770 771 // Set default port now while the target is known 772 if svc.Proxy.LocalServicePort < 1 { 773 svc.Proxy.LocalServicePort = target.Port 774 } 775 776 // Lock now. We can't lock earlier as l.Service would deadlock and shouldn't 777 // anyway to minimize the critical section. 778 l.Lock() 779 defer l.Unlock() 780 781 pToken := restoredProxyToken 782 783 // Does this proxy instance already exist? 784 if existing, ok := l.managedProxies[svc.ID]; ok { 785 // Keep the existing proxy token so we don't have to restart proxy to 786 // re-inject token. 787 pToken = existing.ProxyToken 788 // If the user didn't explicitly change the port, use the old one instead of 789 // assigning new. 790 if svc.Port < 1 { 791 svc.Port = existing.Proxy.ProxyService.Port 792 } 793 } else if proxyService, ok := l.services[svc.ID]; ok { 794 // The proxy-service already exists so keep the port that got assigned. This 795 // happens on reload from disk since service definitions are reloaded first. 796 svc.Port = proxyService.Service.Port 797 } 798 799 // If this is a new instance, generate a token 800 if pToken == "" { 801 pToken, err = uuid.GenerateUUID() 802 if err != nil { 803 return nil, err 804 } 805 } 806 807 // Allocate port if needed (min and max inclusive). 808 rangeLen := l.config.ProxyBindMaxPort - l.config.ProxyBindMinPort + 1 809 if svc.Port < 1 && l.config.ProxyBindMinPort > 0 && rangeLen > 0 { 810 // This should be a really short list so don't bother optimizing lookup yet. 811 OUTER: 812 for _, offset := range rand.Perm(rangeLen) { 813 p := l.config.ProxyBindMinPort + offset 814 // See if this port was already allocated to another proxy 815 for _, other := range l.managedProxies { 816 if other.Proxy.ProxyService.Port == p { 817 // already taken, skip to next random pick in the range 818 continue OUTER 819 } 820 } 821 // We made it through all existing proxies without a match so claim this one 822 svc.Port = p 823 break 824 } 825 } 826 // If no ports left (or auto ports disabled) fail 827 if svc.Port < 1 { 828 return nil, fmt.Errorf("no port provided for proxy bind_port and none "+ 829 " left in the allocated range [%d, %d]", l.config.ProxyBindMinPort, 830 l.config.ProxyBindMaxPort) 831 } 832 833 proxy.ProxyService = svc 834 835 // All set, add the proxy and return the service 836 if old, ok := l.managedProxies[svc.ID]; ok { 837 // Notify watchers of the existing proxy config that it's changing. Note 838 // this is safe here even before the map is updated since we still hold the 839 // state lock and the watcher can't re-read the new config until we return 840 // anyway. 841 close(old.WatchCh) 842 } 843 l.managedProxies[svc.ID] = &ManagedProxy{ 844 Proxy: proxy, 845 ProxyToken: pToken, 846 WatchCh: make(chan struct{}), 847 } 848 849 // Notify 850 for ch := range l.managedProxyHandlers { 851 // Do not block 852 select { 853 case ch <- struct{}{}: 854 default: 855 } 856 } 857 858 // No need to trigger sync as proxy state is local only. 859 return l.managedProxies[svc.ID], nil 860 } 861 862 // RemoveProxy is used to remove a proxy entry from the local state. 863 // This returns the proxy that was removed. 864 func (l *State) RemoveProxy(id string) (*ManagedProxy, error) { 865 l.Lock() 866 defer l.Unlock() 867 868 p := l.managedProxies[id] 869 if p == nil { 870 return nil, fmt.Errorf("Proxy %s does not exist", id) 871 } 872 delete(l.managedProxies, id) 873 874 // Notify watchers of the existing proxy config that it's changed. 875 close(p.WatchCh) 876 877 // Notify 878 for ch := range l.managedProxyHandlers { 879 // Do not block 880 select { 881 case ch <- struct{}{}: 882 default: 883 } 884 } 885 886 // No need to trigger sync as proxy state is local only. 887 return p, nil 888 } 889 890 // Proxy returns the local proxy state. 891 func (l *State) Proxy(id string) *ManagedProxy { 892 l.RLock() 893 defer l.RUnlock() 894 return l.managedProxies[id] 895 } 896 897 // Proxies returns the locally registered proxies. 898 func (l *State) Proxies() map[string]*ManagedProxy { 899 l.RLock() 900 defer l.RUnlock() 901 902 m := make(map[string]*ManagedProxy) 903 for id, p := range l.managedProxies { 904 m[id] = p 905 } 906 return m 907 } 908 909 // broadcastUpdateLocked assumes l is locked and delivers an update to all 910 // registered watchers. 911 func (l *State) broadcastUpdateLocked() { 912 for ch := range l.notifyHandlers { 913 // Do not block 914 select { 915 case ch <- struct{}{}: 916 default: 917 } 918 } 919 } 920 921 // Notify will register a channel to receive messages when the local state 922 // changes. Only service add/remove are supported for now. See notes on 923 // l.notifyHandlers for more details. 924 // 925 // This will not block on channel send so ensure the channel has a buffer. Note 926 // that any buffer size is generally fine since actual data is not sent over the 927 // channel, so a dropped send due to a full buffer does not result in any loss 928 // of data. The fact that a buffer already contains a notification means that 929 // the receiver will still be notified that changes occurred. 930 func (l *State) Notify(ch chan<- struct{}) { 931 l.Lock() 932 defer l.Unlock() 933 l.notifyHandlers[ch] = struct{}{} 934 } 935 936 // StopNotify will deregister a channel receiving state change notifications. 937 // Pair this with all calls to Notify to clean up state. 938 func (l *State) StopNotify(ch chan<- struct{}) { 939 l.Lock() 940 defer l.Unlock() 941 delete(l.notifyHandlers, ch) 942 } 943 944 // NotifyProxy will register a channel to receive messages when the 945 // configuration or set of proxies changes. This will not block on 946 // channel send so ensure the channel has a buffer. Note that any buffer 947 // size is generally fine since actual data is not sent over the channel, 948 // so a dropped send due to a full buffer does not result in any loss of 949 // data. The fact that a buffer already contains a notification means that 950 // the receiver will still be notified that changes occurred. 951 // 952 // NOTE(mitchellh): This could be more generalized but for my use case I 953 // only needed proxy events. In the future if it were to be generalized I 954 // would add a new Notify method and remove the proxy-specific ones. 955 func (l *State) NotifyProxy(ch chan<- struct{}) { 956 l.Lock() 957 defer l.Unlock() 958 l.managedProxyHandlers[ch] = struct{}{} 959 } 960 961 // StopNotifyProxy will deregister a channel receiving proxy notifications. 962 // Pair this with all calls to NotifyProxy to clean up state. 963 func (l *State) StopNotifyProxy(ch chan<- struct{}) { 964 l.Lock() 965 defer l.Unlock() 966 delete(l.managedProxyHandlers, ch) 967 } 968 969 // Metadata returns the local node metadata fields that the 970 // agent is aware of and are being kept in sync with the server 971 func (l *State) Metadata() map[string]string { 972 l.RLock() 973 defer l.RUnlock() 974 975 m := make(map[string]string) 976 for k, v := range l.metadata { 977 m[k] = v 978 } 979 return m 980 } 981 982 // LoadMetadata loads node metadata fields from the agent config and 983 // updates them on the local agent. 984 func (l *State) LoadMetadata(data map[string]string) error { 985 l.Lock() 986 defer l.Unlock() 987 988 for k, v := range data { 989 l.metadata[k] = v 990 } 991 l.TriggerSyncChanges() 992 return nil 993 } 994 995 // UnloadMetadata resets the local metadata state 996 func (l *State) UnloadMetadata() { 997 l.Lock() 998 defer l.Unlock() 999 l.metadata = make(map[string]string) 1000 } 1001 1002 // Stats is used to get various debugging state from the sub-systems 1003 func (l *State) Stats() map[string]string { 1004 l.RLock() 1005 defer l.RUnlock() 1006 1007 services := 0 1008 for _, s := range l.services { 1009 if s.Deleted { 1010 continue 1011 } 1012 services++ 1013 } 1014 1015 checks := 0 1016 for _, c := range l.checks { 1017 if c.Deleted { 1018 continue 1019 } 1020 checks++ 1021 } 1022 1023 return map[string]string{ 1024 "services": strconv.Itoa(services), 1025 "checks": strconv.Itoa(checks), 1026 } 1027 } 1028 1029 // updateSyncState does a read of the server state, and updates 1030 // the local sync status as appropriate 1031 func (l *State) updateSyncState() error { 1032 // Get all checks and services from the master 1033 req := structs.NodeSpecificRequest{ 1034 Datacenter: l.config.Datacenter, 1035 Node: l.config.NodeName, 1036 QueryOptions: structs.QueryOptions{Token: l.tokens.AgentToken()}, 1037 } 1038 1039 var out1 structs.IndexedNodeServices 1040 if err := l.Delegate.RPC("Catalog.NodeServices", &req, &out1); err != nil { 1041 return err 1042 } 1043 1044 var out2 structs.IndexedHealthChecks 1045 if err := l.Delegate.RPC("Health.NodeChecks", &req, &out2); err != nil { 1046 return err 1047 } 1048 1049 // Create useful data structures for traversal 1050 remoteServices := make(map[string]*structs.NodeService) 1051 if out1.NodeServices != nil { 1052 remoteServices = out1.NodeServices.Services 1053 } 1054 1055 remoteChecks := make(map[types.CheckID]*structs.HealthCheck, len(out2.HealthChecks)) 1056 for _, rc := range out2.HealthChecks { 1057 remoteChecks[rc.CheckID] = rc 1058 } 1059 1060 // Traverse all checks, services and the node info to determine 1061 // which entries need to be updated on or removed from the server 1062 1063 l.Lock() 1064 defer l.Unlock() 1065 1066 // Check if node info needs syncing 1067 if out1.NodeServices == nil || out1.NodeServices.Node == nil || 1068 out1.NodeServices.Node.ID != l.config.NodeID || 1069 !reflect.DeepEqual(out1.NodeServices.Node.TaggedAddresses, l.config.TaggedAddresses) || 1070 !reflect.DeepEqual(out1.NodeServices.Node.Meta, l.metadata) { 1071 l.nodeInfoInSync = false 1072 } 1073 1074 // Check which services need syncing 1075 1076 // Look for local services that do not exist remotely and mark them for 1077 // syncing so that they will be pushed to the server later 1078 for id, s := range l.services { 1079 if remoteServices[id] == nil { 1080 s.InSync = false 1081 } 1082 } 1083 1084 // Traverse the list of services from the server. 1085 // Remote services which do not exist locally have been deregistered. 1086 // Otherwise, check whether the two definitions are still in sync. 1087 for id, rs := range remoteServices { 1088 ls := l.services[id] 1089 if ls == nil { 1090 // The consul service is managed automatically and does 1091 // not need to be deregistered 1092 if id == structs.ConsulServiceID { 1093 continue 1094 } 1095 1096 // Mark a remote service that does not exist locally as deleted so 1097 // that it will be removed on the server later. 1098 l.services[id] = &ServiceState{Deleted: true} 1099 continue 1100 } 1101 1102 // If the service is already scheduled for removal skip it 1103 if ls.Deleted { 1104 continue 1105 } 1106 1107 // If our definition is different, we need to update it. Make a 1108 // copy so that we don't retain a pointer to any actual state 1109 // store info for in-memory RPCs. 1110 if ls.Service.EnableTagOverride { 1111 ls.Service.Tags = make([]string, len(rs.Tags)) 1112 copy(ls.Service.Tags, rs.Tags) 1113 } 1114 ls.InSync = ls.Service.IsSame(rs) 1115 } 1116 1117 // Check which checks need syncing 1118 1119 // Look for local checks that do not exist remotely and mark them for 1120 // syncing so that they will be pushed to the server later 1121 for id, c := range l.checks { 1122 if remoteChecks[id] == nil { 1123 c.InSync = false 1124 } 1125 } 1126 1127 // Traverse the list of checks from the server. 1128 // Remote checks which do not exist locally have been deregistered. 1129 // Otherwise, check whether the two definitions are still in sync. 1130 for id, rc := range remoteChecks { 1131 lc := l.checks[id] 1132 1133 if lc == nil { 1134 // The Serf check is created automatically and does not 1135 // need to be deregistered. 1136 if id == structs.SerfCheckID { 1137 l.logger.Printf("[DEBUG] agent: Skipping remote check %q since it is managed automatically", id) 1138 continue 1139 } 1140 1141 // Mark a remote check that does not exist locally as deleted so 1142 // that it will be removed on the server later. 1143 l.checks[id] = &CheckState{Deleted: true} 1144 continue 1145 } 1146 1147 // If the check is already scheduled for removal skip it. 1148 if lc.Deleted { 1149 continue 1150 } 1151 1152 // If our definition is different, we need to update it 1153 if l.config.CheckUpdateInterval == 0 { 1154 lc.InSync = lc.Check.IsSame(rc) 1155 continue 1156 } 1157 1158 // Copy the existing check before potentially modifying 1159 // it before the compare operation. 1160 lcCopy := lc.Check.Clone() 1161 1162 // Copy the server's check before modifying, otherwise 1163 // in-memory RPCs will have side effects. 1164 rcCopy := rc.Clone() 1165 1166 // If there's a defer timer active then we've got a 1167 // potentially spammy check so we don't sync the output 1168 // during this sweep since the timer will mark the check 1169 // out of sync for us. Otherwise, it is safe to sync the 1170 // output now. This is especially important for checks 1171 // that don't change state after they are created, in 1172 // which case we'd never see their output synced back ever. 1173 if lc.DeferCheck != nil { 1174 lcCopy.Output = "" 1175 rcCopy.Output = "" 1176 } 1177 lc.InSync = lcCopy.IsSame(rcCopy) 1178 } 1179 return nil 1180 } 1181 1182 // SyncFull determines the delta between the local and remote state 1183 // and synchronizes the changes. 1184 func (l *State) SyncFull() error { 1185 // note that we do not acquire the lock here since the methods 1186 // we are calling will do that themselves. 1187 // 1188 // Also note that we don't hold the lock for the entire operation 1189 // but release it between the two calls. This is not an issue since 1190 // the algorithm is best-effort to achieve eventual consistency. 1191 // SyncChanges will sync whatever updateSyncState() has determined 1192 // needs updating. 1193 1194 if err := l.updateSyncState(); err != nil { 1195 return err 1196 } 1197 return l.SyncChanges() 1198 } 1199 1200 // SyncChanges pushes checks, services and node info data which has been 1201 // marked out of sync or deleted to the server. 1202 func (l *State) SyncChanges() error { 1203 l.Lock() 1204 defer l.Unlock() 1205 1206 // We will do node-level info syncing at the end, since it will get 1207 // updated by a service or check sync anyway, given how the register 1208 // API works. 1209 1210 // Sync the services 1211 // (logging happens in the helper methods) 1212 for id, s := range l.services { 1213 var err error 1214 switch { 1215 case s.Deleted: 1216 err = l.deleteService(id) 1217 case !s.InSync: 1218 err = l.syncService(id) 1219 default: 1220 l.logger.Printf("[DEBUG] agent: Service %q in sync", id) 1221 } 1222 if err != nil { 1223 return err 1224 } 1225 } 1226 1227 // Sync the checks 1228 // (logging happens in the helper methods) 1229 for id, c := range l.checks { 1230 var err error 1231 switch { 1232 case c.Deleted: 1233 err = l.deleteCheck(id) 1234 case !c.InSync: 1235 if c.DeferCheck != nil { 1236 c.DeferCheck.Stop() 1237 c.DeferCheck = nil 1238 } 1239 err = l.syncCheck(id) 1240 default: 1241 l.logger.Printf("[DEBUG] agent: Check %q in sync", id) 1242 } 1243 if err != nil { 1244 return err 1245 } 1246 } 1247 1248 // Now sync the node level info if we need to, and didn't do any of 1249 // the other sync operations. 1250 if l.nodeInfoInSync { 1251 l.logger.Printf("[DEBUG] agent: Node info in sync") 1252 return nil 1253 } 1254 return l.syncNodeInfo() 1255 } 1256 1257 // deleteService is used to delete a service from the server 1258 func (l *State) deleteService(id string) error { 1259 if id == "" { 1260 return fmt.Errorf("ServiceID missing") 1261 } 1262 1263 req := structs.DeregisterRequest{ 1264 Datacenter: l.config.Datacenter, 1265 Node: l.config.NodeName, 1266 ServiceID: id, 1267 WriteRequest: structs.WriteRequest{Token: l.serviceToken(id)}, 1268 } 1269 var out struct{} 1270 err := l.Delegate.RPC("Catalog.Deregister", &req, &out) 1271 switch { 1272 case err == nil || strings.Contains(err.Error(), "Unknown service"): 1273 delete(l.services, id) 1274 l.logger.Printf("[INFO] agent: Deregistered service %q", id) 1275 return nil 1276 1277 case acl.IsErrPermissionDenied(err), acl.IsErrNotFound(err): 1278 // todo(fs): mark the service to be in sync to prevent excessive retrying before next full sync 1279 // todo(fs): some backoff strategy might be a better solution 1280 l.services[id].InSync = true 1281 l.logger.Printf("[WARN] agent: Service %q deregistration blocked by ACLs", id) 1282 metrics.IncrCounter([]string{"acl", "blocked", "service", "deregistration"}, 1) 1283 return nil 1284 1285 default: 1286 l.logger.Printf("[WARN] agent: Deregistering service %q failed. %s", id, err) 1287 return err 1288 } 1289 } 1290 1291 // deleteCheck is used to delete a check from the server 1292 func (l *State) deleteCheck(id types.CheckID) error { 1293 if id == "" { 1294 return fmt.Errorf("CheckID missing") 1295 } 1296 1297 req := structs.DeregisterRequest{ 1298 Datacenter: l.config.Datacenter, 1299 Node: l.config.NodeName, 1300 CheckID: id, 1301 WriteRequest: structs.WriteRequest{Token: l.checkToken(id)}, 1302 } 1303 var out struct{} 1304 err := l.Delegate.RPC("Catalog.Deregister", &req, &out) 1305 switch { 1306 case err == nil || strings.Contains(err.Error(), "Unknown check"): 1307 c := l.checks[id] 1308 if c != nil && c.DeferCheck != nil { 1309 c.DeferCheck.Stop() 1310 } 1311 delete(l.checks, id) 1312 l.logger.Printf("[INFO] agent: Deregistered check %q", id) 1313 return nil 1314 1315 case acl.IsErrPermissionDenied(err), acl.IsErrNotFound(err): 1316 // todo(fs): mark the check to be in sync to prevent excessive retrying before next full sync 1317 // todo(fs): some backoff strategy might be a better solution 1318 l.checks[id].InSync = true 1319 l.logger.Printf("[WARN] agent: Check %q deregistration blocked by ACLs", id) 1320 metrics.IncrCounter([]string{"acl", "blocked", "check", "deregistration"}, 1) 1321 return nil 1322 1323 default: 1324 l.logger.Printf("[WARN] agent: Deregistering check %q failed. %s", id, err) 1325 return err 1326 } 1327 } 1328 1329 // syncService is used to sync a service to the server 1330 func (l *State) syncService(id string) error { 1331 // If the service has associated checks that are out of sync, 1332 // piggyback them on the service sync so they are part of the 1333 // same transaction and are registered atomically. We only let 1334 // checks ride on service registrations with the same token, 1335 // otherwise we need to register them separately so they don't 1336 // pick up privileges from the service token. 1337 var checks structs.HealthChecks 1338 for checkID, c := range l.checks { 1339 if c.Deleted || c.InSync { 1340 continue 1341 } 1342 if c.Check.ServiceID != id { 1343 continue 1344 } 1345 if l.serviceToken(id) != l.checkToken(checkID) { 1346 continue 1347 } 1348 checks = append(checks, c.Check) 1349 } 1350 1351 req := structs.RegisterRequest{ 1352 Datacenter: l.config.Datacenter, 1353 ID: l.config.NodeID, 1354 Node: l.config.NodeName, 1355 Address: l.config.AdvertiseAddr, 1356 TaggedAddresses: l.config.TaggedAddresses, 1357 NodeMeta: l.metadata, 1358 Service: l.services[id].Service, 1359 WriteRequest: structs.WriteRequest{Token: l.serviceToken(id)}, 1360 } 1361 1362 // Backwards-compatibility for Consul < 0.5 1363 if len(checks) == 1 { 1364 req.Check = checks[0] 1365 } else { 1366 req.Checks = checks 1367 } 1368 1369 var out struct{} 1370 err := l.Delegate.RPC("Catalog.Register", &req, &out) 1371 switch { 1372 case err == nil: 1373 l.services[id].InSync = true 1374 // Given how the register API works, this info is also updated 1375 // every time we sync a service. 1376 l.nodeInfoInSync = true 1377 for _, check := range checks { 1378 l.checks[check.CheckID].InSync = true 1379 } 1380 l.logger.Printf("[INFO] agent: Synced service %q", id) 1381 return nil 1382 1383 case acl.IsErrPermissionDenied(err), acl.IsErrNotFound(err): 1384 // todo(fs): mark the service and the checks to be in sync to prevent excessive retrying before next full sync 1385 // todo(fs): some backoff strategy might be a better solution 1386 l.services[id].InSync = true 1387 for _, check := range checks { 1388 l.checks[check.CheckID].InSync = true 1389 } 1390 l.logger.Printf("[WARN] agent: Service %q registration blocked by ACLs", id) 1391 metrics.IncrCounter([]string{"acl", "blocked", "service", "registration"}, 1) 1392 return nil 1393 1394 default: 1395 l.logger.Printf("[WARN] agent: Syncing service %q failed. %s", id, err) 1396 return err 1397 } 1398 } 1399 1400 // syncCheck is used to sync a check to the server 1401 func (l *State) syncCheck(id types.CheckID) error { 1402 c := l.checks[id] 1403 1404 req := structs.RegisterRequest{ 1405 Datacenter: l.config.Datacenter, 1406 ID: l.config.NodeID, 1407 Node: l.config.NodeName, 1408 Address: l.config.AdvertiseAddr, 1409 TaggedAddresses: l.config.TaggedAddresses, 1410 NodeMeta: l.metadata, 1411 Check: c.Check, 1412 WriteRequest: structs.WriteRequest{Token: l.checkToken(id)}, 1413 } 1414 1415 // Pull in the associated service if any 1416 s := l.services[c.Check.ServiceID] 1417 if s != nil && !s.Deleted { 1418 req.Service = s.Service 1419 } 1420 1421 var out struct{} 1422 err := l.Delegate.RPC("Catalog.Register", &req, &out) 1423 switch { 1424 case err == nil: 1425 l.checks[id].InSync = true 1426 // Given how the register API works, this info is also updated 1427 // every time we sync a check. 1428 l.nodeInfoInSync = true 1429 l.logger.Printf("[INFO] agent: Synced check %q", id) 1430 return nil 1431 1432 case acl.IsErrPermissionDenied(err), acl.IsErrNotFound(err): 1433 // todo(fs): mark the check to be in sync to prevent excessive retrying before next full sync 1434 // todo(fs): some backoff strategy might be a better solution 1435 l.checks[id].InSync = true 1436 l.logger.Printf("[WARN] agent: Check %q registration blocked by ACLs", id) 1437 metrics.IncrCounter([]string{"acl", "blocked", "check", "registration"}, 1) 1438 return nil 1439 1440 default: 1441 l.logger.Printf("[WARN] agent: Syncing check %q failed. %s", id, err) 1442 return err 1443 } 1444 } 1445 1446 func (l *State) syncNodeInfo() error { 1447 req := structs.RegisterRequest{ 1448 Datacenter: l.config.Datacenter, 1449 ID: l.config.NodeID, 1450 Node: l.config.NodeName, 1451 Address: l.config.AdvertiseAddr, 1452 TaggedAddresses: l.config.TaggedAddresses, 1453 NodeMeta: l.metadata, 1454 WriteRequest: structs.WriteRequest{Token: l.tokens.AgentToken()}, 1455 } 1456 var out struct{} 1457 err := l.Delegate.RPC("Catalog.Register", &req, &out) 1458 switch { 1459 case err == nil: 1460 l.nodeInfoInSync = true 1461 l.logger.Printf("[INFO] agent: Synced node info") 1462 return nil 1463 1464 case acl.IsErrPermissionDenied(err), acl.IsErrNotFound(err): 1465 // todo(fs): mark the node info to be in sync to prevent excessive retrying before next full sync 1466 // todo(fs): some backoff strategy might be a better solution 1467 l.nodeInfoInSync = true 1468 l.logger.Printf("[WARN] agent: Node info update blocked by ACLs") 1469 metrics.IncrCounter([]string{"acl", "blocked", "node", "registration"}, 1) 1470 return nil 1471 1472 default: 1473 l.logger.Printf("[WARN] agent: Syncing node info failed. %s", err) 1474 return err 1475 } 1476 }