package client

import (
	"errors"
	"fmt"
	"io/ioutil"
	"log"
	"net"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	"github.com/boltdb/bolt"
	consulapi "github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/lib"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/driver"
	"github.com/hashicorp/nomad/client/fingerprint"
	"github.com/hashicorp/nomad/client/stats"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/tlsutil"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad"
	"github.com/hashicorp/nomad/nomad/structs"
	vaultapi "github.com/hashicorp/vault/api"
	"github.com/mitchellh/hashstructure"
	"github.com/shirou/gopsutil/host"
)

const (
	// clientRPCCache controls how long we keep an idle connection
	// open to a server
	clientRPCCache = 5 * time.Minute

	// clientMaxStreams controls how many idle streams we keep
	// open to a server
	clientMaxStreams = 2

	// datacenterQueryLimit searches through up to this many adjacent
	// datacenters looking for the Nomad server service.
	datacenterQueryLimit = 9

	// registerRetryIntv is minimum interval on which we retry
	// registration. We pick a value between this and 2x this.
	registerRetryIntv = 15 * time.Second

	// getAllocRetryIntv is minimum interval on which we retry
	// to fetch allocations. We pick a value between this and 2x this.
	getAllocRetryIntv = 30 * time.Second

	// devModeRetryIntv is the retry interval used for development
	devModeRetryIntv = time.Second

	// stateSnapshotIntv is how often the client snapshots state
	stateSnapshotIntv = 60 * time.Second

	// initialHeartbeatStagger is used to stagger the interval between
	// starting and the initial heartbeat. After the initial heartbeat,
	// we switch to using the TTL specified by the servers.
	initialHeartbeatStagger = 10 * time.Second

	// nodeUpdateRetryIntv is how often the client checks for updates to the
	// node attributes or meta map.
	nodeUpdateRetryIntv = 5 * time.Second

	// allocSyncIntv is the batching period of allocation updates before they
	// are synced with the server.
	allocSyncIntv = 200 * time.Millisecond

	// allocSyncRetryIntv is the interval on which we retry updating
	// the status of the allocation
	allocSyncRetryIntv = 5 * time.Second
)

// ClientStatsReporter exposes all the APIs related to resource usage of a Nomad
// Client
type ClientStatsReporter interface {
	// GetAllocStats returns the AllocStatsReporter for the passed allocation.
	// If it does not exist an error is reported.
	GetAllocStats(allocID string) (AllocStatsReporter, error)

	// LatestHostStats returns the latest resource usage stats for the host
	LatestHostStats() *stats.HostStats
}

// Client is used to implement the client interaction with Nomad. Clients
// are expected to register as a schedulable node to the servers, and to
// run allocations as determined by the servers.
type Client struct {
	// config is the configuration the client was created with. init,
	// setupNode and the fingerprinters mutate it in place.
	config *config.Config
	// start is the time at which NewClient built this client.
	start time.Time

	// stateDB is used to efficiently store client state.
	stateDB *bolt.DB

	// configCopy is a copy that should be passed to alloc-runners.
	// configLock guards configCopy and the fingerprinters' writes to config.
	configCopy *config.Config
	configLock sync.RWMutex

	logger *log.Logger

	// connPool is the RPC connection pool used to reach the servers.
	connPool *nomad.ConnPool

	// servers is the (optionally prioritized) list of nomad servers
	servers *serverlist

	// heartbeat related times for tracking how often to heartbeat
	lastHeartbeat   time.Time
	heartbeatTTL    time.Duration
	haveHeartbeated bool
	heartbeatLock   sync.Mutex

	// triggerDiscoveryCh triggers Consul discovery; see triggerDiscovery
	triggerDiscoveryCh chan struct{}

	// serversDiscoveredCh will be ticked whenever Consul discovery completes
	// successfully
	serversDiscoveredCh chan struct{}

	// allocs maps alloc IDs to their AllocRunner. This map includes all
	// AllocRunners - running and GC'd - until the server GCs them.
	allocs    map[string]*AllocRunner
	allocLock sync.RWMutex

	// allocUpdates stores allocations that need to be synced to the server.
	allocUpdates chan *structs.Allocation

	// consulService is Nomad's custom Consul client for managing services
	// and checks.
	consulService ConsulServiceAPI

	// consulCatalog is the subset of Consul's Catalog API Nomad uses.
	consulCatalog consul.CatalogAPI

	// hostStatsCollector collects host resource usage stats
	hostStatsCollector *stats.HostStatsCollector

	// shutdown is set once Shutdown has run; shutdownCh is closed at the same
	// time to stop the long lived goroutines. Both are guarded by shutdownLock.
	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex

	// vaultClient is used to interact with Vault for token and secret renewals
	vaultClient vaultclient.VaultClient

	// garbageCollector is used to garbage collect terminal allocations present
	// in the node automatically
	garbageCollector *AllocGarbageCollector

	// clientACLResolver holds the ACL resolution state
	clientACLResolver

	// baseLabels are used when emitting tagged metrics. All client metrics will
	// have these tags, and optionally more.
	baseLabels []metrics.Label
}

var (
	// noServersErr is returned by the RPC method when the client has no
	// configured servers. This is used to trigger Consul discovery if
	// enabled.
	noServersErr = errors.New("no servers")
)

// NewClient is used to create a new client from the given configuration.
//
// It initializes the state and alloc directories, sets up the node,
// fingerprints it, scans for drivers, restores any persisted allocation state
// and finally starts the client's long lived goroutines. An error is returned
// if any of the setup steps fails.
func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulService ConsulServiceAPI, logger *log.Logger) (*Client, error) {
	// Create the tls wrapper
	var tlsWrap tlsutil.RegionWrapper
	if cfg.TLSConfig.EnableRPC {
		tw, err := cfg.TLSConfiguration().OutgoingTLSWrapper()
		if err != nil {
			return nil, err
		}
		tlsWrap = tw
	}

	// Create the client
	c := &Client{
		config:              cfg,
		consulCatalog:       consulCatalog,
		consulService:       consulService,
		start:               time.Now(),
		connPool:            nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, tlsWrap),
		logger:              logger,
		allocs:              make(map[string]*AllocRunner),
		allocUpdates:        make(chan *structs.Allocation, 64),
		shutdownCh:          make(chan struct{}),
		servers:             newServerList(),
		triggerDiscoveryCh:  make(chan struct{}),
		serversDiscoveredCh: make(chan struct{}),
	}

	// Initialize the client
	if err := c.init(); err != nil {
		return nil, fmt.Errorf("failed to initialize client: %v", err)
	}

	// Initialize the ACL state
	// NOTE(review): from here on the state database opened by init is not
	// closed on the error returns below - confirm callers never retry
	// NewClient in the same process.
	if err := c.clientACLResolver.init(); err != nil {
		return nil, fmt.Errorf("failed to initialize ACL state: %v", err)
	}

	// Add the stats collector
	statsCollector := stats.NewHostStatsCollector(logger, c.config.AllocDir)
	c.hostStatsCollector = statsCollector

	// Add the garbage collector
	// NOTE(review): this reads cfg.Node.Reserved before setupNode (which
	// defaults a nil Node / Reserved) has run, so it assumes the caller has
	// already populated both - TODO confirm.
	gcConfig := &GCConfig{
		MaxAllocs:           cfg.GCMaxAllocs,
		DiskUsageThreshold:  cfg.GCDiskUsageThreshold,
		InodeUsageThreshold: cfg.GCInodeUsageThreshold,
		Interval:            cfg.GCInterval,
		ParallelDestroys:    cfg.GCParallelDestroys,
		ReservedDiskMB:      cfg.Node.Reserved.DiskMB,
	}
	c.garbageCollector = NewAllocGarbageCollector(logger, statsCollector, c, gcConfig)
	go c.garbageCollector.Run()

	// Setup the node
	if err := c.setupNode(); err != nil {
		return nil, fmt.Errorf("node setup failed: %v", err)
	}

	// Fingerprint the node
	if err := c.fingerprint(); err != nil {
		return nil, fmt.Errorf("fingerprinting failed: %v", err)
	}

	// Scan for drivers
	if err := c.setupDrivers(); err != nil {
		return nil, fmt.Errorf("driver setup failed: %v", err)
	}

	// Setup the reserved resources
	c.reservePorts()

	// Store the config copy before restoring state but after it has been
	// initialized.
	c.configLock.Lock()
	c.configCopy = c.config.Copy()
	c.configLock.Unlock()

	// Set the preconfigured list of static servers
	c.configLock.RLock()
	if len(c.configCopy.Servers) > 0 {
		if err := c.SetServers(c.configCopy.Servers); err != nil {
			logger.Printf("[WARN] client: None of the configured servers are valid: %v", err)
		}
	}
	c.configLock.RUnlock()

	// Setup Consul discovery if enabled
	if c.configCopy.ConsulConfig.ClientAutoJoin != nil && *c.configCopy.ConsulConfig.ClientAutoJoin {
		go c.consulDiscovery()
		if len(c.servers.all()) == 0 {
			// No configured servers; trigger discovery manually.
			// triggerDiscoveryCh is unbuffered, so this send blocks until a
			// receiver (presumably consulDiscovery) takes it.
			c.triggerDiscoveryCh <- struct{}{}
		}
	}

	// Setup the vault client for token and secret renewals
	if err := c.setupVaultClient(); err != nil {
		return nil, fmt.Errorf("failed to setup vault client: %v", err)
	}

	// Restore the state
	if err := c.restoreState(); err != nil {
		logger.Printf("[ERR] client: failed to restore state: %v", err)
		logger.Printf("[ERR] client: Nomad is unable to start due to corrupt state. "+
			"The safest way to proceed is to manually stop running task processes "+
			"and remove Nomad's state (%q) and alloc (%q) directories before "+
			"restarting. Lost allocations will be rescheduled.",
			c.config.StateDir, c.config.AllocDir)
		logger.Printf("[ERR] client: Corrupt state is often caused by a bug. Please " +
			"report as much information as possible to " +
			"https://github.com/hashicorp/nomad/issues")
		return nil, fmt.Errorf("failed to restore state")
	}

	// Register and then start heartbeating to the servers.
	go c.registerAndHeartbeat()

	// Begin periodic snapshotting of state.
	go c.periodicSnapshot()

	// Begin syncing allocations to the server
	go c.allocSync()

	// Start the client!
	go c.run()

	// Start collecting stats
	go c.emitStats()

	c.logger.Printf("[INFO] client: Node ID %q", c.NodeID())
	return c, nil
}

// init is used to initialize the client and perform any setup
// needed before we begin starting its various components.
//
// It makes sure the state and alloc directories exist (falling back to fresh
// temporary directories when none are configured, and recording the chosen
// paths back into the config) and opens the bolt state database.
func (c *Client) init() error {
	// Ensure the state dir exists if we have one
	if c.config.StateDir != "" {
		if err := os.MkdirAll(c.config.StateDir, 0700); err != nil {
			return fmt.Errorf("failed creating state dir: %s", err)
		}

	} else {
		// Otherwise make a temp directory to use.
		p, err := ioutil.TempDir("", "NomadClient")
		if err != nil {
			return fmt.Errorf("failed creating temporary directory for the StateDir: %v", err)
		}

		// Store the symlink-free path of the temporary directory.
		p, err = filepath.EvalSymlinks(p)
		if err != nil {
			return fmt.Errorf("failed to find temporary directory for the StateDir: %v", err)
		}

		c.config.StateDir = p
	}
	c.logger.Printf("[INFO] client: using state directory %v", c.config.StateDir)

	// Create or open the state database
	db, err := bolt.Open(filepath.Join(c.config.StateDir, "state.db"), 0600, nil)
	if err != nil {
		return fmt.Errorf("failed to create state database: %v", err)
	}
	c.stateDB = db

	// Ensure the alloc dir exists if we have one
	if c.config.AllocDir != "" {
		if err := os.MkdirAll(c.config.AllocDir, 0711); err != nil {
			return fmt.Errorf("failed creating alloc dir: %s", err)
		}
	} else {
		// Otherwise make a temp directory to use.
		p, err := ioutil.TempDir("", "NomadClient")
		if err != nil {
			return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err)
		}

		// Store the symlink-free path of the temporary directory.
		p, err = filepath.EvalSymlinks(p)
		if err != nil {
			return fmt.Errorf("failed to find temporary directory for the AllocDir: %v", err)
		}

		// Change the permissions to have the execute bit
		if err := os.Chmod(p, 0711); err != nil {
			return fmt.Errorf("failed to change directory permissions for the AllocDir: %v", err)
		}

		c.config.AllocDir = p
	}

	c.logger.Printf("[INFO] client: using alloc directory %v", c.config.AllocDir)
	return nil
}

// Leave is used to prepare the client to leave the cluster.
// It is currently a no-op that always returns nil.
func (c *Client) Leave() error {
	// TODO
	return nil
}

// GetConfig returns the config of the client for testing purposes only.
// The returned pointer is the client's live config, not a copy.
func (c *Client) GetConfig() *config.Config {
	return c.config
}

// Datacenter returns the datacenter for the given client
func (c *Client) Datacenter() string {
	return c.config.Node.Datacenter
}

// Region returns the region for the given client
func (c *Client) Region() string {
	return c.config.Region
}

// NodeID returns the node ID for the given client
func (c *Client) NodeID() string {
	return c.config.Node.ID
}

// secretNodeID returns the secret node ID for the given client
func (c *Client) secretNodeID() string {
	return c.config.Node.SecretID
}

// RPCMajorVersion returns the structs.ApiMajorVersion supported by the
// client.
func (c *Client) RPCMajorVersion() int {
	return structs.ApiMajorVersion
}

// RPCMinorVersion returns the structs.ApiMinorVersion supported by the
// client.
func (c *Client) RPCMinorVersion() int {
	return structs.ApiMinorVersion
}

// Shutdown is used to tear down the client.
//
// The first call stops the vault client and garbage collector, closes
// shutdownCh, shuts down the connection pool, saves state and finally closes
// the state database. Subsequent calls return nil without doing anything. The
// returned error is the result of saving state.
func (c *Client) Shutdown() error {
	c.logger.Printf("[INFO] client: shutting down")
	c.shutdownLock.Lock()
	defer c.shutdownLock.Unlock()

	if c.shutdown {
		return nil
	}

	// Defer closing the database so it stays open for the saveState call in
	// the return statement below.
	defer func() {
		if err := c.stateDB.Close(); err != nil {
			c.logger.Printf("[ERR] client: failed to close state database on shutdown: %v", err)
		}
	}()

	// Stop renewing tokens and secrets
	if c.vaultClient != nil {
		c.vaultClient.Stop()
	}

	// Stop Garbage collector
	c.garbageCollector.Stop()

	// In dev mode, destroy all the alloc runners and wait for each to finish.
	if c.config.DevMode {
		for _, ar := range c.getAllocRunners() {
			ar.Destroy()
			<-ar.WaitCh()
		}
	}

	c.shutdown = true
	close(c.shutdownCh)
	c.connPool.Shutdown()
	return c.saveState()
}

// RPC is used to forward an RPC call to a nomad server, or fail if no servers.
450 func (c *Client) RPC(method string, args interface{}, reply interface{}) error { 451 // Invoke the RPCHandler if it exists 452 if c.config.RPCHandler != nil { 453 return c.config.RPCHandler.RPC(method, args, reply) 454 } 455 456 servers := c.servers.all() 457 if len(servers) == 0 { 458 return noServersErr 459 } 460 461 var mErr multierror.Error 462 for _, s := range servers { 463 // Make the RPC request 464 if err := c.connPool.RPC(c.Region(), s.addr, c.RPCMajorVersion(), method, args, reply); err != nil { 465 errmsg := fmt.Errorf("RPC failed to server %s: %v", s.addr, err) 466 mErr.Errors = append(mErr.Errors, errmsg) 467 c.logger.Printf("[DEBUG] client: %v", errmsg) 468 c.servers.failed(s) 469 continue 470 } 471 c.servers.good(s) 472 return nil 473 } 474 475 return mErr.ErrorOrNil() 476 } 477 478 // Stats is used to return statistics for debugging and insight 479 // for various sub-systems 480 func (c *Client) Stats() map[string]map[string]string { 481 c.heartbeatLock.Lock() 482 defer c.heartbeatLock.Unlock() 483 stats := map[string]map[string]string{ 484 "client": { 485 "node_id": c.NodeID(), 486 "known_servers": c.servers.all().String(), 487 "num_allocations": strconv.Itoa(c.NumAllocs()), 488 "last_heartbeat": fmt.Sprintf("%v", time.Since(c.lastHeartbeat)), 489 "heartbeat_ttl": fmt.Sprintf("%v", c.heartbeatTTL), 490 }, 491 "runtime": nomad.RuntimeStats(), 492 } 493 return stats 494 } 495 496 // CollectAllocation garbage collects a single allocation on a node. Returns 497 // true if alloc was found and garbage collected; otherwise false. 
498 func (c *Client) CollectAllocation(allocID string) bool { 499 return c.garbageCollector.Collect(allocID) 500 } 501 502 // CollectAllAllocs garbage collects all allocations on a node in the terminal 503 // state 504 func (c *Client) CollectAllAllocs() { 505 c.garbageCollector.CollectAll() 506 } 507 508 // Node returns the locally registered node 509 func (c *Client) Node() *structs.Node { 510 c.configLock.RLock() 511 defer c.configLock.RUnlock() 512 return c.configCopy.Node 513 } 514 515 // StatsReporter exposes the various APIs related resource usage of a Nomad 516 // client 517 func (c *Client) StatsReporter() ClientStatsReporter { 518 return c 519 } 520 521 func (c *Client) GetAllocStats(allocID string) (AllocStatsReporter, error) { 522 c.allocLock.RLock() 523 defer c.allocLock.RUnlock() 524 ar, ok := c.allocs[allocID] 525 if !ok { 526 return nil, fmt.Errorf("unknown allocation ID %q", allocID) 527 } 528 return ar.StatsReporter(), nil 529 } 530 531 // HostStats returns all the stats related to a Nomad client 532 func (c *Client) LatestHostStats() *stats.HostStats { 533 return c.hostStatsCollector.Stats() 534 } 535 536 // ValidateMigrateToken verifies that a token is for a specific client and 537 // allocation, and has been created by a trusted party that has privileged 538 // knowledge of the client's secret identifier 539 func (c *Client) ValidateMigrateToken(allocID, migrateToken string) bool { 540 if !c.config.ACLEnabled { 541 return true 542 } 543 544 return nomad.CompareMigrateToken(allocID, c.secretNodeID(), migrateToken) 545 } 546 547 // GetAllocFS returns the AllocFS interface for the alloc dir of an allocation 548 func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) { 549 c.allocLock.RLock() 550 defer c.allocLock.RUnlock() 551 552 ar, ok := c.allocs[allocID] 553 if !ok { 554 return nil, fmt.Errorf("unknown allocation ID %q", allocID) 555 } 556 return ar.GetAllocDir(), nil 557 } 558 559 // GetClientAlloc returns the allocation from 
the client 560 func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) { 561 all := c.allAllocs() 562 alloc, ok := all[allocID] 563 if !ok { 564 return nil, fmt.Errorf("unknown allocation ID %q", allocID) 565 } 566 return alloc, nil 567 } 568 569 // GetServers returns the list of nomad servers this client is aware of. 570 func (c *Client) GetServers() []string { 571 endpoints := c.servers.all() 572 res := make([]string, len(endpoints)) 573 for i := range endpoints { 574 res[i] = endpoints[i].addr.String() 575 } 576 return res 577 } 578 579 // SetServers sets a new list of nomad servers to connect to. As long as one 580 // server is resolvable no error is returned. 581 func (c *Client) SetServers(servers []string) error { 582 endpoints := make([]*endpoint, 0, len(servers)) 583 var merr multierror.Error 584 for _, s := range servers { 585 addr, err := resolveServer(s) 586 if err != nil { 587 c.logger.Printf("[DEBUG] client: ignoring server %s due to resolution error: %v", s, err) 588 merr.Errors = append(merr.Errors, err) 589 continue 590 } 591 592 // Valid endpoint, append it without a priority as this API 593 // doesn't support different priorities for different servers 594 endpoints = append(endpoints, &endpoint{name: s, addr: addr}) 595 } 596 597 // Only return errors if no servers are valid 598 if len(endpoints) == 0 { 599 if len(merr.Errors) > 0 { 600 return merr.ErrorOrNil() 601 } 602 return noServersErr 603 } 604 605 c.servers.set(endpoints) 606 return nil 607 } 608 609 // restoreState is used to restore our state from the data dir 610 func (c *Client) restoreState() error { 611 if c.config.DevMode { 612 return nil 613 } 614 615 // COMPAT: Remove in 0.7.0 616 // 0.6.0 transistioned from individual state files to a single bolt-db. 
617 // The upgrade path is to: 618 // Check if old state exists 619 // If so, restore from that and delete old state 620 // Restore using state database 621 622 // Allocs holds the IDs of the allocations being restored 623 var allocs []string 624 625 // Upgrading tracks whether this is a pre 0.6.0 upgrade path 626 var upgrading bool 627 628 // Scan the directory 629 allocDir := filepath.Join(c.config.StateDir, "alloc") 630 list, err := ioutil.ReadDir(allocDir) 631 if err != nil && !os.IsNotExist(err) { 632 return fmt.Errorf("failed to list alloc state: %v", err) 633 } else if err == nil && len(list) != 0 { 634 upgrading = true 635 for _, entry := range list { 636 allocs = append(allocs, entry.Name()) 637 } 638 } else { 639 // Normal path 640 err := c.stateDB.View(func(tx *bolt.Tx) error { 641 allocs, err = getAllAllocationIDs(tx) 642 if err != nil { 643 return fmt.Errorf("failed to list allocations: %v", err) 644 } 645 return nil 646 }) 647 if err != nil { 648 return err 649 } 650 } 651 652 // Load each alloc back 653 var mErr multierror.Error 654 for _, id := range allocs { 655 alloc := &structs.Allocation{ID: id} 656 657 // don't worry about blocking/migrating when restoring 658 watcher := noopPrevAlloc{} 659 660 c.configLock.RLock() 661 ar := NewAllocRunner(c.logger, c.configCopy, c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, watcher) 662 c.configLock.RUnlock() 663 664 c.allocLock.Lock() 665 c.allocs[id] = ar 666 c.allocLock.Unlock() 667 668 if err := ar.RestoreState(); err != nil { 669 c.logger.Printf("[ERR] client: failed to restore state for alloc %q: %v", id, err) 670 mErr.Errors = append(mErr.Errors, err) 671 } else { 672 go ar.Run() 673 674 if upgrading { 675 if err := ar.SaveState(); err != nil { 676 c.logger.Printf("[WARN] client: initial save state for alloc %q failed: %v", id, err) 677 } 678 } 679 } 680 } 681 682 // Delete all the entries 683 if upgrading { 684 if err := os.RemoveAll(allocDir); err != nil { 685 mErr.Errors = 
append(mErr.Errors, err) 686 } 687 } 688 689 return mErr.ErrorOrNil() 690 } 691 692 // saveState is used to snapshot our state into the data dir. 693 func (c *Client) saveState() error { 694 if c.config.DevMode { 695 return nil 696 } 697 698 var wg sync.WaitGroup 699 var l sync.Mutex 700 var mErr multierror.Error 701 runners := c.getAllocRunners() 702 wg.Add(len(runners)) 703 704 for id, ar := range runners { 705 go func(id string, ar *AllocRunner) { 706 err := ar.SaveState() 707 if err != nil { 708 c.logger.Printf("[ERR] client: failed to save state for alloc %q: %v", id, err) 709 l.Lock() 710 multierror.Append(&mErr, err) 711 l.Unlock() 712 } 713 wg.Done() 714 }(id, ar) 715 } 716 717 wg.Wait() 718 return mErr.ErrorOrNil() 719 } 720 721 // getAllocRunners returns a snapshot of the current set of alloc runners. 722 func (c *Client) getAllocRunners() map[string]*AllocRunner { 723 c.allocLock.RLock() 724 defer c.allocLock.RUnlock() 725 runners := make(map[string]*AllocRunner, len(c.allocs)) 726 for id, ar := range c.allocs { 727 runners[id] = ar 728 } 729 return runners 730 } 731 732 // NumAllocs returns the number of un-GC'd allocs this client has. Used to 733 // fulfill the AllocCounter interface for the GC. 734 func (c *Client) NumAllocs() int { 735 n := 0 736 c.allocLock.RLock() 737 for _, a := range c.allocs { 738 if !a.IsDestroyed() { 739 n++ 740 } 741 } 742 c.allocLock.RUnlock() 743 return n 744 } 745 746 // nodeID restores, or generates if necessary, a unique node ID and SecretID. 747 // The node ID is, if available, a persistent unique ID. The secret ID is a 748 // high-entropy random UUID. 749 func (c *Client) nodeID() (id, secret string, err error) { 750 var hostID string 751 hostInfo, err := host.Info() 752 if !c.config.NoHostUUID && err == nil { 753 if hashed, ok := helper.HashUUID(hostInfo.HostID); ok { 754 hostID = hashed 755 } 756 } 757 758 if hostID == "" { 759 // Generate a random hostID if no constant ID is available on 760 // this platform. 
761 hostID = uuid.Generate() 762 } 763 764 // Do not persist in dev mode 765 if c.config.DevMode { 766 return hostID, uuid.Generate(), nil 767 } 768 769 // Attempt to read existing ID 770 idPath := filepath.Join(c.config.StateDir, "client-id") 771 idBuf, err := ioutil.ReadFile(idPath) 772 if err != nil && !os.IsNotExist(err) { 773 return "", "", err 774 } 775 776 // Attempt to read existing secret ID 777 secretPath := filepath.Join(c.config.StateDir, "secret-id") 778 secretBuf, err := ioutil.ReadFile(secretPath) 779 if err != nil && !os.IsNotExist(err) { 780 return "", "", err 781 } 782 783 // Use existing ID if any 784 if len(idBuf) != 0 { 785 id = strings.ToLower(string(idBuf)) 786 } else { 787 id = hostID 788 789 // Persist the ID 790 if err := ioutil.WriteFile(idPath, []byte(id), 0700); err != nil { 791 return "", "", err 792 } 793 } 794 795 if len(secretBuf) != 0 { 796 secret = string(secretBuf) 797 } else { 798 // Generate new ID 799 secret = uuid.Generate() 800 801 // Persist the ID 802 if err := ioutil.WriteFile(secretPath, []byte(secret), 0700); err != nil { 803 return "", "", err 804 } 805 } 806 807 return id, secret, nil 808 } 809 810 // setupNode is used to setup the initial node 811 func (c *Client) setupNode() error { 812 node := c.config.Node 813 if node == nil { 814 node = &structs.Node{} 815 c.config.Node = node 816 } 817 // Generate an ID and secret for the node 818 id, secretID, err := c.nodeID() 819 if err != nil { 820 return fmt.Errorf("node ID setup failed: %v", err) 821 } 822 823 node.ID = id 824 node.SecretID = secretID 825 if node.Attributes == nil { 826 node.Attributes = make(map[string]string) 827 } 828 if node.Links == nil { 829 node.Links = make(map[string]string) 830 } 831 if node.Meta == nil { 832 node.Meta = make(map[string]string) 833 } 834 if node.Resources == nil { 835 node.Resources = &structs.Resources{} 836 } 837 if node.Reserved == nil { 838 node.Reserved = &structs.Resources{} 839 } 840 if node.Datacenter == "" { 841 
node.Datacenter = "dc1" 842 } 843 if node.Name == "" { 844 node.Name, _ = os.Hostname() 845 } 846 if node.Name == "" { 847 node.Name = node.ID 848 } 849 node.Status = structs.NodeStatusInit 850 return nil 851 } 852 853 // reservePorts is used to reserve ports on the fingerprinted network devices. 854 func (c *Client) reservePorts() { 855 c.configLock.RLock() 856 defer c.configLock.RUnlock() 857 global := c.config.GloballyReservedPorts 858 if len(global) == 0 { 859 return 860 } 861 862 node := c.config.Node 863 networks := node.Resources.Networks 864 reservedIndex := make(map[string]*structs.NetworkResource, len(networks)) 865 for _, resNet := range node.Reserved.Networks { 866 reservedIndex[resNet.IP] = resNet 867 } 868 869 // Go through each network device and reserve ports on it. 870 for _, net := range networks { 871 res, ok := reservedIndex[net.IP] 872 if !ok { 873 res = net.Copy() 874 res.MBits = 0 875 reservedIndex[net.IP] = res 876 } 877 878 for _, portVal := range global { 879 p := structs.Port{Value: portVal} 880 res.ReservedPorts = append(res.ReservedPorts, p) 881 } 882 } 883 884 // Clear the reserved networks. 
885 if node.Reserved == nil { 886 node.Reserved = new(structs.Resources) 887 } else { 888 node.Reserved.Networks = nil 889 } 890 891 // Restore the reserved networks 892 for _, net := range reservedIndex { 893 node.Reserved.Networks = append(node.Reserved.Networks, net) 894 } 895 } 896 897 // fingerprint is used to fingerprint the client and setup the node 898 func (c *Client) fingerprint() error { 899 whitelist := c.config.ReadStringListToMap("fingerprint.whitelist") 900 whitelistEnabled := len(whitelist) > 0 901 blacklist := c.config.ReadStringListToMap("fingerprint.blacklist") 902 903 c.logger.Printf("[DEBUG] client: built-in fingerprints: %v", fingerprint.BuiltinFingerprints()) 904 905 var applied []string 906 var skipped []string 907 for _, name := range fingerprint.BuiltinFingerprints() { 908 // Skip modules that are not in the whitelist if it is enabled. 909 if _, ok := whitelist[name]; whitelistEnabled && !ok { 910 skipped = append(skipped, name) 911 continue 912 } 913 // Skip modules that are in the blacklist 914 if _, ok := blacklist[name]; ok { 915 skipped = append(skipped, name) 916 continue 917 } 918 f, err := fingerprint.NewFingerprint(name, c.logger) 919 if err != nil { 920 return err 921 } 922 923 c.configLock.Lock() 924 applies, err := f.Fingerprint(c.config, c.config.Node) 925 c.configLock.Unlock() 926 if err != nil { 927 return err 928 } 929 if applies { 930 applied = append(applied, name) 931 } 932 p, period := f.Periodic() 933 if p { 934 // TODO: If more periodic fingerprinters are added, then 935 // fingerprintPeriodic should be used to handle all the periodic 936 // fingerprinters by using a priority queue. 
937 go c.fingerprintPeriodic(name, f, period) 938 } 939 } 940 c.logger.Printf("[DEBUG] client: applied fingerprints %v", applied) 941 if len(skipped) != 0 { 942 c.logger.Printf("[DEBUG] client: fingerprint modules skipped due to white/blacklist: %v", skipped) 943 } 944 return nil 945 } 946 947 // fingerprintPeriodic runs a fingerprinter at the specified duration. 948 func (c *Client) fingerprintPeriodic(name string, f fingerprint.Fingerprint, d time.Duration) { 949 c.logger.Printf("[DEBUG] client: fingerprinting %v every %v", name, d) 950 for { 951 select { 952 case <-time.After(d): 953 c.configLock.Lock() 954 if _, err := f.Fingerprint(c.config, c.config.Node); err != nil { 955 c.logger.Printf("[DEBUG] client: periodic fingerprinting for %v failed: %v", name, err) 956 } 957 c.configLock.Unlock() 958 case <-c.shutdownCh: 959 return 960 } 961 } 962 } 963 964 // setupDrivers is used to find the available drivers 965 func (c *Client) setupDrivers() error { 966 // Build the white/blacklists of drivers. 967 whitelist := c.config.ReadStringListToMap("driver.whitelist") 968 whitelistEnabled := len(whitelist) > 0 969 blacklist := c.config.ReadStringListToMap("driver.blacklist") 970 971 var avail []string 972 var skipped []string 973 driverCtx := driver.NewDriverContext("", "", c.config, c.config.Node, c.logger, nil) 974 for name := range driver.BuiltinDrivers { 975 // Skip fingerprinting drivers that are not in the whitelist if it is 976 // enabled. 
977 if _, ok := whitelist[name]; whitelistEnabled && !ok { 978 skipped = append(skipped, name) 979 continue 980 } 981 // Skip fingerprinting drivers that are in the blacklist 982 if _, ok := blacklist[name]; ok { 983 skipped = append(skipped, name) 984 continue 985 } 986 987 d, err := driver.NewDriver(name, driverCtx) 988 if err != nil { 989 return err 990 } 991 c.configLock.Lock() 992 applies, err := d.Fingerprint(c.config, c.config.Node) 993 c.configLock.Unlock() 994 if err != nil { 995 return err 996 } 997 if applies { 998 avail = append(avail, name) 999 } 1000 1001 p, period := d.Periodic() 1002 if p { 1003 go c.fingerprintPeriodic(name, d, period) 1004 } 1005 1006 } 1007 1008 c.logger.Printf("[DEBUG] client: available drivers %v", avail) 1009 1010 if len(skipped) != 0 { 1011 c.logger.Printf("[DEBUG] client: drivers skipped due to white/blacklist: %v", skipped) 1012 } 1013 1014 return nil 1015 } 1016 1017 // retryIntv calculates a retry interval value given the base 1018 func (c *Client) retryIntv(base time.Duration) time.Duration { 1019 if c.config.DevMode { 1020 return devModeRetryIntv 1021 } 1022 return base + lib.RandomStagger(base) 1023 } 1024 1025 // registerAndHeartbeat is a long lived goroutine used to register the client 1026 // and then start heartbeatng to the server. 1027 func (c *Client) registerAndHeartbeat() { 1028 // Before registering capture the hashes of the Node's attribute and 1029 // metadata maps. The hashes may be out of date with what registers but this 1030 // is okay since the loop checking for node updates will detect this and 1031 // reregister. This is necessary to avoid races between the periodic 1032 // fingerprinters and the node registering. 1033 attrHash, metaHash, err := nodeMapHashes(c.Node()) 1034 if err != nil { 1035 c.logger.Printf("[ERR] client: failed to determine initial node hashes. 
May result in stale node being registered: %v", err) 1036 } 1037 1038 // Register the node 1039 c.retryRegisterNode() 1040 1041 // Start watching changes for node changes 1042 go c.watchNodeUpdates(attrHash, metaHash) 1043 1044 // Setup the heartbeat timer, for the initial registration 1045 // we want to do this quickly. We want to do it extra quickly 1046 // in development mode. 1047 var heartbeat <-chan time.Time 1048 if c.config.DevMode { 1049 heartbeat = time.After(0) 1050 } else { 1051 heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger)) 1052 } 1053 1054 for { 1055 select { 1056 case <-c.serversDiscoveredCh: 1057 case <-heartbeat: 1058 case <-c.shutdownCh: 1059 return 1060 } 1061 1062 if err := c.updateNodeStatus(); err != nil { 1063 // The servers have changed such that this node has not been 1064 // registered before 1065 if strings.Contains(err.Error(), "node not found") { 1066 // Re-register the node 1067 c.logger.Printf("[INFO] client: re-registering node") 1068 c.retryRegisterNode() 1069 heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger)) 1070 } else { 1071 intv := c.retryIntv(registerRetryIntv) 1072 c.logger.Printf("[ERR] client: heartbeating failed. 
Retrying in %v: %v", intv, err) 1073 heartbeat = time.After(intv) 1074 1075 // if heartbeating fails, trigger Consul discovery 1076 c.triggerDiscovery() 1077 } 1078 } else { 1079 c.heartbeatLock.Lock() 1080 heartbeat = time.After(c.heartbeatTTL) 1081 c.heartbeatLock.Unlock() 1082 } 1083 } 1084 } 1085 1086 // periodicSnapshot is a long lived goroutine used to periodically snapshot the 1087 // state of the client 1088 func (c *Client) periodicSnapshot() { 1089 // Create a snapshot timer 1090 snapshot := time.After(stateSnapshotIntv) 1091 1092 for { 1093 select { 1094 case <-snapshot: 1095 snapshot = time.After(stateSnapshotIntv) 1096 if err := c.saveState(); err != nil { 1097 c.logger.Printf("[ERR] client: failed to save state: %v", err) 1098 } 1099 1100 case <-c.shutdownCh: 1101 return 1102 } 1103 } 1104 } 1105 1106 // run is a long lived goroutine used to run the client 1107 func (c *Client) run() { 1108 // Watch for changes in allocations 1109 allocUpdates := make(chan *allocUpdates, 8) 1110 go c.watchAllocations(allocUpdates) 1111 1112 for { 1113 select { 1114 case update := <-allocUpdates: 1115 c.runAllocs(update) 1116 1117 case <-c.shutdownCh: 1118 return 1119 } 1120 } 1121 } 1122 1123 // nodeMapHashes returns the hashes of the passed Node's attribute and metadata 1124 // maps. 1125 func nodeMapHashes(node *structs.Node) (attrHash, metaHash uint64, err error) { 1126 attrHash, err = hashstructure.Hash(node.Attributes, nil) 1127 if err != nil { 1128 return 0, 0, fmt.Errorf("unable to calculate node attributes hash: %v", err) 1129 } 1130 // Calculate node meta map hash 1131 metaHash, err = hashstructure.Hash(node.Meta, nil) 1132 if err != nil { 1133 return 0, 0, fmt.Errorf("unable to calculate node meta hash: %v", err) 1134 } 1135 return attrHash, metaHash, nil 1136 } 1137 1138 // hasNodeChanged calculates a hash for the node attributes- and meta map. 
1139 // The new hash values are compared against the old (passed-in) hash values to 1140 // determine if the node properties have changed. It returns the new hash values 1141 // in case they are different from the old hash values. 1142 func (c *Client) hasNodeChanged(oldAttrHash uint64, oldMetaHash uint64) (bool, uint64, uint64) { 1143 c.configLock.RLock() 1144 defer c.configLock.RUnlock() 1145 1146 // Check if the Node that is being updated by fingerprinters has changed. 1147 newAttrHash, newMetaHash, err := nodeMapHashes(c.config.Node) 1148 if err != nil { 1149 c.logger.Printf("[DEBUG] client: unable to calculate node hashes: %v", err) 1150 } 1151 if newAttrHash != oldAttrHash || newMetaHash != oldMetaHash { 1152 return true, newAttrHash, newMetaHash 1153 } 1154 return false, oldAttrHash, oldMetaHash 1155 } 1156 1157 // retryRegisterNode is used to register the node or update the registration and 1158 // retry in case of failure. 1159 func (c *Client) retryRegisterNode() { 1160 for { 1161 err := c.registerNode() 1162 if err == nil { 1163 // Registered! 1164 return 1165 } 1166 1167 if err == noServersErr { 1168 c.logger.Print("[DEBUG] client: registration waiting on servers") 1169 c.triggerDiscovery() 1170 } else { 1171 c.logger.Printf("[ERR] client: registration failure: %v", err) 1172 } 1173 select { 1174 case <-c.serversDiscoveredCh: 1175 case <-time.After(c.retryIntv(registerRetryIntv)): 1176 case <-c.shutdownCh: 1177 return 1178 } 1179 } 1180 } 1181 1182 // registerNode is used to register the node or update the registration 1183 func (c *Client) registerNode() error { 1184 node := c.Node() 1185 req := structs.NodeRegisterRequest{ 1186 Node: node, 1187 WriteRequest: structs.WriteRequest{Region: c.Region()}, 1188 } 1189 var resp structs.NodeUpdateResponse 1190 if err := c.RPC("Node.Register", &req, &resp); err != nil { 1191 return err 1192 } 1193 1194 // Update the node status to ready after we register. 
1195 c.configLock.Lock() 1196 node.Status = structs.NodeStatusReady 1197 c.configLock.Unlock() 1198 1199 c.logger.Printf("[INFO] client: node registration complete") 1200 if len(resp.EvalIDs) != 0 { 1201 c.logger.Printf("[DEBUG] client: %d evaluations triggered by node registration", len(resp.EvalIDs)) 1202 } 1203 1204 c.heartbeatLock.Lock() 1205 defer c.heartbeatLock.Unlock() 1206 c.lastHeartbeat = time.Now() 1207 c.heartbeatTTL = resp.HeartbeatTTL 1208 return nil 1209 } 1210 1211 // updateNodeStatus is used to heartbeat and update the status of the node 1212 func (c *Client) updateNodeStatus() error { 1213 start := time.Now() 1214 req := structs.NodeUpdateStatusRequest{ 1215 NodeID: c.NodeID(), 1216 Status: structs.NodeStatusReady, 1217 WriteRequest: structs.WriteRequest{Region: c.Region()}, 1218 } 1219 var resp structs.NodeUpdateResponse 1220 if err := c.RPC("Node.UpdateStatus", &req, &resp); err != nil { 1221 c.triggerDiscovery() 1222 return fmt.Errorf("failed to update status: %v", err) 1223 } 1224 end := time.Now() 1225 1226 if len(resp.EvalIDs) != 0 { 1227 c.logger.Printf("[DEBUG] client: %d evaluations triggered by node update", len(resp.EvalIDs)) 1228 } 1229 1230 // Update the last heartbeat and the new TTL, capturing the old values 1231 c.heartbeatLock.Lock() 1232 last := c.lastHeartbeat 1233 oldTTL := c.heartbeatTTL 1234 haveHeartbeated := c.haveHeartbeated 1235 c.lastHeartbeat = time.Now() 1236 c.heartbeatTTL = resp.HeartbeatTTL 1237 c.haveHeartbeated = true 1238 c.heartbeatLock.Unlock() 1239 c.logger.Printf("[TRACE] client: next heartbeat in %v", resp.HeartbeatTTL) 1240 1241 if resp.Index != 0 { 1242 c.logger.Printf("[DEBUG] client: state updated to %s", req.Status) 1243 1244 // We have potentially missed our TTL log how delayed we were 1245 if haveHeartbeated { 1246 c.logger.Printf("[WARN] client: heartbeat missed (request took %v). 
Heartbeat TTL was %v and heartbeated after %v", 1247 end.Sub(start), oldTTL, time.Since(last)) 1248 } 1249 } 1250 1251 // Convert []*NodeServerInfo to []*endpoints 1252 localdc := c.Datacenter() 1253 servers := make(endpoints, 0, len(resp.Servers)) 1254 for _, s := range resp.Servers { 1255 addr, err := resolveServer(s.RPCAdvertiseAddr) 1256 if err != nil { 1257 c.logger.Printf("[WARN] client: ignoring invalid server %q: %v", s.RPCAdvertiseAddr, err) 1258 continue 1259 } 1260 e := endpoint{name: s.RPCAdvertiseAddr, addr: addr} 1261 if s.Datacenter != localdc { 1262 // server is non-local; de-prioritize 1263 e.priority = 1 1264 } 1265 servers = append(servers, &e) 1266 } 1267 if len(servers) == 0 { 1268 return fmt.Errorf("server returned no valid servers") 1269 } 1270 c.servers.set(servers) 1271 1272 // Begin polling Consul if there is no Nomad leader. We could be 1273 // heartbeating to a Nomad server that is in the minority of a 1274 // partition of the Nomad server quorum, but this Nomad Agent still 1275 // has connectivity to the existing majority of Nomad Servers, but 1276 // only if it queries Consul. 1277 if resp.LeaderRPCAddr == "" { 1278 c.triggerDiscovery() 1279 } 1280 1281 return nil 1282 } 1283 1284 // updateAllocStatus is used to update the status of an allocation 1285 func (c *Client) updateAllocStatus(alloc *structs.Allocation) { 1286 if alloc.Terminated() { 1287 // Terminated, mark for GC if we're still tracking this alloc 1288 // runner. If it's not being tracked that means the server has 1289 // already GC'd it (see removeAlloc). 1290 c.allocLock.RLock() 1291 ar, ok := c.allocs[alloc.ID] 1292 c.allocLock.RUnlock() 1293 1294 if ok { 1295 c.garbageCollector.MarkForCollection(ar) 1296 1297 // Trigger a GC in case we're over thresholds and just 1298 // waiting for eligible allocs. 1299 c.garbageCollector.Trigger() 1300 } 1301 } 1302 1303 // Strip all the information that can be reconstructed at the server. 
Only 1304 // send the fields that are updatable by the client. 1305 stripped := new(structs.Allocation) 1306 stripped.ID = alloc.ID 1307 stripped.NodeID = c.NodeID() 1308 stripped.TaskStates = alloc.TaskStates 1309 stripped.ClientStatus = alloc.ClientStatus 1310 stripped.ClientDescription = alloc.ClientDescription 1311 stripped.DeploymentStatus = alloc.DeploymentStatus 1312 1313 select { 1314 case c.allocUpdates <- stripped: 1315 case <-c.shutdownCh: 1316 } 1317 } 1318 1319 // allocSync is a long lived function that batches allocation updates to the 1320 // server. 1321 func (c *Client) allocSync() { 1322 staggered := false 1323 syncTicker := time.NewTicker(allocSyncIntv) 1324 updates := make(map[string]*structs.Allocation) 1325 for { 1326 select { 1327 case <-c.shutdownCh: 1328 syncTicker.Stop() 1329 return 1330 case alloc := <-c.allocUpdates: 1331 // Batch the allocation updates until the timer triggers. 1332 updates[alloc.ID] = alloc 1333 case <-syncTicker.C: 1334 // Fast path if there are no updates 1335 if len(updates) == 0 { 1336 continue 1337 } 1338 1339 sync := make([]*structs.Allocation, 0, len(updates)) 1340 for _, alloc := range updates { 1341 sync = append(sync, alloc) 1342 } 1343 1344 // Send to server. 1345 args := structs.AllocUpdateRequest{ 1346 Alloc: sync, 1347 WriteRequest: structs.WriteRequest{Region: c.Region()}, 1348 } 1349 1350 var resp structs.GenericResponse 1351 if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil { 1352 c.logger.Printf("[ERR] client: failed to update allocations: %v", err) 1353 syncTicker.Stop() 1354 syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv)) 1355 staggered = true 1356 } else { 1357 updates = make(map[string]*structs.Allocation) 1358 if staggered { 1359 syncTicker.Stop() 1360 syncTicker = time.NewTicker(allocSyncIntv) 1361 staggered = false 1362 } 1363 } 1364 } 1365 } 1366 } 1367 1368 // allocUpdates holds the results of receiving updated allocations from the 1369 // servers. 
1370 type allocUpdates struct { 1371 // pulled is the set of allocations that were downloaded from the servers. 1372 pulled map[string]*structs.Allocation 1373 1374 // filtered is the set of allocations that were not pulled because their 1375 // AllocModifyIndex didn't change. 1376 filtered map[string]struct{} 1377 1378 // migrateTokens are a list of tokens necessary for when clients pull data 1379 // from authorized volumes 1380 migrateTokens map[string]string 1381 } 1382 1383 // watchAllocations is used to scan for updates to allocations 1384 func (c *Client) watchAllocations(updates chan *allocUpdates) { 1385 // The request and response for getting the map of allocations that should 1386 // be running on the Node to their AllocModifyIndex which is incremented 1387 // when the allocation is updated by the servers. 1388 req := structs.NodeSpecificRequest{ 1389 NodeID: c.NodeID(), 1390 SecretID: c.secretNodeID(), 1391 QueryOptions: structs.QueryOptions{ 1392 Region: c.Region(), 1393 AllowStale: true, 1394 }, 1395 } 1396 var resp structs.NodeClientAllocsResponse 1397 1398 // The request and response for pulling down the set of allocations that are 1399 // new, or updated server side. 1400 allocsReq := structs.AllocsGetRequest{ 1401 QueryOptions: structs.QueryOptions{ 1402 Region: c.Region(), 1403 AllowStale: true, 1404 }, 1405 } 1406 var allocsResp structs.AllocsGetResponse 1407 1408 OUTER: 1409 for { 1410 // Get the allocation modify index map, blocking for updates. We will 1411 // use this to determine exactly what allocations need to be downloaded 1412 // in full. 1413 resp = structs.NodeClientAllocsResponse{} 1414 err := c.RPC("Node.GetClientAllocs", &req, &resp) 1415 if err != nil { 1416 // Shutdown often causes EOF errors, so check for shutdown first 1417 select { 1418 case <-c.shutdownCh: 1419 return 1420 default: 1421 } 1422 1423 // COMPAT: Remove in 0.6. 
This is to allow the case in which the 1424 // servers are not fully upgraded before the clients register. This 1425 // can cause the SecretID to be lost 1426 if strings.Contains(err.Error(), "node secret ID does not match") { 1427 c.logger.Printf("[DEBUG] client: re-registering node as there was a secret ID mismatch: %v", err) 1428 c.retryRegisterNode() 1429 } else if err != noServersErr { 1430 c.logger.Printf("[ERR] client: failed to query for node allocations: %v", err) 1431 } 1432 retry := c.retryIntv(getAllocRetryIntv) 1433 select { 1434 case <-c.serversDiscoveredCh: 1435 continue 1436 case <-time.After(retry): 1437 continue 1438 case <-c.shutdownCh: 1439 return 1440 } 1441 } 1442 1443 // Check for shutdown 1444 select { 1445 case <-c.shutdownCh: 1446 return 1447 default: 1448 } 1449 1450 // Filter all allocations whose AllocModifyIndex was not incremented. 1451 // These are the allocations who have either not been updated, or whose 1452 // updates are a result of the client sending an update for the alloc. 1453 // This lets us reduce the network traffic to the server as we don't 1454 // need to pull all the allocations. 1455 var pull []string 1456 filtered := make(map[string]struct{}) 1457 runners := c.getAllocRunners() 1458 var pullIndex uint64 1459 for allocID, modifyIndex := range resp.Allocs { 1460 // Pull the allocation if we don't have an alloc runner for the 1461 // allocation or if the alloc runner requires an updated allocation. 1462 runner, ok := runners[allocID] 1463 1464 if !ok || runner.shouldUpdate(modifyIndex) { 1465 // Only pull allocs that are required. Filtered 1466 // allocs might be at a higher index, so ignore 1467 // it. 1468 if modifyIndex > pullIndex { 1469 pullIndex = modifyIndex 1470 } 1471 pull = append(pull, allocID) 1472 } else { 1473 filtered[allocID] = struct{}{} 1474 } 1475 } 1476 1477 // Pull the allocations that passed filtering. 
1478 allocsResp.Allocs = nil 1479 var pulledAllocs map[string]*structs.Allocation 1480 if len(pull) != 0 { 1481 // Pull the allocations that need to be updated. 1482 allocsReq.AllocIDs = pull 1483 allocsReq.MinQueryIndex = pullIndex - 1 1484 allocsResp = structs.AllocsGetResponse{} 1485 if err := c.RPC("Alloc.GetAllocs", &allocsReq, &allocsResp); err != nil { 1486 c.logger.Printf("[ERR] client: failed to query updated allocations: %v", err) 1487 retry := c.retryIntv(getAllocRetryIntv) 1488 select { 1489 case <-c.serversDiscoveredCh: 1490 continue 1491 case <-time.After(retry): 1492 continue 1493 case <-c.shutdownCh: 1494 return 1495 } 1496 } 1497 1498 // Ensure that we received all the allocations we wanted 1499 pulledAllocs = make(map[string]*structs.Allocation, len(allocsResp.Allocs)) 1500 for _, alloc := range allocsResp.Allocs { 1501 pulledAllocs[alloc.ID] = alloc 1502 } 1503 1504 for _, desiredID := range pull { 1505 if _, ok := pulledAllocs[desiredID]; !ok { 1506 // We didn't get everything we wanted. Do not update the 1507 // MinQueryIndex, sleep and then retry. 1508 wait := c.retryIntv(2 * time.Second) 1509 select { 1510 case <-time.After(wait): 1511 // Wait for the server we contact to receive the 1512 // allocations 1513 continue OUTER 1514 case <-c.shutdownCh: 1515 return 1516 } 1517 } 1518 } 1519 1520 // Check for shutdown 1521 select { 1522 case <-c.shutdownCh: 1523 return 1524 default: 1525 } 1526 } 1527 1528 c.logger.Printf("[DEBUG] client: updated allocations at index %d (total %d) (pulled %d) (filtered %d)", 1529 resp.Index, len(resp.Allocs), len(allocsResp.Allocs), len(filtered)) 1530 1531 // Update the query index. 1532 if resp.Index > req.MinQueryIndex { 1533 req.MinQueryIndex = resp.Index 1534 } 1535 1536 // Push the updates. 
1537 update := &allocUpdates{ 1538 filtered: filtered, 1539 pulled: pulledAllocs, 1540 migrateTokens: resp.MigrateTokens, 1541 } 1542 select { 1543 case updates <- update: 1544 case <-c.shutdownCh: 1545 return 1546 } 1547 } 1548 } 1549 1550 // watchNodeUpdates periodically checks for changes to the node attributes or 1551 // meta map. The passed hashes are the initial hash values for the attribute and 1552 // metadata of the node respectively. 1553 func (c *Client) watchNodeUpdates(attrHash, metaHash uint64) { 1554 c.logger.Printf("[DEBUG] client: periodically checking for node changes at duration %v", nodeUpdateRetryIntv) 1555 1556 var changed bool 1557 for { 1558 select { 1559 case <-time.After(c.retryIntv(nodeUpdateRetryIntv)): 1560 changed, attrHash, metaHash = c.hasNodeChanged(attrHash, metaHash) 1561 if changed { 1562 c.logger.Printf("[DEBUG] client: state changed, updating node.") 1563 1564 // Update the config copy. 1565 c.configLock.Lock() 1566 node := c.config.Node.Copy() 1567 c.configCopy.Node = node 1568 c.configLock.Unlock() 1569 1570 c.retryRegisterNode() 1571 } 1572 case <-c.shutdownCh: 1573 return 1574 } 1575 } 1576 } 1577 1578 // runAllocs is invoked when we get an updated set of allocations 1579 func (c *Client) runAllocs(update *allocUpdates) { 1580 // Get the existing allocs 1581 c.allocLock.RLock() 1582 exist := make([]*structs.Allocation, 0, len(c.allocs)) 1583 for _, ar := range c.allocs { 1584 exist = append(exist, ar.alloc) 1585 } 1586 c.allocLock.RUnlock() 1587 1588 // Diff the existing and updated allocations 1589 diff := diffAllocs(exist, update) 1590 c.logger.Printf("[DEBUG] client: %#v", diff) 1591 1592 // Remove the old allocations 1593 for _, remove := range diff.removed { 1594 c.removeAlloc(remove) 1595 } 1596 1597 // Update the existing allocations 1598 for _, update := range diff.updated { 1599 if err := c.updateAlloc(update.exist, update.updated); err != nil { 1600 c.logger.Printf("[ERR] client: failed to update alloc %q: %v", 
1601 update.exist.ID, err) 1602 } 1603 } 1604 1605 // Make room for new allocations before running 1606 if err := c.garbageCollector.MakeRoomFor(diff.added); err != nil { 1607 c.logger.Printf("[ERR] client: error making room for new allocations: %v", err) 1608 } 1609 1610 // Start the new allocations 1611 for _, add := range diff.added { 1612 migrateToken := update.migrateTokens[add.ID] 1613 if err := c.addAlloc(add, migrateToken); err != nil { 1614 c.logger.Printf("[ERR] client: failed to add alloc '%s': %v", 1615 add.ID, err) 1616 } 1617 } 1618 1619 // Trigger the GC once more now that new allocs are started that could 1620 // have caused thesholds to be exceeded 1621 c.garbageCollector.Trigger() 1622 } 1623 1624 // removeAlloc is invoked when we should remove an allocation because it has 1625 // been removed by the server. 1626 func (c *Client) removeAlloc(alloc *structs.Allocation) { 1627 c.allocLock.Lock() 1628 ar, ok := c.allocs[alloc.ID] 1629 if !ok { 1630 c.allocLock.Unlock() 1631 c.logger.Printf("[WARN] client: missing context for alloc '%s'", alloc.ID) 1632 return 1633 } 1634 1635 // Stop tracking alloc runner as it's been GC'd by the server 1636 delete(c.allocs, alloc.ID) 1637 c.allocLock.Unlock() 1638 1639 // Ensure the GC has a reference and then collect. 
Collecting through the GC 1640 // applies rate limiting 1641 c.garbageCollector.MarkForCollection(ar) 1642 1643 // GC immediately since the server has GC'd it 1644 go c.garbageCollector.Collect(alloc.ID) 1645 } 1646 1647 // updateAlloc is invoked when we should update an allocation 1648 func (c *Client) updateAlloc(exist, update *structs.Allocation) error { 1649 c.allocLock.RLock() 1650 ar, ok := c.allocs[exist.ID] 1651 c.allocLock.RUnlock() 1652 if !ok { 1653 c.logger.Printf("[WARN] client: missing context for alloc '%s'", exist.ID) 1654 return nil 1655 } 1656 1657 ar.Update(update) 1658 return nil 1659 } 1660 1661 // addAlloc is invoked when we should add an allocation 1662 func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error { 1663 // Check if we already have an alloc runner 1664 c.allocLock.Lock() 1665 defer c.allocLock.Unlock() 1666 if _, ok := c.allocs[alloc.ID]; ok { 1667 c.logger.Printf("[DEBUG]: client: dropping duplicate add allocation request: %q", alloc.ID) 1668 return nil 1669 } 1670 1671 // get the previous alloc runner - if one exists - for the 1672 // blocking/migrating watcher 1673 var prevAR *AllocRunner 1674 if alloc.PreviousAllocation != "" { 1675 prevAR = c.allocs[alloc.PreviousAllocation] 1676 } 1677 1678 c.configLock.RLock() 1679 prevAlloc := newAllocWatcher(alloc, prevAR, c, c.configCopy, c.logger, migrateToken) 1680 1681 ar := NewAllocRunner(c.logger, c.configCopy, c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, prevAlloc) 1682 c.configLock.RUnlock() 1683 1684 // Store the alloc runner. 1685 c.allocs[alloc.ID] = ar 1686 1687 if err := ar.SaveState(); err != nil { 1688 c.logger.Printf("[WARN] client: initial save state for alloc %q failed: %v", alloc.ID, err) 1689 } 1690 1691 go ar.Run() 1692 return nil 1693 } 1694 1695 // setupVaultClient creates an object to periodically renew tokens and secrets 1696 // with vault. 
1697 func (c *Client) setupVaultClient() error { 1698 var err error 1699 c.vaultClient, err = vaultclient.NewVaultClient(c.config.VaultConfig, c.logger, c.deriveToken) 1700 if err != nil { 1701 return err 1702 } 1703 1704 if c.vaultClient == nil { 1705 c.logger.Printf("[ERR] client: failed to create vault client") 1706 return fmt.Errorf("failed to create vault client") 1707 } 1708 1709 // Start renewing tokens and secrets 1710 c.vaultClient.Start() 1711 1712 return nil 1713 } 1714 1715 // deriveToken takes in an allocation and a set of tasks and derives vault 1716 // tokens for each of the tasks, unwraps all of them using the supplied vault 1717 // client and returns a map of unwrapped tokens, indexed by the task name. 1718 func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vclient *vaultapi.Client) (map[string]string, error) { 1719 if alloc == nil { 1720 return nil, fmt.Errorf("nil allocation") 1721 } 1722 1723 if taskNames == nil || len(taskNames) == 0 { 1724 return nil, fmt.Errorf("missing task names") 1725 } 1726 1727 group := alloc.Job.LookupTaskGroup(alloc.TaskGroup) 1728 if group == nil { 1729 return nil, fmt.Errorf("group name in allocation is not present in job") 1730 } 1731 1732 verifiedTasks := []string{} 1733 // Check if the given task names actually exist in the allocation 1734 for _, taskName := range taskNames { 1735 found := false 1736 for _, task := range group.Tasks { 1737 if task.Name == taskName { 1738 found = true 1739 } 1740 } 1741 if !found { 1742 c.logger.Printf("[ERR] task %q not found in the allocation", taskName) 1743 return nil, fmt.Errorf("task %q not found in the allocaition", taskName) 1744 } 1745 verifiedTasks = append(verifiedTasks, taskName) 1746 } 1747 1748 // DeriveVaultToken of nomad server can take in a set of tasks and 1749 // creates tokens for all the tasks. 
1750 req := &structs.DeriveVaultTokenRequest{ 1751 NodeID: c.NodeID(), 1752 SecretID: c.secretNodeID(), 1753 AllocID: alloc.ID, 1754 Tasks: verifiedTasks, 1755 QueryOptions: structs.QueryOptions{ 1756 Region: c.Region(), 1757 AllowStale: false, 1758 }, 1759 } 1760 1761 // Derive the tokens 1762 var resp structs.DeriveVaultTokenResponse 1763 if err := c.RPC("Node.DeriveVaultToken", &req, &resp); err != nil { 1764 c.logger.Printf("[ERR] client.vault: DeriveVaultToken RPC failed: %v", err) 1765 return nil, fmt.Errorf("DeriveVaultToken RPC failed: %v", err) 1766 } 1767 if resp.Error != nil { 1768 c.logger.Printf("[ERR] client.vault: failed to derive vault tokens: %v", resp.Error) 1769 return nil, resp.Error 1770 } 1771 if resp.Tasks == nil { 1772 c.logger.Printf("[ERR] client.vault: failed to derive vault token: invalid response") 1773 return nil, fmt.Errorf("failed to derive vault tokens: invalid response") 1774 } 1775 1776 unwrappedTokens := make(map[string]string) 1777 1778 // Retrieve the wrapped tokens from the response and unwrap it 1779 for _, taskName := range verifiedTasks { 1780 // Get the wrapped token 1781 wrappedToken, ok := resp.Tasks[taskName] 1782 if !ok { 1783 c.logger.Printf("[ERR] client.vault: wrapped token missing for task %q", taskName) 1784 return nil, fmt.Errorf("wrapped token missing for task %q", taskName) 1785 } 1786 1787 // Unwrap the vault token 1788 unwrapResp, err := vclient.Logical().Unwrap(wrappedToken) 1789 if err != nil { 1790 return nil, fmt.Errorf("failed to unwrap the token for task %q: %v", taskName, err) 1791 } 1792 if unwrapResp == nil || unwrapResp.Auth == nil || unwrapResp.Auth.ClientToken == "" { 1793 return nil, fmt.Errorf("failed to unwrap the token for task %q", taskName) 1794 } 1795 1796 // Append the unwrapped token to the return value 1797 unwrappedTokens[taskName] = unwrapResp.Auth.ClientToken 1798 } 1799 1800 return unwrappedTokens, nil 1801 } 1802 1803 // triggerDiscovery causes a Consul discovery to begin (if one 
hasn't alread) 1804 func (c *Client) triggerDiscovery() { 1805 select { 1806 case c.triggerDiscoveryCh <- struct{}{}: 1807 // Discovery goroutine was released to execute 1808 default: 1809 // Discovery goroutine was already running 1810 } 1811 } 1812 1813 // consulDiscovery waits for the signal to attempt server discovery via Consul. 1814 // It's intended to be started in a goroutine. See triggerDiscovery() for 1815 // causing consul discovery from other code locations. 1816 func (c *Client) consulDiscovery() { 1817 for { 1818 select { 1819 case <-c.triggerDiscoveryCh: 1820 if err := c.consulDiscoveryImpl(); err != nil { 1821 c.logger.Printf("[ERR] client.consul: error discovering nomad servers: %v", err) 1822 } 1823 case <-c.shutdownCh: 1824 return 1825 } 1826 } 1827 } 1828 1829 func (c *Client) consulDiscoveryImpl() error { 1830 // Acquire heartbeat lock to prevent heartbeat from running 1831 // concurrently with discovery. Concurrent execution is safe, however 1832 // discovery is usually triggered when heartbeating has failed so 1833 // there's no point in allowing it. 1834 c.heartbeatLock.Lock() 1835 defer c.heartbeatLock.Unlock() 1836 1837 dcs, err := c.consulCatalog.Datacenters() 1838 if err != nil { 1839 return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err) 1840 } 1841 if len(dcs) > 2 { 1842 // Query the local DC first, then shuffle the 1843 // remaining DCs. Future heartbeats will cause Nomad 1844 // Clients to fixate on their local datacenter so 1845 // it's okay to talk with remote DCs. If the no 1846 // Nomad servers are available within 1847 // datacenterQueryLimit, the next heartbeat will pick 1848 // a new set of servers so it's okay. 
1849 shuffleStrings(dcs[1:]) 1850 dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)] 1851 } 1852 1853 // Query for servers in this client's region only 1854 region := c.Region() 1855 rpcargs := structs.GenericRequest{ 1856 QueryOptions: structs.QueryOptions{ 1857 Region: region, 1858 }, 1859 } 1860 1861 serviceName := c.configCopy.ConsulConfig.ServerServiceName 1862 var mErr multierror.Error 1863 var servers endpoints 1864 c.logger.Printf("[DEBUG] client.consul: bootstrap contacting following Consul DCs: %+q", dcs) 1865 DISCOLOOP: 1866 for _, dc := range dcs { 1867 consulOpts := &consulapi.QueryOptions{ 1868 AllowStale: true, 1869 Datacenter: dc, 1870 Near: "_agent", 1871 WaitTime: consul.DefaultQueryWaitDuration, 1872 } 1873 consulServices, _, err := c.consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts) 1874 if err != nil { 1875 mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err)) 1876 continue 1877 } 1878 1879 for _, s := range consulServices { 1880 port := strconv.Itoa(s.ServicePort) 1881 addrstr := s.ServiceAddress 1882 if addrstr == "" { 1883 addrstr = s.Address 1884 } 1885 addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(addrstr, port)) 1886 if err != nil { 1887 mErr.Errors = append(mErr.Errors, err) 1888 continue 1889 } 1890 var peers []string 1891 if err := c.connPool.RPC(region, addr, c.RPCMajorVersion(), "Status.Peers", rpcargs, &peers); err != nil { 1892 mErr.Errors = append(mErr.Errors, err) 1893 continue 1894 } 1895 1896 // Successfully received the Server peers list of the correct 1897 // region 1898 for _, p := range peers { 1899 addr, err := net.ResolveTCPAddr("tcp", p) 1900 if err != nil { 1901 mErr.Errors = append(mErr.Errors, err) 1902 } 1903 servers = append(servers, &endpoint{name: p, addr: addr}) 1904 } 1905 if len(servers) > 0 { 1906 break DISCOLOOP 1907 } 1908 } 1909 } 1910 if len(servers) == 0 { 1911 if len(mErr.Errors) > 0 { 1912 
return mErr.ErrorOrNil() 1913 } 1914 return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %+q", serviceName, dcs) 1915 } 1916 1917 c.logger.Printf("[INFO] client.consul: discovered following Servers: %s", servers) 1918 c.servers.set(servers) 1919 1920 // Notify waiting rpc calls. If a goroutine just failed an RPC call and 1921 // isn't receiving on this chan yet they'll still retry eventually. 1922 // This is a shortcircuit for the longer retry intervals. 1923 for { 1924 select { 1925 case c.serversDiscoveredCh <- struct{}{}: 1926 default: 1927 return nil 1928 } 1929 } 1930 } 1931 1932 // emitStats collects host resource usage stats periodically 1933 func (c *Client) emitStats() { 1934 // Assign labels directly before emitting stats so the information expected 1935 // is ready 1936 c.baseLabels = []metrics.Label{{Name: "node_id", Value: c.NodeID()}, {Name: "datacenter", Value: c.Datacenter()}} 1937 1938 // Start collecting host stats right away and then keep collecting every 1939 // collection interval 1940 next := time.NewTimer(0) 1941 defer next.Stop() 1942 for { 1943 select { 1944 case <-next.C: 1945 err := c.hostStatsCollector.Collect() 1946 next.Reset(c.config.StatsCollectionInterval) 1947 if err != nil { 1948 c.logger.Printf("[WARN] client: error fetching host resource usage stats: %v", err) 1949 continue 1950 } 1951 1952 // Publish Node metrics if operator has opted in 1953 if c.config.PublishNodeMetrics { 1954 c.emitHostStats() 1955 } 1956 1957 c.emitClientMetrics() 1958 case <-c.shutdownCh: 1959 return 1960 } 1961 } 1962 } 1963 1964 // setGaugeForMemoryStats proxies metrics for memory specific statistics 1965 func (c *Client) setGaugeForMemoryStats(nodeID string, hStats *stats.HostStats) { 1966 if !c.config.DisableTaggedMetrics { 1967 metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "total"}, float32(hStats.Memory.Total), c.baseLabels) 1968 metrics.SetGaugeWithLabels([]string{"client", "host", "memory", 
"available"}, float32(hStats.Memory.Available), c.baseLabels) 1969 metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "used"}, float32(hStats.Memory.Used), c.baseLabels) 1970 metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "free"}, float32(hStats.Memory.Free), c.baseLabels) 1971 } 1972 1973 if c.config.BackwardsCompatibleMetrics { 1974 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total)) 1975 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available)) 1976 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used)) 1977 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "free"}, float32(hStats.Memory.Free)) 1978 } 1979 } 1980 1981 // setGaugeForCPUStats proxies metrics for CPU specific statistics 1982 func (c *Client) setGaugeForCPUStats(nodeID string, hStats *stats.HostStats) { 1983 for _, cpu := range hStats.CPU { 1984 if !c.config.DisableTaggedMetrics { 1985 labels := append(c.baseLabels, metrics.Label{ 1986 Name: "cpu", 1987 Value: cpu.CPU, 1988 }) 1989 1990 metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "total"}, float32(cpu.Total), labels) 1991 metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "user"}, float32(cpu.User), labels) 1992 metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "idle"}, float32(cpu.Idle), labels) 1993 metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "system"}, float32(cpu.System), labels) 1994 } 1995 1996 if c.config.BackwardsCompatibleMetrics { 1997 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "total"}, float32(cpu.Total)) 1998 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "user"}, float32(cpu.User)) 1999 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "idle"}, float32(cpu.Idle)) 2000 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, 
cpu.CPU, "system"}, float32(cpu.System)) 2001 } 2002 } 2003 } 2004 2005 // setGaugeForDiskStats proxies metrics for disk specific statistics 2006 func (c *Client) setGaugeForDiskStats(nodeID string, hStats *stats.HostStats) { 2007 for _, disk := range hStats.DiskStats { 2008 if !c.config.DisableTaggedMetrics { 2009 labels := append(c.baseLabels, metrics.Label{ 2010 Name: "disk", 2011 Value: disk.Device, 2012 }) 2013 2014 metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "size"}, float32(disk.Size), labels) 2015 metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used"}, float32(disk.Used), labels) 2016 metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "available"}, float32(disk.Available), labels) 2017 metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used_percent"}, float32(disk.UsedPercent), labels) 2018 metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "inodes_percent"}, float32(disk.InodesUsedPercent), labels) 2019 } 2020 2021 if c.config.BackwardsCompatibleMetrics { 2022 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "size"}, float32(disk.Size)) 2023 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used"}, float32(disk.Used)) 2024 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "available"}, float32(disk.Available)) 2025 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used_percent"}, float32(disk.UsedPercent)) 2026 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent)) 2027 } 2028 } 2029 } 2030 2031 // setGaugeForAllocationStats proxies metrics for allocation specific statistics 2032 func (c *Client) setGaugeForAllocationStats(nodeID string) { 2033 c.configLock.RLock() 2034 node := c.configCopy.Node 2035 c.configLock.RUnlock() 2036 total := node.Resources 2037 res := node.Reserved 2038 allocated := c.getAllocatedResources(node) 2039 
2040 // Emit allocated 2041 if !c.config.DisableTaggedMetrics { 2042 metrics.SetGaugeWithLabels([]string{"client", "allocated", "memory"}, float32(allocated.MemoryMB), c.baseLabels) 2043 metrics.SetGaugeWithLabels([]string{"client", "allocated", "disk"}, float32(allocated.DiskMB), c.baseLabels) 2044 metrics.SetGaugeWithLabels([]string{"client", "allocated", "cpu"}, float32(allocated.CPU), c.baseLabels) 2045 metrics.SetGaugeWithLabels([]string{"client", "allocated", "iops"}, float32(allocated.IOPS), c.baseLabels) 2046 } 2047 2048 if c.config.BackwardsCompatibleMetrics { 2049 metrics.SetGauge([]string{"client", "allocated", "memory", nodeID}, float32(allocated.MemoryMB)) 2050 metrics.SetGauge([]string{"client", "allocated", "disk", nodeID}, float32(allocated.DiskMB)) 2051 metrics.SetGauge([]string{"client", "allocated", "cpu", nodeID}, float32(allocated.CPU)) 2052 metrics.SetGauge([]string{"client", "allocated", "iops", nodeID}, float32(allocated.IOPS)) 2053 } 2054 2055 for _, n := range allocated.Networks { 2056 if !c.config.DisableTaggedMetrics { 2057 labels := append(c.baseLabels, metrics.Label{ 2058 Name: "device", 2059 Value: n.Device, 2060 }) 2061 metrics.SetGaugeWithLabels([]string{"client", "allocated", "network"}, float32(n.MBits), labels) 2062 } 2063 2064 if c.config.BackwardsCompatibleMetrics { 2065 metrics.SetGauge([]string{"client", "allocated", "network", n.Device, nodeID}, float32(n.MBits)) 2066 } 2067 } 2068 2069 // Emit unallocated 2070 unallocatedMem := total.MemoryMB - res.MemoryMB - allocated.MemoryMB 2071 unallocatedDisk := total.DiskMB - res.DiskMB - allocated.DiskMB 2072 unallocatedCpu := total.CPU - res.CPU - allocated.CPU 2073 unallocatedIops := total.IOPS - res.IOPS - allocated.IOPS 2074 2075 if !c.config.DisableTaggedMetrics { 2076 metrics.SetGaugeWithLabels([]string{"client", "unallocated", "memory"}, float32(unallocatedMem), c.baseLabels) 2077 metrics.SetGaugeWithLabels([]string{"client", "unallocated", "disk"}, float32(unallocatedDisk), 
c.baseLabels) 2078 metrics.SetGaugeWithLabels([]string{"client", "unallocated", "cpu"}, float32(unallocatedCpu), c.baseLabels) 2079 metrics.SetGaugeWithLabels([]string{"client", "unallocated", "iops"}, float32(unallocatedIops), c.baseLabels) 2080 } 2081 2082 if c.config.BackwardsCompatibleMetrics { 2083 metrics.SetGauge([]string{"client", "unallocated", "memory", nodeID}, float32(unallocatedMem)) 2084 metrics.SetGauge([]string{"client", "unallocated", "disk", nodeID}, float32(unallocatedDisk)) 2085 metrics.SetGauge([]string{"client", "unallocated", "cpu", nodeID}, float32(unallocatedCpu)) 2086 metrics.SetGauge([]string{"client", "unallocated", "iops", nodeID}, float32(unallocatedIops)) 2087 } 2088 2089 for _, n := range allocated.Networks { 2090 totalIdx := total.NetIndex(n) 2091 if totalIdx != -1 { 2092 continue 2093 } 2094 2095 totalMbits := total.Networks[totalIdx].MBits 2096 unallocatedMbits := totalMbits - n.MBits 2097 2098 if !c.config.DisableTaggedMetrics { 2099 labels := append(c.baseLabels, metrics.Label{ 2100 Name: "device", 2101 Value: n.Device, 2102 }) 2103 metrics.SetGaugeWithLabels([]string{"client", "unallocated", "network"}, float32(unallocatedMbits), labels) 2104 } 2105 2106 if c.config.BackwardsCompatibleMetrics { 2107 metrics.SetGauge([]string{"client", "unallocated", "network", n.Device, nodeID}, float32(unallocatedMbits)) 2108 } 2109 } 2110 } 2111 2112 // No lables are required so we emit with only a key/value syntax 2113 func (c *Client) setGaugeForUptime(hStats *stats.HostStats) { 2114 if !c.config.DisableTaggedMetrics { 2115 metrics.SetGaugeWithLabels([]string{"uptime"}, float32(hStats.Uptime), c.baseLabels) 2116 } 2117 if c.config.BackwardsCompatibleMetrics { 2118 metrics.SetGauge([]string{"uptime"}, float32(hStats.Uptime)) 2119 } 2120 } 2121 2122 // emitHostStats pushes host resource usage stats to remote metrics collection sinks 2123 func (c *Client) emitHostStats() { 2124 nodeID := c.NodeID() 2125 hStats := c.hostStatsCollector.Stats() 
2126 2127 c.setGaugeForMemoryStats(nodeID, hStats) 2128 c.setGaugeForUptime(hStats) 2129 c.setGaugeForCPUStats(nodeID, hStats) 2130 c.setGaugeForDiskStats(nodeID, hStats) 2131 } 2132 2133 // emitClientMetrics emits lower volume client metrics 2134 func (c *Client) emitClientMetrics() { 2135 nodeID := c.NodeID() 2136 2137 c.setGaugeForAllocationStats(nodeID) 2138 2139 // Emit allocation metrics 2140 blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0 2141 for _, ar := range c.getAllocRunners() { 2142 switch ar.Alloc().ClientStatus { 2143 case structs.AllocClientStatusPending: 2144 switch { 2145 case ar.IsWaiting(): 2146 blocked++ 2147 case ar.IsMigrating(): 2148 migrating++ 2149 default: 2150 pending++ 2151 } 2152 case structs.AllocClientStatusRunning: 2153 running++ 2154 case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed: 2155 terminal++ 2156 } 2157 } 2158 2159 if !c.config.DisableTaggedMetrics { 2160 metrics.SetGaugeWithLabels([]string{"client", "allocations", "migrating"}, float32(migrating), c.baseLabels) 2161 metrics.SetGaugeWithLabels([]string{"client", "allocations", "blocked"}, float32(blocked), c.baseLabels) 2162 metrics.SetGaugeWithLabels([]string{"client", "allocations", "pending"}, float32(pending), c.baseLabels) 2163 metrics.SetGaugeWithLabels([]string{"client", "allocations", "running"}, float32(running), c.baseLabels) 2164 metrics.SetGaugeWithLabels([]string{"client", "allocations", "terminal"}, float32(terminal), c.baseLabels) 2165 } 2166 2167 if c.config.BackwardsCompatibleMetrics { 2168 metrics.SetGauge([]string{"client", "allocations", "migrating", nodeID}, float32(migrating)) 2169 metrics.SetGauge([]string{"client", "allocations", "blocked", nodeID}, float32(blocked)) 2170 metrics.SetGauge([]string{"client", "allocations", "pending", nodeID}, float32(pending)) 2171 metrics.SetGauge([]string{"client", "allocations", "running", nodeID}, float32(running)) 2172 metrics.SetGauge([]string{"client", "allocations", 
"terminal", nodeID}, float32(terminal)) 2173 } 2174 } 2175 2176 func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resources { 2177 // Unfortunately the allocs only have IP so we need to match them to the 2178 // device 2179 cidrToDevice := make(map[*net.IPNet]string, len(selfNode.Resources.Networks)) 2180 for _, n := range selfNode.Resources.Networks { 2181 _, ipnet, err := net.ParseCIDR(n.CIDR) 2182 if err != nil { 2183 continue 2184 } 2185 cidrToDevice[ipnet] = n.Device 2186 } 2187 2188 // Sum the allocated resources 2189 allocs := c.allAllocs() 2190 var allocated structs.Resources 2191 allocatedDeviceMbits := make(map[string]int) 2192 for _, alloc := range allocs { 2193 if !alloc.TerminalStatus() { 2194 allocated.Add(alloc.Resources) 2195 for _, allocatedNetwork := range alloc.Resources.Networks { 2196 for cidr, dev := range cidrToDevice { 2197 ip := net.ParseIP(allocatedNetwork.IP) 2198 if cidr.Contains(ip) { 2199 allocatedDeviceMbits[dev] += allocatedNetwork.MBits 2200 break 2201 } 2202 } 2203 } 2204 } 2205 } 2206 2207 // Clear the networks 2208 allocated.Networks = nil 2209 for dev, speed := range allocatedDeviceMbits { 2210 net := &structs.NetworkResource{ 2211 Device: dev, 2212 MBits: speed, 2213 } 2214 allocated.Networks = append(allocated.Networks, net) 2215 } 2216 2217 return &allocated 2218 } 2219 2220 // allAllocs returns all the allocations managed by the client 2221 func (c *Client) allAllocs() map[string]*structs.Allocation { 2222 ars := c.getAllocRunners() 2223 allocs := make(map[string]*structs.Allocation, len(ars)) 2224 for _, ar := range c.getAllocRunners() { 2225 a := ar.Alloc() 2226 allocs[a.ID] = a 2227 } 2228 return allocs 2229 } 2230 2231 // resolveServer given a sever's address as a string, return it's resolved 2232 // net.Addr or an error. 
//
// When s carries no port, the default client RPC port (4647) is assumed. A
// bracketed IPv6 literal without a port (for example "[::1]") has its brackets
// removed before the default port is joined on. Any other parse failure from
// net.SplitHostPort is returned to the caller unchanged.
func resolveServer(s string) (net.Addr, error) {
	const defaultClientPort = "4647" // default client RPC port
	host, port, err := net.SplitHostPort(s)
	if err != nil {
		// Classify on the failure reason alone. The formatted message of a
		// *net.AddrError also embeds the address being parsed, so matching
		// against err.Error() would let caller-supplied text decide which
		// branch is taken.
		reason := err.Error()
		if addrErr, ok := err.(*net.AddrError); ok {
			reason = addrErr.Err
		}
		if !strings.Contains(reason, "missing port") {
			return nil, err
		}

		host, port = s, defaultClientPort
		// net.JoinHostPort brackets any host containing a colon, so strip
		// brackets the caller already supplied to avoid "[[::1]]:4647".
		if strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]") {
			host = host[1 : len(host)-1]
		}
	}
	return net.ResolveTCPAddr("tcp", net.JoinHostPort(host, port))
}