github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/client/client.go (about) 1 package client 2 3 import ( 4 "errors" 5 "fmt" 6 "io/ioutil" 7 "log" 8 "net" 9 "net/rpc" 10 "os" 11 "path/filepath" 12 "sort" 13 "strconv" 14 "strings" 15 "sync" 16 "time" 17 18 metrics "github.com/armon/go-metrics" 19 "github.com/boltdb/bolt" 20 consulapi "github.com/hashicorp/consul/api" 21 "github.com/hashicorp/consul/lib" 22 multierror "github.com/hashicorp/go-multierror" 23 "github.com/hashicorp/nomad/client/allocdir" 24 "github.com/hashicorp/nomad/client/config" 25 "github.com/hashicorp/nomad/client/servers" 26 "github.com/hashicorp/nomad/client/stats" 27 cstructs "github.com/hashicorp/nomad/client/structs" 28 "github.com/hashicorp/nomad/client/vaultclient" 29 "github.com/hashicorp/nomad/command/agent/consul" 30 "github.com/hashicorp/nomad/helper" 31 "github.com/hashicorp/nomad/helper/pool" 32 hstats "github.com/hashicorp/nomad/helper/stats" 33 "github.com/hashicorp/nomad/helper/tlsutil" 34 "github.com/hashicorp/nomad/helper/uuid" 35 "github.com/hashicorp/nomad/nomad/structs" 36 nconfig "github.com/hashicorp/nomad/nomad/structs/config" 37 vaultapi "github.com/hashicorp/vault/api" 38 "github.com/shirou/gopsutil/host" 39 ) 40 41 const ( 42 // clientRPCCache controls how long we keep an idle connection 43 // open to a server 44 clientRPCCache = 5 * time.Minute 45 46 // clientMaxStreams controsl how many idle streams we keep 47 // open to a server 48 clientMaxStreams = 2 49 50 // datacenterQueryLimit searches through up to this many adjacent 51 // datacenters looking for the Nomad server service. 52 datacenterQueryLimit = 9 53 54 // registerRetryIntv is minimum interval on which we retry 55 // registration. We pick a value between this and 2x this. 56 registerRetryIntv = 15 * time.Second 57 58 // getAllocRetryIntv is minimum interval on which we retry 59 // to fetch allocations. We pick a value between this and 2x this. 
60 getAllocRetryIntv = 30 * time.Second 61 62 // devModeRetryIntv is the retry interval used for development 63 devModeRetryIntv = time.Second 64 65 // stateSnapshotIntv is how often the client snapshots state 66 stateSnapshotIntv = 60 * time.Second 67 68 // initialHeartbeatStagger is used to stagger the interval between 69 // starting and the initial heartbeat. After the initial heartbeat, 70 // we switch to using the TTL specified by the servers. 71 initialHeartbeatStagger = 10 * time.Second 72 73 // nodeUpdateRetryIntv is how often the client checks for updates to the 74 // node attributes or meta map. 75 nodeUpdateRetryIntv = 5 * time.Second 76 77 // allocSyncIntv is the batching period of allocation updates before they 78 // are synced with the server. 79 allocSyncIntv = 200 * time.Millisecond 80 81 // allocSyncRetryIntv is the interval on which we retry updating 82 // the status of the allocation 83 allocSyncRetryIntv = 5 * time.Second 84 ) 85 86 // ClientStatsReporter exposes all the APIs related to resource usage of a Nomad 87 // Client 88 type ClientStatsReporter interface { 89 // GetAllocStats returns the AllocStatsReporter for the passed allocation. 90 // If it does not exist an error is reported. 91 GetAllocStats(allocID string) (AllocStatsReporter, error) 92 93 // LatestHostStats returns the latest resource usage stats for the host 94 LatestHostStats() *stats.HostStats 95 } 96 97 // Client is used to implement the client interaction with Nomad. Clients 98 // are expected to register as a schedulable node to the servers, and to 99 // run allocations as determined by the servers. 100 type Client struct { 101 config *config.Config 102 start time.Time 103 104 // stateDB is used to efficiently store client state. 105 stateDB *bolt.DB 106 107 // configCopy is a copy that should be passed to alloc-runners. 
108 configCopy *config.Config 109 configLock sync.RWMutex 110 111 logger *log.Logger 112 113 connPool *pool.ConnPool 114 115 // tlsWrap is used to wrap outbound connections using TLS. It should be 116 // accessed using the lock. 117 tlsWrap tlsutil.RegionWrapper 118 tlsWrapLock sync.RWMutex 119 120 // servers is the list of nomad servers 121 servers *servers.Manager 122 123 // heartbeat related times for tracking how often to heartbeat 124 lastHeartbeat time.Time 125 heartbeatTTL time.Duration 126 haveHeartbeated bool 127 heartbeatLock sync.Mutex 128 129 // triggerDiscoveryCh triggers Consul discovery; see triggerDiscovery 130 triggerDiscoveryCh chan struct{} 131 132 // triggerNodeUpdate triggers the client to mark the Node as changed and 133 // update it. 134 triggerNodeUpdate chan struct{} 135 136 // discovered will be ticked whenever Consul discovery completes 137 // successfully 138 serversDiscoveredCh chan struct{} 139 140 // allocs maps alloc IDs to their AllocRunner. This map includes all 141 // AllocRunners - running and GC'd - until the server GCs them. 142 allocs map[string]*AllocRunner 143 allocLock sync.RWMutex 144 145 // allocUpdates stores allocations that need to be synced to the server. 146 allocUpdates chan *structs.Allocation 147 148 // consulService is Nomad's custom Consul client for managing services 149 // and checks. 150 consulService ConsulServiceAPI 151 152 // consulCatalog is the subset of Consul's Catalog API Nomad uses. 
153 consulCatalog consul.CatalogAPI 154 155 // HostStatsCollector collects host resource usage stats 156 hostStatsCollector *stats.HostStatsCollector 157 158 shutdown bool 159 shutdownCh chan struct{} 160 shutdownLock sync.Mutex 161 162 // vaultClient is used to interact with Vault for token and secret renewals 163 vaultClient vaultclient.VaultClient 164 165 // garbageCollector is used to garbage collect terminal allocations present 166 // in the node automatically 167 garbageCollector *AllocGarbageCollector 168 169 // clientACLResolver holds the ACL resolution state 170 clientACLResolver 171 172 // rpcServer is used to serve RPCs by the local agent. 173 rpcServer *rpc.Server 174 endpoints rpcEndpoints 175 streamingRpcs *structs.StreamingRpcRegistery 176 177 // baseLabels are used when emitting tagged metrics. All client metrics will 178 // have these tags, and optionally more. 179 baseLabels []metrics.Label 180 } 181 182 var ( 183 // noServersErr is returned by the RPC method when the client has no 184 // configured servers. This is used to trigger Consul discovery if 185 // enabled. 
186 noServersErr = errors.New("no servers") 187 ) 188 189 // NewClient is used to create a new client from the given configuration 190 func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulService ConsulServiceAPI, logger *log.Logger) (*Client, error) { 191 // Create the tls wrapper 192 var tlsWrap tlsutil.RegionWrapper 193 if cfg.TLSConfig.EnableRPC { 194 tw, err := cfg.TLSConfiguration().OutgoingTLSWrapper() 195 if err != nil { 196 return nil, err 197 } 198 tlsWrap = tw 199 } 200 201 // Create the client 202 c := &Client{ 203 config: cfg, 204 consulCatalog: consulCatalog, 205 consulService: consulService, 206 start: time.Now(), 207 connPool: pool.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, tlsWrap), 208 tlsWrap: tlsWrap, 209 streamingRpcs: structs.NewStreamingRpcRegistery(), 210 logger: logger, 211 allocs: make(map[string]*AllocRunner), 212 allocUpdates: make(chan *structs.Allocation, 64), 213 shutdownCh: make(chan struct{}), 214 triggerDiscoveryCh: make(chan struct{}), 215 triggerNodeUpdate: make(chan struct{}, 8), 216 serversDiscoveredCh: make(chan struct{}), 217 } 218 219 // Initialize the server manager 220 c.servers = servers.New(c.logger, c.shutdownCh, c) 221 222 // Initialize the client 223 if err := c.init(); err != nil { 224 return nil, fmt.Errorf("failed to initialize client: %v", err) 225 } 226 227 // Setup the clients RPC server 228 c.setupClientRpc() 229 230 // Initialize the ACL state 231 if err := c.clientACLResolver.init(); err != nil { 232 return nil, fmt.Errorf("failed to initialize ACL state: %v", err) 233 } 234 235 // Add the stats collector 236 statsCollector := stats.NewHostStatsCollector(logger, c.config.AllocDir) 237 c.hostStatsCollector = statsCollector 238 239 // Add the garbage collector 240 gcConfig := &GCConfig{ 241 MaxAllocs: cfg.GCMaxAllocs, 242 DiskUsageThreshold: cfg.GCDiskUsageThreshold, 243 InodeUsageThreshold: cfg.GCInodeUsageThreshold, 244 Interval: cfg.GCInterval, 245 ParallelDestroys: 
cfg.GCParallelDestroys, 246 ReservedDiskMB: cfg.Node.Reserved.DiskMB, 247 } 248 c.garbageCollector = NewAllocGarbageCollector(logger, statsCollector, c, gcConfig) 249 go c.garbageCollector.Run() 250 251 // Setup the node 252 if err := c.setupNode(); err != nil { 253 return nil, fmt.Errorf("node setup failed: %v", err) 254 } 255 256 fingerprintManager := NewFingerprintManager(c.GetConfig, c.config.Node, 257 c.shutdownCh, c.updateNodeFromFingerprint, c.logger) 258 259 // Fingerprint the node and scan for drivers 260 if err := fingerprintManager.Run(); err != nil { 261 return nil, fmt.Errorf("fingerprinting failed: %v", err) 262 } 263 264 // Setup the reserved resources 265 c.reservePorts() 266 267 // Store the config copy before restoring state but after it has been 268 // initialized. 269 c.configLock.Lock() 270 c.configCopy = c.config.Copy() 271 c.configLock.Unlock() 272 273 // Set the preconfigured list of static servers 274 c.configLock.RLock() 275 if len(c.configCopy.Servers) > 0 { 276 if err := c.setServersImpl(c.configCopy.Servers, true); err != nil { 277 logger.Printf("[WARN] client: None of the configured servers are valid: %v", err) 278 } 279 } 280 c.configLock.RUnlock() 281 282 // Setup Consul discovery if enabled 283 if c.configCopy.ConsulConfig.ClientAutoJoin != nil && *c.configCopy.ConsulConfig.ClientAutoJoin { 284 go c.consulDiscovery() 285 if c.servers.NumServers() == 0 { 286 // No configured servers; trigger discovery manually 287 c.triggerDiscoveryCh <- struct{}{} 288 } 289 } 290 291 // Setup the vault client for token and secret renewals 292 if err := c.setupVaultClient(); err != nil { 293 return nil, fmt.Errorf("failed to setup vault client: %v", err) 294 } 295 296 // Restore the state 297 if err := c.restoreState(); err != nil { 298 logger.Printf("[ERR] client: failed to restore state: %v", err) 299 logger.Printf("[ERR] client: Nomad is unable to start due to corrupt state. 
"+ 300 "The safest way to proceed is to manually stop running task processes "+ 301 "and remove Nomad's state (%q) and alloc (%q) directories before "+ 302 "restarting. Lost allocations will be rescheduled.", 303 c.config.StateDir, c.config.AllocDir) 304 logger.Printf("[ERR] client: Corrupt state is often caused by a bug. Please " + 305 "report as much information as possible to " + 306 "https://github.com/hashicorp/nomad/issues") 307 return nil, fmt.Errorf("failed to restore state") 308 } 309 310 // Register and then start heartbeating to the servers. 311 go c.registerAndHeartbeat() 312 313 // Begin periodic snapshotting of state. 314 go c.periodicSnapshot() 315 316 // Begin syncing allocations to the server 317 go c.allocSync() 318 319 // Start the client! 320 go c.run() 321 322 // Start collecting stats 323 go c.emitStats() 324 325 c.logger.Printf("[INFO] client: Node ID %q", c.NodeID()) 326 return c, nil 327 } 328 329 // init is used to initialize the client and perform any setup 330 // needed before we begin starting its various components. 331 func (c *Client) init() error { 332 // Ensure the state dir exists if we have one 333 if c.config.StateDir != "" { 334 if err := os.MkdirAll(c.config.StateDir, 0700); err != nil { 335 return fmt.Errorf("failed creating state dir: %s", err) 336 } 337 338 } else { 339 // Othewise make a temp directory to use. 
340 p, err := ioutil.TempDir("", "NomadClient") 341 if err != nil { 342 return fmt.Errorf("failed creating temporary directory for the StateDir: %v", err) 343 } 344 345 p, err = filepath.EvalSymlinks(p) 346 if err != nil { 347 return fmt.Errorf("failed to find temporary directory for the StateDir: %v", err) 348 } 349 350 c.config.StateDir = p 351 } 352 c.logger.Printf("[INFO] client: using state directory %v", c.config.StateDir) 353 354 // Create or open the state database 355 db, err := bolt.Open(filepath.Join(c.config.StateDir, "state.db"), 0600, nil) 356 if err != nil { 357 return fmt.Errorf("failed to create state database: %v", err) 358 } 359 c.stateDB = db 360 361 // Ensure the alloc dir exists if we have one 362 if c.config.AllocDir != "" { 363 if err := os.MkdirAll(c.config.AllocDir, 0711); err != nil { 364 return fmt.Errorf("failed creating alloc dir: %s", err) 365 } 366 } else { 367 // Othewise make a temp directory to use. 368 p, err := ioutil.TempDir("", "NomadClient") 369 if err != nil { 370 return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err) 371 } 372 373 p, err = filepath.EvalSymlinks(p) 374 if err != nil { 375 return fmt.Errorf("failed to find temporary directory for the AllocDir: %v", err) 376 } 377 378 // Change the permissions to have the execute bit 379 if err := os.Chmod(p, 0711); err != nil { 380 return fmt.Errorf("failed to change directory permissions for the AllocDir: %v", err) 381 } 382 383 c.config.AllocDir = p 384 } 385 386 c.logger.Printf("[INFO] client: using alloc directory %v", c.config.AllocDir) 387 return nil 388 } 389 390 // reloadTLSConnections allows a client to reload its TLS configuration on the 391 // fly 392 func (c *Client) reloadTLSConnections(newConfig *nconfig.TLSConfig) error { 393 var tlsWrap tlsutil.RegionWrapper 394 if newConfig != nil && newConfig.EnableRPC { 395 tw, err := tlsutil.NewTLSConfiguration(newConfig).OutgoingTLSWrapper() 396 if err != nil { 397 return err 398 } 399 tlsWrap 
= tw 400 } 401 402 // Store the new tls wrapper. 403 c.tlsWrapLock.Lock() 404 c.tlsWrap = tlsWrap 405 c.tlsWrapLock.Unlock() 406 407 // Keep the client configuration up to date as we use configuration values to 408 // decide on what type of connections to accept 409 c.configLock.Lock() 410 c.config.TLSConfig = newConfig 411 c.configLock.Unlock() 412 413 c.connPool.ReloadTLS(tlsWrap) 414 415 return nil 416 } 417 418 // Reload allows a client to reload its configuration on the fly 419 func (c *Client) Reload(newConfig *config.Config) error { 420 return c.reloadTLSConnections(newConfig.TLSConfig) 421 } 422 423 // Leave is used to prepare the client to leave the cluster 424 func (c *Client) Leave() error { 425 // TODO 426 return nil 427 } 428 429 // GetConfig returns the config of the client 430 func (c *Client) GetConfig() *config.Config { 431 c.configLock.Lock() 432 defer c.configLock.Unlock() 433 return c.config 434 } 435 436 // Datacenter returns the datacenter for the given client 437 func (c *Client) Datacenter() string { 438 return c.config.Node.Datacenter 439 } 440 441 // Region returns the region for the given client 442 func (c *Client) Region() string { 443 return c.config.Region 444 } 445 446 // NodeID returns the node ID for the given client 447 func (c *Client) NodeID() string { 448 return c.config.Node.ID 449 } 450 451 // secretNodeID returns the secret node ID for the given client 452 func (c *Client) secretNodeID() string { 453 return c.config.Node.SecretID 454 } 455 456 // RPCMajorVersion returns the structs.ApiMajorVersion supported by the 457 // client. 458 func (c *Client) RPCMajorVersion() int { 459 return structs.ApiMajorVersion 460 } 461 462 // RPCMinorVersion returns the structs.ApiMinorVersion supported by the 463 // client. 
464 func (c *Client) RPCMinorVersion() int { 465 return structs.ApiMinorVersion 466 } 467 468 // Shutdown is used to tear down the client 469 func (c *Client) Shutdown() error { 470 c.logger.Printf("[INFO] client: shutting down") 471 c.shutdownLock.Lock() 472 defer c.shutdownLock.Unlock() 473 474 if c.shutdown { 475 return nil 476 } 477 478 // Defer closing the database 479 defer func() { 480 if err := c.stateDB.Close(); err != nil { 481 c.logger.Printf("[ERR] client: failed to close state database on shutdown: %v", err) 482 } 483 }() 484 485 // Stop renewing tokens and secrets 486 if c.vaultClient != nil { 487 c.vaultClient.Stop() 488 } 489 490 // Stop Garbage collector 491 c.garbageCollector.Stop() 492 493 // Destroy all the running allocations. 494 if c.config.DevMode { 495 for _, ar := range c.getAllocRunners() { 496 ar.Destroy() 497 <-ar.WaitCh() 498 } 499 } 500 501 c.shutdown = true 502 close(c.shutdownCh) 503 c.connPool.Shutdown() 504 return c.saveState() 505 } 506 507 // Stats is used to return statistics for debugging and insight 508 // for various sub-systems 509 func (c *Client) Stats() map[string]map[string]string { 510 c.heartbeatLock.Lock() 511 defer c.heartbeatLock.Unlock() 512 stats := map[string]map[string]string{ 513 "client": { 514 "node_id": c.NodeID(), 515 "known_servers": strings.Join(c.GetServers(), ","), 516 "num_allocations": strconv.Itoa(c.NumAllocs()), 517 "last_heartbeat": fmt.Sprintf("%v", time.Since(c.lastHeartbeat)), 518 "heartbeat_ttl": fmt.Sprintf("%v", c.heartbeatTTL), 519 }, 520 "runtime": hstats.RuntimeStats(), 521 } 522 return stats 523 } 524 525 // CollectAllocation garbage collects a single allocation on a node. Returns 526 // true if alloc was found and garbage collected; otherwise false. 
527 func (c *Client) CollectAllocation(allocID string) bool { 528 return c.garbageCollector.Collect(allocID) 529 } 530 531 // CollectAllAllocs garbage collects all allocations on a node in the terminal 532 // state 533 func (c *Client) CollectAllAllocs() { 534 c.garbageCollector.CollectAll() 535 } 536 537 // Node returns the locally registered node 538 func (c *Client) Node() *structs.Node { 539 c.configLock.RLock() 540 defer c.configLock.RUnlock() 541 return c.configCopy.Node 542 } 543 544 // StatsReporter exposes the various APIs related resource usage of a Nomad 545 // client 546 func (c *Client) StatsReporter() ClientStatsReporter { 547 return c 548 } 549 550 func (c *Client) GetAllocStats(allocID string) (AllocStatsReporter, error) { 551 c.allocLock.RLock() 552 defer c.allocLock.RUnlock() 553 ar, ok := c.allocs[allocID] 554 if !ok { 555 return nil, structs.NewErrUnknownAllocation(allocID) 556 } 557 return ar.StatsReporter(), nil 558 } 559 560 // HostStats returns all the stats related to a Nomad client 561 func (c *Client) LatestHostStats() *stats.HostStats { 562 return c.hostStatsCollector.Stats() 563 } 564 565 // ValidateMigrateToken verifies that a token is for a specific client and 566 // allocation, and has been created by a trusted party that has privileged 567 // knowledge of the client's secret identifier 568 func (c *Client) ValidateMigrateToken(allocID, migrateToken string) bool { 569 if !c.config.ACLEnabled { 570 return true 571 } 572 573 return structs.CompareMigrateToken(allocID, c.secretNodeID(), migrateToken) 574 } 575 576 // GetAllocFS returns the AllocFS interface for the alloc dir of an allocation 577 func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) { 578 c.allocLock.RLock() 579 defer c.allocLock.RUnlock() 580 581 ar, ok := c.allocs[allocID] 582 if !ok { 583 return nil, structs.NewErrUnknownAllocation(allocID) 584 } 585 return ar.GetAllocDir(), nil 586 } 587 588 // GetClientAlloc returns the allocation from the client 
589 func (c *Client) GetClientAlloc(allocID string) (*structs.Allocation, error) { 590 all := c.allAllocs() 591 alloc, ok := all[allocID] 592 if !ok { 593 return nil, structs.NewErrUnknownAllocation(allocID) 594 } 595 return alloc, nil 596 } 597 598 // GetServers returns the list of nomad servers this client is aware of. 599 func (c *Client) GetServers() []string { 600 endpoints := c.servers.GetServers() 601 res := make([]string, len(endpoints)) 602 for i := range endpoints { 603 res[i] = endpoints[i].String() 604 } 605 sort.Strings(res) 606 return res 607 } 608 609 // SetServers sets a new list of nomad servers to connect to. As long as one 610 // server is resolvable no error is returned. 611 func (c *Client) SetServers(in []string) error { 612 return c.setServersImpl(in, false) 613 } 614 615 // setServersImpl sets a new list of nomad servers to connect to. If force is 616 // set, we add the server to the internal severlist even if the server could not 617 // be pinged. An error is returned if no endpoints were valid when non-forcing. 618 // 619 // Force should be used when setting the servers from the initial configuration 620 // since the server may be starting up in parallel and initial pings may fail. 
621 func (c *Client) setServersImpl(in []string, force bool) error { 622 var mu sync.Mutex 623 var wg sync.WaitGroup 624 var merr multierror.Error 625 626 endpoints := make([]*servers.Server, 0, len(in)) 627 wg.Add(len(in)) 628 629 for _, s := range in { 630 go func(srv string) { 631 defer wg.Done() 632 addr, err := resolveServer(srv) 633 if err != nil { 634 c.logger.Printf("[DEBUG] client: ignoring server %s due to resolution error: %v", srv, err) 635 merr.Errors = append(merr.Errors, err) 636 return 637 } 638 639 // Try to ping to check if it is a real server 640 if err := c.Ping(addr); err != nil { 641 merr.Errors = append(merr.Errors, fmt.Errorf("Server at address %s failed ping: %v", addr, err)) 642 643 // If we are forcing the setting of the servers, inject it to 644 // the serverlist even if we can't ping immediately. 645 if !force { 646 return 647 } 648 } 649 650 mu.Lock() 651 endpoints = append(endpoints, &servers.Server{Addr: addr}) 652 mu.Unlock() 653 }(s) 654 } 655 656 wg.Wait() 657 658 // Only return errors if no servers are valid 659 if len(endpoints) == 0 { 660 if len(merr.Errors) > 0 { 661 return merr.ErrorOrNil() 662 } 663 return noServersErr 664 } 665 666 c.servers.SetServers(endpoints) 667 return nil 668 } 669 670 // restoreState is used to restore our state from the data dir 671 func (c *Client) restoreState() error { 672 if c.config.DevMode { 673 return nil 674 } 675 676 // COMPAT: Remove in 0.7.0 677 // 0.6.0 transistioned from individual state files to a single bolt-db. 
678 // The upgrade path is to: 679 // Check if old state exists 680 // If so, restore from that and delete old state 681 // Restore using state database 682 683 // Allocs holds the IDs of the allocations being restored 684 var allocs []string 685 686 // Upgrading tracks whether this is a pre 0.6.0 upgrade path 687 var upgrading bool 688 689 // Scan the directory 690 allocDir := filepath.Join(c.config.StateDir, "alloc") 691 list, err := ioutil.ReadDir(allocDir) 692 if err != nil && !os.IsNotExist(err) { 693 return fmt.Errorf("failed to list alloc state: %v", err) 694 } else if err == nil && len(list) != 0 { 695 upgrading = true 696 for _, entry := range list { 697 allocs = append(allocs, entry.Name()) 698 } 699 } else { 700 // Normal path 701 err := c.stateDB.View(func(tx *bolt.Tx) error { 702 allocs, err = getAllAllocationIDs(tx) 703 if err != nil { 704 return fmt.Errorf("failed to list allocations: %v", err) 705 } 706 return nil 707 }) 708 if err != nil { 709 return err 710 } 711 } 712 713 // Load each alloc back 714 var mErr multierror.Error 715 for _, id := range allocs { 716 alloc := &structs.Allocation{ID: id} 717 718 // don't worry about blocking/migrating when restoring 719 watcher := noopPrevAlloc{} 720 721 c.configLock.RLock() 722 ar := NewAllocRunner(c.logger, c.configCopy, c.stateDB, c.updateAllocStatus, alloc, c.vaultClient, c.consulService, watcher) 723 c.configLock.RUnlock() 724 725 c.allocLock.Lock() 726 c.allocs[id] = ar 727 c.allocLock.Unlock() 728 729 if err := ar.RestoreState(); err != nil { 730 c.logger.Printf("[ERR] client: failed to restore state for alloc %q: %v", id, err) 731 mErr.Errors = append(mErr.Errors, err) 732 } else { 733 go ar.Run() 734 735 if upgrading { 736 if err := ar.SaveState(); err != nil { 737 c.logger.Printf("[WARN] client: initial save state for alloc %q failed: %v", id, err) 738 } 739 } 740 } 741 } 742 743 // Delete all the entries 744 if upgrading { 745 if err := os.RemoveAll(allocDir); err != nil { 746 mErr.Errors = 
append(mErr.Errors, err) 747 } 748 } 749 750 return mErr.ErrorOrNil() 751 } 752 753 // saveState is used to snapshot our state into the data dir. 754 func (c *Client) saveState() error { 755 if c.config.DevMode { 756 return nil 757 } 758 759 var wg sync.WaitGroup 760 var l sync.Mutex 761 var mErr multierror.Error 762 runners := c.getAllocRunners() 763 wg.Add(len(runners)) 764 765 for id, ar := range runners { 766 go func(id string, ar *AllocRunner) { 767 err := ar.SaveState() 768 if err != nil { 769 c.logger.Printf("[ERR] client: failed to save state for alloc %q: %v", id, err) 770 l.Lock() 771 multierror.Append(&mErr, err) 772 l.Unlock() 773 } 774 wg.Done() 775 }(id, ar) 776 } 777 778 wg.Wait() 779 return mErr.ErrorOrNil() 780 } 781 782 // getAllocRunners returns a snapshot of the current set of alloc runners. 783 func (c *Client) getAllocRunners() map[string]*AllocRunner { 784 c.allocLock.RLock() 785 defer c.allocLock.RUnlock() 786 runners := make(map[string]*AllocRunner, len(c.allocs)) 787 for id, ar := range c.allocs { 788 runners[id] = ar 789 } 790 return runners 791 } 792 793 // NumAllocs returns the number of un-GC'd allocs this client has. Used to 794 // fulfill the AllocCounter interface for the GC. 795 func (c *Client) NumAllocs() int { 796 n := 0 797 c.allocLock.RLock() 798 for _, a := range c.allocs { 799 if !a.IsDestroyed() { 800 n++ 801 } 802 } 803 c.allocLock.RUnlock() 804 return n 805 } 806 807 // nodeID restores, or generates if necessary, a unique node ID and SecretID. 808 // The node ID is, if available, a persistent unique ID. The secret ID is a 809 // high-entropy random UUID. 810 func (c *Client) nodeID() (id, secret string, err error) { 811 var hostID string 812 hostInfo, err := host.Info() 813 if !c.config.NoHostUUID && err == nil { 814 if hashed, ok := helper.HashUUID(hostInfo.HostID); ok { 815 hostID = hashed 816 } 817 } 818 819 if hostID == "" { 820 // Generate a random hostID if no constant ID is available on 821 // this platform. 
822 hostID = uuid.Generate() 823 } 824 825 // Do not persist in dev mode 826 if c.config.DevMode { 827 return hostID, uuid.Generate(), nil 828 } 829 830 // Attempt to read existing ID 831 idPath := filepath.Join(c.config.StateDir, "client-id") 832 idBuf, err := ioutil.ReadFile(idPath) 833 if err != nil && !os.IsNotExist(err) { 834 return "", "", err 835 } 836 837 // Attempt to read existing secret ID 838 secretPath := filepath.Join(c.config.StateDir, "secret-id") 839 secretBuf, err := ioutil.ReadFile(secretPath) 840 if err != nil && !os.IsNotExist(err) { 841 return "", "", err 842 } 843 844 // Use existing ID if any 845 if len(idBuf) != 0 { 846 id = strings.ToLower(string(idBuf)) 847 } else { 848 id = hostID 849 850 // Persist the ID 851 if err := ioutil.WriteFile(idPath, []byte(id), 0700); err != nil { 852 return "", "", err 853 } 854 } 855 856 if len(secretBuf) != 0 { 857 secret = string(secretBuf) 858 } else { 859 // Generate new ID 860 secret = uuid.Generate() 861 862 // Persist the ID 863 if err := ioutil.WriteFile(secretPath, []byte(secret), 0700); err != nil { 864 return "", "", err 865 } 866 } 867 868 return id, secret, nil 869 } 870 871 // setupNode is used to setup the initial node 872 func (c *Client) setupNode() error { 873 node := c.config.Node 874 if node == nil { 875 node = &structs.Node{} 876 c.config.Node = node 877 } 878 // Generate an ID and secret for the node 879 id, secretID, err := c.nodeID() 880 if err != nil { 881 return fmt.Errorf("node ID setup failed: %v", err) 882 } 883 884 node.ID = id 885 node.SecretID = secretID 886 if node.Attributes == nil { 887 node.Attributes = make(map[string]string) 888 } 889 if node.Links == nil { 890 node.Links = make(map[string]string) 891 } 892 if node.Meta == nil { 893 node.Meta = make(map[string]string) 894 } 895 if node.Resources == nil { 896 node.Resources = &structs.Resources{} 897 } 898 if node.Reserved == nil { 899 node.Reserved = &structs.Resources{} 900 } 901 if node.Datacenter == "" { 902 
node.Datacenter = "dc1" 903 } 904 if node.Name == "" { 905 node.Name, _ = os.Hostname() 906 } 907 if node.Name == "" { 908 node.Name = node.ID 909 } 910 node.Status = structs.NodeStatusInit 911 return nil 912 } 913 914 // reservePorts is used to reserve ports on the fingerprinted network devices. 915 func (c *Client) reservePorts() { 916 c.configLock.RLock() 917 defer c.configLock.RUnlock() 918 global := c.config.GloballyReservedPorts 919 if len(global) == 0 { 920 return 921 } 922 923 node := c.config.Node 924 networks := node.Resources.Networks 925 reservedIndex := make(map[string]*structs.NetworkResource, len(networks)) 926 for _, resNet := range node.Reserved.Networks { 927 reservedIndex[resNet.IP] = resNet 928 } 929 930 // Go through each network device and reserve ports on it. 931 for _, net := range networks { 932 res, ok := reservedIndex[net.IP] 933 if !ok { 934 res = net.Copy() 935 res.MBits = 0 936 reservedIndex[net.IP] = res 937 } 938 939 for _, portVal := range global { 940 p := structs.Port{Value: portVal} 941 res.ReservedPorts = append(res.ReservedPorts, p) 942 } 943 } 944 945 // Clear the reserved networks. 
946 if node.Reserved == nil { 947 node.Reserved = new(structs.Resources) 948 } else { 949 node.Reserved.Networks = nil 950 } 951 952 // Restore the reserved networks 953 for _, net := range reservedIndex { 954 node.Reserved.Networks = append(node.Reserved.Networks, net) 955 } 956 } 957 958 // updateNodeFromFingerprint updates the node with the result of 959 // fingerprinting the node from the diff that was created 960 func (c *Client) updateNodeFromFingerprint(response *cstructs.FingerprintResponse) *structs.Node { 961 c.configLock.Lock() 962 defer c.configLock.Unlock() 963 964 nodeHasChanged := false 965 966 for name, newVal := range response.Attributes { 967 oldVal := c.config.Node.Attributes[name] 968 if oldVal == newVal { 969 continue 970 } 971 972 nodeHasChanged = true 973 if newVal == "" { 974 delete(c.config.Node.Attributes, name) 975 } else { 976 c.config.Node.Attributes[name] = newVal 977 } 978 } 979 980 // update node links and resources from the diff created from 981 // fingerprinting 982 for name, newVal := range response.Links { 983 oldVal := c.config.Node.Links[name] 984 if oldVal == newVal { 985 continue 986 } 987 988 nodeHasChanged = true 989 if newVal == "" { 990 delete(c.config.Node.Links, name) 991 } else { 992 c.config.Node.Links[name] = newVal 993 } 994 } 995 996 if response.Resources != nil && !resourcesAreEqual(c.config.Node.Resources, response.Resources) { 997 nodeHasChanged = true 998 c.config.Node.Resources.Merge(response.Resources) 999 } 1000 1001 if nodeHasChanged { 1002 c.updateNode() 1003 } 1004 return c.config.Node 1005 } 1006 1007 // resourcesAreEqual is a temporary function to compare whether resources are 1008 // equal. We can use this until we change fingerprinters to set pointers on a 1009 // return type. 
func resourcesAreEqual(first, second *structs.Resources) bool {
	if first.CPU != second.CPU {
		return false
	}
	if first.MemoryMB != second.MemoryMB {
		return false
	}
	if first.DiskMB != second.DiskMB {
		return false
	}
	if first.IOPS != second.IOPS {
		return false
	}
	if len(first.Networks) != len(second.Networks) {
		return false
	}
	// Networks are compared pairwise by position.
	for i, e := range first.Networks {
		// The lengths were found equal above, so this guard never fires.
		if len(second.Networks) < i {
			return false
		}
		f := second.Networks[i]
		if !e.Equals(f) {
			return false
		}
	}
	return true
}

// retryIntv calculates a retry interval value given the base. In dev mode
// the fixed devModeRetryIntv is returned instead; otherwise the result is
// base plus a random stagger derived from base.
func (c *Client) retryIntv(base time.Duration) time.Duration {
	if c.config.DevMode {
		return devModeRetryIntv
	}
	return base + lib.RandomStagger(base)
}

// registerAndHeartbeat is a long lived goroutine used to register the client
// and then start heartbeating to the server. It returns once shutdownCh is
// closed.
func (c *Client) registerAndHeartbeat() {
	// Register the node
	c.retryRegisterNode()

	// Start watching changes for node changes
	go c.watchNodeUpdates()

	// Setup the heartbeat timer, for the initial registration
	// we want to do this quickly. We want to do it extra quickly
	// in development mode.
	var heartbeat <-chan time.Time
	if c.config.DevMode {
		heartbeat = time.After(0)
	} else {
		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
	}

	for {
		// Wake up on newly discovered servers or when the heartbeat timer
		// fires; both fall through to the status update below.
		select {
		case <-c.serversDiscoveredCh:
		case <-heartbeat:
		case <-c.shutdownCh:
			return
		}

		if err := c.updateNodeStatus(); err != nil {
			// The servers have changed such that this node has not been
			// registered before
			if strings.Contains(err.Error(), "node not found") {
				// Re-register the node
				c.logger.Printf("[INFO] client: re-registering node")
				c.retryRegisterNode()
				heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
			} else {
				intv := c.retryIntv(registerRetryIntv)
				c.logger.Printf("[ERR] client: heartbeating failed. Retrying in %v: %v", intv, err)
				heartbeat = time.After(intv)

				// if heartbeating fails, trigger Consul discovery
				c.triggerDiscovery()
			}
		} else {
			// Success: schedule the next heartbeat after the current TTL,
			// which is read under heartbeatLock.
			c.heartbeatLock.Lock()
			heartbeat = time.After(c.heartbeatTTL)
			c.heartbeatLock.Unlock()
		}
	}
}

// periodicSnapshot is a long lived goroutine used to periodically snapshot the
// state of the client. It saves state every stateSnapshotIntv until
// shutdownCh is closed.
func (c *Client) periodicSnapshot() {
	// Create a snapshot timer
	snapshot := time.After(stateSnapshotIntv)

	for {
		select {
		case <-snapshot:
			// Re-arm the timer before saving.
			snapshot = time.After(stateSnapshotIntv)
			if err := c.saveState(); err != nil {
				c.logger.Printf("[ERR] client: failed to save state: %v", err)
			}

		case <-c.shutdownCh:
			return
		}
	}
}

// run is a long lived goroutine used to run the client. It starts
// watchAllocations and passes each update it delivers to runAllocs until
// shutdownCh is closed.
func (c *Client) run() {
	// Watch for changes in allocations
	allocUpdates := make(chan *allocUpdates, 8)
	go c.watchAllocations(allocUpdates)

	for {
		select {
		case update := <-allocUpdates:
			c.runAllocs(update)

		case <-c.shutdownCh:
			return
		}
	}
}

// retryRegisterNode is used to register the node or update the registration and
// retry in case of failure. It returns once registration succeeds or
// shutdownCh is closed.
func (c *Client) retryRegisterNode() {
	for {
		err := c.registerNode()
		if err == nil {
			// Registered!
			return
		}

		if err == noServersErr {
			c.logger.Print("[DEBUG] client: registration waiting on servers")
			c.triggerDiscovery()
		} else {
			c.logger.Printf("[ERR] client: registration failure: %v", err)
		}
		// Retry when servers are discovered or the retry interval elapses,
		// whichever comes first.
		select {
		case <-c.serversDiscoveredCh:
		case <-time.After(c.retryIntv(registerRetryIntv)):
		case <-c.shutdownCh:
			return
		}
	}
}

// registerNode is used to register the node or update the registration
func (c *Client) registerNode() error {
	node := c.Node()
	req := structs.NodeRegisterRequest{
		Node:         node,
		WriteRequest: structs.WriteRequest{Region: c.Region()},
	}
	var resp structs.NodeUpdateResponse
	if err := c.RPC("Node.Register", &req, &resp); err != nil {
		return err
	}

	// Update the node status to ready after we register.
1172 c.configLock.Lock() 1173 node.Status = structs.NodeStatusReady 1174 c.configLock.Unlock() 1175 1176 c.logger.Printf("[INFO] client: node registration complete") 1177 if len(resp.EvalIDs) != 0 { 1178 c.logger.Printf("[DEBUG] client: %d evaluations triggered by node registration", len(resp.EvalIDs)) 1179 } 1180 1181 c.heartbeatLock.Lock() 1182 defer c.heartbeatLock.Unlock() 1183 c.lastHeartbeat = time.Now() 1184 c.heartbeatTTL = resp.HeartbeatTTL 1185 return nil 1186 } 1187 1188 // updateNodeStatus is used to heartbeat and update the status of the node 1189 func (c *Client) updateNodeStatus() error { 1190 start := time.Now() 1191 req := structs.NodeUpdateStatusRequest{ 1192 NodeID: c.NodeID(), 1193 Status: structs.NodeStatusReady, 1194 WriteRequest: structs.WriteRequest{Region: c.Region()}, 1195 } 1196 var resp structs.NodeUpdateResponse 1197 if err := c.RPC("Node.UpdateStatus", &req, &resp); err != nil { 1198 c.triggerDiscovery() 1199 return fmt.Errorf("failed to update status: %v", err) 1200 } 1201 end := time.Now() 1202 1203 if len(resp.EvalIDs) != 0 { 1204 c.logger.Printf("[DEBUG] client: %d evaluations triggered by node update", len(resp.EvalIDs)) 1205 } 1206 1207 // Update the last heartbeat and the new TTL, capturing the old values 1208 c.heartbeatLock.Lock() 1209 last := c.lastHeartbeat 1210 oldTTL := c.heartbeatTTL 1211 haveHeartbeated := c.haveHeartbeated 1212 c.lastHeartbeat = time.Now() 1213 c.heartbeatTTL = resp.HeartbeatTTL 1214 c.haveHeartbeated = true 1215 c.heartbeatLock.Unlock() 1216 c.logger.Printf("[TRACE] client: next heartbeat in %v", resp.HeartbeatTTL) 1217 1218 if resp.Index != 0 { 1219 c.logger.Printf("[DEBUG] client: state updated to %s", req.Status) 1220 1221 // We have potentially missed our TTL log how delayed we were 1222 if haveHeartbeated { 1223 c.logger.Printf("[WARN] client: heartbeat missed (request took %v). 
Heartbeat TTL was %v and heartbeated after %v", 1224 end.Sub(start), oldTTL, time.Since(last)) 1225 } 1226 } 1227 1228 // Update the number of nodes in the cluster so we can adjust our server 1229 // rebalance rate. 1230 c.servers.SetNumNodes(resp.NumNodes) 1231 1232 // Convert []*NodeServerInfo to []*servers.Server 1233 nomadServers := make([]*servers.Server, 0, len(resp.Servers)) 1234 for _, s := range resp.Servers { 1235 addr, err := resolveServer(s.RPCAdvertiseAddr) 1236 if err != nil { 1237 c.logger.Printf("[WARN] client: ignoring invalid server %q: %v", s.RPCAdvertiseAddr, err) 1238 continue 1239 } 1240 e := &servers.Server{DC: s.Datacenter, Addr: addr} 1241 nomadServers = append(nomadServers, e) 1242 } 1243 if len(nomadServers) == 0 { 1244 return fmt.Errorf("heartbeat response returned no valid servers") 1245 } 1246 c.servers.SetServers(nomadServers) 1247 1248 // Begin polling Consul if there is no Nomad leader. We could be 1249 // heartbeating to a Nomad server that is in the minority of a 1250 // partition of the Nomad server quorum, but this Nomad Agent still 1251 // has connectivity to the existing majority of Nomad Servers, but 1252 // only if it queries Consul. 1253 if resp.LeaderRPCAddr == "" { 1254 c.triggerDiscovery() 1255 } 1256 1257 return nil 1258 } 1259 1260 // updateAllocStatus is used to update the status of an allocation 1261 func (c *Client) updateAllocStatus(alloc *structs.Allocation) { 1262 if alloc.Terminated() { 1263 // Terminated, mark for GC if we're still tracking this alloc 1264 // runner. If it's not being tracked that means the server has 1265 // already GC'd it (see removeAlloc). 1266 c.allocLock.RLock() 1267 ar, ok := c.allocs[alloc.ID] 1268 c.allocLock.RUnlock() 1269 1270 if ok { 1271 c.garbageCollector.MarkForCollection(ar) 1272 1273 // Trigger a GC in case we're over thresholds and just 1274 // waiting for eligible allocs. 
1275 c.garbageCollector.Trigger() 1276 } 1277 } 1278 1279 // Strip all the information that can be reconstructed at the server. Only 1280 // send the fields that are updatable by the client. 1281 stripped := new(structs.Allocation) 1282 stripped.ID = alloc.ID 1283 stripped.NodeID = c.NodeID() 1284 stripped.TaskStates = alloc.TaskStates 1285 stripped.ClientStatus = alloc.ClientStatus 1286 stripped.ClientDescription = alloc.ClientDescription 1287 stripped.DeploymentStatus = alloc.DeploymentStatus 1288 1289 select { 1290 case c.allocUpdates <- stripped: 1291 case <-c.shutdownCh: 1292 } 1293 } 1294 1295 // allocSync is a long lived function that batches allocation updates to the 1296 // server. 1297 func (c *Client) allocSync() { 1298 staggered := false 1299 syncTicker := time.NewTicker(allocSyncIntv) 1300 updates := make(map[string]*structs.Allocation) 1301 for { 1302 select { 1303 case <-c.shutdownCh: 1304 syncTicker.Stop() 1305 return 1306 case alloc := <-c.allocUpdates: 1307 // Batch the allocation updates until the timer triggers. 1308 updates[alloc.ID] = alloc 1309 case <-syncTicker.C: 1310 // Fast path if there are no updates 1311 if len(updates) == 0 { 1312 continue 1313 } 1314 1315 sync := make([]*structs.Allocation, 0, len(updates)) 1316 for _, alloc := range updates { 1317 sync = append(sync, alloc) 1318 } 1319 1320 // Send to server. 
1321 args := structs.AllocUpdateRequest{ 1322 Alloc: sync, 1323 WriteRequest: structs.WriteRequest{Region: c.Region()}, 1324 } 1325 1326 var resp structs.GenericResponse 1327 if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil { 1328 c.logger.Printf("[ERR] client: failed to update allocations: %v", err) 1329 syncTicker.Stop() 1330 syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv)) 1331 staggered = true 1332 } else { 1333 updates = make(map[string]*structs.Allocation) 1334 if staggered { 1335 syncTicker.Stop() 1336 syncTicker = time.NewTicker(allocSyncIntv) 1337 staggered = false 1338 } 1339 } 1340 } 1341 } 1342 } 1343 1344 // allocUpdates holds the results of receiving updated allocations from the 1345 // servers. 1346 type allocUpdates struct { 1347 // pulled is the set of allocations that were downloaded from the servers. 1348 pulled map[string]*structs.Allocation 1349 1350 // filtered is the set of allocations that were not pulled because their 1351 // AllocModifyIndex didn't change. 1352 filtered map[string]struct{} 1353 1354 // migrateTokens are a list of tokens necessary for when clients pull data 1355 // from authorized volumes 1356 migrateTokens map[string]string 1357 } 1358 1359 // watchAllocations is used to scan for updates to allocations 1360 func (c *Client) watchAllocations(updates chan *allocUpdates) { 1361 // The request and response for getting the map of allocations that should 1362 // be running on the Node to their AllocModifyIndex which is incremented 1363 // when the allocation is updated by the servers. 1364 req := structs.NodeSpecificRequest{ 1365 NodeID: c.NodeID(), 1366 SecretID: c.secretNodeID(), 1367 QueryOptions: structs.QueryOptions{ 1368 Region: c.Region(), 1369 AllowStale: true, 1370 }, 1371 } 1372 var resp structs.NodeClientAllocsResponse 1373 1374 // The request and response for pulling down the set of allocations that are 1375 // new, or updated server side. 
1376 allocsReq := structs.AllocsGetRequest{ 1377 QueryOptions: structs.QueryOptions{ 1378 Region: c.Region(), 1379 AllowStale: true, 1380 }, 1381 } 1382 var allocsResp structs.AllocsGetResponse 1383 1384 OUTER: 1385 for { 1386 // Get the allocation modify index map, blocking for updates. We will 1387 // use this to determine exactly what allocations need to be downloaded 1388 // in full. 1389 resp = structs.NodeClientAllocsResponse{} 1390 err := c.RPC("Node.GetClientAllocs", &req, &resp) 1391 if err != nil { 1392 // Shutdown often causes EOF errors, so check for shutdown first 1393 select { 1394 case <-c.shutdownCh: 1395 return 1396 default: 1397 } 1398 1399 // COMPAT: Remove in 0.6. This is to allow the case in which the 1400 // servers are not fully upgraded before the clients register. This 1401 // can cause the SecretID to be lost 1402 if strings.Contains(err.Error(), "node secret ID does not match") { 1403 c.logger.Printf("[DEBUG] client: re-registering node as there was a secret ID mismatch: %v", err) 1404 c.retryRegisterNode() 1405 } else if err != noServersErr { 1406 c.logger.Printf("[ERR] client: failed to query for node allocations: %v", err) 1407 } 1408 retry := c.retryIntv(getAllocRetryIntv) 1409 select { 1410 case <-c.serversDiscoveredCh: 1411 continue 1412 case <-time.After(retry): 1413 continue 1414 case <-c.shutdownCh: 1415 return 1416 } 1417 } 1418 1419 // Check for shutdown 1420 select { 1421 case <-c.shutdownCh: 1422 return 1423 default: 1424 } 1425 1426 // Filter all allocations whose AllocModifyIndex was not incremented. 1427 // These are the allocations who have either not been updated, or whose 1428 // updates are a result of the client sending an update for the alloc. 1429 // This lets us reduce the network traffic to the server as we don't 1430 // need to pull all the allocations. 
1431 var pull []string 1432 filtered := make(map[string]struct{}) 1433 runners := c.getAllocRunners() 1434 var pullIndex uint64 1435 for allocID, modifyIndex := range resp.Allocs { 1436 // Pull the allocation if we don't have an alloc runner for the 1437 // allocation or if the alloc runner requires an updated allocation. 1438 runner, ok := runners[allocID] 1439 1440 if !ok || runner.shouldUpdate(modifyIndex) { 1441 // Only pull allocs that are required. Filtered 1442 // allocs might be at a higher index, so ignore 1443 // it. 1444 if modifyIndex > pullIndex { 1445 pullIndex = modifyIndex 1446 } 1447 pull = append(pull, allocID) 1448 } else { 1449 filtered[allocID] = struct{}{} 1450 } 1451 } 1452 1453 // Pull the allocations that passed filtering. 1454 allocsResp.Allocs = nil 1455 var pulledAllocs map[string]*structs.Allocation 1456 if len(pull) != 0 { 1457 // Pull the allocations that need to be updated. 1458 allocsReq.AllocIDs = pull 1459 allocsReq.MinQueryIndex = pullIndex - 1 1460 allocsResp = structs.AllocsGetResponse{} 1461 if err := c.RPC("Alloc.GetAllocs", &allocsReq, &allocsResp); err != nil { 1462 c.logger.Printf("[ERR] client: failed to query updated allocations: %v", err) 1463 retry := c.retryIntv(getAllocRetryIntv) 1464 select { 1465 case <-c.serversDiscoveredCh: 1466 continue 1467 case <-time.After(retry): 1468 continue 1469 case <-c.shutdownCh: 1470 return 1471 } 1472 } 1473 1474 // Ensure that we received all the allocations we wanted 1475 pulledAllocs = make(map[string]*structs.Allocation, len(allocsResp.Allocs)) 1476 for _, alloc := range allocsResp.Allocs { 1477 pulledAllocs[alloc.ID] = alloc 1478 } 1479 1480 for _, desiredID := range pull { 1481 if _, ok := pulledAllocs[desiredID]; !ok { 1482 // We didn't get everything we wanted. Do not update the 1483 // MinQueryIndex, sleep and then retry. 
1484 wait := c.retryIntv(2 * time.Second) 1485 select { 1486 case <-time.After(wait): 1487 // Wait for the server we contact to receive the 1488 // allocations 1489 continue OUTER 1490 case <-c.shutdownCh: 1491 return 1492 } 1493 } 1494 } 1495 1496 // Check for shutdown 1497 select { 1498 case <-c.shutdownCh: 1499 return 1500 default: 1501 } 1502 } 1503 1504 c.logger.Printf("[DEBUG] client: updated allocations at index %d (total %d) (pulled %d) (filtered %d)", 1505 resp.Index, len(resp.Allocs), len(allocsResp.Allocs), len(filtered)) 1506 1507 // Update the query index. 1508 if resp.Index > req.MinQueryIndex { 1509 req.MinQueryIndex = resp.Index 1510 } 1511 1512 // Push the updates. 1513 update := &allocUpdates{ 1514 filtered: filtered, 1515 pulled: pulledAllocs, 1516 migrateTokens: resp.MigrateTokens, 1517 } 1518 select { 1519 case updates <- update: 1520 case <-c.shutdownCh: 1521 return 1522 } 1523 } 1524 } 1525 1526 // updateNode triggers a client to update its node copy if it isn't doing 1527 // so already 1528 func (c *Client) updateNode() { 1529 select { 1530 case c.triggerNodeUpdate <- struct{}{}: 1531 // Node update goroutine was released to execute 1532 default: 1533 // Node update goroutine was already running 1534 } 1535 } 1536 1537 // watchNodeUpdates blocks until it is edge triggered. Once triggered, 1538 // it will update the client node copy and re-register the node. 1539 func (c *Client) watchNodeUpdates() { 1540 var hasChanged bool 1541 timer := time.NewTimer(c.retryIntv(nodeUpdateRetryIntv)) 1542 defer timer.Stop() 1543 1544 for { 1545 select { 1546 case <-timer.C: 1547 c.logger.Printf("[DEBUG] client: state changed, updating node and re-registering.") 1548 1549 // Update the config copy. 
1550 c.configLock.Lock() 1551 node := c.config.Node.Copy() 1552 c.configCopy.Node = node 1553 c.configLock.Unlock() 1554 1555 c.retryRegisterNode() 1556 1557 hasChanged = false 1558 case <-c.triggerNodeUpdate: 1559 if hasChanged { 1560 continue 1561 } 1562 hasChanged = true 1563 timer.Reset(c.retryIntv(nodeUpdateRetryIntv)) 1564 case <-c.shutdownCh: 1565 return 1566 } 1567 } 1568 } 1569 1570 // runAllocs is invoked when we get an updated set of allocations 1571 func (c *Client) runAllocs(update *allocUpdates) { 1572 // Get the existing allocs 1573 c.allocLock.RLock() 1574 exist := make([]*structs.Allocation, 0, len(c.allocs)) 1575 for _, ar := range c.allocs { 1576 exist = append(exist, ar.alloc) 1577 } 1578 c.allocLock.RUnlock() 1579 1580 // Diff the existing and updated allocations 1581 diff := diffAllocs(exist, update) 1582 c.logger.Printf("[DEBUG] client: %#v", diff) 1583 1584 // Remove the old allocations 1585 for _, remove := range diff.removed { 1586 c.removeAlloc(remove) 1587 } 1588 1589 // Update the existing allocations 1590 for _, update := range diff.updated { 1591 if err := c.updateAlloc(update.exist, update.updated); err != nil { 1592 c.logger.Printf("[ERR] client: failed to update alloc %q: %v", 1593 update.exist.ID, err) 1594 } 1595 } 1596 1597 // Make room for new allocations before running 1598 if err := c.garbageCollector.MakeRoomFor(diff.added); err != nil { 1599 c.logger.Printf("[ERR] client: error making room for new allocations: %v", err) 1600 } 1601 1602 // Start the new allocations 1603 for _, add := range diff.added { 1604 migrateToken := update.migrateTokens[add.ID] 1605 if err := c.addAlloc(add, migrateToken); err != nil { 1606 c.logger.Printf("[ERR] client: failed to add alloc '%s': %v", 1607 add.ID, err) 1608 } 1609 } 1610 1611 // Trigger the GC once more now that new allocs are started that could 1612 // have caused thesholds to be exceeded 1613 c.garbageCollector.Trigger() 1614 } 1615 1616 // removeAlloc is invoked when we should 
remove an allocation because it has 1617 // been removed by the server. 1618 func (c *Client) removeAlloc(alloc *structs.Allocation) { 1619 c.allocLock.Lock() 1620 ar, ok := c.allocs[alloc.ID] 1621 if !ok { 1622 c.allocLock.Unlock() 1623 c.logger.Printf("[WARN] client: missing context for alloc '%s'", alloc.ID) 1624 return 1625 } 1626 1627 // Stop tracking alloc runner as it's been GC'd by the server 1628 delete(c.allocs, alloc.ID) 1629 c.allocLock.Unlock() 1630 1631 // Ensure the GC has a reference and then collect. Collecting through the GC 1632 // applies rate limiting 1633 c.garbageCollector.MarkForCollection(ar) 1634 1635 // GC immediately since the server has GC'd it 1636 go c.garbageCollector.Collect(alloc.ID) 1637 } 1638 1639 // updateAlloc is invoked when we should update an allocation 1640 func (c *Client) updateAlloc(exist, update *structs.Allocation) error { 1641 c.allocLock.RLock() 1642 ar, ok := c.allocs[exist.ID] 1643 c.allocLock.RUnlock() 1644 if !ok { 1645 c.logger.Printf("[WARN] client: missing context for alloc '%s'", exist.ID) 1646 return nil 1647 } 1648 1649 ar.Update(update) 1650 return nil 1651 } 1652 1653 // addAlloc is invoked when we should add an allocation 1654 func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error { 1655 // Check if we already have an alloc runner 1656 c.allocLock.Lock() 1657 defer c.allocLock.Unlock() 1658 if _, ok := c.allocs[alloc.ID]; ok { 1659 c.logger.Printf("[DEBUG]: client: dropping duplicate add allocation request: %q", alloc.ID) 1660 return nil 1661 } 1662 1663 // get the previous alloc runner - if one exists - for the 1664 // blocking/migrating watcher 1665 var prevAR *AllocRunner 1666 if alloc.PreviousAllocation != "" { 1667 prevAR = c.allocs[alloc.PreviousAllocation] 1668 } 1669 1670 c.configLock.RLock() 1671 prevAlloc := newAllocWatcher(alloc, prevAR, c, c.configCopy, c.logger, migrateToken) 1672 1673 ar := NewAllocRunner(c.logger, c.configCopy, c.stateDB, c.updateAllocStatus, 
alloc, c.vaultClient, c.consulService, prevAlloc) 1674 c.configLock.RUnlock() 1675 1676 // Store the alloc runner. 1677 c.allocs[alloc.ID] = ar 1678 1679 if err := ar.SaveState(); err != nil { 1680 c.logger.Printf("[WARN] client: initial save state for alloc %q failed: %v", alloc.ID, err) 1681 } 1682 1683 go ar.Run() 1684 return nil 1685 } 1686 1687 // setupVaultClient creates an object to periodically renew tokens and secrets 1688 // with vault. 1689 func (c *Client) setupVaultClient() error { 1690 var err error 1691 c.vaultClient, err = vaultclient.NewVaultClient(c.config.VaultConfig, c.logger, c.deriveToken) 1692 if err != nil { 1693 return err 1694 } 1695 1696 if c.vaultClient == nil { 1697 c.logger.Printf("[ERR] client: failed to create vault client") 1698 return fmt.Errorf("failed to create vault client") 1699 } 1700 1701 // Start renewing tokens and secrets 1702 c.vaultClient.Start() 1703 1704 return nil 1705 } 1706 1707 // deriveToken takes in an allocation and a set of tasks and derives vault 1708 // tokens for each of the tasks, unwraps all of them using the supplied vault 1709 // client and returns a map of unwrapped tokens, indexed by the task name. 
1710 func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vclient *vaultapi.Client) (map[string]string, error) { 1711 if alloc == nil { 1712 return nil, fmt.Errorf("nil allocation") 1713 } 1714 1715 if taskNames == nil || len(taskNames) == 0 { 1716 return nil, fmt.Errorf("missing task names") 1717 } 1718 1719 group := alloc.Job.LookupTaskGroup(alloc.TaskGroup) 1720 if group == nil { 1721 return nil, fmt.Errorf("group name in allocation is not present in job") 1722 } 1723 1724 verifiedTasks := []string{} 1725 // Check if the given task names actually exist in the allocation 1726 for _, taskName := range taskNames { 1727 found := false 1728 for _, task := range group.Tasks { 1729 if task.Name == taskName { 1730 found = true 1731 } 1732 } 1733 if !found { 1734 c.logger.Printf("[ERR] task %q not found in the allocation", taskName) 1735 return nil, fmt.Errorf("task %q not found in the allocaition", taskName) 1736 } 1737 verifiedTasks = append(verifiedTasks, taskName) 1738 } 1739 1740 // DeriveVaultToken of nomad server can take in a set of tasks and 1741 // creates tokens for all the tasks. 
1742 req := &structs.DeriveVaultTokenRequest{ 1743 NodeID: c.NodeID(), 1744 SecretID: c.secretNodeID(), 1745 AllocID: alloc.ID, 1746 Tasks: verifiedTasks, 1747 QueryOptions: structs.QueryOptions{ 1748 Region: c.Region(), 1749 AllowStale: false, 1750 }, 1751 } 1752 1753 // Derive the tokens 1754 var resp structs.DeriveVaultTokenResponse 1755 if err := c.RPC("Node.DeriveVaultToken", &req, &resp); err != nil { 1756 c.logger.Printf("[ERR] client.vault: DeriveVaultToken RPC failed: %v", err) 1757 return nil, fmt.Errorf("DeriveVaultToken RPC failed: %v", err) 1758 } 1759 if resp.Error != nil { 1760 c.logger.Printf("[ERR] client.vault: failed to derive vault tokens: %v", resp.Error) 1761 return nil, resp.Error 1762 } 1763 if resp.Tasks == nil { 1764 c.logger.Printf("[ERR] client.vault: failed to derive vault token: invalid response") 1765 return nil, fmt.Errorf("failed to derive vault tokens: invalid response") 1766 } 1767 1768 unwrappedTokens := make(map[string]string) 1769 1770 // Retrieve the wrapped tokens from the response and unwrap it 1771 for _, taskName := range verifiedTasks { 1772 // Get the wrapped token 1773 wrappedToken, ok := resp.Tasks[taskName] 1774 if !ok { 1775 c.logger.Printf("[ERR] client.vault: wrapped token missing for task %q", taskName) 1776 return nil, fmt.Errorf("wrapped token missing for task %q", taskName) 1777 } 1778 1779 // Unwrap the vault token 1780 unwrapResp, err := vclient.Logical().Unwrap(wrappedToken) 1781 if err != nil { 1782 return nil, fmt.Errorf("failed to unwrap the token for task %q: %v", taskName, err) 1783 } 1784 if unwrapResp == nil || unwrapResp.Auth == nil || unwrapResp.Auth.ClientToken == "" { 1785 return nil, fmt.Errorf("failed to unwrap the token for task %q", taskName) 1786 } 1787 1788 // Append the unwrapped token to the return value 1789 unwrappedTokens[taskName] = unwrapResp.Auth.ClientToken 1790 } 1791 1792 return unwrappedTokens, nil 1793 } 1794 1795 // triggerDiscovery causes a Consul discovery to begin (if one 
hasn't alread) 1796 func (c *Client) triggerDiscovery() { 1797 select { 1798 case c.triggerDiscoveryCh <- struct{}{}: 1799 // Discovery goroutine was released to execute 1800 default: 1801 // Discovery goroutine was already running 1802 } 1803 } 1804 1805 // consulDiscovery waits for the signal to attempt server discovery via Consul. 1806 // It's intended to be started in a goroutine. See triggerDiscovery() for 1807 // causing consul discovery from other code locations. 1808 func (c *Client) consulDiscovery() { 1809 for { 1810 select { 1811 case <-c.triggerDiscoveryCh: 1812 if err := c.consulDiscoveryImpl(); err != nil { 1813 c.logger.Printf("[ERR] client.consul: error discovering nomad servers: %v", err) 1814 } 1815 case <-c.shutdownCh: 1816 return 1817 } 1818 } 1819 } 1820 1821 func (c *Client) consulDiscoveryImpl() error { 1822 // Acquire heartbeat lock to prevent heartbeat from running 1823 // concurrently with discovery. Concurrent execution is safe, however 1824 // discovery is usually triggered when heartbeating has failed so 1825 // there's no point in allowing it. 1826 c.heartbeatLock.Lock() 1827 defer c.heartbeatLock.Unlock() 1828 1829 dcs, err := c.consulCatalog.Datacenters() 1830 if err != nil { 1831 return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err) 1832 } 1833 if len(dcs) > 2 { 1834 // Query the local DC first, then shuffle the 1835 // remaining DCs. Future heartbeats will cause Nomad 1836 // Clients to fixate on their local datacenter so 1837 // it's okay to talk with remote DCs. If the no 1838 // Nomad servers are available within 1839 // datacenterQueryLimit, the next heartbeat will pick 1840 // a new set of servers so it's okay. 
1841 shuffleStrings(dcs[1:]) 1842 dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)] 1843 } 1844 1845 // Query for servers in this client's region only 1846 region := c.Region() 1847 rpcargs := structs.GenericRequest{ 1848 QueryOptions: structs.QueryOptions{ 1849 Region: region, 1850 }, 1851 } 1852 1853 serviceName := c.configCopy.ConsulConfig.ServerServiceName 1854 var mErr multierror.Error 1855 var nomadServers servers.Servers 1856 c.logger.Printf("[DEBUG] client.consul: bootstrap contacting following Consul DCs: %+q", dcs) 1857 DISCOLOOP: 1858 for _, dc := range dcs { 1859 consulOpts := &consulapi.QueryOptions{ 1860 AllowStale: true, 1861 Datacenter: dc, 1862 Near: "_agent", 1863 WaitTime: consul.DefaultQueryWaitDuration, 1864 } 1865 consulServices, _, err := c.consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts) 1866 if err != nil { 1867 mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err)) 1868 continue 1869 } 1870 1871 for _, s := range consulServices { 1872 port := strconv.Itoa(s.ServicePort) 1873 addrstr := s.ServiceAddress 1874 if addrstr == "" { 1875 addrstr = s.Address 1876 } 1877 addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(addrstr, port)) 1878 if err != nil { 1879 mErr.Errors = append(mErr.Errors, err) 1880 continue 1881 } 1882 var peers []string 1883 if err := c.connPool.RPC(region, addr, c.RPCMajorVersion(), "Status.Peers", rpcargs, &peers); err != nil { 1884 mErr.Errors = append(mErr.Errors, err) 1885 continue 1886 } 1887 1888 // Successfully received the Server peers list of the correct 1889 // region 1890 for _, p := range peers { 1891 addr, err := net.ResolveTCPAddr("tcp", p) 1892 if err != nil { 1893 mErr.Errors = append(mErr.Errors, err) 1894 } 1895 srv := &servers.Server{Addr: addr} 1896 nomadServers = append(nomadServers, srv) 1897 } 1898 if len(nomadServers) > 0 { 1899 break DISCOLOOP 1900 } 1901 } 1902 } 1903 if len(nomadServers) == 0 
{ 1904 if len(mErr.Errors) > 0 { 1905 return mErr.ErrorOrNil() 1906 } 1907 return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %+q", serviceName, dcs) 1908 } 1909 1910 c.logger.Printf("[INFO] client.consul: discovered following Servers: %s", nomadServers) 1911 c.servers.SetServers(nomadServers) 1912 1913 // Notify waiting rpc calls. If a goroutine just failed an RPC call and 1914 // isn't receiving on this chan yet they'll still retry eventually. 1915 // This is a shortcircuit for the longer retry intervals. 1916 for { 1917 select { 1918 case c.serversDiscoveredCh <- struct{}{}: 1919 default: 1920 return nil 1921 } 1922 } 1923 } 1924 1925 // emitStats collects host resource usage stats periodically 1926 func (c *Client) emitStats() { 1927 // Assign labels directly before emitting stats so the information expected 1928 // is ready 1929 c.baseLabels = []metrics.Label{{Name: "node_id", Value: c.NodeID()}, {Name: "datacenter", Value: c.Datacenter()}} 1930 1931 // Start collecting host stats right away and then keep collecting every 1932 // collection interval 1933 next := time.NewTimer(0) 1934 defer next.Stop() 1935 for { 1936 select { 1937 case <-next.C: 1938 err := c.hostStatsCollector.Collect() 1939 next.Reset(c.config.StatsCollectionInterval) 1940 if err != nil { 1941 c.logger.Printf("[WARN] client: error fetching host resource usage stats: %v", err) 1942 continue 1943 } 1944 1945 // Publish Node metrics if operator has opted in 1946 if c.config.PublishNodeMetrics { 1947 c.emitHostStats() 1948 } 1949 1950 c.emitClientMetrics() 1951 case <-c.shutdownCh: 1952 return 1953 } 1954 } 1955 } 1956 1957 // setGaugeForMemoryStats proxies metrics for memory specific statistics 1958 func (c *Client) setGaugeForMemoryStats(nodeID string, hStats *stats.HostStats) { 1959 if !c.config.DisableTaggedMetrics { 1960 metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "total"}, float32(hStats.Memory.Total), c.baseLabels) 1961 
metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "available"}, float32(hStats.Memory.Available), c.baseLabels) 1962 metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "used"}, float32(hStats.Memory.Used), c.baseLabels) 1963 metrics.SetGaugeWithLabels([]string{"client", "host", "memory", "free"}, float32(hStats.Memory.Free), c.baseLabels) 1964 } 1965 1966 if c.config.BackwardsCompatibleMetrics { 1967 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total)) 1968 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available)) 1969 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used)) 1970 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "free"}, float32(hStats.Memory.Free)) 1971 } 1972 } 1973 1974 // setGaugeForCPUStats proxies metrics for CPU specific statistics 1975 func (c *Client) setGaugeForCPUStats(nodeID string, hStats *stats.HostStats) { 1976 for _, cpu := range hStats.CPU { 1977 if !c.config.DisableTaggedMetrics { 1978 labels := append(c.baseLabels, metrics.Label{ 1979 Name: "cpu", 1980 Value: cpu.CPU, 1981 }) 1982 1983 metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "total"}, float32(cpu.Total), labels) 1984 metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "user"}, float32(cpu.User), labels) 1985 metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "idle"}, float32(cpu.Idle), labels) 1986 metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "system"}, float32(cpu.System), labels) 1987 } 1988 1989 if c.config.BackwardsCompatibleMetrics { 1990 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "total"}, float32(cpu.Total)) 1991 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "user"}, float32(cpu.User)) 1992 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "idle"}, float32(cpu.Idle)) 1993 
metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "system"}, float32(cpu.System)) 1994 } 1995 } 1996 } 1997 1998 // setGaugeForDiskStats proxies metrics for disk specific statistics 1999 func (c *Client) setGaugeForDiskStats(nodeID string, hStats *stats.HostStats) { 2000 for _, disk := range hStats.DiskStats { 2001 if !c.config.DisableTaggedMetrics { 2002 labels := append(c.baseLabels, metrics.Label{ 2003 Name: "disk", 2004 Value: disk.Device, 2005 }) 2006 2007 metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "size"}, float32(disk.Size), labels) 2008 metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used"}, float32(disk.Used), labels) 2009 metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "available"}, float32(disk.Available), labels) 2010 metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "used_percent"}, float32(disk.UsedPercent), labels) 2011 metrics.SetGaugeWithLabels([]string{"client", "host", "disk", "inodes_percent"}, float32(disk.InodesUsedPercent), labels) 2012 } 2013 2014 if c.config.BackwardsCompatibleMetrics { 2015 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "size"}, float32(disk.Size)) 2016 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used"}, float32(disk.Used)) 2017 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "available"}, float32(disk.Available)) 2018 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used_percent"}, float32(disk.UsedPercent)) 2019 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent)) 2020 } 2021 } 2022 } 2023 2024 // setGaugeForAllocationStats proxies metrics for allocation specific statistics 2025 func (c *Client) setGaugeForAllocationStats(nodeID string) { 2026 c.configLock.RLock() 2027 node := c.configCopy.Node 2028 c.configLock.RUnlock() 2029 total := node.Resources 2030 res := 
node.Reserved 2031 allocated := c.getAllocatedResources(node) 2032 2033 // Emit allocated 2034 if !c.config.DisableTaggedMetrics { 2035 metrics.SetGaugeWithLabels([]string{"client", "allocated", "memory"}, float32(allocated.MemoryMB), c.baseLabels) 2036 metrics.SetGaugeWithLabels([]string{"client", "allocated", "disk"}, float32(allocated.DiskMB), c.baseLabels) 2037 metrics.SetGaugeWithLabels([]string{"client", "allocated", "cpu"}, float32(allocated.CPU), c.baseLabels) 2038 metrics.SetGaugeWithLabels([]string{"client", "allocated", "iops"}, float32(allocated.IOPS), c.baseLabels) 2039 } 2040 2041 if c.config.BackwardsCompatibleMetrics { 2042 metrics.SetGauge([]string{"client", "allocated", "memory", nodeID}, float32(allocated.MemoryMB)) 2043 metrics.SetGauge([]string{"client", "allocated", "disk", nodeID}, float32(allocated.DiskMB)) 2044 metrics.SetGauge([]string{"client", "allocated", "cpu", nodeID}, float32(allocated.CPU)) 2045 metrics.SetGauge([]string{"client", "allocated", "iops", nodeID}, float32(allocated.IOPS)) 2046 } 2047 2048 for _, n := range allocated.Networks { 2049 if !c.config.DisableTaggedMetrics { 2050 labels := append(c.baseLabels, metrics.Label{ 2051 Name: "device", 2052 Value: n.Device, 2053 }) 2054 metrics.SetGaugeWithLabels([]string{"client", "allocated", "network"}, float32(n.MBits), labels) 2055 } 2056 2057 if c.config.BackwardsCompatibleMetrics { 2058 metrics.SetGauge([]string{"client", "allocated", "network", n.Device, nodeID}, float32(n.MBits)) 2059 } 2060 } 2061 2062 // Emit unallocated 2063 unallocatedMem := total.MemoryMB - res.MemoryMB - allocated.MemoryMB 2064 unallocatedDisk := total.DiskMB - res.DiskMB - allocated.DiskMB 2065 unallocatedCpu := total.CPU - res.CPU - allocated.CPU 2066 unallocatedIops := total.IOPS - res.IOPS - allocated.IOPS 2067 2068 if !c.config.DisableTaggedMetrics { 2069 metrics.SetGaugeWithLabels([]string{"client", "unallocated", "memory"}, float32(unallocatedMem), c.baseLabels) 2070 
metrics.SetGaugeWithLabels([]string{"client", "unallocated", "disk"}, float32(unallocatedDisk), c.baseLabels) 2071 metrics.SetGaugeWithLabels([]string{"client", "unallocated", "cpu"}, float32(unallocatedCpu), c.baseLabels) 2072 metrics.SetGaugeWithLabels([]string{"client", "unallocated", "iops"}, float32(unallocatedIops), c.baseLabels) 2073 } 2074 2075 if c.config.BackwardsCompatibleMetrics { 2076 metrics.SetGauge([]string{"client", "unallocated", "memory", nodeID}, float32(unallocatedMem)) 2077 metrics.SetGauge([]string{"client", "unallocated", "disk", nodeID}, float32(unallocatedDisk)) 2078 metrics.SetGauge([]string{"client", "unallocated", "cpu", nodeID}, float32(unallocatedCpu)) 2079 metrics.SetGauge([]string{"client", "unallocated", "iops", nodeID}, float32(unallocatedIops)) 2080 } 2081 2082 for _, n := range allocated.Networks { 2083 totalIdx := total.NetIndex(n) 2084 if totalIdx != -1 { 2085 continue 2086 } 2087 2088 totalMbits := total.Networks[totalIdx].MBits 2089 unallocatedMbits := totalMbits - n.MBits 2090 2091 if !c.config.DisableTaggedMetrics { 2092 labels := append(c.baseLabels, metrics.Label{ 2093 Name: "device", 2094 Value: n.Device, 2095 }) 2096 metrics.SetGaugeWithLabels([]string{"client", "unallocated", "network"}, float32(unallocatedMbits), labels) 2097 } 2098 2099 if c.config.BackwardsCompatibleMetrics { 2100 metrics.SetGauge([]string{"client", "unallocated", "network", n.Device, nodeID}, float32(unallocatedMbits)) 2101 } 2102 } 2103 } 2104 2105 // No lables are required so we emit with only a key/value syntax 2106 func (c *Client) setGaugeForUptime(hStats *stats.HostStats) { 2107 if !c.config.DisableTaggedMetrics { 2108 metrics.SetGaugeWithLabels([]string{"uptime"}, float32(hStats.Uptime), c.baseLabels) 2109 } 2110 if c.config.BackwardsCompatibleMetrics { 2111 metrics.SetGauge([]string{"uptime"}, float32(hStats.Uptime)) 2112 } 2113 } 2114 2115 // emitHostStats pushes host resource usage stats to remote metrics collection sinks 2116 func (c 
*Client) emitHostStats() { 2117 nodeID := c.NodeID() 2118 hStats := c.hostStatsCollector.Stats() 2119 2120 c.setGaugeForMemoryStats(nodeID, hStats) 2121 c.setGaugeForUptime(hStats) 2122 c.setGaugeForCPUStats(nodeID, hStats) 2123 c.setGaugeForDiskStats(nodeID, hStats) 2124 } 2125 2126 // emitClientMetrics emits lower volume client metrics 2127 func (c *Client) emitClientMetrics() { 2128 nodeID := c.NodeID() 2129 2130 c.setGaugeForAllocationStats(nodeID) 2131 2132 // Emit allocation metrics 2133 blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0 2134 for _, ar := range c.getAllocRunners() { 2135 switch ar.Alloc().ClientStatus { 2136 case structs.AllocClientStatusPending: 2137 switch { 2138 case ar.IsWaiting(): 2139 blocked++ 2140 case ar.IsMigrating(): 2141 migrating++ 2142 default: 2143 pending++ 2144 } 2145 case structs.AllocClientStatusRunning: 2146 running++ 2147 case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed: 2148 terminal++ 2149 } 2150 } 2151 2152 if !c.config.DisableTaggedMetrics { 2153 metrics.SetGaugeWithLabels([]string{"client", "allocations", "migrating"}, float32(migrating), c.baseLabels) 2154 metrics.SetGaugeWithLabels([]string{"client", "allocations", "blocked"}, float32(blocked), c.baseLabels) 2155 metrics.SetGaugeWithLabels([]string{"client", "allocations", "pending"}, float32(pending), c.baseLabels) 2156 metrics.SetGaugeWithLabels([]string{"client", "allocations", "running"}, float32(running), c.baseLabels) 2157 metrics.SetGaugeWithLabels([]string{"client", "allocations", "terminal"}, float32(terminal), c.baseLabels) 2158 } 2159 2160 if c.config.BackwardsCompatibleMetrics { 2161 metrics.SetGauge([]string{"client", "allocations", "migrating", nodeID}, float32(migrating)) 2162 metrics.SetGauge([]string{"client", "allocations", "blocked", nodeID}, float32(blocked)) 2163 metrics.SetGauge([]string{"client", "allocations", "pending", nodeID}, float32(pending)) 2164 metrics.SetGauge([]string{"client", "allocations", 
"running", nodeID}, float32(running)) 2165 metrics.SetGauge([]string{"client", "allocations", "terminal", nodeID}, float32(terminal)) 2166 } 2167 } 2168 2169 func (c *Client) getAllocatedResources(selfNode *structs.Node) *structs.Resources { 2170 // Unfortunately the allocs only have IP so we need to match them to the 2171 // device 2172 cidrToDevice := make(map[*net.IPNet]string, len(selfNode.Resources.Networks)) 2173 for _, n := range selfNode.Resources.Networks { 2174 _, ipnet, err := net.ParseCIDR(n.CIDR) 2175 if err != nil { 2176 continue 2177 } 2178 cidrToDevice[ipnet] = n.Device 2179 } 2180 2181 // Sum the allocated resources 2182 allocs := c.allAllocs() 2183 var allocated structs.Resources 2184 allocatedDeviceMbits := make(map[string]int) 2185 for _, alloc := range allocs { 2186 if !alloc.TerminalStatus() { 2187 allocated.Add(alloc.Resources) 2188 for _, allocatedNetwork := range alloc.Resources.Networks { 2189 for cidr, dev := range cidrToDevice { 2190 ip := net.ParseIP(allocatedNetwork.IP) 2191 if cidr.Contains(ip) { 2192 allocatedDeviceMbits[dev] += allocatedNetwork.MBits 2193 break 2194 } 2195 } 2196 } 2197 } 2198 } 2199 2200 // Clear the networks 2201 allocated.Networks = nil 2202 for dev, speed := range allocatedDeviceMbits { 2203 net := &structs.NetworkResource{ 2204 Device: dev, 2205 MBits: speed, 2206 } 2207 allocated.Networks = append(allocated.Networks, net) 2208 } 2209 2210 return &allocated 2211 } 2212 2213 // allAllocs returns all the allocations managed by the client 2214 func (c *Client) allAllocs() map[string]*structs.Allocation { 2215 ars := c.getAllocRunners() 2216 allocs := make(map[string]*structs.Allocation, len(ars)) 2217 for _, ar := range c.getAllocRunners() { 2218 a := ar.Alloc() 2219 allocs[a.ID] = a 2220 } 2221 return allocs 2222 }