github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/client/client.go

package client

import (
	"archive/tar"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"net"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	consulapi "github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/lib"
	"github.com/hashicorp/go-multierror"
	nomadapi "github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/driver"
	"github.com/hashicorp/nomad/client/fingerprint"
	"github.com/hashicorp/nomad/client/stats"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper/tlsutil"
	"github.com/hashicorp/nomad/nomad"
	"github.com/hashicorp/nomad/nomad/structs"
	vaultapi "github.com/hashicorp/vault/api"
	"github.com/mitchellh/hashstructure"
)

const (
	// clientRPCCache controls how long we keep an idle connection
	// open to a server
	clientRPCCache = 5 * time.Minute

	// clientMaxStreams controls how many idle streams we keep
	// open to a server
	clientMaxStreams = 2

	// datacenterQueryLimit searches through up to this many adjacent
	// datacenters looking for the Nomad server service.
	datacenterQueryLimit = 9

	// consulReaperIntv is the interval at which the Consul reaper will
	// run.
	consulReaperIntv = 5 * time.Second

	// registerRetryIntv is the minimum interval on which we retry
	// registration. We pick a value between this and 2x this.
	registerRetryIntv = 15 * time.Second

	// getAllocRetryIntv is the minimum interval on which we retry
	// to fetch allocations. We pick a value between this and 2x this.
	getAllocRetryIntv = 30 * time.Second

	// devModeRetryIntv is the retry interval used for development
	devModeRetryIntv = time.Second

	// stateSnapshotIntv is how often the client snapshots state
	stateSnapshotIntv = 60 * time.Second

	// initialHeartbeatStagger is used to stagger the interval between
	// starting and the initial heartbeat. After the initial heartbeat,
	// we switch to using the TTL specified by the servers.
	initialHeartbeatStagger = 10 * time.Second

	// nodeUpdateRetryIntv is how often the client checks for updates to the
	// node attributes or meta map.
	nodeUpdateRetryIntv = 5 * time.Second

	// allocSyncIntv is the batching period of allocation updates before they
	// are synced with the server.
	allocSyncIntv = 200 * time.Millisecond

	// allocSyncRetryIntv is the interval on which we retry updating
	// the status of the allocation
	allocSyncRetryIntv = 5 * time.Second
)

// ClientStatsReporter exposes all the APIs related to resource usage of a Nomad
// Client
type ClientStatsReporter interface {
	// GetAllocStats returns the AllocStatsReporter for the passed allocation.
	// If it does not exist an error is reported.
	GetAllocStats(allocID string) (AllocStatsReporter, error)

	// LatestHostStats returns the latest resource usage stats for the host
	LatestHostStats() *stats.HostStats
}

// Client is used to implement the client interaction with Nomad. Clients
// are expected to register as a schedulable node to the servers, and to
// run allocations as determined by the servers.
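//
// A minimal construction sketch (illustrative only; cfg, consulSyncer and the
// error handling are assumptions, not code from this file). A caller builds a
// *config.Config and a *consul.Syncer, hands them to NewClient below, and
// tears the client down with Shutdown:
//
//	logger := log.New(os.Stderr, "", log.LstdFlags)
//	client, err := NewClient(cfg, consulSyncer, logger)
//	if err != nil {
//		logger.Fatalf("failed to create Nomad client: %v", err)
//	}
//	defer client.Shutdown()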
type Client struct {
	config *config.Config
	start  time.Time

	// configCopy is a copy that should be passed to alloc-runners.
	configCopy *config.Config
	configLock sync.RWMutex

	logger *log.Logger

	connPool *nomad.ConnPool

	// servers is the (optionally prioritized) list of nomad servers
	servers *serverlist

	// heartbeat related times for tracking how often to heartbeat
	lastHeartbeat time.Time
	heartbeatTTL  time.Duration
	heartbeatLock sync.Mutex

	// triggerDiscoveryCh triggers Consul discovery; see triggerDiscovery
	triggerDiscoveryCh chan struct{}

	// serversDiscoveredCh is ticked whenever Consul discovery completes
	// successfully
	serversDiscoveredCh chan struct{}

	// allocs is the current set of allocations
	allocs    map[string]*AllocRunner
	allocLock sync.RWMutex

	// blockedAllocations are allocations which are blocked because their
	// chained allocations haven't finished running
	blockedAllocations map[string]*structs.Allocation
	blockedAllocsLock  sync.RWMutex

	// allocUpdates stores allocations that need to be synced to the server.
	allocUpdates chan *structs.Allocation

	// consulSyncer advertises this Nomad Agent with Consul
	consulSyncer *consul.Syncer

	// hostStatsCollector collects host resource usage stats
	hostStatsCollector *stats.HostStatsCollector
	resourceUsage      *stats.HostStats
	resourceUsageLock  sync.RWMutex

	shutdown     bool
	shutdownCh   chan struct{}
	shutdownLock sync.Mutex

	// vaultClient is used to interact with Vault for token and secret renewals
	vaultClient vaultclient.VaultClient

	// migratingAllocs is the set of allocs whose data migration is in flight
	migratingAllocs     map[string]chan struct{}
	migratingAllocsLock sync.Mutex
}

var (
	// noServersErr is returned by the RPC method when the client has no
	// configured servers. This is used to trigger Consul discovery if
	// enabled.
163 noServersErr = errors.New("no servers") 164 ) 165 166 // NewClient is used to create a new client from the given configuration 167 func NewClient(cfg *config.Config, consulSyncer *consul.Syncer, logger *log.Logger) (*Client, error) { 168 // Create the tls wrapper 169 var tlsWrap tlsutil.RegionWrapper 170 if cfg.TLSConfig.EnableRPC { 171 tw, err := cfg.TLSConfiguration().OutgoingTLSWrapper() 172 if err != nil { 173 return nil, err 174 } 175 tlsWrap = tw 176 } 177 178 // Create the client 179 c := &Client{ 180 config: cfg, 181 consulSyncer: consulSyncer, 182 start: time.Now(), 183 connPool: nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, tlsWrap), 184 logger: logger, 185 hostStatsCollector: stats.NewHostStatsCollector(), 186 allocs: make(map[string]*AllocRunner), 187 blockedAllocations: make(map[string]*structs.Allocation), 188 allocUpdates: make(chan *structs.Allocation, 64), 189 shutdownCh: make(chan struct{}), 190 migratingAllocs: make(map[string]chan struct{}), 191 servers: newServerList(), 192 triggerDiscoveryCh: make(chan struct{}), 193 serversDiscoveredCh: make(chan struct{}), 194 } 195 196 // Initialize the client 197 if err := c.init(); err != nil { 198 return nil, fmt.Errorf("failed to initialize client: %v", err) 199 } 200 201 // Setup the node 202 if err := c.setupNode(); err != nil { 203 return nil, fmt.Errorf("node setup failed: %v", err) 204 } 205 206 // Fingerprint the node 207 if err := c.fingerprint(); err != nil { 208 return nil, fmt.Errorf("fingerprinting failed: %v", err) 209 } 210 211 // Scan for drivers 212 if err := c.setupDrivers(); err != nil { 213 return nil, fmt.Errorf("driver setup failed: %v", err) 214 } 215 216 // Setup the reserved resources 217 c.reservePorts() 218 219 // Store the config copy before restoring state but after it has been 220 // initialized. 221 c.configLock.Lock() 222 c.configCopy = c.config.Copy() 223 c.configLock.Unlock() 224 225 // Set the preconfigured list of static servers 226 c.configLock.RLock() 227 if len(c.configCopy.Servers) > 0 { 228 if err := c.SetServers(c.configCopy.Servers); err != nil { 229 logger.Printf("[WARN] client: None of the configured servers are valid: %v", err) 230 } 231 } 232 c.configLock.RUnlock() 233 234 // Setup Consul discovery if enabled 235 if c.configCopy.ConsulConfig.ClientAutoJoin { 236 go c.consulDiscovery() 237 if len(c.servers.all()) == 0 { 238 // No configured servers; trigger discovery manually 239 c.triggerDiscoveryCh <- struct{}{} 240 } 241 } 242 243 // Start Consul reaper 244 go c.consulReaper() 245 246 // Setup the vault client for token and secret renewals 247 if err := c.setupVaultClient(); err != nil { 248 return nil, fmt.Errorf("failed to setup vault client: %v", err) 249 } 250 251 // Restore the state 252 if err := c.restoreState(); err != nil { 253 return nil, fmt.Errorf("failed to restore state: %v", err) 254 } 255 256 // Register and then start heartbeating to the servers. 257 go c.registerAndHeartbeat() 258 259 // Begin periodic snapshotting of state. 260 go c.periodicSnapshot() 261 262 // Begin syncing allocations to the server 263 go c.allocSync() 264 265 // Start the client! 266 go c.run() 267 268 // Start collecting stats 269 go c.collectHostStats() 270 271 c.logger.Printf("[INFO] client: Node ID %q", c.Node().ID) 272 return c, nil 273 } 274 275 // init is used to initialize the client and perform any setup 276 // needed before we begin starting its various components. 
func (c *Client) init() error {
	// Ensure the state dir exists if we have one
	if c.config.StateDir != "" {
		if err := os.MkdirAll(c.config.StateDir, 0700); err != nil {
			return fmt.Errorf("failed creating state dir: %s", err)
		}

	} else {
		// Otherwise make a temp directory to use.
		p, err := ioutil.TempDir("", "NomadClient")
		if err != nil {
			return fmt.Errorf("failed creating temporary directory for the StateDir: %v", err)
		}

		p, err = filepath.EvalSymlinks(p)
		if err != nil {
			return fmt.Errorf("failed to find temporary directory for the StateDir: %v", err)
		}

		c.config.StateDir = p
	}
	c.logger.Printf("[INFO] client: using state directory %v", c.config.StateDir)

	// Ensure the alloc dir exists if we have one
	if c.config.AllocDir != "" {
		if err := os.MkdirAll(c.config.AllocDir, 0755); err != nil {
			return fmt.Errorf("failed creating alloc dir: %s", err)
		}
	} else {
		// Otherwise make a temp directory to use.
		p, err := ioutil.TempDir("", "NomadClient")
		if err != nil {
			return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err)
		}

		p, err = filepath.EvalSymlinks(p)
		if err != nil {
			return fmt.Errorf("failed to find temporary directory for the AllocDir: %v", err)
		}

		c.config.AllocDir = p
	}

	c.logger.Printf("[INFO] client: using alloc directory %v", c.config.AllocDir)
	return nil
}

// Leave is used to prepare the client to leave the cluster
func (c *Client) Leave() error {
	// TODO
	return nil
}

// Datacenter returns the datacenter for the given client
func (c *Client) Datacenter() string {
	c.configLock.RLock()
	dc := c.configCopy.Node.Datacenter
	c.configLock.RUnlock()
	return dc
}

// Region returns the region for the given client
func (c *Client) Region() string {
	return c.config.Region
}

// RPCMajorVersion returns the structs.ApiMajorVersion supported by the
// client.
func (c *Client) RPCMajorVersion() int {
	return structs.ApiMajorVersion
}

// RPCMinorVersion returns the structs.ApiMinorVersion supported by the
// client.
func (c *Client) RPCMinorVersion() int {
	return structs.ApiMinorVersion
}

// Shutdown is used to tear down the client
func (c *Client) Shutdown() error {
	c.logger.Printf("[INFO] client: shutting down")
	c.shutdownLock.Lock()
	defer c.shutdownLock.Unlock()

	if c.shutdown {
		return nil
	}

	// Stop renewing tokens and secrets
	if c.vaultClient != nil {
		c.vaultClient.Stop()
	}

	// Destroy all the running allocations.
	if c.config.DevMode {
		c.allocLock.Lock()
		for _, ar := range c.allocs {
			ar.Destroy()
			<-ar.WaitCh()
		}
		c.allocLock.Unlock()
	}

	c.shutdown = true
	close(c.shutdownCh)
	c.connPool.Shutdown()
	return c.saveState()
}

// RPC is used to forward an RPC call to a nomad server, or fail if no servers.
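// The call is attempted against each known server until one succeeds. A
// minimal usage sketch, mirroring the Status.Peers call made during Consul
// discovery further below (the request values are illustrative):
//
//	var peers []string
//	args := structs.GenericRequest{
//		QueryOptions: structs.QueryOptions{Region: c.Region()},
//	}
//	if err := c.RPC("Status.Peers", &args, &peers); err != nil {
//		// no configured server could be reached; caller decides how to retry
//	}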
func (c *Client) RPC(method string, args interface{}, reply interface{}) error {
	// Invoke the RPCHandler if it exists
	if c.config.RPCHandler != nil {
		return c.config.RPCHandler.RPC(method, args, reply)
	}

	servers := c.servers.all()
	if len(servers) == 0 {
		return noServersErr
	}

	var mErr multierror.Error
	for _, s := range servers {
		// Make the RPC request
		if err := c.connPool.RPC(c.Region(), s.addr, c.RPCMajorVersion(), method, args, reply); err != nil {
			errmsg := fmt.Errorf("RPC failed to server %s: %v", s.addr, err)
			mErr.Errors = append(mErr.Errors, errmsg)
			c.logger.Printf("[DEBUG] client: %v", errmsg)
			c.servers.failed(s)
			continue
		}
		c.servers.good(s)
		return nil
	}

	return mErr.ErrorOrNil()
}

// Stats is used to return statistics for debugging and insight
// for various sub-systems
func (c *Client) Stats() map[string]map[string]string {
	c.allocLock.RLock()
	numAllocs := len(c.allocs)
	c.allocLock.RUnlock()

	c.heartbeatLock.Lock()
	defer c.heartbeatLock.Unlock()
	stats := map[string]map[string]string{
		"client": map[string]string{
			"node_id":         c.Node().ID,
			"known_servers":   c.servers.all().String(),
			"num_allocations": strconv.Itoa(numAllocs),
			"last_heartbeat":  fmt.Sprintf("%v", time.Since(c.lastHeartbeat)),
			"heartbeat_ttl":   fmt.Sprintf("%v", c.heartbeatTTL),
		},
		"runtime": nomad.RuntimeStats(),
	}
	return stats
}

// Node returns the locally registered node
func (c *Client) Node() *structs.Node {
	c.configLock.RLock()
	defer c.configLock.RUnlock()
	return c.config.Node
}

// StatsReporter exposes the various APIs related to resource usage of a Nomad
// client
func (c *Client) StatsReporter() ClientStatsReporter {
	return c
}

func (c *Client) GetAllocStats(allocID string) (AllocStatsReporter, error) {
	c.allocLock.RLock()
	defer c.allocLock.RUnlock()
	ar, ok := c.allocs[allocID]
	if !ok {
		return nil, fmt.Errorf("unknown allocation ID %q", allocID)
	}
	return ar.StatsReporter(), nil
}

// LatestHostStats returns the latest host resource usage stats collected for
// this Nomad client
func (c *Client) LatestHostStats() *stats.HostStats {
	c.resourceUsageLock.RLock()
	defer c.resourceUsageLock.RUnlock()
	return c.resourceUsage
}

// GetAllocFS returns the AllocFS interface for the alloc dir of an allocation
func (c *Client) GetAllocFS(allocID string) (allocdir.AllocDirFS, error) {
	c.allocLock.RLock()
	defer c.allocLock.RUnlock()

	ar, ok := c.allocs[allocID]
	if !ok {
		return nil, fmt.Errorf("alloc not found")
	}
	return ar.GetAllocDir(), nil
}

// GetServers returns the list of nomad servers this client is aware of.
func (c *Client) GetServers() []string {
	endpoints := c.servers.all()
	res := make([]string, len(endpoints))
	for i := range endpoints {
		res[i] = endpoints[i].addr.String()
	}
	return res
}

// SetServers sets a new list of nomad servers to connect to. As long as one
// server is resolvable no error is returned.
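// A minimal usage sketch (the addresses below are placeholders, not taken from
// this file): entries that fail to resolve are logged and skipped, and an
// error is returned only when every entry is invalid.
//
//	addrs := []string{"10.0.0.10:4647", "nomad-server.example.com:4647"}
//	if err := c.SetServers(addrs); err != nil {
//		c.logger.Printf("[WARN] client: no valid servers in %v: %v", addrs, err)
//	}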
491 func (c *Client) SetServers(servers []string) error { 492 endpoints := make([]*endpoint, 0, len(servers)) 493 var merr multierror.Error 494 for _, s := range servers { 495 addr, err := resolveServer(s) 496 if err != nil { 497 c.logger.Printf("[DEBUG] client: ignoring server %s due to resolution error: %v", s, err) 498 merr.Errors = append(merr.Errors, err) 499 continue 500 } 501 502 // Valid endpoint, append it without a priority as this API 503 // doesn't support different priorities for different servers 504 endpoints = append(endpoints, &endpoint{name: s, addr: addr}) 505 } 506 507 // Only return errors if no servers are valid 508 if len(endpoints) == 0 { 509 if len(merr.Errors) > 0 { 510 return merr.ErrorOrNil() 511 } 512 return noServersErr 513 } 514 515 c.servers.set(endpoints) 516 return nil 517 } 518 519 // restoreState is used to restore our state from the data dir 520 func (c *Client) restoreState() error { 521 if c.config.DevMode { 522 return nil 523 } 524 525 // Scan the directory 526 list, err := ioutil.ReadDir(filepath.Join(c.config.StateDir, "alloc")) 527 if err != nil && os.IsNotExist(err) { 528 return nil 529 } else if err != nil { 530 return fmt.Errorf("failed to list alloc state: %v", err) 531 } 532 533 // Load each alloc back 534 var mErr multierror.Error 535 for _, entry := range list { 536 id := entry.Name() 537 alloc := &structs.Allocation{ID: id} 538 c.configLock.RLock() 539 ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc, c.vaultClient) 540 c.configLock.RUnlock() 541 c.allocLock.Lock() 542 c.allocs[id] = ar 543 c.allocLock.Unlock() 544 if err := ar.RestoreState(); err != nil { 545 c.logger.Printf("[ERR] client: failed to restore state for alloc %s: %v", id, err) 546 mErr.Errors = append(mErr.Errors, err) 547 } else { 548 go ar.Run() 549 } 550 } 551 return mErr.ErrorOrNil() 552 } 553 554 // saveState is used to snapshot our state into the data dir 555 func (c *Client) saveState() error { 556 if c.config.DevMode { 557 return nil 558 } 559 560 var mErr multierror.Error 561 for id, ar := range c.getAllocRunners() { 562 if err := ar.SaveState(); err != nil { 563 c.logger.Printf("[ERR] client: failed to save state for alloc %s: %v", 564 id, err) 565 mErr.Errors = append(mErr.Errors, err) 566 } 567 } 568 return mErr.ErrorOrNil() 569 } 570 571 // getAllocRunners returns a snapshot of the current set of alloc runners. 
func (c *Client) getAllocRunners() map[string]*AllocRunner {
	c.allocLock.RLock()
	defer c.allocLock.RUnlock()
	runners := make(map[string]*AllocRunner, len(c.allocs))
	for id, ar := range c.allocs {
		runners[id] = ar
	}
	return runners
}

// nodeID restores the node's persistent unique ID and SecretID or generates
// new ones
func (c *Client) nodeID() (id string, secret string, err error) {
	// Do not persist in dev mode
	if c.config.DevMode {
		return structs.GenerateUUID(), structs.GenerateUUID(), nil
	}

	// Attempt to read existing ID
	idPath := filepath.Join(c.config.StateDir, "client-id")
	idBuf, err := ioutil.ReadFile(idPath)
	if err != nil && !os.IsNotExist(err) {
		return "", "", err
	}

	// Attempt to read existing secret ID
	secretPath := filepath.Join(c.config.StateDir, "secret-id")
	secretBuf, err := ioutil.ReadFile(secretPath)
	if err != nil && !os.IsNotExist(err) {
		return "", "", err
	}

	// Use existing ID if any
	if len(idBuf) != 0 {
		id = string(idBuf)
	} else {
		// Generate new ID
		id = structs.GenerateUUID()

		// Persist the ID
		if err := ioutil.WriteFile(idPath, []byte(id), 0700); err != nil {
			return "", "", err
		}
	}

	if len(secretBuf) != 0 {
		secret = string(secretBuf)
	} else {
		// Generate new secret ID
		secret = structs.GenerateUUID()

		// Persist the secret ID
		if err := ioutil.WriteFile(secretPath, []byte(secret), 0700); err != nil {
			return "", "", err
		}
	}

	return id, secret, nil
}

// setupNode is used to setup the initial node
func (c *Client) setupNode() error {
	node := c.config.Node
	if node == nil {
		node = &structs.Node{}
		c.config.Node = node
	}
	// Generate an ID for the node
	id, secretID, err := c.nodeID()
	if err != nil {
		return fmt.Errorf("node ID setup failed: %v", err)
	}

	node.ID = id
	node.SecretID = secretID
	if node.Attributes == nil {
		node.Attributes = make(map[string]string)
	}
	if node.Links == nil {
		node.Links = make(map[string]string)
	}
	if node.Meta == nil {
		node.Meta = make(map[string]string)
	}
	if node.Resources == nil {
		node.Resources = &structs.Resources{}
	}
	if node.Reserved == nil {
		node.Reserved = &structs.Resources{}
	}
	if node.Datacenter == "" {
		node.Datacenter = "dc1"
	}
	if node.Name == "" {
		node.Name, _ = os.Hostname()
	}
	if node.Name == "" {
		node.Name = node.ID
	}
	node.Status = structs.NodeStatusInit
	return nil
}

// reservePorts is used to reserve ports on the fingerprinted network devices.
func (c *Client) reservePorts() {
	c.configLock.RLock()
	defer c.configLock.RUnlock()
	global := c.config.GloballyReservedPorts
	if len(global) == 0 {
		return
	}

	node := c.config.Node
	networks := node.Resources.Networks
	reservedIndex := make(map[string]*structs.NetworkResource, len(networks))
	for _, resNet := range node.Reserved.Networks {
		reservedIndex[resNet.IP] = resNet
	}

	// Go through each network device and reserve ports on it.
692 for _, net := range networks { 693 res, ok := reservedIndex[net.IP] 694 if !ok { 695 res = net.Copy() 696 res.MBits = 0 697 reservedIndex[net.IP] = res 698 } 699 700 for _, portVal := range global { 701 p := structs.Port{Value: portVal} 702 res.ReservedPorts = append(res.ReservedPorts, p) 703 } 704 } 705 706 // Clear the reserved networks. 707 if node.Reserved == nil { 708 node.Reserved = new(structs.Resources) 709 } else { 710 node.Reserved.Networks = nil 711 } 712 713 // Restore the reserved networks 714 for _, net := range reservedIndex { 715 node.Reserved.Networks = append(node.Reserved.Networks, net) 716 } 717 } 718 719 // fingerprint is used to fingerprint the client and setup the node 720 func (c *Client) fingerprint() error { 721 whitelist := c.config.ReadStringListToMap("fingerprint.whitelist") 722 whitelistEnabled := len(whitelist) > 0 723 blacklist := c.config.ReadStringListToMap("fingerprint.blacklist") 724 725 c.logger.Printf("[DEBUG] client: built-in fingerprints: %v", fingerprint.BuiltinFingerprints()) 726 727 var applied []string 728 var skipped []string 729 for _, name := range fingerprint.BuiltinFingerprints() { 730 // Skip modules that are not in the whitelist if it is enabled. 731 if _, ok := whitelist[name]; whitelistEnabled && !ok { 732 skipped = append(skipped, name) 733 continue 734 } 735 // Skip modules that are in the blacklist 736 if _, ok := blacklist[name]; ok { 737 skipped = append(skipped, name) 738 continue 739 } 740 f, err := fingerprint.NewFingerprint(name, c.logger) 741 if err != nil { 742 return err 743 } 744 745 c.configLock.Lock() 746 applies, err := f.Fingerprint(c.config, c.config.Node) 747 c.configLock.Unlock() 748 if err != nil { 749 return err 750 } 751 if applies { 752 applied = append(applied, name) 753 } 754 p, period := f.Periodic() 755 if p { 756 // TODO: If more periodic fingerprinters are added, then 757 // fingerprintPeriodic should be used to handle all the periodic 758 // fingerprinters by using a priority queue. 759 go c.fingerprintPeriodic(name, f, period) 760 } 761 } 762 c.logger.Printf("[DEBUG] client: applied fingerprints %v", applied) 763 if len(skipped) != 0 { 764 c.logger.Printf("[DEBUG] client: fingerprint modules skipped due to white/blacklist: %v", skipped) 765 } 766 return nil 767 } 768 769 // fingerprintPeriodic runs a fingerprinter at the specified duration. 770 func (c *Client) fingerprintPeriodic(name string, f fingerprint.Fingerprint, d time.Duration) { 771 c.logger.Printf("[DEBUG] client: fingerprinting %v every %v", name, d) 772 for { 773 select { 774 case <-time.After(d): 775 c.configLock.Lock() 776 if _, err := f.Fingerprint(c.config, c.config.Node); err != nil { 777 c.logger.Printf("[DEBUG] client: periodic fingerprinting for %v failed: %v", name, err) 778 } 779 c.configLock.Unlock() 780 case <-c.shutdownCh: 781 return 782 } 783 } 784 } 785 786 // setupDrivers is used to find the available drivers 787 func (c *Client) setupDrivers() error { 788 // Build the white/blacklists of drivers. 789 whitelist := c.config.ReadStringListToMap("driver.whitelist") 790 whitelistEnabled := len(whitelist) > 0 791 blacklist := c.config.ReadStringListToMap("driver.blacklist") 792 793 var avail []string 794 var skipped []string 795 driverCtx := driver.NewDriverContext("", c.config, c.config.Node, c.logger, nil) 796 for name := range driver.BuiltinDrivers { 797 // Skip fingerprinting drivers that are not in the whitelist if it is 798 // enabled. 
		if _, ok := whitelist[name]; whitelistEnabled && !ok {
			skipped = append(skipped, name)
			continue
		}
		// Skip fingerprinting drivers that are in the blacklist
		if _, ok := blacklist[name]; ok {
			skipped = append(skipped, name)
			continue
		}

		d, err := driver.NewDriver(name, driverCtx)
		if err != nil {
			return err
		}
		c.configLock.Lock()
		applies, err := d.Fingerprint(c.config, c.config.Node)
		c.configLock.Unlock()
		if err != nil {
			return err
		}
		if applies {
			avail = append(avail, name)
		}

		p, period := d.Periodic()
		if p {
			go c.fingerprintPeriodic(name, d, period)
		}

	}

	c.logger.Printf("[DEBUG] client: available drivers %v", avail)

	if len(skipped) != 0 {
		c.logger.Printf("[DEBUG] client: drivers skipped due to white/blacklist: %v", skipped)
	}

	return nil
}

// retryIntv calculates a retry interval value given the base
func (c *Client) retryIntv(base time.Duration) time.Duration {
	if c.config.DevMode {
		return devModeRetryIntv
	}
	return base + lib.RandomStagger(base)
}

// registerAndHeartbeat is a long lived goroutine used to register the client
// and then start heartbeating to the server.
func (c *Client) registerAndHeartbeat() {
	// Register the node
	c.retryRegisterNode()

	// Start watching for changes to the node
	go c.watchNodeUpdates()

	// Setup the heartbeat timer, for the initial registration
	// we want to do this quickly. We want to do it extra quickly
	// in development mode.
	var heartbeat <-chan time.Time
	if c.config.DevMode {
		heartbeat = time.After(0)
	} else {
		heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
	}

	for {
		select {
		case <-c.serversDiscoveredCh:
		case <-heartbeat:
		case <-c.shutdownCh:
			return
		}

		if err := c.updateNodeStatus(); err != nil {
			// The servers have changed such that this node has not been
			// registered before
			if strings.Contains(err.Error(), "node not found") {
				// Re-register the node
				c.logger.Printf("[INFO] client: re-registering node")
				c.retryRegisterNode()
				heartbeat = time.After(lib.RandomStagger(initialHeartbeatStagger))
			} else {
				intv := c.retryIntv(registerRetryIntv)
				c.logger.Printf("[ERR] client: heartbeating failed. Retrying in %v: %v", intv, err)
				heartbeat = time.After(intv)

				// if heartbeating fails, trigger Consul discovery
				c.triggerDiscovery()
			}
		} else {
			c.heartbeatLock.Lock()
			heartbeat = time.After(c.heartbeatTTL)
			c.heartbeatLock.Unlock()
		}
	}
}

// periodicSnapshot is a long lived goroutine used to periodically snapshot the
// state of the client
func (c *Client) periodicSnapshot() {
	// Create a snapshot timer
	snapshot := time.After(stateSnapshotIntv)

	for {
		select {
		case <-snapshot:
			snapshot = time.After(stateSnapshotIntv)
			if err := c.saveState(); err != nil {
				c.logger.Printf("[ERR] client: failed to save state: %v", err)
			}

		case <-c.shutdownCh:
			return
		}
	}
}

// run is a long lived goroutine used to run the client
func (c *Client) run() {
	// Watch for changes in allocations
	allocUpdates := make(chan *allocUpdates, 8)
	go c.watchAllocations(allocUpdates)

	for {
		select {
		case update := <-allocUpdates:
			c.runAllocs(update)

		case <-c.shutdownCh:
			return
		}
	}
}

// hasNodeChanged calculates hashes of the node's attribute and meta maps.
// The new hash values are compared against the old (passed-in) hash values to
// determine if the node properties have changed. It returns the new hash values
// in case they are different from the old hash values.
func (c *Client) hasNodeChanged(oldAttrHash uint64, oldMetaHash uint64) (bool, uint64, uint64) {
	c.configLock.RLock()
	defer c.configLock.RUnlock()
	newAttrHash, err := hashstructure.Hash(c.config.Node.Attributes, nil)
	if err != nil {
		c.logger.Printf("[DEBUG] client: unable to calculate node attributes hash: %v", err)
	}
	// Calculate node meta map hash
	newMetaHash, err := hashstructure.Hash(c.config.Node.Meta, nil)
	if err != nil {
		c.logger.Printf("[DEBUG] client: unable to calculate node meta hash: %v", err)
	}
	if newAttrHash != oldAttrHash || newMetaHash != oldMetaHash {
		return true, newAttrHash, newMetaHash
	}
	return false, oldAttrHash, oldMetaHash
}

// retryRegisterNode is used to register the node or update the registration and
// retry in case of failure.
func (c *Client) retryRegisterNode() {
	for {
		err := c.registerNode()
		if err == nil {
			// Registered!
			return
		}

		if err == noServersErr {
			c.logger.Print("[DEBUG] client: registration waiting on servers")
			c.triggerDiscovery()
		} else {
			c.logger.Printf("[ERR] client: registration failure: %v", err)
		}
		select {
		case <-c.serversDiscoveredCh:
		case <-time.After(c.retryIntv(registerRetryIntv)):
		case <-c.shutdownCh:
			return
		}
	}
}

// registerNode is used to register the node or update the registration
func (c *Client) registerNode() error {
	node := c.Node()
	req := structs.NodeRegisterRequest{
		Node:         node,
		WriteRequest: structs.WriteRequest{Region: c.Region()},
	}
	var resp structs.NodeUpdateResponse
	if err := c.RPC("Node.Register", &req, &resp); err != nil {
		return err
	}

	// Update the node status to ready after we register.
995 c.configLock.Lock() 996 node.Status = structs.NodeStatusReady 997 c.configLock.Unlock() 998 999 c.logger.Printf("[INFO] client: node registration complete") 1000 if len(resp.EvalIDs) != 0 { 1001 c.logger.Printf("[DEBUG] client: %d evaluations triggered by node registration", len(resp.EvalIDs)) 1002 } 1003 1004 c.heartbeatLock.Lock() 1005 defer c.heartbeatLock.Unlock() 1006 c.lastHeartbeat = time.Now() 1007 c.heartbeatTTL = resp.HeartbeatTTL 1008 return nil 1009 } 1010 1011 // updateNodeStatus is used to heartbeat and update the status of the node 1012 func (c *Client) updateNodeStatus() error { 1013 c.heartbeatLock.Lock() 1014 defer c.heartbeatLock.Unlock() 1015 1016 node := c.Node() 1017 req := structs.NodeUpdateStatusRequest{ 1018 NodeID: node.ID, 1019 Status: structs.NodeStatusReady, 1020 WriteRequest: structs.WriteRequest{Region: c.Region()}, 1021 } 1022 var resp structs.NodeUpdateResponse 1023 if err := c.RPC("Node.UpdateStatus", &req, &resp); err != nil { 1024 c.triggerDiscovery() 1025 return fmt.Errorf("failed to update status: %v", err) 1026 } 1027 if len(resp.EvalIDs) != 0 { 1028 c.logger.Printf("[DEBUG] client: %d evaluations triggered by node update", len(resp.EvalIDs)) 1029 } 1030 if resp.Index != 0 { 1031 c.logger.Printf("[DEBUG] client: state updated to %s", req.Status) 1032 } 1033 1034 // Update heartbeat time and ttl 1035 c.lastHeartbeat = time.Now() 1036 c.heartbeatTTL = resp.HeartbeatTTL 1037 1038 // Convert []*NodeServerInfo to []*endpoints 1039 localdc := c.Datacenter() 1040 servers := make(endpoints, 0, len(resp.Servers)) 1041 for _, s := range resp.Servers { 1042 addr, err := resolveServer(s.RPCAdvertiseAddr) 1043 if err != nil { 1044 continue 1045 } 1046 e := endpoint{name: s.RPCAdvertiseAddr, addr: addr} 1047 if s.Datacenter != localdc { 1048 // server is non-local; de-prioritize 1049 e.priority = 1 1050 } 1051 servers = append(servers, &e) 1052 } 1053 if len(servers) == 0 { 1054 return fmt.Errorf("server returned no valid servers") 1055 } 1056 c.servers.set(servers) 1057 1058 // Begin polling Consul if there is no Nomad leader. We could be 1059 // heartbeating to a Nomad server that is in the minority of a 1060 // partition of the Nomad server quorum, but this Nomad Agent still 1061 // has connectivity to the existing majority of Nomad Servers, but 1062 // only if it queries Consul. 1063 if resp.LeaderRPCAddr == "" { 1064 c.triggerDiscovery() 1065 } 1066 1067 return nil 1068 } 1069 1070 // updateAllocStatus is used to update the status of an allocation 1071 func (c *Client) updateAllocStatus(alloc *structs.Allocation) { 1072 // Only send the fields that are updatable by the client. 1073 stripped := new(structs.Allocation) 1074 stripped.ID = alloc.ID 1075 stripped.NodeID = c.Node().ID 1076 stripped.TaskStates = alloc.TaskStates 1077 stripped.ClientStatus = alloc.ClientStatus 1078 stripped.ClientDescription = alloc.ClientDescription 1079 select { 1080 case c.allocUpdates <- stripped: 1081 case <-c.shutdownCh: 1082 } 1083 } 1084 1085 // allocSync is a long lived function that batches allocation updates to the 1086 // server. 1087 func (c *Client) allocSync() { 1088 staggered := false 1089 syncTicker := time.NewTicker(allocSyncIntv) 1090 updates := make(map[string]*structs.Allocation) 1091 for { 1092 select { 1093 case <-c.shutdownCh: 1094 syncTicker.Stop() 1095 return 1096 case alloc := <-c.allocUpdates: 1097 // Batch the allocation updates until the timer triggers. 
1098 updates[alloc.ID] = alloc 1099 1100 // If this alloc was blocking another alloc and transitioned to a 1101 // terminal state then start the blocked allocation 1102 c.blockedAllocsLock.Lock() 1103 if blockedAlloc, ok := c.blockedAllocations[alloc.ID]; ok && alloc.Terminated() { 1104 var prevAllocDir *allocdir.AllocDir 1105 if ar, ok := c.getAllocRunners()[alloc.ID]; ok { 1106 prevAllocDir = ar.GetAllocDir() 1107 } 1108 if err := c.addAlloc(blockedAlloc, prevAllocDir); err != nil { 1109 c.logger.Printf("[ERR] client: failed to add alloc which was previously blocked %q: %v", 1110 blockedAlloc.ID, err) 1111 } 1112 delete(c.blockedAllocations, blockedAlloc.PreviousAllocation) 1113 } 1114 c.blockedAllocsLock.Unlock() 1115 case <-syncTicker.C: 1116 // Fast path if there are no updates 1117 if len(updates) == 0 { 1118 continue 1119 } 1120 1121 sync := make([]*structs.Allocation, 0, len(updates)) 1122 for _, alloc := range updates { 1123 sync = append(sync, alloc) 1124 } 1125 1126 // Send to server. 1127 args := structs.AllocUpdateRequest{ 1128 Alloc: sync, 1129 WriteRequest: structs.WriteRequest{Region: c.Region()}, 1130 } 1131 1132 var resp structs.GenericResponse 1133 if err := c.RPC("Node.UpdateAlloc", &args, &resp); err != nil { 1134 c.logger.Printf("[ERR] client: failed to update allocations: %v", err) 1135 syncTicker.Stop() 1136 syncTicker = time.NewTicker(c.retryIntv(allocSyncRetryIntv)) 1137 staggered = true 1138 } else { 1139 updates = make(map[string]*structs.Allocation) 1140 if staggered { 1141 syncTicker.Stop() 1142 syncTicker = time.NewTicker(allocSyncIntv) 1143 staggered = false 1144 } 1145 } 1146 } 1147 } 1148 } 1149 1150 // allocUpdates holds the results of receiving updated allocations from the 1151 // servers. 1152 type allocUpdates struct { 1153 // pulled is the set of allocations that were downloaded from the servers. 1154 pulled map[string]*structs.Allocation 1155 1156 // filtered is the set of allocations that were not pulled because their 1157 // AllocModifyIndex didn't change. 1158 filtered map[string]struct{} 1159 } 1160 1161 // watchAllocations is used to scan for updates to allocations 1162 func (c *Client) watchAllocations(updates chan *allocUpdates) { 1163 // The request and response for getting the map of allocations that should 1164 // be running on the Node to their AllocModifyIndex which is incremented 1165 // when the allocation is updated by the servers. 1166 n := c.Node() 1167 req := structs.NodeSpecificRequest{ 1168 NodeID: n.ID, 1169 SecretID: n.SecretID, 1170 QueryOptions: structs.QueryOptions{ 1171 Region: c.Region(), 1172 AllowStale: true, 1173 }, 1174 } 1175 var resp structs.NodeClientAllocsResponse 1176 1177 // The request and response for pulling down the set of allocations that are 1178 // new, or updated server side. 1179 allocsReq := structs.AllocsGetRequest{ 1180 QueryOptions: structs.QueryOptions{ 1181 Region: c.Region(), 1182 AllowStale: true, 1183 }, 1184 } 1185 var allocsResp structs.AllocsGetResponse 1186 1187 for { 1188 // Get the allocation modify index map, blocking for updates. We will 1189 // use this to determine exactly what allocations need to be downloaded 1190 // in full. 1191 resp = structs.NodeClientAllocsResponse{} 1192 err := c.RPC("Node.GetClientAllocs", &req, &resp) 1193 if err != nil { 1194 // Shutdown often causes EOF errors, so check for shutdown first 1195 select { 1196 case <-c.shutdownCh: 1197 return 1198 default: 1199 } 1200 1201 // COMPAT: Remove in 0.6. 
This is to allow the case in which the 1202 // servers are not fully upgraded before the clients register. This 1203 // can cause the SecretID to be lost 1204 if strings.Contains(err.Error(), "node secret ID does not match") { 1205 c.logger.Printf("[DEBUG] client: re-registering node as there was a secret ID mismatch: %v", err) 1206 c.retryRegisterNode() 1207 } else if err != noServersErr { 1208 c.logger.Printf("[ERR] client: failed to query for node allocations: %v", err) 1209 } 1210 retry := c.retryIntv(getAllocRetryIntv) 1211 select { 1212 case <-c.serversDiscoveredCh: 1213 continue 1214 case <-time.After(retry): 1215 continue 1216 case <-c.shutdownCh: 1217 return 1218 } 1219 } 1220 1221 // Check for shutdown 1222 select { 1223 case <-c.shutdownCh: 1224 return 1225 default: 1226 } 1227 1228 // Filter all allocations whose AllocModifyIndex was not incremented. 1229 // These are the allocations who have either not been updated, or whose 1230 // updates are a result of the client sending an update for the alloc. 1231 // This lets us reduce the network traffic to the server as we don't 1232 // need to pull all the allocations. 1233 var pull []string 1234 filtered := make(map[string]struct{}) 1235 runners := c.getAllocRunners() 1236 for allocID, modifyIndex := range resp.Allocs { 1237 // Pull the allocation if we don't have an alloc runner for the 1238 // allocation or if the alloc runner requires an updated allocation. 1239 runner, ok := runners[allocID] 1240 if !ok || runner.shouldUpdate(modifyIndex) { 1241 pull = append(pull, allocID) 1242 } else { 1243 filtered[allocID] = struct{}{} 1244 } 1245 } 1246 1247 c.logger.Printf("[DEBUG] client: updated allocations at index %d (pulled %d) (filtered %d)", 1248 resp.Index, len(pull), len(filtered)) 1249 1250 // Pull the allocations that passed filtering. 1251 allocsResp.Allocs = nil 1252 if len(pull) != 0 { 1253 // Pull the allocations that need to be updated. 1254 allocsReq.AllocIDs = pull 1255 allocsResp = structs.AllocsGetResponse{} 1256 if err := c.RPC("Alloc.GetAllocs", &allocsReq, &allocsResp); err != nil { 1257 c.logger.Printf("[ERR] client: failed to query updated allocations: %v", err) 1258 retry := c.retryIntv(getAllocRetryIntv) 1259 select { 1260 case <-c.serversDiscoveredCh: 1261 continue 1262 case <-time.After(retry): 1263 continue 1264 case <-c.shutdownCh: 1265 return 1266 } 1267 } 1268 1269 // Check for shutdown 1270 select { 1271 case <-c.shutdownCh: 1272 return 1273 default: 1274 } 1275 } 1276 1277 // Update the query index. 1278 if resp.Index > req.MinQueryIndex { 1279 req.MinQueryIndex = resp.Index 1280 } 1281 1282 // Push the updates. 
1283 pulled := make(map[string]*structs.Allocation, len(allocsResp.Allocs)) 1284 for _, alloc := range allocsResp.Allocs { 1285 pulled[alloc.ID] = alloc 1286 } 1287 update := &allocUpdates{ 1288 filtered: filtered, 1289 pulled: pulled, 1290 } 1291 select { 1292 case updates <- update: 1293 case <-c.shutdownCh: 1294 return 1295 } 1296 } 1297 } 1298 1299 // watchNodeUpdates periodically checks for changes to the node attributes or meta map 1300 func (c *Client) watchNodeUpdates() { 1301 c.logger.Printf("[DEBUG] client: periodically checking for node changes at duration %v", nodeUpdateRetryIntv) 1302 1303 // Initialize the hashes 1304 _, attrHash, metaHash := c.hasNodeChanged(0, 0) 1305 var changed bool 1306 for { 1307 select { 1308 case <-time.After(c.retryIntv(nodeUpdateRetryIntv)): 1309 changed, attrHash, metaHash = c.hasNodeChanged(attrHash, metaHash) 1310 if changed { 1311 c.logger.Printf("[DEBUG] client: state changed, updating node.") 1312 1313 // Update the config copy. 1314 c.configLock.Lock() 1315 node := c.config.Node.Copy() 1316 c.configCopy.Node = node 1317 c.configLock.Unlock() 1318 1319 c.retryRegisterNode() 1320 } 1321 case <-c.shutdownCh: 1322 return 1323 } 1324 } 1325 } 1326 1327 // runAllocs is invoked when we get an updated set of allocations 1328 func (c *Client) runAllocs(update *allocUpdates) { 1329 // Get the existing allocs 1330 c.allocLock.RLock() 1331 exist := make([]*structs.Allocation, 0, len(c.allocs)) 1332 for _, ar := range c.allocs { 1333 exist = append(exist, ar.alloc) 1334 } 1335 c.allocLock.RUnlock() 1336 1337 // Diff the existing and updated allocations 1338 diff := diffAllocs(exist, update) 1339 c.logger.Printf("[DEBUG] client: %#v", diff) 1340 1341 // Remove the old allocations 1342 for _, remove := range diff.removed { 1343 if err := c.removeAlloc(remove); err != nil { 1344 c.logger.Printf("[ERR] client: failed to remove alloc '%s': %v", 1345 remove.ID, err) 1346 } 1347 } 1348 1349 // Update the existing allocations 1350 for _, update := range diff.updated { 1351 if err := c.updateAlloc(update.exist, update.updated); err != nil { 1352 c.logger.Printf("[ERR] client: failed to update alloc '%s': %v", 1353 update.exist.ID, err) 1354 } 1355 1356 // See if the updated alloc is getting migrated 1357 c.migratingAllocsLock.Lock() 1358 ch, ok := c.migratingAllocs[update.updated.ID] 1359 c.migratingAllocsLock.Unlock() 1360 if ok { 1361 // Stopping the migration if the allocation doesn't need any 1362 // migration 1363 if !update.updated.ShouldMigrate() { 1364 close(ch) 1365 } 1366 } 1367 } 1368 1369 // Start the new allocations 1370 for _, add := range diff.added { 1371 // If the allocation is chained and the previous allocation hasn't 1372 // terminated yet, then add the alloc to the blocked queue. 
		ar, ok := c.getAllocRunners()[add.PreviousAllocation]
		if ok && !ar.Alloc().Terminated() {
			c.logger.Printf("[DEBUG] client: added alloc %q to blocked queue", add.ID)
			c.blockedAllocsLock.Lock()
			c.blockedAllocations[add.PreviousAllocation] = add
			c.blockedAllocsLock.Unlock()
			continue
		}

		// This means the allocation has a previous allocation on another node
		// so we will block for the previous allocation to complete
		if add.PreviousAllocation != "" && !ok {
			c.migratingAllocsLock.Lock()
			c.migratingAllocs[add.ID] = make(chan struct{})
			c.migratingAllocsLock.Unlock()
			go c.blockForRemoteAlloc(add)
			continue
		}

		// Set the previous allocdir if the allocation had a terminal
		// previous allocation
		var prevAllocDir *allocdir.AllocDir
		tg := add.Job.LookupTaskGroup(add.TaskGroup)
		if tg != nil && tg.EphemeralDisk.Sticky == true && ar != nil {
			prevAllocDir = ar.GetAllocDir()
		}

		if err := c.addAlloc(add, prevAllocDir); err != nil {
			c.logger.Printf("[ERR] client: failed to add alloc '%s': %v",
				add.ID, err)
		}
	}

	// Persist our state
	if err := c.saveState(); err != nil {
		c.logger.Printf("[ERR] client: failed to save state: %v", err)
	}
}

// blockForRemoteAlloc blocks until the previous allocation of an allocation has
// been terminated and migrates the snapshot data
func (c *Client) blockForRemoteAlloc(alloc *structs.Allocation) {
	// Remove the allocation from the set of allocs which are currently
	// undergoing migration
	defer func() {
		c.migratingAllocsLock.Lock()
		delete(c.migratingAllocs, alloc.ID)
		c.migratingAllocsLock.Unlock()
	}()

	// prevAllocDir is the allocation directory of the previous allocation
	var prevAllocDir *allocdir.AllocDir

	// If the allocation is not sticky then we won't wait for the previous
	// allocation to be terminal
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		c.logger.Printf("[ERR] client: task group %q not found in job %q", alloc.TaskGroup, alloc.Job.ID)
		goto ADDALLOC
	}

	// Wait for the remote previous alloc to be terminal if the alloc is sticky
	if tg.EphemeralDisk.Sticky {
		c.logger.Printf("[DEBUG] client: blocking alloc %q for previous allocation %q", alloc.ID, alloc.PreviousAllocation)
		// Block until the previous allocation migrates to terminal state
		prevAlloc, err := c.waitForAllocTerminal(alloc.PreviousAllocation)
		if err != nil {
			c.logger.Printf("[ERR] client: error waiting for allocation %q: %v",
				alloc.PreviousAllocation, err)
		}

		// Migrate the data from the remote node
		prevAllocDir, err = c.migrateRemoteAllocDir(prevAlloc, alloc.ID)
		if err != nil {
			c.logger.Printf("[ERR] client: error migrating data from remote alloc %q: %v",
				alloc.PreviousAllocation, err)
		}
	}

ADDALLOC:
	// Add the allocation
	if err := c.addAlloc(alloc, prevAllocDir); err != nil {
		c.logger.Printf("[ERR] client: error adding alloc: %v", err)
	}
}

// waitForAllocTerminal waits for an allocation with the given alloc id to
// transition to terminal state and blocks the caller until then.
func (c *Client) waitForAllocTerminal(allocID string) (*structs.Allocation, error) {
	req := structs.AllocSpecificRequest{
		AllocID: allocID,
		QueryOptions: structs.QueryOptions{
			Region:     c.Region(),
			AllowStale: true,
		},
	}

	for {
		resp := structs.SingleAllocResponse{}
		err := c.RPC("Alloc.GetAlloc", &req, &resp)
		if err != nil {
			c.logger.Printf("[ERR] client: failed to query allocation %q: %v", allocID, err)
			retry := c.retryIntv(getAllocRetryIntv)
			select {
			case <-time.After(retry):
				continue
			case <-c.shutdownCh:
				return nil, fmt.Errorf("aborting because client is shutting down")
			}
		}
		if resp.Alloc == nil {
			return nil, nil
		}
		if resp.Alloc.Terminated() {
			return resp.Alloc, nil
		}

		// Update the query index.
		if resp.Index > req.MinQueryIndex {
			req.MinQueryIndex = resp.Index
		}

	}
}

// migrateRemoteAllocDir migrates the allocation directory from a remote node to
// the current node
func (c *Client) migrateRemoteAllocDir(alloc *structs.Allocation, allocID string) (*allocdir.AllocDir, error) {
	if alloc == nil {
		return nil, nil
	}

	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		return nil, fmt.Errorf("Task Group %q not found in job %q", alloc.TaskGroup, alloc.Job.ID)
	}

	// Skip migration of data if the ephemeral disk is not sticky or
	// migration is turned off.
	if !tg.EphemeralDisk.Sticky || !tg.EphemeralDisk.Migrate {
		return nil, nil
	}

	node, err := c.getNode(alloc.NodeID)

	// If the node can't be retrieved then skip migrating the data
	if err != nil {
		return nil, fmt.Errorf("error retrieving node %v: %v", alloc.NodeID, err)
	}

	// Check if node is nil
	if node == nil {
		return nil, fmt.Errorf("node %q doesn't exist", alloc.NodeID)
	}

	// Skip migration if the remote node is down
	if node.Status == structs.NodeStatusDown {
		c.logger.Printf("[INFO] client: not migrating data from alloc %q since node %q is down", alloc.ID, alloc.NodeID)
		return nil, nil
	}

	// Create the previous alloc dir
	pathToAllocDir := filepath.Join(c.config.AllocDir, alloc.ID)
	if err := os.MkdirAll(pathToAllocDir, 0777); err != nil {
		c.logger.Printf("[ERR] client: error creating previous allocation dir: %v", err)
	}

	// Get the snapshot
	scheme := "http"
	if node.TLSEnabled {
		scheme = "https"
	}
	// Create an API client
	apiConfig := nomadapi.DefaultConfig()
	apiConfig.Address = fmt.Sprintf("%s://%s", scheme, node.HTTPAddr)
	apiConfig.TLSConfig = &nomadapi.TLSConfig{
		CACert:     c.config.TLSConfig.CAFile,
		ClientCert: c.config.TLSConfig.CertFile,
		ClientKey:  c.config.TLSConfig.KeyFile,
	}
	apiClient, err := nomadapi.NewClient(apiConfig)
	if err != nil {
		return nil, err
	}

	url := fmt.Sprintf("/v1/client/allocation/%v/snapshot", alloc.ID)
	resp, err := apiClient.Raw().Response(url, nil)
	if err != nil {
		os.RemoveAll(pathToAllocDir)
		c.logger.Printf("[ERR] client: error getting snapshot: %v", err)
		return nil, fmt.Errorf("error getting snapshot for alloc %v: %v", alloc.ID, err)
	}

	tr := tar.NewReader(resp)
	defer resp.Close()

	buf := make([]byte, 1024)

	stopMigrating, ok := c.migratingAllocs[allocID]
	if !ok {
os.RemoveAll(pathToAllocDir) 1574 return nil, fmt.Errorf("couldn't find a migration validity notifier for alloc: %v", alloc.ID) 1575 } 1576 for { 1577 // See if the alloc still needs migration 1578 select { 1579 case <-stopMigrating: 1580 os.RemoveAll(pathToAllocDir) 1581 c.logger.Printf("[INFO] client: stopping migration of allocdir for alloc: %v", alloc.ID) 1582 return nil, nil 1583 case <-c.shutdownCh: 1584 os.RemoveAll(pathToAllocDir) 1585 c.logger.Printf("[INFO] client: stopping migration of alloc %q since client is shutting down", alloc.ID) 1586 return nil, nil 1587 default: 1588 } 1589 1590 // Get the next header 1591 hdr, err := tr.Next() 1592 1593 // If the snapshot has ended then we create the previous 1594 // allocdir 1595 if err == io.EOF { 1596 prevAllocDir := allocdir.NewAllocDir(pathToAllocDir) 1597 return prevAllocDir, nil 1598 } 1599 // If there is an error then we avoid creating the alloc dir 1600 if err != nil { 1601 os.RemoveAll(pathToAllocDir) 1602 return nil, fmt.Errorf("error creating alloc dir for alloc %q: %v", alloc.ID, err) 1603 } 1604 1605 // If the header is for a directory we create the directory 1606 if hdr.Typeflag == tar.TypeDir { 1607 os.MkdirAll(filepath.Join(pathToAllocDir, hdr.Name), 0777) 1608 continue 1609 } 1610 // If the header is a file, we write to a file 1611 if hdr.Typeflag == tar.TypeReg { 1612 f, err := os.Create(filepath.Join(pathToAllocDir, hdr.Name)) 1613 if err != nil { 1614 c.logger.Printf("[ERR] client: error creating file: %v", err) 1615 continue 1616 } 1617 1618 // We write in chunks of 32 bytes so that we can test if 1619 // the client is still alive 1620 for { 1621 if c.shutdown { 1622 f.Close() 1623 os.RemoveAll(pathToAllocDir) 1624 c.logger.Printf("[INFO] client: stopping migration of alloc %q because client is shutting down", alloc.ID) 1625 return nil, nil 1626 } 1627 1628 n, err := tr.Read(buf) 1629 if err != nil { 1630 f.Close() 1631 if err != io.EOF { 1632 return nil, fmt.Errorf("error reading snapshot: %v", err) 1633 } 1634 break 1635 } 1636 if _, err := f.Write(buf[:n]); err != nil { 1637 f.Close() 1638 os.RemoveAll(pathToAllocDir) 1639 return nil, fmt.Errorf("error writing to file %q: %v", f.Name(), err) 1640 } 1641 } 1642 1643 } 1644 } 1645 } 1646 1647 // getNode gets the node from the server with the given Node ID 1648 func (c *Client) getNode(nodeID string) (*structs.Node, error) { 1649 req := structs.NodeSpecificRequest{ 1650 NodeID: nodeID, 1651 QueryOptions: structs.QueryOptions{ 1652 Region: c.Region(), 1653 AllowStale: true, 1654 }, 1655 } 1656 1657 resp := structs.SingleNodeResponse{} 1658 for { 1659 err := c.RPC("Node.GetNode", &req, &resp) 1660 if err != nil { 1661 c.logger.Printf("[ERR] client: failed to query node info %q: %v", nodeID, err) 1662 retry := c.retryIntv(getAllocRetryIntv) 1663 select { 1664 case <-time.After(retry): 1665 continue 1666 case <-c.shutdownCh: 1667 return nil, fmt.Errorf("aborting because client is shutting down") 1668 } 1669 } 1670 break 1671 } 1672 1673 return resp.Node, nil 1674 } 1675 1676 // removeAlloc is invoked when we should remove an allocation 1677 func (c *Client) removeAlloc(alloc *structs.Allocation) error { 1678 c.allocLock.Lock() 1679 ar, ok := c.allocs[alloc.ID] 1680 if !ok { 1681 c.allocLock.Unlock() 1682 c.logger.Printf("[WARN] client: missing context for alloc '%s'", alloc.ID) 1683 return nil 1684 } 1685 delete(c.allocs, alloc.ID) 1686 c.allocLock.Unlock() 1687 1688 ar.Destroy() 1689 return nil 1690 } 1691 1692 // updateAlloc is invoked when we should update an 
allocation 1693 func (c *Client) updateAlloc(exist, update *structs.Allocation) error { 1694 c.allocLock.RLock() 1695 ar, ok := c.allocs[exist.ID] 1696 c.allocLock.RUnlock() 1697 if !ok { 1698 c.logger.Printf("[WARN] client: missing context for alloc '%s'", exist.ID) 1699 return nil 1700 } 1701 1702 ar.Update(update) 1703 return nil 1704 } 1705 1706 // addAlloc is invoked when we should add an allocation 1707 func (c *Client) addAlloc(alloc *structs.Allocation, prevAllocDir *allocdir.AllocDir) error { 1708 c.configLock.RLock() 1709 ar := NewAllocRunner(c.logger, c.configCopy, c.updateAllocStatus, alloc, c.vaultClient) 1710 ar.SetPreviousAllocDir(prevAllocDir) 1711 c.configLock.RUnlock() 1712 go ar.Run() 1713 1714 // Store the alloc runner. 1715 c.allocLock.Lock() 1716 c.allocs[alloc.ID] = ar 1717 c.allocLock.Unlock() 1718 return nil 1719 } 1720 1721 // setupVaultClient creates an object to periodically renew tokens and secrets 1722 // with vault. 1723 func (c *Client) setupVaultClient() error { 1724 var err error 1725 if c.vaultClient, err = 1726 vaultclient.NewVaultClient(c.config.VaultConfig, c.logger, c.deriveToken); err != nil { 1727 return err 1728 } 1729 1730 if c.vaultClient == nil { 1731 c.logger.Printf("[ERR] client: failed to create vault client") 1732 return fmt.Errorf("failed to create vault client") 1733 } 1734 1735 // Start renewing tokens and secrets 1736 c.vaultClient.Start() 1737 1738 return nil 1739 } 1740 1741 // deriveToken takes in an allocation and a set of tasks and derives vault 1742 // tokens for each of the tasks, unwraps all of them using the supplied vault 1743 // client and returns a map of unwrapped tokens, indexed by the task name. 1744 func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vclient *vaultapi.Client) (map[string]string, error) { 1745 if alloc == nil { 1746 return nil, fmt.Errorf("nil allocation") 1747 } 1748 1749 if taskNames == nil || len(taskNames) == 0 { 1750 return nil, fmt.Errorf("missing task names") 1751 } 1752 1753 group := alloc.Job.LookupTaskGroup(alloc.TaskGroup) 1754 if group == nil { 1755 return nil, fmt.Errorf("group name in allocation is not present in job") 1756 } 1757 1758 verifiedTasks := []string{} 1759 found := false 1760 // Check if the given task names actually exist in the allocation 1761 for _, taskName := range taskNames { 1762 found = false 1763 for _, task := range group.Tasks { 1764 if task.Name == taskName { 1765 found = true 1766 } 1767 } 1768 if !found { 1769 c.logger.Printf("[ERR] task %q not found in the allocation", taskName) 1770 return nil, fmt.Errorf("task %q not found in the allocaition", taskName) 1771 } 1772 verifiedTasks = append(verifiedTasks, taskName) 1773 } 1774 1775 // DeriveVaultToken of nomad server can take in a set of tasks and 1776 // creates tokens for all the tasks. 
1777 req := &structs.DeriveVaultTokenRequest{ 1778 NodeID: c.Node().ID, 1779 SecretID: c.Node().SecretID, 1780 AllocID: alloc.ID, 1781 Tasks: verifiedTasks, 1782 QueryOptions: structs.QueryOptions{ 1783 Region: c.Region(), 1784 AllowStale: true, 1785 }, 1786 } 1787 1788 // Derive the tokens 1789 var resp structs.DeriveVaultTokenResponse 1790 if err := c.RPC("Node.DeriveVaultToken", &req, &resp); err != nil { 1791 c.logger.Printf("[ERR] client.vault: DeriveVaultToken RPC failed: %v", err) 1792 return nil, fmt.Errorf("DeriveVaultToken RPC failed: %v", err) 1793 } 1794 if resp.Error != nil { 1795 c.logger.Printf("[ERR] client.vault: failed to derive vault tokens: %v", resp.Error) 1796 return nil, resp.Error 1797 } 1798 if resp.Tasks == nil { 1799 c.logger.Printf("[ERR] client.vault: failed to derive vault token: invalid response") 1800 return nil, fmt.Errorf("failed to derive vault tokens: invalid response") 1801 } 1802 1803 unwrappedTokens := make(map[string]string) 1804 1805 // Retrieve the wrapped tokens from the response and unwrap it 1806 for _, taskName := range verifiedTasks { 1807 // Get the wrapped token 1808 wrappedToken, ok := resp.Tasks[taskName] 1809 if !ok { 1810 c.logger.Printf("[ERR] client.vault: wrapped token missing for task %q", taskName) 1811 return nil, fmt.Errorf("wrapped token missing for task %q", taskName) 1812 } 1813 1814 // Unwrap the vault token 1815 unwrapResp, err := vclient.Logical().Unwrap(wrappedToken) 1816 if err != nil { 1817 return nil, fmt.Errorf("failed to unwrap the token for task %q: %v", taskName, err) 1818 } 1819 if unwrapResp == nil || unwrapResp.Auth == nil || unwrapResp.Auth.ClientToken == "" { 1820 return nil, fmt.Errorf("failed to unwrap the token for task %q", taskName) 1821 } 1822 1823 // Append the unwrapped token to the return value 1824 unwrappedTokens[taskName] = unwrapResp.Auth.ClientToken 1825 } 1826 1827 return unwrappedTokens, nil 1828 } 1829 1830 // triggerDiscovery causes a Consul discovery to begin (if one hasn't alread) 1831 func (c *Client) triggerDiscovery() { 1832 select { 1833 case c.triggerDiscoveryCh <- struct{}{}: 1834 // Discovery goroutine was released to execute 1835 default: 1836 // Discovery goroutine was already running 1837 } 1838 } 1839 1840 // consulDiscovery waits for the signal to attempt server discovery via Consul. 1841 // It's intended to be started in a goroutine. See triggerDiscovery() for 1842 // causing consul discovery from other code locations. 1843 func (c *Client) consulDiscovery() { 1844 for { 1845 select { 1846 case <-c.triggerDiscoveryCh: 1847 if err := c.consulDiscoveryImpl(); err != nil { 1848 c.logger.Printf("[ERR] client.consul: error discovering nomad servers: %v", err) 1849 } 1850 case <-c.shutdownCh: 1851 return 1852 } 1853 } 1854 } 1855 1856 func (c *Client) consulDiscoveryImpl() error { 1857 // Acquire heartbeat lock to prevent heartbeat from running 1858 // concurrently with discovery. Concurrent execution is safe, however 1859 // discovery is usually triggered when heartbeating has failed so 1860 // there's no point in allowing it. 1861 c.heartbeatLock.Lock() 1862 defer c.heartbeatLock.Unlock() 1863 1864 consulCatalog := c.consulSyncer.ConsulClient().Catalog() 1865 dcs, err := consulCatalog.Datacenters() 1866 if err != nil { 1867 return fmt.Errorf("client.consul: unable to query Consul datacenters: %v", err) 1868 } 1869 if len(dcs) > 2 { 1870 // Query the local DC first, then shuffle the 1871 // remaining DCs. 
Future heartbeats will cause Nomad 1872 // Clients to fixate on their local datacenter so 1873 // it's okay to talk with remote DCs. If no 1874 // Nomad servers are available within 1875 // datacenterQueryLimit, the next heartbeat will pick 1876 // a new set of servers so it's okay. 1877 shuffleStrings(dcs[1:]) 1878 dcs = dcs[0:lib.MinInt(len(dcs), datacenterQueryLimit)] 1879 } 1880 1881 // Query for servers in this client's region only 1882 region := c.Region() 1883 rpcargs := structs.GenericRequest{ 1884 QueryOptions: structs.QueryOptions{ 1885 Region: region, 1886 }, 1887 } 1888 1889 serviceName := c.configCopy.ConsulConfig.ServerServiceName 1890 var mErr multierror.Error 1891 var servers endpoints 1892 c.logger.Printf("[DEBUG] client.consul: bootstrap contacting following Consul DCs: %+q", dcs) 1893 DISCOLOOP: 1894 for _, dc := range dcs { 1895 consulOpts := &consulapi.QueryOptions{ 1896 AllowStale: true, 1897 Datacenter: dc, 1898 Near: "_agent", 1899 WaitTime: consul.DefaultQueryWaitDuration, 1900 } 1901 consulServices, _, err := consulCatalog.Service(serviceName, consul.ServiceTagRPC, consulOpts) 1902 if err != nil { 1903 mErr.Errors = append(mErr.Errors, fmt.Errorf("unable to query service %+q from Consul datacenter %+q: %v", serviceName, dc, err)) 1904 continue 1905 } 1906 1907 for _, s := range consulServices { 1908 port := strconv.Itoa(s.ServicePort) 1909 addrstr := s.ServiceAddress 1910 if addrstr == "" { 1911 addrstr = s.Address 1912 } 1913 addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(addrstr, port)) 1914 if err != nil { 1915 mErr.Errors = append(mErr.Errors, err) 1916 continue 1917 } 1918 var peers []string 1919 if err := c.connPool.RPC(region, addr, c.RPCMajorVersion(), "Status.Peers", rpcargs, &peers); err != nil { 1920 mErr.Errors = append(mErr.Errors, err) 1921 continue 1922 } 1923 1924 // Successfully received the Server peers list of the correct 1925 // region 1926 for _, p := range peers { 1927 addr, err := net.ResolveTCPAddr("tcp", p) 1928 if err != nil { 1929 mErr.Errors = append(mErr.Errors, err) 1930 } 1931 servers = append(servers, &endpoint{name: p, addr: addr}) 1932 } 1933 if len(servers) > 0 { 1934 break DISCOLOOP 1935 } 1936 } 1937 } 1938 if len(servers) == 0 { 1939 if len(mErr.Errors) > 0 { 1940 return mErr.ErrorOrNil() 1941 } 1942 return fmt.Errorf("no Nomad Servers advertising service %q in Consul datacenters: %+q", serviceName, dcs) 1943 } 1944 1945 c.logger.Printf("[INFO] client.consul: discovered following Servers: %s", servers) 1946 c.servers.set(servers) 1947 1948 // Notify waiting rpc calls. If a goroutine just failed an RPC call and 1949 // isn't receiving on this chan yet they'll still retry eventually. 1950 // This is a short-circuit for the longer retry intervals. 1951 for { 1952 select { 1953 case c.serversDiscoveredCh <- struct{}{}: 1954 default: 1955 return nil 1956 } 1957 } 1958 } 1959 1960 // consulReaper periodically reaps unmatched domains from Consul. Intended to 1961 // be called in its own goroutine. See consulReaperIntv for interval.
1962 func (c *Client) consulReaper() { 1963 ticker := time.NewTicker(consulReaperIntv) 1964 defer ticker.Stop() 1965 lastok := true 1966 for { 1967 select { 1968 case <-ticker.C: 1969 if err := c.consulReaperImpl(); err != nil { 1970 if lastok { 1971 c.logger.Printf("[ERR] client.consul: error reaping services in consul: %v", err) 1972 lastok = false 1973 } 1974 } else { 1975 lastok = true 1976 } 1977 case <-c.shutdownCh: 1978 return 1979 } 1980 } 1981 } 1982 1983 // consulReaperImpl reaps unmatched domains from Consul. 1984 func (c *Client) consulReaperImpl() error { 1985 const estInitialExecutorDomains = 8 1986 1987 // Create the domains to keep and add the server and client 1988 domains := make([]consul.ServiceDomain, 2, estInitialExecutorDomains) 1989 domains[0] = consul.ServerDomain 1990 domains[1] = consul.ClientDomain 1991 1992 for allocID, ar := range c.getAllocRunners() { 1993 ar.taskStatusLock.RLock() 1994 taskStates := copyTaskStates(ar.taskStates) 1995 ar.taskStatusLock.RUnlock() 1996 for taskName, taskState := range taskStates { 1997 // Only keep running tasks 1998 if taskState.State == structs.TaskStateRunning { 1999 d := consul.NewExecutorDomain(allocID, taskName) 2000 domains = append(domains, d) 2001 } 2002 } 2003 } 2004 2005 return c.consulSyncer.ReapUnmatched(domains) 2006 } 2007 2008 // collectHostStats collects host resource usage stats periodically 2009 func (c *Client) collectHostStats() { 2010 // Start collecting host stats right away and then keep collecting every 2011 // collection interval 2012 next := time.NewTimer(0) 2013 defer next.Stop() 2014 for { 2015 select { 2016 case <-next.C: 2017 ru, err := c.hostStatsCollector.Collect() 2018 next.Reset(c.config.StatsCollectionInterval) 2019 if err != nil { 2020 c.logger.Printf("[WARN] client: error fetching host resource usage stats: %v", err) 2021 continue 2022 } 2023 2024 c.resourceUsageLock.Lock() 2025 c.resourceUsage = ru 2026 c.resourceUsageLock.Unlock() 2027 2028 // Publish Node metrics if operator has opted in 2029 if c.config.PublishNodeMetrics { 2030 c.emitStats(ru) 2031 } 2032 case <-c.shutdownCh: 2033 return 2034 } 2035 } 2036 } 2037 2038 // emitStats pushes host resource usage stats to remote metrics collection sinks 2039 func (c *Client) emitStats(hStats *stats.HostStats) { 2040 nodeID := c.Node().ID 2041 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "total"}, float32(hStats.Memory.Total)) 2042 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "available"}, float32(hStats.Memory.Available)) 2043 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "used"}, float32(hStats.Memory.Used)) 2044 metrics.SetGauge([]string{"client", "host", "memory", nodeID, "free"}, float32(hStats.Memory.Free)) 2045 2046 metrics.SetGauge([]string{"uptime"}, float32(hStats.Uptime)) 2047 2048 for _, cpu := range hStats.CPU { 2049 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "total"}, float32(cpu.Total)) 2050 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "user"}, float32(cpu.User)) 2051 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "idle"}, float32(cpu.Idle)) 2052 metrics.SetGauge([]string{"client", "host", "cpu", nodeID, cpu.CPU, "system"}, float32(cpu.System)) 2053 } 2054 2055 for _, disk := range hStats.DiskStats { 2056 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "size"}, float32(disk.Size)) 2057 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used"}, float32(disk.Used)) 
2058 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "available"}, float32(disk.Available)) 2059 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "used_percent"}, float32(disk.UsedPercent)) 2060 metrics.SetGauge([]string{"client", "host", "disk", nodeID, disk.Device, "inodes_percent"}, float32(disk.InodesUsedPercent)) 2061 } 2062 } 2063 2064 // resolveServer, given a server's address as a string, returns its resolved 2065 // net.Addr or an error. 2066 func resolveServer(s string) (net.Addr, error) { 2067 const defaultClientPort = "4647" // default client RPC port 2068 host, port, err := net.SplitHostPort(s) 2069 if err != nil { 2070 if strings.Contains(err.Error(), "missing port") { 2071 host = s 2072 port = defaultClientPort 2073 } else { 2074 return nil, err 2075 } 2076 } 2077 return net.ResolveTCPAddr("tcp", net.JoinHostPort(host, port)) 2078 }
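As a hedged illustration of how resolveServer falls back to the default RPC port when an address omits one, the standalone sketch below reimplements the helper outside the client package and exercises it with two hypothetical addresses; it is not part of this file, and the input addresses are made up for the example.

package main

import (
	"fmt"
	"net"
	"strings"
)

// resolveServer mirrors the helper above so this sketch is self-contained:
// addresses without a port are resolved against the default RPC port 4647.
func resolveServer(s string) (net.Addr, error) {
	const defaultClientPort = "4647"
	host, port, err := net.SplitHostPort(s)
	if err != nil {
		if strings.Contains(err.Error(), "missing port") {
			host = s
			port = defaultClientPort
		} else {
			return nil, err
		}
	}
	return net.ResolveTCPAddr("tcp", net.JoinHostPort(host, port))
}

func main() {
	// Hypothetical inputs: one address with an explicit port, one without.
	for _, s := range []string{"10.0.0.1:4647", "10.0.0.2"} {
		addr, err := resolveServer(s)
		if err != nil {
			fmt.Printf("%s -> error: %v\n", s, err)
			continue
		}
		fmt.Printf("%s -> %s\n", s, addr)
	}
}

Running this prints "10.0.0.1:4647 -> 10.0.0.1:4647" and "10.0.0.2 -> 10.0.0.2:4647", showing that only the missing-port error triggers the fallback while any other parse or resolution error is returned to the caller.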