github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/client/client.go (about) 1 package client 2 3 import ( 4 "fmt" 5 "io/ioutil" 6 "log" 7 "net" 8 "os" 9 "path/filepath" 10 "strconv" 11 "sync" 12 "time" 13 14 "github.com/hashicorp/go-multierror" 15 "github.com/hashicorp/nomad/client/config" 16 "github.com/hashicorp/nomad/client/driver" 17 "github.com/hashicorp/nomad/client/fingerprint" 18 "github.com/hashicorp/nomad/nomad" 19 "github.com/hashicorp/nomad/nomad/structs" 20 ) 21 22 const ( 23 // clientRPCCache controls how long we keep an idle connection 24 // open to a server 25 clientRPCCache = 30 * time.Second 26 27 // clientMaxStreams controsl how many idle streams we keep 28 // open to a server 29 clientMaxStreams = 2 30 31 // registerRetryIntv is minimum interval on which we retry 32 // registration. We pick a value between this and 2x this. 33 registerRetryIntv = 15 * time.Second 34 35 // getAllocRetryIntv is minimum interval on which we retry 36 // to fetch allocations. We pick a value between this and 2x this. 37 getAllocRetryIntv = 30 * time.Second 38 39 // devModeRetryIntv is the retry interval used for development 40 devModeRetryIntv = time.Second 41 42 // stateSnapshotIntv is how often the client snapshots state 43 stateSnapshotIntv = 60 * time.Second 44 45 // registerErrGrace is the grace period where we don't log about 46 // register errors after start. This is to improve the user experience 47 // in dev mode where the leader isn't elected for a few seconds. 48 registerErrGrace = 10 * time.Second 49 50 // initialHeartbeatStagger is used to stagger the interval between 51 // starting and the intial heartbeat. After the intial heartbeat, 52 // we switch to using the TTL specified by the servers. 53 initialHeartbeatStagger = 10 * time.Second 54 ) 55 56 // DefaultConfig returns the default configuration 57 func DefaultConfig() *config.Config { 58 return &config.Config{ 59 LogOutput: os.Stderr, 60 Region: "global", 61 } 62 } 63 64 // Client is used to implement the client interaction with Nomad. Clients 65 // are expected to register as a schedulable node to the servers, and to 66 // run allocations as determined by the servers. 67 type Client struct { 68 config *config.Config 69 start time.Time 70 71 logger *log.Logger 72 73 lastServer net.Addr 74 lastRPCTime time.Time 75 lastServerLock sync.Mutex 76 77 servers []string 78 serverLock sync.RWMutex 79 80 connPool *nomad.ConnPool 81 82 lastHeartbeat time.Time 83 heartbeatTTL time.Duration 84 85 // allocs is the current set of allocations 86 allocs map[string]*AllocRunner 87 allocLock sync.RWMutex 88 89 shutdown bool 90 shutdownCh chan struct{} 91 shutdownLock sync.Mutex 92 } 93 94 // NewClient is used to create a new client from the given configuration 95 func NewClient(cfg *config.Config) (*Client, error) { 96 // Create a logger 97 logger := log.New(cfg.LogOutput, "", log.LstdFlags) 98 99 // Create the client 100 c := &Client{ 101 config: cfg, 102 start: time.Now(), 103 connPool: nomad.NewPool(cfg.LogOutput, clientRPCCache, clientMaxStreams, nil), 104 logger: logger, 105 allocs: make(map[string]*AllocRunner), 106 shutdownCh: make(chan struct{}), 107 } 108 109 // Initialize the client 110 if err := c.init(); err != nil { 111 return nil, fmt.Errorf("failed intializing client: %v", err) 112 } 113 114 // Restore the state 115 if err := c.restoreState(); err != nil { 116 return nil, fmt.Errorf("failed to restore state: %v", err) 117 } 118 119 // Setup the node 120 if err := c.setupNode(); err != nil { 121 return nil, fmt.Errorf("node setup failed: %v", err) 122 } 123 124 // Fingerprint the node 125 if err := c.fingerprint(); err != nil { 126 return nil, fmt.Errorf("fingerprinting failed: %v", err) 127 } 128 129 // Scan for drivers 130 if err := c.setupDrivers(); err != nil { 131 return nil, fmt.Errorf("driver setup failed: %v", err) 132 } 133 134 // Set up the known servers list 135 c.SetServers(c.config.Servers) 136 137 // Start the client! 138 go c.run() 139 return c, nil 140 } 141 142 // init is used to initialize the client and perform any setup 143 // needed before we begin starting its various components. 144 func (c *Client) init() error { 145 // Ensure the state dir exists if we have one 146 if c.config.StateDir != "" { 147 if err := os.MkdirAll(c.config.StateDir, 0700); err != nil { 148 return fmt.Errorf("failed creating state dir: %s", err) 149 } 150 151 c.logger.Printf("[INFO] client: using state directory %v", c.config.StateDir) 152 } 153 154 // Ensure the alloc dir exists if we have one 155 if c.config.AllocDir != "" { 156 if err := os.MkdirAll(c.config.AllocDir, 0700); err != nil { 157 return fmt.Errorf("failed creating alloc dir: %s", err) 158 } 159 } else { 160 // Othewise make a temp directory to use. 161 p, err := ioutil.TempDir("", "NomadClient") 162 if err != nil { 163 return fmt.Errorf("failed creating temporary directory for the AllocDir: %v", err) 164 } 165 c.config.AllocDir = p 166 } 167 168 c.logger.Printf("[INFO] client: using alloc directory %v", c.config.AllocDir) 169 return nil 170 } 171 172 // Leave is used to prepare the client to leave the cluster 173 func (c *Client) Leave() error { 174 // TODO 175 return nil 176 } 177 178 // Shutdown is used to tear down the client 179 func (c *Client) Shutdown() error { 180 c.logger.Printf("[INFO] client: shutting down") 181 c.shutdownLock.Lock() 182 defer c.shutdownLock.Unlock() 183 184 if c.shutdown { 185 return nil 186 } 187 188 // Destroy all the running allocations. 189 if c.config.DevMode { 190 for _, ar := range c.allocs { 191 ar.Destroy() 192 <-ar.WaitCh() 193 } 194 } 195 196 c.shutdown = true 197 close(c.shutdownCh) 198 c.connPool.Shutdown() 199 return c.saveState() 200 } 201 202 // RPC is used to forward an RPC call to a nomad server, or fail if no servers 203 func (c *Client) RPC(method string, args interface{}, reply interface{}) error { 204 // Invoke the RPCHandle if it exists 205 if c.config.RPCHandler != nil { 206 return c.config.RPCHandler.RPC(method, args, reply) 207 } 208 209 // Pick a server to request from 210 addr, err := c.pickServer() 211 if err != nil { 212 return err 213 } 214 215 // Make the RPC request 216 err = c.connPool.RPC(c.config.Region, addr, 1, method, args, reply) 217 218 // Update the last server information 219 c.lastServerLock.Lock() 220 if err != nil { 221 c.lastServer = nil 222 c.lastRPCTime = time.Time{} 223 } else { 224 c.lastServer = addr 225 c.lastRPCTime = time.Now() 226 } 227 c.lastServerLock.Unlock() 228 return err 229 } 230 231 // pickServer is used to pick a target RPC server 232 func (c *Client) pickServer() (net.Addr, error) { 233 c.lastServerLock.Lock() 234 defer c.lastServerLock.Unlock() 235 236 // Check for a valid last-used server 237 if c.lastServer != nil && time.Now().Sub(c.lastRPCTime) < clientRPCCache { 238 return c.lastServer, nil 239 } 240 241 // Bail if we can't find any servers 242 servers := c.Servers() 243 if len(servers) == 0 { 244 return nil, fmt.Errorf("no known servers") 245 } 246 247 // Shuffle so we don't always use the same server 248 shuffleStrings(servers) 249 250 // Try to resolve each server 251 for i := 0; i < len(servers); i++ { 252 addr, err := net.ResolveTCPAddr("tcp", servers[i]) 253 if err == nil { 254 c.lastServer = addr 255 c.lastRPCTime = time.Now() 256 return addr, nil 257 } 258 c.logger.Printf("[WARN] client: failed to resolve '%s': %s", servers[i], err) 259 } 260 261 // Bail if we reach this point 262 return nil, fmt.Errorf("failed to resolve any servers") 263 } 264 265 // Servers is used to return the current known servers list. When an agent 266 // is first started, this list comes directly from configuration files. 267 func (c *Client) Servers() []string { 268 c.serverLock.RLock() 269 defer c.serverLock.RUnlock() 270 return c.servers 271 } 272 273 // SetServers is used to modify the known servers list. This avoids forcing 274 // a config rollout + rolling restart and enables auto-join features. The 275 // full set of servers is passed to support adding and/or removing servers. 276 func (c *Client) SetServers(servers []string) { 277 c.serverLock.Lock() 278 defer c.serverLock.Unlock() 279 if servers == nil { 280 servers = make([]string, 0) 281 } 282 c.servers = servers 283 } 284 285 // Stats is used to return statistics for debugging and insight 286 // for various sub-systems 287 func (c *Client) Stats() map[string]map[string]string { 288 toString := func(v uint64) string { 289 return strconv.FormatUint(v, 10) 290 } 291 c.allocLock.RLock() 292 numAllocs := len(c.allocs) 293 c.allocLock.RUnlock() 294 295 stats := map[string]map[string]string{ 296 "client": map[string]string{ 297 "known_servers": toString(uint64(len(c.Servers()))), 298 "num_allocations": toString(uint64(numAllocs)), 299 "last_heartbeat": fmt.Sprintf("%v", time.Since(c.lastHeartbeat)), 300 "heartbeat_ttl": fmt.Sprintf("%v", c.heartbeatTTL), 301 }, 302 "runtime": nomad.RuntimeStats(), 303 } 304 return stats 305 } 306 307 // Node returns the locally registered node 308 func (c *Client) Node() *structs.Node { 309 return c.config.Node 310 } 311 312 // restoreState is used to restore our state from the data dir 313 func (c *Client) restoreState() error { 314 if c.config.DevMode { 315 return nil 316 } 317 318 // Scan the directory 319 list, err := ioutil.ReadDir(filepath.Join(c.config.StateDir, "alloc")) 320 if err != nil && os.IsNotExist(err) { 321 return nil 322 } else if err != nil { 323 return fmt.Errorf("failed to list alloc state: %v", err) 324 } 325 326 // Load each alloc back 327 var mErr multierror.Error 328 for _, entry := range list { 329 id := entry.Name() 330 alloc := &structs.Allocation{ID: id} 331 ar := NewAllocRunner(c.logger, c.config, c.updateAllocStatus, alloc) 332 c.allocs[id] = ar 333 if err := ar.RestoreState(); err != nil { 334 c.logger.Printf("[ERR] client: failed to restore state for alloc %s: %v", 335 id, err) 336 mErr.Errors = append(mErr.Errors, err) 337 } else { 338 go ar.Run() 339 } 340 } 341 return mErr.ErrorOrNil() 342 } 343 344 // saveState is used to snapshot our state into the data dir 345 func (c *Client) saveState() error { 346 if c.config.DevMode { 347 return nil 348 } 349 350 var mErr multierror.Error 351 c.allocLock.RLock() 352 defer c.allocLock.RUnlock() 353 for id, ar := range c.allocs { 354 if err := ar.SaveState(); err != nil { 355 c.logger.Printf("[ERR] client: failed to save state for alloc %s: %v", 356 id, err) 357 mErr.Errors = append(mErr.Errors, err) 358 } 359 } 360 return mErr.ErrorOrNil() 361 } 362 363 // nodeID restores a persistent unique ID or generates a new one 364 func (c *Client) nodeID() (string, error) { 365 // Do not persist in dev mode 366 if c.config.DevMode { 367 return structs.GenerateUUID(), nil 368 } 369 370 // Attempt to read existing ID 371 path := filepath.Join(c.config.StateDir, "client-id") 372 buf, err := ioutil.ReadFile(path) 373 if err != nil && !os.IsNotExist(err) { 374 return "", err 375 } 376 377 // Use existing ID if any 378 if len(buf) != 0 { 379 return string(buf), nil 380 } 381 382 // Generate new ID 383 id := structs.GenerateUUID() 384 385 // Persist the ID 386 if err := ioutil.WriteFile(path, []byte(id), 0700); err != nil { 387 return "", err 388 } 389 return id, nil 390 } 391 392 // setupNode is used to setup the initial node 393 func (c *Client) setupNode() error { 394 node := c.config.Node 395 if node == nil { 396 node = &structs.Node{} 397 c.config.Node = node 398 } 399 if node.Attributes == nil { 400 node.Attributes = make(map[string]string) 401 } 402 if node.Links == nil { 403 node.Links = make(map[string]string) 404 } 405 if node.Meta == nil { 406 node.Meta = make(map[string]string) 407 } 408 if node.Resources == nil { 409 node.Resources = &structs.Resources{} 410 } 411 if node.ID == "" { 412 id, err := c.nodeID() 413 if err != nil { 414 return fmt.Errorf("node ID setup failed: %v", err) 415 } 416 node.ID = id 417 } 418 if node.Datacenter == "" { 419 node.Datacenter = "dc1" 420 } 421 if node.Name == "" { 422 node.Name, _ = os.Hostname() 423 } 424 if node.Name == "" { 425 node.Name = node.ID 426 } 427 node.Status = structs.NodeStatusInit 428 return nil 429 } 430 431 // fingerprint is used to fingerprint the client and setup the node 432 func (c *Client) fingerprint() error { 433 var applied []string 434 for _, name := range fingerprint.BuiltinFingerprints { 435 f, err := fingerprint.NewFingerprint(name, c.logger) 436 if err != nil { 437 return err 438 } 439 applies, err := f.Fingerprint(c.config, c.config.Node) 440 if err != nil { 441 return err 442 } 443 if applies { 444 applied = append(applied, name) 445 } 446 } 447 c.logger.Printf("[DEBUG] client: applied fingerprints %v", applied) 448 return nil 449 } 450 451 // setupDrivers is used to find the available drivers 452 func (c *Client) setupDrivers() error { 453 var avail []string 454 driverCtx := driver.NewDriverContext("", c.config, c.config.Node, c.logger) 455 for name := range driver.BuiltinDrivers { 456 d, err := driver.NewDriver(name, driverCtx) 457 if err != nil { 458 return err 459 } 460 applies, err := d.Fingerprint(c.config, c.config.Node) 461 if err != nil { 462 return err 463 } 464 if applies { 465 avail = append(avail, name) 466 } 467 } 468 c.logger.Printf("[DEBUG] client: available drivers %v", avail) 469 return nil 470 } 471 472 // retryIntv calculates a retry interval value given the base 473 func (c *Client) retryIntv(base time.Duration) time.Duration { 474 if c.config.DevMode { 475 return devModeRetryIntv 476 } 477 return base + randomStagger(base) 478 } 479 480 // run is a long lived goroutine used to run the client 481 func (c *Client) run() { 482 // Register the client 483 for { 484 if err := c.registerNode(); err == nil { 485 break 486 } 487 select { 488 case <-time.After(c.retryIntv(registerRetryIntv)): 489 case <-c.shutdownCh: 490 return 491 } 492 } 493 494 // Setup the heartbeat timer, for the initial registration 495 // we want to do this quickly. We want to do it extra quickly 496 // in development mode. 497 var heartbeat <-chan time.Time 498 if c.config.DevMode { 499 heartbeat = time.After(0) 500 } else { 501 heartbeat = time.After(randomStagger(initialHeartbeatStagger)) 502 } 503 504 // Watch for changes in allocations 505 allocUpdates := make(chan []*structs.Allocation, 1) 506 go c.watchAllocations(allocUpdates) 507 508 // Create a snapshot timer 509 snapshot := time.After(stateSnapshotIntv) 510 511 // Periodically update our status and wait for termination 512 for { 513 select { 514 case <-snapshot: 515 snapshot = time.After(stateSnapshotIntv) 516 if err := c.saveState(); err != nil { 517 c.logger.Printf("[ERR] client: failed to save state: %v", err) 518 } 519 520 case allocs := <-allocUpdates: 521 c.runAllocs(allocs) 522 523 case <-heartbeat: 524 if err := c.updateNodeStatus(); err != nil { 525 heartbeat = time.After(c.retryIntv(registerRetryIntv)) 526 } else { 527 heartbeat = time.After(c.heartbeatTTL) 528 } 529 530 case <-c.shutdownCh: 531 return 532 } 533 } 534 } 535 536 // registerNode is used to register the node or update the registration 537 func (c *Client) registerNode() error { 538 node := c.Node() 539 req := structs.NodeRegisterRequest{ 540 Node: node, 541 WriteRequest: structs.WriteRequest{Region: c.config.Region}, 542 } 543 var resp structs.NodeUpdateResponse 544 err := c.RPC("Node.Register", &req, &resp) 545 if err != nil { 546 if time.Since(c.start) > registerErrGrace { 547 c.logger.Printf("[ERR] client: failed to register node: %v", err) 548 } 549 return err 550 } 551 c.logger.Printf("[DEBUG] client: node registration complete") 552 if len(resp.EvalIDs) != 0 { 553 c.logger.Printf("[DEBUG] client: %d evaluations triggered by node registration", len(resp.EvalIDs)) 554 } 555 c.lastHeartbeat = time.Now() 556 c.heartbeatTTL = resp.HeartbeatTTL 557 return nil 558 } 559 560 // updateNodeStatus is used to heartbeat and update the status of the node 561 func (c *Client) updateNodeStatus() error { 562 node := c.Node() 563 req := structs.NodeUpdateStatusRequest{ 564 NodeID: node.ID, 565 Status: structs.NodeStatusReady, 566 WriteRequest: structs.WriteRequest{Region: c.config.Region}, 567 } 568 var resp structs.NodeUpdateResponse 569 err := c.RPC("Node.UpdateStatus", &req, &resp) 570 if err != nil { 571 c.logger.Printf("[ERR] client: failed to update status: %v", err) 572 return err 573 } 574 if len(resp.EvalIDs) != 0 { 575 c.logger.Printf("[DEBUG] client: %d evaluations triggered by node update", len(resp.EvalIDs)) 576 } 577 if resp.Index != 0 { 578 c.logger.Printf("[DEBUG] client: state updated to %s", req.Status) 579 } 580 c.lastHeartbeat = time.Now() 581 c.heartbeatTTL = resp.HeartbeatTTL 582 return nil 583 } 584 585 // updateAllocStatus is used to update the status of an allocation 586 func (c *Client) updateAllocStatus(alloc *structs.Allocation) error { 587 args := structs.AllocUpdateRequest{ 588 Alloc: []*structs.Allocation{alloc}, 589 WriteRequest: structs.WriteRequest{Region: c.config.Region}, 590 } 591 var resp structs.GenericResponse 592 err := c.RPC("Node.UpdateAlloc", &args, &resp) 593 if err != nil { 594 c.logger.Printf("[ERR] client: failed to update allocation: %v", err) 595 return err 596 } 597 return nil 598 } 599 600 // watchAllocations is used to scan for updates to allocations 601 func (c *Client) watchAllocations(allocUpdates chan []*structs.Allocation) { 602 req := structs.NodeSpecificRequest{ 603 NodeID: c.Node().ID, 604 QueryOptions: structs.QueryOptions{ 605 Region: c.config.Region, 606 AllowStale: true, 607 }, 608 } 609 var resp structs.NodeAllocsResponse 610 611 for { 612 // Get the allocations, blocking for updates 613 resp = structs.NodeAllocsResponse{} 614 err := c.RPC("Node.GetAllocs", &req, &resp) 615 if err != nil { 616 c.logger.Printf("[ERR] client: failed to query for node allocations: %v", err) 617 retry := c.retryIntv(getAllocRetryIntv) 618 select { 619 case <-time.After(retry): 620 continue 621 case <-c.shutdownCh: 622 return 623 } 624 } 625 626 // Check for shutdown 627 select { 628 case <-c.shutdownCh: 629 return 630 default: 631 } 632 633 // Check for updates 634 if resp.Index <= req.MinQueryIndex { 635 continue 636 } 637 req.MinQueryIndex = resp.Index 638 c.logger.Printf("[DEBUG] client: updated allocations at index %d (%d allocs)", resp.Index, len(resp.Allocs)) 639 640 // Push the updates 641 select { 642 case allocUpdates <- resp.Allocs: 643 case <-c.shutdownCh: 644 return 645 } 646 } 647 } 648 649 // runAllocs is invoked when we get an updated set of allocations 650 func (c *Client) runAllocs(updated []*structs.Allocation) { 651 // Get the existing allocs 652 c.allocLock.RLock() 653 exist := make([]*structs.Allocation, 0, len(c.allocs)) 654 for _, ar := range c.allocs { 655 exist = append(exist, ar.Alloc()) 656 } 657 c.allocLock.RUnlock() 658 659 // Diff the existing and updated allocations 660 diff := diffAllocs(exist, updated) 661 c.logger.Printf("[DEBUG] client: %#v", diff) 662 663 // Remove the old allocations 664 for _, remove := range diff.removed { 665 if err := c.removeAlloc(remove); err != nil { 666 c.logger.Printf("[ERR] client: failed to remove alloc '%s': %v", 667 remove.ID, err) 668 } 669 } 670 671 // Update the existing allocations 672 for _, update := range diff.updated { 673 if err := c.updateAlloc(update.exist, update.updated); err != nil { 674 c.logger.Printf("[ERR] client: failed to update alloc '%s': %v", 675 update.exist.ID, err) 676 } 677 } 678 679 // Start the new allocations 680 for _, add := range diff.added { 681 if err := c.addAlloc(add); err != nil { 682 c.logger.Printf("[ERR] client: failed to add alloc '%s': %v", 683 add.ID, err) 684 } 685 } 686 687 // Persist our state 688 if err := c.saveState(); err != nil { 689 c.logger.Printf("[ERR] client: failed to save state: %v", err) 690 } 691 } 692 693 // removeAlloc is invoked when we should remove an allocation 694 func (c *Client) removeAlloc(alloc *structs.Allocation) error { 695 c.allocLock.Lock() 696 defer c.allocLock.Unlock() 697 ar, ok := c.allocs[alloc.ID] 698 if !ok { 699 c.logger.Printf("[WARN] client: missing context for alloc '%s'", alloc.ID) 700 return nil 701 } 702 ar.Destroy() 703 delete(c.allocs, alloc.ID) 704 return nil 705 } 706 707 // updateAlloc is invoked when we should update an allocation 708 func (c *Client) updateAlloc(exist, update *structs.Allocation) error { 709 c.allocLock.RLock() 710 defer c.allocLock.RUnlock() 711 ar, ok := c.allocs[exist.ID] 712 if !ok { 713 c.logger.Printf("[WARN] client: missing context for alloc '%s'", exist.ID) 714 return nil 715 } 716 ar.Update(update) 717 return nil 718 } 719 720 // addAlloc is invoked when we should add an allocation 721 func (c *Client) addAlloc(alloc *structs.Allocation) error { 722 c.allocLock.Lock() 723 defer c.allocLock.Unlock() 724 ar := NewAllocRunner(c.logger, c.config, c.updateAllocStatus, alloc) 725 c.allocs[alloc.ID] = ar 726 go ar.Run() 727 return nil 728 }