github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/nomad/node_endpoint.go (about) 1 package nomad 2 3 import ( 4 "context" 5 "fmt" 6 "strings" 7 "sync" 8 "time" 9 10 "golang.org/x/sync/errgroup" 11 12 "github.com/armon/go-metrics" 13 "github.com/hashicorp/go-memdb" 14 "github.com/hashicorp/go-multierror" 15 "github.com/hashicorp/nomad/acl" 16 "github.com/hashicorp/nomad/helper/uuid" 17 "github.com/hashicorp/nomad/nomad/state" 18 "github.com/hashicorp/nomad/nomad/structs" 19 "github.com/hashicorp/raft" 20 vapi "github.com/hashicorp/vault/api" 21 ) 22 23 const ( 24 // batchUpdateInterval is how long we wait to batch updates 25 batchUpdateInterval = 50 * time.Millisecond 26 27 // maxParallelRequestsPerDerive is the maximum number of parallel Vault 28 // create token requests that may be outstanding per derive request 29 maxParallelRequestsPerDerive = 16 30 ) 31 32 // Node endpoint is used for client interactions 33 type Node struct { 34 srv *Server 35 36 // ctx provides context regarding the underlying connection 37 ctx *RPCContext 38 39 // updates holds pending client status updates for allocations 40 updates []*structs.Allocation 41 42 // updateFuture is used to wait for the pending batch update 43 // to complete. This may be nil if no batch is pending. 44 updateFuture *batchFuture 45 46 // updateTimer is the timer that will trigger the next batch 47 // update, and may be nil if there is no batch pending. 48 updateTimer *time.Timer 49 50 // updatesLock synchronizes access to the updates list, 51 // the future and the timer. 
52 updatesLock sync.Mutex 53 } 54 55 // Register is used to upsert a client that is available for scheduling 56 func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error { 57 if done, err := n.srv.forward("Node.Register", args, args, reply); done { 58 return err 59 } 60 defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now()) 61 62 // Validate the arguments 63 if args.Node == nil { 64 return fmt.Errorf("missing node for client registration") 65 } 66 if args.Node.ID == "" { 67 return fmt.Errorf("missing node ID for client registration") 68 } 69 if args.Node.Datacenter == "" { 70 return fmt.Errorf("missing datacenter for client registration") 71 } 72 if args.Node.Name == "" { 73 return fmt.Errorf("missing node name for client registration") 74 } 75 if len(args.Node.Attributes) == 0 { 76 return fmt.Errorf("missing attributes for client registration") 77 } 78 if args.Node.SecretID == "" { 79 return fmt.Errorf("missing node secret ID for client registration") 80 } 81 82 // Default the status if none is given 83 if args.Node.Status == "" { 84 args.Node.Status = structs.NodeStatusInit 85 } 86 if !structs.ValidNodeStatus(args.Node.Status) { 87 return fmt.Errorf("invalid status for node") 88 } 89 90 // Set the timestamp when the node is registered 91 args.Node.StatusUpdatedAt = time.Now().Unix() 92 93 // Compute the node class 94 if err := args.Node.ComputeClass(); err != nil { 95 return fmt.Errorf("failed to computed node class: %v", err) 96 } 97 98 // Look for the node so we can detect a state transition 99 snap, err := n.srv.fsm.State().Snapshot() 100 if err != nil { 101 return err 102 } 103 104 ws := memdb.NewWatchSet() 105 originalNode, err := snap.NodeByID(ws, args.Node.ID) 106 if err != nil { 107 return err 108 } 109 110 // Check if the SecretID has been tampered with 111 if originalNode != nil { 112 if args.Node.SecretID != originalNode.SecretID && originalNode.SecretID != "" { 113 return 
fmt.Errorf("node secret ID does not match. Not registering node.") 114 } 115 } 116 117 // We have a valid node connection, so add the mapping to cache the 118 // connection and allow the server to send RPCs to the client. 119 if n.ctx != nil && n.ctx.NodeID == "" { 120 n.ctx.NodeID = args.Node.ID 121 n.srv.addNodeConn(n.ctx) 122 } 123 124 // Commit this update via Raft 125 _, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args) 126 if err != nil { 127 n.srv.logger.Printf("[ERR] nomad.client: Register failed: %v", err) 128 return err 129 } 130 reply.NodeModifyIndex = index 131 132 // Check if we should trigger evaluations 133 originalStatus := structs.NodeStatusInit 134 if originalNode != nil { 135 originalStatus = originalNode.Status 136 } 137 transitionToReady := transitionedToReady(args.Node.Status, originalStatus) 138 if structs.ShouldDrainNode(args.Node.Status) || transitionToReady { 139 evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index) 140 if err != nil { 141 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 142 return err 143 } 144 reply.EvalIDs = evalIDs 145 reply.EvalCreateIndex = evalIndex 146 } 147 148 // Check if we need to setup a heartbeat 149 if !args.Node.TerminalStatus() { 150 ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID) 151 if err != nil { 152 n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) 153 return err 154 } 155 reply.HeartbeatTTL = ttl 156 } 157 158 // Set the reply index 159 reply.Index = index 160 snap, err = n.srv.fsm.State().Snapshot() 161 if err != nil { 162 return err 163 } 164 165 n.srv.peerLock.RLock() 166 defer n.srv.peerLock.RUnlock() 167 if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { 168 n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) 169 return err 170 } 171 172 return nil 173 } 174 175 // updateNodeUpdateResponse assumes the n.srv.peerLock is held for reading. 
176 func (n *Node) constructNodeServerInfoResponse(snap *state.StateSnapshot, reply *structs.NodeUpdateResponse) error { 177 reply.LeaderRPCAddr = string(n.srv.raft.Leader()) 178 179 // Reply with config information required for future RPC requests 180 reply.Servers = make([]*structs.NodeServerInfo, 0, len(n.srv.localPeers)) 181 for k, v := range n.srv.localPeers { 182 reply.Servers = append(reply.Servers, 183 &structs.NodeServerInfo{ 184 RPCAdvertiseAddr: string(k), 185 RPCMajorVersion: int32(v.MajorVersion), 186 RPCMinorVersion: int32(v.MinorVersion), 187 Datacenter: v.Datacenter, 188 }) 189 } 190 191 // TODO(sean@): Use an indexed node count instead 192 // 193 // Snapshot is used only to iterate over all nodes to create a node 194 // count to send back to Nomad Clients in their heartbeat so Clients 195 // can estimate the size of the cluster. 196 ws := memdb.NewWatchSet() 197 iter, err := snap.Nodes(ws) 198 if err == nil { 199 for { 200 raw := iter.Next() 201 if raw == nil { 202 break 203 } 204 reply.NumNodes++ 205 } 206 } 207 208 return nil 209 } 210 211 // Deregister is used to remove a client from the cluster. If a client should 212 // just be made unavailable for scheduling, a status update is preferred. 
213 func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error { 214 if done, err := n.srv.forward("Node.Deregister", args, args, reply); done { 215 return err 216 } 217 defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now()) 218 219 // Check node permissions 220 if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil { 221 return err 222 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 223 return structs.ErrPermissionDenied 224 } 225 226 // Verify the arguments 227 if args.NodeID == "" { 228 return fmt.Errorf("missing node ID for client deregistration") 229 } 230 // Look for the node 231 snap, err := n.srv.fsm.State().Snapshot() 232 if err != nil { 233 return err 234 } 235 236 ws := memdb.NewWatchSet() 237 node, err := snap.NodeByID(ws, args.NodeID) 238 if err != nil { 239 return err 240 } 241 if node == nil { 242 return fmt.Errorf("node not found") 243 } 244 245 // Commit this update via Raft 246 _, index, err := n.srv.raftApply(structs.NodeDeregisterRequestType, args) 247 if err != nil { 248 n.srv.logger.Printf("[ERR] nomad.client: Deregister failed: %v", err) 249 return err 250 } 251 252 // Clear the heartbeat timer if any 253 n.srv.clearHeartbeatTimer(args.NodeID) 254 255 // Create the evaluations for this node 256 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 257 if err != nil { 258 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 259 return err 260 } 261 262 // Determine if there are any Vault accessors on the node 263 accessors, err := snap.VaultAccessorsByNode(ws, args.NodeID) 264 if err != nil { 265 n.srv.logger.Printf("[ERR] nomad.client: looking up accessors for node %q failed: %v", args.NodeID, err) 266 return err 267 } 268 269 if l := len(accessors); l != 0 { 270 n.srv.logger.Printf("[DEBUG] nomad.client: revoking %d accessors on node %q due to deregister", l, args.NodeID) 271 if err := 
n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil { 272 n.srv.logger.Printf("[ERR] nomad.client: revoking accessors for node %q failed: %v", args.NodeID, err) 273 return err 274 } 275 } 276 277 // Setup the reply 278 reply.EvalIDs = evalIDs 279 reply.EvalCreateIndex = evalIndex 280 reply.NodeModifyIndex = index 281 reply.Index = index 282 return nil 283 } 284 285 // UpdateStatus is used to update the status of a client node 286 func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error { 287 if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done { 288 return err 289 } 290 defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now()) 291 292 // Verify the arguments 293 if args.NodeID == "" { 294 return fmt.Errorf("missing node ID for client status update") 295 } 296 if !structs.ValidNodeStatus(args.Status) { 297 return fmt.Errorf("invalid status for node") 298 } 299 300 // Look for the node 301 snap, err := n.srv.fsm.State().Snapshot() 302 if err != nil { 303 return err 304 } 305 306 ws := memdb.NewWatchSet() 307 node, err := snap.NodeByID(ws, args.NodeID) 308 if err != nil { 309 return err 310 } 311 if node == nil { 312 return fmt.Errorf("node not found") 313 } 314 315 // We have a valid node connection, so add the mapping to cache the 316 // connection and allow the server to send RPCs to the client. 317 if n.ctx != nil && n.ctx.NodeID == "" { 318 n.ctx.NodeID = args.NodeID 319 n.srv.addNodeConn(n.ctx) 320 } 321 322 // XXX: Could use the SecretID here but have to update the heartbeat system 323 // to track SecretIDs. 
324 325 // Update the timestamp of when the node status was updated 326 node.StatusUpdatedAt = time.Now().Unix() 327 328 // Commit this update via Raft 329 var index uint64 330 if node.Status != args.Status { 331 _, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args) 332 if err != nil { 333 n.srv.logger.Printf("[ERR] nomad.client: status update failed: %v", err) 334 return err 335 } 336 reply.NodeModifyIndex = index 337 } 338 339 // Check if we should trigger evaluations 340 transitionToReady := transitionedToReady(args.Status, node.Status) 341 if structs.ShouldDrainNode(args.Status) || transitionToReady { 342 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 343 if err != nil { 344 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 345 return err 346 } 347 reply.EvalIDs = evalIDs 348 reply.EvalCreateIndex = evalIndex 349 } 350 351 // Check if we need to setup a heartbeat 352 switch args.Status { 353 case structs.NodeStatusDown: 354 // Determine if there are any Vault accessors on the node 355 accessors, err := n.srv.State().VaultAccessorsByNode(ws, args.NodeID) 356 if err != nil { 357 n.srv.logger.Printf("[ERR] nomad.client: looking up accessors for node %q failed: %v", args.NodeID, err) 358 return err 359 } 360 361 if l := len(accessors); l != 0 { 362 n.srv.logger.Printf("[DEBUG] nomad.client: revoking %d accessors on node %q due to down state", l, args.NodeID) 363 if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil { 364 n.srv.logger.Printf("[ERR] nomad.client: revoking accessors for node %q failed: %v", args.NodeID, err) 365 return err 366 } 367 } 368 default: 369 ttl, err := n.srv.resetHeartbeatTimer(args.NodeID) 370 if err != nil { 371 n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) 372 return err 373 } 374 reply.HeartbeatTTL = ttl 375 } 376 377 // Set the reply index and leader 378 reply.Index = index 379 n.srv.peerLock.RLock() 380 defer 
n.srv.peerLock.RUnlock() 381 if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { 382 n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) 383 return err 384 } 385 386 return nil 387 } 388 389 // transitionedToReady is a helper that takes a nodes new and old status and 390 // returns whether it has transistioned to ready. 391 func transitionedToReady(newStatus, oldStatus string) bool { 392 initToReady := oldStatus == structs.NodeStatusInit && newStatus == structs.NodeStatusReady 393 terminalToReady := oldStatus == structs.NodeStatusDown && newStatus == structs.NodeStatusReady 394 return initToReady || terminalToReady 395 } 396 397 // UpdateDrain is used to update the drain mode of a client node 398 func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, 399 reply *structs.NodeDrainUpdateResponse) error { 400 if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done { 401 return err 402 } 403 defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now()) 404 405 // Check node write permissions 406 if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil { 407 return err 408 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 409 return structs.ErrPermissionDenied 410 } 411 412 // Verify the arguments 413 if args.NodeID == "" { 414 return fmt.Errorf("missing node ID for drain update") 415 } 416 417 // Look for the node 418 snap, err := n.srv.fsm.State().Snapshot() 419 if err != nil { 420 return err 421 } 422 ws := memdb.NewWatchSet() 423 node, err := snap.NodeByID(ws, args.NodeID) 424 if err != nil { 425 return err 426 } 427 if node == nil { 428 return fmt.Errorf("node not found") 429 } 430 431 // Update the timestamp to 432 node.StatusUpdatedAt = time.Now().Unix() 433 434 // Commit this update via Raft 435 var index uint64 436 if node.Drain != args.Drain { 437 _, index, err = n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) 438 if err != nil { 439 
n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err) 440 return err 441 } 442 reply.NodeModifyIndex = index 443 } 444 445 // Always attempt to create Node evaluations because there may be a System 446 // job registered that should be evaluated. 447 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 448 if err != nil { 449 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 450 return err 451 } 452 reply.EvalIDs = evalIDs 453 reply.EvalCreateIndex = evalIndex 454 455 // Set the reply index 456 reply.Index = index 457 return nil 458 } 459 460 // Evaluate is used to force a re-evaluation of the node 461 func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error { 462 if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done { 463 return err 464 } 465 defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now()) 466 467 // Check node write permissions 468 if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil { 469 return err 470 } else if aclObj != nil && !aclObj.AllowNodeWrite() { 471 return structs.ErrPermissionDenied 472 } 473 474 // Verify the arguments 475 if args.NodeID == "" { 476 return fmt.Errorf("missing node ID for evaluation") 477 } 478 479 // Look for the node 480 snap, err := n.srv.fsm.State().Snapshot() 481 if err != nil { 482 return err 483 } 484 ws := memdb.NewWatchSet() 485 node, err := snap.NodeByID(ws, args.NodeID) 486 if err != nil { 487 return err 488 } 489 if node == nil { 490 return fmt.Errorf("node not found") 491 } 492 493 // Create the evaluation 494 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex) 495 if err != nil { 496 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 497 return err 498 } 499 reply.EvalIDs = evalIDs 500 reply.EvalCreateIndex = evalIndex 501 502 // Set the reply index 503 reply.Index = evalIndex 504 505 n.srv.peerLock.RLock() 506 defer 
n.srv.peerLock.RUnlock() 507 if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { 508 n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) 509 return err 510 } 511 return nil 512 } 513 514 // GetNode is used to request information about a specific node 515 func (n *Node) GetNode(args *structs.NodeSpecificRequest, 516 reply *structs.SingleNodeResponse) error { 517 if done, err := n.srv.forward("Node.GetNode", args, args, reply); done { 518 return err 519 } 520 defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now()) 521 522 // Check node read permissions 523 if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil { 524 // If ResolveToken had an unexpected error return that 525 if err != structs.ErrTokenNotFound { 526 return err 527 } 528 529 // Attempt to lookup AuthToken as a Node.SecretID since nodes 530 // call this endpoint and don't have an ACL token. 531 node, stateErr := n.srv.fsm.State().NodeBySecretID(nil, args.AuthToken) 532 if stateErr != nil { 533 // Return the original ResolveToken error with this err 534 var merr multierror.Error 535 merr.Errors = append(merr.Errors, err, stateErr) 536 return merr.ErrorOrNil() 537 } 538 539 // Not a node or a valid ACL token 540 if node == nil { 541 return structs.ErrTokenNotFound 542 } 543 } else if aclObj != nil && !aclObj.AllowNodeRead() { 544 return structs.ErrPermissionDenied 545 } 546 547 // Setup the blocking query 548 opts := blockingOptions{ 549 queryOpts: &args.QueryOptions, 550 queryMeta: &reply.QueryMeta, 551 run: func(ws memdb.WatchSet, state *state.StateStore) error { 552 // Verify the arguments 553 if args.NodeID == "" { 554 return fmt.Errorf("missing node ID") 555 } 556 557 // Look for the node 558 out, err := state.NodeByID(ws, args.NodeID) 559 if err != nil { 560 return err 561 } 562 563 // Setup the output 564 if out != nil { 565 // Clear the secret ID 566 reply.Node = out.Copy() 567 reply.Node.SecretID = "" 568 
reply.Index = out.ModifyIndex 569 } else { 570 // Use the last index that affected the nodes table 571 index, err := state.Index("nodes") 572 if err != nil { 573 return err 574 } 575 reply.Node = nil 576 reply.Index = index 577 } 578 579 // Set the query response 580 n.srv.setQueryMeta(&reply.QueryMeta) 581 return nil 582 }} 583 return n.srv.blockingRPC(&opts) 584 } 585 586 // GetAllocs is used to request allocations for a specific node 587 func (n *Node) GetAllocs(args *structs.NodeSpecificRequest, 588 reply *structs.NodeAllocsResponse) error { 589 if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done { 590 return err 591 } 592 defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now()) 593 594 // Check node read and namespace job read permissions 595 aclObj, err := n.srv.ResolveToken(args.AuthToken) 596 if err != nil { 597 return err 598 } 599 if aclObj != nil && !aclObj.AllowNodeRead() { 600 return structs.ErrPermissionDenied 601 } 602 603 // cache namespace perms 604 readableNamespaces := map[string]bool{} 605 606 // readNS is a caching namespace read-job helper 607 readNS := func(ns string) bool { 608 if aclObj == nil { 609 // ACLs are disabled; everything is readable 610 return true 611 } 612 613 if readable, ok := readableNamespaces[ns]; ok { 614 // cache hit 615 return readable 616 } 617 618 // cache miss 619 readable := aclObj.AllowNsOp(ns, acl.NamespaceCapabilityReadJob) 620 readableNamespaces[ns] = readable 621 return readable 622 } 623 624 // Verify the arguments 625 if args.NodeID == "" { 626 return fmt.Errorf("missing node ID") 627 } 628 629 // Setup the blocking query 630 opts := blockingOptions{ 631 queryOpts: &args.QueryOptions, 632 queryMeta: &reply.QueryMeta, 633 run: func(ws memdb.WatchSet, state *state.StateStore) error { 634 // Look for the node 635 allocs, err := state.AllocsByNode(ws, args.NodeID) 636 if err != nil { 637 return err 638 } 639 640 // Setup the output 641 if n := len(allocs); n != 0 { 
642 reply.Allocs = make([]*structs.Allocation, 0, n) 643 for _, alloc := range allocs { 644 if readNS(alloc.Namespace) { 645 reply.Allocs = append(reply.Allocs, alloc) 646 } 647 648 // Get the max of all allocs since 649 // subsequent requests need to start 650 // from the latest index 651 reply.Index = maxUint64(reply.Index, alloc.ModifyIndex) 652 } 653 } else { 654 reply.Allocs = nil 655 656 // Use the last index that affected the nodes table 657 index, err := state.Index("allocs") 658 if err != nil { 659 return err 660 } 661 662 // Must provide non-zero index to prevent blocking 663 // Index 1 is impossible anyways (due to Raft internals) 664 if index == 0 { 665 reply.Index = 1 666 } else { 667 reply.Index = index 668 } 669 } 670 return nil 671 }} 672 return n.srv.blockingRPC(&opts) 673 } 674 675 // GetClientAllocs is used to request a lightweight list of alloc modify indexes 676 // per allocation. 677 func (n *Node) GetClientAllocs(args *structs.NodeSpecificRequest, 678 reply *structs.NodeClientAllocsResponse) error { 679 if done, err := n.srv.forward("Node.GetClientAllocs", args, args, reply); done { 680 return err 681 } 682 defer metrics.MeasureSince([]string{"nomad", "client", "get_client_allocs"}, time.Now()) 683 684 // Verify the arguments 685 if args.NodeID == "" { 686 return fmt.Errorf("missing node ID") 687 } 688 689 // numOldAllocs is used to detect if there is a garbage collection event 690 // that effects the node. When an allocation is garbage collected, that does 691 // not change the modify index changes and thus the query won't unblock, 692 // even though the set of allocations on the node has changed. 
693 var numOldAllocs int 694 695 // Setup the blocking query 696 opts := blockingOptions{ 697 queryOpts: &args.QueryOptions, 698 queryMeta: &reply.QueryMeta, 699 run: func(ws memdb.WatchSet, state *state.StateStore) error { 700 // Look for the node 701 node, err := state.NodeByID(ws, args.NodeID) 702 if err != nil { 703 return err 704 } 705 706 var allocs []*structs.Allocation 707 if node != nil { 708 if args.SecretID == "" { 709 return fmt.Errorf("missing node secret ID for client status update") 710 } else if args.SecretID != node.SecretID { 711 return fmt.Errorf("node secret ID does not match") 712 } 713 714 // We have a valid node connection, so add the mapping to cache the 715 // connection and allow the server to send RPCs to the client. 716 if n.ctx != nil && n.ctx.NodeID == "" { 717 n.ctx.NodeID = args.NodeID 718 n.srv.addNodeConn(n.ctx) 719 } 720 721 var err error 722 allocs, err = state.AllocsByNode(ws, args.NodeID) 723 if err != nil { 724 return err 725 } 726 } 727 728 reply.Allocs = make(map[string]uint64) 729 reply.MigrateTokens = make(map[string]string) 730 731 // preferTableIndex is used to determine whether we should build the 732 // response index based on the full table indexes versus the modify 733 // indexes of the allocations on the specific node. This is 734 // preferred in the case that the node doesn't yet have allocations 735 // or when we detect a GC that effects the node. 736 preferTableIndex := true 737 738 // Setup the output 739 if numAllocs := len(allocs); numAllocs != 0 { 740 preferTableIndex = false 741 742 for _, alloc := range allocs { 743 reply.Allocs[alloc.ID] = alloc.AllocModifyIndex 744 745 // If the allocation is going to do a migration, create a 746 // migration token so that the client can authenticate with 747 // the node hosting the previous allocation. 
748 if alloc.ShouldMigrate() { 749 prevAllocation, err := state.AllocByID(ws, alloc.PreviousAllocation) 750 if err != nil { 751 return err 752 } 753 754 if prevAllocation != nil && prevAllocation.NodeID != alloc.NodeID { 755 allocNode, err := state.NodeByID(ws, prevAllocation.NodeID) 756 if err != nil { 757 return err 758 } 759 if allocNode == nil { 760 // Node must have been GC'd so skip the token 761 continue 762 } 763 764 token, err := structs.GenerateMigrateToken(prevAllocation.ID, allocNode.SecretID) 765 if err != nil { 766 return err 767 } 768 reply.MigrateTokens[alloc.ID] = token 769 } 770 } 771 772 reply.Index = maxUint64(reply.Index, alloc.ModifyIndex) 773 } 774 775 // Determine if we have less allocations than before. This 776 // indicates there was a garbage collection 777 if numAllocs < numOldAllocs { 778 preferTableIndex = true 779 } 780 781 // Store the new number of allocations 782 numOldAllocs = numAllocs 783 } 784 785 if preferTableIndex { 786 // Use the last index that affected the nodes table 787 index, err := state.Index("allocs") 788 if err != nil { 789 return err 790 } 791 792 // Must provide non-zero index to prevent blocking 793 // Index 1 is impossible anyways (due to Raft internals) 794 if index == 0 { 795 reply.Index = 1 796 } else { 797 reply.Index = index 798 } 799 } 800 return nil 801 }} 802 return n.srv.blockingRPC(&opts) 803 } 804 805 // UpdateAlloc is used to update the client status of an allocation 806 func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error { 807 if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done { 808 return err 809 } 810 defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now()) 811 812 // Ensure at least a single alloc 813 if len(args.Alloc) == 0 { 814 return fmt.Errorf("must update at least one allocation") 815 } 816 817 // Ensure that evals aren't set from client RPCs 818 // We create them here before the raft update 
819 if len(args.Evals) != 0 { 820 return fmt.Errorf("evals field must not be set ") 821 } 822 823 // Update modified timestamp for client initiated allocation updates 824 now := time.Now() 825 var evals []*structs.Evaluation 826 827 for _, alloc := range args.Alloc { 828 alloc.ModifyTime = now.UTC().UnixNano() 829 830 // Add an evaluation if this is a failed alloc that is eligible for rescheduling 831 if alloc.ClientStatus == structs.AllocClientStatusFailed { 832 // Only create evaluations if this is an existing alloc, 833 // and eligible as per its task group's ReschedulePolicy 834 if existingAlloc, _ := n.srv.State().AllocByID(nil, alloc.ID); existingAlloc != nil { 835 job, err := n.srv.State().JobByID(nil, existingAlloc.Namespace, existingAlloc.JobID) 836 if err != nil { 837 n.srv.logger.Printf("[ERR] nomad.client: UpdateAlloc unable to find job ID %q :%v", existingAlloc.JobID, err) 838 continue 839 } 840 if job == nil { 841 n.srv.logger.Printf("[DEBUG] nomad.client: UpdateAlloc unable to find job ID %q", existingAlloc.JobID) 842 continue 843 } 844 taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup) 845 if taskGroup != nil && existingAlloc.RescheduleEligible(taskGroup.ReschedulePolicy, now) { 846 eval := &structs.Evaluation{ 847 ID: uuid.Generate(), 848 Namespace: existingAlloc.Namespace, 849 TriggeredBy: structs.EvalTriggerRetryFailedAlloc, 850 JobID: existingAlloc.JobID, 851 Type: job.Type, 852 Priority: job.Priority, 853 Status: structs.EvalStatusPending, 854 } 855 evals = append(evals, eval) 856 } 857 } 858 } 859 } 860 if len(evals) > 0 { 861 n.srv.logger.Printf("[DEBUG] nomad.client: Adding %v evaluations for rescheduling failed allocations", len(evals)) 862 } 863 // Add this to the batch 864 n.updatesLock.Lock() 865 n.updates = append(n.updates, args.Alloc...) 
866 867 // Start a new batch if none 868 future := n.updateFuture 869 if future == nil { 870 future = NewBatchFuture() 871 n.updateFuture = future 872 n.updateTimer = time.AfterFunc(batchUpdateInterval, func() { 873 // Get the pending updates 874 n.updatesLock.Lock() 875 updates := n.updates 876 future := n.updateFuture 877 n.updates = nil 878 n.updateFuture = nil 879 n.updateTimer = nil 880 n.updatesLock.Unlock() 881 882 // Perform the batch update 883 n.batchUpdate(future, updates, evals) 884 }) 885 } 886 n.updatesLock.Unlock() 887 888 // Wait for the future 889 if err := future.Wait(); err != nil { 890 return err 891 } 892 893 // Setup the response 894 reply.Index = future.Index() 895 return nil 896 } 897 898 // batchUpdate is used to update all the allocations 899 func (n *Node) batchUpdate(future *batchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) { 900 // Prepare the batch update 901 batch := &structs.AllocUpdateRequest{ 902 Alloc: updates, 903 Evals: evals, 904 WriteRequest: structs.WriteRequest{Region: n.srv.config.Region}, 905 } 906 907 // Commit this update via Raft 908 var mErr multierror.Error 909 _, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, batch) 910 if err != nil { 911 n.srv.logger.Printf("[ERR] nomad.client: alloc update failed: %v", err) 912 mErr.Errors = append(mErr.Errors, err) 913 } 914 915 // For each allocation we are updating check if we should revoke any 916 // Vault Accessors 917 var revoke []*structs.VaultAccessor 918 for _, alloc := range updates { 919 // Skip any allocation that isn't dead on the client 920 if !alloc.Terminated() { 921 continue 922 } 923 924 // Determine if there are any Vault accessors for the allocation 925 ws := memdb.NewWatchSet() 926 accessors, err := n.srv.State().VaultAccessorsByAlloc(ws, alloc.ID) 927 if err != nil { 928 n.srv.logger.Printf("[ERR] nomad.client: looking up accessors for alloc %q failed: %v", alloc.ID, err) 929 mErr.Errors = append(mErr.Errors, err) 
930 } 931 932 revoke = append(revoke, accessors...) 933 } 934 935 if l := len(revoke); l != 0 { 936 n.srv.logger.Printf("[DEBUG] nomad.client: revoking %d accessors due to terminal allocations", l) 937 if err := n.srv.vault.RevokeTokens(context.Background(), revoke, true); err != nil { 938 n.srv.logger.Printf("[ERR] nomad.client: batched accessor revocation failed: %v", err) 939 mErr.Errors = append(mErr.Errors, err) 940 } 941 } 942 943 // Respond to the future 944 future.Respond(index, mErr.ErrorOrNil()) 945 } 946

// List is used to list the available nodes.
//
// The request is forwarded if needed, the caller's token must allow node
// reads when ACLs resolve to a non-nil object, and the listing itself runs as
// a blocking query. When args.QueryOptions.Prefix is non-empty only nodes
// whose ID matches that prefix are returned; otherwise every node is returned.
func (n *Node) List(args *structs.NodeListRequest,
	reply *structs.NodeListResponse) error {
	if done, err := n.srv.forward("Node.List", args, args, reply); done {
		return err
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now())

	// Check node read permissions
	if aclObj, err := n.srv.ResolveToken(args.AuthToken); err != nil {
		return err
	} else if aclObj != nil && !aclObj.AllowNodeRead() {
		return structs.ErrPermissionDenied
	}

	// Setup the blocking query
	opts := blockingOptions{
		queryOpts: &args.QueryOptions,
		queryMeta: &reply.QueryMeta,
		run: func(ws memdb.WatchSet, state *state.StateStore) error {
			// Capture all the nodes
			var err error
			var iter memdb.ResultIterator
			if prefix := args.QueryOptions.Prefix; prefix != "" {
				iter, err = state.NodesByIDPrefix(ws, prefix)
			} else {
				iter, err = state.Nodes(ws)
			}
			if err != nil {
				return err
			}

			// Convert every node into its list stub
			var nodes []*structs.NodeListStub
			for {
				raw := iter.Next()
				if raw == nil {
					break
				}
				node := raw.(*structs.Node)
				nodes = append(nodes, node.Stub())
			}
			reply.Nodes = nodes

			// Use the last index that affected the nodes table
			index, err := state.Index("nodes")
			if err != nil {
				return err
			}
			reply.Index = index

			// Set the query response
			n.srv.setQueryMeta(&reply.QueryMeta)
			return nil
		}}
	return n.srv.blockingRPC(&opts)
}

// createNodeEvals is used to create evaluations for each alloc on a node.
// Each Eval is scoped to a job, so we need to potentially trigger many evals.
//
// One pending evaluation is created per distinct job that either has an
// allocation on the node or uses the "system" scheduler. The evaluations are
// committed through Raft in a single EvalUpdateRequest. It returns the IDs of
// the created evaluations and the Raft index of that commit; when there are
// neither allocations nor system jobs it returns (nil, 0, nil) without
// touching Raft.
func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint64, error) {
	// Snapshot the state
	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		return nil, 0, fmt.Errorf("failed to snapshot state: %v", err)
	}

	// Find all the allocations for this node
	ws := memdb.NewWatchSet()
	allocs, err := snap.AllocsByNode(ws, nodeID)
	if err != nil {
		return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err)
	}

	// Collect every job that uses the system scheduler
	sysJobsIter, err := snap.JobsByScheduler(ws, "system")
	if err != nil {
		return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err)
	}

	var sysJobs []*structs.Job
	for job := sysJobsIter.Next(); job != nil; job = sysJobsIter.Next() {
		sysJobs = append(sysJobs, job.(*structs.Job))
	}

	// Fast-path if nothing to do
	if len(allocs) == 0 && len(sysJobs) == 0 {
		return nil, 0, nil
	}

	// Create an eval for each JobID affected
	var evals []*structs.Evaluation
	var evalIDs []string
	jobIDs := make(map[string]struct{})

	for _, alloc := range allocs {
		// Deduplicate on JobID
		if _, ok := jobIDs[alloc.JobID]; ok {
			continue
		}
		jobIDs[alloc.JobID] = struct{}{}

		// Create a new eval
		eval := &structs.Evaluation{
			ID:              uuid.Generate(),
			Namespace:       alloc.Namespace,
			Priority:        alloc.Job.Priority,
			Type:            alloc.Job.Type,
			TriggeredBy:     structs.EvalTriggerNodeUpdate,
			JobID:           alloc.JobID,
			NodeID:          nodeID,
			NodeModifyIndex: nodeIndex,
			Status:          structs.EvalStatusPending,
		}
		evals = append(evals, eval)
		evalIDs = append(evalIDs, eval.ID)
	}

	// Create an evaluation for each system job.
	for _, job := range sysJobs {
		// Still dedup on JobID as the node may already have the system job.
		if _, ok := jobIDs[job.ID]; ok {
			continue
		}
		jobIDs[job.ID] = struct{}{}

		// Create a new eval
		eval := &structs.Evaluation{
			ID:              uuid.Generate(),
			Namespace:       job.Namespace,
			Priority:        job.Priority,
			Type:            job.Type,
			TriggeredBy:     structs.EvalTriggerNodeUpdate,
			JobID:           job.ID,
			NodeID:          nodeID,
			NodeModifyIndex: nodeIndex,
			Status:          structs.EvalStatusPending,
		}
		evals = append(evals, eval)
		evalIDs = append(evalIDs, eval.ID)
	}

	// Create the Raft transaction
	update := &structs.EvalUpdateRequest{
		Evals:        evals,
		WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
	}

	// Commit this evaluation via Raft
	// XXX: There is a risk of partial failure where the node update succeeds
	// but that the EvalUpdate does not.
	_, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update)
	if err != nil {
		return nil, 0, err
	}
	return evalIDs, evalIndex, nil
}

// batchFuture is used to wait on a batch update to complete
type batchFuture struct {
	// doneCh is closed by Respond to release every caller blocked in Wait.
	doneCh chan struct{}
	// err and index hold the values handed to Respond.
	err   error
	index uint64
}

// NewBatchFuture creates a new batch future
func NewBatchFuture() *batchFuture {
	return &batchFuture{
		doneCh: make(chan struct{}),
	}
}

// Wait is used to block for the future to complete and returns the error
func (b *batchFuture) Wait() error {
	<-b.doneCh
	return b.err
}

// Index is used to return the index of the batch, only after Wait()
func (b *batchFuture) Index() uint64 {
	return b.index
}

// Respond is used to unblock the future. It records index and err and then
// closes the done channel, so it must be called at most once per future
// (closing an already closed channel panics).
func (b *batchFuture) Respond(index uint64, err error) {
	b.index = index
	b.err = err
	close(b.doneCh)
}

// DeriveVaultToken is used by the clients to request wrapped Vault tokens for
// tasks.
//
// Validation, Vault and Raft failures are reported through reply.Error (with
// a recoverable flag) and the method itself returns nil in those cases. The
// only non-nil return value is for a Vault Secret that carries no WrapInfo.
// On success reply.Tasks maps each requested task to its wrapped token and
// reply.Index is the Raft index at which the accessors were registered.
func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest,
	reply *structs.DeriveVaultTokenResponse) error {

	// setErr is a helper for setting the recoverable error on the reply and
	// logging it
	setErr := func(e error, recoverable bool) {
		if e == nil {
			return
		}
		reply.Error = structs.NewRecoverableError(e, recoverable).(*structs.RecoverableError)
		n.srv.logger.Printf("[ERR] nomad.client: DeriveVaultToken failed (recoverable %v): %v", recoverable, e)
	}

	if done, err := n.srv.forward("Node.DeriveVaultToken", args, args, reply); done {
		setErr(err, structs.IsRecoverable(err) || err == structs.ErrNoLeader)
		return nil
	}
	defer metrics.MeasureSince([]string{"nomad", "client", "derive_vault_token"}, time.Now())

	// Verify the arguments
	if args.NodeID == "" {
		setErr(fmt.Errorf("missing node ID"), false)
		return nil
	}
	if args.SecretID == "" {
		setErr(fmt.Errorf("missing node SecretID"), false)
		return nil
	}
	if args.AllocID == "" {
		setErr(fmt.Errorf("missing allocation ID"), false)
		return nil
	}
	if len(args.Tasks) == 0 {
		setErr(fmt.Errorf("no tasks specified"), false)
		return nil
	}

	// Verify the following:
	// * The Node exists and has the correct SecretID
	// * The Allocation exists on the specified node
	// * The allocation contains the given tasks and they each require Vault
	//   tokens
	snap, err := n.srv.fsm.State().Snapshot()
	if err != nil {
		setErr(err, false)
		return nil
	}
	ws := memdb.NewWatchSet()
	node, err := snap.NodeByID(ws, args.NodeID)
	if err != nil {
		setErr(err, false)
		return nil
	}
	if node == nil {
		setErr(fmt.Errorf("Node %q does not exist", args.NodeID), false)
		return nil
	}
	if node.SecretID != args.SecretID {
		setErr(fmt.Errorf("SecretID mismatch"), false)
		return nil
	}

	alloc, err := snap.AllocByID(ws, args.AllocID)
	if err != nil {
		setErr(err, false)
		return nil
	}
	if alloc == nil {
		setErr(fmt.Errorf("Allocation %q does not exist", args.AllocID), false)
		return nil
	}
	if alloc.NodeID != args.NodeID {
		setErr(fmt.Errorf("Allocation %q not running on Node %q", args.AllocID, args.NodeID), false)
		return nil
	}
	if alloc.TerminalStatus() {
		setErr(fmt.Errorf("Can't request Vault token for terminal allocation"), false)
		return nil
	}

	// Check the policies
	policies := alloc.Job.VaultPolicies()
	if policies == nil {
		setErr(fmt.Errorf("Job doesn't require Vault policies"), false)
		return nil
	}
	tg, ok := policies[alloc.TaskGroup]
	if !ok {
		setErr(fmt.Errorf("Task group does not require Vault policies"), false)
		return nil
	}

	// Reject the request if any requested task has no Vault policies
	var unneeded []string
	for _, task := range args.Tasks {
		taskVault := tg[task]
		if taskVault == nil || len(taskVault.Policies) == 0 {
			unneeded = append(unneeded, task)
		}
	}

	if len(unneeded) != 0 {
		e := fmt.Errorf("Requested Vault tokens for tasks without defined Vault policies: %s",
			strings.Join(unneeded, ", "))
		setErr(e, false)
		return nil
	}

	// At this point the request is valid and we should contact Vault for
	// tokens.

	// Create an error group where we will spin up a fixed set of goroutines to
	// handle deriving tokens but where if any fails the whole group is
	// canceled.
	g, ctx := errgroup.WithContext(context.Background())

	// Cap the handlers
	handlers := len(args.Tasks)
	if handlers > maxParallelRequestsPerDerive {
		handlers = maxParallelRequestsPerDerive
	}

	// Create the Vault Tokens
	input := make(chan string, handlers)
	results := make(map[string]*vapi.Secret, len(args.Tasks))

	// resultsLock guards results: several handlers may finish at the same
	// time and unsynchronized concurrent map writes are a data race that the
	// runtime can turn into a fatal error.
	var resultsLock sync.Mutex

	for i := 0; i < handlers; i++ {
		g.Go(func() error {
			for {
				select {
				case task, ok := <-input:
					if !ok {
						return nil
					}

					secret, err := n.srv.vault.CreateToken(ctx, alloc, task)
					if err != nil {
						wrapped := fmt.Sprintf("failed to create token for task %q on alloc %q: %v", task, alloc.ID, err)
						return structs.WrapRecoverable(wrapped, err)
					}

					resultsLock.Lock()
					results[task] = secret
					resultsLock.Unlock()
				case <-ctx.Done():
					return nil
				}
			}
		})
	}

	// Send the input
	go func() {
		defer close(input)
		for _, task := range args.Tasks {
			select {
			case <-ctx.Done():
				return
			case input <- task:
			}
		}

	}()

	// Wait for everything to complete or for an error. Once Wait returns all
	// handlers have exited, so results can be read without the lock.
	createErr := g.Wait()

	// Retrieve the results
	accessors := make([]*structs.VaultAccessor, 0, len(results))
	tokens := make(map[string]string, len(results))
	for task, secret := range results {
		w := secret.WrapInfo
		if w == nil {
			// NOTE(review): this returns before any revocation or accessor
			// registration happens for tokens that were already created;
			// confirm that is intended.
			return fmt.Errorf("Vault returned Secret without WrapInfo")
		}

		tokens[task] = w.Token
		accessor := &structs.VaultAccessor{
			Accessor:    w.WrappedAccessor,
			Task:        task,
			NodeID:      alloc.NodeID,
			AllocID:     alloc.ID,
			CreationTTL: w.TTL,
		}

		accessors = append(accessors, accessor)
	}

	// If there was an error revoke the created tokens
	if createErr != nil {
		n.srv.logger.Printf("[ERR] nomad.node: Vault token creation for alloc %q failed: %v", alloc.ID, createErr)

		if revokeErr := n.srv.vault.RevokeTokens(context.Background(), accessors, false); revokeErr != nil {
			n.srv.logger.Printf("[ERR] nomad.node: Vault token revocation for alloc %q failed: %v", alloc.ID, revokeErr)
		}

		// Preserve the recoverable flag if the error already carries one
		if rerr, ok := createErr.(*structs.RecoverableError); ok {
			reply.Error = rerr
		} else {
			reply.Error = structs.NewRecoverableError(createErr, false).(*structs.RecoverableError)
		}

		return nil
	}

	// Commit to Raft before returning any of the tokens
	req := structs.VaultAccessorsRequest{Accessors: accessors}
	_, index, err := n.srv.raftApply(structs.VaultAccessorRegisterRequestType, &req)
	if err != nil {
		n.srv.logger.Printf("[ERR] nomad.client: Register Vault accessors for alloc %q failed: %v", alloc.ID, err)

		// Determine if we can recover from the error
		retry := false
		switch err {
		case raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrEnqueueTimeout:
			retry = true
		}

		setErr(err, retry)
		return nil
	}

	reply.Index = index
	reply.Tasks = tokens
	n.srv.setQueryMeta(&reply.QueryMeta)
	return nil
}