github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/nomad/node_endpoint.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "time" 6 7 "github.com/armon/go-metrics" 8 "github.com/hashicorp/go-memdb" 9 "github.com/hashicorp/nomad/nomad/structs" 10 "github.com/hashicorp/nomad/nomad/watch" 11 ) 12 13 // Node endpoint is used for client interactions 14 type Node struct { 15 srv *Server 16 } 17 18 // Register is used to upsert a client that is available for scheduling 19 func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error { 20 if done, err := n.srv.forward("Node.Register", args, args, reply); done { 21 return err 22 } 23 defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now()) 24 25 // Validate the arguments 26 if args.Node == nil { 27 return fmt.Errorf("missing node for client registration") 28 } 29 if args.Node.ID == "" { 30 return fmt.Errorf("missing node ID for client registration") 31 } 32 if args.Node.Datacenter == "" { 33 return fmt.Errorf("missing datacenter for client registration") 34 } 35 if args.Node.Name == "" { 36 return fmt.Errorf("missing node name for client registration") 37 } 38 39 // Default the status if none is given 40 if args.Node.Status == "" { 41 args.Node.Status = structs.NodeStatusInit 42 } 43 if !structs.ValidNodeStatus(args.Node.Status) { 44 return fmt.Errorf("invalid status for node") 45 } 46 47 // Compute the node class 48 if err := args.Node.ComputeClass(); err != nil { 49 return fmt.Errorf("failed to computed node class: %v", err) 50 } 51 52 // Commit this update via Raft 53 _, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args) 54 if err != nil { 55 n.srv.logger.Printf("[ERR] nomad.client: Register failed: %v", err) 56 return err 57 } 58 reply.NodeModifyIndex = index 59 60 // Check if we should trigger evaluations 61 if structs.ShouldDrainNode(args.Node.Status) { 62 evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index) 63 if err != nil { 64 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 65 return err 66 } 67 reply.EvalIDs = evalIDs 68 reply.EvalCreateIndex = evalIndex 69 } 70 71 // Check if we need to setup a heartbeat 72 if !args.Node.TerminalStatus() { 73 ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID) 74 if err != nil { 75 n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) 76 return err 77 } 78 reply.HeartbeatTTL = ttl 79 } 80 81 // Set the reply index 82 reply.Index = index 83 return nil 84 } 85 86 // Deregister is used to remove a client from the client. If a client should 87 // just be made unavailable for scheduling, a status update is prefered. 88 func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error { 89 if done, err := n.srv.forward("Node.Deregister", args, args, reply); done { 90 return err 91 } 92 defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now()) 93 94 // Verify the arguments 95 if args.NodeID == "" { 96 return fmt.Errorf("missing node ID for client deregistration") 97 } 98 99 // Commit this update via Raft 100 _, index, err := n.srv.raftApply(structs.NodeDeregisterRequestType, args) 101 if err != nil { 102 n.srv.logger.Printf("[ERR] nomad.client: Deregister failed: %v", err) 103 return err 104 } 105 106 // Clear the heartbeat timer if any 107 n.srv.clearHeartbeatTimer(args.NodeID) 108 109 // Create the evaluations for this node 110 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 111 if err != nil { 112 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 113 return err 114 } 115 116 // Setup the reply 117 reply.EvalIDs = evalIDs 118 reply.EvalCreateIndex = evalIndex 119 reply.NodeModifyIndex = index 120 reply.Index = index 121 return nil 122 } 123 124 // UpdateStatus is used to update the status of a client node 125 func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error { 126 if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done { 127 return err 128 } 129 defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now()) 130 131 // Verify the arguments 132 if args.NodeID == "" { 133 return fmt.Errorf("missing node ID for client deregistration") 134 } 135 if !structs.ValidNodeStatus(args.Status) { 136 return fmt.Errorf("invalid status for node") 137 } 138 139 // Look for the node 140 snap, err := n.srv.fsm.State().Snapshot() 141 if err != nil { 142 return err 143 } 144 node, err := snap.NodeByID(args.NodeID) 145 if err != nil { 146 return err 147 } 148 if node == nil { 149 return fmt.Errorf("node not found") 150 } 151 152 // Commit this update via Raft 153 var index uint64 154 if node.Status != args.Status { 155 _, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args) 156 if err != nil { 157 n.srv.logger.Printf("[ERR] nomad.client: status update failed: %v", err) 158 return err 159 } 160 reply.NodeModifyIndex = index 161 } 162 163 // Check if we should trigger evaluations 164 initToReady := node.Status == structs.NodeStatusInit && args.Status == structs.NodeStatusReady 165 terminalToReady := node.Status == structs.NodeStatusDown && args.Status == structs.NodeStatusReady 166 transitionToReady := initToReady || terminalToReady 167 if structs.ShouldDrainNode(args.Status) || transitionToReady { 168 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 169 if err != nil { 170 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 171 return err 172 } 173 reply.EvalIDs = evalIDs 174 reply.EvalCreateIndex = evalIndex 175 } 176 177 // Check if we need to setup a heartbeat 178 if args.Status != structs.NodeStatusDown { 179 ttl, err := n.srv.resetHeartbeatTimer(args.NodeID) 180 if err != nil { 181 n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) 182 return err 183 } 184 reply.HeartbeatTTL = ttl 185 } 186 187 // Set the reply index 188 reply.Index = index 189 return nil 190 } 191 192 // UpdateDrain is used to update the drain mode of a client node 193 func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, 194 reply *structs.NodeDrainUpdateResponse) error { 195 if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done { 196 return err 197 } 198 defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now()) 199 200 // Verify the arguments 201 if args.NodeID == "" { 202 return fmt.Errorf("missing node ID for drain update") 203 } 204 205 // Look for the node 206 snap, err := n.srv.fsm.State().Snapshot() 207 if err != nil { 208 return err 209 } 210 node, err := snap.NodeByID(args.NodeID) 211 if err != nil { 212 return err 213 } 214 if node == nil { 215 return fmt.Errorf("node not found") 216 } 217 218 // Commit this update via Raft 219 var index uint64 220 if node.Drain != args.Drain { 221 _, index, err = n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) 222 if err != nil { 223 n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err) 224 return err 225 } 226 reply.NodeModifyIndex = index 227 } 228 229 // Check if we should trigger evaluations 230 if args.Drain { 231 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 232 if err != nil { 233 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 234 return err 235 } 236 reply.EvalIDs = evalIDs 237 reply.EvalCreateIndex = evalIndex 238 } 239 240 // Set the reply index 241 reply.Index = index 242 return nil 243 } 244 245 // Evaluate is used to force a re-evaluation of the node 246 func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error { 247 if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done { 248 return err 249 } 250 defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now()) 251 252 // Verify the arguments 253 if args.NodeID == "" { 254 return fmt.Errorf("missing node ID for evaluation") 255 } 256 257 // Look for the node 258 snap, err := n.srv.fsm.State().Snapshot() 259 if err != nil { 260 return err 261 } 262 node, err := snap.NodeByID(args.NodeID) 263 if err != nil { 264 return err 265 } 266 if node == nil { 267 return fmt.Errorf("node not found") 268 } 269 270 // Create the evaluation 271 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex) 272 if err != nil { 273 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 274 return err 275 } 276 reply.EvalIDs = evalIDs 277 reply.EvalCreateIndex = evalIndex 278 279 // Set the reply index 280 reply.Index = evalIndex 281 return nil 282 } 283 284 // GetNode is used to request information about a specific node 285 func (n *Node) GetNode(args *structs.NodeSpecificRequest, 286 reply *structs.SingleNodeResponse) error { 287 if done, err := n.srv.forward("Node.GetNode", args, args, reply); done { 288 return err 289 } 290 defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now()) 291 292 // Setup the blocking query 293 opts := blockingOptions{ 294 queryOpts: &args.QueryOptions, 295 queryMeta: &reply.QueryMeta, 296 watch: watch.NewItems(watch.Item{Node: args.NodeID}), 297 run: func() error { 298 // Verify the arguments 299 if args.NodeID == "" { 300 return fmt.Errorf("missing node ID") 301 } 302 303 // Look for the node 304 snap, err := n.srv.fsm.State().Snapshot() 305 if err != nil { 306 return err 307 } 308 out, err := snap.NodeByID(args.NodeID) 309 if err != nil { 310 return err 311 } 312 313 // Setup the output 314 reply.Node = out 315 if out != nil { 316 reply.Index = out.ModifyIndex 317 } else { 318 // Use the last index that affected the nodes table 319 index, err := snap.Index("nodes") 320 if err != nil { 321 return err 322 } 323 reply.Index = index 324 } 325 326 // Set the query response 327 n.srv.setQueryMeta(&reply.QueryMeta) 328 return nil 329 }} 330 return n.srv.blockingRPC(&opts) 331 } 332 333 // GetAllocs is used to request allocations for a specific node 334 func (n *Node) GetAllocs(args *structs.NodeSpecificRequest, 335 reply *structs.NodeAllocsResponse) error { 336 if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done { 337 return err 338 } 339 defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now()) 340 341 // Verify the arguments 342 if args.NodeID == "" { 343 return fmt.Errorf("missing node ID") 344 } 345 346 // Setup the blocking query 347 opts := blockingOptions{ 348 queryOpts: &args.QueryOptions, 349 queryMeta: &reply.QueryMeta, 350 watch: watch.NewItems(watch.Item{AllocNode: args.NodeID}), 351 run: func() error { 352 // Look for the node 353 snap, err := n.srv.fsm.State().Snapshot() 354 if err != nil { 355 return err 356 } 357 allocs, err := snap.AllocsByNode(args.NodeID) 358 if err != nil { 359 return err 360 } 361 362 // Setup the output 363 if len(allocs) != 0 { 364 reply.Allocs = allocs 365 for _, alloc := range allocs { 366 reply.Index = maxUint64(reply.Index, alloc.ModifyIndex) 367 } 368 } else { 369 reply.Allocs = nil 370 371 // Use the last index that affected the nodes table 372 index, err := snap.Index("allocs") 373 if err != nil { 374 return err 375 } 376 377 // Must provide non-zero index to prevent blocking 378 // Index 1 is impossible anyways (due to Raft internals) 379 if index == 0 { 380 reply.Index = 1 381 } else { 382 reply.Index = index 383 } 384 } 385 return nil 386 }} 387 return n.srv.blockingRPC(&opts) 388 } 389 390 // UpdateAlloc is used to update the client status of an allocation 391 func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error { 392 if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done { 393 return err 394 } 395 defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now()) 396 397 // Ensure only a single alloc 398 if len(args.Alloc) != 1 { 399 return fmt.Errorf("must update a single allocation") 400 } 401 402 // Commit this update via Raft 403 _, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, args) 404 if err != nil { 405 n.srv.logger.Printf("[ERR] nomad.client: alloc update failed: %v", err) 406 return err 407 } 408 409 // Setup the response 410 reply.Index = index 411 return nil 412 } 413 414 // List is used to list the available nodes 415 func (n *Node) List(args *structs.NodeListRequest, 416 reply *structs.NodeListResponse) error { 417 if done, err := n.srv.forward("Node.List", args, args, reply); done { 418 return err 419 } 420 defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now()) 421 422 // Setup the blocking query 423 opts := blockingOptions{ 424 queryOpts: &args.QueryOptions, 425 queryMeta: &reply.QueryMeta, 426 watch: watch.NewItems(watch.Item{Table: "nodes"}), 427 run: func() error { 428 // Capture all the nodes 429 snap, err := n.srv.fsm.State().Snapshot() 430 if err != nil { 431 return err 432 } 433 var iter memdb.ResultIterator 434 if prefix := args.QueryOptions.Prefix; prefix != "" { 435 iter, err = snap.NodesByIDPrefix(prefix) 436 } else { 437 iter, err = snap.Nodes() 438 } 439 if err != nil { 440 return err 441 } 442 443 var nodes []*structs.NodeListStub 444 for { 445 raw := iter.Next() 446 if raw == nil { 447 break 448 } 449 node := raw.(*structs.Node) 450 nodes = append(nodes, node.Stub()) 451 } 452 reply.Nodes = nodes 453 454 // Use the last index that affected the jobs table 455 index, err := snap.Index("nodes") 456 if err != nil { 457 return err 458 } 459 reply.Index = index 460 461 // Set the query response 462 n.srv.setQueryMeta(&reply.QueryMeta) 463 return nil 464 }} 465 return n.srv.blockingRPC(&opts) 466 } 467 468 // createNodeEvals is used to create evaluations for each alloc on a node. 469 // Each Eval is scoped to a job, so we need to potentially trigger many evals. 470 func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint64, error) { 471 // Snapshot the state 472 snap, err := n.srv.fsm.State().Snapshot() 473 if err != nil { 474 return nil, 0, fmt.Errorf("failed to snapshot state: %v", err) 475 } 476 477 // Find all the allocations for this node 478 allocs, err := snap.AllocsByNode(nodeID) 479 if err != nil { 480 return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err) 481 } 482 483 sysJobsIter, err := snap.JobsByScheduler("system") 484 if err != nil { 485 return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err) 486 } 487 488 var sysJobs []*structs.Job 489 for job := sysJobsIter.Next(); job != nil; job = sysJobsIter.Next() { 490 sysJobs = append(sysJobs, job.(*structs.Job)) 491 } 492 493 // Fast-path if nothing to do 494 if len(allocs) == 0 && len(sysJobs) == 0 { 495 return nil, 0, nil 496 } 497 498 // Create an eval for each JobID affected 499 var evals []*structs.Evaluation 500 var evalIDs []string 501 jobIDs := make(map[string]struct{}) 502 503 for _, alloc := range allocs { 504 // Deduplicate on JobID 505 if _, ok := jobIDs[alloc.JobID]; ok { 506 continue 507 } 508 jobIDs[alloc.JobID] = struct{}{} 509 510 // Create a new eval 511 eval := &structs.Evaluation{ 512 ID: structs.GenerateUUID(), 513 Priority: alloc.Job.Priority, 514 Type: alloc.Job.Type, 515 TriggeredBy: structs.EvalTriggerNodeUpdate, 516 JobID: alloc.JobID, 517 NodeID: nodeID, 518 NodeModifyIndex: nodeIndex, 519 Status: structs.EvalStatusPending, 520 } 521 evals = append(evals, eval) 522 evalIDs = append(evalIDs, eval.ID) 523 } 524 525 // Create an evaluation for each system job. 526 for _, job := range sysJobs { 527 // Still dedup on JobID as the node may already have the system job. 528 if _, ok := jobIDs[job.ID]; ok { 529 continue 530 } 531 jobIDs[job.ID] = struct{}{} 532 533 // Create a new eval 534 eval := &structs.Evaluation{ 535 ID: structs.GenerateUUID(), 536 Priority: job.Priority, 537 Type: job.Type, 538 TriggeredBy: structs.EvalTriggerNodeUpdate, 539 JobID: job.ID, 540 NodeID: nodeID, 541 NodeModifyIndex: nodeIndex, 542 Status: structs.EvalStatusPending, 543 } 544 evals = append(evals, eval) 545 evalIDs = append(evalIDs, eval.ID) 546 } 547 548 // Create the Raft transaction 549 update := &structs.EvalUpdateRequest{ 550 Evals: evals, 551 WriteRequest: structs.WriteRequest{Region: n.srv.config.Region}, 552 } 553 554 // Commit this evaluation via Raft 555 // XXX: There is a risk of partial failure where the node update succeeds 556 // but that the EvalUpdate does not. 557 _, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update) 558 if err != nil { 559 return nil, 0, err 560 } 561 return evalIDs, evalIndex, nil 562 }