github.com/huiliang/nomad@v0.2.1-0.20151124023127-7a8b664699ff/nomad/node_endpoint.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "time" 6 7 "github.com/armon/go-metrics" 8 "github.com/hashicorp/nomad/nomad/structs" 9 "github.com/hashicorp/nomad/nomad/watch" 10 ) 11 12 // Node endpoint is used for client interactions 13 type Node struct { 14 srv *Server 15 } 16 17 // Register is used to upsert a client that is available for scheduling 18 func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error { 19 if done, err := n.srv.forward("Node.Register", args, args, reply); done { 20 return err 21 } 22 defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now()) 23 24 // Validate the arguments 25 if args.Node == nil { 26 return fmt.Errorf("missing node for client registration") 27 } 28 if args.Node.ID == "" { 29 return fmt.Errorf("missing node ID for client registration") 30 } 31 if args.Node.Datacenter == "" { 32 return fmt.Errorf("missing datacenter for client registration") 33 } 34 if args.Node.Name == "" { 35 return fmt.Errorf("missing node name for client registration") 36 } 37 38 // Default the status if none is given 39 if args.Node.Status == "" { 40 args.Node.Status = structs.NodeStatusInit 41 } 42 if !structs.ValidNodeStatus(args.Node.Status) { 43 return fmt.Errorf("invalid status for node") 44 } 45 46 // Commit this update via Raft 47 _, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args) 48 if err != nil { 49 n.srv.logger.Printf("[ERR] nomad.client: Register failed: %v", err) 50 return err 51 } 52 reply.NodeModifyIndex = index 53 54 // Check if we should trigger evaluations 55 if structs.ShouldDrainNode(args.Node.Status) { 56 evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index) 57 if err != nil { 58 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 59 return err 60 } 61 reply.EvalIDs = evalIDs 62 reply.EvalCreateIndex = evalIndex 63 } 64 65 // Check if we need to setup a heartbeat 66 if !args.Node.TerminalStatus() { 67 ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID) 68 if err != nil { 69 n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) 70 return err 71 } 72 reply.HeartbeatTTL = ttl 73 } 74 75 // Set the reply index 76 reply.Index = index 77 return nil 78 } 79 80 // Deregister is used to remove a client from the client. If a client should 81 // just be made unavailable for scheduling, a status update is prefered. 82 func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error { 83 if done, err := n.srv.forward("Node.Deregister", args, args, reply); done { 84 return err 85 } 86 defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now()) 87 88 // Verify the arguments 89 if args.NodeID == "" { 90 return fmt.Errorf("missing node ID for client deregistration") 91 } 92 93 // Commit this update via Raft 94 _, index, err := n.srv.raftApply(structs.NodeDeregisterRequestType, args) 95 if err != nil { 96 n.srv.logger.Printf("[ERR] nomad.client: Deregister failed: %v", err) 97 return err 98 } 99 100 // Clear the heartbeat timer if any 101 n.srv.clearHeartbeatTimer(args.NodeID) 102 103 // Create the evaluations for this node 104 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 105 if err != nil { 106 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 107 return err 108 } 109 110 // Setup the reply 111 reply.EvalIDs = evalIDs 112 reply.EvalCreateIndex = evalIndex 113 reply.NodeModifyIndex = index 114 reply.Index = index 115 return nil 116 } 117 118 // UpdateStatus is used to update the status of a client node 119 func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error { 120 if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done { 121 return err 122 } 123 defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now()) 124 125 // Verify the arguments 126 if args.NodeID == "" { 127 return fmt.Errorf("missing node ID for client deregistration") 128 } 129 if !structs.ValidNodeStatus(args.Status) { 130 return fmt.Errorf("invalid status for node") 131 } 132 133 // Look for the node 134 snap, err := n.srv.fsm.State().Snapshot() 135 if err != nil { 136 return err 137 } 138 node, err := snap.NodeByID(args.NodeID) 139 if err != nil { 140 return err 141 } 142 if node == nil { 143 return fmt.Errorf("node not found") 144 } 145 146 // Commit this update via Raft 147 var index uint64 148 if node.Status != args.Status { 149 _, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args) 150 if err != nil { 151 n.srv.logger.Printf("[ERR] nomad.client: status update failed: %v", err) 152 return err 153 } 154 reply.NodeModifyIndex = index 155 } 156 157 // Check if we should trigger evaluations 158 initToReady := node.Status == structs.NodeStatusInit && args.Status == structs.NodeStatusReady 159 terminalToReady := node.Status == structs.NodeStatusDown && args.Status == structs.NodeStatusReady 160 transitionToReady := initToReady || terminalToReady 161 if structs.ShouldDrainNode(args.Status) || transitionToReady { 162 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 163 if err != nil { 164 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 165 return err 166 } 167 reply.EvalIDs = evalIDs 168 reply.EvalCreateIndex = evalIndex 169 } 170 171 // Check if we need to setup a heartbeat 172 if args.Status != structs.NodeStatusDown { 173 ttl, err := n.srv.resetHeartbeatTimer(args.NodeID) 174 if err != nil { 175 n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) 176 return err 177 } 178 reply.HeartbeatTTL = ttl 179 } 180 181 // Set the reply index 182 reply.Index = index 183 return nil 184 } 185 186 // UpdateDrain is used to update the drain mode of a client node 187 func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, 188 reply *structs.NodeDrainUpdateResponse) error { 189 if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done { 190 return err 191 } 192 defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now()) 193 194 // Verify the arguments 195 if args.NodeID == "" { 196 return fmt.Errorf("missing node ID for drain update") 197 } 198 199 // Look for the node 200 snap, err := n.srv.fsm.State().Snapshot() 201 if err != nil { 202 return err 203 } 204 node, err := snap.NodeByID(args.NodeID) 205 if err != nil { 206 return err 207 } 208 if node == nil { 209 return fmt.Errorf("node not found") 210 } 211 212 // Commit this update via Raft 213 var index uint64 214 if node.Drain != args.Drain { 215 _, index, err = n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) 216 if err != nil { 217 n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err) 218 return err 219 } 220 reply.NodeModifyIndex = index 221 } 222 223 // Check if we should trigger evaluations 224 if args.Drain { 225 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 226 if err != nil { 227 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 228 return err 229 } 230 reply.EvalIDs = evalIDs 231 reply.EvalCreateIndex = evalIndex 232 } 233 234 // Set the reply index 235 reply.Index = index 236 return nil 237 } 238 239 // Evaluate is used to force a re-evaluation of the node 240 func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error { 241 if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done { 242 return err 243 } 244 defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now()) 245 246 // Verify the arguments 247 if args.NodeID == "" { 248 return fmt.Errorf("missing node ID for evaluation") 249 } 250 251 // Look for the node 252 snap, err := n.srv.fsm.State().Snapshot() 253 if err != nil { 254 return err 255 } 256 node, err := snap.NodeByID(args.NodeID) 257 if err != nil { 258 return err 259 } 260 if node == nil { 261 return fmt.Errorf("node not found") 262 } 263 264 // Create the evaluation 265 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex) 266 if err != nil { 267 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 268 return err 269 } 270 reply.EvalIDs = evalIDs 271 reply.EvalCreateIndex = evalIndex 272 273 // Set the reply index 274 reply.Index = evalIndex 275 return nil 276 } 277 278 // GetNode is used to request information about a specific node 279 func (n *Node) GetNode(args *structs.NodeSpecificRequest, 280 reply *structs.SingleNodeResponse) error { 281 if done, err := n.srv.forward("Node.GetNode", args, args, reply); done { 282 return err 283 } 284 defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now()) 285 286 // Setup the blocking query 287 opts := blockingOptions{ 288 queryOpts: &args.QueryOptions, 289 queryMeta: &reply.QueryMeta, 290 watch: watch.NewItems(watch.Item{Node: args.NodeID}), 291 run: func() error { 292 // Verify the arguments 293 if args.NodeID == "" { 294 return fmt.Errorf("missing node ID") 295 } 296 297 // Look for the node 298 snap, err := n.srv.fsm.State().Snapshot() 299 if err != nil { 300 return err 301 } 302 out, err := snap.NodeByID(args.NodeID) 303 if err != nil { 304 return err 305 } 306 307 // Setup the output 308 reply.Node = out 309 if out != nil { 310 reply.Index = out.ModifyIndex 311 } else { 312 // Use the last index that affected the nodes table 313 index, err := snap.Index("nodes") 314 if err != nil { 315 return err 316 } 317 reply.Index = index 318 } 319 320 // Set the query response 321 n.srv.setQueryMeta(&reply.QueryMeta) 322 return nil 323 }} 324 return n.srv.blockingRPC(&opts) 325 } 326 327 // GetAllocs is used to request allocations for a specific node 328 func (n *Node) GetAllocs(args *structs.NodeSpecificRequest, 329 reply *structs.NodeAllocsResponse) error { 330 if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done { 331 return err 332 } 333 defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now()) 334 335 // Verify the arguments 336 if args.NodeID == "" { 337 return fmt.Errorf("missing node ID") 338 } 339 340 // Setup the blocking query 341 opts := blockingOptions{ 342 queryOpts: &args.QueryOptions, 343 queryMeta: &reply.QueryMeta, 344 watch: watch.NewItems(watch.Item{AllocNode: args.NodeID}), 345 run: func() error { 346 // Look for the node 347 snap, err := n.srv.fsm.State().Snapshot() 348 if err != nil { 349 return err 350 } 351 allocs, err := snap.AllocsByNode(args.NodeID) 352 if err != nil { 353 return err 354 } 355 356 // Setup the output 357 if len(allocs) != 0 { 358 reply.Allocs = allocs 359 for _, alloc := range allocs { 360 reply.Index = maxUint64(reply.Index, alloc.ModifyIndex) 361 } 362 } else { 363 reply.Allocs = nil 364 365 // Use the last index that affected the nodes table 366 index, err := snap.Index("allocs") 367 if err != nil { 368 return err 369 } 370 371 // Must provide non-zero index to prevent blocking 372 // Index 1 is impossible anyways (due to Raft internals) 373 if index == 0 { 374 reply.Index = 1 375 } else { 376 reply.Index = index 377 } 378 } 379 return nil 380 }} 381 return n.srv.blockingRPC(&opts) 382 } 383 384 // UpdateAlloc is used to update the client status of an allocation 385 func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error { 386 if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done { 387 return err 388 } 389 defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now()) 390 391 // Ensure only a single alloc 392 if len(args.Alloc) != 1 { 393 return fmt.Errorf("must update a single allocation") 394 } 395 396 // Commit this update via Raft 397 _, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, args) 398 if err != nil { 399 n.srv.logger.Printf("[ERR] nomad.client: alloc update failed: %v", err) 400 return err 401 } 402 403 // Setup the response 404 reply.Index = index 405 return nil 406 } 407 408 // List is used to list the available nodes 409 func (n *Node) List(args *structs.NodeListRequest, 410 reply *structs.NodeListResponse) error { 411 if done, err := n.srv.forward("Node.List", args, args, reply); done { 412 return err 413 } 414 defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now()) 415 416 // Setup the blocking query 417 opts := blockingOptions{ 418 queryOpts: &args.QueryOptions, 419 queryMeta: &reply.QueryMeta, 420 watch: watch.NewItems(watch.Item{Table: "nodes"}), 421 run: func() error { 422 // Capture all the nodes 423 snap, err := n.srv.fsm.State().Snapshot() 424 if err != nil { 425 return err 426 } 427 iter, err := snap.Nodes() 428 if err != nil { 429 return err 430 } 431 432 var nodes []*structs.NodeListStub 433 for { 434 raw := iter.Next() 435 if raw == nil { 436 break 437 } 438 node := raw.(*structs.Node) 439 nodes = append(nodes, node.Stub()) 440 } 441 reply.Nodes = nodes 442 443 // Use the last index that affected the jobs table 444 index, err := snap.Index("nodes") 445 if err != nil { 446 return err 447 } 448 reply.Index = index 449 450 // Set the query response 451 n.srv.setQueryMeta(&reply.QueryMeta) 452 return nil 453 }} 454 return n.srv.blockingRPC(&opts) 455 } 456 457 // createNodeEvals is used to create evaluations for each alloc on a node. 458 // Each Eval is scoped to a job, so we need to potentially trigger many evals. 459 func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint64, error) { 460 // Snapshot the state 461 snap, err := n.srv.fsm.State().Snapshot() 462 if err != nil { 463 return nil, 0, fmt.Errorf("failed to snapshot state: %v", err) 464 } 465 466 // Find all the allocations for this node 467 allocs, err := snap.AllocsByNode(nodeID) 468 if err != nil { 469 return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err) 470 } 471 472 sysJobsIter, err := snap.JobsByScheduler("system") 473 if err != nil { 474 return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err) 475 } 476 477 var sysJobs []*structs.Job 478 for job := sysJobsIter.Next(); job != nil; job = sysJobsIter.Next() { 479 sysJobs = append(sysJobs, job.(*structs.Job)) 480 } 481 482 // Fast-path if nothing to do 483 if len(allocs) == 0 && len(sysJobs) == 0 { 484 return nil, 0, nil 485 } 486 487 // Create an eval for each JobID affected 488 var evals []*structs.Evaluation 489 var evalIDs []string 490 jobIDs := make(map[string]struct{}) 491 492 for _, alloc := range allocs { 493 // Deduplicate on JobID 494 if _, ok := jobIDs[alloc.JobID]; ok { 495 continue 496 } 497 jobIDs[alloc.JobID] = struct{}{} 498 499 // Create a new eval 500 eval := &structs.Evaluation{ 501 ID: structs.GenerateUUID(), 502 Priority: alloc.Job.Priority, 503 Type: alloc.Job.Type, 504 TriggeredBy: structs.EvalTriggerNodeUpdate, 505 JobID: alloc.JobID, 506 NodeID: nodeID, 507 NodeModifyIndex: nodeIndex, 508 Status: structs.EvalStatusPending, 509 } 510 evals = append(evals, eval) 511 evalIDs = append(evalIDs, eval.ID) 512 } 513 514 // Create an evaluation for each system job. 515 for _, job := range sysJobs { 516 // Still dedup on JobID as the node may already have the system job. 517 if _, ok := jobIDs[job.ID]; ok { 518 continue 519 } 520 jobIDs[job.ID] = struct{}{} 521 522 // Create a new eval 523 eval := &structs.Evaluation{ 524 ID: structs.GenerateUUID(), 525 Priority: job.Priority, 526 Type: job.Type, 527 TriggeredBy: structs.EvalTriggerNodeUpdate, 528 JobID: job.ID, 529 NodeID: nodeID, 530 NodeModifyIndex: nodeIndex, 531 Status: structs.EvalStatusPending, 532 } 533 evals = append(evals, eval) 534 evalIDs = append(evalIDs, eval.ID) 535 } 536 537 // Create the Raft transaction 538 update := &structs.EvalUpdateRequest{ 539 Evals: evals, 540 WriteRequest: structs.WriteRequest{Region: n.srv.config.Region}, 541 } 542 543 // Commit this evaluation via Raft 544 // XXX: There is a risk of partial failure where the node update succeeds 545 // but that the EvalUpdate does not. 546 _, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update) 547 if err != nil { 548 return nil, 0, err 549 } 550 return evalIDs, evalIndex, nil 551 }