github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/nomad/node_endpoint.go (about) 1 package nomad 2 3 import ( 4 "fmt" 5 "time" 6 7 "github.com/armon/go-metrics" 8 "github.com/hashicorp/nomad/nomad/structs" 9 ) 10 11 // Node endpoint is used for client interactions 12 type Node struct { 13 srv *Server 14 } 15 16 // Register is used to upsert a client that is available for scheduling 17 func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUpdateResponse) error { 18 if done, err := n.srv.forward("Node.Register", args, args, reply); done { 19 return err 20 } 21 defer metrics.MeasureSince([]string{"nomad", "client", "register"}, time.Now()) 22 23 // Validate the arguments 24 if args.Node == nil { 25 return fmt.Errorf("missing node for client registration") 26 } 27 if args.Node.ID == "" { 28 return fmt.Errorf("missing node ID for client registration") 29 } 30 if args.Node.Datacenter == "" { 31 return fmt.Errorf("missing datacenter for client registration") 32 } 33 if args.Node.Name == "" { 34 return fmt.Errorf("missing node name for client registration") 35 } 36 37 // Default the status if none is given 38 if args.Node.Status == "" { 39 args.Node.Status = structs.NodeStatusInit 40 } 41 if !structs.ValidNodeStatus(args.Node.Status) { 42 return fmt.Errorf("invalid status for node") 43 } 44 45 // Commit this update via Raft 46 _, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args) 47 if err != nil { 48 n.srv.logger.Printf("[ERR] nomad.client: Register failed: %v", err) 49 return err 50 } 51 reply.NodeModifyIndex = index 52 53 // Check if we should trigger evaluations 54 if structs.ShouldDrainNode(args.Node.Status) { 55 evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index) 56 if err != nil { 57 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 58 return err 59 } 60 reply.EvalIDs = evalIDs 61 reply.EvalCreateIndex = evalIndex 62 } 63 64 // Check if we need to setup a heartbeat 65 if !args.Node.TerminalStatus() { 66 ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID) 67 if err != nil { 68 n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) 69 return err 70 } 71 reply.HeartbeatTTL = ttl 72 } 73 74 // Set the reply index 75 reply.Index = index 76 return nil 77 } 78 79 // Deregister is used to remove a client from the client. If a client should 80 // just be made unavailable for scheduling, a status update is prefered. 81 func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.NodeUpdateResponse) error { 82 if done, err := n.srv.forward("Node.Deregister", args, args, reply); done { 83 return err 84 } 85 defer metrics.MeasureSince([]string{"nomad", "client", "deregister"}, time.Now()) 86 87 // Verify the arguments 88 if args.NodeID == "" { 89 return fmt.Errorf("missing node ID for client deregistration") 90 } 91 92 // Commit this update via Raft 93 _, index, err := n.srv.raftApply(structs.NodeDeregisterRequestType, args) 94 if err != nil { 95 n.srv.logger.Printf("[ERR] nomad.client: Deregister failed: %v", err) 96 return err 97 } 98 99 // Clear the heartbeat timer if any 100 n.srv.clearHeartbeatTimer(args.NodeID) 101 102 // Create the evaluations for this node 103 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 104 if err != nil { 105 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 106 return err 107 } 108 109 // Setup the reply 110 reply.EvalIDs = evalIDs 111 reply.EvalCreateIndex = evalIndex 112 reply.NodeModifyIndex = index 113 reply.Index = index 114 return nil 115 } 116 117 // UpdateStatus is used to update the status of a client node 118 func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *structs.NodeUpdateResponse) error { 119 if done, err := n.srv.forward("Node.UpdateStatus", args, args, reply); done { 120 return err 121 } 122 defer metrics.MeasureSince([]string{"nomad", "client", "update_status"}, time.Now()) 123 124 // Verify the arguments 125 if args.NodeID == "" { 126 return fmt.Errorf("missing node ID for client deregistration") 127 } 128 if !structs.ValidNodeStatus(args.Status) { 129 return fmt.Errorf("invalid status for node") 130 } 131 132 // Look for the node 133 snap, err := n.srv.fsm.State().Snapshot() 134 if err != nil { 135 return err 136 } 137 node, err := snap.NodeByID(args.NodeID) 138 if err != nil { 139 return err 140 } 141 if node == nil { 142 return fmt.Errorf("node not found") 143 } 144 145 // Commit this update via Raft 146 var index uint64 147 if node.Status != args.Status { 148 _, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args) 149 if err != nil { 150 n.srv.logger.Printf("[ERR] nomad.client: status update failed: %v", err) 151 return err 152 } 153 reply.NodeModifyIndex = index 154 } 155 156 // Check if we should trigger evaluations 157 initToReady := node.Status == structs.NodeStatusInit && args.Status == structs.NodeStatusReady 158 terminalToReady := node.Status == structs.NodeStatusDown && args.Status == structs.NodeStatusReady 159 transitionToReady := initToReady || terminalToReady 160 if structs.ShouldDrainNode(args.Status) || transitionToReady { 161 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 162 if err != nil { 163 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 164 return err 165 } 166 reply.EvalIDs = evalIDs 167 reply.EvalCreateIndex = evalIndex 168 } 169 170 // Check if we need to setup a heartbeat 171 if args.Status != structs.NodeStatusDown { 172 ttl, err := n.srv.resetHeartbeatTimer(args.NodeID) 173 if err != nil { 174 n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) 175 return err 176 } 177 reply.HeartbeatTTL = ttl 178 } 179 180 // Set the reply index 181 reply.Index = index 182 return nil 183 } 184 185 // UpdateDrain is used to update the drain mode of a client node 186 func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, 187 reply *structs.NodeDrainUpdateResponse) error { 188 if done, err := n.srv.forward("Node.UpdateDrain", args, args, reply); done { 189 return err 190 } 191 defer metrics.MeasureSince([]string{"nomad", "client", "update_drain"}, time.Now()) 192 193 // Verify the arguments 194 if args.NodeID == "" { 195 return fmt.Errorf("missing node ID for drain update") 196 } 197 198 // Look for the node 199 snap, err := n.srv.fsm.State().Snapshot() 200 if err != nil { 201 return err 202 } 203 node, err := snap.NodeByID(args.NodeID) 204 if err != nil { 205 return err 206 } 207 if node == nil { 208 return fmt.Errorf("node not found") 209 } 210 211 // Commit this update via Raft 212 var index uint64 213 if node.Drain != args.Drain { 214 _, index, err = n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) 215 if err != nil { 216 n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err) 217 return err 218 } 219 reply.NodeModifyIndex = index 220 } 221 222 // Check if we should trigger evaluations 223 if args.Drain { 224 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) 225 if err != nil { 226 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 227 return err 228 } 229 reply.EvalIDs = evalIDs 230 reply.EvalCreateIndex = evalIndex 231 } 232 233 // Set the reply index 234 reply.Index = index 235 return nil 236 } 237 238 // Evaluate is used to force a re-evaluation of the node 239 func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUpdateResponse) error { 240 if done, err := n.srv.forward("Node.Evaluate", args, args, reply); done { 241 return err 242 } 243 defer metrics.MeasureSince([]string{"nomad", "client", "evaluate"}, time.Now()) 244 245 // Verify the arguments 246 if args.NodeID == "" { 247 return fmt.Errorf("missing node ID for evaluation") 248 } 249 250 // Look for the node 251 snap, err := n.srv.fsm.State().Snapshot() 252 if err != nil { 253 return err 254 } 255 node, err := snap.NodeByID(args.NodeID) 256 if err != nil { 257 return err 258 } 259 if node == nil { 260 return fmt.Errorf("node not found") 261 } 262 263 // Create the evaluation 264 evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex) 265 if err != nil { 266 n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) 267 return err 268 } 269 reply.EvalIDs = evalIDs 270 reply.EvalCreateIndex = evalIndex 271 272 // Set the reply index 273 reply.Index = evalIndex 274 return nil 275 } 276 277 // GetNode is used to request information about a specific node 278 func (n *Node) GetNode(args *structs.NodeSpecificRequest, 279 reply *structs.SingleNodeResponse) error { 280 if done, err := n.srv.forward("Node.GetNode", args, args, reply); done { 281 return err 282 } 283 defer metrics.MeasureSince([]string{"nomad", "client", "get_node"}, time.Now()) 284 285 // Verify the arguments 286 if args.NodeID == "" { 287 return fmt.Errorf("missing node ID") 288 } 289 290 // Look for the node 291 snap, err := n.srv.fsm.State().Snapshot() 292 if err != nil { 293 return err 294 } 295 out, err := snap.NodeByID(args.NodeID) 296 if err != nil { 297 return err 298 } 299 300 // Setup the output 301 if out != nil { 302 reply.Node = out 303 reply.Index = out.ModifyIndex 304 } else { 305 // Use the last index that affected the nodes table 306 index, err := snap.Index("nodes") 307 if err != nil { 308 return err 309 } 310 reply.Index = index 311 } 312 313 // Set the query response 314 n.srv.setQueryMeta(&reply.QueryMeta) 315 return nil 316 } 317 318 // GetAllocs is used to request allocations for a specific node 319 func (n *Node) GetAllocs(args *structs.NodeSpecificRequest, 320 reply *structs.NodeAllocsResponse) error { 321 if done, err := n.srv.forward("Node.GetAllocs", args, args, reply); done { 322 return err 323 } 324 defer metrics.MeasureSince([]string{"nomad", "client", "get_allocs"}, time.Now()) 325 326 // Verify the arguments 327 if args.NodeID == "" { 328 return fmt.Errorf("missing node ID") 329 } 330 331 // Setup the blocking query 332 opts := blockingOptions{ 333 queryOpts: &args.QueryOptions, 334 queryMeta: &reply.QueryMeta, 335 allocWatch: args.NodeID, 336 run: func() error { 337 // Look for the node 338 snap, err := n.srv.fsm.State().Snapshot() 339 if err != nil { 340 return err 341 } 342 allocs, err := snap.AllocsByNode(args.NodeID) 343 if err != nil { 344 return err 345 } 346 347 // Setup the output 348 if len(allocs) != 0 { 349 reply.Allocs = allocs 350 for _, alloc := range allocs { 351 reply.Index = maxUint64(reply.Index, alloc.ModifyIndex) 352 } 353 } else { 354 reply.Allocs = nil 355 356 // Use the last index that affected the nodes table 357 index, err := snap.Index("allocs") 358 if err != nil { 359 return err 360 } 361 362 // Must provide non-zero index to prevent blocking 363 // Index 1 is impossible anyways (due to Raft internals) 364 if index == 0 { 365 reply.Index = 1 366 } else { 367 reply.Index = index 368 } 369 } 370 return nil 371 }} 372 return n.srv.blockingRPC(&opts) 373 } 374 375 // UpdateAlloc is used to update the client status of an allocation 376 func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.GenericResponse) error { 377 if done, err := n.srv.forward("Node.UpdateAlloc", args, args, reply); done { 378 return err 379 } 380 defer metrics.MeasureSince([]string{"nomad", "client", "update_alloc"}, time.Now()) 381 382 // Ensure only a single alloc 383 if len(args.Alloc) != 1 { 384 return fmt.Errorf("must update a single allocation") 385 } 386 387 // Commit this update via Raft 388 _, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, args) 389 if err != nil { 390 n.srv.logger.Printf("[ERR] nomad.client: alloc update failed: %v", err) 391 return err 392 } 393 394 // Setup the response 395 reply.Index = index 396 return nil 397 } 398 399 // List is used to list the available nodes 400 func (n *Node) List(args *structs.NodeListRequest, 401 reply *structs.NodeListResponse) error { 402 if done, err := n.srv.forward("Node.List", args, args, reply); done { 403 return err 404 } 405 defer metrics.MeasureSince([]string{"nomad", "client", "list"}, time.Now()) 406 407 // Capture all the nodes 408 snap, err := n.srv.fsm.State().Snapshot() 409 if err != nil { 410 return err 411 } 412 iter, err := snap.Nodes() 413 if err != nil { 414 return err 415 } 416 417 for { 418 raw := iter.Next() 419 if raw == nil { 420 break 421 } 422 node := raw.(*structs.Node) 423 reply.Nodes = append(reply.Nodes, node.Stub()) 424 } 425 426 // Use the last index that affected the jobs table 427 index, err := snap.Index("nodes") 428 if err != nil { 429 return err 430 } 431 reply.Index = index 432 433 // Set the query response 434 n.srv.setQueryMeta(&reply.QueryMeta) 435 return nil 436 } 437 438 // createNodeEvals is used to create evaluations for each alloc on a node. 439 // Each Eval is scoped to a job, so we need to potentially trigger many evals. 440 func (n *Node) createNodeEvals(nodeID string, nodeIndex uint64) ([]string, uint64, error) { 441 // Snapshot the state 442 snap, err := n.srv.fsm.State().Snapshot() 443 if err != nil { 444 return nil, 0, fmt.Errorf("failed to snapshot state: %v", err) 445 } 446 447 // Find all the allocations for this node 448 allocs, err := snap.AllocsByNode(nodeID) 449 if err != nil { 450 return nil, 0, fmt.Errorf("failed to find allocs for '%s': %v", nodeID, err) 451 } 452 453 sysJobsIter, err := snap.JobsByScheduler("system") 454 if err != nil { 455 return nil, 0, fmt.Errorf("failed to find system jobs for '%s': %v", nodeID, err) 456 } 457 458 var sysJobs []*structs.Job 459 for job := sysJobsIter.Next(); job != nil; job = sysJobsIter.Next() { 460 sysJobs = append(sysJobs, job.(*structs.Job)) 461 } 462 463 // Fast-path if nothing to do 464 if len(allocs) == 0 && len(sysJobs) == 0 { 465 return nil, 0, nil 466 } 467 468 // Create an eval for each JobID affected 469 var evals []*structs.Evaluation 470 var evalIDs []string 471 jobIDs := make(map[string]struct{}) 472 473 for _, alloc := range allocs { 474 // Deduplicate on JobID 475 if _, ok := jobIDs[alloc.JobID]; ok { 476 continue 477 } 478 jobIDs[alloc.JobID] = struct{}{} 479 480 // Create a new eval 481 eval := &structs.Evaluation{ 482 ID: structs.GenerateUUID(), 483 Priority: alloc.Job.Priority, 484 Type: alloc.Job.Type, 485 TriggeredBy: structs.EvalTriggerNodeUpdate, 486 JobID: alloc.JobID, 487 NodeID: nodeID, 488 NodeModifyIndex: nodeIndex, 489 Status: structs.EvalStatusPending, 490 } 491 evals = append(evals, eval) 492 evalIDs = append(evalIDs, eval.ID) 493 } 494 495 // Create an evaluation for each system job. 496 for _, job := range sysJobs { 497 // Still dedup on JobID as the node may already have the system job. 498 if _, ok := jobIDs[job.ID]; ok { 499 continue 500 } 501 jobIDs[job.ID] = struct{}{} 502 503 // Create a new eval 504 eval := &structs.Evaluation{ 505 ID: structs.GenerateUUID(), 506 Priority: job.Priority, 507 Type: job.Type, 508 TriggeredBy: structs.EvalTriggerNodeUpdate, 509 JobID: job.ID, 510 NodeID: nodeID, 511 NodeModifyIndex: nodeIndex, 512 Status: structs.EvalStatusPending, 513 } 514 evals = append(evals, eval) 515 evalIDs = append(evalIDs, eval.ID) 516 } 517 518 // Create the Raft transaction 519 update := &structs.EvalUpdateRequest{ 520 Evals: evals, 521 WriteRequest: structs.WriteRequest{Region: n.srv.config.Region}, 522 } 523 524 // Commit this evaluation via Raft 525 // XXX: There is a risk of partial failure where the node update succeeds 526 // but that the EvalUpdate does not. 527 _, evalIndex, err := n.srv.raftApply(structs.EvalUpdateRequestType, update) 528 if err != nil { 529 return nil, 0, err 530 } 531 return evalIDs, evalIndex, nil 532 }