github.com/jmitchell/nomad@v0.1.3-0.20151007230021-7ab84c2862d8/nomad/structs/structs.go (about) 1 package structs 2 3 import ( 4 "bytes" 5 "errors" 6 "fmt" 7 "strings" 8 "time" 9 10 "github.com/hashicorp/go-msgpack/codec" 11 "github.com/hashicorp/go-multierror" 12 ) 13 14 var ( 15 ErrNoLeader = fmt.Errorf("No cluster leader") 16 ErrNoRegionPath = fmt.Errorf("No path to region") 17 ) 18 19 type MessageType uint8 20 21 const ( 22 NodeRegisterRequestType MessageType = iota 23 NodeDeregisterRequestType 24 NodeUpdateStatusRequestType 25 NodeUpdateDrainRequestType 26 JobRegisterRequestType 27 JobDeregisterRequestType 28 EvalUpdateRequestType 29 EvalDeleteRequestType 30 AllocUpdateRequestType 31 AllocClientUpdateRequestType 32 ) 33 34 const ( 35 // IgnoreUnknownTypeFlag is set along with a MessageType 36 // to indicate that the message type can be safely ignored 37 // if it is not recognized. This is for future proofing, so 38 // that new commands can be added in a way that won't cause 39 // old servers to crash when the FSM attempts to process them. 40 IgnoreUnknownTypeFlag MessageType = 128 41 ) 42 43 // RPCInfo is used to describe common information about query 44 type RPCInfo interface { 45 RequestRegion() string 46 IsRead() bool 47 AllowStaleRead() bool 48 } 49 50 // QueryOptions is used to specify various flags for read queries 51 type QueryOptions struct { 52 // The target region for this query 53 Region string 54 55 // If set, wait until query exceeds given index. Must be provided 56 // with MaxQueryTime. 57 MinQueryIndex uint64 58 59 // Provided with MinQueryIndex to wait for change. 60 MaxQueryTime time.Duration 61 62 // If set, any follower can service the request. Results 63 // may be arbitrarily stale. 64 AllowStale bool 65 } 66 67 func (q QueryOptions) RequestRegion() string { 68 return q.Region 69 } 70 71 // QueryOption only applies to reads, so always true 72 func (q QueryOptions) IsRead() bool { 73 return true 74 } 75 76 func (q QueryOptions) AllowStaleRead() bool { 77 return q.AllowStale 78 } 79 80 type WriteRequest struct { 81 // The target region for this write 82 Region string 83 } 84 85 func (w WriteRequest) RequestRegion() string { 86 // The target region for this request 87 return w.Region 88 } 89 90 // WriteRequest only applies to writes, always false 91 func (w WriteRequest) IsRead() bool { 92 return false 93 } 94 95 func (w WriteRequest) AllowStaleRead() bool { 96 return false 97 } 98 99 // QueryMeta allows a query response to include potentially 100 // useful metadata about a query 101 type QueryMeta struct { 102 // This is the index associated with the read 103 Index uint64 104 105 // If AllowStale is used, this is time elapsed since 106 // last contact between the follower and leader. This 107 // can be used to gauge staleness. 108 LastContact time.Duration 109 110 // Used to indicate if there is a known leader node 111 KnownLeader bool 112 } 113 114 // WriteMeta allows a write response to includ e potentially 115 // useful metadata about the write 116 type WriteMeta struct { 117 // This is the index associated with the write 118 Index uint64 119 } 120 121 // NodeRegisterRequest is used for Node.Register endpoint 122 // to register a node as being a schedulable entity. 123 type NodeRegisterRequest struct { 124 Node *Node 125 WriteRequest 126 } 127 128 // NodeDeregisterRequest is used for Node.Deregister endpoint 129 // to deregister a node as being a schedulable entity. 130 type NodeDeregisterRequest struct { 131 NodeID string 132 WriteRequest 133 } 134 135 // NodeUpdateStatusRequest is used for Node.UpdateStatus endpoint 136 // to update the status of a node. 137 type NodeUpdateStatusRequest struct { 138 NodeID string 139 Status string 140 WriteRequest 141 } 142 143 // NodeUpdateDrainRequest is used for updatin the drain status 144 type NodeUpdateDrainRequest struct { 145 NodeID string 146 Drain bool 147 WriteRequest 148 } 149 150 // NodeEvaluateRequest is used to re-evaluate the ndoe 151 type NodeEvaluateRequest struct { 152 NodeID string 153 WriteRequest 154 } 155 156 // NodeSpecificRequest is used when we just need to specify a target node 157 type NodeSpecificRequest struct { 158 NodeID string 159 QueryOptions 160 } 161 162 // JobRegisterRequest is used for Job.Register endpoint 163 // to register a job as being a schedulable entity. 164 type JobRegisterRequest struct { 165 Job *Job 166 WriteRequest 167 } 168 169 // JobDeregisterRequest is used for Job.Deregister endpoint 170 // to deregister a job as being a schedulable entity. 171 type JobDeregisterRequest struct { 172 JobID string 173 WriteRequest 174 } 175 176 // JobEvaluateRequest is used when we just need to re-evaluate a target job 177 type JobEvaluateRequest struct { 178 JobID string 179 WriteRequest 180 } 181 182 // JobSpecificRequest is used when we just need to specify a target job 183 type JobSpecificRequest struct { 184 JobID string 185 QueryOptions 186 } 187 188 // JobListRequest is used to parameterize a list request 189 type JobListRequest struct { 190 QueryOptions 191 } 192 193 // NodeListRequest is used to parameterize a list request 194 type NodeListRequest struct { 195 QueryOptions 196 } 197 198 // EvalUpdateRequest is used for upserting evaluations. 199 type EvalUpdateRequest struct { 200 Evals []*Evaluation 201 EvalToken string 202 WriteRequest 203 } 204 205 // EvalDeleteRequest is used for deleting an evaluation. 206 type EvalDeleteRequest struct { 207 Evals []string 208 Allocs []string 209 WriteRequest 210 } 211 212 // EvalSpecificRequest is used when we just need to specify a target evaluation 213 type EvalSpecificRequest struct { 214 EvalID string 215 QueryOptions 216 } 217 218 // EvalAckRequest is used to Ack/Nack a specific evaluation 219 type EvalAckRequest struct { 220 EvalID string 221 Token string 222 WriteRequest 223 } 224 225 // EvalDequeueRequest is used when we want to dequeue an evaluation 226 type EvalDequeueRequest struct { 227 Schedulers []string 228 Timeout time.Duration 229 WriteRequest 230 } 231 232 // EvalListRequest is used to list the evaluations 233 type EvalListRequest struct { 234 QueryOptions 235 } 236 237 // PlanRequest is used to submit an allocation plan to the leader 238 type PlanRequest struct { 239 Plan *Plan 240 WriteRequest 241 } 242 243 // AllocUpdateRequest is used to submit changes to allocations, either 244 // to cause evictions or to assign new allocaitons. Both can be done 245 // within a single transaction 246 type AllocUpdateRequest struct { 247 // Alloc is the list of new allocations to assign 248 Alloc []*Allocation 249 WriteRequest 250 } 251 252 // AllocListRequest is used to request a list of allocations 253 type AllocListRequest struct { 254 QueryOptions 255 } 256 257 // AllocSpecificRequest is used to query a specific allocation 258 type AllocSpecificRequest struct { 259 AllocID string 260 QueryOptions 261 } 262 263 // GenericRequest is used to request where no 264 // specific information is needed. 265 type GenericRequest struct { 266 QueryOptions 267 } 268 269 // GenericResponse is used to respond to a request where no 270 // specific response information is needed. 271 type GenericResponse struct { 272 WriteMeta 273 } 274 275 const ( 276 ProtocolVersion = "protocol" 277 APIMajorVersion = "api.major" 278 APIMinorVersion = "api.minor" 279 ) 280 281 // VersionResponse is used for the Status.Version reseponse 282 type VersionResponse struct { 283 Build string 284 Versions map[string]int 285 QueryMeta 286 } 287 288 // JobRegisterResponse is used to respond to a job registration 289 type JobRegisterResponse struct { 290 EvalID string 291 EvalCreateIndex uint64 292 JobModifyIndex uint64 293 QueryMeta 294 } 295 296 // JobDeregisterResponse is used to respond to a job deregistration 297 type JobDeregisterResponse struct { 298 EvalID string 299 EvalCreateIndex uint64 300 JobModifyIndex uint64 301 QueryMeta 302 } 303 304 // NodeUpdateResponse is used to respond to a node update 305 type NodeUpdateResponse struct { 306 HeartbeatTTL time.Duration 307 EvalIDs []string 308 EvalCreateIndex uint64 309 NodeModifyIndex uint64 310 QueryMeta 311 } 312 313 // NodeDrainUpdateResponse is used to respond to a node drain update 314 type NodeDrainUpdateResponse struct { 315 EvalIDs []string 316 EvalCreateIndex uint64 317 NodeModifyIndex uint64 318 QueryMeta 319 } 320 321 // NodeAllocsResponse is used to return allocs for a single node 322 type NodeAllocsResponse struct { 323 Allocs []*Allocation 324 QueryMeta 325 } 326 327 // SingleNodeResponse is used to return a single node 328 type SingleNodeResponse struct { 329 Node *Node 330 QueryMeta 331 } 332 333 // JobListResponse is used for a list request 334 type NodeListResponse struct { 335 Nodes []*NodeListStub 336 QueryMeta 337 } 338 339 // SingleJobResponse is used to return a single job 340 type SingleJobResponse struct { 341 Job *Job 342 QueryMeta 343 } 344 345 // JobListResponse is used for a list request 346 type JobListResponse struct { 347 Jobs []*JobListStub 348 QueryMeta 349 } 350 351 // SingleAllocResponse is used to return a single allocation 352 type SingleAllocResponse struct { 353 Alloc *Allocation 354 QueryMeta 355 } 356 357 // JobAllocationsResponse is used to return the allocations for a job 358 type JobAllocationsResponse struct { 359 Allocations []*AllocListStub 360 QueryMeta 361 } 362 363 // JobEvaluationsResponse is used to return the evaluations for a job 364 type JobEvaluationsResponse struct { 365 Evaluations []*Evaluation 366 QueryMeta 367 } 368 369 // SingleEvalResponse is used to return a single evaluation 370 type SingleEvalResponse struct { 371 Eval *Evaluation 372 QueryMeta 373 } 374 375 // EvalDequeueResponse is used to return from a dequeue 376 type EvalDequeueResponse struct { 377 Eval *Evaluation 378 Token string 379 QueryMeta 380 } 381 382 // PlanResponse is used to return from a PlanRequest 383 type PlanResponse struct { 384 Result *PlanResult 385 WriteMeta 386 } 387 388 // AllocListResponse is used for a list request 389 type AllocListResponse struct { 390 Allocations []*AllocListStub 391 QueryMeta 392 } 393 394 // EvalListResponse is used for a list request 395 type EvalListResponse struct { 396 Evaluations []*Evaluation 397 QueryMeta 398 } 399 400 // EvalAllocationsResponse is used to return the allocations for an evaluation 401 type EvalAllocationsResponse struct { 402 Allocations []*AllocListStub 403 QueryMeta 404 } 405 406 const ( 407 NodeStatusInit = "initializing" 408 NodeStatusReady = "ready" 409 NodeStatusDown = "down" 410 ) 411 412 // ShouldDrainNode checks if a given node status should trigger an 413 // evaluation. Some states don't require any further action. 414 func ShouldDrainNode(status string) bool { 415 switch status { 416 case NodeStatusInit, NodeStatusReady: 417 return false 418 case NodeStatusDown: 419 return true 420 default: 421 panic(fmt.Sprintf("unhandled node status %s", status)) 422 } 423 } 424 425 // ValidNodeStatus is used to check if a node status is valid 426 func ValidNodeStatus(status string) bool { 427 switch status { 428 case NodeStatusInit, NodeStatusReady, NodeStatusDown: 429 return true 430 default: 431 return false 432 } 433 } 434 435 // Node is a representation of a schedulable client node 436 type Node struct { 437 // ID is a unique identifier for the node. It can be constructed 438 // by doing a concatenation of the Name and Datacenter as a simple 439 // approach. Alternatively a UUID may be used. 440 ID string 441 442 // Datacenter for this node 443 Datacenter string 444 445 // Node name 446 Name string 447 448 // Attributes is an arbitrary set of key/value 449 // data that can be used for constraints. Examples 450 // include "kernel.name=linux", "arch=386", "driver.docker=1", 451 // "docker.runtime=1.8.3" 452 Attributes map[string]string 453 454 // Resources is the available resources on the client. 455 // For example 'cpu=2' 'memory=2048' 456 Resources *Resources 457 458 // Reserved is the set of resources that are reserved, 459 // and should be subtracted from the total resources for 460 // the purposes of scheduling. This may be provide certain 461 // high-watermark tolerances or because of external schedulers 462 // consuming resources. 463 Reserved *Resources 464 465 // Links are used to 'link' this client to external 466 // systems. For example 'consul=foo.dc1' 'aws=i-83212' 467 // 'ami=ami-123' 468 Links map[string]string 469 470 // Meta is used to associate arbitrary metadata with this 471 // client. This is opaque to Nomad. 472 Meta map[string]string 473 474 // NodeClass is an opaque identifier used to group nodes 475 // together for the purpose of determining scheduling pressure. 476 NodeClass string 477 478 // Drain is controlled by the servers, and not the client. 479 // If true, no jobs will be scheduled to this node, and existing 480 // allocations will be drained. 481 Drain bool 482 483 // Status of this node 484 Status string 485 486 // StatusDescription is meant to provide more human useful information 487 StatusDescription string 488 489 // Raft Indexes 490 CreateIndex uint64 491 ModifyIndex uint64 492 } 493 494 // TerminalStatus returns if the current status is terminal and 495 // will no longer transition. 496 func (n *Node) TerminalStatus() bool { 497 switch n.Status { 498 case NodeStatusDown: 499 return true 500 default: 501 return false 502 } 503 } 504 505 // Stub returns a summarized version of the node 506 func (n *Node) Stub() *NodeListStub { 507 return &NodeListStub{ 508 ID: n.ID, 509 Datacenter: n.Datacenter, 510 Name: n.Name, 511 NodeClass: n.NodeClass, 512 Drain: n.Drain, 513 Status: n.Status, 514 StatusDescription: n.StatusDescription, 515 CreateIndex: n.CreateIndex, 516 ModifyIndex: n.ModifyIndex, 517 } 518 } 519 520 // NodeListStub is used to return a subset of job information 521 // for the job list 522 type NodeListStub struct { 523 ID string 524 Datacenter string 525 Name string 526 NodeClass string 527 Drain bool 528 Status string 529 StatusDescription string 530 CreateIndex uint64 531 ModifyIndex uint64 532 } 533 534 // Resources is used to define the resources available 535 // on a client 536 type Resources struct { 537 CPU int 538 MemoryMB int `mapstructure:"memory"` 539 DiskMB int `mapstructure:"disk"` 540 IOPS int 541 Networks []*NetworkResource 542 } 543 544 // Copy returns a deep copy of the resources 545 func (r *Resources) Copy() *Resources { 546 newR := new(Resources) 547 *newR = *r 548 n := len(r.Networks) 549 newR.Networks = make([]*NetworkResource, n) 550 for i := 0; i < n; i++ { 551 newR.Networks[i] = r.Networks[i].Copy() 552 } 553 return newR 554 } 555 556 // NetIndex finds the matching net index using device name 557 func (r *Resources) NetIndex(n *NetworkResource) int { 558 for idx, net := range r.Networks { 559 if net.Device == n.Device { 560 return idx 561 } 562 } 563 return -1 564 } 565 566 // Superset checks if one set of resources is a superset 567 // of another. This ignores network resources, and the NetworkIndex 568 // should be used for that. 569 func (r *Resources) Superset(other *Resources) (bool, string) { 570 if r.CPU < other.CPU { 571 return false, "cpu exhausted" 572 } 573 if r.MemoryMB < other.MemoryMB { 574 return false, "memory exhausted" 575 } 576 if r.DiskMB < other.DiskMB { 577 return false, "disk exhausted" 578 } 579 if r.IOPS < other.IOPS { 580 return false, "iops exhausted" 581 } 582 return true, "" 583 } 584 585 // Add adds the resources of the delta to this, potentially 586 // returning an error if not possible. 587 func (r *Resources) Add(delta *Resources) error { 588 if delta == nil { 589 return nil 590 } 591 r.CPU += delta.CPU 592 r.MemoryMB += delta.MemoryMB 593 r.DiskMB += delta.DiskMB 594 r.IOPS += delta.IOPS 595 596 for _, n := range delta.Networks { 597 // Find the matching interface by IP or CIDR 598 idx := r.NetIndex(n) 599 if idx == -1 { 600 r.Networks = append(r.Networks, n.Copy()) 601 } else { 602 r.Networks[idx].Add(n) 603 } 604 } 605 return nil 606 } 607 608 func (r *Resources) GoString() string { 609 return fmt.Sprintf("*%#v", *r) 610 } 611 612 // NetworkResource is used to represent available network 613 // resources 614 type NetworkResource struct { 615 Device string // Name of the device 616 CIDR string // CIDR block of addresses 617 IP string // IP address 618 MBits int // Throughput 619 ReservedPorts []int `mapstructure:"reserved_ports"` // Reserved ports 620 DynamicPorts []string `mapstructure:"dynamic_ports"` // Dynamically assigned ports 621 } 622 623 // Copy returns a deep copy of the network resource 624 func (n *NetworkResource) Copy() *NetworkResource { 625 newR := new(NetworkResource) 626 *newR = *n 627 if n.ReservedPorts != nil { 628 newR.ReservedPorts = make([]int, len(n.ReservedPorts)) 629 copy(newR.ReservedPorts, n.ReservedPorts) 630 } 631 return newR 632 } 633 634 // Add adds the resources of the delta to this, potentially 635 // returning an error if not possible. 636 func (n *NetworkResource) Add(delta *NetworkResource) { 637 if len(delta.ReservedPorts) > 0 { 638 n.ReservedPorts = append(n.ReservedPorts, delta.ReservedPorts...) 639 } 640 n.MBits += delta.MBits 641 n.DynamicPorts = append(n.DynamicPorts, delta.DynamicPorts...) 642 } 643 644 func (n *NetworkResource) GoString() string { 645 return fmt.Sprintf("*%#v", *n) 646 } 647 648 // MapDynamicPorts returns a mapping of Label:PortNumber for dynamic ports 649 // allocated on this NetworkResource. The ordering of Label:Port pairs is 650 // random. 651 // 652 // Details: 653 // 654 // The jobspec lets us ask for two types of ports: Reserved ports and Dynamic 655 // ports. Reserved ports are identified by the port number, while Dynamic ports 656 // are identified by a Label. 657 // 658 // When we ask nomad to run a job it checks to see if the Reserved ports we 659 // requested are available. If they are, it then tries to provision any Dynamic 660 // ports that we have requested. When available ports are found to satisfy our 661 // dynamic port requirements, they are APPENDED to the reserved ports list. In 662 // effect, the reserved ports list serves double-duty. First it indicates the 663 // ports we *want*, and then it indicates the ports we are *using*. 664 // 665 // After the the offer process is complete and the job is scheduled we want to 666 // see which ports were made available to us. To see the dynamic ports that 667 // were allocated to us we look at the last N ports in our reservation, where N 668 // is how many dynamic ports we requested. 669 // 670 // MapDynamicPorts matches these port numbers with their labels and gives you 671 // the port mapping. 672 // 673 // Also, be aware that this is intended to be called in the context of 674 // task.Resources after an offer has been made. If you call it in some other 675 // context the behavior is unspecified, including maybe crashing. So don't do that. 676 func (n *NetworkResource) MapDynamicPorts() map[string]int { 677 ports := n.ReservedPorts[len(n.ReservedPorts)-len(n.DynamicPorts):] 678 mapping := make(map[string]int, len(n.DynamicPorts)) 679 680 for idx, label := range n.DynamicPorts { 681 mapping[label] = ports[idx] 682 } 683 684 return mapping 685 } 686 687 // ListStaticPorts returns the list of Static ports allocated to this 688 // NetworkResource. These are presumed to have known semantics so there is no 689 // mapping information. 690 func (n *NetworkResource) ListStaticPorts() []int { 691 return n.ReservedPorts[:len(n.ReservedPorts)-len(n.DynamicPorts)] 692 } 693 694 const ( 695 // JobTypeNomad is reserved for internal system tasks and is 696 // always handled by the CoreScheduler. 697 JobTypeCore = "_core" 698 JobTypeService = "service" 699 JobTypeBatch = "batch" 700 ) 701 702 const ( 703 JobStatusPending = "pending" // Pending means the job is waiting on scheduling 704 JobStatusRunning = "running" // Running means the entire job is running 705 JobStatusComplete = "complete" // Complete means there was a clean termination 706 JobStatusDead = "dead" // Dead means there was abnormal termination 707 ) 708 709 const ( 710 // JobMinPriority is the minimum allowed priority 711 JobMinPriority = 1 712 713 // JobDefaultPriority is the default priority if not 714 // not specified. 715 JobDefaultPriority = 50 716 717 // JobMaxPriority is the maximum allowed priority 718 JobMaxPriority = 100 719 720 // Ensure CoreJobPriority is higher than any user 721 // specified job so that it gets priority. This is important 722 // for the system to remain healthy. 723 CoreJobPriority = JobMaxPriority * 2 724 ) 725 726 // Job is the scope of a scheduling request to Nomad. It is the largest 727 // scoped object, and is a named collection of task groups. Each task group 728 // is further composed of tasks. A task group (TG) is the unit of scheduling 729 // however. 730 type Job struct { 731 // Region is the Nomad region that handles scheduling this job 732 Region string 733 734 // ID is a unique identifier for the job per region. It can be 735 // specified hierarchically like LineOfBiz/OrgName/Team/Project 736 ID string 737 738 // Name is the logical name of the job used to refer to it. This is unique 739 // per region, but not unique globally. 740 Name string 741 742 // Type is used to control various behaviors about the job. Most jobs 743 // are service jobs, meaning they are expected to be long lived. 744 // Some jobs are batch oriented meaning they run and then terminate. 745 // This can be extended in the future to support custom schedulers. 746 Type string 747 748 // Priority is used to control scheduling importance and if this job 749 // can preempt other jobs. 750 Priority int 751 752 // AllAtOnce is used to control if incremental scheduling of task groups 753 // is allowed or if we must do a gang scheduling of the entire job. This 754 // can slow down larger jobs if resources are not available. 755 AllAtOnce bool `mapstructure:"all_at_once"` 756 757 // Datacenters contains all the datacenters this job is allowed to span 758 Datacenters []string 759 760 // Constraints can be specified at a job level and apply to 761 // all the task groups and tasks. 762 Constraints []*Constraint 763 764 // TaskGroups are the collections of task groups that this job needs 765 // to run. Each task group is an atomic unit of scheduling and placement. 766 TaskGroups []*TaskGroup 767 768 // Update is used to control the update strategy 769 Update UpdateStrategy 770 771 // Meta is used to associate arbitrary metadata with this 772 // job. This is opaque to Nomad. 773 Meta map[string]string 774 775 // Job status 776 Status string 777 778 // StatusDescription is meant to provide more human useful information 779 StatusDescription string 780 781 // Raft Indexes 782 CreateIndex uint64 783 ModifyIndex uint64 784 } 785 786 // Validate is used to sanity check a job input 787 func (j *Job) Validate() error { 788 var mErr multierror.Error 789 if j.Region == "" { 790 mErr.Errors = append(mErr.Errors, errors.New("Missing job region")) 791 } 792 if j.ID == "" { 793 mErr.Errors = append(mErr.Errors, errors.New("Missing job ID")) 794 } else if strings.Contains(j.ID, " ") { 795 mErr.Errors = append(mErr.Errors, errors.New("Job ID contains a space")) 796 } 797 if j.Name == "" { 798 mErr.Errors = append(mErr.Errors, errors.New("Missing job name")) 799 } 800 if j.Type == "" { 801 mErr.Errors = append(mErr.Errors, errors.New("Missing job type")) 802 } 803 if j.Priority < JobMinPriority || j.Priority > JobMaxPriority { 804 mErr.Errors = append(mErr.Errors, fmt.Errorf("Job priority must be between [%d, %d]", JobMinPriority, JobMaxPriority)) 805 } 806 if len(j.Datacenters) == 0 { 807 mErr.Errors = append(mErr.Errors, errors.New("Missing job datacenters")) 808 } 809 if len(j.TaskGroups) == 0 { 810 mErr.Errors = append(mErr.Errors, errors.New("Missing job task groups")) 811 } 812 813 // Check for duplicate task groups 814 taskGroups := make(map[string]int) 815 for idx, tg := range j.TaskGroups { 816 if tg.Name == "" { 817 mErr.Errors = append(mErr.Errors, fmt.Errorf("Job task group %d missing name", idx+1)) 818 } else if existing, ok := taskGroups[tg.Name]; ok { 819 mErr.Errors = append(mErr.Errors, fmt.Errorf("Job task group %d redefines '%s' from group %d", idx+1, tg.Name, existing+1)) 820 } else { 821 taskGroups[tg.Name] = idx 822 } 823 } 824 825 // Validate the task group 826 for idx, tg := range j.TaskGroups { 827 if err := tg.Validate(); err != nil { 828 outer := fmt.Errorf("Task group %d validation failed: %s", idx+1, err) 829 mErr.Errors = append(mErr.Errors, outer) 830 } 831 } 832 return mErr.ErrorOrNil() 833 } 834 835 // LookupTaskGroup finds a task group by name 836 func (j *Job) LookupTaskGroup(name string) *TaskGroup { 837 for _, tg := range j.TaskGroups { 838 if tg.Name == name { 839 return tg 840 } 841 } 842 return nil 843 } 844 845 // Stub is used to return a summary of the job 846 func (j *Job) Stub() *JobListStub { 847 return &JobListStub{ 848 ID: j.ID, 849 Name: j.Name, 850 Type: j.Type, 851 Priority: j.Priority, 852 Status: j.Status, 853 StatusDescription: j.StatusDescription, 854 CreateIndex: j.CreateIndex, 855 ModifyIndex: j.ModifyIndex, 856 } 857 } 858 859 // JobListStub is used to return a subset of job information 860 // for the job list 861 type JobListStub struct { 862 ID string 863 Name string 864 Type string 865 Priority int 866 Status string 867 StatusDescription string 868 CreateIndex uint64 869 ModifyIndex uint64 870 } 871 872 // UpdateStrategy is used to modify how updates are done 873 type UpdateStrategy struct { 874 // Stagger is the amount of time between the updates 875 Stagger time.Duration 876 877 // MaxParallel is how many updates can be done in parallel 878 MaxParallel int `mapstructure:"max_parallel"` 879 } 880 881 // Rolling returns if a rolling strategy should be used 882 func (u *UpdateStrategy) Rolling() bool { 883 return u.Stagger > 0 && u.MaxParallel > 0 884 } 885 886 // TaskGroup is an atomic unit of placement. Each task group belongs to 887 // a job and may contain any number of tasks. A task group support running 888 // in many replicas using the same configuration.. 889 type TaskGroup struct { 890 // Name of the task group 891 Name string 892 893 // Count is the number of replicas of this task group that should 894 // be scheduled. 895 Count int 896 897 // Constraints can be specified at a task group level and apply to 898 // all the tasks contained. 899 Constraints []*Constraint 900 901 // Tasks are the collection of tasks that this task group needs to run 902 Tasks []*Task 903 904 // Meta is used to associate arbitrary metadata with this 905 // task group. This is opaque to Nomad. 906 Meta map[string]string 907 } 908 909 // Validate is used to sanity check a task group 910 func (tg *TaskGroup) Validate() error { 911 var mErr multierror.Error 912 if tg.Name == "" { 913 mErr.Errors = append(mErr.Errors, errors.New("Missing task group name")) 914 } 915 if tg.Count <= 0 { 916 mErr.Errors = append(mErr.Errors, errors.New("Task group count must be positive")) 917 } 918 if len(tg.Tasks) == 0 { 919 mErr.Errors = append(mErr.Errors, errors.New("Missing tasks for task group")) 920 } 921 922 // Check for duplicate tasks 923 tasks := make(map[string]int) 924 for idx, task := range tg.Tasks { 925 if task.Name == "" { 926 mErr.Errors = append(mErr.Errors, fmt.Errorf("Task %d missing name", idx+1)) 927 } else if existing, ok := tasks[task.Name]; ok { 928 mErr.Errors = append(mErr.Errors, fmt.Errorf("Task %d redefines '%s' from task %d", idx+1, task.Name, existing+1)) 929 } else { 930 tasks[task.Name] = idx 931 } 932 } 933 934 // Validate the tasks 935 for idx, task := range tg.Tasks { 936 if err := task.Validate(); err != nil { 937 outer := fmt.Errorf("Task %d validation failed: %s", idx+1, err) 938 mErr.Errors = append(mErr.Errors, outer) 939 } 940 } 941 return mErr.ErrorOrNil() 942 } 943 944 // LookupTask finds a task by name 945 func (tg *TaskGroup) LookupTask(name string) *Task { 946 for _, t := range tg.Tasks { 947 if t.Name == name { 948 return t 949 } 950 } 951 return nil 952 } 953 954 func (tg *TaskGroup) GoString() string { 955 return fmt.Sprintf("*%#v", *tg) 956 } 957 958 // Task is a single process typically that is executed as part of a task group. 959 type Task struct { 960 // Name of the task 961 Name string 962 963 // Driver is used to control which driver is used 964 Driver string 965 966 // Config is provided to the driver to initialize 967 Config map[string]string 968 969 // Map of environment variables to be used by the driver 970 Env map[string]string 971 972 // Constraints can be specified at a task level and apply only to 973 // the particular task. 974 Constraints []*Constraint 975 976 // Resources is the resources needed by this task 977 Resources *Resources 978 979 // Meta is used to associate arbitrary metadata with this 980 // task. This is opaque to Nomad. 981 Meta map[string]string 982 } 983 984 func (t *Task) GoString() string { 985 return fmt.Sprintf("*%#v", *t) 986 } 987 988 // Validate is used to sanity check a task group 989 func (t *Task) Validate() error { 990 var mErr multierror.Error 991 if t.Name == "" { 992 mErr.Errors = append(mErr.Errors, errors.New("Missing task name")) 993 } 994 if t.Driver == "" { 995 mErr.Errors = append(mErr.Errors, errors.New("Missing task driver")) 996 } 997 if t.Resources == nil { 998 mErr.Errors = append(mErr.Errors, errors.New("Missing task resources")) 999 } 1000 return mErr.ErrorOrNil() 1001 } 1002 1003 // Constraints are used to restrict placement options in the case of 1004 // a hard constraint, and used to prefer a placement in the case of 1005 // a soft constraint. 1006 type Constraint struct { 1007 Hard bool // Hard or soft constraint 1008 LTarget string // Left-hand target 1009 RTarget string // Right-hand target 1010 Operand string // Constraint operand (<=, <, =, !=, >, >=), contains, near 1011 Weight int // Soft constraints can vary the weight 1012 } 1013 1014 func (c *Constraint) String() string { 1015 return fmt.Sprintf("%s %s %s", c.LTarget, c.Operand, c.RTarget) 1016 } 1017 1018 const ( 1019 AllocDesiredStatusRun = "run" // Allocation should run 1020 AllocDesiredStatusStop = "stop" // Allocation should stop 1021 AllocDesiredStatusEvict = "evict" // Allocation should stop, and was evicted 1022 AllocDesiredStatusFailed = "failed" // Allocation failed to be done 1023 ) 1024 1025 const ( 1026 AllocClientStatusPending = "pending" 1027 AllocClientStatusRunning = "running" 1028 AllocClientStatusDead = "dead" 1029 AllocClientStatusFailed = "failed" 1030 ) 1031 1032 // Allocation is used to allocate the placement of a task group to a node. 1033 type Allocation struct { 1034 // ID of the allocation (UUID) 1035 ID string 1036 1037 // ID of the evaluation that generated this allocation 1038 EvalID string 1039 1040 // Name is a logical name of the allocation. 1041 Name string 1042 1043 // NodeID is the node this is being placed on 1044 NodeID string 1045 1046 // Job is the parent job of the task group being allocated. 1047 // This is copied at allocation time to avoid issues if the job 1048 // definition is updated. 1049 JobID string 1050 Job *Job 1051 1052 // TaskGroup is the name of the task group that should be run 1053 TaskGroup string 1054 1055 // Resources is the total set of resources allocated as part 1056 // of this allocation of the task group. 1057 Resources *Resources 1058 1059 // TaskResources is the set of resources allocated to each 1060 // task. These should sum to the total Resources. 1061 TaskResources map[string]*Resources 1062 1063 // Metrics associated with this allocation 1064 Metrics *AllocMetric 1065 1066 // Desired Status of the allocation on the client 1067 DesiredStatus string 1068 1069 // DesiredStatusDescription is meant to provide more human useful information 1070 DesiredDescription string 1071 1072 // Status of the allocation on the client 1073 ClientStatus string 1074 1075 // ClientStatusDescription is meant to provide more human useful information 1076 ClientDescription string 1077 1078 // Raft Indexes 1079 CreateIndex uint64 1080 ModifyIndex uint64 1081 } 1082 1083 // TerminalStatus returns if the desired status is terminal and 1084 // will no longer transition. This is not based on the current client status. 1085 func (a *Allocation) TerminalStatus() bool { 1086 switch a.DesiredStatus { 1087 case AllocDesiredStatusStop, AllocDesiredStatusEvict, AllocDesiredStatusFailed: 1088 return true 1089 default: 1090 return false 1091 } 1092 } 1093 1094 // Stub returns a list stub for the allocation 1095 func (a *Allocation) Stub() *AllocListStub { 1096 return &AllocListStub{ 1097 ID: a.ID, 1098 EvalID: a.EvalID, 1099 Name: a.Name, 1100 NodeID: a.NodeID, 1101 JobID: a.JobID, 1102 TaskGroup: a.TaskGroup, 1103 DesiredStatus: a.DesiredStatus, 1104 DesiredDescription: a.DesiredDescription, 1105 ClientStatus: a.ClientStatus, 1106 ClientDescription: a.ClientDescription, 1107 CreateIndex: a.CreateIndex, 1108 ModifyIndex: a.ModifyIndex, 1109 } 1110 } 1111 1112 // AllocListStub is used to return a subset of alloc information 1113 type AllocListStub struct { 1114 ID string 1115 EvalID string 1116 Name string 1117 NodeID string 1118 JobID string 1119 TaskGroup string 1120 DesiredStatus string 1121 DesiredDescription string 1122 ClientStatus string 1123 ClientDescription string 1124 CreateIndex uint64 1125 ModifyIndex uint64 1126 } 1127 1128 // AllocMetric is used to track various metrics while attempting 1129 // to make an allocation. These are used to debug a job, or to better 1130 // understand the pressure within the system. 1131 type AllocMetric struct { 1132 // NodesEvaluated is the number of nodes that were evaluated 1133 NodesEvaluated int 1134 1135 // NodesFiltered is the number of nodes filtered due to 1136 // a hard constraint 1137 NodesFiltered int 1138 1139 // ClassFiltered is the number of nodes filtered by class 1140 ClassFiltered map[string]int 1141 1142 // ConstraintFiltered is the number of failures caused by constraint 1143 ConstraintFiltered map[string]int 1144 1145 // NodesExhausted is the number of nodes skipped due to being 1146 // exhausted of at least one resource 1147 NodesExhausted int 1148 1149 // ClassExhausted is the number of nodes exhausted by class 1150 ClassExhausted map[string]int 1151 1152 // DimensionExhausted provides the count by dimension or reason 1153 DimensionExhausted map[string]int 1154 1155 // Scores is the scores of the final few nodes remaining 1156 // for placement. The top score is typically selected. 1157 Scores map[string]float64 1158 1159 // AllocationTime is a measure of how long the allocation 1160 // attempt took. This can affect performance and SLAs. 1161 AllocationTime time.Duration 1162 1163 // CoalescedFailures indicates the number of other 1164 // allocations that were coalesced into this failed allocation. 1165 // This is to prevent creating many failed allocations for a 1166 // single task group. 1167 CoalescedFailures int 1168 } 1169 1170 func (a *AllocMetric) EvaluateNode() { 1171 a.NodesEvaluated += 1 1172 } 1173 1174 func (a *AllocMetric) FilterNode(node *Node, constraint string) { 1175 a.NodesFiltered += 1 1176 if node != nil && node.NodeClass != "" { 1177 if a.ClassFiltered == nil { 1178 a.ClassFiltered = make(map[string]int) 1179 } 1180 a.ClassFiltered[node.NodeClass] += 1 1181 } 1182 if constraint != "" { 1183 if a.ConstraintFiltered == nil { 1184 a.ConstraintFiltered = make(map[string]int) 1185 } 1186 a.ConstraintFiltered[constraint] += 1 1187 } 1188 } 1189 1190 func (a *AllocMetric) ExhaustedNode(node *Node, dimension string) { 1191 a.NodesExhausted += 1 1192 if node != nil && node.NodeClass != "" { 1193 if a.ClassExhausted == nil { 1194 a.ClassExhausted = make(map[string]int) 1195 } 1196 a.ClassExhausted[node.NodeClass] += 1 1197 } 1198 if dimension != "" { 1199 if a.DimensionExhausted == nil { 1200 a.DimensionExhausted = make(map[string]int) 1201 } 1202 a.DimensionExhausted[dimension] += 1 1203 } 1204 } 1205 1206 func (a *AllocMetric) ScoreNode(node *Node, name string, score float64) { 1207 if a.Scores == nil { 1208 a.Scores = make(map[string]float64) 1209 } 1210 key := fmt.Sprintf("%s.%s", node.ID, name) 1211 a.Scores[key] = score 1212 } 1213 1214 const ( 1215 EvalStatusPending = "pending" 1216 EvalStatusComplete = "complete" 1217 EvalStatusFailed = "failed" 1218 ) 1219 1220 const ( 1221 EvalTriggerJobRegister = "job-register" 1222 EvalTriggerJobDeregister = "job-deregister" 1223 EvalTriggerNodeUpdate = "node-update" 1224 EvalTriggerScheduled = "scheduled" 1225 EvalTriggerRollingUpdate = "rolling-update" 1226 ) 1227 1228 const ( 1229 // CoreJobEvalGC is used for the garbage collection of evaluations 1230 // and allocations. We periodically scan evaluations in a terminal state, 1231 // in which all the corresponding allocations are also terminal. We 1232 // delete these out of the system to bound the state. 1233 CoreJobEvalGC = "eval-gc" 1234 1235 // CoreJobNodeGC is used for the garbage collection of failed nodes. 1236 // We periodically scan nodes in a terminal state, and if they have no 1237 // corresponding allocations we delete these out of the system. 1238 CoreJobNodeGC = "node-gc" 1239 ) 1240 1241 // Evaluation is used anytime we need to apply business logic as a result 1242 // of a change to our desired state (job specification) or the emergent state 1243 // (registered nodes). When the inputs change, we need to "evaluate" them, 1244 // potentially taking action (allocation of work) or doing nothing if the state 1245 // of the world does not require it. 1246 type Evaluation struct { 1247 // ID is a randonly generated UUID used for this evaluation. This 1248 // is assigned upon the creation of the evaluation. 1249 ID string 1250 1251 // Priority is used to control scheduling importance and if this job 1252 // can preempt other jobs. 1253 Priority int 1254 1255 // Type is used to control which schedulers are available to handle 1256 // this evaluation. 1257 Type string 1258 1259 // TriggeredBy is used to give some insight into why this Eval 1260 // was created. (Job change, node failure, alloc failure, etc). 1261 TriggeredBy string 1262 1263 // JobID is the job this evaluation is scoped to. Evaluations cannot 1264 // be run in parallel for a given JobID, so we serialize on this. 1265 JobID string 1266 1267 // JobModifyIndex is the modify index of the job at the time 1268 // the evaluation was created 1269 JobModifyIndex uint64 1270 1271 // NodeID is the node that was affected triggering the evaluation. 1272 NodeID string 1273 1274 // NodeModifyIndex is the modify index of the node at the time 1275 // the evaluation was created 1276 NodeModifyIndex uint64 1277 1278 // Status of the evaluation 1279 Status string 1280 1281 // StatusDescription is meant to provide more human useful information 1282 StatusDescription string 1283 1284 // Wait is a minimum wait time for running the eval. This is used to 1285 // support a rolling upgrade. 1286 Wait time.Duration 1287 1288 // NextEval is the evaluation ID for the eval created to do a followup. 1289 // This is used to support rolling upgrades, where we need a chain of evaluations. 1290 NextEval string 1291 1292 // PreviousEval is the evaluation ID for the eval creating this one to do a followup. 1293 // This is used to support rolling upgrades, where we need a chain of evaluations. 1294 PreviousEval string 1295 1296 // Raft Indexes 1297 CreateIndex uint64 1298 ModifyIndex uint64 1299 } 1300 1301 // TerminalStatus returns if the current status is terminal and 1302 // will no longer transition. 1303 func (e *Evaluation) TerminalStatus() bool { 1304 switch e.Status { 1305 case EvalStatusComplete, EvalStatusFailed: 1306 return true 1307 default: 1308 return false 1309 } 1310 } 1311 1312 func (e *Evaluation) GoString() string { 1313 return fmt.Sprintf("<Eval '%s' JobID: '%s'>", e.ID, e.JobID) 1314 } 1315 1316 func (e *Evaluation) Copy() *Evaluation { 1317 ne := new(Evaluation) 1318 *ne = *e 1319 return ne 1320 } 1321 1322 // ShouldEnqueue checks if a given evaluation should be enqueued 1323 func (e *Evaluation) ShouldEnqueue() bool { 1324 switch e.Status { 1325 case EvalStatusPending: 1326 return true 1327 case EvalStatusComplete, EvalStatusFailed: 1328 return false 1329 default: 1330 panic(fmt.Sprintf("unhandled evaluation (%s) status %s", e.ID, e.Status)) 1331 } 1332 } 1333 1334 // MakePlan is used to make a plan from the given evaluation 1335 // for a given Job 1336 func (e *Evaluation) MakePlan(j *Job) *Plan { 1337 p := &Plan{ 1338 EvalID: e.ID, 1339 Priority: e.Priority, 1340 NodeUpdate: make(map[string][]*Allocation), 1341 NodeAllocation: make(map[string][]*Allocation), 1342 } 1343 if j != nil { 1344 p.AllAtOnce = j.AllAtOnce 1345 } 1346 return p 1347 } 1348 1349 // NextRollingEval creates an evaluation to followup this eval for rolling updates 1350 func (e *Evaluation) NextRollingEval(wait time.Duration) *Evaluation { 1351 return &Evaluation{ 1352 ID: GenerateUUID(), 1353 Priority: e.Priority, 1354 Type: e.Type, 1355 TriggeredBy: EvalTriggerRollingUpdate, 1356 JobID: e.JobID, 1357 JobModifyIndex: e.JobModifyIndex, 1358 Status: EvalStatusPending, 1359 Wait: wait, 1360 PreviousEval: e.ID, 1361 } 1362 } 1363 1364 // Plan is used to submit a commit plan for task allocations. These 1365 // are submitted to the leader which verifies that resources have 1366 // not been overcommitted before admiting the plan. 1367 type Plan struct { 1368 // EvalID is the evaluation ID this plan is associated with 1369 EvalID string 1370 1371 // EvalToken is used to prevent a split-brain processing of 1372 // an evaluation. There should only be a single scheduler running 1373 // an Eval at a time, but this could be violated after a leadership 1374 // transition. This unique token is used to reject plans that are 1375 // being submitted from a different leader. 1376 EvalToken string 1377 1378 // Priority is the priority of the upstream job 1379 Priority int 1380 1381 // AllAtOnce is used to control if incremental scheduling of task groups 1382 // is allowed or if we must do a gang scheduling of the entire job. 1383 // If this is false, a plan may be partially applied. Otherwise, the 1384 // entire plan must be able to make progress. 1385 AllAtOnce bool 1386 1387 // NodeUpdate contains all the allocations for each node. For each node, 1388 // this is a list of the allocations to update to either stop or evict. 1389 NodeUpdate map[string][]*Allocation 1390 1391 // NodeAllocation contains all the allocations for each node. 1392 // The evicts must be considered prior to the allocations. 1393 NodeAllocation map[string][]*Allocation 1394 1395 // FailedAllocs are allocations that could not be made, 1396 // but are persisted so that the user can use the feedback 1397 // to determine the cause. 1398 FailedAllocs []*Allocation 1399 } 1400 1401 func (p *Plan) AppendUpdate(alloc *Allocation, status, desc string) { 1402 newAlloc := new(Allocation) 1403 *newAlloc = *alloc 1404 newAlloc.DesiredStatus = status 1405 newAlloc.DesiredDescription = desc 1406 node := alloc.NodeID 1407 existing := p.NodeUpdate[node] 1408 p.NodeUpdate[node] = append(existing, newAlloc) 1409 } 1410 1411 func (p *Plan) PopUpdate(alloc *Allocation) { 1412 existing := p.NodeUpdate[alloc.NodeID] 1413 n := len(existing) 1414 if n > 0 && existing[n-1].ID == alloc.ID { 1415 existing = existing[:n-1] 1416 if len(existing) > 0 { 1417 p.NodeUpdate[alloc.NodeID] = existing 1418 } else { 1419 delete(p.NodeUpdate, alloc.NodeID) 1420 } 1421 } 1422 } 1423 1424 func (p *Plan) AppendAlloc(alloc *Allocation) { 1425 node := alloc.NodeID 1426 existing := p.NodeAllocation[node] 1427 p.NodeAllocation[node] = append(existing, alloc) 1428 } 1429 1430 func (p *Plan) AppendFailed(alloc *Allocation) { 1431 p.FailedAllocs = append(p.FailedAllocs, alloc) 1432 } 1433 1434 // IsNoOp checks if this plan would do nothing 1435 func (p *Plan) IsNoOp() bool { 1436 return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && len(p.FailedAllocs) == 0 1437 } 1438 1439 // PlanResult is the result of a plan submitted to the leader. 1440 type PlanResult struct { 1441 // NodeUpdate contains all the updates that were committed. 1442 NodeUpdate map[string][]*Allocation 1443 1444 // NodeAllocation contains all the allocations that were committed. 1445 NodeAllocation map[string][]*Allocation 1446 1447 // FailedAllocs are allocations that could not be made, 1448 // but are persisted so that the user can use the feedback 1449 // to determine the cause. 1450 FailedAllocs []*Allocation 1451 1452 // RefreshIndex is the index the worker should refresh state up to. 1453 // This allows all evictions and allocations to be materialized. 1454 // If any allocations were rejected due to stale data (node state, 1455 // over committed) this can be used to force a worker refresh. 1456 RefreshIndex uint64 1457 1458 // AllocIndex is the Raft index in which the evictions and 1459 // allocations took place. This is used for the write index. 1460 AllocIndex uint64 1461 } 1462 1463 // IsNoOp checks if this plan result would do nothing 1464 func (p *PlanResult) IsNoOp() bool { 1465 return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && len(p.FailedAllocs) == 0 1466 } 1467 1468 // FullCommit is used to check if all the allocations in a plan 1469 // were committed as part of the result. Returns if there was 1470 // a match, and the number of expected and actual allocations. 1471 func (p *PlanResult) FullCommit(plan *Plan) (bool, int, int) { 1472 expected := 0 1473 actual := 0 1474 for name, allocList := range plan.NodeAllocation { 1475 didAlloc, _ := p.NodeAllocation[name] 1476 expected += len(allocList) 1477 actual += len(didAlloc) 1478 } 1479 return actual == expected, expected, actual 1480 } 1481 1482 // msgpackHandle is a shared handle for encoding/decoding of structs 1483 var msgpackHandle = &codec.MsgpackHandle{} 1484 1485 // Decode is used to decode a MsgPack encoded object 1486 func Decode(buf []byte, out interface{}) error { 1487 return codec.NewDecoder(bytes.NewReader(buf), msgpackHandle).Decode(out) 1488 } 1489 1490 // Encode is used to encode a MsgPack object with type prefix 1491 func Encode(t MessageType, msg interface{}) ([]byte, error) { 1492 var buf bytes.Buffer 1493 buf.WriteByte(uint8(t)) 1494 err := codec.NewEncoder(&buf, msgpackHandle).Encode(msg) 1495 return buf.Bytes(), err 1496 }