// Source: github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/structs/structs.go

package structs

import (
	"bytes"
	"crypto/md5"
	"crypto/sha1"
	"crypto/sha256"
	"crypto/sha512"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"net"
	"os"
	"path/filepath"
	"reflect"
	"regexp"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/gorhill/cronexpr"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/go-version"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/args"
	"github.com/mitchellh/copystructure"
	"github.com/ugorji/go/codec"

	hcodec "github.com/hashicorp/go-msgpack/codec"
)

var (
	// ErrNoLeader reports that there is no cluster leader.
	ErrNoLeader = fmt.Errorf("No cluster leader")

	// ErrNoRegionPath reports that there is no path to a region.
	ErrNoRegionPath = fmt.Errorf("No path to region")
)

// MessageType is the uint8 tag used to identify a request type.
type MessageType uint8

// The request types below are numbered sequentially by iota starting at zero,
// so the position of an entry in this list determines its numeric value.
// NOTE(review): presumably new types must be appended rather than inserted so
// existing values stay stable — confirm.
const (
	NodeRegisterRequestType MessageType = iota
	NodeDeregisterRequestType
	NodeUpdateStatusRequestType
	NodeUpdateDrainRequestType
	JobRegisterRequestType
	JobDeregisterRequestType
	EvalUpdateRequestType
	EvalDeleteRequestType
	AllocUpdateRequestType
	AllocClientUpdateRequestType
	ReconcileJobSummariesRequestType
	VaultAccessorRegisterRequestType
	VaultAccessorDegisterRequestType
	ApplyPlanResultsRequestType
	DeploymentStatusUpdateRequestType
	DeploymentPromoteRequestType
	DeploymentAllocHealthRequestType
	DeploymentDeleteRequestType
	JobStabilityRequestType
)

const (
	// IgnoreUnknownTypeFlag is set along with a MessageType
	// to indicate that the message type can be safely ignored
	// if it is not recognized. This is for future proofing, so
	// that new commands can be added in a way that won't cause
	// old servers to crash when the FSM attempts to process them.
	IgnoreUnknownTypeFlag MessageType = 128

	// ApiMajorVersion is returned as part of the Status.Version request.
	// It should be incremented anytime the APIs are changed in a way
	// that would break clients, to allow for sane client versioning.
	ApiMajorVersion = 1

	// ApiMinorVersion is returned as part of the Status.Version request.
	// It should be incremented anytime the APIs are changed to allow
	// for sane client versioning. Minor changes should be compatible
	// within the major version.
	ApiMinorVersion = 1

	// ProtocolVersion, APIMajorVersion and APIMinorVersion are string keys.
	// NOTE(review): presumably keys into VersionResponse.Versions — confirm.
	ProtocolVersion = "protocol"
	APIMajorVersion = "api.major"
	APIMinorVersion = "api.minor"

	// GetterModeAny, GetterModeFile and GetterModeDir name the getter modes
	// "any", "file" and "dir".
	GetterModeAny  = "any"
	GetterModeFile = "file"
	GetterModeDir  = "dir"
)

// RPCInfo is used to describe common information about a query
type RPCInfo interface {
	RequestRegion() string
	IsRead() bool
	AllowStaleRead() bool
}

// QueryOptions is used to specify various flags for read queries
type QueryOptions struct {
	// The target region for this query
	Region string

	// If set, wait until query exceeds given index. Must be provided
	// with MaxQueryTime.
	MinQueryIndex uint64

	// Provided with MinQueryIndex to wait for change.
	MaxQueryTime time.Duration

	// If set, any follower can service the request. Results
	// may be arbitrarily stale.
	AllowStale bool

	// If set, used as prefix for resource list searches
	Prefix string
}

// RequestRegion returns the target region of the query.
func (q QueryOptions) RequestRegion() string {
	return q.Region
}

// IsRead always returns true: QueryOptions only applies to reads.
func (q QueryOptions) IsRead() bool {
	return true
}

// AllowStaleRead returns the AllowStale flag of the query.
func (q QueryOptions) AllowStaleRead() bool {
	return q.AllowStale
}

// WriteRequest holds the options common to write requests.
type WriteRequest struct {
	// The target region for this write
	Region string
}

// RequestRegion returns the target region of the write.
func (w WriteRequest) RequestRegion() string {
	// The target region for this request
	return w.Region
}

// IsRead always returns false: WriteRequest only applies to writes.
func (w WriteRequest) IsRead() bool {
	return false
}

// AllowStaleRead always returns false for a write request.
func (w WriteRequest) AllowStaleRead() bool {
	return false
}

// QueryMeta allows a query response to include potentially
// useful metadata about a query
type QueryMeta struct {
	// This is the index associated with the read
	Index uint64

	// If AllowStale is used, this is time elapsed since
	// last contact between the follower and leader. This
	// can be used to gauge staleness.
	LastContact time.Duration

	// Used to indicate if there is a known leader node
	KnownLeader bool
}

// WriteMeta allows a write response to include potentially
// useful metadata about the write
type WriteMeta struct {
	// This is the index associated with the write
	Index uint64
}

// NodeRegisterRequest is used for Node.Register endpoint
// to register a node as being a schedulable entity.
type NodeRegisterRequest struct {
	Node *Node
	WriteRequest
}

// NodeDeregisterRequest is used for Node.Deregister endpoint
// to deregister a node as being a schedulable entity.
type NodeDeregisterRequest struct {
	NodeID string
	WriteRequest
}

// NodeServerInfo is used in NodeUpdateResponse to return Nomad server
// information used in RPC server lists.
type NodeServerInfo struct {
	// RPCAdvertiseAddr is the IP endpoint that a Nomad Server wishes to
	// be contacted at for RPCs.
	RPCAdvertiseAddr string

	// RPCMajorVersion is the major version number the Nomad Server
	// supports
	RPCMajorVersion int32

	// RPCMinorVersion is the minor version number the Nomad Server
	// supports
	RPCMinorVersion int32

	// Datacenter is the datacenter that a Nomad server belongs to
	Datacenter string
}

// NodeUpdateStatusRequest is used for Node.UpdateStatus endpoint
// to update the status of a node.
type NodeUpdateStatusRequest struct {
	NodeID string
	Status string
	WriteRequest
}

// NodeUpdateDrainRequest is used for updating the drain status
type NodeUpdateDrainRequest struct {
	NodeID string
	Drain  bool
	WriteRequest
}

// NodeEvaluateRequest is used to re-evaluate the node
type NodeEvaluateRequest struct {
	NodeID string
	WriteRequest
}

// NodeSpecificRequest is used when we just need to specify a target node
type NodeSpecificRequest struct {
	NodeID   string
	SecretID string
	QueryOptions
}

// JobRegisterRequest is used for Job.Register endpoint
// to register a job as being a schedulable entity.
type JobRegisterRequest struct {
	Job *Job

	// If EnforceIndex is set then the job will only be registered if the passed
	// JobModifyIndex matches the current Jobs index. If the index is zero, the
	// register only occurs if the job is new.
	EnforceIndex   bool
	JobModifyIndex uint64

	WriteRequest
}

// JobDeregisterRequest is used for Job.Deregister endpoint
// to deregister a job as being a schedulable entity.
type JobDeregisterRequest struct {
	JobID string

	// Purge controls whether the deregister purges the job from the system or
	// whether the job is just marked as stopped and will be removed by the
	// garbage collector
	Purge bool

	WriteRequest
}

// JobEvaluateRequest is used when we just need to re-evaluate a target job
type JobEvaluateRequest struct {
	JobID string
	WriteRequest
}

// JobSpecificRequest is used when we just need to specify a target job
type JobSpecificRequest struct {
	JobID     string
	AllAllocs bool
	QueryOptions
}

// JobListRequest is used to parameterize a list request
type JobListRequest struct {
	QueryOptions
}

// JobPlanRequest is used for the Job.Plan endpoint to trigger a dry-run
// evaluation of the Job.
type JobPlanRequest struct {
	Job  *Job
	Diff bool // Toggles an annotated diff
	WriteRequest
}

// JobSummaryRequest is used when we just need to get a specific job summary
type JobSummaryRequest struct {
	JobID string
	QueryOptions
}

// JobDispatchRequest is used to dispatch a job based on a parameterized job
type JobDispatchRequest struct {
	JobID   string
	Payload []byte
	Meta    map[string]string
	WriteRequest
}

// JobValidateRequest is used to validate a job
type JobValidateRequest struct {
	Job *Job
	WriteRequest
}

// JobRevertRequest is used to revert a job to a prior version.
type JobRevertRequest struct {
	// JobID is the ID of the job being reverted
	JobID string

	// JobVersion is the version to revert to.
	JobVersion uint64

	// EnforcePriorVersion if set will enforce that the job is at the given
	// version before reverting.
	EnforcePriorVersion *uint64

	WriteRequest
}

// JobStabilityRequest is used to mark a job as stable.
type JobStabilityRequest struct {
	// Job to set the stability on
	JobID      string
	JobVersion uint64

	// Set the stability
	Stable bool
	WriteRequest
}

// JobStabilityResponse is the response when marking a job as stable.
type JobStabilityResponse struct {
	WriteMeta
}

// NodeListRequest is used to parameterize a list request
type NodeListRequest struct {
	QueryOptions
}

// EvalUpdateRequest is used for upserting evaluations.
type EvalUpdateRequest struct {
	Evals     []*Evaluation
	EvalToken string
	WriteRequest
}

// EvalDeleteRequest is used for deleting an evaluation.
type EvalDeleteRequest struct {
	Evals  []string
	Allocs []string
	WriteRequest
}

// EvalSpecificRequest is used when we just need to specify a target evaluation
type EvalSpecificRequest struct {
	EvalID string
	QueryOptions
}

// EvalAckRequest is used to Ack/Nack a specific evaluation
type EvalAckRequest struct {
	EvalID string
	Token  string
	WriteRequest
}

// EvalDequeueRequest is used when we want to dequeue an evaluation
type EvalDequeueRequest struct {
	Schedulers       []string
	Timeout          time.Duration
	SchedulerVersion uint16
	WriteRequest
}

// EvalListRequest is used to list the evaluations
type EvalListRequest struct {
	QueryOptions
}

// PlanRequest is used to submit an allocation plan to the leader
type PlanRequest struct {
	Plan *Plan
	WriteRequest
}

// ApplyPlanResultsRequest is used by the planner to apply a Raft transaction
// committing the result of a plan.
type ApplyPlanResultsRequest struct {
	// AllocUpdateRequest holds the allocation updates to be made by the
	// scheduler.
	AllocUpdateRequest

	// Deployment is the deployment created or updated as a result of a
	// scheduling event.
	Deployment *Deployment

	// DeploymentUpdates is a set of status updates to apply to the given
	// deployments. This allows the scheduler to cancel any unneeded deployment
	// because the job is stopped or the update block is removed.
	DeploymentUpdates []*DeploymentStatusUpdate
}

// AllocUpdateRequest is used to submit changes to allocations, either
// to cause evictions or to assign new allocations. Both can be done
// within a single transaction
type AllocUpdateRequest struct {
	// Alloc is the list of new allocations to assign
	Alloc []*Allocation

	// Job is the shared parent job of the allocations.
	// It is pulled out since it is common to reduce payload size.
	Job *Job

	WriteRequest
}

// AllocListRequest is used to request a list of allocations
type AllocListRequest struct {
	QueryOptions
}

// AllocSpecificRequest is used to query a specific allocation
type AllocSpecificRequest struct {
	AllocID string
	QueryOptions
}

// AllocsGetRequest is used to query a set of allocations
type AllocsGetRequest struct {
	AllocIDs []string
	QueryOptions
}

// PeriodicForceRequest is used to force a specific periodic job.
type PeriodicForceRequest struct {
	JobID string
	WriteRequest
}

// ServerMembersResponse has the list of servers in a cluster
type ServerMembersResponse struct {
	ServerName   string
	ServerRegion string
	ServerDC     string
	Members      []*ServerMember
}

// ServerMember holds information about a Nomad server agent in a cluster
type ServerMember struct {
	Name        string
	Addr        net.IP
	Port        uint16
	Tags        map[string]string
	Status      string
	ProtocolMin uint8
	ProtocolMax uint8
	ProtocolCur uint8
	DelegateMin uint8
	DelegateMax uint8
	DelegateCur uint8
}

// DeriveVaultTokenRequest is used to request wrapped Vault tokens for the
// following tasks in the given allocation
type DeriveVaultTokenRequest struct {
	NodeID   string
	SecretID string
	AllocID  string
	Tasks    []string
	QueryOptions
}

// VaultAccessorsRequest is used to operate on a set of Vault accessors
type VaultAccessorsRequest struct {
	Accessors []*VaultAccessor
}

// VaultAccessor is a reference to a created Vault token on behalf of
// an allocation's task.
type VaultAccessor struct {
	AllocID     string
	Task        string
	NodeID      string
	Accessor    string
	CreationTTL int

	// Raft Indexes
	CreateIndex uint64
}

// DeriveVaultTokenResponse returns the wrapped tokens for each requested task
type DeriveVaultTokenResponse struct {
	// Tasks is a mapping between the task name and the wrapped token
	Tasks map[string]string

	// Error stores any error that occurred. Errors are stored here so we can
	// communicate whether it is retriable
	Error *RecoverableError

	QueryMeta
}

// GenericRequest is used to request where no
// specific information is needed.
type GenericRequest struct {
	QueryOptions
}

// DeploymentListRequest is used to list the deployments
type DeploymentListRequest struct {
	QueryOptions
}

// DeploymentDeleteRequest is used for deleting deployments.
type DeploymentDeleteRequest struct {
	Deployments []string
	WriteRequest
}

// DeploymentStatusUpdateRequest is used to update the status of a deployment as
// well as optionally creating an evaluation atomically.
type DeploymentStatusUpdateRequest struct {
	// Eval, if set, is used to create an evaluation at the same time as
	// updating the status of a deployment.
	Eval *Evaluation

	// DeploymentUpdate is a status update to apply to the given
	// deployment.
	DeploymentUpdate *DeploymentStatusUpdate

	// Job is used to optionally upsert a job. This is used when setting the
	// allocation health results in a deployment failure and the deployment
	// auto-reverts to the latest stable job.
	Job *Job
}

// DeploymentAllocHealthRequest is used to set the health of a set of
// allocations as part of a deployment.
type DeploymentAllocHealthRequest struct {
	DeploymentID string

	// Marks these allocations as healthy, allowing further allocations
	// to be rolled.
	HealthyAllocationIDs []string

	// Any unhealthy allocations fail the deployment
	UnhealthyAllocationIDs []string

	WriteRequest
}

// ApplyDeploymentAllocHealthRequest is used to apply an alloc health request via Raft
type ApplyDeploymentAllocHealthRequest struct {
	DeploymentAllocHealthRequest

	// An optional field to update the status of a deployment
	DeploymentUpdate *DeploymentStatusUpdate

	// Job is used to optionally upsert a job. This is used when setting the
	// allocation health results in a deployment failure and the deployment
	// auto-reverts to the latest stable job.
	Job *Job

	// An optional evaluation to create after promoting the canaries
	Eval *Evaluation
}

// DeploymentPromoteRequest is used to promote task groups in a deployment
type DeploymentPromoteRequest struct {
	DeploymentID string

	// All is to promote all task groups
	All bool

	// Groups is used to set the promotion status per task group
	Groups []string

	WriteRequest
}

// ApplyDeploymentPromoteRequest is used to apply a promotion request via Raft
type ApplyDeploymentPromoteRequest struct {
	DeploymentPromoteRequest

	// An optional evaluation to create after promoting the canaries
	Eval *Evaluation
}

// DeploymentPauseRequest is used to pause a deployment
type DeploymentPauseRequest struct {
	DeploymentID string

	// Pause sets the pause status
	Pause bool

	WriteRequest
}

// DeploymentSpecificRequest is used to make a request specific to a particular
// deployment
type DeploymentSpecificRequest struct {
	DeploymentID string
	QueryOptions
}

// DeploymentFailRequest is used to fail a particular deployment
type DeploymentFailRequest struct {
	DeploymentID string
	WriteRequest
}

// SingleDeploymentResponse is used to respond with a single deployment
type SingleDeploymentResponse struct {
	Deployment *Deployment
	QueryMeta
}

// GenericResponse is used to respond to a request where no
// specific response information is needed.
type GenericResponse struct {
	WriteMeta
}

// VersionResponse is used for the Status.Version response
type VersionResponse struct {
	Build    string
	Versions map[string]int
	QueryMeta
}

// JobRegisterResponse is used to respond to a job registration
type JobRegisterResponse struct {
	EvalID          string
	EvalCreateIndex uint64
	JobModifyIndex  uint64

	// Warnings contains any warnings about the given job. These may include
	// deprecation warnings.
	Warnings string

	QueryMeta
}

// JobDeregisterResponse is used to respond to a job deregistration
type JobDeregisterResponse struct {
	EvalID          string
	EvalCreateIndex uint64
	JobModifyIndex  uint64
	QueryMeta
}

// JobValidateResponse is the response from validate request
type JobValidateResponse struct {
	// DriverConfigValidated indicates whether the agent validated the driver
	// config
	DriverConfigValidated bool

	// ValidationErrors is a list of validation errors
	ValidationErrors []string

	// Error is a string version of any error that may have occurred
	Error string

	// Warnings contains any warnings about the given job. These may include
	// deprecation warnings.
	Warnings string
}

// NodeUpdateResponse is used to respond to a node update
type NodeUpdateResponse struct {
	HeartbeatTTL    time.Duration
	EvalIDs         []string
	EvalCreateIndex uint64
	NodeModifyIndex uint64

	// LeaderRPCAddr is the RPC address of the current Raft Leader. If
	// empty, the current Nomad Server is in the minority of a partition.
	LeaderRPCAddr string

	// NumNodes is the number of Nomad nodes attached to this quorum of
	// Nomad Servers at the time of the response. This value can
	// fluctuate based on the health of the cluster between heartbeats.
	NumNodes int32

	// Servers is the full list of known Nomad servers in the local
	// region.
	Servers []*NodeServerInfo

	QueryMeta
}

// NodeDrainUpdateResponse is used to respond to a node drain update
type NodeDrainUpdateResponse struct {
	EvalIDs         []string
	EvalCreateIndex uint64
	NodeModifyIndex uint64
	QueryMeta
}

// NodeAllocsResponse is used to return allocs for a single node
type NodeAllocsResponse struct {
	Allocs []*Allocation
	QueryMeta
}

// NodeClientAllocsResponse is used to return allocs meta data for a single node
type NodeClientAllocsResponse struct {
	Allocs map[string]uint64
	QueryMeta
}

// SingleNodeResponse is used to return a single node
type SingleNodeResponse struct {
	Node *Node
	QueryMeta
}

// NodeListResponse is used for a list request
type NodeListResponse struct {
	Nodes []*NodeListStub
	QueryMeta
}

// SingleJobResponse is used to return a single job
type SingleJobResponse struct {
	Job *Job
	QueryMeta
}

// JobSummaryResponse is used to return a single job summary
type JobSummaryResponse struct {
	JobSummary *JobSummary
	QueryMeta
}

// JobDispatchResponse carries the dispatched job ID together with the
// evaluation ID and the evaluation/job create indexes.
type JobDispatchResponse struct {
	DispatchedJobID string
	EvalID          string
	EvalCreateIndex uint64
	JobCreateIndex  uint64
	WriteMeta
}

// JobListResponse is used for a list request
type JobListResponse struct {
	Jobs []*JobListStub
	QueryMeta
}

// JobVersionsRequest is used to get a job's versions
type JobVersionsRequest struct {
	JobID string
	Diffs bool
	QueryOptions
}

// JobVersionsResponse is used for a job get versions request
type JobVersionsResponse struct {
	Versions []*Job
	Diffs    []*JobDiff
	QueryMeta
}

// JobPlanResponse is used to respond to a job plan request
type JobPlanResponse struct {
	// Annotations stores annotations explaining decisions the scheduler made.
	Annotations *PlanAnnotations

	// FailedTGAllocs is the placement failures per task group.
	FailedTGAllocs map[string]*AllocMetric

	// JobModifyIndex is the modification index of the job. The value can be
	// used when running `nomad run` to ensure that the Job wasn't modified
	// since the last plan. If the job is being created, the value is zero.
	JobModifyIndex uint64

	// CreatedEvals is the set of evaluations created by the scheduler. The
	// reasons for this can be rolling-updates or blocked evals.
	CreatedEvals []*Evaluation

	// Diff contains the diff of the job and annotations on whether the change
	// causes an in-place update or create/destroy
	Diff *JobDiff

	// NextPeriodicLaunch is the time (a time.Time, not a duration) at which
	// the job would be launched if submitted.
	NextPeriodicLaunch time.Time

	// Warnings contains any warnings about the given job. These may include
	// deprecation warnings.
	Warnings string

	WriteMeta
}

// SingleAllocResponse is used to return a single allocation
type SingleAllocResponse struct {
	Alloc *Allocation
	QueryMeta
}

// AllocsGetResponse is used to return a set of allocations
type AllocsGetResponse struct {
	Allocs []*Allocation
	QueryMeta
}

// JobAllocationsResponse is used to return the allocations for a job
type JobAllocationsResponse struct {
	Allocations []*AllocListStub
	QueryMeta
}

// JobEvaluationsResponse is used to return the evaluations for a job
type JobEvaluationsResponse struct {
	Evaluations []*Evaluation
	QueryMeta
}

// SingleEvalResponse is used to return a single evaluation
type SingleEvalResponse struct {
	Eval *Evaluation
	QueryMeta
}

// EvalDequeueResponse is used to return from a dequeue
type EvalDequeueResponse struct {
	Eval  *Evaluation
	Token string
	QueryMeta
}

// PlanResponse is used to return from a PlanRequest
type PlanResponse struct {
	Result *PlanResult
	WriteMeta
}

// AllocListResponse is used for a list request
type AllocListResponse struct {
	Allocations []*AllocListStub
	QueryMeta
}

// DeploymentListResponse is used for a list request
type DeploymentListResponse struct {
	Deployments []*Deployment
	QueryMeta
}

// EvalListResponse is used for a list request
type EvalListResponse struct {
	Evaluations []*Evaluation
	QueryMeta
}

// EvalAllocationsResponse is used to return the allocations for an evaluation
type EvalAllocationsResponse struct {
	Allocations []*AllocListStub
	QueryMeta
}

// PeriodicForceResponse is used to respond to a periodic job force launch
type PeriodicForceResponse struct {
	EvalID          string
	EvalCreateIndex uint64
	WriteMeta
}

// DeploymentUpdateResponse is used to respond to a deployment change. The
// response will include the modify index of the deployment as well as details
// of any triggered evaluation.
type DeploymentUpdateResponse struct {
	EvalID                string
	EvalCreateIndex       uint64
	DeploymentModifyIndex uint64

	// RevertedJobVersion is the version the job was reverted to. If unset, the
	// job wasn't reverted
	RevertedJobVersion *uint64

	WriteMeta
}

// Node status values.
const (
	NodeStatusInit  = "initializing"
	NodeStatusReady = "ready"
	NodeStatusDown  = "down"
)

// ShouldDrainNode checks if a given node status should trigger an
// evaluation. Some states don't require any further action.
//
// It returns true only for NodeStatusDown and false for NodeStatusInit and
// NodeStatusReady. Any other status causes a panic, so callers holding an
// unvalidated status should check it with ValidNodeStatus first.
func ShouldDrainNode(status string) bool {
	switch status {
	case NodeStatusInit, NodeStatusReady:
		return false
	case NodeStatusDown:
		return true
	default:
		panic(fmt.Sprintf("unhandled node status %s", status))
	}
}

// ValidNodeStatus is used to check if a node status is valid, i.e. one of
// NodeStatusInit, NodeStatusReady or NodeStatusDown.
func ValidNodeStatus(status string) bool {
	switch status {
	case NodeStatusInit, NodeStatusReady, NodeStatusDown:
		return true
	default:
		return false
	}
}

// Node is a representation of a schedulable client node
type Node struct {
	// ID is a unique identifier for the node. It can be constructed
	// by doing a concatenation of the Name and Datacenter as a simple
	// approach. Alternatively a UUID may be used.
	ID string

	// SecretID is an ID that is only known by the Node and the set of Servers.
	// It is not accessible via the API and is used to authenticate nodes
	// conducting privileged activities.
	SecretID string

	// Datacenter for this node
	Datacenter string

	// Node name
	Name string

	// HTTPAddr is the address on which the Nomad client is listening for http
	// requests
	HTTPAddr string

	// TLSEnabled indicates if the Agent has TLS enabled for the HTTP API
	TLSEnabled bool

	// Attributes is an arbitrary set of key/value
	// data that can be used for constraints. Examples
	// include "kernel.name=linux", "arch=386", "driver.docker=1",
	// "docker.runtime=1.8.3"
	Attributes map[string]string

	// Resources is the available resources on the client.
	// For example 'cpu=2' 'memory=2048'
	Resources *Resources

	// Reserved is the set of resources that are reserved,
	// and should be subtracted from the total resources for
	// the purposes of scheduling. This may be to provide certain
	// high-watermark tolerances or because of external schedulers
	// consuming resources.
	Reserved *Resources

	// Links are used to 'link' this client to external
	// systems. For example 'consul=foo.dc1' 'aws=i-83212'
	// 'ami=ami-123'
	Links map[string]string

	// Meta is used to associate arbitrary metadata with this
	// client. This is opaque to Nomad.
	Meta map[string]string

	// NodeClass is an opaque identifier used to group nodes
	// together for the purpose of determining scheduling pressure.
	NodeClass string

	// ComputedClass is a unique id that identifies nodes with a common set of
	// attributes and capabilities.
	ComputedClass string

	// Drain is controlled by the servers, and not the client.
	// If true, no jobs will be scheduled to this node, and existing
	// allocations will be drained.
	Drain bool

	// Status of this node
	Status string

	// StatusDescription is meant to provide more human useful information
	StatusDescription string

	// StatusUpdatedAt is the time stamp at which the state of the node was
	// updated
	StatusUpdatedAt int64

	// Raft Indexes
	CreateIndex uint64
	ModifyIndex uint64
}

// Ready returns if the node is ready for running allocations, i.e. its status
// is NodeStatusReady and it is not draining.
func (n *Node) Ready() bool {
	return n.Status == NodeStatusReady && !n.Drain
}

// Copy returns a copy of the node, or nil when the receiver is nil. The
// Attributes, Links and Meta maps are duplicated with
// helper.CopyMapStringString, and Resources and Reserved with Resources.Copy,
// so the copy does not share them with the original.
func (n *Node) Copy() *Node {
	if n == nil {
		return nil
	}
	nn := new(Node)
	*nn = *n
	nn.Attributes = helper.CopyMapStringString(nn.Attributes)
	nn.Resources = nn.Resources.Copy()
	nn.Reserved = nn.Reserved.Copy()
	nn.Links = helper.CopyMapStringString(nn.Links)
	nn.Meta = helper.CopyMapStringString(nn.Meta)
	return nn
}

// TerminalStatus returns if the current status is terminal and
// will no longer transition. Only NodeStatusDown is terminal.
func (n *Node) TerminalStatus() bool {
	switch n.Status {
	case NodeStatusDown:
		return true
	default:
		return false
	}
}

// Stub returns a summarized version of the node. The receiver is
// dereferenced unconditionally, so it must not be nil.
func (n *Node) Stub() *NodeListStub {
	return &NodeListStub{
		ID:                n.ID,
		Datacenter:        n.Datacenter,
		Name:              n.Name,
		NodeClass:         n.NodeClass,
		Drain:             n.Drain,
		Status:            n.Status,
		StatusDescription: n.StatusDescription,
		CreateIndex:       n.CreateIndex,
		ModifyIndex:       n.ModifyIndex,
	}
}

// NodeListStub is used to return a subset of node information
// for the node list
type NodeListStub struct {
	ID                string
	Datacenter        string
	Name              string
	NodeClass         string
	Drain             bool
	Status            string
	StatusDescription string
	CreateIndex       uint64
	ModifyIndex       uint64
}

// Networks defined for a task on the Resources struct.
1056 type Networks []*NetworkResource 1057 1058 // Port assignment and IP for the given label or empty values. 1059 func (ns Networks) Port(label string) (string, int) { 1060 for _, n := range ns { 1061 for _, p := range n.ReservedPorts { 1062 if p.Label == label { 1063 return n.IP, p.Value 1064 } 1065 } 1066 for _, p := range n.DynamicPorts { 1067 if p.Label == label { 1068 return n.IP, p.Value 1069 } 1070 } 1071 } 1072 return "", 0 1073 } 1074 1075 // Resources is used to define the resources available 1076 // on a client 1077 type Resources struct { 1078 CPU int 1079 MemoryMB int 1080 DiskMB int 1081 IOPS int 1082 Networks Networks 1083 } 1084 1085 const ( 1086 BytesInMegabyte = 1024 * 1024 1087 ) 1088 1089 // DefaultResources returns the default resources for a task. 1090 func DefaultResources() *Resources { 1091 return &Resources{ 1092 CPU: 100, 1093 MemoryMB: 10, 1094 IOPS: 0, 1095 } 1096 } 1097 1098 // DiskInBytes returns the amount of disk resources in bytes. 1099 func (r *Resources) DiskInBytes() int64 { 1100 return int64(r.DiskMB * BytesInMegabyte) 1101 } 1102 1103 // Merge merges this resource with another resource. 1104 func (r *Resources) Merge(other *Resources) { 1105 if other.CPU != 0 { 1106 r.CPU = other.CPU 1107 } 1108 if other.MemoryMB != 0 { 1109 r.MemoryMB = other.MemoryMB 1110 } 1111 if other.DiskMB != 0 { 1112 r.DiskMB = other.DiskMB 1113 } 1114 if other.IOPS != 0 { 1115 r.IOPS = other.IOPS 1116 } 1117 if len(other.Networks) != 0 { 1118 r.Networks = other.Networks 1119 } 1120 } 1121 1122 func (r *Resources) Canonicalize() { 1123 // Ensure that an empty and nil slices are treated the same to avoid scheduling 1124 // problems since we use reflect DeepEquals. 1125 if len(r.Networks) == 0 { 1126 r.Networks = nil 1127 } 1128 1129 for _, n := range r.Networks { 1130 n.Canonicalize() 1131 } 1132 } 1133 1134 // MeetsMinResources returns an error if the resources specified are less than 1135 // the minimum allowed. 
1136 func (r *Resources) MeetsMinResources() error { 1137 var mErr multierror.Error 1138 if r.CPU < 20 { 1139 mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum CPU value is 20; got %d", r.CPU)) 1140 } 1141 if r.MemoryMB < 10 { 1142 mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum MemoryMB value is 10; got %d", r.MemoryMB)) 1143 } 1144 if r.IOPS < 0 { 1145 mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum IOPS value is 0; got %d", r.IOPS)) 1146 } 1147 for i, n := range r.Networks { 1148 if err := n.MeetsMinResources(); err != nil { 1149 mErr.Errors = append(mErr.Errors, fmt.Errorf("network resource at index %d failed: %v", i, err)) 1150 } 1151 } 1152 1153 return mErr.ErrorOrNil() 1154 } 1155 1156 // Copy returns a deep copy of the resources 1157 func (r *Resources) Copy() *Resources { 1158 if r == nil { 1159 return nil 1160 } 1161 newR := new(Resources) 1162 *newR = *r 1163 if r.Networks != nil { 1164 n := len(r.Networks) 1165 newR.Networks = make([]*NetworkResource, n) 1166 for i := 0; i < n; i++ { 1167 newR.Networks[i] = r.Networks[i].Copy() 1168 } 1169 } 1170 return newR 1171 } 1172 1173 // NetIndex finds the matching net index using device name 1174 func (r *Resources) NetIndex(n *NetworkResource) int { 1175 for idx, net := range r.Networks { 1176 if net.Device == n.Device { 1177 return idx 1178 } 1179 } 1180 return -1 1181 } 1182 1183 // Superset checks if one set of resources is a superset 1184 // of another. This ignores network resources, and the NetworkIndex 1185 // should be used for that. 
1186 func (r *Resources) Superset(other *Resources) (bool, string) { 1187 if r.CPU < other.CPU { 1188 return false, "cpu exhausted" 1189 } 1190 if r.MemoryMB < other.MemoryMB { 1191 return false, "memory exhausted" 1192 } 1193 if r.DiskMB < other.DiskMB { 1194 return false, "disk exhausted" 1195 } 1196 if r.IOPS < other.IOPS { 1197 return false, "iops exhausted" 1198 } 1199 return true, "" 1200 } 1201 1202 // Add adds the resources of the delta to this, potentially 1203 // returning an error if not possible. 1204 func (r *Resources) Add(delta *Resources) error { 1205 if delta == nil { 1206 return nil 1207 } 1208 r.CPU += delta.CPU 1209 r.MemoryMB += delta.MemoryMB 1210 r.DiskMB += delta.DiskMB 1211 r.IOPS += delta.IOPS 1212 1213 for _, n := range delta.Networks { 1214 // Find the matching interface by IP or CIDR 1215 idx := r.NetIndex(n) 1216 if idx == -1 { 1217 r.Networks = append(r.Networks, n.Copy()) 1218 } else { 1219 r.Networks[idx].Add(n) 1220 } 1221 } 1222 return nil 1223 } 1224 1225 func (r *Resources) GoString() string { 1226 return fmt.Sprintf("*%#v", *r) 1227 } 1228 1229 type Port struct { 1230 Label string 1231 Value int 1232 } 1233 1234 // NetworkResource is used to represent available network 1235 // resources 1236 type NetworkResource struct { 1237 Device string // Name of the device 1238 CIDR string // CIDR block of addresses 1239 IP string // Host IP address 1240 MBits int // Throughput 1241 ReservedPorts []Port // Host Reserved ports 1242 DynamicPorts []Port // Host Dynamically assigned ports 1243 } 1244 1245 func (n *NetworkResource) Canonicalize() { 1246 // Ensure that an empty and nil slices are treated the same to avoid scheduling 1247 // problems since we use reflect DeepEquals. 1248 if len(n.ReservedPorts) == 0 { 1249 n.ReservedPorts = nil 1250 } 1251 if len(n.DynamicPorts) == 0 { 1252 n.DynamicPorts = nil 1253 } 1254 } 1255 1256 // MeetsMinResources returns an error if the resources specified are less than 1257 // the minimum allowed. 
1258 func (n *NetworkResource) MeetsMinResources() error { 1259 var mErr multierror.Error 1260 if n.MBits < 1 { 1261 mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum MBits value is 1; got %d", n.MBits)) 1262 } 1263 return mErr.ErrorOrNil() 1264 } 1265 1266 // Copy returns a deep copy of the network resource 1267 func (n *NetworkResource) Copy() *NetworkResource { 1268 if n == nil { 1269 return nil 1270 } 1271 newR := new(NetworkResource) 1272 *newR = *n 1273 if n.ReservedPorts != nil { 1274 newR.ReservedPorts = make([]Port, len(n.ReservedPorts)) 1275 copy(newR.ReservedPorts, n.ReservedPorts) 1276 } 1277 if n.DynamicPorts != nil { 1278 newR.DynamicPorts = make([]Port, len(n.DynamicPorts)) 1279 copy(newR.DynamicPorts, n.DynamicPorts) 1280 } 1281 return newR 1282 } 1283 1284 // Add adds the resources of the delta to this, potentially 1285 // returning an error if not possible. 1286 func (n *NetworkResource) Add(delta *NetworkResource) { 1287 if len(delta.ReservedPorts) > 0 { 1288 n.ReservedPorts = append(n.ReservedPorts, delta.ReservedPorts...) 1289 } 1290 n.MBits += delta.MBits 1291 n.DynamicPorts = append(n.DynamicPorts, delta.DynamicPorts...) 1292 } 1293 1294 func (n *NetworkResource) GoString() string { 1295 return fmt.Sprintf("*%#v", *n) 1296 } 1297 1298 // PortLabels returns a map of port labels to their assigned host ports. 1299 func (n *NetworkResource) PortLabels() map[string]int { 1300 num := len(n.ReservedPorts) + len(n.DynamicPorts) 1301 labelValues := make(map[string]int, num) 1302 for _, port := range n.ReservedPorts { 1303 labelValues[port.Label] = port.Value 1304 } 1305 for _, port := range n.DynamicPorts { 1306 labelValues[port.Label] = port.Value 1307 } 1308 return labelValues 1309 } 1310 1311 const ( 1312 // JobTypeNomad is reserved for internal system tasks and is 1313 // always handled by the CoreScheduler. 
1314 JobTypeCore = "_core" 1315 JobTypeService = "service" 1316 JobTypeBatch = "batch" 1317 JobTypeSystem = "system" 1318 ) 1319 1320 const ( 1321 JobStatusPending = "pending" // Pending means the job is waiting on scheduling 1322 JobStatusRunning = "running" // Running means the job has non-terminal allocations 1323 JobStatusDead = "dead" // Dead means all evaluation's and allocations are terminal 1324 ) 1325 1326 const ( 1327 // JobMinPriority is the minimum allowed priority 1328 JobMinPriority = 1 1329 1330 // JobDefaultPriority is the default priority if not 1331 // not specified. 1332 JobDefaultPriority = 50 1333 1334 // JobMaxPriority is the maximum allowed priority 1335 JobMaxPriority = 100 1336 1337 // Ensure CoreJobPriority is higher than any user 1338 // specified job so that it gets priority. This is important 1339 // for the system to remain healthy. 1340 CoreJobPriority = JobMaxPriority * 2 1341 1342 // JobTrackedVersions is the number of historic job versions that are 1343 // kept. 1344 JobTrackedVersions = 6 1345 ) 1346 1347 // Job is the scope of a scheduling request to Nomad. It is the largest 1348 // scoped object, and is a named collection of task groups. Each task group 1349 // is further composed of tasks. A task group (TG) is the unit of scheduling 1350 // however. 1351 type Job struct { 1352 // Stop marks whether the user has stopped the job. A stopped job will 1353 // have all created allocations stopped and acts as a way to stop a job 1354 // without purging it from the system. This allows existing allocs to be 1355 // queried and the job to be inspected as it is being killed. 1356 Stop bool 1357 1358 // Region is the Nomad region that handles scheduling this job 1359 Region string 1360 1361 // ID is a unique identifier for the job per region. It can be 1362 // specified hierarchically like LineOfBiz/OrgName/Team/Project 1363 ID string 1364 1365 // ParentID is the unique identifier of the job that spawned this job. 
1366 ParentID string 1367 1368 // Name is the logical name of the job used to refer to it. This is unique 1369 // per region, but not unique globally. 1370 Name string 1371 1372 // Type is used to control various behaviors about the job. Most jobs 1373 // are service jobs, meaning they are expected to be long lived. 1374 // Some jobs are batch oriented meaning they run and then terminate. 1375 // This can be extended in the future to support custom schedulers. 1376 Type string 1377 1378 // Priority is used to control scheduling importance and if this job 1379 // can preempt other jobs. 1380 Priority int 1381 1382 // AllAtOnce is used to control if incremental scheduling of task groups 1383 // is allowed or if we must do a gang scheduling of the entire job. This 1384 // can slow down larger jobs if resources are not available. 1385 AllAtOnce bool 1386 1387 // Datacenters contains all the datacenters this job is allowed to span 1388 Datacenters []string 1389 1390 // Constraints can be specified at a job level and apply to 1391 // all the task groups and tasks. 1392 Constraints []*Constraint 1393 1394 // TaskGroups are the collections of task groups that this job needs 1395 // to run. Each task group is an atomic unit of scheduling and placement. 1396 TaskGroups []*TaskGroup 1397 1398 // COMPAT: Remove in 0.7.0. Stagger is deprecated in 0.6.0. 1399 Update UpdateStrategy 1400 1401 // Periodic is used to define the interval the job is run at. 1402 Periodic *PeriodicConfig 1403 1404 // ParameterizedJob is used to specify the job as a parameterized job 1405 // for dispatching. 1406 ParameterizedJob *ParameterizedJobConfig 1407 1408 // Payload is the payload supplied when the job was dispatched. 1409 Payload []byte 1410 1411 // Meta is used to associate arbitrary metadata with this 1412 // job. This is opaque to Nomad. 
1413 Meta map[string]string 1414 1415 // VaultToken is the Vault token that proves the submitter of the job has 1416 // access to the specified Vault policies. This field is only used to 1417 // transfer the token and is not stored after Job submission. 1418 VaultToken string 1419 1420 // Job status 1421 Status string 1422 1423 // StatusDescription is meant to provide more human useful information 1424 StatusDescription string 1425 1426 // Stable marks a job as stable. Stability is only defined on "service" and 1427 // "system" jobs. The stability of a job will be set automatically as part 1428 // of a deployment and can be manually set via APIs. 1429 Stable bool 1430 1431 // Version is a monitonically increasing version number that is incremened 1432 // on each job register. 1433 Version uint64 1434 1435 // SubmitTime is the time at which the job was submitted as a UnixNano in 1436 // UTC 1437 SubmitTime int64 1438 1439 // Raft Indexes 1440 CreateIndex uint64 1441 ModifyIndex uint64 1442 JobModifyIndex uint64 1443 } 1444 1445 // Canonicalize is used to canonicalize fields in the Job. This should be called 1446 // when registering a Job. A set of warnings are returned if the job was changed 1447 // in anyway that the user should be made aware of. 1448 func (j *Job) Canonicalize() (warnings error) { 1449 var mErr multierror.Error 1450 // Ensure that an empty and nil map are treated the same to avoid scheduling 1451 // problems since we use reflect DeepEquals. 1452 if len(j.Meta) == 0 { 1453 j.Meta = nil 1454 } 1455 1456 for _, tg := range j.TaskGroups { 1457 tg.Canonicalize(j) 1458 } 1459 1460 if j.ParameterizedJob != nil { 1461 j.ParameterizedJob.Canonicalize() 1462 } 1463 1464 if j.Periodic != nil { 1465 j.Periodic.Canonicalize() 1466 } 1467 1468 // COMPAT: Remove in 0.7.0 1469 // Rewrite any job that has an update block with pre 0.6.0 syntax. 
1470 jobHasOldUpdate := j.Update.Stagger > 0 && j.Update.MaxParallel > 0 1471 if jobHasOldUpdate && j.Type != JobTypeBatch { 1472 // Build an appropriate update block and copy it down to each task group 1473 base := DefaultUpdateStrategy.Copy() 1474 base.MaxParallel = j.Update.MaxParallel 1475 base.MinHealthyTime = j.Update.Stagger 1476 1477 // Add to each task group, modifying as needed 1478 upgraded := false 1479 l := len(j.TaskGroups) 1480 for _, tg := range j.TaskGroups { 1481 // The task group doesn't need upgrading if it has an update block with the new syntax 1482 u := tg.Update 1483 if u != nil && u.Stagger > 0 && u.MaxParallel > 0 && 1484 u.HealthCheck != "" && u.MinHealthyTime > 0 && u.HealthyDeadline > 0 { 1485 continue 1486 } 1487 1488 upgraded = true 1489 1490 // The MaxParallel for the job should be 10% of the total count 1491 // unless there is just one task group then we can infer the old 1492 // max parallel should be the new 1493 tgu := base.Copy() 1494 if l != 1 { 1495 // RoundTo 10% 1496 var percent float64 = float64(tg.Count) * 0.1 1497 tgu.MaxParallel = int(percent + 0.5) 1498 } 1499 1500 // Safety guards 1501 if tgu.MaxParallel == 0 { 1502 tgu.MaxParallel = 1 1503 } else if tgu.MaxParallel > tg.Count { 1504 tgu.MaxParallel = tg.Count 1505 } 1506 1507 tg.Update = tgu 1508 } 1509 1510 if upgraded { 1511 w := "A best effort conversion to new update stanza introduced in v0.6.0 applied. " + 1512 "Please update upgrade stanza before v0.7.0." 1513 multierror.Append(&mErr, fmt.Errorf(w)) 1514 } 1515 } 1516 1517 // Ensure that the batch job doesn't have new style or old style update 1518 // stanza. Unfortunately are scanning here because we have to deprecate over 1519 // a release so we can't check in the task group since that may be new style 1520 // but wouldn't capture the old style and we don't want to have duplicate 1521 // warnings. 
1522 if j.Type == JobTypeBatch { 1523 displayWarning := jobHasOldUpdate 1524 j.Update.Stagger = 0 1525 j.Update.MaxParallel = 0 1526 j.Update.HealthCheck = "" 1527 j.Update.MinHealthyTime = 0 1528 j.Update.HealthyDeadline = 0 1529 j.Update.AutoRevert = false 1530 j.Update.Canary = 0 1531 1532 // Remove any update spec from the task groups 1533 for _, tg := range j.TaskGroups { 1534 if tg.Update != nil { 1535 displayWarning = true 1536 tg.Update = nil 1537 } 1538 } 1539 1540 if displayWarning { 1541 w := "Update stanza is disallowed for batch jobs since v0.6.0. " + 1542 "The update block has automatically been removed" 1543 multierror.Append(&mErr, fmt.Errorf(w)) 1544 } 1545 } 1546 1547 return mErr.ErrorOrNil() 1548 } 1549 1550 // Copy returns a deep copy of the Job. It is expected that callers use recover. 1551 // This job can panic if the deep copy failed as it uses reflection. 1552 func (j *Job) Copy() *Job { 1553 if j == nil { 1554 return nil 1555 } 1556 nj := new(Job) 1557 *nj = *j 1558 nj.Datacenters = helper.CopySliceString(nj.Datacenters) 1559 nj.Constraints = CopySliceConstraints(nj.Constraints) 1560 1561 if j.TaskGroups != nil { 1562 tgs := make([]*TaskGroup, len(nj.TaskGroups)) 1563 for i, tg := range nj.TaskGroups { 1564 tgs[i] = tg.Copy() 1565 } 1566 nj.TaskGroups = tgs 1567 } 1568 1569 nj.Periodic = nj.Periodic.Copy() 1570 nj.Meta = helper.CopyMapStringString(nj.Meta) 1571 nj.ParameterizedJob = nj.ParameterizedJob.Copy() 1572 return nj 1573 } 1574 1575 // Validate is used to sanity check a job input 1576 func (j *Job) Validate() error { 1577 var mErr multierror.Error 1578 1579 if j.Region == "" { 1580 mErr.Errors = append(mErr.Errors, errors.New("Missing job region")) 1581 } 1582 if j.ID == "" { 1583 mErr.Errors = append(mErr.Errors, errors.New("Missing job ID")) 1584 } else if strings.Contains(j.ID, " ") { 1585 mErr.Errors = append(mErr.Errors, errors.New("Job ID contains a space")) 1586 } 1587 if j.Name == "" { 1588 mErr.Errors = append(mErr.Errors, 
errors.New("Missing job name")) 1589 } 1590 switch j.Type { 1591 case JobTypeCore, JobTypeService, JobTypeBatch, JobTypeSystem: 1592 case "": 1593 mErr.Errors = append(mErr.Errors, errors.New("Missing job type")) 1594 default: 1595 mErr.Errors = append(mErr.Errors, fmt.Errorf("Invalid job type: %q", j.Type)) 1596 } 1597 if j.Priority < JobMinPriority || j.Priority > JobMaxPriority { 1598 mErr.Errors = append(mErr.Errors, fmt.Errorf("Job priority must be between [%d, %d]", JobMinPriority, JobMaxPriority)) 1599 } 1600 if len(j.Datacenters) == 0 { 1601 mErr.Errors = append(mErr.Errors, errors.New("Missing job datacenters")) 1602 } 1603 if len(j.TaskGroups) == 0 { 1604 mErr.Errors = append(mErr.Errors, errors.New("Missing job task groups")) 1605 } 1606 for idx, constr := range j.Constraints { 1607 if err := constr.Validate(); err != nil { 1608 outer := fmt.Errorf("Constraint %d validation failed: %s", idx+1, err) 1609 mErr.Errors = append(mErr.Errors, outer) 1610 } 1611 } 1612 1613 // Check for duplicate task groups 1614 taskGroups := make(map[string]int) 1615 for idx, tg := range j.TaskGroups { 1616 if tg.Name == "" { 1617 mErr.Errors = append(mErr.Errors, fmt.Errorf("Job task group %d missing name", idx+1)) 1618 } else if existing, ok := taskGroups[tg.Name]; ok { 1619 mErr.Errors = append(mErr.Errors, fmt.Errorf("Job task group %d redefines '%s' from group %d", idx+1, tg.Name, existing+1)) 1620 } else { 1621 taskGroups[tg.Name] = idx 1622 } 1623 1624 if j.Type == "system" && tg.Count > 1 { 1625 mErr.Errors = append(mErr.Errors, 1626 fmt.Errorf("Job task group %s has count %d. 
Count cannot exceed 1 with system scheduler", 1627 tg.Name, tg.Count)) 1628 } 1629 } 1630 1631 // Validate the task group 1632 for _, tg := range j.TaskGroups { 1633 if err := tg.Validate(j); err != nil { 1634 outer := fmt.Errorf("Task group %s validation failed: %v", tg.Name, err) 1635 mErr.Errors = append(mErr.Errors, outer) 1636 } 1637 } 1638 1639 // Validate periodic is only used with batch jobs. 1640 if j.IsPeriodic() && j.Periodic.Enabled { 1641 if j.Type != JobTypeBatch { 1642 mErr.Errors = append(mErr.Errors, 1643 fmt.Errorf("Periodic can only be used with %q scheduler", JobTypeBatch)) 1644 } 1645 1646 if err := j.Periodic.Validate(); err != nil { 1647 mErr.Errors = append(mErr.Errors, err) 1648 } 1649 } 1650 1651 if j.IsParameterized() { 1652 if j.Type != JobTypeBatch { 1653 mErr.Errors = append(mErr.Errors, 1654 fmt.Errorf("Parameterized job can only be used with %q scheduler", JobTypeBatch)) 1655 } 1656 1657 if err := j.ParameterizedJob.Validate(); err != nil { 1658 mErr.Errors = append(mErr.Errors, err) 1659 } 1660 } 1661 1662 return mErr.ErrorOrNil() 1663 } 1664 1665 // Warnings returns a list of warnings that may be from dubious settings or 1666 // deprecation warnings. 1667 func (j *Job) Warnings() error { 1668 var mErr multierror.Error 1669 1670 // Check the groups 1671 for _, tg := range j.TaskGroups { 1672 if err := tg.Warnings(j); err != nil { 1673 outer := fmt.Errorf("Group %q has warnings: %v", tg.Name, err) 1674 mErr.Errors = append(mErr.Errors, outer) 1675 } 1676 } 1677 1678 return mErr.ErrorOrNil() 1679 } 1680 1681 // LookupTaskGroup finds a task group by name 1682 func (j *Job) LookupTaskGroup(name string) *TaskGroup { 1683 for _, tg := range j.TaskGroups { 1684 if tg.Name == name { 1685 return tg 1686 } 1687 } 1688 return nil 1689 } 1690 1691 // CombinedTaskMeta takes a TaskGroup and Task name and returns the combined 1692 // meta data for the task. 
When joining Job, Group and Task Meta, the precedence 1693 // is by deepest scope (Task > Group > Job). 1694 func (j *Job) CombinedTaskMeta(groupName, taskName string) map[string]string { 1695 group := j.LookupTaskGroup(groupName) 1696 if group == nil { 1697 return nil 1698 } 1699 1700 task := group.LookupTask(taskName) 1701 if task == nil { 1702 return nil 1703 } 1704 1705 meta := helper.CopyMapStringString(task.Meta) 1706 if meta == nil { 1707 meta = make(map[string]string, len(group.Meta)+len(j.Meta)) 1708 } 1709 1710 // Add the group specific meta 1711 for k, v := range group.Meta { 1712 if _, ok := meta[k]; !ok { 1713 meta[k] = v 1714 } 1715 } 1716 1717 // Add the job specific meta 1718 for k, v := range j.Meta { 1719 if _, ok := meta[k]; !ok { 1720 meta[k] = v 1721 } 1722 } 1723 1724 return meta 1725 } 1726 1727 // Stopped returns if a job is stopped. 1728 func (j *Job) Stopped() bool { 1729 return j == nil || j.Stop 1730 } 1731 1732 // HasUpdateStrategy returns if any task group in the job has an update strategy 1733 func (j *Job) HasUpdateStrategy() bool { 1734 for _, tg := range j.TaskGroups { 1735 if tg.Update != nil { 1736 return true 1737 } 1738 } 1739 1740 return false 1741 } 1742 1743 // Stub is used to return a summary of the job 1744 func (j *Job) Stub(summary *JobSummary) *JobListStub { 1745 return &JobListStub{ 1746 ID: j.ID, 1747 ParentID: j.ParentID, 1748 Name: j.Name, 1749 Type: j.Type, 1750 Priority: j.Priority, 1751 Periodic: j.IsPeriodic(), 1752 ParameterizedJob: j.IsParameterized(), 1753 Stop: j.Stop, 1754 Status: j.Status, 1755 StatusDescription: j.StatusDescription, 1756 CreateIndex: j.CreateIndex, 1757 ModifyIndex: j.ModifyIndex, 1758 JobModifyIndex: j.JobModifyIndex, 1759 SubmitTime: j.SubmitTime, 1760 JobSummary: summary, 1761 } 1762 } 1763 1764 // IsPeriodic returns whether a job is periodic. 
1765 func (j *Job) IsPeriodic() bool { 1766 return j.Periodic != nil 1767 } 1768 1769 // IsParameterized returns whether a job is parameterized job. 1770 func (j *Job) IsParameterized() bool { 1771 return j.ParameterizedJob != nil 1772 } 1773 1774 // VaultPolicies returns the set of Vault policies per task group, per task 1775 func (j *Job) VaultPolicies() map[string]map[string]*Vault { 1776 policies := make(map[string]map[string]*Vault, len(j.TaskGroups)) 1777 1778 for _, tg := range j.TaskGroups { 1779 tgPolicies := make(map[string]*Vault, len(tg.Tasks)) 1780 1781 for _, task := range tg.Tasks { 1782 if task.Vault == nil { 1783 continue 1784 } 1785 1786 tgPolicies[task.Name] = task.Vault 1787 } 1788 1789 if len(tgPolicies) != 0 { 1790 policies[tg.Name] = tgPolicies 1791 } 1792 } 1793 1794 return policies 1795 } 1796 1797 // RequiredSignals returns a mapping of task groups to tasks to their required 1798 // set of signals 1799 func (j *Job) RequiredSignals() map[string]map[string][]string { 1800 signals := make(map[string]map[string][]string) 1801 1802 for _, tg := range j.TaskGroups { 1803 for _, task := range tg.Tasks { 1804 // Use this local one as a set 1805 taskSignals := make(map[string]struct{}) 1806 1807 // Check if the Vault change mode uses signals 1808 if task.Vault != nil && task.Vault.ChangeMode == VaultChangeModeSignal { 1809 taskSignals[task.Vault.ChangeSignal] = struct{}{} 1810 } 1811 1812 // Check if any template change mode uses signals 1813 for _, t := range task.Templates { 1814 if t.ChangeMode != TemplateChangeModeSignal { 1815 continue 1816 } 1817 1818 taskSignals[t.ChangeSignal] = struct{}{} 1819 } 1820 1821 // Flatten and sort the signals 1822 l := len(taskSignals) 1823 if l == 0 { 1824 continue 1825 } 1826 1827 flat := make([]string, 0, l) 1828 for sig := range taskSignals { 1829 flat = append(flat, sig) 1830 } 1831 1832 sort.Strings(flat) 1833 tgSignals, ok := signals[tg.Name] 1834 if !ok { 1835 tgSignals = make(map[string][]string) 1836 
signals[tg.Name] = tgSignals 1837 } 1838 tgSignals[task.Name] = flat 1839 } 1840 1841 } 1842 1843 return signals 1844 } 1845 1846 // SpecChanged determines if the functional specification has changed between 1847 // two job versions. 1848 func (j *Job) SpecChanged(new *Job) bool { 1849 if j == nil { 1850 return new != nil 1851 } 1852 1853 // Create a copy of the new job 1854 c := new.Copy() 1855 1856 // Update the new job so we can do a reflect 1857 c.Status = j.Status 1858 c.StatusDescription = j.StatusDescription 1859 c.Stable = j.Stable 1860 c.Version = j.Version 1861 c.CreateIndex = j.CreateIndex 1862 c.ModifyIndex = j.ModifyIndex 1863 c.JobModifyIndex = j.JobModifyIndex 1864 c.SubmitTime = j.SubmitTime 1865 1866 // Deep equals the jobs 1867 return !reflect.DeepEqual(j, c) 1868 } 1869 1870 func (j *Job) SetSubmitTime() { 1871 j.SubmitTime = time.Now().UTC().UnixNano() 1872 } 1873 1874 // JobListStub is used to return a subset of job information 1875 // for the job list 1876 type JobListStub struct { 1877 ID string 1878 ParentID string 1879 Name string 1880 Type string 1881 Priority int 1882 Periodic bool 1883 ParameterizedJob bool 1884 Stop bool 1885 Status string 1886 StatusDescription string 1887 JobSummary *JobSummary 1888 CreateIndex uint64 1889 ModifyIndex uint64 1890 JobModifyIndex uint64 1891 SubmitTime int64 1892 } 1893 1894 // JobSummary summarizes the state of the allocations of a job 1895 type JobSummary struct { 1896 JobID string 1897 1898 // Summmary contains the summary per task group for the Job 1899 Summary map[string]TaskGroupSummary 1900 1901 // Children contains a summary for the children of this job. 
1902 Children *JobChildrenSummary 1903 1904 // Raft Indexes 1905 CreateIndex uint64 1906 ModifyIndex uint64 1907 } 1908 1909 // Copy returns a new copy of JobSummary 1910 func (js *JobSummary) Copy() *JobSummary { 1911 newJobSummary := new(JobSummary) 1912 *newJobSummary = *js 1913 newTGSummary := make(map[string]TaskGroupSummary, len(js.Summary)) 1914 for k, v := range js.Summary { 1915 newTGSummary[k] = v 1916 } 1917 newJobSummary.Summary = newTGSummary 1918 newJobSummary.Children = newJobSummary.Children.Copy() 1919 return newJobSummary 1920 } 1921 1922 // JobChildrenSummary contains the summary of children job statuses 1923 type JobChildrenSummary struct { 1924 Pending int64 1925 Running int64 1926 Dead int64 1927 } 1928 1929 // Copy returns a new copy of a JobChildrenSummary 1930 func (jc *JobChildrenSummary) Copy() *JobChildrenSummary { 1931 if jc == nil { 1932 return nil 1933 } 1934 1935 njc := new(JobChildrenSummary) 1936 *njc = *jc 1937 return njc 1938 } 1939 1940 // TaskGroup summarizes the state of all the allocations of a particular 1941 // TaskGroup 1942 type TaskGroupSummary struct { 1943 Queued int 1944 Complete int 1945 Failed int 1946 Running int 1947 Starting int 1948 Lost int 1949 } 1950 1951 const ( 1952 // Checks uses any registered health check state in combination with task 1953 // states to determine if a allocation is healthy. 1954 UpdateStrategyHealthCheck_Checks = "checks" 1955 1956 // TaskStates uses the task states of an allocation to determine if the 1957 // allocation is healthy. 1958 UpdateStrategyHealthCheck_TaskStates = "task_states" 1959 1960 // Manual allows the operator to manually signal to Nomad when an 1961 // allocations is healthy. This allows more advanced health checking that is 1962 // outside of the scope of Nomad. 
1963 UpdateStrategyHealthCheck_Manual = "manual" 1964 ) 1965 1966 var ( 1967 // DefaultUpdateStrategy provides a baseline that can be used to upgrade 1968 // jobs with the old policy or for populating field defaults. 1969 DefaultUpdateStrategy = &UpdateStrategy{ 1970 Stagger: 30 * time.Second, 1971 MaxParallel: 0, 1972 HealthCheck: UpdateStrategyHealthCheck_Checks, 1973 MinHealthyTime: 10 * time.Second, 1974 HealthyDeadline: 5 * time.Minute, 1975 AutoRevert: false, 1976 Canary: 0, 1977 } 1978 ) 1979 1980 // UpdateStrategy is used to modify how updates are done 1981 type UpdateStrategy struct { 1982 // Stagger is used to determine the rate at which allocations are migrated 1983 // due to down or draining nodes. 1984 Stagger time.Duration 1985 1986 // MaxParallel is how many updates can be done in parallel 1987 MaxParallel int 1988 1989 // HealthCheck specifies the mechanism in which allocations are marked 1990 // healthy or unhealthy as part of a deployment. 1991 HealthCheck string 1992 1993 // MinHealthyTime is the minimum time an allocation must be in the healthy 1994 // state before it is marked as healthy, unblocking more alllocations to be 1995 // rolled. 1996 MinHealthyTime time.Duration 1997 1998 // HealthyDeadline is the time in which an allocation must be marked as 1999 // healthy before it is automatically transistioned to unhealthy. This time 2000 // period doesn't count against the MinHealthyTime. 2001 HealthyDeadline time.Duration 2002 2003 // AutoRevert declares that if a deployment fails because of unhealthy 2004 // allocations, there should be an attempt to auto-revert the job to a 2005 // stable version. 2006 AutoRevert bool 2007 2008 // Canary is the number of canaries to deploy when a change to the task 2009 // group is detected. 
2010 Canary int 2011 } 2012 2013 func (u *UpdateStrategy) Copy() *UpdateStrategy { 2014 if u == nil { 2015 return nil 2016 } 2017 2018 copy := new(UpdateStrategy) 2019 *copy = *u 2020 return copy 2021 } 2022 2023 func (u *UpdateStrategy) Validate() error { 2024 if u == nil { 2025 return nil 2026 } 2027 2028 var mErr multierror.Error 2029 switch u.HealthCheck { 2030 case UpdateStrategyHealthCheck_Checks, UpdateStrategyHealthCheck_TaskStates, UpdateStrategyHealthCheck_Manual: 2031 default: 2032 multierror.Append(&mErr, fmt.Errorf("Invalid health check given: %q", u.HealthCheck)) 2033 } 2034 2035 if u.MaxParallel < 0 { 2036 multierror.Append(&mErr, fmt.Errorf("Max parallel can not be less than zero: %d < 0", u.MaxParallel)) 2037 } 2038 if u.Canary < 0 { 2039 multierror.Append(&mErr, fmt.Errorf("Canary count can not be less than zero: %d < 0", u.Canary)) 2040 } 2041 if u.MinHealthyTime < 0 { 2042 multierror.Append(&mErr, fmt.Errorf("Minimum healthy time may not be less than zero: %v", u.MinHealthyTime)) 2043 } 2044 if u.HealthyDeadline <= 0 { 2045 multierror.Append(&mErr, fmt.Errorf("Healthy deadline must be greater than zero: %v", u.HealthyDeadline)) 2046 } 2047 if u.Stagger <= 0 { 2048 multierror.Append(&mErr, fmt.Errorf("Stagger must be greater than zero: %v", u.Stagger)) 2049 } 2050 2051 return mErr.ErrorOrNil() 2052 } 2053 2054 // TODO(alexdadgar): Remove once no longer used by the scheduler. 2055 // Rolling returns if a rolling strategy should be used 2056 func (u *UpdateStrategy) Rolling() bool { 2057 return u.Stagger > 0 && u.MaxParallel > 0 2058 } 2059 2060 const ( 2061 // PeriodicSpecCron is used for a cron spec. 2062 PeriodicSpecCron = "cron" 2063 2064 // PeriodicSpecTest is only used by unit tests. It is a sorted, comma 2065 // separated list of unix timestamps at which to launch. 2066 PeriodicSpecTest = "_internal_test" 2067 ) 2068 2069 // Periodic defines the interval a job should be run at. 
2070 type PeriodicConfig struct { 2071 // Enabled determines if the job should be run periodically. 2072 Enabled bool 2073 2074 // Spec specifies the interval the job should be run as. It is parsed based 2075 // on the SpecType. 2076 Spec string 2077 2078 // SpecType defines the format of the spec. 2079 SpecType string 2080 2081 // ProhibitOverlap enforces that spawned jobs do not run in parallel. 2082 ProhibitOverlap bool 2083 2084 // TimeZone is the user specified string that determines the time zone to 2085 // launch against. The time zones must be specified from IANA Time Zone 2086 // database, such as "America/New_York". 2087 // Reference: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones 2088 // Reference: https://www.iana.org/time-zones 2089 TimeZone string 2090 2091 // location is the time zone to evaluate the launch time against 2092 location *time.Location 2093 } 2094 2095 func (p *PeriodicConfig) Copy() *PeriodicConfig { 2096 if p == nil { 2097 return nil 2098 } 2099 np := new(PeriodicConfig) 2100 *np = *p 2101 return np 2102 } 2103 2104 func (p *PeriodicConfig) Validate() error { 2105 if !p.Enabled { 2106 return nil 2107 } 2108 2109 var mErr multierror.Error 2110 if p.Spec == "" { 2111 multierror.Append(&mErr, fmt.Errorf("Must specify a spec")) 2112 } 2113 2114 // Check if we got a valid time zone 2115 if p.TimeZone != "" { 2116 if _, err := time.LoadLocation(p.TimeZone); err != nil { 2117 multierror.Append(&mErr, fmt.Errorf("Invalid time zone %q: %v", p.TimeZone, err)) 2118 } 2119 } 2120 2121 switch p.SpecType { 2122 case PeriodicSpecCron: 2123 // Validate the cron spec 2124 if _, err := cronexpr.Parse(p.Spec); err != nil { 2125 multierror.Append(&mErr, fmt.Errorf("Invalid cron spec %q: %v", p.Spec, err)) 2126 } 2127 case PeriodicSpecTest: 2128 // No-op 2129 default: 2130 multierror.Append(&mErr, fmt.Errorf("Unknown periodic specification type %q", p.SpecType)) 2131 } 2132 2133 return mErr.ErrorOrNil() 2134 } 2135 2136 func (p 
*PeriodicConfig) Canonicalize() { 2137 // Load the location 2138 l, err := time.LoadLocation(p.TimeZone) 2139 if err != nil { 2140 p.location = time.UTC 2141 } 2142 2143 p.location = l 2144 } 2145 2146 // Next returns the closest time instant matching the spec that is after the 2147 // passed time. If no matching instance exists, the zero value of time.Time is 2148 // returned. The `time.Location` of the returned value matches that of the 2149 // passed time. 2150 func (p *PeriodicConfig) Next(fromTime time.Time) time.Time { 2151 switch p.SpecType { 2152 case PeriodicSpecCron: 2153 if e, err := cronexpr.Parse(p.Spec); err == nil { 2154 return e.Next(fromTime) 2155 } 2156 case PeriodicSpecTest: 2157 split := strings.Split(p.Spec, ",") 2158 if len(split) == 1 && split[0] == "" { 2159 return time.Time{} 2160 } 2161 2162 // Parse the times 2163 times := make([]time.Time, len(split)) 2164 for i, s := range split { 2165 unix, err := strconv.Atoi(s) 2166 if err != nil { 2167 return time.Time{} 2168 } 2169 2170 times[i] = time.Unix(int64(unix), 0) 2171 } 2172 2173 // Find the next match 2174 for _, next := range times { 2175 if fromTime.Before(next) { 2176 return next 2177 } 2178 } 2179 } 2180 2181 return time.Time{} 2182 } 2183 2184 // GetLocation returns the location to use for determining the time zone to run 2185 // the periodic job against. 2186 func (p *PeriodicConfig) GetLocation() *time.Location { 2187 // Jobs pre 0.5.5 will not have this 2188 if p.location != nil { 2189 return p.location 2190 } 2191 2192 return time.UTC 2193 } 2194 2195 const ( 2196 // PeriodicLaunchSuffix is the string appended to the periodic jobs ID 2197 // when launching derived instances of it. 2198 PeriodicLaunchSuffix = "/periodic-" 2199 ) 2200 2201 // PeriodicLaunch tracks the last launch time of a periodic job. 2202 type PeriodicLaunch struct { 2203 ID string // ID of the periodic job. 2204 Launch time.Time // The last launch time. 
2205 2206 // Raft Indexes 2207 CreateIndex uint64 2208 ModifyIndex uint64 2209 } 2210 2211 const ( 2212 DispatchPayloadForbidden = "forbidden" 2213 DispatchPayloadOptional = "optional" 2214 DispatchPayloadRequired = "required" 2215 2216 // DispatchLaunchSuffix is the string appended to the parameterized job's ID 2217 // when dispatching instances of it. 2218 DispatchLaunchSuffix = "/dispatch-" 2219 ) 2220 2221 // ParameterizedJobConfig is used to configure the parameterized job 2222 type ParameterizedJobConfig struct { 2223 // Payload configure the payload requirements 2224 Payload string 2225 2226 // MetaRequired is metadata keys that must be specified by the dispatcher 2227 MetaRequired []string 2228 2229 // MetaOptional is metadata keys that may be specified by the dispatcher 2230 MetaOptional []string 2231 } 2232 2233 func (d *ParameterizedJobConfig) Validate() error { 2234 var mErr multierror.Error 2235 switch d.Payload { 2236 case DispatchPayloadOptional, DispatchPayloadRequired, DispatchPayloadForbidden: 2237 default: 2238 multierror.Append(&mErr, fmt.Errorf("Unknown payload requirement: %q", d.Payload)) 2239 } 2240 2241 // Check that the meta configurations are disjoint sets 2242 disjoint, offending := helper.SliceSetDisjoint(d.MetaRequired, d.MetaOptional) 2243 if !disjoint { 2244 multierror.Append(&mErr, fmt.Errorf("Required and optional meta keys should be disjoint. 
Following keys exist in both: %v", offending)) 2245 } 2246 2247 return mErr.ErrorOrNil() 2248 } 2249 2250 func (d *ParameterizedJobConfig) Canonicalize() { 2251 if d.Payload == "" { 2252 d.Payload = DispatchPayloadOptional 2253 } 2254 } 2255 2256 func (d *ParameterizedJobConfig) Copy() *ParameterizedJobConfig { 2257 if d == nil { 2258 return nil 2259 } 2260 nd := new(ParameterizedJobConfig) 2261 *nd = *d 2262 nd.MetaOptional = helper.CopySliceString(nd.MetaOptional) 2263 nd.MetaRequired = helper.CopySliceString(nd.MetaRequired) 2264 return nd 2265 } 2266 2267 // DispatchedID returns an ID appropriate for a job dispatched against a 2268 // particular parameterized job 2269 func DispatchedID(templateID string, t time.Time) string { 2270 u := GenerateUUID()[:8] 2271 return fmt.Sprintf("%s%s%d-%s", templateID, DispatchLaunchSuffix, t.Unix(), u) 2272 } 2273 2274 // DispatchPayloadConfig configures how a task gets its input from a job dispatch 2275 type DispatchPayloadConfig struct { 2276 // File specifies a relative path to where the input data should be written 2277 File string 2278 } 2279 2280 func (d *DispatchPayloadConfig) Copy() *DispatchPayloadConfig { 2281 if d == nil { 2282 return nil 2283 } 2284 nd := new(DispatchPayloadConfig) 2285 *nd = *d 2286 return nd 2287 } 2288 2289 func (d *DispatchPayloadConfig) Validate() error { 2290 // Verify the destination doesn't escape 2291 escaped, err := PathEscapesAllocDir("task/local/", d.File) 2292 if err != nil { 2293 return fmt.Errorf("invalid destination path: %v", err) 2294 } else if escaped { 2295 return fmt.Errorf("destination escapes allocation directory") 2296 } 2297 2298 return nil 2299 } 2300 2301 var ( 2302 defaultServiceJobRestartPolicy = RestartPolicy{ 2303 Delay: 15 * time.Second, 2304 Attempts: 2, 2305 Interval: 1 * time.Minute, 2306 Mode: RestartPolicyModeDelay, 2307 } 2308 defaultBatchJobRestartPolicy = RestartPolicy{ 2309 Delay: 15 * time.Second, 2310 Attempts: 15, 2311 Interval: 7 * 24 * time.Hour, 2312 
Mode: RestartPolicyModeDelay, 2313 } 2314 ) 2315 2316 const ( 2317 // RestartPolicyModeDelay causes an artificial delay till the next interval is 2318 // reached when the specified attempts have been reached in the interval. 2319 RestartPolicyModeDelay = "delay" 2320 2321 // RestartPolicyModeFail causes a job to fail if the specified number of 2322 // attempts are reached within an interval. 2323 RestartPolicyModeFail = "fail" 2324 2325 // RestartPolicyMinInterval is the minimum interval that is accepted for a 2326 // restart policy. 2327 RestartPolicyMinInterval = 5 * time.Second 2328 ) 2329 2330 // RestartPolicy configures how Tasks are restarted when they crash or fail. 2331 type RestartPolicy struct { 2332 // Attempts is the number of restart that will occur in an interval. 2333 Attempts int 2334 2335 // Interval is a duration in which we can limit the number of restarts 2336 // within. 2337 Interval time.Duration 2338 2339 // Delay is the time between a failure and a restart. 2340 Delay time.Duration 2341 2342 // Mode controls what happens when the task restarts more than attempt times 2343 // in an interval. 
2344 Mode string 2345 } 2346 2347 func (r *RestartPolicy) Copy() *RestartPolicy { 2348 if r == nil { 2349 return nil 2350 } 2351 nrp := new(RestartPolicy) 2352 *nrp = *r 2353 return nrp 2354 } 2355 2356 func (r *RestartPolicy) Validate() error { 2357 var mErr multierror.Error 2358 switch r.Mode { 2359 case RestartPolicyModeDelay, RestartPolicyModeFail: 2360 default: 2361 multierror.Append(&mErr, fmt.Errorf("Unsupported restart mode: %q", r.Mode)) 2362 } 2363 2364 // Check for ambiguous/confusing settings 2365 if r.Attempts == 0 && r.Mode != RestartPolicyModeFail { 2366 multierror.Append(&mErr, fmt.Errorf("Restart policy %q with %d attempts is ambiguous", r.Mode, r.Attempts)) 2367 } 2368 2369 if r.Interval.Nanoseconds() < RestartPolicyMinInterval.Nanoseconds() { 2370 multierror.Append(&mErr, fmt.Errorf("Interval can not be less than %v (got %v)", RestartPolicyMinInterval, r.Interval)) 2371 } 2372 if time.Duration(r.Attempts)*r.Delay > r.Interval { 2373 multierror.Append(&mErr, 2374 fmt.Errorf("Nomad can't restart the TaskGroup %v times in an interval of %v with a delay of %v", r.Attempts, r.Interval, r.Delay)) 2375 } 2376 return mErr.ErrorOrNil() 2377 } 2378 2379 func NewRestartPolicy(jobType string) *RestartPolicy { 2380 switch jobType { 2381 case JobTypeService, JobTypeSystem: 2382 rp := defaultServiceJobRestartPolicy 2383 return &rp 2384 case JobTypeBatch: 2385 rp := defaultBatchJobRestartPolicy 2386 return &rp 2387 } 2388 return nil 2389 } 2390 2391 // TaskGroup is an atomic unit of placement. Each task group belongs to 2392 // a job and may contain any number of tasks. A task group support running 2393 // in many replicas using the same configuration.. 2394 type TaskGroup struct { 2395 // Name of the task group 2396 Name string 2397 2398 // Count is the number of replicas of this task group that should 2399 // be scheduled. 
2400 Count int 2401 2402 // Update is used to control the update strategy for this task group 2403 Update *UpdateStrategy 2404 2405 // Constraints can be specified at a task group level and apply to 2406 // all the tasks contained. 2407 Constraints []*Constraint 2408 2409 //RestartPolicy of a TaskGroup 2410 RestartPolicy *RestartPolicy 2411 2412 // Tasks are the collection of tasks that this task group needs to run 2413 Tasks []*Task 2414 2415 // EphemeralDisk is the disk resources that the task group requests 2416 EphemeralDisk *EphemeralDisk 2417 2418 // Meta is used to associate arbitrary metadata with this 2419 // task group. This is opaque to Nomad. 2420 Meta map[string]string 2421 } 2422 2423 func (tg *TaskGroup) Copy() *TaskGroup { 2424 if tg == nil { 2425 return nil 2426 } 2427 ntg := new(TaskGroup) 2428 *ntg = *tg 2429 ntg.Update = ntg.Update.Copy() 2430 ntg.Constraints = CopySliceConstraints(ntg.Constraints) 2431 ntg.RestartPolicy = ntg.RestartPolicy.Copy() 2432 2433 if tg.Tasks != nil { 2434 tasks := make([]*Task, len(ntg.Tasks)) 2435 for i, t := range ntg.Tasks { 2436 tasks[i] = t.Copy() 2437 } 2438 ntg.Tasks = tasks 2439 } 2440 2441 ntg.Meta = helper.CopyMapStringString(ntg.Meta) 2442 2443 if tg.EphemeralDisk != nil { 2444 ntg.EphemeralDisk = tg.EphemeralDisk.Copy() 2445 } 2446 return ntg 2447 } 2448 2449 // Canonicalize is used to canonicalize fields in the TaskGroup. 2450 func (tg *TaskGroup) Canonicalize(job *Job) { 2451 // Ensure that an empty and nil map are treated the same to avoid scheduling 2452 // problems since we use reflect DeepEquals. 2453 if len(tg.Meta) == 0 { 2454 tg.Meta = nil 2455 } 2456 2457 // Set the default restart policy. 
2458 if tg.RestartPolicy == nil { 2459 tg.RestartPolicy = NewRestartPolicy(job.Type) 2460 } 2461 2462 // Set a default ephemeral disk object if the user has not requested for one 2463 if tg.EphemeralDisk == nil { 2464 tg.EphemeralDisk = DefaultEphemeralDisk() 2465 } 2466 2467 for _, task := range tg.Tasks { 2468 task.Canonicalize(job, tg) 2469 } 2470 2471 // Add up the disk resources to EphemeralDisk. This is done so that users 2472 // are not required to move their disk attribute from resources to 2473 // EphemeralDisk section of the job spec in Nomad 0.5 2474 // COMPAT 0.4.1 -> 0.5 2475 // Remove in 0.6 2476 var diskMB int 2477 for _, task := range tg.Tasks { 2478 diskMB += task.Resources.DiskMB 2479 } 2480 if diskMB > 0 { 2481 tg.EphemeralDisk.SizeMB = diskMB 2482 } 2483 } 2484 2485 // Validate is used to sanity check a task group 2486 func (tg *TaskGroup) Validate(j *Job) error { 2487 var mErr multierror.Error 2488 if tg.Name == "" { 2489 mErr.Errors = append(mErr.Errors, errors.New("Missing task group name")) 2490 } 2491 if tg.Count < 0 { 2492 mErr.Errors = append(mErr.Errors, errors.New("Task group count can't be negative")) 2493 } 2494 if len(tg.Tasks) == 0 { 2495 mErr.Errors = append(mErr.Errors, errors.New("Missing tasks for task group")) 2496 } 2497 for idx, constr := range tg.Constraints { 2498 if err := constr.Validate(); err != nil { 2499 outer := fmt.Errorf("Constraint %d validation failed: %s", idx+1, err) 2500 mErr.Errors = append(mErr.Errors, outer) 2501 } 2502 } 2503 2504 if tg.RestartPolicy != nil { 2505 if err := tg.RestartPolicy.Validate(); err != nil { 2506 mErr.Errors = append(mErr.Errors, err) 2507 } 2508 } else { 2509 mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should have a restart policy", tg.Name)) 2510 } 2511 2512 if tg.EphemeralDisk != nil { 2513 if err := tg.EphemeralDisk.Validate(); err != nil { 2514 mErr.Errors = append(mErr.Errors, err) 2515 } 2516 } else { 2517 mErr.Errors = append(mErr.Errors, fmt.Errorf("Task 
Group %v should have an ephemeral disk object", tg.Name)) 2518 } 2519 2520 // Validate the update strategy 2521 if u := tg.Update; u != nil { 2522 switch j.Type { 2523 case JobTypeService, JobTypeSystem: 2524 default: 2525 // COMPAT: Enable in 0.7.0 2526 //mErr.Errors = append(mErr.Errors, fmt.Errorf("Job type %q does not allow update block", j.Type)) 2527 } 2528 if err := u.Validate(); err != nil { 2529 mErr.Errors = append(mErr.Errors, err) 2530 } 2531 } 2532 2533 // Check for duplicate tasks, that there is only leader task if any, 2534 // and no duplicated static ports 2535 tasks := make(map[string]int) 2536 staticPorts := make(map[int]string) 2537 leaderTasks := 0 2538 for idx, task := range tg.Tasks { 2539 if task.Name == "" { 2540 mErr.Errors = append(mErr.Errors, fmt.Errorf("Task %d missing name", idx+1)) 2541 } else if existing, ok := tasks[task.Name]; ok { 2542 mErr.Errors = append(mErr.Errors, fmt.Errorf("Task %d redefines '%s' from task %d", idx+1, task.Name, existing+1)) 2543 } else { 2544 tasks[task.Name] = idx 2545 } 2546 2547 if task.Leader { 2548 leaderTasks++ 2549 } 2550 2551 if task.Resources == nil { 2552 continue 2553 } 2554 2555 for _, net := range task.Resources.Networks { 2556 for _, port := range net.ReservedPorts { 2557 if other, ok := staticPorts[port.Value]; ok { 2558 err := fmt.Errorf("Static port %d already reserved by %s", port.Value, other) 2559 mErr.Errors = append(mErr.Errors, err) 2560 } else { 2561 staticPorts[port.Value] = fmt.Sprintf("%s:%s", task.Name, port.Label) 2562 } 2563 } 2564 } 2565 } 2566 2567 if leaderTasks > 1 { 2568 mErr.Errors = append(mErr.Errors, fmt.Errorf("Only one task may be marked as leader")) 2569 } 2570 2571 // Validate the tasks 2572 for _, task := range tg.Tasks { 2573 if err := task.Validate(tg.EphemeralDisk); err != nil { 2574 outer := fmt.Errorf("Task %s validation failed: %v", task.Name, err) 2575 mErr.Errors = append(mErr.Errors, outer) 2576 } 2577 } 2578 return mErr.ErrorOrNil() 2579 } 2580 2581 // 
Warnings returns a list of warnings that may be from dubious settings or 2582 // deprecation warnings. 2583 func (tg *TaskGroup) Warnings(j *Job) error { 2584 var mErr multierror.Error 2585 2586 // Validate the update strategy 2587 if u := tg.Update; u != nil { 2588 // Check the counts are appropriate 2589 if u.MaxParallel > tg.Count { 2590 mErr.Errors = append(mErr.Errors, 2591 fmt.Errorf("Update max parallel count is greater than task group count (%d > %d). "+ 2592 "A destructive change would result in the simultaneous replacement of all allocations.", u.MaxParallel, tg.Count)) 2593 } 2594 } 2595 2596 return mErr.ErrorOrNil() 2597 } 2598 2599 // LookupTask finds a task by name 2600 func (tg *TaskGroup) LookupTask(name string) *Task { 2601 for _, t := range tg.Tasks { 2602 if t.Name == name { 2603 return t 2604 } 2605 } 2606 return nil 2607 } 2608 2609 func (tg *TaskGroup) GoString() string { 2610 return fmt.Sprintf("*%#v", *tg) 2611 } 2612 2613 const ( 2614 ServiceCheckHTTP = "http" 2615 ServiceCheckTCP = "tcp" 2616 ServiceCheckScript = "script" 2617 2618 // minCheckInterval is the minimum check interval permitted. Consul 2619 // currently has its MinInterval set to 1s. Mirror that here for 2620 // consistency. 2621 minCheckInterval = 1 * time.Second 2622 2623 // minCheckTimeout is the minimum check timeout permitted for Consul 2624 // script TTL checks. 
2625 minCheckTimeout = 1 * time.Second 2626 ) 2627 2628 // The ServiceCheck data model represents the consul health check that 2629 // Nomad registers for a Task 2630 type ServiceCheck struct { 2631 Name string // Name of the check, defaults to id 2632 Type string // Type of the check - tcp, http, docker and script 2633 Command string // Command is the command to run for script checks 2634 Args []string // Args is a list of argumes for script checks 2635 Path string // path of the health check url for http type check 2636 Protocol string // Protocol to use if check is http, defaults to http 2637 PortLabel string // The port to use for tcp/http checks 2638 Interval time.Duration // Interval of the check 2639 Timeout time.Duration // Timeout of the response from the check before consul fails the check 2640 InitialStatus string // Initial status of the check 2641 TLSSkipVerify bool // Skip TLS verification when Protocol=https 2642 } 2643 2644 func (sc *ServiceCheck) Copy() *ServiceCheck { 2645 if sc == nil { 2646 return nil 2647 } 2648 nsc := new(ServiceCheck) 2649 *nsc = *sc 2650 return nsc 2651 } 2652 2653 func (sc *ServiceCheck) Canonicalize(serviceName string) { 2654 // Ensure empty slices are treated as null to avoid scheduling issues when 2655 // using DeepEquals. 2656 if len(sc.Args) == 0 { 2657 sc.Args = nil 2658 } 2659 2660 if sc.Name == "" { 2661 sc.Name = fmt.Sprintf("service: %q check", serviceName) 2662 } 2663 } 2664 2665 // validate a Service's ServiceCheck 2666 func (sc *ServiceCheck) validate() error { 2667 switch strings.ToLower(sc.Type) { 2668 case ServiceCheckTCP: 2669 if sc.Timeout == 0 { 2670 return fmt.Errorf("missing required value timeout. 
Timeout cannot be less than %v", minCheckInterval) 2671 } else if sc.Timeout < minCheckTimeout { 2672 return fmt.Errorf("timeout (%v) is lower than required minimum timeout %v", sc.Timeout, minCheckInterval) 2673 } 2674 case ServiceCheckHTTP: 2675 if sc.Path == "" { 2676 return fmt.Errorf("http type must have a valid http path") 2677 } 2678 2679 if sc.Timeout == 0 { 2680 return fmt.Errorf("missing required value timeout. Timeout cannot be less than %v", minCheckInterval) 2681 } else if sc.Timeout < minCheckTimeout { 2682 return fmt.Errorf("timeout (%v) is lower than required minimum timeout %v", sc.Timeout, minCheckInterval) 2683 } 2684 case ServiceCheckScript: 2685 if sc.Command == "" { 2686 return fmt.Errorf("script type must have a valid script path") 2687 } 2688 2689 // TODO: enforce timeout on the Client side and reenable 2690 // validation. 2691 default: 2692 return fmt.Errorf(`invalid type (%+q), must be one of "http", "tcp", or "script" type`, sc.Type) 2693 } 2694 2695 if sc.Interval == 0 { 2696 return fmt.Errorf("missing required value interval. Interval cannot be less than %v", minCheckInterval) 2697 } else if sc.Interval < minCheckInterval { 2698 return fmt.Errorf("interval (%v) cannot be lower than %v", sc.Interval, minCheckInterval) 2699 } 2700 2701 switch sc.InitialStatus { 2702 case "": 2703 // case api.HealthUnknown: TODO: Add when Consul releases 0.7.1 2704 case api.HealthPassing: 2705 case api.HealthWarning: 2706 case api.HealthCritical: 2707 default: 2708 return fmt.Errorf(`invalid initial check state (%s), must be one of %q, %q, %q or empty`, sc.InitialStatus, api.HealthPassing, api.HealthWarning, api.HealthCritical) 2709 2710 } 2711 2712 return nil 2713 } 2714 2715 // RequiresPort returns whether the service check requires the task has a port. 
2716 func (sc *ServiceCheck) RequiresPort() bool { 2717 switch sc.Type { 2718 case ServiceCheckHTTP, ServiceCheckTCP: 2719 return true 2720 default: 2721 return false 2722 } 2723 } 2724 2725 // Hash all ServiceCheck fields and the check's corresponding service ID to 2726 // create an identifier. The identifier is not guaranteed to be unique as if 2727 // the PortLabel is blank, the Service's PortLabel will be used after Hash is 2728 // called. 2729 func (sc *ServiceCheck) Hash(serviceID string) string { 2730 h := sha1.New() 2731 io.WriteString(h, serviceID) 2732 io.WriteString(h, sc.Name) 2733 io.WriteString(h, sc.Type) 2734 io.WriteString(h, sc.Command) 2735 io.WriteString(h, strings.Join(sc.Args, "")) 2736 io.WriteString(h, sc.Path) 2737 io.WriteString(h, sc.Protocol) 2738 io.WriteString(h, sc.PortLabel) 2739 io.WriteString(h, sc.Interval.String()) 2740 io.WriteString(h, sc.Timeout.String()) 2741 // Only include TLSSkipVerify if set to maintain ID stability with Nomad <0.6 2742 if sc.TLSSkipVerify { 2743 io.WriteString(h, "true") 2744 } 2745 return fmt.Sprintf("%x", h.Sum(nil)) 2746 } 2747 2748 const ( 2749 AddressModeAuto = "auto" 2750 AddressModeHost = "host" 2751 AddressModeDriver = "driver" 2752 ) 2753 2754 // Service represents a Consul service definition in Nomad 2755 type Service struct { 2756 // Name of the service registered with Consul. Consul defaults the 2757 // Name to ServiceID if not specified. The Name if specified is used 2758 // as one of the seed values when generating a Consul ServiceID. 2759 Name string 2760 2761 // PortLabel is either the numeric port number or the `host:port`. 2762 // To specify the port number using the host's Consul Advertise 2763 // address, specify an empty host in the PortLabel (e.g. `:port`). 2764 PortLabel string 2765 2766 // AddressMode specifies whether or not to use the host ip:port for 2767 // this service. 
2768 AddressMode string 2769 2770 Tags []string // List of tags for the service 2771 Checks []*ServiceCheck // List of checks associated with the service 2772 } 2773 2774 func (s *Service) Copy() *Service { 2775 if s == nil { 2776 return nil 2777 } 2778 ns := new(Service) 2779 *ns = *s 2780 ns.Tags = helper.CopySliceString(ns.Tags) 2781 2782 if s.Checks != nil { 2783 checks := make([]*ServiceCheck, len(ns.Checks)) 2784 for i, c := range ns.Checks { 2785 checks[i] = c.Copy() 2786 } 2787 ns.Checks = checks 2788 } 2789 2790 return ns 2791 } 2792 2793 // Canonicalize interpolates values of Job, Task Group and Task in the Service 2794 // Name. This also generates check names, service id and check ids. 2795 func (s *Service) Canonicalize(job string, taskGroup string, task string) { 2796 // Ensure empty lists are treated as null to avoid scheduler issues when 2797 // using DeepEquals 2798 if len(s.Tags) == 0 { 2799 s.Tags = nil 2800 } 2801 if len(s.Checks) == 0 { 2802 s.Checks = nil 2803 } 2804 2805 s.Name = args.ReplaceEnv(s.Name, map[string]string{ 2806 "JOB": job, 2807 "TASKGROUP": taskGroup, 2808 "TASK": task, 2809 "BASE": fmt.Sprintf("%s-%s-%s", job, taskGroup, task), 2810 }, 2811 ) 2812 2813 for _, check := range s.Checks { 2814 check.Canonicalize(s.Name) 2815 } 2816 } 2817 2818 // Validate checks if the Check definition is valid 2819 func (s *Service) Validate() error { 2820 var mErr multierror.Error 2821 2822 // Ensure the service name is valid per the below RFCs but make an exception 2823 // for our interpolation syntax 2824 // RFC-952 §1 (https://tools.ietf.org/html/rfc952), RFC-1123 §2.1 2825 // (https://tools.ietf.org/html/rfc1123), and RFC-2782 2826 // (https://tools.ietf.org/html/rfc2782). 
2827 re := regexp.MustCompile(`^(?i:[a-z0-9]|[a-z0-9\$][a-zA-Z0-9\-\$\{\}\_\.]*[a-z0-9\}])$`) 2828 if !re.MatchString(s.Name) { 2829 mErr.Errors = append(mErr.Errors, fmt.Errorf("service name must be valid per RFC 1123 and can contain only alphanumeric characters or dashes: %q", s.Name)) 2830 } 2831 2832 switch s.AddressMode { 2833 case "", AddressModeAuto, AddressModeHost, AddressModeDriver: 2834 // OK 2835 default: 2836 mErr.Errors = append(mErr.Errors, fmt.Errorf("service address_mode must be %q, %q, or %q; not %q", AddressModeAuto, AddressModeHost, AddressModeDriver, s.AddressMode)) 2837 } 2838 2839 for _, c := range s.Checks { 2840 if s.PortLabel == "" && c.RequiresPort() { 2841 mErr.Errors = append(mErr.Errors, fmt.Errorf("check %s invalid: check requires a port but the service %+q has no port", c.Name, s.Name)) 2842 continue 2843 } 2844 2845 if err := c.validate(); err != nil { 2846 mErr.Errors = append(mErr.Errors, fmt.Errorf("check %s invalid: %v", c.Name, err)) 2847 } 2848 } 2849 return mErr.ErrorOrNil() 2850 } 2851 2852 // ValidateName checks if the services Name is valid and should be called after 2853 // the name has been interpolated 2854 func (s *Service) ValidateName(name string) error { 2855 // Ensure the service name is valid per RFC-952 §1 2856 // (https://tools.ietf.org/html/rfc952), RFC-1123 §2.1 2857 // (https://tools.ietf.org/html/rfc1123), and RFC-2782 2858 // (https://tools.ietf.org/html/rfc2782). 
2859 re := regexp.MustCompile(`^(?i:[a-z0-9]|[a-z0-9][a-z0-9\-]{0,61}[a-z0-9])$`) 2860 if !re.MatchString(name) { 2861 return fmt.Errorf("service name must be valid per RFC 1123 and can contain only alphanumeric characters or dashes and must be no longer than 63 characters: %q", name) 2862 } 2863 return nil 2864 } 2865 2866 // Hash calculates the hash of the check based on it's content and the service 2867 // which owns it 2868 func (s *Service) Hash() string { 2869 h := sha1.New() 2870 io.WriteString(h, s.Name) 2871 io.WriteString(h, strings.Join(s.Tags, "")) 2872 io.WriteString(h, s.PortLabel) 2873 io.WriteString(h, s.AddressMode) 2874 return fmt.Sprintf("%x", h.Sum(nil)) 2875 } 2876 2877 const ( 2878 // DefaultKillTimeout is the default timeout between signaling a task it 2879 // will be killed and killing it. 2880 DefaultKillTimeout = 5 * time.Second 2881 ) 2882 2883 // LogConfig provides configuration for log rotation 2884 type LogConfig struct { 2885 MaxFiles int 2886 MaxFileSizeMB int 2887 } 2888 2889 // DefaultLogConfig returns the default LogConfig values. 2890 func DefaultLogConfig() *LogConfig { 2891 return &LogConfig{ 2892 MaxFiles: 10, 2893 MaxFileSizeMB: 10, 2894 } 2895 } 2896 2897 // Validate returns an error if the log config specified are less than 2898 // the minimum allowed. 2899 func (l *LogConfig) Validate() error { 2900 var mErr multierror.Error 2901 if l.MaxFiles < 1 { 2902 mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum number of files is 1; got %d", l.MaxFiles)) 2903 } 2904 if l.MaxFileSizeMB < 1 { 2905 mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum file size is 1MB; got %d", l.MaxFileSizeMB)) 2906 } 2907 return mErr.ErrorOrNil() 2908 } 2909 2910 // Task is a single process typically that is executed as part of a task group. 
2911 type Task struct { 2912 // Name of the task 2913 Name string 2914 2915 // Driver is used to control which driver is used 2916 Driver string 2917 2918 // User is used to determine which user will run the task. It defaults to 2919 // the same user the Nomad client is being run as. 2920 User string 2921 2922 // Config is provided to the driver to initialize 2923 Config map[string]interface{} 2924 2925 // Map of environment variables to be used by the driver 2926 Env map[string]string 2927 2928 // List of service definitions exposed by the Task 2929 Services []*Service 2930 2931 // Vault is used to define the set of Vault policies that this task should 2932 // have access to. 2933 Vault *Vault 2934 2935 // Templates are the set of templates to be rendered for the task. 2936 Templates []*Template 2937 2938 // Constraints can be specified at a task level and apply only to 2939 // the particular task. 2940 Constraints []*Constraint 2941 2942 // Resources is the resources needed by this task 2943 Resources *Resources 2944 2945 // DispatchPayload configures how the task retrieves its input from a dispatch 2946 DispatchPayload *DispatchPayloadConfig 2947 2948 // Meta is used to associate arbitrary metadata with this 2949 // task. This is opaque to Nomad. 2950 Meta map[string]string 2951 2952 // KillTimeout is the time between signaling a task that it will be 2953 // killed and killing it. 2954 KillTimeout time.Duration 2955 2956 // LogConfig provides configuration for log rotation 2957 LogConfig *LogConfig 2958 2959 // Artifacts is a list of artifacts to download and extract before running 2960 // the task. 2961 Artifacts []*TaskArtifact 2962 2963 // Leader marks the task as the leader within the group. When the leader 2964 // task exits, other tasks will be gracefully terminated. 
2965 Leader bool 2966 } 2967 2968 func (t *Task) Copy() *Task { 2969 if t == nil { 2970 return nil 2971 } 2972 nt := new(Task) 2973 *nt = *t 2974 nt.Env = helper.CopyMapStringString(nt.Env) 2975 2976 if t.Services != nil { 2977 services := make([]*Service, len(nt.Services)) 2978 for i, s := range nt.Services { 2979 services[i] = s.Copy() 2980 } 2981 nt.Services = services 2982 } 2983 2984 nt.Constraints = CopySliceConstraints(nt.Constraints) 2985 2986 nt.Vault = nt.Vault.Copy() 2987 nt.Resources = nt.Resources.Copy() 2988 nt.Meta = helper.CopyMapStringString(nt.Meta) 2989 nt.DispatchPayload = nt.DispatchPayload.Copy() 2990 2991 if t.Artifacts != nil { 2992 artifacts := make([]*TaskArtifact, 0, len(t.Artifacts)) 2993 for _, a := range nt.Artifacts { 2994 artifacts = append(artifacts, a.Copy()) 2995 } 2996 nt.Artifacts = artifacts 2997 } 2998 2999 if i, err := copystructure.Copy(nt.Config); err != nil { 3000 panic(err.Error()) 3001 } else { 3002 nt.Config = i.(map[string]interface{}) 3003 } 3004 3005 if t.Templates != nil { 3006 templates := make([]*Template, len(t.Templates)) 3007 for i, tmpl := range nt.Templates { 3008 templates[i] = tmpl.Copy() 3009 } 3010 nt.Templates = templates 3011 } 3012 3013 return nt 3014 } 3015 3016 // Canonicalize canonicalizes fields in the task. 3017 func (t *Task) Canonicalize(job *Job, tg *TaskGroup) { 3018 // Ensure that an empty and nil map are treated the same to avoid scheduling 3019 // problems since we use reflect DeepEquals. 
3020 if len(t.Meta) == 0 { 3021 t.Meta = nil 3022 } 3023 if len(t.Config) == 0 { 3024 t.Config = nil 3025 } 3026 if len(t.Env) == 0 { 3027 t.Env = nil 3028 } 3029 3030 for _, service := range t.Services { 3031 service.Canonicalize(job.Name, tg.Name, t.Name) 3032 } 3033 3034 // If Resources are nil initialize them to defaults, otherwise canonicalize 3035 if t.Resources == nil { 3036 t.Resources = DefaultResources() 3037 } else { 3038 t.Resources.Canonicalize() 3039 } 3040 3041 // Set the default timeout if it is not specified. 3042 if t.KillTimeout == 0 { 3043 t.KillTimeout = DefaultKillTimeout 3044 } 3045 3046 if t.Vault != nil { 3047 t.Vault.Canonicalize() 3048 } 3049 3050 for _, template := range t.Templates { 3051 template.Canonicalize() 3052 } 3053 } 3054 3055 func (t *Task) GoString() string { 3056 return fmt.Sprintf("*%#v", *t) 3057 } 3058 3059 // Validate is used to sanity check a task 3060 func (t *Task) Validate(ephemeralDisk *EphemeralDisk) error { 3061 var mErr multierror.Error 3062 if t.Name == "" { 3063 mErr.Errors = append(mErr.Errors, errors.New("Missing task name")) 3064 } 3065 if strings.ContainsAny(t.Name, `/\`) { 3066 // We enforce this so that when creating the directory on disk it will 3067 // not have any slashes. 3068 mErr.Errors = append(mErr.Errors, errors.New("Task name cannot include slashes")) 3069 } 3070 if t.Driver == "" { 3071 mErr.Errors = append(mErr.Errors, errors.New("Missing task driver")) 3072 } 3073 if t.KillTimeout.Nanoseconds() < 0 { 3074 mErr.Errors = append(mErr.Errors, errors.New("KillTimeout must be a positive value")) 3075 } 3076 3077 // Validate the resources. 
3078 if t.Resources == nil { 3079 mErr.Errors = append(mErr.Errors, errors.New("Missing task resources")) 3080 } else { 3081 if err := t.Resources.MeetsMinResources(); err != nil { 3082 mErr.Errors = append(mErr.Errors, err) 3083 } 3084 3085 // Ensure the task isn't asking for disk resources 3086 if t.Resources.DiskMB > 0 { 3087 mErr.Errors = append(mErr.Errors, errors.New("Task can't ask for disk resources, they have to be specified at the task group level.")) 3088 } 3089 } 3090 3091 // Validate the log config 3092 if t.LogConfig == nil { 3093 mErr.Errors = append(mErr.Errors, errors.New("Missing Log Config")) 3094 } else if err := t.LogConfig.Validate(); err != nil { 3095 mErr.Errors = append(mErr.Errors, err) 3096 } 3097 3098 for idx, constr := range t.Constraints { 3099 if err := constr.Validate(); err != nil { 3100 outer := fmt.Errorf("Constraint %d validation failed: %s", idx+1, err) 3101 mErr.Errors = append(mErr.Errors, outer) 3102 } 3103 3104 switch constr.Operand { 3105 case ConstraintDistinctHosts, ConstraintDistinctProperty: 3106 outer := fmt.Errorf("Constraint %d has disallowed Operand at task level: %s", idx+1, constr.Operand) 3107 mErr.Errors = append(mErr.Errors, outer) 3108 } 3109 } 3110 3111 // Validate Services 3112 if err := validateServices(t); err != nil { 3113 mErr.Errors = append(mErr.Errors, err) 3114 } 3115 3116 if t.LogConfig != nil && ephemeralDisk != nil { 3117 logUsage := (t.LogConfig.MaxFiles * t.LogConfig.MaxFileSizeMB) 3118 if ephemeralDisk.SizeMB <= logUsage { 3119 mErr.Errors = append(mErr.Errors, 3120 fmt.Errorf("log storage (%d MB) must be less than requested disk capacity (%d MB)", 3121 logUsage, ephemeralDisk.SizeMB)) 3122 } 3123 } 3124 3125 for idx, artifact := range t.Artifacts { 3126 if err := artifact.Validate(); err != nil { 3127 outer := fmt.Errorf("Artifact %d validation failed: %v", idx+1, err) 3128 mErr.Errors = append(mErr.Errors, outer) 3129 } 3130 } 3131 3132 if t.Vault != nil { 3133 if err := t.Vault.Validate(); 
err != nil { 3134 mErr.Errors = append(mErr.Errors, fmt.Errorf("Vault validation failed: %v", err)) 3135 } 3136 } 3137 3138 destinations := make(map[string]int, len(t.Templates)) 3139 for idx, tmpl := range t.Templates { 3140 if err := tmpl.Validate(); err != nil { 3141 outer := fmt.Errorf("Template %d validation failed: %s", idx+1, err) 3142 mErr.Errors = append(mErr.Errors, outer) 3143 } 3144 3145 if other, ok := destinations[tmpl.DestPath]; ok { 3146 outer := fmt.Errorf("Template %d has same destination as %d", idx+1, other) 3147 mErr.Errors = append(mErr.Errors, outer) 3148 } else { 3149 destinations[tmpl.DestPath] = idx + 1 3150 } 3151 } 3152 3153 // Validate the dispatch payload block if there 3154 if t.DispatchPayload != nil { 3155 if err := t.DispatchPayload.Validate(); err != nil { 3156 mErr.Errors = append(mErr.Errors, fmt.Errorf("Dispatch Payload validation failed: %v", err)) 3157 } 3158 } 3159 3160 return mErr.ErrorOrNil() 3161 } 3162 3163 // validateServices takes a task and validates the services within it are valid 3164 // and reference ports that exist. 3165 func validateServices(t *Task) error { 3166 var mErr multierror.Error 3167 3168 // Ensure that services don't ask for non-existent ports and their names are 3169 // unique. 
3170 servicePorts := make(map[string][]string) 3171 knownServices := make(map[string]struct{}) 3172 for i, service := range t.Services { 3173 if err := service.Validate(); err != nil { 3174 outer := fmt.Errorf("service[%d] %+q validation failed: %s", i, service.Name, err) 3175 mErr.Errors = append(mErr.Errors, outer) 3176 } 3177 3178 // Ensure that services with the same name are not being registered for 3179 // the same port 3180 if _, ok := knownServices[service.Name+service.PortLabel]; ok { 3181 mErr.Errors = append(mErr.Errors, fmt.Errorf("service %q is duplicate", service.Name)) 3182 } 3183 knownServices[service.Name+service.PortLabel] = struct{}{} 3184 3185 if service.PortLabel != "" { 3186 servicePorts[service.PortLabel] = append(servicePorts[service.PortLabel], service.Name) 3187 } 3188 3189 // Ensure that check names are unique. 3190 knownChecks := make(map[string]struct{}) 3191 for _, check := range service.Checks { 3192 if _, ok := knownChecks[check.Name]; ok { 3193 mErr.Errors = append(mErr.Errors, fmt.Errorf("check %q is duplicate", check.Name)) 3194 } 3195 knownChecks[check.Name] = struct{}{} 3196 } 3197 } 3198 3199 // Get the set of port labels. 3200 portLabels := make(map[string]struct{}) 3201 if t.Resources != nil { 3202 for _, network := range t.Resources.Networks { 3203 ports := network.PortLabels() 3204 for portLabel, _ := range ports { 3205 portLabels[portLabel] = struct{}{} 3206 } 3207 } 3208 } 3209 3210 // Ensure all ports referenced in services exist. 
3211 for servicePort, services := range servicePorts { 3212 _, ok := portLabels[servicePort] 3213 if !ok { 3214 joined := strings.Join(services, ", ") 3215 err := fmt.Errorf("port label %q referenced by services %v does not exist", servicePort, joined) 3216 mErr.Errors = append(mErr.Errors, err) 3217 } 3218 } 3219 3220 // Ensure address mode is valid 3221 return mErr.ErrorOrNil() 3222 } 3223 3224 const ( 3225 // TemplateChangeModeNoop marks that no action should be taken if the 3226 // template is re-rendered 3227 TemplateChangeModeNoop = "noop" 3228 3229 // TemplateChangeModeSignal marks that the task should be signaled if the 3230 // template is re-rendered 3231 TemplateChangeModeSignal = "signal" 3232 3233 // TemplateChangeModeRestart marks that the task should be restarted if the 3234 // template is re-rendered 3235 TemplateChangeModeRestart = "restart" 3236 ) 3237 3238 var ( 3239 // TemplateChangeModeInvalidError is the error for when an invalid change 3240 // mode is given 3241 TemplateChangeModeInvalidError = errors.New("Invalid change mode. Must be one of the following: noop, signal, restart") 3242 ) 3243 3244 // Template represents a template configuration to be rendered for a given task 3245 type Template struct { 3246 // SourcePath is the path to the template to be rendered 3247 SourcePath string 3248 3249 // DestPath is the path to where the template should be rendered 3250 DestPath string 3251 3252 // EmbeddedTmpl store the raw template. This is useful for smaller templates 3253 // where they are embedded in the job file rather than sent as an artificat 3254 EmbeddedTmpl string 3255 3256 // ChangeMode indicates what should be done if the template is re-rendered 3257 ChangeMode string 3258 3259 // ChangeSignal is the signal that should be sent if the change mode 3260 // requires it. 
3261 ChangeSignal string 3262 3263 // Splay is used to avoid coordinated restarts of processes by applying a 3264 // random wait between 0 and the given splay value before signalling the 3265 // application of a change 3266 Splay time.Duration 3267 3268 // Perms is the permission the file should be written out with. 3269 Perms string 3270 3271 // LeftDelim and RightDelim are optional configurations to control what 3272 // delimiter is utilized when parsing the template. 3273 LeftDelim string 3274 RightDelim string 3275 3276 // Envvars enables exposing the template as environment variables 3277 // instead of as a file. The template must be of the form: 3278 // 3279 // VAR_NAME_1={{ key service/my-key }} 3280 // VAR_NAME_2=raw string and {{ env "attr.kernel.name" }} 3281 // 3282 // Lines will be split on the initial "=" with the first part being the 3283 // key name and the second part the value. 3284 // Empty lines and lines starting with # will be ignored, but to avoid 3285 // escaping issues #s within lines will not be treated as comments. 3286 Envvars bool 3287 } 3288 3289 // DefaultTemplate returns a default template. 
3290 func DefaultTemplate() *Template { 3291 return &Template{ 3292 ChangeMode: TemplateChangeModeRestart, 3293 Splay: 5 * time.Second, 3294 Perms: "0644", 3295 } 3296 } 3297 3298 func (t *Template) Copy() *Template { 3299 if t == nil { 3300 return nil 3301 } 3302 copy := new(Template) 3303 *copy = *t 3304 return copy 3305 } 3306 3307 func (t *Template) Canonicalize() { 3308 if t.ChangeSignal != "" { 3309 t.ChangeSignal = strings.ToUpper(t.ChangeSignal) 3310 } 3311 } 3312 3313 func (t *Template) Validate() error { 3314 var mErr multierror.Error 3315 3316 // Verify we have something to render 3317 if t.SourcePath == "" && t.EmbeddedTmpl == "" { 3318 multierror.Append(&mErr, fmt.Errorf("Must specify a source path or have an embedded template")) 3319 } 3320 3321 // Verify we can render somewhere 3322 if t.DestPath == "" { 3323 multierror.Append(&mErr, fmt.Errorf("Must specify a destination for the template")) 3324 } 3325 3326 // Verify the destination doesn't escape 3327 escaped, err := PathEscapesAllocDir("task", t.DestPath) 3328 if err != nil { 3329 mErr.Errors = append(mErr.Errors, fmt.Errorf("invalid destination path: %v", err)) 3330 } else if escaped { 3331 mErr.Errors = append(mErr.Errors, fmt.Errorf("destination escapes allocation directory")) 3332 } 3333 3334 // Verify a proper change mode 3335 switch t.ChangeMode { 3336 case TemplateChangeModeNoop, TemplateChangeModeRestart: 3337 case TemplateChangeModeSignal: 3338 if t.ChangeSignal == "" { 3339 multierror.Append(&mErr, fmt.Errorf("Must specify signal value when change mode is signal")) 3340 } 3341 default: 3342 multierror.Append(&mErr, TemplateChangeModeInvalidError) 3343 } 3344 3345 // Verify the splay is positive 3346 if t.Splay < 0 { 3347 multierror.Append(&mErr, fmt.Errorf("Must specify positive splay value")) 3348 } 3349 3350 // Verify the permissions 3351 if t.Perms != "" { 3352 if _, err := strconv.ParseUint(t.Perms, 8, 12); err != nil { 3353 multierror.Append(&mErr, fmt.Errorf("Failed to parse %q as 
// Task event types. These are the values carried in TaskEvent.Type; for
// example TaskState.Successful compares the last event's Type against
// TaskTerminated.
const (
	// TaskSetupFailure indicates that the task could not be started due to
	// a setup failure.
	TaskSetupFailure = "Setup Failure"

	// TaskDriverFailure indicates that the task could not be started due to a
	// failure in the driver.
	TaskDriverFailure = "Driver Failure"

	// TaskReceived signals that the task has been pulled by the client at the
	// given timestamp.
	TaskReceived = "Received"

	// TaskFailedValidation indicates the task was invalid and as such was not
	// run.
	TaskFailedValidation = "Failed Validation"

	// TaskStarted signals that the task was started and its timestamp can be
	// used to determine the running length of the task.
	TaskStarted = "Started"

	// TaskTerminated indicates that the task was started and exited.
	TaskTerminated = "Terminated"

	// TaskKilling indicates a kill signal has been sent to the task.
	TaskKilling = "Killing"

	// TaskKilled indicates a user has killed the task.
	TaskKilled = "Killed"

	// TaskRestarting indicates that task terminated and is being restarted.
	TaskRestarting = "Restarting"

	// TaskNotRestarting indicates that the task has failed and is not being
	// restarted because it has exceeded its restart policy.
	TaskNotRestarting = "Not Restarting"

	// TaskRestartSignal indicates that the task has been signalled to be
	// restarted
	TaskRestartSignal = "Restart Signaled"

	// TaskSignaling indicates that the task is being signalled.
	TaskSignaling = "Signaling"

	// TaskDownloadingArtifacts means the task is downloading the artifacts
	// specified in the task.
	TaskDownloadingArtifacts = "Downloading Artifacts"

	// TaskArtifactDownloadFailed indicates that downloading the artifacts
	// failed.
	TaskArtifactDownloadFailed = "Failed Artifact Download"

	// TaskBuildingTaskDir indicates that the task directory/chroot is being
	// built.
	TaskBuildingTaskDir = "Building Task Directory"

	// TaskSetup indicates the task runner is setting up the task environment
	TaskSetup = "Task Setup"

	// TaskDiskExceeded indicates that one of the tasks in a taskgroup has
	// exceeded the requested disk resources.
	TaskDiskExceeded = "Disk Resources Exceeded"

	// TaskSiblingFailed indicates that a sibling task in the task group has
	// failed.
	TaskSiblingFailed = "Sibling Task Failed"

	// TaskDriverMessage is an informational event message emitted by
	// drivers such as when they're performing a long running action like
	// downloading an image.
	TaskDriverMessage = "Driver"

	// TaskLeaderDead indicates that the leader task has finished.
	TaskLeaderDead = "Leader Task Dead"
)
3536 3537 // Artifact Download fields 3538 DownloadError string // Error downloading artifacts 3539 3540 // Validation fields 3541 ValidationError string // Validation error 3542 3543 // The maximum allowed task disk size. 3544 DiskLimit int64 3545 3546 // Name of the sibling task that caused termination of the task that 3547 // the TaskEvent refers to. 3548 FailedSibling string 3549 3550 // VaultError is the error from token renewal 3551 VaultError string 3552 3553 // TaskSignalReason indicates the reason the task is being signalled. 3554 TaskSignalReason string 3555 3556 // TaskSignal is the signal that was sent to the task 3557 TaskSignal string 3558 3559 // DriverMessage indicates a driver action being taken. 3560 DriverMessage string 3561 } 3562 3563 func (te *TaskEvent) GoString() string { 3564 return fmt.Sprintf("%v at %v", te.Type, te.Time) 3565 } 3566 3567 // SetMessage sets the message of TaskEvent 3568 func (te *TaskEvent) SetMessage(msg string) *TaskEvent { 3569 te.Message = msg 3570 return te 3571 } 3572 3573 func (te *TaskEvent) Copy() *TaskEvent { 3574 if te == nil { 3575 return nil 3576 } 3577 copy := new(TaskEvent) 3578 *copy = *te 3579 return copy 3580 } 3581 3582 func NewTaskEvent(event string) *TaskEvent { 3583 return &TaskEvent{ 3584 Type: event, 3585 Time: time.Now().UnixNano(), 3586 } 3587 } 3588 3589 // SetSetupError is used to store an error that occured while setting up the 3590 // task 3591 func (e *TaskEvent) SetSetupError(err error) *TaskEvent { 3592 if err != nil { 3593 e.SetupError = err.Error() 3594 } 3595 return e 3596 } 3597 3598 func (e *TaskEvent) SetFailsTask() *TaskEvent { 3599 e.FailsTask = true 3600 return e 3601 } 3602 3603 func (e *TaskEvent) SetDriverError(err error) *TaskEvent { 3604 if err != nil { 3605 e.DriverError = err.Error() 3606 } 3607 return e 3608 } 3609 3610 func (e *TaskEvent) SetExitCode(c int) *TaskEvent { 3611 e.ExitCode = c 3612 return e 3613 } 3614 3615 func (e *TaskEvent) SetSignal(s int) *TaskEvent { 
3616 e.Signal = s 3617 return e 3618 } 3619 3620 func (e *TaskEvent) SetExitMessage(err error) *TaskEvent { 3621 if err != nil { 3622 e.Message = err.Error() 3623 } 3624 return e 3625 } 3626 3627 func (e *TaskEvent) SetKillError(err error) *TaskEvent { 3628 if err != nil { 3629 e.KillError = err.Error() 3630 } 3631 return e 3632 } 3633 3634 func (e *TaskEvent) SetKillReason(r string) *TaskEvent { 3635 e.KillReason = r 3636 return e 3637 } 3638 3639 func (e *TaskEvent) SetRestartDelay(delay time.Duration) *TaskEvent { 3640 e.StartDelay = int64(delay) 3641 return e 3642 } 3643 3644 func (e *TaskEvent) SetRestartReason(reason string) *TaskEvent { 3645 e.RestartReason = reason 3646 return e 3647 } 3648 3649 func (e *TaskEvent) SetTaskSignalReason(r string) *TaskEvent { 3650 e.TaskSignalReason = r 3651 return e 3652 } 3653 3654 func (e *TaskEvent) SetTaskSignal(s os.Signal) *TaskEvent { 3655 e.TaskSignal = s.String() 3656 return e 3657 } 3658 3659 func (e *TaskEvent) SetDownloadError(err error) *TaskEvent { 3660 if err != nil { 3661 e.DownloadError = err.Error() 3662 } 3663 return e 3664 } 3665 3666 func (e *TaskEvent) SetValidationError(err error) *TaskEvent { 3667 if err != nil { 3668 e.ValidationError = err.Error() 3669 } 3670 return e 3671 } 3672 3673 func (e *TaskEvent) SetKillTimeout(timeout time.Duration) *TaskEvent { 3674 e.KillTimeout = timeout 3675 return e 3676 } 3677 3678 func (e *TaskEvent) SetDiskLimit(limit int64) *TaskEvent { 3679 e.DiskLimit = limit 3680 return e 3681 } 3682 3683 func (e *TaskEvent) SetFailedSibling(sibling string) *TaskEvent { 3684 e.FailedSibling = sibling 3685 return e 3686 } 3687 3688 func (e *TaskEvent) SetVaultRenewalError(err error) *TaskEvent { 3689 if err != nil { 3690 e.VaultError = err.Error() 3691 } 3692 return e 3693 } 3694 3695 func (e *TaskEvent) SetDriverMessage(m string) *TaskEvent { 3696 e.DriverMessage = m 3697 return e 3698 } 3699 3700 // TaskArtifact is an artifact to download before running the task. 
3701 type TaskArtifact struct { 3702 // GetterSource is the source to download an artifact using go-getter 3703 GetterSource string 3704 3705 // GetterOptions are options to use when downloading the artifact using 3706 // go-getter. 3707 GetterOptions map[string]string 3708 3709 // GetterMode is the go-getter.ClientMode for fetching resources. 3710 // Defaults to "any" but can be set to "file" or "dir". 3711 GetterMode string 3712 3713 // RelativeDest is the download destination given relative to the task's 3714 // directory. 3715 RelativeDest string 3716 } 3717 3718 func (ta *TaskArtifact) Copy() *TaskArtifact { 3719 if ta == nil { 3720 return nil 3721 } 3722 nta := new(TaskArtifact) 3723 *nta = *ta 3724 nta.GetterOptions = helper.CopyMapStringString(ta.GetterOptions) 3725 return nta 3726 } 3727 3728 func (ta *TaskArtifact) GoString() string { 3729 return fmt.Sprintf("%+v", ta) 3730 } 3731 3732 // PathEscapesAllocDir returns if the given path escapes the allocation 3733 // directory. The prefix allows adding a prefix if the path will be joined, for 3734 // example a "task/local" prefix may be provided if the path will be joined 3735 // against that prefix. 
// PathEscapesAllocDir returns if the given path escapes the allocation
// directory. The prefix allows adding a prefix if the path will be joined, for
// example a "task/local" prefix may be provided if the path will be joined
// against that prefix.
//
// The check is purely lexical: prefix and path are joined onto a placeholder
// allocation directory and the result is expressed relative to that
// directory. The path escapes only when that relative form is ".." itself or
// starts with a ".." path element. Names that merely begin with two dots
// (for example "..data") stay inside the directory and are not reported.
//
// A non-nil error is returned only when the path manipulation itself fails.
func PathEscapesAllocDir(prefix, path string) (bool, error) {
	// Verify the destination doesn't escape the tasks directory
	alloc, err := filepath.Abs(filepath.Join("/", "alloc-dir/", "alloc-id/"))
	if err != nil {
		return false, err
	}
	abs, err := filepath.Abs(filepath.Join(alloc, prefix, path))
	if err != nil {
		return false, err
	}
	rel, err := filepath.Rel(alloc, abs)
	if err != nil {
		return false, err
	}

	// Compare whole path elements. A bare prefix test on ".." would also
	// match ordinary file names such as "..foo".
	const parent = ".."
	escapes := rel == parent || strings.HasPrefix(rel, parent+string(filepath.Separator))
	return escapes, nil
}
// Named constraint values. Constraint.Validate matches Constraint.Operand
// against ConstraintRegex (RTarget must compile as a regular expression) and
// ConstraintVersion (RTarget must parse as a version constraint).
// NOTE(review): the remaining values are presumably operands as well, handled
// outside this part of the file - confirm against their users.
const (
	ConstraintDistinctProperty = "distinct_property"
	ConstraintDistinctHosts    = "distinct_hosts"
	ConstraintRegex            = "regexp"
	ConstraintVersion          = "version"
	ConstraintSetContains      = "set_contains"
)
3834 type Constraint struct { 3835 LTarget string // Left-hand target 3836 RTarget string // Right-hand target 3837 Operand string // Constraint operand (<=, <, =, !=, >, >=), contains, near 3838 str string // Memoized string 3839 } 3840 3841 // Equal checks if two constraints are equal 3842 func (c *Constraint) Equal(o *Constraint) bool { 3843 return c.LTarget == o.LTarget && 3844 c.RTarget == o.RTarget && 3845 c.Operand == o.Operand 3846 } 3847 3848 func (c *Constraint) Copy() *Constraint { 3849 if c == nil { 3850 return nil 3851 } 3852 nc := new(Constraint) 3853 *nc = *c 3854 return nc 3855 } 3856 3857 func (c *Constraint) String() string { 3858 if c.str != "" { 3859 return c.str 3860 } 3861 c.str = fmt.Sprintf("%s %s %s", c.LTarget, c.Operand, c.RTarget) 3862 return c.str 3863 } 3864 3865 func (c *Constraint) Validate() error { 3866 var mErr multierror.Error 3867 if c.Operand == "" { 3868 mErr.Errors = append(mErr.Errors, errors.New("Missing constraint operand")) 3869 } 3870 3871 // Perform additional validation based on operand 3872 switch c.Operand { 3873 case ConstraintRegex: 3874 if _, err := regexp.Compile(c.RTarget); err != nil { 3875 mErr.Errors = append(mErr.Errors, fmt.Errorf("Regular expression failed to compile: %v", err)) 3876 } 3877 case ConstraintVersion: 3878 if _, err := version.NewConstraint(c.RTarget); err != nil { 3879 mErr.Errors = append(mErr.Errors, fmt.Errorf("Version constraint is invalid: %v", err)) 3880 } 3881 } 3882 return mErr.ErrorOrNil() 3883 } 3884 3885 // EphemeralDisk is an ephemeral disk object 3886 type EphemeralDisk struct { 3887 // Sticky indicates whether the allocation is sticky to a node 3888 Sticky bool 3889 3890 // SizeMB is the size of the local disk 3891 SizeMB int 3892 3893 // Migrate determines if Nomad client should migrate the allocation dir for 3894 // sticky allocations 3895 Migrate bool 3896 } 3897 3898 // DefaultEphemeralDisk returns a EphemeralDisk with default configurations 3899 func DefaultEphemeralDisk() 
*EphemeralDisk { 3900 return &EphemeralDisk{ 3901 SizeMB: 300, 3902 } 3903 } 3904 3905 // Validate validates EphemeralDisk 3906 func (d *EphemeralDisk) Validate() error { 3907 if d.SizeMB < 10 { 3908 return fmt.Errorf("minimum DiskMB value is 10; got %d", d.SizeMB) 3909 } 3910 return nil 3911 } 3912 3913 // Copy copies the EphemeralDisk struct and returns a new one 3914 func (d *EphemeralDisk) Copy() *EphemeralDisk { 3915 ld := new(EphemeralDisk) 3916 *ld = *d 3917 return ld 3918 } 3919 3920 const ( 3921 // VaultChangeModeNoop takes no action when a new token is retrieved. 3922 VaultChangeModeNoop = "noop" 3923 3924 // VaultChangeModeSignal signals the task when a new token is retrieved. 3925 VaultChangeModeSignal = "signal" 3926 3927 // VaultChangeModeRestart restarts the task when a new token is retrieved. 3928 VaultChangeModeRestart = "restart" 3929 ) 3930 3931 // Vault stores the set of premissions a task needs access to from Vault. 3932 type Vault struct { 3933 // Policies is the set of policies that the task needs access to 3934 Policies []string 3935 3936 // Env marks whether the Vault Token should be exposed as an environment 3937 // variable 3938 Env bool 3939 3940 // ChangeMode is used to configure the task's behavior when the Vault 3941 // token changes because the original token could not be renewed in time. 3942 ChangeMode string 3943 3944 // ChangeSignal is the signal sent to the task when a new token is 3945 // retrieved. This is only valid when using the signal change mode. 3946 ChangeSignal string 3947 } 3948 3949 func DefaultVaultBlock() *Vault { 3950 return &Vault{ 3951 Env: true, 3952 ChangeMode: VaultChangeModeRestart, 3953 } 3954 } 3955 3956 // Copy returns a copy of this Vault block. 
3957 func (v *Vault) Copy() *Vault { 3958 if v == nil { 3959 return nil 3960 } 3961 3962 nv := new(Vault) 3963 *nv = *v 3964 return nv 3965 } 3966 3967 func (v *Vault) Canonicalize() { 3968 if v.ChangeSignal != "" { 3969 v.ChangeSignal = strings.ToUpper(v.ChangeSignal) 3970 } 3971 } 3972 3973 // Validate returns if the Vault block is valid. 3974 func (v *Vault) Validate() error { 3975 if v == nil { 3976 return nil 3977 } 3978 3979 var mErr multierror.Error 3980 if len(v.Policies) == 0 { 3981 multierror.Append(&mErr, fmt.Errorf("Policy list cannot be empty")) 3982 } 3983 3984 for _, p := range v.Policies { 3985 if p == "root" { 3986 multierror.Append(&mErr, fmt.Errorf("Can not specifiy \"root\" policy")) 3987 } 3988 } 3989 3990 switch v.ChangeMode { 3991 case VaultChangeModeSignal: 3992 if v.ChangeSignal == "" { 3993 multierror.Append(&mErr, fmt.Errorf("Signal must be specified when using change mode %q", VaultChangeModeSignal)) 3994 } 3995 case VaultChangeModeNoop, VaultChangeModeRestart: 3996 default: 3997 multierror.Append(&mErr, fmt.Errorf("Unknown change mode %q", v.ChangeMode)) 3998 } 3999 4000 return mErr.ErrorOrNil() 4001 } 4002 4003 const ( 4004 // DeploymentStatuses are the various states a deployment can be be in 4005 DeploymentStatusRunning = "running" 4006 DeploymentStatusPaused = "paused" 4007 DeploymentStatusFailed = "failed" 4008 DeploymentStatusSuccessful = "successful" 4009 DeploymentStatusCancelled = "cancelled" 4010 4011 // DeploymentStatusDescriptions are the various descriptions of the states a 4012 // deployment can be in. 
// DeploymentStatusDescriptionRollback builds the status description used when
// a deployment rolls back to an older job: the given base description followed
// by a note naming the job version being rolled back to.
func DeploymentStatusDescriptionRollback(baseDescription string, jobVersion uint64) string {
	const rollbackFormat = "%s - rolling back to job version %d"
	description := fmt.Sprintf(rollbackFormat, baseDescription, jobVersion)
	return description
}
4051 TaskGroups map[string]*DeploymentState 4052 4053 // The status of the deployment 4054 Status string 4055 4056 // StatusDescription allows a human readable description of the deployment 4057 // status. 4058 StatusDescription string 4059 4060 CreateIndex uint64 4061 ModifyIndex uint64 4062 } 4063 4064 // NewDeployment creates a new deployment given the job. 4065 func NewDeployment(job *Job) *Deployment { 4066 return &Deployment{ 4067 ID: GenerateUUID(), 4068 JobID: job.ID, 4069 JobVersion: job.Version, 4070 JobModifyIndex: job.ModifyIndex, 4071 JobCreateIndex: job.CreateIndex, 4072 Status: DeploymentStatusRunning, 4073 StatusDescription: DeploymentStatusDescriptionRunning, 4074 TaskGroups: make(map[string]*DeploymentState, len(job.TaskGroups)), 4075 } 4076 } 4077 4078 func (d *Deployment) Copy() *Deployment { 4079 if d == nil { 4080 return nil 4081 } 4082 4083 c := &Deployment{} 4084 *c = *d 4085 4086 c.TaskGroups = nil 4087 if l := len(d.TaskGroups); d.TaskGroups != nil { 4088 c.TaskGroups = make(map[string]*DeploymentState, l) 4089 for tg, s := range d.TaskGroups { 4090 c.TaskGroups[tg] = s.Copy() 4091 } 4092 } 4093 4094 return c 4095 } 4096 4097 // Active returns whether the deployment is active or terminal. 
4098 func (d *Deployment) Active() bool { 4099 switch d.Status { 4100 case DeploymentStatusRunning, DeploymentStatusPaused: 4101 return true 4102 default: 4103 return false 4104 } 4105 } 4106 4107 // GetID is a helper for getting the ID when the object may be nil 4108 func (d *Deployment) GetID() string { 4109 if d == nil { 4110 return "" 4111 } 4112 return d.ID 4113 } 4114 4115 // HasPlacedCanaries returns whether the deployment has placed canaries 4116 func (d *Deployment) HasPlacedCanaries() bool { 4117 if d == nil || len(d.TaskGroups) == 0 { 4118 return false 4119 } 4120 for _, group := range d.TaskGroups { 4121 if len(group.PlacedCanaries) != 0 { 4122 return true 4123 } 4124 } 4125 return false 4126 } 4127 4128 // RequiresPromotion returns whether the deployment requires promotion to 4129 // continue 4130 func (d *Deployment) RequiresPromotion() bool { 4131 if d == nil || len(d.TaskGroups) == 0 || d.Status != DeploymentStatusRunning { 4132 return false 4133 } 4134 for _, group := range d.TaskGroups { 4135 if group.DesiredCanaries > 0 && !group.Promoted { 4136 return true 4137 } 4138 } 4139 return false 4140 } 4141 4142 func (d *Deployment) GoString() string { 4143 base := fmt.Sprintf("Deployment ID %q for job %q has status %q (%v):", d.ID, d.JobID, d.Status, d.StatusDescription) 4144 for group, state := range d.TaskGroups { 4145 base += fmt.Sprintf("\nTask Group %q has state:\n%#v", group, state) 4146 } 4147 return base 4148 } 4149 4150 // DeploymentState tracks the state of a deployment for a given task group. 4151 type DeploymentState struct { 4152 // AutoRevert marks whether the task group has indicated the job should be 4153 // reverted on failure 4154 AutoRevert bool 4155 4156 // Promoted marks whether the canaries have been promoted 4157 Promoted bool 4158 4159 // PlacedCanaries is the set of placed canary allocations 4160 PlacedCanaries []string 4161 4162 // DesiredCanaries is the number of canaries that should be created. 
4163 DesiredCanaries int 4164 4165 // DesiredTotal is the total number of allocations that should be created as 4166 // part of the deployment. 4167 DesiredTotal int 4168 4169 // PlacedAllocs is the number of allocations that have been placed 4170 PlacedAllocs int 4171 4172 // HealthyAllocs is the number of allocations that have been marked healthy. 4173 HealthyAllocs int 4174 4175 // UnhealthyAllocs are allocations that have been marked as unhealthy. 4176 UnhealthyAllocs int 4177 } 4178 4179 func (d *DeploymentState) GoString() string { 4180 base := fmt.Sprintf("\tDesired Total: %d", d.DesiredTotal) 4181 base += fmt.Sprintf("\n\tDesired Canaries: %d", d.DesiredCanaries) 4182 base += fmt.Sprintf("\n\tPlaced Canaries: %#v", d.PlacedCanaries) 4183 base += fmt.Sprintf("\n\tPromoted: %v", d.Promoted) 4184 base += fmt.Sprintf("\n\tPlaced: %d", d.PlacedAllocs) 4185 base += fmt.Sprintf("\n\tHealthy: %d", d.HealthyAllocs) 4186 base += fmt.Sprintf("\n\tUnhealthy: %d", d.UnhealthyAllocs) 4187 base += fmt.Sprintf("\n\tAutoRevert: %v", d.AutoRevert) 4188 return base 4189 } 4190 4191 func (d *DeploymentState) Copy() *DeploymentState { 4192 c := &DeploymentState{} 4193 *c = *d 4194 c.PlacedCanaries = helper.CopySliceString(d.PlacedCanaries) 4195 return c 4196 } 4197 4198 // DeploymentStatusUpdate is used to update the status of a given deployment 4199 type DeploymentStatusUpdate struct { 4200 // DeploymentID is the ID of the deployment to update 4201 DeploymentID string 4202 4203 // Status is the new status of the deployment. 4204 Status string 4205 4206 // StatusDescription is the new status description of the deployment. 
// Allocation is used to allocate the placement of a task group to a node.
type Allocation struct {
	// ID of the allocation (UUID)
	ID string

	// ID of the evaluation that generated this allocation
	EvalID string

	// Name is a logical name of the allocation.
	Name string

	// NodeID is the node this is being placed on
	NodeID string

	// Job is the parent job of the task group being allocated.
	// This is copied at allocation time to avoid issues if the job
	// definition is updated.
	JobID string
	Job   *Job

	// TaskGroup is the name of the task group that should be run
	TaskGroup string

	// Resources is the total set of resources allocated as part
	// of this allocation of the task group.
	Resources *Resources

	// SharedResources are the resources that are shared by all the tasks in an
	// allocation
	SharedResources *Resources

	// TaskResources is the set of resources allocated to each
	// task. These should sum to the total Resources.
	TaskResources map[string]*Resources

	// Metrics associated with this allocation
	Metrics *AllocMetric

	// Desired Status of the allocation on the client
	DesiredStatus string

	// DesiredDescription is meant to provide more human useful information
	DesiredDescription string

	// Status of the allocation on the client
	ClientStatus string

	// ClientDescription is meant to provide more human useful information
	ClientDescription string

	// TaskStates stores the state of each task, keyed by a string.
	TaskStates map[string]*TaskState

	// PreviousAllocation is the allocation that this allocation is replacing
	PreviousAllocation string

	// DeploymentID identifies an allocation as being created from a
	// particular deployment
	DeploymentID string

	// DeploymentStatus captures the status of the allocation as part of the
	// given deployment
	DeploymentStatus *AllocDeploymentStatus

	// Raft Indexes
	CreateIndex uint64
	ModifyIndex uint64

	// AllocModifyIndex is not updated when the client updates allocations. This
	// lets the client pull only the allocs updated by the server.
	AllocModifyIndex uint64

	// CreateTime is the time the allocation has finished scheduling and been
	// verified by the plan applier.
	CreateTime int64
}
4303 func (a *Allocation) Index() uint { 4304 l := len(a.Name) 4305 prefix := len(a.JobID) + len(a.TaskGroup) + 2 4306 if l <= 3 || l <= prefix { 4307 return uint(0) 4308 } 4309 4310 strNum := a.Name[prefix : len(a.Name)-1] 4311 num, _ := strconv.Atoi(strNum) 4312 return uint(num) 4313 } 4314 4315 func (a *Allocation) Copy() *Allocation { 4316 return a.copyImpl(true) 4317 } 4318 4319 // Copy provides a copy of the allocation but doesn't deep copy the job 4320 func (a *Allocation) CopySkipJob() *Allocation { 4321 return a.copyImpl(false) 4322 } 4323 4324 func (a *Allocation) copyImpl(job bool) *Allocation { 4325 if a == nil { 4326 return nil 4327 } 4328 na := new(Allocation) 4329 *na = *a 4330 4331 if job { 4332 na.Job = na.Job.Copy() 4333 } 4334 4335 na.Resources = na.Resources.Copy() 4336 na.SharedResources = na.SharedResources.Copy() 4337 4338 if a.TaskResources != nil { 4339 tr := make(map[string]*Resources, len(na.TaskResources)) 4340 for task, resource := range na.TaskResources { 4341 tr[task] = resource.Copy() 4342 } 4343 na.TaskResources = tr 4344 } 4345 4346 na.Metrics = na.Metrics.Copy() 4347 na.DeploymentStatus = na.DeploymentStatus.Copy() 4348 4349 if a.TaskStates != nil { 4350 ts := make(map[string]*TaskState, len(na.TaskStates)) 4351 for task, state := range na.TaskStates { 4352 ts[task] = state.Copy() 4353 } 4354 na.TaskStates = ts 4355 } 4356 return na 4357 } 4358 4359 // TerminalStatus returns if the desired or actual status is terminal and 4360 // will no longer transition. 4361 func (a *Allocation) TerminalStatus() bool { 4362 // First check the desired state and if that isn't terminal, check client 4363 // state. 
4364 switch a.DesiredStatus { 4365 case AllocDesiredStatusStop, AllocDesiredStatusEvict: 4366 return true 4367 default: 4368 } 4369 4370 switch a.ClientStatus { 4371 case AllocClientStatusComplete, AllocClientStatusFailed, AllocClientStatusLost: 4372 return true 4373 default: 4374 return false 4375 } 4376 } 4377 4378 // Terminated returns if the allocation is in a terminal state on a client. 4379 func (a *Allocation) Terminated() bool { 4380 if a.ClientStatus == AllocClientStatusFailed || 4381 a.ClientStatus == AllocClientStatusComplete || 4382 a.ClientStatus == AllocClientStatusLost { 4383 return true 4384 } 4385 return false 4386 } 4387 4388 // RanSuccessfully returns whether the client has ran the allocation and all 4389 // tasks finished successfully 4390 func (a *Allocation) RanSuccessfully() bool { 4391 // Handle the case the client hasn't started the allocation. 4392 if len(a.TaskStates) == 0 { 4393 return false 4394 } 4395 4396 // Check to see if all the tasks finised successfully in the allocation 4397 allSuccess := true 4398 for _, state := range a.TaskStates { 4399 allSuccess = allSuccess && state.Successful() 4400 } 4401 4402 return allSuccess 4403 } 4404 4405 // ShouldMigrate returns if the allocation needs data migration 4406 func (a *Allocation) ShouldMigrate() bool { 4407 if a.DesiredStatus == AllocDesiredStatusStop || a.DesiredStatus == AllocDesiredStatusEvict { 4408 return false 4409 } 4410 4411 tg := a.Job.LookupTaskGroup(a.TaskGroup) 4412 4413 // if the task group is nil or the ephemeral disk block isn't present then 4414 // we won't migrate 4415 if tg == nil || tg.EphemeralDisk == nil { 4416 return false 4417 } 4418 4419 // We won't migrate any data is the user hasn't enabled migration or the 4420 // disk is not marked as sticky 4421 if !tg.EphemeralDisk.Migrate || !tg.EphemeralDisk.Sticky { 4422 return false 4423 } 4424 4425 return true 4426 } 4427 4428 // Stub returns a list stub for the allocation 4429 func (a *Allocation) Stub() 
*AllocListStub { 4430 return &AllocListStub{ 4431 ID: a.ID, 4432 EvalID: a.EvalID, 4433 Name: a.Name, 4434 NodeID: a.NodeID, 4435 JobID: a.JobID, 4436 JobVersion: a.Job.Version, 4437 TaskGroup: a.TaskGroup, 4438 DesiredStatus: a.DesiredStatus, 4439 DesiredDescription: a.DesiredDescription, 4440 ClientStatus: a.ClientStatus, 4441 ClientDescription: a.ClientDescription, 4442 TaskStates: a.TaskStates, 4443 DeploymentStatus: a.DeploymentStatus, 4444 CreateIndex: a.CreateIndex, 4445 ModifyIndex: a.ModifyIndex, 4446 CreateTime: a.CreateTime, 4447 } 4448 } 4449 4450 // AllocListStub is used to return a subset of alloc information 4451 type AllocListStub struct { 4452 ID string 4453 EvalID string 4454 Name string 4455 NodeID string 4456 JobID string 4457 JobVersion uint64 4458 TaskGroup string 4459 DesiredStatus string 4460 DesiredDescription string 4461 ClientStatus string 4462 ClientDescription string 4463 TaskStates map[string]*TaskState 4464 DeploymentStatus *AllocDeploymentStatus 4465 CreateIndex uint64 4466 ModifyIndex uint64 4467 CreateTime int64 4468 } 4469 4470 // AllocMetric is used to track various metrics while attempting 4471 // to make an allocation. These are used to debug a job, or to better 4472 // understand the pressure within the system. 4473 type AllocMetric struct { 4474 // NodesEvaluated is the number of nodes that were evaluated 4475 NodesEvaluated int 4476 4477 // NodesFiltered is the number of nodes filtered due to a constraint 4478 NodesFiltered int 4479 4480 // NodesAvailable is the number of nodes available for evaluation per DC. 
4481 NodesAvailable map[string]int 4482 4483 // ClassFiltered is the number of nodes filtered by class 4484 ClassFiltered map[string]int 4485 4486 // ConstraintFiltered is the number of failures caused by constraint 4487 ConstraintFiltered map[string]int 4488 4489 // NodesExhausted is the number of nodes skipped due to being 4490 // exhausted of at least one resource 4491 NodesExhausted int 4492 4493 // ClassExhausted is the number of nodes exhausted by class 4494 ClassExhausted map[string]int 4495 4496 // DimensionExhausted provides the count by dimension or reason 4497 DimensionExhausted map[string]int 4498 4499 // Scores is the scores of the final few nodes remaining 4500 // for placement. The top score is typically selected. 4501 Scores map[string]float64 4502 4503 // AllocationTime is a measure of how long the allocation 4504 // attempt took. This can affect performance and SLAs. 4505 AllocationTime time.Duration 4506 4507 // CoalescedFailures indicates the number of other 4508 // allocations that were coalesced into this failed allocation. 4509 // This is to prevent creating many failed allocations for a 4510 // single task group. 
4511 CoalescedFailures int 4512 } 4513 4514 func (a *AllocMetric) Copy() *AllocMetric { 4515 if a == nil { 4516 return nil 4517 } 4518 na := new(AllocMetric) 4519 *na = *a 4520 na.NodesAvailable = helper.CopyMapStringInt(na.NodesAvailable) 4521 na.ClassFiltered = helper.CopyMapStringInt(na.ClassFiltered) 4522 na.ConstraintFiltered = helper.CopyMapStringInt(na.ConstraintFiltered) 4523 na.ClassExhausted = helper.CopyMapStringInt(na.ClassExhausted) 4524 na.DimensionExhausted = helper.CopyMapStringInt(na.DimensionExhausted) 4525 na.Scores = helper.CopyMapStringFloat64(na.Scores) 4526 return na 4527 } 4528 4529 func (a *AllocMetric) EvaluateNode() { 4530 a.NodesEvaluated += 1 4531 } 4532 4533 func (a *AllocMetric) FilterNode(node *Node, constraint string) { 4534 a.NodesFiltered += 1 4535 if node != nil && node.NodeClass != "" { 4536 if a.ClassFiltered == nil { 4537 a.ClassFiltered = make(map[string]int) 4538 } 4539 a.ClassFiltered[node.NodeClass] += 1 4540 } 4541 if constraint != "" { 4542 if a.ConstraintFiltered == nil { 4543 a.ConstraintFiltered = make(map[string]int) 4544 } 4545 a.ConstraintFiltered[constraint] += 1 4546 } 4547 } 4548 4549 func (a *AllocMetric) ExhaustedNode(node *Node, dimension string) { 4550 a.NodesExhausted += 1 4551 if node != nil && node.NodeClass != "" { 4552 if a.ClassExhausted == nil { 4553 a.ClassExhausted = make(map[string]int) 4554 } 4555 a.ClassExhausted[node.NodeClass] += 1 4556 } 4557 if dimension != "" { 4558 if a.DimensionExhausted == nil { 4559 a.DimensionExhausted = make(map[string]int) 4560 } 4561 a.DimensionExhausted[dimension] += 1 4562 } 4563 } 4564 4565 func (a *AllocMetric) ScoreNode(node *Node, name string, score float64) { 4566 if a.Scores == nil { 4567 a.Scores = make(map[string]float64) 4568 } 4569 key := fmt.Sprintf("%s.%s", node.ID, name) 4570 a.Scores[key] = score 4571 } 4572 4573 // AllocDeploymentStatus captures the status of the allocation as part of the 4574 // deployment. 
This can include things like if the allocation has been marked as 4575 // heatlhy. 4576 type AllocDeploymentStatus struct { 4577 // Healthy marks whether the allocation has been marked healthy or unhealthy 4578 // as part of a deployment. It can be unset if it has neither been marked 4579 // healthy or unhealthy. 4580 Healthy *bool 4581 4582 // ModifyIndex is the raft index in which the deployment status was last 4583 // changed. 4584 ModifyIndex uint64 4585 } 4586 4587 // IsHealthy returns if the allocation is marked as healthy as part of a 4588 // deployment 4589 func (a *AllocDeploymentStatus) IsHealthy() bool { 4590 if a == nil { 4591 return false 4592 } 4593 4594 return a.Healthy != nil && *a.Healthy 4595 } 4596 4597 // IsUnhealthy returns if the allocation is marked as unhealthy as part of a 4598 // deployment 4599 func (a *AllocDeploymentStatus) IsUnhealthy() bool { 4600 if a == nil { 4601 return false 4602 } 4603 4604 return a.Healthy != nil && !*a.Healthy 4605 } 4606 4607 func (a *AllocDeploymentStatus) Copy() *AllocDeploymentStatus { 4608 if a == nil { 4609 return nil 4610 } 4611 4612 c := new(AllocDeploymentStatus) 4613 *c = *a 4614 4615 if a.Healthy != nil { 4616 c.Healthy = helper.BoolToPtr(*a.Healthy) 4617 } 4618 4619 return c 4620 } 4621 4622 const ( 4623 EvalStatusBlocked = "blocked" 4624 EvalStatusPending = "pending" 4625 EvalStatusComplete = "complete" 4626 EvalStatusFailed = "failed" 4627 EvalStatusCancelled = "canceled" 4628 ) 4629 4630 const ( 4631 EvalTriggerJobRegister = "job-register" 4632 EvalTriggerJobDeregister = "job-deregister" 4633 EvalTriggerPeriodicJob = "periodic-job" 4634 EvalTriggerNodeUpdate = "node-update" 4635 EvalTriggerScheduled = "scheduled" 4636 EvalTriggerRollingUpdate = "rolling-update" 4637 EvalTriggerDeploymentWatcher = "deployment-watcher" 4638 EvalTriggerFailedFollowUp = "failed-follow-up" 4639 EvalTriggerMaxPlans = "max-plan-attempts" 4640 ) 4641 4642 const ( 4643 // CoreJobEvalGC is used for the garbage collection 
of evaluations 4644 // and allocations. We periodically scan evaluations in a terminal state, 4645 // in which all the corresponding allocations are also terminal. We 4646 // delete these out of the system to bound the state. 4647 CoreJobEvalGC = "eval-gc" 4648 4649 // CoreJobNodeGC is used for the garbage collection of failed nodes. 4650 // We periodically scan nodes in a terminal state, and if they have no 4651 // corresponding allocations we delete these out of the system. 4652 CoreJobNodeGC = "node-gc" 4653 4654 // CoreJobJobGC is used for the garbage collection of eligible jobs. We 4655 // periodically scan garbage collectible jobs and check if both their 4656 // evaluations and allocations are terminal. If so, we delete these out of 4657 // the system. 4658 CoreJobJobGC = "job-gc" 4659 4660 // CoreJobDeploymentGC is used for the garbage collection of eligible 4661 // deployments. We periodically scan garbage collectible deployments and 4662 // check if they are terminal. If so, we delete these out of the system. 4663 CoreJobDeploymentGC = "deployment-gc" 4664 4665 // CoreJobForceGC is used to force garbage collection of all GCable objects. 4666 CoreJobForceGC = "force-gc" 4667 ) 4668 4669 // Evaluation is used anytime we need to apply business logic as a result 4670 // of a change to our desired state (job specification) or the emergent state 4671 // (registered nodes). When the inputs change, we need to "evaluate" them, 4672 // potentially taking action (allocation of work) or doing nothing if the state 4673 // of the world does not require it. 4674 type Evaluation struct { 4675 // ID is a randonly generated UUID used for this evaluation. This 4676 // is assigned upon the creation of the evaluation. 4677 ID string 4678 4679 // Priority is used to control scheduling importance and if this job 4680 // can preempt other jobs. 4681 Priority int 4682 4683 // Type is used to control which schedulers are available to handle 4684 // this evaluation. 
4685 Type string 4686 4687 // TriggeredBy is used to give some insight into why this Eval 4688 // was created. (Job change, node failure, alloc failure, etc). 4689 TriggeredBy string 4690 4691 // JobID is the job this evaluation is scoped to. Evaluations cannot 4692 // be run in parallel for a given JobID, so we serialize on this. 4693 JobID string 4694 4695 // JobModifyIndex is the modify index of the job at the time 4696 // the evaluation was created 4697 JobModifyIndex uint64 4698 4699 // NodeID is the node that was affected triggering the evaluation. 4700 NodeID string 4701 4702 // NodeModifyIndex is the modify index of the node at the time 4703 // the evaluation was created 4704 NodeModifyIndex uint64 4705 4706 // DeploymentID is the ID of the deployment that triggered the evaluation. 4707 DeploymentID string 4708 4709 // Status of the evaluation 4710 Status string 4711 4712 // StatusDescription is meant to provide more human useful information 4713 StatusDescription string 4714 4715 // Wait is a minimum wait time for running the eval. This is used to 4716 // support a rolling upgrade. 4717 Wait time.Duration 4718 4719 // NextEval is the evaluation ID for the eval created to do a followup. 4720 // This is used to support rolling upgrades, where we need a chain of evaluations. 4721 NextEval string 4722 4723 // PreviousEval is the evaluation ID for the eval creating this one to do a followup. 4724 // This is used to support rolling upgrades, where we need a chain of evaluations. 4725 PreviousEval string 4726 4727 // BlockedEval is the evaluation ID for a created blocked eval. A 4728 // blocked eval will be created if all allocations could not be placed due 4729 // to constraints or lacking resources. 4730 BlockedEval string 4731 4732 // FailedTGAllocs are task groups which have allocations that could not be 4733 // made, but the metrics are persisted so that the user can use the feedback 4734 // to determine the cause. 
4735 FailedTGAllocs map[string]*AllocMetric 4736 4737 // ClassEligibility tracks computed node classes that have been explicitly 4738 // marked as eligible or ineligible. 4739 ClassEligibility map[string]bool 4740 4741 // EscapedComputedClass marks whether the job has constraints that are not 4742 // captured by computed node classes. 4743 EscapedComputedClass bool 4744 4745 // AnnotatePlan triggers the scheduler to provide additional annotations 4746 // during the evaluation. This should not be set during normal operations. 4747 AnnotatePlan bool 4748 4749 // QueuedAllocations is the number of unplaced allocations at the time the 4750 // evaluation was processed. The map is keyed by Task Group names. 4751 QueuedAllocations map[string]int 4752 4753 // SnapshotIndex is the Raft index of the snapshot used to process the 4754 // evaluation. As such it will only be set once it has gone through the 4755 // scheduler. 4756 SnapshotIndex uint64 4757 4758 // Raft Indexes 4759 CreateIndex uint64 4760 ModifyIndex uint64 4761 } 4762 4763 // TerminalStatus returns if the current status is terminal and 4764 // will no longer transition. 
4765 func (e *Evaluation) TerminalStatus() bool { 4766 switch e.Status { 4767 case EvalStatusComplete, EvalStatusFailed, EvalStatusCancelled: 4768 return true 4769 default: 4770 return false 4771 } 4772 } 4773 4774 func (e *Evaluation) GoString() string { 4775 return fmt.Sprintf("<Eval '%s' JobID: '%s'>", e.ID, e.JobID) 4776 } 4777 4778 func (e *Evaluation) Copy() *Evaluation { 4779 if e == nil { 4780 return nil 4781 } 4782 ne := new(Evaluation) 4783 *ne = *e 4784 4785 // Copy ClassEligibility 4786 if e.ClassEligibility != nil { 4787 classes := make(map[string]bool, len(e.ClassEligibility)) 4788 for class, elig := range e.ClassEligibility { 4789 classes[class] = elig 4790 } 4791 ne.ClassEligibility = classes 4792 } 4793 4794 // Copy FailedTGAllocs 4795 if e.FailedTGAllocs != nil { 4796 failedTGs := make(map[string]*AllocMetric, len(e.FailedTGAllocs)) 4797 for tg, metric := range e.FailedTGAllocs { 4798 failedTGs[tg] = metric.Copy() 4799 } 4800 ne.FailedTGAllocs = failedTGs 4801 } 4802 4803 // Copy queued allocations 4804 if e.QueuedAllocations != nil { 4805 queuedAllocations := make(map[string]int, len(e.QueuedAllocations)) 4806 for tg, num := range e.QueuedAllocations { 4807 queuedAllocations[tg] = num 4808 } 4809 ne.QueuedAllocations = queuedAllocations 4810 } 4811 4812 return ne 4813 } 4814 4815 // ShouldEnqueue checks if a given evaluation should be enqueued into the 4816 // eval_broker 4817 func (e *Evaluation) ShouldEnqueue() bool { 4818 switch e.Status { 4819 case EvalStatusPending: 4820 return true 4821 case EvalStatusComplete, EvalStatusFailed, EvalStatusBlocked, EvalStatusCancelled: 4822 return false 4823 default: 4824 panic(fmt.Sprintf("unhandled evaluation (%s) status %s", e.ID, e.Status)) 4825 } 4826 } 4827 4828 // ShouldBlock checks if a given evaluation should be entered into the blocked 4829 // eval tracker. 
4830 func (e *Evaluation) ShouldBlock() bool { 4831 switch e.Status { 4832 case EvalStatusBlocked: 4833 return true 4834 case EvalStatusComplete, EvalStatusFailed, EvalStatusPending, EvalStatusCancelled: 4835 return false 4836 default: 4837 panic(fmt.Sprintf("unhandled evaluation (%s) status %s", e.ID, e.Status)) 4838 } 4839 } 4840 4841 // MakePlan is used to make a plan from the given evaluation 4842 // for a given Job 4843 func (e *Evaluation) MakePlan(j *Job) *Plan { 4844 p := &Plan{ 4845 EvalID: e.ID, 4846 Priority: e.Priority, 4847 Job: j, 4848 NodeUpdate: make(map[string][]*Allocation), 4849 NodeAllocation: make(map[string][]*Allocation), 4850 } 4851 if j != nil { 4852 p.AllAtOnce = j.AllAtOnce 4853 } 4854 return p 4855 } 4856 4857 // NextRollingEval creates an evaluation to followup this eval for rolling updates 4858 func (e *Evaluation) NextRollingEval(wait time.Duration) *Evaluation { 4859 return &Evaluation{ 4860 ID: GenerateUUID(), 4861 Priority: e.Priority, 4862 Type: e.Type, 4863 TriggeredBy: EvalTriggerRollingUpdate, 4864 JobID: e.JobID, 4865 JobModifyIndex: e.JobModifyIndex, 4866 Status: EvalStatusPending, 4867 Wait: wait, 4868 PreviousEval: e.ID, 4869 } 4870 } 4871 4872 // CreateBlockedEval creates a blocked evaluation to followup this eval to place any 4873 // failed allocations. It takes the classes marked explicitly eligible or 4874 // ineligible and whether the job has escaped computed node classes. 
4875 func (e *Evaluation) CreateBlockedEval(classEligibility map[string]bool, escaped bool) *Evaluation { 4876 return &Evaluation{ 4877 ID: GenerateUUID(), 4878 Priority: e.Priority, 4879 Type: e.Type, 4880 TriggeredBy: e.TriggeredBy, 4881 JobID: e.JobID, 4882 JobModifyIndex: e.JobModifyIndex, 4883 Status: EvalStatusBlocked, 4884 PreviousEval: e.ID, 4885 ClassEligibility: classEligibility, 4886 EscapedComputedClass: escaped, 4887 } 4888 } 4889 4890 // CreateFailedFollowUpEval creates a follow up evaluation when the current one 4891 // has been marked as failed becasue it has hit the delivery limit and will not 4892 // be retried by the eval_broker. 4893 func (e *Evaluation) CreateFailedFollowUpEval(wait time.Duration) *Evaluation { 4894 return &Evaluation{ 4895 ID: GenerateUUID(), 4896 Priority: e.Priority, 4897 Type: e.Type, 4898 TriggeredBy: EvalTriggerFailedFollowUp, 4899 JobID: e.JobID, 4900 JobModifyIndex: e.JobModifyIndex, 4901 Status: EvalStatusPending, 4902 Wait: wait, 4903 PreviousEval: e.ID, 4904 } 4905 } 4906 4907 // Plan is used to submit a commit plan for task allocations. These 4908 // are submitted to the leader which verifies that resources have 4909 // not been overcommitted before admiting the plan. 4910 type Plan struct { 4911 // EvalID is the evaluation ID this plan is associated with 4912 EvalID string 4913 4914 // EvalToken is used to prevent a split-brain processing of 4915 // an evaluation. There should only be a single scheduler running 4916 // an Eval at a time, but this could be violated after a leadership 4917 // transition. This unique token is used to reject plans that are 4918 // being submitted from a different leader. 4919 EvalToken string 4920 4921 // Priority is the priority of the upstream job 4922 Priority int 4923 4924 // AllAtOnce is used to control if incremental scheduling of task groups 4925 // is allowed or if we must do a gang scheduling of the entire job. 4926 // If this is false, a plan may be partially applied. 
Otherwise, the 4927 // entire plan must be able to make progress. 4928 AllAtOnce bool 4929 4930 // Job is the parent job of all the allocations in the Plan. 4931 // Since a Plan only involves a single Job, we can reduce the size 4932 // of the plan by only including it once. 4933 Job *Job 4934 4935 // NodeUpdate contains all the allocations for each node. For each node, 4936 // this is a list of the allocations to update to either stop or evict. 4937 NodeUpdate map[string][]*Allocation 4938 4939 // NodeAllocation contains all the allocations for each node. 4940 // The evicts must be considered prior to the allocations. 4941 NodeAllocation map[string][]*Allocation 4942 4943 // Annotations contains annotations by the scheduler to be used by operators 4944 // to understand the decisions made by the scheduler. 4945 Annotations *PlanAnnotations 4946 4947 // Deployment is the deployment created or updated by the scheduler that 4948 // should be applied by the planner. 4949 Deployment *Deployment 4950 4951 // DeploymentUpdates is a set of status updates to apply to the given 4952 // deployments. This allows the scheduler to cancel any unneeded deployment 4953 // because the job is stopped or the update block is removed. 4954 DeploymentUpdates []*DeploymentStatusUpdate 4955 } 4956 4957 // AppendUpdate marks the allocation for eviction. The clientStatus of the 4958 // allocation may be optionally set by passing in a non-empty value. 4959 func (p *Plan) AppendUpdate(alloc *Allocation, desiredStatus, desiredDesc, clientStatus string) { 4960 newAlloc := new(Allocation) 4961 *newAlloc = *alloc 4962 4963 // If the job is not set in the plan we are deregistering a job so we 4964 // extract the job from the allocation. 4965 if p.Job == nil && newAlloc.Job != nil { 4966 p.Job = newAlloc.Job 4967 } 4968 4969 // Normalize the job 4970 newAlloc.Job = nil 4971 4972 // Strip the resources as it can be rebuilt. 
4973 newAlloc.Resources = nil 4974 4975 newAlloc.DesiredStatus = desiredStatus 4976 newAlloc.DesiredDescription = desiredDesc 4977 4978 if clientStatus != "" { 4979 newAlloc.ClientStatus = clientStatus 4980 } 4981 4982 node := alloc.NodeID 4983 existing := p.NodeUpdate[node] 4984 p.NodeUpdate[node] = append(existing, newAlloc) 4985 } 4986 4987 func (p *Plan) PopUpdate(alloc *Allocation) { 4988 existing := p.NodeUpdate[alloc.NodeID] 4989 n := len(existing) 4990 if n > 0 && existing[n-1].ID == alloc.ID { 4991 existing = existing[:n-1] 4992 if len(existing) > 0 { 4993 p.NodeUpdate[alloc.NodeID] = existing 4994 } else { 4995 delete(p.NodeUpdate, alloc.NodeID) 4996 } 4997 } 4998 } 4999 5000 func (p *Plan) AppendAlloc(alloc *Allocation) { 5001 node := alloc.NodeID 5002 existing := p.NodeAllocation[node] 5003 p.NodeAllocation[node] = append(existing, alloc) 5004 } 5005 5006 // IsNoOp checks if this plan would do nothing 5007 func (p *Plan) IsNoOp() bool { 5008 return len(p.NodeUpdate) == 0 && 5009 len(p.NodeAllocation) == 0 && 5010 p.Deployment == nil && 5011 len(p.DeploymentUpdates) == 0 5012 } 5013 5014 // PlanResult is the result of a plan submitted to the leader. 5015 type PlanResult struct { 5016 // NodeUpdate contains all the updates that were committed. 5017 NodeUpdate map[string][]*Allocation 5018 5019 // NodeAllocation contains all the allocations that were committed. 5020 NodeAllocation map[string][]*Allocation 5021 5022 // Deployment is the deployment that was committed. 5023 Deployment *Deployment 5024 5025 // DeploymentUpdates is the set of deployment updates that were commited. 5026 DeploymentUpdates []*DeploymentStatusUpdate 5027 5028 // RefreshIndex is the index the worker should refresh state up to. 5029 // This allows all evictions and allocations to be materialized. 5030 // If any allocations were rejected due to stale data (node state, 5031 // over committed) this can be used to force a worker refresh. 
5032 RefreshIndex uint64 5033 5034 // AllocIndex is the Raft index in which the evictions and 5035 // allocations took place. This is used for the write index. 5036 AllocIndex uint64 5037 } 5038 5039 // IsNoOp checks if this plan result would do nothing 5040 func (p *PlanResult) IsNoOp() bool { 5041 return len(p.NodeUpdate) == 0 && len(p.NodeAllocation) == 0 && 5042 len(p.DeploymentUpdates) == 0 && p.Deployment == nil 5043 } 5044 5045 // FullCommit is used to check if all the allocations in a plan 5046 // were committed as part of the result. Returns if there was 5047 // a match, and the number of expected and actual allocations. 5048 func (p *PlanResult) FullCommit(plan *Plan) (bool, int, int) { 5049 expected := 0 5050 actual := 0 5051 for name, allocList := range plan.NodeAllocation { 5052 didAlloc, _ := p.NodeAllocation[name] 5053 expected += len(allocList) 5054 actual += len(didAlloc) 5055 } 5056 return actual == expected, expected, actual 5057 } 5058 5059 // PlanAnnotations holds annotations made by the scheduler to give further debug 5060 // information to operators. 5061 type PlanAnnotations struct { 5062 // DesiredTGUpdates is the set of desired updates per task group. 5063 DesiredTGUpdates map[string]*DesiredUpdates 5064 } 5065 5066 // DesiredUpdates is the set of changes the scheduler would like to make given 5067 // sufficient resources and cluster capacity. 
5068 type DesiredUpdates struct { 5069 Ignore uint64 5070 Place uint64 5071 Migrate uint64 5072 Stop uint64 5073 InPlaceUpdate uint64 5074 DestructiveUpdate uint64 5075 Canary uint64 5076 } 5077 5078 func (d *DesiredUpdates) GoString() string { 5079 return fmt.Sprintf("(place %d) (inplace %d) (destructive %d) (stop %d) (migrate %d) (ignore %d) (canary %d)", 5080 d.Place, d.InPlaceUpdate, d.DestructiveUpdate, d.Stop, d.Migrate, d.Ignore, d.Canary) 5081 } 5082 5083 // msgpackHandle is a shared handle for encoding/decoding of structs 5084 var MsgpackHandle = func() *codec.MsgpackHandle { 5085 h := &codec.MsgpackHandle{RawToString: true} 5086 5087 // Sets the default type for decoding a map into a nil interface{}. 5088 // This is necessary in particular because we store the driver configs as a 5089 // nil interface{}. 5090 h.MapType = reflect.TypeOf(map[string]interface{}(nil)) 5091 return h 5092 }() 5093 5094 var ( 5095 // JsonHandle and JsonHandlePretty are the codec handles to JSON encode 5096 // structs. The pretty handle will add indents for easier human consumption. 5097 JsonHandle = &codec.JsonHandle{ 5098 HTMLCharsAsIs: true, 5099 } 5100 JsonHandlePretty = &codec.JsonHandle{ 5101 HTMLCharsAsIs: true, 5102 Indent: 4, 5103 } 5104 ) 5105 5106 var HashiMsgpackHandle = func() *hcodec.MsgpackHandle { 5107 h := &hcodec.MsgpackHandle{RawToString: true} 5108 5109 // Sets the default type for decoding a map into a nil interface{}. 5110 // This is necessary in particular because we store the driver configs as a 5111 // nil interface{}. 
5112 h.MapType = reflect.TypeOf(map[string]interface{}(nil)) 5113 return h 5114 }() 5115 5116 // Decode is used to decode a MsgPack encoded object 5117 func Decode(buf []byte, out interface{}) error { 5118 return codec.NewDecoder(bytes.NewReader(buf), MsgpackHandle).Decode(out) 5119 } 5120 5121 // Encode is used to encode a MsgPack object with type prefix 5122 func Encode(t MessageType, msg interface{}) ([]byte, error) { 5123 var buf bytes.Buffer 5124 buf.WriteByte(uint8(t)) 5125 err := codec.NewEncoder(&buf, MsgpackHandle).Encode(msg) 5126 return buf.Bytes(), err 5127 } 5128 5129 // KeyringResponse is a unified key response and can be used for install, 5130 // remove, use, as well as listing key queries. 5131 type KeyringResponse struct { 5132 Messages map[string]string 5133 Keys map[string]int 5134 NumNodes int 5135 } 5136 5137 // KeyringRequest is request objects for serf key operations. 5138 type KeyringRequest struct { 5139 Key string 5140 } 5141 5142 // RecoverableError wraps an error and marks whether it is recoverable and could 5143 // be retried or it is fatal. 5144 type RecoverableError struct { 5145 Err string 5146 Recoverable bool 5147 } 5148 5149 // NewRecoverableError is used to wrap an error and mark it as recoverable or 5150 // not. 5151 func NewRecoverableError(e error, recoverable bool) error { 5152 if e == nil { 5153 return nil 5154 } 5155 5156 return &RecoverableError{ 5157 Err: e.Error(), 5158 Recoverable: recoverable, 5159 } 5160 } 5161 5162 // WrapRecoverable wraps an existing error in a new RecoverableError with a new 5163 // message. If the error was recoverable before the returned error is as well; 5164 // otherwise it is unrecoverable. 
5165 func WrapRecoverable(msg string, err error) error { 5166 return &RecoverableError{Err: msg, Recoverable: IsRecoverable(err)} 5167 } 5168 5169 func (r *RecoverableError) Error() string { 5170 return r.Err 5171 } 5172 5173 func (r *RecoverableError) IsRecoverable() bool { 5174 return r.Recoverable 5175 } 5176 5177 // Recoverable is an interface for errors to implement to indicate whether or 5178 // not they are fatal or recoverable. 5179 type Recoverable interface { 5180 error 5181 IsRecoverable() bool 5182 } 5183 5184 // IsRecoverable returns true if error is a RecoverableError with 5185 // Recoverable=true. Otherwise false is returned. 5186 func IsRecoverable(e error) bool { 5187 if re, ok := e.(Recoverable); ok { 5188 return re.IsRecoverable() 5189 } 5190 return false 5191 }