github.com/hashicorp/nomad/api@v0.0.0-20240306165712-3193ac204f65/allocations.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package api
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"io"
    10  	"sort"
    11  	"strings"
    12  	"time"
    13  )
    14  
    15  var (
    16  	// NodeDownErr marks an operation as not able to complete since the node is
    17  	// down.
    18  	NodeDownErr = errors.New("node down")
    19  )
    20  
    21  const (
    22  	AllocDesiredStatusRun   = "run"   // Allocation should run
    23  	AllocDesiredStatusStop  = "stop"  // Allocation should stop
    24  	AllocDesiredStatusEvict = "evict" // Allocation should stop, and was evicted
    25  )
    26  
    27  const (
    28  	AllocClientStatusPending  = "pending"
    29  	AllocClientStatusRunning  = "running"
    30  	AllocClientStatusComplete = "complete"
    31  	AllocClientStatusFailed   = "failed"
    32  	AllocClientStatusLost     = "lost"
    33  	AllocClientStatusUnknown  = "unknown"
    34  )
    35  
    36  const (
    37  	AllocRestartReasonWithinPolicy = "Restart within policy"
    38  )
    39  
    40  // Allocations is used to query the alloc-related endpoints.
    41  type Allocations struct {
    42  	client *Client
    43  }
    44  
    45  // Allocations returns a handle on the allocs endpoints.
    46  func (c *Client) Allocations() *Allocations {
    47  	return &Allocations{client: c}
    48  }
    49  
    50  // List returns a list of all of the allocations.
    51  func (a *Allocations) List(q *QueryOptions) ([]*AllocationListStub, *QueryMeta, error) {
    52  	var resp []*AllocationListStub
    53  	qm, err := a.client.query("/v1/allocations", &resp, q)
    54  	if err != nil {
    55  		return nil, nil, err
    56  	}
    57  	sort.Sort(AllocIndexSort(resp))
    58  	return resp, qm, nil
    59  }
    60  
    61  func (a *Allocations) PrefixList(prefix string) ([]*AllocationListStub, *QueryMeta, error) {
    62  	return a.List(&QueryOptions{Prefix: prefix})
    63  }
    64  
    65  // Info is used to retrieve a single allocation.
    66  func (a *Allocations) Info(allocID string, q *QueryOptions) (*Allocation, *QueryMeta, error) {
    67  	var resp Allocation
    68  	qm, err := a.client.query("/v1/allocation/"+allocID, &resp, q)
    69  	if err != nil {
    70  		return nil, nil, err
    71  	}
    72  	return &resp, qm, nil
    73  }
    74  
    75  // Exec is used to execute a command inside a running task.  The command is to run inside
    76  // the task environment.
    77  //
    78  // The parameters are:
    79  //   - ctx: context to set deadlines or timeout
    80  //   - allocation: the allocation to execute command inside
    81  //   - task: the task's name to execute command in
    82  //   - tty: indicates whether to start a pseudo-tty for the command
    83  //   - stdin, stdout, stderr: the std io to pass to command.
    84  //     If tty is true, then streams need to point to a tty that's alive for the whole process
    85  //   - terminalSizeCh: A channel to send new tty terminal sizes
    86  //
    87  // The call blocks until command terminates (or an error occurs), and returns the exit code.
    88  //
    89  // Note: for cluster topologies where API consumers don't have network access to
    90  // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid
    91  // long pauses on this API call.
    92  func (a *Allocations) Exec(ctx context.Context,
    93  	alloc *Allocation, task string, tty bool, command []string,
    94  	stdin io.Reader, stdout, stderr io.Writer,
    95  	terminalSizeCh <-chan TerminalSize, q *QueryOptions) (exitCode int, err error) {
    96  
    97  	s := &execSession{
    98  		client:  a.client,
    99  		alloc:   alloc,
   100  		task:    task,
   101  		tty:     tty,
   102  		command: command,
   103  
   104  		stdin:  stdin,
   105  		stdout: stdout,
   106  		stderr: stderr,
   107  
   108  		terminalSizeCh: terminalSizeCh,
   109  		q:              q,
   110  	}
   111  
   112  	return s.run(ctx)
   113  }
   114  
   115  // Stats gets allocation resource usage statistics about an allocation.
   116  //
   117  // Note: for cluster topologies where API consumers don't have network access to
   118  // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid
   119  // long pauses on this API call.
   120  func (a *Allocations) Stats(alloc *Allocation, q *QueryOptions) (*AllocResourceUsage, error) {
   121  	var resp AllocResourceUsage
   122  	_, err := a.client.query("/v1/client/allocation/"+alloc.ID+"/stats", &resp, q)
   123  	return &resp, err
   124  }
   125  
   126  // Checks gets status information for nomad service checks that exist in the allocation.
   127  //
   128  // Note: for cluster topologies where API consumers don't have network access to
   129  // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid
   130  // long pauses on this API call.
   131  func (a *Allocations) Checks(allocID string, q *QueryOptions) (AllocCheckStatuses, error) {
   132  	var resp AllocCheckStatuses
   133  	_, err := a.client.query("/v1/client/allocation/"+allocID+"/checks", &resp, q)
   134  	return resp, err
   135  }
   136  
   137  // GC forces a garbage collection of client state for an allocation.
   138  //
   139  // Note: for cluster topologies where API consumers don't have network access to
   140  // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid
   141  // long pauses on this API call.
   142  func (a *Allocations) GC(alloc *Allocation, q *QueryOptions) error {
   143  	var resp struct{}
   144  	_, err := a.client.query("/v1/client/allocation/"+alloc.ID+"/gc", &resp, nil)
   145  	return err
   146  }
   147  
   148  // Restart restarts the tasks that are currently running or a specific task if
   149  // taskName is provided. An error is returned if the task to be restarted is
   150  // not running.
   151  //
   152  // Note: for cluster topologies where API consumers don't have network access to
   153  // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid
   154  // long pauses on this API call.
   155  func (a *Allocations) Restart(alloc *Allocation, taskName string, q *QueryOptions) error {
   156  	req := AllocationRestartRequest{
   157  		TaskName: taskName,
   158  	}
   159  
   160  	var resp struct{}
   161  	_, err := a.client.putQuery("/v1/client/allocation/"+alloc.ID+"/restart", &req, &resp, q)
   162  	return err
   163  }
   164  
   165  // RestartAllTasks restarts all tasks in the allocation, regardless of
   166  // lifecycle type or state. Tasks will restart following their lifecycle order.
   167  //
   168  // Note: for cluster topologies where API consumers don't have network access to
   169  // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid
   170  // long pauses on this API call.
   171  //
   172  // DEPRECATED: This method will be removed in 1.6.0
   173  func (a *Allocations) RestartAllTasks(alloc *Allocation, q *QueryOptions) error {
   174  	req := AllocationRestartRequest{
   175  		AllTasks: true,
   176  	}
   177  
   178  	var resp struct{}
   179  	_, err := a.client.putQuery("/v1/client/allocation/"+alloc.ID+"/restart", &req, &resp, q)
   180  	return err
   181  }
   182  
   183  // Stop stops an allocation.
   184  //
   185  // Note: for cluster topologies where API consumers don't have network access to
   186  // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid
   187  // long pauses on this API call.
   188  //
   189  // BREAKING: This method will have the following signature in 1.6.0
   190  // func (a *Allocations) Stop(allocID string, w *WriteOptions) (*AllocStopResponse, error) {
   191  func (a *Allocations) Stop(alloc *Allocation, q *QueryOptions) (*AllocStopResponse, error) {
   192  	// COMPAT: Remove in 1.6.0
   193  	var w *WriteOptions
   194  	if q != nil {
   195  		w = &WriteOptions{
   196  			Region:    q.Region,
   197  			Namespace: q.Namespace,
   198  			AuthToken: q.AuthToken,
   199  			Headers:   q.Headers,
   200  			ctx:       q.ctx,
   201  		}
   202  	}
   203  
   204  	var resp AllocStopResponse
   205  	wm, err := a.client.put("/v1/allocation/"+alloc.ID+"/stop", nil, &resp, w)
   206  	if wm != nil {
   207  		resp.LastIndex = wm.LastIndex
   208  		resp.RequestTime = wm.RequestTime
   209  	}
   210  
   211  	return &resp, err
   212  }
   213  
   214  // AllocStopResponse is the response to an `AllocStopRequest`
   215  type AllocStopResponse struct {
   216  	// EvalID is the id of the follow up evalution for the rescheduled alloc.
   217  	EvalID string
   218  
   219  	WriteMeta
   220  }
   221  
   222  // Signal sends a signal to the allocation.
   223  //
   224  // Note: for cluster topologies where API consumers don't have network access to
   225  // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid
   226  // long pauses on this API call.
   227  func (a *Allocations) Signal(alloc *Allocation, q *QueryOptions, task, signal string) error {
   228  	req := AllocSignalRequest{
   229  		Signal: signal,
   230  		Task:   task,
   231  	}
   232  
   233  	var resp GenericResponse
   234  	_, err := a.client.putQuery("/v1/client/allocation/"+alloc.ID+"/signal", &req, &resp, q)
   235  	return err
   236  }
   237  
   238  // Services is used to return a list of service registrations associated to the
   239  // specified allocID.
   240  func (a *Allocations) Services(allocID string, q *QueryOptions) ([]*ServiceRegistration, *QueryMeta, error) {
   241  	var resp []*ServiceRegistration
   242  	qm, err := a.client.query("/v1/allocation/"+allocID+"/services", &resp, q)
   243  	return resp, qm, err
   244  }
   245  
   246  // Allocation is used for serialization of allocations.
   247  type Allocation struct {
   248  	ID                    string
   249  	Namespace             string
   250  	EvalID                string
   251  	Name                  string
   252  	NodeID                string
   253  	NodeName              string
   254  	JobID                 string
   255  	Job                   *Job
   256  	TaskGroup             string
   257  	Resources             *Resources
   258  	TaskResources         map[string]*Resources
   259  	AllocatedResources    *AllocatedResources
   260  	Services              map[string]string
   261  	Metrics               *AllocationMetric
   262  	DesiredStatus         string
   263  	DesiredDescription    string
   264  	DesiredTransition     DesiredTransition
   265  	ClientStatus          string
   266  	ClientDescription     string
   267  	TaskStates            map[string]*TaskState
   268  	DeploymentID          string
   269  	DeploymentStatus      *AllocDeploymentStatus
   270  	FollowupEvalID        string
   271  	PreviousAllocation    string
   272  	NextAllocation        string
   273  	RescheduleTracker     *RescheduleTracker
   274  	NetworkStatus         *AllocNetworkStatus
   275  	PreemptedAllocations  []string
   276  	PreemptedByAllocation string
   277  	CreateIndex           uint64
   278  	ModifyIndex           uint64
   279  	AllocModifyIndex      uint64
   280  	CreateTime            int64
   281  	ModifyTime            int64
   282  }
   283  
   284  // AllocationMetric is used to deserialize allocation metrics.
   285  type AllocationMetric struct {
   286  	NodesEvaluated     int
   287  	NodesFiltered      int
   288  	NodesInPool        int
   289  	NodesAvailable     map[string]int
   290  	ClassFiltered      map[string]int
   291  	ConstraintFiltered map[string]int
   292  	NodesExhausted     int
   293  	ClassExhausted     map[string]int
   294  	DimensionExhausted map[string]int
   295  	QuotaExhausted     []string
   296  	ResourcesExhausted map[string]*Resources
   297  	// Deprecated, replaced with ScoreMetaData
   298  	Scores            map[string]float64
   299  	AllocationTime    time.Duration
   300  	CoalescedFailures int
   301  	ScoreMetaData     []*NodeScoreMeta
   302  }
   303  
   304  // NodeScoreMeta is used to serialize node scoring metadata
   305  // displayed in the CLI during verbose mode
   306  type NodeScoreMeta struct {
   307  	NodeID    string
   308  	Scores    map[string]float64
   309  	NormScore float64
   310  }
   311  
   312  // Stub returns a list stub for the allocation
   313  func (a *Allocation) Stub() *AllocationListStub {
   314  	stub := &AllocationListStub{
   315  		ID:                    a.ID,
   316  		EvalID:                a.EvalID,
   317  		Name:                  a.Name,
   318  		Namespace:             a.Namespace,
   319  		NodeID:                a.NodeID,
   320  		NodeName:              a.NodeName,
   321  		JobID:                 a.JobID,
   322  		TaskGroup:             a.TaskGroup,
   323  		DesiredStatus:         a.DesiredStatus,
   324  		DesiredDescription:    a.DesiredDescription,
   325  		ClientStatus:          a.ClientStatus,
   326  		ClientDescription:     a.ClientDescription,
   327  		TaskStates:            a.TaskStates,
   328  		DeploymentStatus:      a.DeploymentStatus,
   329  		FollowupEvalID:        a.FollowupEvalID,
   330  		NextAllocation:        a.NextAllocation,
   331  		RescheduleTracker:     a.RescheduleTracker,
   332  		PreemptedAllocations:  a.PreemptedAllocations,
   333  		PreemptedByAllocation: a.PreemptedByAllocation,
   334  		CreateIndex:           a.CreateIndex,
   335  		ModifyIndex:           a.ModifyIndex,
   336  		CreateTime:            a.CreateTime,
   337  		ModifyTime:            a.ModifyTime,
   338  	}
   339  
   340  	if a.Job != nil {
   341  		stub.JobType = *a.Job.Type
   342  		stub.JobVersion = *a.Job.Version
   343  	}
   344  
   345  	return stub
   346  }
   347  
   348  // ServerTerminalStatus returns true if the desired state of the allocation is
   349  // terminal.
   350  func (a *Allocation) ServerTerminalStatus() bool {
   351  	switch a.DesiredStatus {
   352  	case AllocDesiredStatusStop, AllocDesiredStatusEvict:
   353  		return true
   354  	default:
   355  		return false
   356  	}
   357  }
   358  
   359  // ClientTerminalStatus returns true if the client status is terminal and will
   360  // therefore no longer transition.
   361  func (a *Allocation) ClientTerminalStatus() bool {
   362  	switch a.ClientStatus {
   363  	case AllocClientStatusComplete, AllocClientStatusFailed, AllocClientStatusLost:
   364  		return true
   365  	default:
   366  		return false
   367  	}
   368  }
   369  
   370  // AllocationListStub is used to return a subset of an allocation
   371  // during list operations.
   372  type AllocationListStub struct {
   373  	ID                    string
   374  	EvalID                string
   375  	Name                  string
   376  	Namespace             string
   377  	NodeID                string
   378  	NodeName              string
   379  	JobID                 string
   380  	JobType               string
   381  	JobVersion            uint64
   382  	TaskGroup             string
   383  	AllocatedResources    *AllocatedResources `json:",omitempty"`
   384  	DesiredStatus         string
   385  	DesiredDescription    string
   386  	ClientStatus          string
   387  	ClientDescription     string
   388  	TaskStates            map[string]*TaskState
   389  	DeploymentStatus      *AllocDeploymentStatus
   390  	FollowupEvalID        string
   391  	NextAllocation        string
   392  	RescheduleTracker     *RescheduleTracker
   393  	PreemptedAllocations  []string
   394  	PreemptedByAllocation string
   395  	CreateIndex           uint64
   396  	ModifyIndex           uint64
   397  	CreateTime            int64
   398  	ModifyTime            int64
   399  }
   400  
   401  // AllocDeploymentStatus captures the status of the allocation as part of the
   402  // deployment. This can include things like if the allocation has been marked as
   403  // healthy.
   404  type AllocDeploymentStatus struct {
   405  	Healthy     *bool
   406  	Timestamp   time.Time
   407  	Canary      bool
   408  	ModifyIndex uint64
   409  }
   410  
   411  // AllocNetworkStatus captures the status of an allocation's network during runtime.
   412  // Depending on the network mode, an allocation's address may need to be known to other
   413  // systems in Nomad such as service registration.
   414  type AllocNetworkStatus struct {
   415  	InterfaceName string
   416  	Address       string
   417  	DNS           *DNSConfig
   418  }
   419  
   420  type AllocatedResources struct {
   421  	Tasks  map[string]*AllocatedTaskResources
   422  	Shared AllocatedSharedResources
   423  }
   424  
   425  type AllocatedTaskResources struct {
   426  	Cpu      AllocatedCpuResources
   427  	Memory   AllocatedMemoryResources
   428  	Networks []*NetworkResource
   429  	Devices  []*AllocatedDeviceResource
   430  }
   431  
   432  type AllocatedSharedResources struct {
   433  	DiskMB   int64
   434  	Networks []*NetworkResource
   435  	Ports    []PortMapping
   436  }
   437  
   438  type PortMapping struct {
   439  	Label  string
   440  	Value  int
   441  	To     int
   442  	HostIP string
   443  }
   444  
   445  type AllocatedCpuResources struct {
   446  	CpuShares int64
   447  }
   448  
   449  type AllocatedMemoryResources struct {
   450  	MemoryMB    int64
   451  	MemoryMaxMB int64
   452  }
   453  
   454  type AllocatedDeviceResource struct {
   455  	Vendor    string
   456  	Type      string
   457  	Name      string
   458  	DeviceIDs []string
   459  }
   460  
   461  // AllocIndexSort reverse sorts allocs by CreateIndex.
   462  type AllocIndexSort []*AllocationListStub
   463  
   464  func (a AllocIndexSort) Len() int {
   465  	return len(a)
   466  }
   467  
   468  func (a AllocIndexSort) Less(i, j int) bool {
   469  	return a[i].CreateIndex > a[j].CreateIndex
   470  }
   471  
   472  func (a AllocIndexSort) Swap(i, j int) {
   473  	a[i], a[j] = a[j], a[i]
   474  }
   475  
   476  func (a Allocation) GetTaskGroup() *TaskGroup {
   477  	for _, tg := range a.Job.TaskGroups {
   478  		if *tg.Name == a.TaskGroup {
   479  			return tg
   480  		}
   481  	}
   482  	return nil
   483  }
   484  
   485  // RescheduleInfo is used to calculate remaining reschedule attempts
   486  // according to the given time and the task groups reschedule policy
   487  func (a Allocation) RescheduleInfo(t time.Time) (int, int) {
   488  	tg := a.GetTaskGroup()
   489  	if tg == nil || tg.ReschedulePolicy == nil {
   490  		return 0, 0
   491  	}
   492  	reschedulePolicy := tg.ReschedulePolicy
   493  	availableAttempts := *reschedulePolicy.Attempts
   494  	interval := *reschedulePolicy.Interval
   495  	attempted := 0
   496  
   497  	// Loop over reschedule tracker to find attempts within the restart policy's interval
   498  	if a.RescheduleTracker != nil && availableAttempts > 0 && interval > 0 {
   499  		for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
   500  			lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
   501  			timeDiff := t.UTC().UnixNano() - lastAttempt
   502  			if timeDiff < interval.Nanoseconds() {
   503  				attempted += 1
   504  			}
   505  		}
   506  	}
   507  	return attempted, availableAttempts
   508  }
   509  
   510  type AllocationRestartRequest struct {
   511  	TaskName string
   512  	AllTasks bool
   513  }
   514  
   515  type AllocSignalRequest struct {
   516  	Task   string
   517  	Signal string
   518  }
   519  
   520  // GenericResponse is used to respond to a request where no
   521  // specific response information is needed.
   522  type GenericResponse struct {
   523  	WriteMeta
   524  }
   525  
   526  // RescheduleTracker encapsulates previous reschedule events
   527  type RescheduleTracker struct {
   528  	Events []*RescheduleEvent
   529  }
   530  
   531  // RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation
   532  type RescheduleEvent struct {
   533  	// RescheduleTime is the timestamp of a reschedule attempt
   534  	RescheduleTime int64
   535  
   536  	// PrevAllocID is the ID of the previous allocation being restarted
   537  	PrevAllocID string
   538  
   539  	// PrevNodeID is the node ID of the previous allocation
   540  	PrevNodeID string
   541  }
   542  
   543  // DesiredTransition is used to mark an allocation as having a desired state
   544  // transition. This information can be used by the scheduler to make the
   545  // correct decision.
   546  type DesiredTransition struct {
   547  	// Migrate is used to indicate that this allocation should be stopped and
   548  	// migrated to another node.
   549  	Migrate *bool
   550  
   551  	// Reschedule is used to indicate that this allocation is eligible to be
   552  	// rescheduled.
   553  	Reschedule *bool
   554  }
   555  
   556  // ShouldMigrate returns whether the transition object dictates a migration.
   557  func (d DesiredTransition) ShouldMigrate() bool {
   558  	return d.Migrate != nil && *d.Migrate
   559  }
   560  
   561  // ExecStreamingIOOperation represents a stream write operation: either appending data or close (exclusively)
   562  type ExecStreamingIOOperation struct {
   563  	Data  []byte `json:"data,omitempty"`
   564  	Close bool   `json:"close,omitempty"`
   565  }
   566  
   567  // TerminalSize represents the size of the terminal
   568  type TerminalSize struct {
   569  	Height int `json:"height,omitempty"`
   570  	Width  int `json:"width,omitempty"`
   571  }
   572  
   573  var execStreamingInputHeartbeat = ExecStreamingInput{}
   574  
   575  // ExecStreamingInput represents user input to be sent to nomad exec handler.
   576  //
   577  // At most one field should be set.
   578  type ExecStreamingInput struct {
   579  	Stdin   *ExecStreamingIOOperation `json:"stdin,omitempty"`
   580  	TTYSize *TerminalSize             `json:"tty_size,omitempty"`
   581  }
   582  
   583  // ExecStreamingExitResult captures the exit code of just completed nomad exec command
   584  type ExecStreamingExitResult struct {
   585  	ExitCode int `json:"exit_code"`
   586  }
   587  
   588  // ExecStreamingOutput represents an output streaming entity, e.g. stdout/stderr update or termination
   589  //
   590  // At most one of these fields should be set: `Stdout`, `Stderr`, or `Result`.
   591  // If `Exited` is true, then `Result` is non-nil, and other fields are nil.
   592  type ExecStreamingOutput struct {
   593  	Stdout *ExecStreamingIOOperation `json:"stdout,omitempty"`
   594  	Stderr *ExecStreamingIOOperation `json:"stderr,omitempty"`
   595  
   596  	Exited bool                     `json:"exited,omitempty"`
   597  	Result *ExecStreamingExitResult `json:"result,omitempty"`
   598  }
   599  
   600  func AllocSuffix(name string) string {
   601  	idx := strings.LastIndex(name, "[")
   602  	if idx == -1 {
   603  		return ""
   604  	}
   605  	suffix := name[idx:]
   606  	return suffix
   607  }