github.com/hashicorp/nomad/api@v0.0.0-20240306165712-3193ac204f65/allocations.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package api 5 6 import ( 7 "context" 8 "errors" 9 "io" 10 "sort" 11 "strings" 12 "time" 13 ) 14 15 var ( 16 // NodeDownErr marks an operation as not able to complete since the node is 17 // down. 18 NodeDownErr = errors.New("node down") 19 ) 20 21 const ( 22 AllocDesiredStatusRun = "run" // Allocation should run 23 AllocDesiredStatusStop = "stop" // Allocation should stop 24 AllocDesiredStatusEvict = "evict" // Allocation should stop, and was evicted 25 ) 26 27 const ( 28 AllocClientStatusPending = "pending" 29 AllocClientStatusRunning = "running" 30 AllocClientStatusComplete = "complete" 31 AllocClientStatusFailed = "failed" 32 AllocClientStatusLost = "lost" 33 AllocClientStatusUnknown = "unknown" 34 ) 35 36 const ( 37 AllocRestartReasonWithinPolicy = "Restart within policy" 38 ) 39 40 // Allocations is used to query the alloc-related endpoints. 41 type Allocations struct { 42 client *Client 43 } 44 45 // Allocations returns a handle on the allocs endpoints. 46 func (c *Client) Allocations() *Allocations { 47 return &Allocations{client: c} 48 } 49 50 // List returns a list of all of the allocations. 51 func (a *Allocations) List(q *QueryOptions) ([]*AllocationListStub, *QueryMeta, error) { 52 var resp []*AllocationListStub 53 qm, err := a.client.query("/v1/allocations", &resp, q) 54 if err != nil { 55 return nil, nil, err 56 } 57 sort.Sort(AllocIndexSort(resp)) 58 return resp, qm, nil 59 } 60 61 func (a *Allocations) PrefixList(prefix string) ([]*AllocationListStub, *QueryMeta, error) { 62 return a.List(&QueryOptions{Prefix: prefix}) 63 } 64 65 // Info is used to retrieve a single allocation. 
66 func (a *Allocations) Info(allocID string, q *QueryOptions) (*Allocation, *QueryMeta, error) { 67 var resp Allocation 68 qm, err := a.client.query("/v1/allocation/"+allocID, &resp, q) 69 if err != nil { 70 return nil, nil, err 71 } 72 return &resp, qm, nil 73 } 74 75 // Exec is used to execute a command inside a running task. The command is to run inside 76 // the task environment. 77 // 78 // The parameters are: 79 // - ctx: context to set deadlines or timeout 80 // - allocation: the allocation to execute command inside 81 // - task: the task's name to execute command in 82 // - tty: indicates whether to start a pseudo-tty for the command 83 // - stdin, stdout, stderr: the std io to pass to command. 84 // If tty is true, then streams need to point to a tty that's alive for the whole process 85 // - terminalSizeCh: A channel to send new tty terminal sizes 86 // 87 // The call blocks until command terminates (or an error occurs), and returns the exit code. 88 // 89 // Note: for cluster topologies where API consumers don't have network access to 90 // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid 91 // long pauses on this API call. 92 func (a *Allocations) Exec(ctx context.Context, 93 alloc *Allocation, task string, tty bool, command []string, 94 stdin io.Reader, stdout, stderr io.Writer, 95 terminalSizeCh <-chan TerminalSize, q *QueryOptions) (exitCode int, err error) { 96 97 s := &execSession{ 98 client: a.client, 99 alloc: alloc, 100 task: task, 101 tty: tty, 102 command: command, 103 104 stdin: stdin, 105 stdout: stdout, 106 stderr: stderr, 107 108 terminalSizeCh: terminalSizeCh, 109 q: q, 110 } 111 112 return s.run(ctx) 113 } 114 115 // Stats gets allocation resource usage statistics about an allocation. 116 // 117 // Note: for cluster topologies where API consumers don't have network access to 118 // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid 119 // long pauses on this API call. 
120 func (a *Allocations) Stats(alloc *Allocation, q *QueryOptions) (*AllocResourceUsage, error) { 121 var resp AllocResourceUsage 122 _, err := a.client.query("/v1/client/allocation/"+alloc.ID+"/stats", &resp, q) 123 return &resp, err 124 } 125 126 // Checks gets status information for nomad service checks that exist in the allocation. 127 // 128 // Note: for cluster topologies where API consumers don't have network access to 129 // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid 130 // long pauses on this API call. 131 func (a *Allocations) Checks(allocID string, q *QueryOptions) (AllocCheckStatuses, error) { 132 var resp AllocCheckStatuses 133 _, err := a.client.query("/v1/client/allocation/"+allocID+"/checks", &resp, q) 134 return resp, err 135 } 136 137 // GC forces a garbage collection of client state for an allocation. 138 // 139 // Note: for cluster topologies where API consumers don't have network access to 140 // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid 141 // long pauses on this API call. 142 func (a *Allocations) GC(alloc *Allocation, q *QueryOptions) error { 143 var resp struct{} 144 _, err := a.client.query("/v1/client/allocation/"+alloc.ID+"/gc", &resp, nil) 145 return err 146 } 147 148 // Restart restarts the tasks that are currently running or a specific task if 149 // taskName is provided. An error is returned if the task to be restarted is 150 // not running. 151 // 152 // Note: for cluster topologies where API consumers don't have network access to 153 // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid 154 // long pauses on this API call. 
155 func (a *Allocations) Restart(alloc *Allocation, taskName string, q *QueryOptions) error { 156 req := AllocationRestartRequest{ 157 TaskName: taskName, 158 } 159 160 var resp struct{} 161 _, err := a.client.putQuery("/v1/client/allocation/"+alloc.ID+"/restart", &req, &resp, q) 162 return err 163 } 164 165 // RestartAllTasks restarts all tasks in the allocation, regardless of 166 // lifecycle type or state. Tasks will restart following their lifecycle order. 167 // 168 // Note: for cluster topologies where API consumers don't have network access to 169 // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid 170 // long pauses on this API call. 171 // 172 // DEPRECATED: This method will be removed in 1.6.0 173 func (a *Allocations) RestartAllTasks(alloc *Allocation, q *QueryOptions) error { 174 req := AllocationRestartRequest{ 175 AllTasks: true, 176 } 177 178 var resp struct{} 179 _, err := a.client.putQuery("/v1/client/allocation/"+alloc.ID+"/restart", &req, &resp, q) 180 return err 181 } 182 183 // Stop stops an allocation. 184 // 185 // Note: for cluster topologies where API consumers don't have network access to 186 // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid 187 // long pauses on this API call. 
188 // 189 // BREAKING: This method will have the following signature in 1.6.0 190 // func (a *Allocations) Stop(allocID string, w *WriteOptions) (*AllocStopResponse, error) { 191 func (a *Allocations) Stop(alloc *Allocation, q *QueryOptions) (*AllocStopResponse, error) { 192 // COMPAT: Remove in 1.6.0 193 var w *WriteOptions 194 if q != nil { 195 w = &WriteOptions{ 196 Region: q.Region, 197 Namespace: q.Namespace, 198 AuthToken: q.AuthToken, 199 Headers: q.Headers, 200 ctx: q.ctx, 201 } 202 } 203 204 var resp AllocStopResponse 205 wm, err := a.client.put("/v1/allocation/"+alloc.ID+"/stop", nil, &resp, w) 206 if wm != nil { 207 resp.LastIndex = wm.LastIndex 208 resp.RequestTime = wm.RequestTime 209 } 210 211 return &resp, err 212 } 213 214 // AllocStopResponse is the response to an `AllocStopRequest` 215 type AllocStopResponse struct { 216 // EvalID is the id of the follow up evalution for the rescheduled alloc. 217 EvalID string 218 219 WriteMeta 220 } 221 222 // Signal sends a signal to the allocation. 223 // 224 // Note: for cluster topologies where API consumers don't have network access to 225 // Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid 226 // long pauses on this API call. 227 func (a *Allocations) Signal(alloc *Allocation, q *QueryOptions, task, signal string) error { 228 req := AllocSignalRequest{ 229 Signal: signal, 230 Task: task, 231 } 232 233 var resp GenericResponse 234 _, err := a.client.putQuery("/v1/client/allocation/"+alloc.ID+"/signal", &req, &resp, q) 235 return err 236 } 237 238 // Services is used to return a list of service registrations associated to the 239 // specified allocID. 240 func (a *Allocations) Services(allocID string, q *QueryOptions) ([]*ServiceRegistration, *QueryMeta, error) { 241 var resp []*ServiceRegistration 242 qm, err := a.client.query("/v1/allocation/"+allocID+"/services", &resp, q) 243 return resp, qm, err 244 } 245 246 // Allocation is used for serialization of allocations. 
type Allocation struct {
	ID        string
	Namespace string
	EvalID    string
	Name      string
	NodeID    string
	NodeName  string
	JobID     string

	// Job may be nil; Stub only reads the job type and version when it is set.
	Job *Job

	// TaskGroup is the name GetTaskGroup matches against Job.TaskGroups.
	TaskGroup string

	Resources          *Resources
	TaskResources      map[string]*Resources
	AllocatedResources *AllocatedResources
	Services           map[string]string
	Metrics            *AllocationMetric

	// DesiredStatus is compared against the AllocDesiredStatus* constants by
	// ServerTerminalStatus.
	DesiredStatus      string
	DesiredDescription string
	DesiredTransition  DesiredTransition

	// ClientStatus is compared against the AllocClientStatus* constants by
	// ClientTerminalStatus.
	ClientStatus      string
	ClientDescription string
	TaskStates        map[string]*TaskState

	DeploymentID       string
	DeploymentStatus   *AllocDeploymentStatus
	FollowupEvalID     string
	PreviousAllocation string
	NextAllocation     string

	// RescheduleTracker holds the events RescheduleInfo counts; it may be nil.
	RescheduleTracker *RescheduleTracker

	NetworkStatus         *AllocNetworkStatus
	PreemptedAllocations  []string
	PreemptedByAllocation string

	// CreateIndex is the key AllocIndexSort orders list stubs by.
	CreateIndex      uint64
	ModifyIndex      uint64
	AllocModifyIndex uint64

	// NOTE(review): the unit of these timestamps is not visible in this
	// file — presumably Unix nanoseconds; confirm against the server.
	CreateTime int64
	ModifyTime int64
}

// AllocationMetric is used to deserialize allocation metrics.
285 type AllocationMetric struct { 286 NodesEvaluated int 287 NodesFiltered int 288 NodesInPool int 289 NodesAvailable map[string]int 290 ClassFiltered map[string]int 291 ConstraintFiltered map[string]int 292 NodesExhausted int 293 ClassExhausted map[string]int 294 DimensionExhausted map[string]int 295 QuotaExhausted []string 296 ResourcesExhausted map[string]*Resources 297 // Deprecated, replaced with ScoreMetaData 298 Scores map[string]float64 299 AllocationTime time.Duration 300 CoalescedFailures int 301 ScoreMetaData []*NodeScoreMeta 302 } 303 304 // NodeScoreMeta is used to serialize node scoring metadata 305 // displayed in the CLI during verbose mode 306 type NodeScoreMeta struct { 307 NodeID string 308 Scores map[string]float64 309 NormScore float64 310 } 311 312 // Stub returns a list stub for the allocation 313 func (a *Allocation) Stub() *AllocationListStub { 314 stub := &AllocationListStub{ 315 ID: a.ID, 316 EvalID: a.EvalID, 317 Name: a.Name, 318 Namespace: a.Namespace, 319 NodeID: a.NodeID, 320 NodeName: a.NodeName, 321 JobID: a.JobID, 322 TaskGroup: a.TaskGroup, 323 DesiredStatus: a.DesiredStatus, 324 DesiredDescription: a.DesiredDescription, 325 ClientStatus: a.ClientStatus, 326 ClientDescription: a.ClientDescription, 327 TaskStates: a.TaskStates, 328 DeploymentStatus: a.DeploymentStatus, 329 FollowupEvalID: a.FollowupEvalID, 330 NextAllocation: a.NextAllocation, 331 RescheduleTracker: a.RescheduleTracker, 332 PreemptedAllocations: a.PreemptedAllocations, 333 PreemptedByAllocation: a.PreemptedByAllocation, 334 CreateIndex: a.CreateIndex, 335 ModifyIndex: a.ModifyIndex, 336 CreateTime: a.CreateTime, 337 ModifyTime: a.ModifyTime, 338 } 339 340 if a.Job != nil { 341 stub.JobType = *a.Job.Type 342 stub.JobVersion = *a.Job.Version 343 } 344 345 return stub 346 } 347 348 // ServerTerminalStatus returns true if the desired state of the allocation is 349 // terminal. 
350 func (a *Allocation) ServerTerminalStatus() bool { 351 switch a.DesiredStatus { 352 case AllocDesiredStatusStop, AllocDesiredStatusEvict: 353 return true 354 default: 355 return false 356 } 357 } 358 359 // ClientTerminalStatus returns true if the client status is terminal and will 360 // therefore no longer transition. 361 func (a *Allocation) ClientTerminalStatus() bool { 362 switch a.ClientStatus { 363 case AllocClientStatusComplete, AllocClientStatusFailed, AllocClientStatusLost: 364 return true 365 default: 366 return false 367 } 368 } 369 370 // AllocationListStub is used to return a subset of an allocation 371 // during list operations. 372 type AllocationListStub struct { 373 ID string 374 EvalID string 375 Name string 376 Namespace string 377 NodeID string 378 NodeName string 379 JobID string 380 JobType string 381 JobVersion uint64 382 TaskGroup string 383 AllocatedResources *AllocatedResources `json:",omitempty"` 384 DesiredStatus string 385 DesiredDescription string 386 ClientStatus string 387 ClientDescription string 388 TaskStates map[string]*TaskState 389 DeploymentStatus *AllocDeploymentStatus 390 FollowupEvalID string 391 NextAllocation string 392 RescheduleTracker *RescheduleTracker 393 PreemptedAllocations []string 394 PreemptedByAllocation string 395 CreateIndex uint64 396 ModifyIndex uint64 397 CreateTime int64 398 ModifyTime int64 399 } 400 401 // AllocDeploymentStatus captures the status of the allocation as part of the 402 // deployment. This can include things like if the allocation has been marked as 403 // healthy. 404 type AllocDeploymentStatus struct { 405 Healthy *bool 406 Timestamp time.Time 407 Canary bool 408 ModifyIndex uint64 409 } 410 411 // AllocNetworkStatus captures the status of an allocation's network during runtime. 412 // Depending on the network mode, an allocation's address may need to be known to other 413 // systems in Nomad such as service registration. 
414 type AllocNetworkStatus struct { 415 InterfaceName string 416 Address string 417 DNS *DNSConfig 418 } 419 420 type AllocatedResources struct { 421 Tasks map[string]*AllocatedTaskResources 422 Shared AllocatedSharedResources 423 } 424 425 type AllocatedTaskResources struct { 426 Cpu AllocatedCpuResources 427 Memory AllocatedMemoryResources 428 Networks []*NetworkResource 429 Devices []*AllocatedDeviceResource 430 } 431 432 type AllocatedSharedResources struct { 433 DiskMB int64 434 Networks []*NetworkResource 435 Ports []PortMapping 436 } 437 438 type PortMapping struct { 439 Label string 440 Value int 441 To int 442 HostIP string 443 } 444 445 type AllocatedCpuResources struct { 446 CpuShares int64 447 } 448 449 type AllocatedMemoryResources struct { 450 MemoryMB int64 451 MemoryMaxMB int64 452 } 453 454 type AllocatedDeviceResource struct { 455 Vendor string 456 Type string 457 Name string 458 DeviceIDs []string 459 } 460 461 // AllocIndexSort reverse sorts allocs by CreateIndex. 462 type AllocIndexSort []*AllocationListStub 463 464 func (a AllocIndexSort) Len() int { 465 return len(a) 466 } 467 468 func (a AllocIndexSort) Less(i, j int) bool { 469 return a[i].CreateIndex > a[j].CreateIndex 470 } 471 472 func (a AllocIndexSort) Swap(i, j int) { 473 a[i], a[j] = a[j], a[i] 474 } 475 476 func (a Allocation) GetTaskGroup() *TaskGroup { 477 for _, tg := range a.Job.TaskGroups { 478 if *tg.Name == a.TaskGroup { 479 return tg 480 } 481 } 482 return nil 483 } 484 485 // RescheduleInfo is used to calculate remaining reschedule attempts 486 // according to the given time and the task groups reschedule policy 487 func (a Allocation) RescheduleInfo(t time.Time) (int, int) { 488 tg := a.GetTaskGroup() 489 if tg == nil || tg.ReschedulePolicy == nil { 490 return 0, 0 491 } 492 reschedulePolicy := tg.ReschedulePolicy 493 availableAttempts := *reschedulePolicy.Attempts 494 interval := *reschedulePolicy.Interval 495 attempted := 0 496 497 // Loop over reschedule tracker to 
find attempts within the restart policy's interval 498 if a.RescheduleTracker != nil && availableAttempts > 0 && interval > 0 { 499 for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- { 500 lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime 501 timeDiff := t.UTC().UnixNano() - lastAttempt 502 if timeDiff < interval.Nanoseconds() { 503 attempted += 1 504 } 505 } 506 } 507 return attempted, availableAttempts 508 } 509 510 type AllocationRestartRequest struct { 511 TaskName string 512 AllTasks bool 513 } 514 515 type AllocSignalRequest struct { 516 Task string 517 Signal string 518 } 519 520 // GenericResponse is used to respond to a request where no 521 // specific response information is needed. 522 type GenericResponse struct { 523 WriteMeta 524 } 525 526 // RescheduleTracker encapsulates previous reschedule events 527 type RescheduleTracker struct { 528 Events []*RescheduleEvent 529 } 530 531 // RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation 532 type RescheduleEvent struct { 533 // RescheduleTime is the timestamp of a reschedule attempt 534 RescheduleTime int64 535 536 // PrevAllocID is the ID of the previous allocation being restarted 537 PrevAllocID string 538 539 // PrevNodeID is the node ID of the previous allocation 540 PrevNodeID string 541 } 542 543 // DesiredTransition is used to mark an allocation as having a desired state 544 // transition. This information can be used by the scheduler to make the 545 // correct decision. 546 type DesiredTransition struct { 547 // Migrate is used to indicate that this allocation should be stopped and 548 // migrated to another node. 549 Migrate *bool 550 551 // Reschedule is used to indicate that this allocation is eligible to be 552 // rescheduled. 553 Reschedule *bool 554 } 555 556 // ShouldMigrate returns whether the transition object dictates a migration. 
557 func (d DesiredTransition) ShouldMigrate() bool { 558 return d.Migrate != nil && *d.Migrate 559 } 560 561 // ExecStreamingIOOperation represents a stream write operation: either appending data or close (exclusively) 562 type ExecStreamingIOOperation struct { 563 Data []byte `json:"data,omitempty"` 564 Close bool `json:"close,omitempty"` 565 } 566 567 // TerminalSize represents the size of the terminal 568 type TerminalSize struct { 569 Height int `json:"height,omitempty"` 570 Width int `json:"width,omitempty"` 571 } 572 573 var execStreamingInputHeartbeat = ExecStreamingInput{} 574 575 // ExecStreamingInput represents user input to be sent to nomad exec handler. 576 // 577 // At most one field should be set. 578 type ExecStreamingInput struct { 579 Stdin *ExecStreamingIOOperation `json:"stdin,omitempty"` 580 TTYSize *TerminalSize `json:"tty_size,omitempty"` 581 } 582 583 // ExecStreamingExitResult captures the exit code of just completed nomad exec command 584 type ExecStreamingExitResult struct { 585 ExitCode int `json:"exit_code"` 586 } 587 588 // ExecStreamingOutput represents an output streaming entity, e.g. stdout/stderr update or termination 589 // 590 // At most one of these fields should be set: `Stdout`, `Stderr`, or `Result`. 591 // If `Exited` is true, then `Result` is non-nil, and other fields are nil. 592 type ExecStreamingOutput struct { 593 Stdout *ExecStreamingIOOperation `json:"stdout,omitempty"` 594 Stderr *ExecStreamingIOOperation `json:"stderr,omitempty"` 595 596 Exited bool `json:"exited,omitempty"` 597 Result *ExecStreamingExitResult `json:"result,omitempty"` 598 } 599 600 func AllocSuffix(name string) string { 601 idx := strings.LastIndex(name, "[") 602 if idx == -1 { 603 return "" 604 } 605 suffix := name[idx:] 606 return suffix 607 }