github.com/zhizhiboom/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/client/allocrunner/taskrunner/task_runner.go (about) 1 package taskrunner 2 3 import ( 4 "bytes" 5 "crypto/md5" 6 "encoding/hex" 7 "fmt" 8 "io" 9 "io/ioutil" 10 "log" 11 "os" 12 "path/filepath" 13 "strings" 14 "sync" 15 "time" 16 17 metrics "github.com/armon/go-metrics" 18 "github.com/boltdb/bolt" 19 "github.com/golang/snappy" 20 "github.com/hashicorp/consul-template/signals" 21 "github.com/hashicorp/go-multierror" 22 version "github.com/hashicorp/go-version" 23 "github.com/hashicorp/nomad/client/allocdir" 24 "github.com/hashicorp/nomad/client/allocrunner/getter" 25 "github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts" 26 "github.com/hashicorp/nomad/client/config" 27 consulApi "github.com/hashicorp/nomad/client/consul" 28 "github.com/hashicorp/nomad/client/driver" 29 "github.com/hashicorp/nomad/client/state" 30 "github.com/hashicorp/nomad/client/vaultclient" 31 "github.com/hashicorp/nomad/command/agent/consul" 32 "github.com/hashicorp/nomad/nomad/structs" 33 "github.com/ugorji/go/codec" 34 35 "github.com/hashicorp/nomad/client/driver/env" 36 dstructs "github.com/hashicorp/nomad/client/driver/structs" 37 cstructs "github.com/hashicorp/nomad/client/structs" 38 ) 39 40 const ( 41 // killBackoffBaseline is the baseline time for exponential backoff while 42 // killing a task. 43 killBackoffBaseline = 5 * time.Second 44 45 // killBackoffLimit is the limit of the exponential backoff for killing 46 // the task. 47 killBackoffLimit = 2 * time.Minute 48 49 // killFailureLimit is how many times we will attempt to kill a task before 50 // giving up and potentially leaking resources. 51 killFailureLimit = 5 52 53 // vaultBackoffBaseline is the baseline time for exponential backoff when 54 // attempting to retrieve a Vault token 55 vaultBackoffBaseline = 5 * time.Second 56 57 // vaultBackoffLimit is the limit of the exponential backoff when attempting 58 // to retrieve a Vault token 59 vaultBackoffLimit = 3 * time.Minute 60 61 // vaultTokenFile is the name of the file holding the Vault token inside the 62 // task's secret directory 63 vaultTokenFile = "vault_token" 64 ) 65 66 var ( 67 // taskRunnerStateAllKey holds all the task runners state. At the moment 68 // there is no need to split it 69 taskRunnerStateAllKey = []byte("simple-all") 70 ) 71 72 // taskRestartEvent wraps a TaskEvent with additional metadata to control 73 // restart behavior. 74 type taskRestartEvent struct { 75 // taskEvent to report 76 taskEvent *structs.TaskEvent 77 78 // if false, don't count against restart count 79 failure bool 80 } 81 82 func newTaskRestartEvent(reason string, failure bool) *taskRestartEvent { 83 return &taskRestartEvent{ 84 taskEvent: structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason), 85 failure: failure, 86 } 87 } 88 89 // TaskRunner is used to wrap a task within an allocation and provide the execution context. 
90 type TaskRunner struct { 91 stateDB *bolt.DB 92 config *config.Config 93 updater TaskStateUpdater 94 logger *log.Logger 95 restartTracker *restarts.RestartTracker 96 consul consulApi.ConsulServiceAPI 97 98 // running marks whether the task is running 99 running bool 100 runningLock sync.Mutex 101 102 resourceUsage *cstructs.TaskResourceUsage 103 resourceUsageLock sync.RWMutex 104 105 alloc *structs.Allocation 106 task *structs.Task 107 taskDir *allocdir.TaskDir 108 109 // envBuilder is used to build the task's environment 110 envBuilder *env.Builder 111 112 // driverNet is the network information returned by the driver 113 driverNet *cstructs.DriverNetwork 114 driverNetLock sync.Mutex 115 116 // updateCh is used to receive updated versions of the allocation 117 updateCh chan *structs.Allocation 118 119 handle driver.DriverHandle 120 handleLock sync.Mutex 121 122 // artifactsDownloaded tracks whether the tasks artifacts have been 123 // downloaded 124 // 125 // Must acquire persistLock when accessing 126 artifactsDownloaded bool 127 128 // taskDirBuilt tracks whether the task has built its directory. 129 // 130 // Must acquire persistLock when accessing 131 taskDirBuilt bool 132 133 // createdResources are all the resources created by the task driver 134 // across all attempts to start the task. 135 // Simple gets and sets should use {get,set}CreatedResources 136 createdResources *driver.CreatedResources 137 createdResourcesLock sync.Mutex 138 139 // payloadRendered tracks whether the payload has been rendered to disk 140 payloadRendered bool 141 142 // vaultFuture is the means to wait for and get a Vault token 143 vaultFuture *tokenFuture 144 145 // recoveredVaultToken is the token that was recovered through a restore 146 recoveredVaultToken string 147 148 // vaultClient is used to retrieve and renew any needed Vault token 149 vaultClient vaultclient.VaultClient 150 151 // templateManager is used to manage any consul-templates this task may have 152 templateManager *TaskTemplateManager 153 154 // startCh is used to trigger the start of the task 155 startCh chan struct{} 156 157 // unblockCh is used to unblock the starting of the task 158 unblockCh chan struct{} 159 unblocked bool 160 unblockLock sync.Mutex 161 162 // restartCh is used to restart a task 163 restartCh chan *taskRestartEvent 164 165 // signalCh is used to send a signal to a task 166 signalCh chan SignalEvent 167 168 destroy bool 169 destroyCh chan struct{} 170 destroyLock sync.Mutex 171 destroyEvent *structs.TaskEvent 172 173 // waitCh closing marks the run loop as having exited 174 waitCh chan struct{} 175 176 // persistLock must be acquired when accessing fields stored by 177 // SaveState. SaveState is called asynchronously to TaskRunner.Run by 178 // AllocRunner, so all state fields must be synchronized using this 179 // lock. 180 persistLock sync.Mutex 181 182 // persistedHash is the hash of the last persisted snapshot. It is used to 183 // detect if a new snapshot has to be written to disk. 184 persistedHash []byte 185 186 // baseLabels are used when emitting tagged metrics. All task runner metrics 187 // will have these tags, and optionally more. 
188 baseLabels []metrics.Label 189 } 190 191 // taskRunnerState is used to snapshot the state of the task runner 192 type taskRunnerState struct { 193 Version string 194 HandleID string 195 ArtifactDownloaded bool 196 TaskDirBuilt bool 197 PayloadRendered bool 198 CreatedResources *driver.CreatedResources 199 DriverNetwork *cstructs.DriverNetwork 200 } 201 202 func (s *taskRunnerState) Hash() []byte { 203 h := md5.New() 204 205 io.WriteString(h, s.Version) 206 io.WriteString(h, s.HandleID) 207 io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded)) 208 io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt)) 209 io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered)) 210 h.Write(s.CreatedResources.Hash()) 211 h.Write(s.DriverNetwork.Hash()) 212 213 return h.Sum(nil) 214 } 215 216 // TaskStateUpdater is used to signal that tasks state has changed. If lazySync 217 // is set the event won't be immediately pushed to the server. 218 type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent, lazySync bool) 219 220 // SignalEvent is a tuple of the signal and the event generating it 221 type SignalEvent struct { 222 // s is the signal to be sent 223 s os.Signal 224 225 // e is the task event generating the signal 226 e *structs.TaskEvent 227 228 // result should be used to send back the result of the signal 229 result chan<- error 230 } 231 232 // NewTaskRunner is used to create a new task context 233 func NewTaskRunner(logger *log.Logger, config *config.Config, 234 stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir, 235 alloc *structs.Allocation, task *structs.Task, 236 vaultClient vaultclient.VaultClient, consulClient consulApi.ConsulServiceAPI) *TaskRunner { 237 238 // Merge in the task resources 239 task.Resources = alloc.TaskResources[task.Name] 240 241 // Build the restart tracker. 
242 tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) 243 if tg == nil { 244 logger.Printf("[ERR] client: alloc %q for missing task group %q", alloc.ID, alloc.TaskGroup) 245 return nil 246 } 247 restartTracker := restarts.NewRestartTracker(tg.RestartPolicy, alloc.Job.Type) 248 249 // Initialize the environment builder 250 envBuilder := env.NewBuilder(config.Node, alloc, task, config.Region) 251 252 tc := &TaskRunner{ 253 config: config, 254 stateDB: stateDB, 255 updater: updater, 256 logger: logger, 257 restartTracker: restartTracker, 258 alloc: alloc, 259 task: task, 260 taskDir: taskDir, 261 envBuilder: envBuilder, 262 createdResources: driver.NewCreatedResources(), 263 consul: consulClient, 264 vaultClient: vaultClient, 265 vaultFuture: NewTokenFuture().Set(""), 266 updateCh: make(chan *structs.Allocation, 64), 267 destroyCh: make(chan struct{}), 268 waitCh: make(chan struct{}), 269 startCh: make(chan struct{}, 1), 270 unblockCh: make(chan struct{}), 271 restartCh: make(chan *taskRestartEvent), 272 signalCh: make(chan SignalEvent), 273 } 274 275 tc.baseLabels = []metrics.Label{ 276 { 277 Name: "job", 278 Value: tc.alloc.Job.Name, 279 }, 280 { 281 Name: "task_group", 282 Value: tc.alloc.TaskGroup, 283 }, 284 { 285 Name: "alloc_id", 286 Value: tc.alloc.ID, 287 }, 288 { 289 Name: "task", 290 Value: tc.task.Name, 291 }, 292 } 293 294 if tc.alloc.Job.ParentID != "" { 295 tc.baseLabels = append(tc.baseLabels, metrics.Label{ 296 Name: "parent_id", 297 Value: tc.alloc.Job.ParentID, 298 }) 299 if strings.Contains(tc.alloc.Job.Name, "/dispatch-") { 300 tc.baseLabels = append(tc.baseLabels, metrics.Label{ 301 Name: "dispatch_id", 302 Value: strings.Split(tc.alloc.Job.Name, "/dispatch-")[1], 303 }) 304 } 305 if strings.Contains(tc.alloc.Job.Name, "/periodic-") { 306 tc.baseLabels = append(tc.baseLabels, metrics.Label{ 307 Name: "periodic_id", 308 Value: strings.Split(tc.alloc.Job.Name, "/periodic-")[1], 309 }) 310 } 311 return tc 312 } 313 314 return tc 315 } 316 317 // MarkReceived marks the task as received. 318 func (r *TaskRunner) MarkReceived() { 319 // We lazy sync this since there will be a follow up message almost 320 // immediately. 321 r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived), true) 322 } 323 324 // WaitCh returns a channel to wait for termination 325 func (r *TaskRunner) WaitCh() <-chan struct{} { 326 return r.waitCh 327 } 328 329 // getHandle returns the task's handle or nil 330 func (r *TaskRunner) getHandle() driver.DriverHandle { 331 r.handleLock.Lock() 332 h := r.handle 333 r.handleLock.Unlock() 334 return h 335 } 336 337 // pre060StateFilePath returns the path to our state file that would have been 338 // written pre v0.6.0 339 // COMPAT: Remove in 0.7.0 340 func (r *TaskRunner) pre060StateFilePath() string { 341 // Get the MD5 of the task name 342 hashVal := md5.Sum([]byte(r.task.Name)) 343 hashHex := hex.EncodeToString(hashVal[:]) 344 dirName := fmt.Sprintf("task-%s", hashHex) 345 346 // Generate the path 347 return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json") 348 } 349 350 // RestoreState is used to restore our state. If a non-empty string is returned 351 // the task is restarted with the string as the reason. This is useful for 352 // backwards incompatible upgrades that need to restart tasks with a new 353 // executor. 
354 func (r *TaskRunner) RestoreState() (string, error) { 355 var snap taskRunnerState 356 err := r.stateDB.View(func(tx *bolt.Tx) error { 357 bkt, err := state.GetTaskBucket(tx, r.alloc.ID, r.task.Name) 358 if err != nil { 359 return fmt.Errorf("failed to get task bucket: %v", err) 360 } 361 362 if err := state.GetObject(bkt, taskRunnerStateAllKey, &snap); err != nil { 363 return fmt.Errorf("failed to read task runner state: %v", err) 364 } 365 return nil 366 }) 367 if err != nil { 368 return "", err 369 } 370 371 // Restore fields from the snapshot 372 r.artifactsDownloaded = snap.ArtifactDownloaded 373 r.taskDirBuilt = snap.TaskDirBuilt 374 r.payloadRendered = snap.PayloadRendered 375 r.setCreatedResources(snap.CreatedResources) 376 r.driverNet = snap.DriverNetwork 377 378 if r.task.Vault != nil { 379 // Read the token from the secret directory 380 tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile) 381 data, err := ioutil.ReadFile(tokenPath) 382 if err != nil { 383 if !os.IsNotExist(err) { 384 return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err) 385 } 386 387 // Token file doesn't exist 388 } else { 389 // Store the recovered token 390 r.recoveredVaultToken = string(data) 391 } 392 } 393 394 // Restore the driver 395 restartReason := "" 396 if snap.HandleID != "" { 397 d, err := r.createDriver() 398 if err != nil { 399 return "", err 400 } 401 402 // Add the restored network driver to the environment 403 r.envBuilder.SetDriverNetwork(r.driverNet) 404 405 // Open a connection to the driver handle 406 ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build()) 407 handle, err := d.Open(ctx, snap.HandleID) 408 409 // In the case it fails, we relaunch the task in the Run() method. 410 if err != nil { 411 r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v", 412 r.task.Name, r.alloc.ID, err) 413 return "", nil 414 } 415 416 if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) { 417 restartReason = pre06ScriptCheckReason 418 } 419 420 if err := r.registerServices(d, handle, r.driverNet); err != nil { 421 // Don't hard fail here as there's a chance this task 422 // registered with Consul properly when it initial 423 // started. 424 r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v", 425 r.task.Name, r.alloc.ID, err) 426 } 427 428 r.handleLock.Lock() 429 r.handle = handle 430 r.handleLock.Unlock() 431 432 r.runningLock.Lock() 433 r.running = true 434 r.runningLock.Unlock() 435 } 436 return restartReason, nil 437 } 438 439 // ver06 is used for checking for pre-0.6 script checks 440 var ver06 = version.Must(version.NewVersion("0.6.0dev")) 441 442 // pre06ScriptCheckReason is the restart reason given when a pre-0.6 script 443 // check is found on an exec/java task. 444 const pre06ScriptCheckReason = "upgrading pre-0.6 script checks" 445 446 // pre06ScriptCheck returns true if version is prior to 0.6.0dev, has a script 447 // check, and uses exec or java drivers. 
448 func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool { 449 if driver != "exec" && driver != "java" && driver != "mock_driver" { 450 // Only exec and java are affected 451 return false 452 } 453 v, err := version.NewVersion(ver) 454 if err != nil { 455 // Treat it as old 456 return true 457 } 458 if !v.LessThan(ver06) { 459 // >= 0.6.0dev 460 return false 461 } 462 for _, service := range services { 463 for _, check := range service.Checks { 464 if check.Type == "script" { 465 return true 466 } 467 } 468 } 469 return false 470 } 471 472 // SaveState is used to snapshot our state 473 func (r *TaskRunner) SaveState() error { 474 r.destroyLock.Lock() 475 defer r.destroyLock.Unlock() 476 if r.destroy { 477 // Don't save state if already destroyed 478 return nil 479 } 480 481 r.persistLock.Lock() 482 defer r.persistLock.Unlock() 483 snap := taskRunnerState{ 484 Version: r.config.Version.VersionNumber(), 485 ArtifactDownloaded: r.artifactsDownloaded, 486 TaskDirBuilt: r.taskDirBuilt, 487 PayloadRendered: r.payloadRendered, 488 CreatedResources: r.getCreatedResources(), 489 } 490 491 r.handleLock.Lock() 492 if r.handle != nil { 493 snap.HandleID = r.handle.ID() 494 } 495 r.handleLock.Unlock() 496 497 r.driverNetLock.Lock() 498 snap.DriverNetwork = r.driverNet.Copy() 499 r.driverNetLock.Unlock() 500 501 // If nothing has changed avoid the write 502 h := snap.Hash() 503 if bytes.Equal(h, r.persistedHash) { 504 return nil 505 } 506 507 // Serialize the object 508 var buf bytes.Buffer 509 if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil { 510 return fmt.Errorf("failed to serialize snapshot: %v", err) 511 } 512 513 // Start the transaction. 514 return r.stateDB.Batch(func(tx *bolt.Tx) error { 515 // Grab the task bucket 516 taskBkt, err := state.GetTaskBucket(tx, r.alloc.ID, r.task.Name) 517 if err != nil { 518 return fmt.Errorf("failed to retrieve allocation bucket: %v", err) 519 } 520 521 if err := state.PutData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil { 522 return fmt.Errorf("failed to write task_runner state: %v", err) 523 } 524 525 // Store the hash that was persisted 526 tx.OnCommit(func() { 527 r.persistedHash = h 528 }) 529 530 return nil 531 }) 532 } 533 534 // DestroyState is used to cleanup after ourselves 535 func (r *TaskRunner) DestroyState() error { 536 r.persistLock.Lock() 537 defer r.persistLock.Unlock() 538 539 return r.stateDB.Update(func(tx *bolt.Tx) error { 540 if err := state.DeleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil { 541 return fmt.Errorf("failed to delete task bucket: %v", err) 542 } 543 return nil 544 }) 545 } 546 547 // setState is used to update the state of the task runner 548 func (r *TaskRunner) setState(state string, event *structs.TaskEvent, lazySync bool) { 549 event.PopulateEventDisplayMessage() 550 551 // Persist our state to disk. 552 if err := r.SaveState(); err != nil { 553 r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err) 554 } 555 556 // Indicate the task has been updated. 557 r.updater(r.task.Name, state, event, lazySync) 558 } 559 560 // createDriver makes a driver for the task 561 func (r *TaskRunner) createDriver() (driver.Driver, error) { 562 // Create a task-specific event emitter callback to expose minimal 563 // state to drivers 564 eventEmitter := func(m string, args ...interface{}) { 565 msg := fmt.Sprintf(m, args...) 
566 r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg) 567 r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg), false) 568 } 569 570 driverCtx := driver.NewDriverContext(r.alloc.Job.Name, r.alloc.TaskGroup, r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, eventEmitter) 571 d, err := driver.NewDriver(r.task.Driver, driverCtx) 572 if err != nil { 573 return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v", 574 r.task.Driver, r.alloc.ID, err) 575 } 576 577 return d, err 578 } 579 580 // Run is a long running routine used to manage the task 581 func (r *TaskRunner) Run() { 582 defer close(r.waitCh) 583 r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')", 584 r.task.Name, r.alloc.ID) 585 586 if err := r.validateTask(); err != nil { 587 r.setState( 588 structs.TaskStateDead, 589 structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask(), 590 false) 591 return 592 } 593 594 // Create a temporary driver so that we can determine the FSIsolation 595 // required. run->startTask will create a new driver after environment 596 // has been setup (env vars, templates, artifacts, secrets, etc). 597 tmpDrv, err := r.createDriver() 598 if err != nil { 599 e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err) 600 r.setState( 601 structs.TaskStateDead, 602 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(), 603 false) 604 return 605 } 606 607 // Build base task directory structure regardless of FS isolation abilities. 608 // This needs to happen before we start the Vault manager and call prestart 609 // as both those can write to the task directories 610 if err := r.buildTaskDir(tmpDrv.FSIsolation()); err != nil { 611 e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err) 612 r.setState( 613 structs.TaskStateDead, 614 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(), 615 false) 616 return 617 } 618 619 // If there is no Vault policy leave the static future created in 620 // NewTaskRunner 621 if r.task.Vault != nil { 622 // Start the go-routine to get a Vault token 623 r.vaultFuture.Clear() 624 go r.vaultManager(r.recoveredVaultToken) 625 } 626 627 // Start the run loop 628 r.run() 629 630 // Do any cleanup necessary 631 r.postrun() 632 633 return 634 } 635 636 // validateTask validates the fields of the task and returns an error if the 637 // task is invalid. 638 func (r *TaskRunner) validateTask() error { 639 var mErr multierror.Error 640 641 // Validate the user. 642 unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist) 643 checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers) 644 if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch { 645 if _, unallowed := unallowedUsers[r.task.User]; unallowed { 646 mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User)) 647 } 648 } 649 650 // Validate the artifacts 651 for i, artifact := range r.task.Artifacts { 652 // Verify the artifact doesn't escape the task directory. 653 if err := artifact.Validate(); err != nil { 654 // If this error occurs there is potentially a server bug or 655 // malicious, server spoofing. 
656 r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v", 657 r.alloc.ID, r.task.Name, artifact, i, err) 658 mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err)) 659 } 660 } 661 662 // Validate the Service names 663 taskEnv := r.envBuilder.Build() 664 for i, service := range r.task.Services { 665 name := taskEnv.ReplaceEnv(service.Name) 666 if err := service.ValidateName(name); err != nil { 667 mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err)) 668 } 669 } 670 671 if len(mErr.Errors) == 1 { 672 return mErr.Errors[0] 673 } 674 return mErr.ErrorOrNil() 675 } 676 677 // tokenFuture stores the Vault token and allows consumers to block till a valid 678 // token exists 679 type tokenFuture struct { 680 waiting []chan struct{} 681 token string 682 set bool 683 m sync.Mutex 684 } 685 686 // NewTokenFuture returns a new token future without any token set 687 func NewTokenFuture() *tokenFuture { 688 return &tokenFuture{} 689 } 690 691 // Wait returns a channel that can be waited on. When this channel unblocks, a 692 // valid token will be available via the Get method 693 func (f *tokenFuture) Wait() <-chan struct{} { 694 f.m.Lock() 695 defer f.m.Unlock() 696 697 c := make(chan struct{}) 698 if f.set { 699 close(c) 700 return c 701 } 702 703 f.waiting = append(f.waiting, c) 704 return c 705 } 706 707 // Set sets the token value and unblocks any caller of Wait 708 func (f *tokenFuture) Set(token string) *tokenFuture { 709 f.m.Lock() 710 defer f.m.Unlock() 711 712 f.set = true 713 f.token = token 714 for _, w := range f.waiting { 715 close(w) 716 } 717 f.waiting = nil 718 return f 719 } 720 721 // Clear clears the set vault token. 722 func (f *tokenFuture) Clear() *tokenFuture { 723 f.m.Lock() 724 defer f.m.Unlock() 725 726 f.token = "" 727 f.set = false 728 return f 729 } 730 731 // Get returns the set Vault token 732 func (f *tokenFuture) Get() string { 733 f.m.Lock() 734 defer f.m.Unlock() 735 return f.token 736 } 737 738 // vaultManager should be called in a go-routine and manages the derivation, 739 // renewal and handling of errors with the Vault token. The optional parameter 740 // allows setting the initial Vault token. This is useful when the Vault token 741 // is recovered off disk. 742 func (r *TaskRunner) vaultManager(token string) { 743 // Helper for stopping token renewal 744 stopRenewal := func() { 745 if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil { 746 r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err) 747 } 748 } 749 750 // updatedToken lets us store state between loops. 
If true, a new token 751 // has been retrieved and we need to apply the Vault change mode 752 var updatedToken bool 753 754 OUTER: 755 for { 756 // Check if we should exit 757 select { 758 case <-r.waitCh: 759 stopRenewal() 760 return 761 default: 762 } 763 764 // Clear the token 765 r.vaultFuture.Clear() 766 767 // Check if there already is a token which can be the case for 768 // restoring the TaskRunner 769 if token == "" { 770 // Get a token 771 var exit bool 772 token, exit = r.deriveVaultToken() 773 if exit { 774 // Exit the manager 775 return 776 } 777 778 // Write the token to disk 779 if err := r.writeToken(token); err != nil { 780 e := fmt.Errorf("failed to write Vault token to disk") 781 r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err) 782 r.Kill("vault", e.Error(), true) 783 return 784 } 785 } 786 787 // Start the renewal process 788 renewCh, err := r.vaultClient.RenewToken(token, 30) 789 790 // An error returned means the token is not being renewed 791 if err != nil { 792 r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err) 793 token = "" 794 goto OUTER 795 } 796 797 // The Vault token is valid now, so set it 798 r.vaultFuture.Set(token) 799 800 if updatedToken { 801 switch r.task.Vault.ChangeMode { 802 case structs.VaultChangeModeSignal: 803 s, err := signals.Parse(r.task.Vault.ChangeSignal) 804 if err != nil { 805 e := fmt.Errorf("failed to parse signal: %v", err) 806 r.logger.Printf("[ERR] client: %v", err) 807 r.Kill("vault", e.Error(), true) 808 return 809 } 810 811 if err := r.Signal("vault", "new Vault token acquired", s); err != nil { 812 r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err) 813 r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true) 814 return 815 } 816 case structs.VaultChangeModeRestart: 817 const noFailure = false 818 r.Restart("vault", "new Vault token acquired", noFailure) 819 case structs.VaultChangeModeNoop: 820 fallthrough 821 default: 822 r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode) 823 } 824 825 // We have handled it 826 updatedToken = false 827 828 // Call the handler 829 r.updatedTokenHandler() 830 } 831 832 // Start watching for renewal errors 833 select { 834 case err := <-renewCh: 835 // Clear the token 836 token = "" 837 r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err) 838 stopRenewal() 839 840 // Check if we have to do anything 841 if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop { 842 updatedToken = true 843 } 844 case <-r.waitCh: 845 stopRenewal() 846 return 847 } 848 } 849 } 850 851 // deriveVaultToken derives the Vault token using exponential backoffs. It 852 // returns the Vault token and whether the manager should exit. 
853 func (r *TaskRunner) deriveVaultToken() (token string, exit bool) { 854 attempts := 0 855 for { 856 tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name}) 857 if err == nil { 858 return tokens[r.task.Name], false 859 } 860 861 // Check if this is a server side error 862 if structs.IsServerSide(err) { 863 r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v", 864 r.task.Name, r.alloc.ID, err) 865 r.Kill("vault", fmt.Sprintf("server error deriving vault token: %v", err), true) 866 return "", true 867 } 868 // Check if we can't recover from the error 869 if !structs.IsRecoverable(err) { 870 r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v", 871 r.task.Name, r.alloc.ID, err) 872 r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true) 873 return "", true 874 } 875 876 // Handle the retry case 877 backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline 878 if backoff > vaultBackoffLimit { 879 backoff = vaultBackoffLimit 880 } 881 r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v", 882 r.task.Name, r.alloc.ID, err, backoff) 883 884 attempts++ 885 886 // Wait till retrying 887 select { 888 case <-r.waitCh: 889 return "", true 890 case <-time.After(backoff): 891 } 892 } 893 } 894 895 // writeToken writes the given token to disk 896 func (r *TaskRunner) writeToken(token string) error { 897 tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile) 898 if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil { 899 return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err) 900 } 901 902 return nil 903 } 904 905 // updatedTokenHandler is called when a new Vault token is retrieved. Things 906 // that rely on the token should be updated here. 907 func (r *TaskRunner) updatedTokenHandler() { 908 909 // Update the tasks environment 910 r.envBuilder.SetVaultToken(r.vaultFuture.Get(), r.task.Vault.Env) 911 912 if r.templateManager != nil { 913 r.templateManager.Stop() 914 915 // Create a new templateManager 916 var err error 917 r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{ 918 Hooks: r, 919 Templates: r.task.Templates, 920 ClientConfig: r.config, 921 VaultToken: r.vaultFuture.Get(), 922 TaskDir: r.taskDir.Dir, 923 EnvBuilder: r.envBuilder, 924 MaxTemplateEventRate: DefaultMaxTemplateEventRate, 925 }) 926 927 if err != nil { 928 err := fmt.Errorf("failed to build task's template manager: %v", err) 929 r.setState(structs.TaskStateDead, 930 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), 931 false) 932 r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err) 933 r.Kill("vault", err.Error(), true) 934 return 935 } 936 } 937 } 938 939 // prestart handles life-cycle tasks that occur before the task has started. 940 // Since it's run asynchronously with the main Run() loop the alloc & task are 941 // passed in to avoid racing with updates. 
942 func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) { 943 if task.Vault != nil { 944 // Wait for the token 945 r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID) 946 tokenCh := r.vaultFuture.Wait() 947 select { 948 case <-tokenCh: 949 case <-r.waitCh: 950 resultCh <- false 951 return 952 } 953 r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", task.Name, alloc.ID) 954 r.envBuilder.SetVaultToken(r.vaultFuture.Get(), task.Vault.Env) 955 } 956 957 // If the job is a dispatch job and there is a payload write it to disk 958 requirePayload := len(alloc.Job.Payload) != 0 && 959 (r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "") 960 if !r.payloadRendered && requirePayload { 961 renderTo := filepath.Join(r.taskDir.LocalDir, task.DispatchPayload.File) 962 decoded, err := snappy.Decode(nil, alloc.Job.Payload) 963 if err != nil { 964 r.setState( 965 structs.TaskStateDead, 966 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), 967 false) 968 resultCh <- false 969 return 970 } 971 972 if err := os.MkdirAll(filepath.Dir(renderTo), 07777); err != nil { 973 r.setState( 974 structs.TaskStateDead, 975 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), 976 false) 977 resultCh <- false 978 return 979 } 980 981 if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil { 982 r.setState( 983 structs.TaskStateDead, 984 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), 985 false) 986 resultCh <- false 987 return 988 } 989 990 r.payloadRendered = true 991 } 992 993 for { 994 r.persistLock.Lock() 995 downloaded := r.artifactsDownloaded 996 r.persistLock.Unlock() 997 998 // Download the task's artifacts 999 if !downloaded && len(task.Artifacts) > 0 { 1000 r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts), false) 1001 taskEnv := r.envBuilder.Build() 1002 for _, artifact := range task.Artifacts { 1003 if err := getter.GetArtifact(taskEnv, artifact, r.taskDir.Dir); err != nil { 1004 wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err) 1005 r.logger.Printf("[DEBUG] client: %v", wrapped) 1006 r.setState(structs.TaskStatePending, 1007 structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped), false) 1008 r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err)) 1009 goto RESTART 1010 } 1011 } 1012 1013 r.persistLock.Lock() 1014 r.artifactsDownloaded = true 1015 r.persistLock.Unlock() 1016 } 1017 1018 // We don't have to wait for any template 1019 if len(task.Templates) == 0 { 1020 // Send the start signal 1021 select { 1022 case r.startCh <- struct{}{}: 1023 default: 1024 } 1025 1026 resultCh <- true 1027 return 1028 } 1029 1030 // Build the template manager 1031 if r.templateManager == nil { 1032 var err error 1033 r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{ 1034 Hooks: r, 1035 Templates: r.task.Templates, 1036 ClientConfig: r.config, 1037 VaultToken: r.vaultFuture.Get(), 1038 TaskDir: r.taskDir.Dir, 1039 EnvBuilder: r.envBuilder, 1040 MaxTemplateEventRate: DefaultMaxTemplateEventRate, 1041 }) 1042 if err != nil { 1043 err := fmt.Errorf("failed to build task's template manager: %v", err) 1044 r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), false) 1045 
r.logger.Printf("[ERR] client: alloc %q, task %q %v", alloc.ID, task.Name, err) 1046 resultCh <- false 1047 return 1048 } 1049 } 1050 1051 // Block for consul-template 1052 // TODO Hooks should register themselves as blocking and then we can 1053 // periodically enumerate what we are still blocked on 1054 select { 1055 case <-r.unblockCh: 1056 // Send the start signal 1057 select { 1058 case r.startCh <- struct{}{}: 1059 default: 1060 } 1061 1062 resultCh <- true 1063 return 1064 case <-r.waitCh: 1065 // The run loop has exited so exit too 1066 resultCh <- false 1067 return 1068 } 1069 1070 RESTART: 1071 restart := r.shouldRestart() 1072 if !restart { 1073 resultCh <- false 1074 return 1075 } 1076 } 1077 } 1078 1079 // postrun is used to do any cleanup that is necessary after exiting the runloop 1080 func (r *TaskRunner) postrun() { 1081 // Stop the template manager 1082 if r.templateManager != nil { 1083 r.templateManager.Stop() 1084 } 1085 } 1086 1087 // run is the main run loop that handles starting the application, destroying 1088 // it, restarts and signals. 1089 func (r *TaskRunner) run() { 1090 // Predeclare things so we can jump to the RESTART 1091 var stopCollection chan struct{} 1092 var handleWaitCh chan *dstructs.WaitResult 1093 1094 // If we already have a handle, populate the stopCollection and handleWaitCh 1095 // to fix the invariant that it exists. 1096 handleEmpty := r.getHandle() == nil 1097 1098 if !handleEmpty { 1099 stopCollection = make(chan struct{}) 1100 go r.collectResourceUsageStats(stopCollection) 1101 handleWaitCh = r.handle.WaitCh() 1102 } 1103 1104 for { 1105 // Do the prestart activities 1106 prestartResultCh := make(chan bool, 1) 1107 go r.prestart(r.alloc, r.task, prestartResultCh) 1108 1109 WAIT: 1110 for { 1111 select { 1112 case success := <-prestartResultCh: 1113 if !success { 1114 r.cleanup() 1115 r.setState(structs.TaskStateDead, nil, false) 1116 return 1117 } 1118 case <-r.startCh: 1119 // Start the task if not yet started or it is being forced. This logic 1120 // is necessary because in the case of a restore the handle already 1121 // exists. 1122 handleEmpty := r.getHandle() == nil 1123 if handleEmpty { 1124 startErr := r.startTask() 1125 r.restartTracker.SetStartError(startErr) 1126 if startErr != nil { 1127 r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr), true) 1128 goto RESTART 1129 } 1130 1131 // Mark the task as started 1132 r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted), false) 1133 r.runningLock.Lock() 1134 r.running = true 1135 r.runningLock.Unlock() 1136 1137 if stopCollection == nil { 1138 stopCollection = make(chan struct{}) 1139 go r.collectResourceUsageStats(stopCollection) 1140 } 1141 1142 handleWaitCh = r.handle.WaitCh() 1143 } 1144 1145 case waitRes := <-handleWaitCh: 1146 if waitRes == nil { 1147 panic("nil wait") 1148 } 1149 1150 r.runningLock.Lock() 1151 r.running = false 1152 r.runningLock.Unlock() 1153 1154 // Stop collection of the task's resource usage 1155 close(stopCollection) 1156 1157 // Log whether the task was successful or not. 
1158 r.restartTracker.SetWaitResult(waitRes) 1159 r.setState("", r.waitErrorToEvent(waitRes), true) 1160 if !waitRes.Successful() { 1161 r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes) 1162 } else { 1163 r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID) 1164 } 1165 1166 break WAIT 1167 case update := <-r.updateCh: 1168 if err := r.handleUpdate(update); err != nil { 1169 r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err) 1170 } 1171 1172 case se := <-r.signalCh: 1173 r.runningLock.Lock() 1174 running := r.running 1175 r.runningLock.Unlock() 1176 common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID) 1177 if !running { 1178 // Send no error 1179 r.logger.Printf("[DEBUG] client: skipping %s", common) 1180 se.result <- nil 1181 continue 1182 } 1183 1184 r.logger.Printf("[DEBUG] client: sending %s", common) 1185 r.setState(structs.TaskStateRunning, se.e, false) 1186 1187 res := r.handle.Signal(se.s) 1188 se.result <- res 1189 1190 case restartEvent := <-r.restartCh: 1191 r.runningLock.Lock() 1192 running := r.running 1193 r.runningLock.Unlock() 1194 common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID) 1195 if !running { 1196 r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common) 1197 continue 1198 } 1199 1200 r.logger.Printf("[DEBUG] client: restarting %s: %v", common, restartEvent.taskEvent.RestartReason) 1201 r.setState(structs.TaskStateRunning, restartEvent.taskEvent, false) 1202 r.killTask(nil) 1203 1204 close(stopCollection) 1205 1206 if handleWaitCh != nil { 1207 <-handleWaitCh 1208 } 1209 1210 r.restartTracker.SetRestartTriggered(restartEvent.failure) 1211 break WAIT 1212 1213 case <-r.destroyCh: 1214 r.runningLock.Lock() 1215 running := r.running 1216 r.runningLock.Unlock() 1217 if !running { 1218 r.cleanup() 1219 r.setState(structs.TaskStateDead, r.destroyEvent, false) 1220 return 1221 } 1222 1223 // Remove from consul before killing the task so that traffic 1224 // can be rerouted 1225 r.removeServices() 1226 1227 // Delay actually killing the task if configured. See #244 1228 if r.task.ShutdownDelay > 0 { 1229 r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q", 1230 r.alloc.ID, r.task.Name, r.task.ShutdownDelay) 1231 <-time.After(r.task.ShutdownDelay) 1232 } 1233 1234 // Store the task event that provides context on the task 1235 // destroy. The Killed event is set from the alloc_runner and 1236 // doesn't add detail 1237 var killEvent *structs.TaskEvent 1238 if r.destroyEvent.Type != structs.TaskKilled { 1239 if r.destroyEvent.Type == structs.TaskKilling { 1240 killEvent = r.destroyEvent 1241 } else { 1242 r.setState(structs.TaskStateRunning, r.destroyEvent, false) 1243 } 1244 } 1245 1246 r.killTask(killEvent) 1247 close(stopCollection) 1248 1249 // Wait for handler to exit before calling cleanup 1250 <-handleWaitCh 1251 r.cleanup() 1252 1253 r.setState(structs.TaskStateDead, nil, false) 1254 return 1255 } 1256 } 1257 1258 RESTART: 1259 // shouldRestart will block if the task should restart after a delay. 1260 restart := r.shouldRestart() 1261 if !restart { 1262 r.cleanup() 1263 r.setState(structs.TaskStateDead, nil, false) 1264 return 1265 } 1266 1267 // Clear the handle so a new driver will be created. 
1268 r.handleLock.Lock() 1269 r.handle = nil 1270 handleWaitCh = nil 1271 stopCollection = nil 1272 r.handleLock.Unlock() 1273 } 1274 } 1275 1276 // cleanup removes Consul entries and calls Driver.Cleanup when a task is 1277 // stopping. Errors are logged. 1278 func (r *TaskRunner) cleanup() { 1279 // Remove from Consul 1280 r.removeServices() 1281 1282 drv, err := r.createDriver() 1283 if err != nil { 1284 r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err) 1285 return 1286 } 1287 1288 res := r.getCreatedResources() 1289 1290 ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build()) 1291 attempts := 1 1292 var cleanupErr error 1293 for retry := true; retry; attempts++ { 1294 cleanupErr = drv.Cleanup(ctx, res) 1295 retry = structs.IsRecoverable(cleanupErr) 1296 1297 // Copy current createdResources state in case SaveState is 1298 // called between retries 1299 r.setCreatedResources(res) 1300 1301 // Retry 3 times with sleeps between 1302 if !retry || attempts > 3 { 1303 break 1304 } 1305 time.Sleep(time.Duration(attempts) * time.Second) 1306 } 1307 1308 if cleanupErr != nil { 1309 r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr) 1310 } 1311 return 1312 } 1313 1314 // shouldRestart returns if the task should restart. If the return value is 1315 // true, the task's restart policy has already been considered and any wait time 1316 // between restarts has been applied. 1317 func (r *TaskRunner) shouldRestart() bool { 1318 state, when := r.restartTracker.GetState() 1319 reason := r.restartTracker.GetReason() 1320 switch state { 1321 case structs.TaskNotRestarting, structs.TaskTerminated: 1322 r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID) 1323 if state == structs.TaskNotRestarting { 1324 r.setState(structs.TaskStateDead, 1325 structs.NewTaskEvent(structs.TaskNotRestarting). 1326 SetRestartReason(reason).SetFailsTask(), 1327 false) 1328 } 1329 return false 1330 case structs.TaskRestarting: 1331 r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when) 1332 r.setState(structs.TaskStatePending, 1333 structs.NewTaskEvent(structs.TaskRestarting). 1334 SetRestartDelay(when). 1335 SetRestartReason(reason), 1336 false) 1337 default: 1338 r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state) 1339 return false 1340 } 1341 1342 // Unregister from Consul while waiting to restart. 1343 r.removeServices() 1344 1345 // Sleep but watch for destroy events. 1346 select { 1347 case <-time.After(when): 1348 case <-r.destroyCh: 1349 } 1350 1351 // Destroyed while we were waiting to restart, so abort. 1352 r.destroyLock.Lock() 1353 destroyed := r.destroy 1354 r.destroyLock.Unlock() 1355 if destroyed { 1356 r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name) 1357 r.setState(structs.TaskStateDead, r.destroyEvent, false) 1358 return false 1359 } 1360 1361 return true 1362 } 1363 1364 // killTask kills the running task. A killing event can optionally be passed and 1365 // this event is used to mark the task as being killed. It provides a means to 1366 // store extra information. 
1367 func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) { 1368 r.runningLock.Lock() 1369 running := r.running 1370 r.runningLock.Unlock() 1371 if !running { 1372 return 1373 } 1374 1375 // Get the kill timeout 1376 timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout) 1377 1378 // Build the event 1379 var event *structs.TaskEvent 1380 if killingEvent != nil { 1381 event = killingEvent 1382 event.Type = structs.TaskKilling 1383 } else { 1384 event = structs.NewTaskEvent(structs.TaskKilling) 1385 } 1386 event.SetKillTimeout(timeout) 1387 1388 // Mark that we received the kill event 1389 r.setState(structs.TaskStateRunning, event, false) 1390 1391 handle := r.getHandle() 1392 1393 // Kill the task using an exponential backoff in-case of failures. 1394 destroySuccess, err := r.handleDestroy(handle) 1395 if !destroySuccess { 1396 // We couldn't successfully destroy the resource created. 1397 r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err) 1398 } 1399 1400 r.runningLock.Lock() 1401 r.running = false 1402 r.runningLock.Unlock() 1403 1404 // Store that the task has been destroyed and any associated error. 1405 r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err), true) 1406 } 1407 1408 // startTask creates the driver, task dir, and starts the task. 1409 func (r *TaskRunner) startTask() error { 1410 // Create a driver 1411 drv, err := r.createDriver() 1412 if err != nil { 1413 return fmt.Errorf("failed to create driver of task %q for alloc %q: %v", 1414 r.task.Name, r.alloc.ID, err) 1415 } 1416 1417 // Run prestart 1418 ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build()) 1419 presp, err := drv.Prestart(ctx, r.task) 1420 1421 // Merge newly created resources into previously created resources 1422 if presp != nil { 1423 r.createdResourcesLock.Lock() 1424 r.createdResources.Merge(presp.CreatedResources) 1425 r.createdResourcesLock.Unlock() 1426 1427 // Set any network configuration returned by the driver 1428 r.envBuilder.SetDriverNetwork(presp.Network) 1429 } 1430 1431 if err != nil { 1432 wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v", 1433 r.task.Name, r.alloc.ID, err) 1434 r.logger.Printf("[WARN] client: error from prestart: %s", wrapped) 1435 return structs.WrapRecoverable(wrapped, err) 1436 } 1437 1438 // Create a new context for Start since the environment may have been updated. 
1439 ctx = driver.NewExecContext(r.taskDir, r.envBuilder.Build()) 1440 1441 // Start the job 1442 sresp, err := drv.Start(ctx, r.task) 1443 if err != nil { 1444 wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v", 1445 r.task.Name, r.alloc.ID, err) 1446 r.logger.Printf("[WARN] client: %s", wrapped) 1447 return structs.WrapRecoverable(wrapped, err) 1448 1449 } 1450 1451 // Log driver network information 1452 if sresp.Network != nil && sresp.Network.IP != "" { 1453 if sresp.Network.AutoAdvertise { 1454 r.logger.Printf("[INFO] client: alloc %s task %s auto-advertising detected IP %s", 1455 r.alloc.ID, r.task.Name, sresp.Network.IP) 1456 } else { 1457 r.logger.Printf("[TRACE] client: alloc %s task %s detected IP %s but not auto-advertising", 1458 r.alloc.ID, r.task.Name, sresp.Network.IP) 1459 } 1460 } 1461 1462 if sresp.Network == nil || sresp.Network.IP == "" { 1463 r.logger.Printf("[TRACE] client: alloc %s task %s could not detect a driver IP", r.alloc.ID, r.task.Name) 1464 } 1465 1466 // Update environment with the network defined by the driver's Start method. 1467 r.envBuilder.SetDriverNetwork(sresp.Network) 1468 1469 if err := r.registerServices(drv, sresp.Handle, sresp.Network); err != nil { 1470 // All IO is done asynchronously, so errors from registering 1471 // services are hard failures. 1472 r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err) 1473 1474 // Kill the started task 1475 if destroyed, err := r.handleDestroy(sresp.Handle); !destroyed { 1476 r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. Resources may be leaked: %v", 1477 r.task.Name, r.alloc.ID, err) 1478 } 1479 return structs.NewRecoverableError(err, false) 1480 } 1481 1482 r.handleLock.Lock() 1483 r.handle = sresp.Handle 1484 r.handleLock.Unlock() 1485 1486 // Need to persist the driver network between restarts 1487 r.driverNetLock.Lock() 1488 r.driverNet = sresp.Network 1489 r.driverNetLock.Unlock() 1490 1491 return nil 1492 } 1493 1494 // registerServices and checks with Consul. 1495 func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n *cstructs.DriverNetwork) error { 1496 var exec driver.ScriptExecutor 1497 if d.Abilities().Exec { 1498 // Allow set the script executor if the driver supports it 1499 exec = h 1500 } 1501 interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task) 1502 taskServices := consul.NewTaskServices(r.alloc, interpolatedTask, r, exec, n) 1503 return r.consul.RegisterTask(taskServices) 1504 } 1505 1506 // interpolateServices interpolates tags in a service and checks with values from the 1507 // task's environment. 
1508 func interpolateServices(taskEnv *env.TaskEnv, task *structs.Task) *structs.Task { 1509 taskCopy := task.Copy() 1510 for _, service := range taskCopy.Services { 1511 for _, check := range service.Checks { 1512 check.Name = taskEnv.ReplaceEnv(check.Name) 1513 check.Type = taskEnv.ReplaceEnv(check.Type) 1514 check.Command = taskEnv.ReplaceEnv(check.Command) 1515 check.Args = taskEnv.ParseAndReplace(check.Args) 1516 check.Path = taskEnv.ReplaceEnv(check.Path) 1517 check.Protocol = taskEnv.ReplaceEnv(check.Protocol) 1518 check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel) 1519 check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus) 1520 check.Method = taskEnv.ReplaceEnv(check.Method) 1521 check.GRPCService = taskEnv.ReplaceEnv(check.GRPCService) 1522 if len(check.Header) > 0 { 1523 header := make(map[string][]string, len(check.Header)) 1524 for k, vs := range check.Header { 1525 newVals := make([]string, len(vs)) 1526 for i, v := range vs { 1527 newVals[i] = taskEnv.ReplaceEnv(v) 1528 } 1529 header[taskEnv.ReplaceEnv(k)] = newVals 1530 } 1531 check.Header = header 1532 } 1533 } 1534 service.Name = taskEnv.ReplaceEnv(service.Name) 1535 service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel) 1536 service.Tags = taskEnv.ParseAndReplace(service.Tags) 1537 service.CanaryTags = taskEnv.ParseAndReplace(service.CanaryTags) 1538 } 1539 return taskCopy 1540 } 1541 1542 // buildTaskDir creates the task directory before driver.Prestart. It is safe 1543 // to call multiple times as its state is persisted. 1544 func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error { 1545 r.persistLock.Lock() 1546 built := r.taskDirBuilt 1547 r.persistLock.Unlock() 1548 1549 // We do not set the state again since this only occurs during restoration 1550 // and the task dir is already built. The reason we call Build again is to 1551 // ensure that the task dir invariants are still held. 1552 if !built { 1553 r.setState(structs.TaskStatePending, 1554 structs.NewTaskEvent(structs.TaskSetup).SetMessage(structs.TaskBuildingTaskDir), 1555 false) 1556 } 1557 1558 chroot := config.DefaultChrootEnv 1559 if len(r.config.ChrootEnv) > 0 { 1560 chroot = r.config.ChrootEnv 1561 } 1562 if err := r.taskDir.Build(built, chroot, fsi); err != nil { 1563 return err 1564 } 1565 1566 // Mark task dir as successfully built 1567 r.persistLock.Lock() 1568 r.taskDirBuilt = true 1569 r.persistLock.Unlock() 1570 1571 // Set path and host related env vars 1572 driver.SetEnvvars(r.envBuilder, fsi, r.taskDir, r.config) 1573 return nil 1574 } 1575 1576 // collectResourceUsageStats starts collecting resource usage stats of a Task. 
1577 // Collection ends when the passed channel is closed 1578 func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) { 1579 // start collecting the stats right away and then start collecting every 1580 // collection interval 1581 next := time.NewTimer(0) 1582 defer next.Stop() 1583 for { 1584 select { 1585 case <-next.C: 1586 next.Reset(r.config.StatsCollectionInterval) 1587 handle := r.getHandle() 1588 if handle == nil { 1589 continue 1590 } 1591 ru, err := handle.Stats() 1592 1593 if err != nil { 1594 // Check if the driver doesn't implement stats 1595 if err.Error() == driver.DriverStatsNotImplemented.Error() { 1596 r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID) 1597 return 1598 } 1599 1600 // We do not log when the plugin is shutdown as this is simply a 1601 // race between the stopCollection channel being closed and calling 1602 // Stats on the handle. 1603 if !strings.Contains(err.Error(), "connection is shut down") { 1604 r.logger.Printf("[DEBUG] client: error fetching stats of task %v: %v", r.task.Name, err) 1605 } 1606 continue 1607 } 1608 1609 r.resourceUsageLock.Lock() 1610 r.resourceUsage = ru 1611 r.resourceUsageLock.Unlock() 1612 if ru != nil { 1613 r.emitStats(ru) 1614 } 1615 case <-stopCollection: 1616 return 1617 } 1618 } 1619 } 1620 1621 // LatestResourceUsage returns the last resource utilization datapoint collected 1622 func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage { 1623 r.resourceUsageLock.RLock() 1624 defer r.resourceUsageLock.RUnlock() 1625 r.runningLock.Lock() 1626 defer r.runningLock.Unlock() 1627 1628 // If the task is not running there can be no latest resource 1629 if !r.running { 1630 return nil 1631 } 1632 1633 return r.resourceUsage 1634 } 1635 1636 // handleUpdate takes an updated allocation and updates internal state to 1637 // reflect the new config for the task. 1638 func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { 1639 // Extract the task group from the alloc. 1640 tg := update.Job.LookupTaskGroup(update.TaskGroup) 1641 if tg == nil { 1642 return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup) 1643 } 1644 1645 // Extract the task. 1646 var updatedTask *structs.Task 1647 for _, t := range tg.Tasks { 1648 if t.Name == r.task.Name { 1649 updatedTask = t.Copy() 1650 break 1651 } 1652 } 1653 if updatedTask == nil { 1654 return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name) 1655 } 1656 1657 // Merge in the task resources 1658 updatedTask.Resources = update.TaskResources[updatedTask.Name] 1659 1660 // Interpolate the old task with the old env before updating the env as 1661 // updating services in Consul need both the old and new interpolations 1662 // to find differences. 1663 oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), r.task) 1664 1665 // Now it's safe to update the environment 1666 r.envBuilder.UpdateTask(update, updatedTask) 1667 1668 var mErr multierror.Error 1669 r.handleLock.Lock() 1670 if r.handle != nil { 1671 drv, err := r.createDriver() 1672 if err != nil { 1673 // Something has really gone wrong; don't continue 1674 r.handleLock.Unlock() 1675 return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err) 1676 } 1677 1678 // Update will update resources and store the new kill timeout. 
1679 if err := r.handle.Update(updatedTask); err != nil { 1680 mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err)) 1681 } 1682 1683 // Update services in Consul 1684 newInterpolatedTask := interpolateServices(r.envBuilder.Build(), updatedTask) 1685 if err := r.updateServices(drv, r.handle, r.alloc, oldInterpolatedTask, update, newInterpolatedTask); err != nil { 1686 mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err)) 1687 } 1688 } 1689 r.handleLock.Unlock() 1690 1691 // Update the restart policy. 1692 if r.restartTracker != nil { 1693 r.restartTracker.SetPolicy(tg.RestartPolicy) 1694 } 1695 1696 // Store the updated alloc. 1697 r.alloc = update 1698 r.task = updatedTask 1699 return mErr.ErrorOrNil() 1700 } 1701 1702 // updateServices and checks with Consul. Tasks must be interpolated! 1703 func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, 1704 oldAlloc *structs.Allocation, oldTask *structs.Task, 1705 newAlloc *structs.Allocation, newTask *structs.Task) error { 1706 1707 var exec driver.ScriptExecutor 1708 if d.Abilities().Exec { 1709 // Allow set the script executor if the driver supports it 1710 exec = h 1711 } 1712 r.driverNetLock.Lock() 1713 net := r.driverNet.Copy() 1714 r.driverNetLock.Unlock() 1715 oldTaskServices := consul.NewTaskServices(oldAlloc, oldTask, r, exec, net) 1716 newTaskServices := consul.NewTaskServices(newAlloc, newTask, r, exec, net) 1717 return r.consul.UpdateTask(oldTaskServices, newTaskServices) 1718 } 1719 1720 // removeServices and checks from Consul. Handles interpolation and deleting 1721 // Canary=true and Canary=false versions in case Canary=false is set at the 1722 // same time as the alloc is stopped. 1723 func (r *TaskRunner) removeServices() { 1724 interpTask := interpolateServices(r.envBuilder.Build(), r.task) 1725 taskServices := consul.NewTaskServices(r.alloc, interpTask, r, nil, nil) 1726 r.consul.RemoveTask(taskServices) 1727 1728 // Flip Canary and remove again in case canary is getting flipped at 1729 // the same time as the alloc is being destroyed 1730 taskServices.Canary = !taskServices.Canary 1731 r.consul.RemoveTask(taskServices) 1732 } 1733 1734 // handleDestroy kills the task handle. In the case that killing fails, 1735 // handleDestroy will retry with an exponential backoff and will give up at a 1736 // given limit. It returns whether the task was destroyed and the error 1737 // associated with the last kill attempt. 1738 func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) { 1739 // Cap the number of times we attempt to kill the task. 1740 for i := 0; i < killFailureLimit; i++ { 1741 if err = handle.Kill(); err != nil { 1742 // Calculate the new backoff 1743 backoff := (1 << (2 * uint64(i))) * killBackoffBaseline 1744 if backoff > killBackoffLimit { 1745 backoff = killBackoffLimit 1746 } 1747 1748 r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v", 1749 r.task.Name, r.alloc.ID, backoff, err) 1750 time.Sleep(backoff) 1751 } else { 1752 // Kill was successful 1753 return true, nil 1754 } 1755 } 1756 return 1757 } 1758 1759 // Restart will restart the task. 
func (r *TaskRunner) Restart(source, reason string, failure bool) {
	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := newTaskRestartEvent(reasonStr, failure)

	select {
	case r.restartCh <- event:
	case <-r.waitCh:
	}
}

// Signal will send a signal to the task
func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {

	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)

	resCh := make(chan error)
	se := SignalEvent{
		s:      s,
		e:      event,
		result: resCh,
	}

	select {
	case r.signalCh <- se:
	case <-r.waitCh:
	}

	return <-resCh
}

// Kill will kill a task and store the error, no longer restarting the task. If
// fail is set, the task is marked as having failed.
func (r *TaskRunner) Kill(source, reason string, fail bool) {
	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
	if fail {
		event.SetFailsTask()
	}

	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
	r.Destroy(event)
}

// EmitEvent records a task event from the given source and logs it.
func (r *TaskRunner) EmitEvent(source, message string) {
	event := structs.NewTaskEvent(source).
		SetMessage(message)
	r.setState("", event, false)
	r.logger.Printf("[DEBUG] client: event from %q for task %q in alloc %q: %v",
		source, r.task.Name, r.alloc.ID, message)
}

// UnblockStart unblocks the starting of the task. It currently assumes only
// consul-template will unblock it.
func (r *TaskRunner) UnblockStart(source string) {
	r.unblockLock.Lock()
	defer r.unblockLock.Unlock()
	if r.unblocked {
		return
	}

	r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
	r.unblocked = true
	close(r.unblockCh)
}

// waitErrorToEvent is a helper for converting a WaitResult into a
// TaskTerminated event.
func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
	return structs.NewTaskEvent(structs.TaskTerminated).
		SetExitCode(res.ExitCode).
		SetSignal(res.Signal).
		SetExitMessage(res.Err)
}

// Update is used to update the task of the context
func (r *TaskRunner) Update(update *structs.Allocation) {
	select {
	case r.updateCh <- update:
	default:
		r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
			r.task.Name, r.alloc.ID)
	}
}

// Destroy is used to indicate that the task context should be destroyed. The
// event parameter provides a context for the destroy.
func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
	r.destroyLock.Lock()
	defer r.destroyLock.Unlock()

	if r.destroy {
		return
	}
	r.destroy = true
	r.destroyEvent = event
	close(r.destroyCh)
}

// getCreatedResources returns the resources created by drivers. It will never
// return nil.
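// The returned value is a copy, so callers may read or modify it without
// holding createdResourcesLock.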
func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
	r.createdResourcesLock.Lock()
	if r.createdResources == nil {
		r.createdResources = driver.NewCreatedResources()
	}
	cr := r.createdResources.Copy()
	r.createdResourcesLock.Unlock()

	return cr
}

// setCreatedResources updates the resources created by drivers. If passed nil
// it will set createdResources to an initialized struct.
func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
	if cr == nil {
		cr = driver.NewCreatedResources()
	}
	r.createdResourcesLock.Lock()
	r.createdResources = cr.Copy()
	r.createdResourcesLock.Unlock()
}

func (r *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
	if !r.config.DisableTaggedMetrics {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
			float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
			float32(ru.ResourceUsage.MemoryStats.Cache), r.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
			float32(ru.ResourceUsage.MemoryStats.Swap), r.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
			float32(ru.ResourceUsage.MemoryStats.MaxUsage), r.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
			float32(ru.ResourceUsage.MemoryStats.KernelUsage), r.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
			float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), r.baseLabels)
	}

	if r.config.BackwardsCompatibleMetrics {
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
	}
}

func (r *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) {
	if !r.config.DisableTaggedMetrics {
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"},
			float32(ru.ResourceUsage.CpuStats.Percent), r.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"},
			float32(ru.ResourceUsage.CpuStats.SystemMode), r.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"},
			float32(ru.ResourceUsage.CpuStats.UserMode), r.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"},
			float32(ru.ResourceUsage.CpuStats.ThrottledTime), r.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"},
			float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), r.baseLabels)
		metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"},
			float32(ru.ResourceUsage.CpuStats.TotalTicks), r.baseLabels)
	}

	if r.config.BackwardsCompatibleMetrics {
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
	}
}

// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
	if !r.config.PublishAllocationMetrics {
		return
	}

	// If the task is not running don't emit anything
	r.runningLock.Lock()
	running := r.running
	r.runningLock.Unlock()
	if !running {
		return
	}

	if ru.ResourceUsage.MemoryStats != nil {
		r.setGaugeForMemory(ru)
	}

	if ru.ResourceUsage.CpuStats != nil {
		r.setGaugeForCPU(ru)
	}
}
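
// killBackoffForAttempt is an illustrative sketch, not part of the original
// task runner: it spells out the exponential backoff that handleDestroy uses
// between kill attempts. With killBackoffBaseline = 5s and killBackoffLimit =
// 2m, successive attempts wait 5s, 20s, 80s, and then 2m for any remaining
// attempts up to killFailureLimit.
func killBackoffForAttempt(attempt int) time.Duration {
	// Same formula as handleDestroy: baseline * 4^attempt, capped at the limit.
	backoff := (1 << (2 * uint64(attempt))) * killBackoffBaseline
	if backoff > killBackoffLimit {
		backoff = killBackoffLimit
	}
	return backoff
}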