// Source: github.com/djenriquez/nomad-1@v0.8.1/client/task_runner.go

package client

import (
	"bytes"
	"crypto/md5"
	"encoding/hex"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	"github.com/boltdb/bolt"
	"github.com/golang/snappy"
	"github.com/hashicorp/consul-template/signals"
	"github.com/hashicorp/go-multierror"
	version "github.com/hashicorp/go-version"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/driver"
	"github.com/hashicorp/nomad/client/getter"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/ugorji/go/codec"

	"github.com/hashicorp/nomad/client/driver/env"
	dstructs "github.com/hashicorp/nomad/client/driver/structs"
	cstructs "github.com/hashicorp/nomad/client/structs"
)

const (
	// killBackoffBaseline is the baseline time for exponential backoff while
	// killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the limit of the exponential backoff for killing
	// the task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is how many times we will attempt to kill a task before
	// giving up and potentially leaking resources.
	killFailureLimit = 5

	// vaultBackoffBaseline is the baseline time for exponential backoff when
	// attempting to retrieve a Vault token
	vaultBackoffBaseline = 5 * time.Second

	// vaultBackoffLimit is the limit of the exponential backoff when attempting
	// to retrieve a Vault token
	vaultBackoffLimit = 3 * time.Minute

	// vaultTokenFile is the name of the file holding the Vault token inside the
	// task's secret directory
	vaultTokenFile = "vault_token"
)

var (
	// taskRunnerStateAllKey holds all the task runners state. At the moment
	// there is no need to split it
	taskRunnerStateAllKey = []byte("simple-all")
)

// taskRestartEvent wraps a TaskEvent with additional metadata to control
// restart behavior.
type taskRestartEvent struct {
	// taskEvent to report
	taskEvent *structs.TaskEvent

	// if false, don't count against restart count
	failure bool
}

// newTaskRestartEvent builds a taskRestartEvent whose task event is a
// TaskRestartSignal event carrying the given restart reason.
func newTaskRestartEvent(reason string, failure bool) *taskRestartEvent {
	return &taskRestartEvent{
		taskEvent: structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason),
		failure:   failure,
	}
}

// TaskRunner is used to wrap a task within an allocation and provide the execution context.
type TaskRunner struct {
	// stateDB is the bolt database the task runner state is read from and
	// written to.
	stateDB *bolt.DB
	config  *config.Config
	// updater is invoked whenever the task's state changes.
	updater        TaskStateUpdater
	logger         *log.Logger
	restartTracker *RestartTracker
	consul         ConsulServiceAPI

	// running marks whether the task is running
	running     bool
	runningLock sync.Mutex

	resourceUsage     *cstructs.TaskResourceUsage
	resourceUsageLock sync.RWMutex

	alloc   *structs.Allocation
	task    *structs.Task
	taskDir *allocdir.TaskDir

	// envBuilder is used to build the task's environment
	envBuilder *env.Builder

	// driverNet is the network information returned by the driver
	driverNet     *cstructs.DriverNetwork
	driverNetLock sync.Mutex

	// updateCh is used to receive updated versions of the allocation
	updateCh chan *structs.Allocation

	// handle is the driver handle of the started (or restored) task; it is
	// nil until one exists. Guarded by handleLock.
	handle     driver.DriverHandle
	handleLock sync.Mutex

	// artifactsDownloaded tracks whether the tasks artifacts have been
	// downloaded
	//
	// Must acquire persistLock when accessing
	artifactsDownloaded bool

	// taskDirBuilt tracks whether the task has built its directory.
	//
	// Must acquire persistLock when accessing
	taskDirBuilt bool

	// createdResources are all the resources created by the task driver
	// across all attempts to start the task.
	// Simple gets and sets should use {get,set}CreatedResources
	createdResources     *driver.CreatedResources
	createdResourcesLock sync.Mutex

	// payloadRendered tracks whether the payload has been rendered to disk
	payloadRendered bool

	// vaultFuture is the means to wait for and get a Vault token
	vaultFuture *tokenFuture

	// recoveredVaultToken is the token that was recovered through a restore
	recoveredVaultToken string

	// vaultClient is used to retrieve and renew any needed Vault token
	vaultClient vaultclient.VaultClient

	// templateManager is used to manage any consul-templates this task may have
	templateManager *TaskTemplateManager

	// startCh is used to trigger the start of the task
	startCh chan struct{}

	// unblockCh is used to unblock the starting of the task
	unblockCh   chan struct{}
	unblocked   bool
	unblockLock sync.Mutex

	// restartCh is used to restart a task
	restartCh chan *taskRestartEvent

	// signalCh is used to send a signal to a task
	signalCh chan SignalEvent

	destroy      bool
	destroyCh    chan struct{}
	destroyLock  sync.Mutex
	destroyEvent *structs.TaskEvent

	// waitCh closing marks the run loop as having exited
	waitCh chan struct{}

	// persistLock must be acquired when accessing fields stored by
	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
	// AllocRunner, so all state fields must be synchronized using this
	// lock.
	persistLock sync.Mutex

	// persistedHash is the hash of the last persisted snapshot. It is used to
	// detect if a new snapshot has to be written to disk.
	persistedHash []byte

	// baseLabels are used when emitting tagged metrics. All task runner metrics
	// will have these tags, and optionally more.
	baseLabels []metrics.Label
}

// taskRunnerState is used to snapshot the state of the task runner
type taskRunnerState struct {
	Version            string
	HandleID           string
	ArtifactDownloaded bool
	TaskDirBuilt       bool
	PayloadRendered    bool
	CreatedResources   *driver.CreatedResources
	DriverNetwork      *cstructs.DriverNetwork
}

// Hash returns an MD5 digest computed over every field of the snapshot, in
// declaration order.
func (s *taskRunnerState) Hash() []byte {
	h := md5.New()

	io.WriteString(h, s.Version)
	io.WriteString(h, s.HandleID)
	io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded))
	io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt))
	io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered))
	h.Write(s.CreatedResources.Hash())
	h.Write(s.DriverNetwork.Hash())

	return h.Sum(nil)
}

// TaskStateUpdater is used to signal that tasks state has changed. If lazySync
// is set the event won't be immediately pushed to the server.
type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent, lazySync bool)

// SignalEvent is a tuple of the signal and the event generating it
type SignalEvent struct {
	// s is the signal to be sent
	s os.Signal

	// e is the task event generating the signal
	e *structs.TaskEvent

	// result should be used to send back the result of the signal
	result chan<- error
}

// NewTaskRunner is used to create a new task context. It returns nil when the
// allocation's task group cannot be found in the allocation's job.
func NewTaskRunner(logger *log.Logger, config *config.Config,
	stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir,
	alloc *structs.Allocation, task *structs.Task,
	vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI) *TaskRunner {

	// Merge in the task resources
	task.Resources = alloc.TaskResources[task.Name]

	// Build the restart tracker.
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		logger.Printf("[ERR] client: alloc %q for missing task group %q", alloc.ID, alloc.TaskGroup)
		return nil
	}
	restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)

	// Initialize the environment builder
	envBuilder := env.NewBuilder(config.Node, alloc, task, config.Region)

	tc := &TaskRunner{
		config:           config,
		stateDB:          stateDB,
		updater:          updater,
		logger:           logger,
		restartTracker:   restartTracker,
		alloc:            alloc,
		task:             task,
		taskDir:          taskDir,
		envBuilder:       envBuilder,
		createdResources: driver.NewCreatedResources(),
		consul:           consulClient,
		vaultClient:      vaultClient,
		// The future starts out set to the empty token; Run clears it when
		// the task has a Vault block.
		vaultFuture: NewTokenFuture().Set(""),
		updateCh:    make(chan *structs.Allocation, 64),
		destroyCh:   make(chan struct{}),
		waitCh:      make(chan struct{}),
		startCh:     make(chan struct{}, 1),
		unblockCh:   make(chan struct{}),
		restartCh:   make(chan *taskRestartEvent),
		signalCh:    make(chan SignalEvent),
	}

	tc.baseLabels = []metrics.Label{
		{
			Name:  "job",
			Value: tc.alloc.Job.Name,
		},
		{
			Name:  "task_group",
			Value: tc.alloc.TaskGroup,
		},
		{
			Name:  "alloc_id",
			Value: tc.alloc.ID,
		},
		{
			Name:  "task",
			Value: tc.task.Name,
		},
	}

	return tc
}

// MarkReceived marks the task as received.
func (r *TaskRunner) MarkReceived() {
	// We lazy sync this since there will be a follow up message almost
	// immediately.
	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived), true)
}

// WaitCh returns a channel to wait for termination
func (r *TaskRunner) WaitCh() <-chan struct{} {
	return r.waitCh
}

// getHandle returns the task's handle or nil
func (r *TaskRunner) getHandle() driver.DriverHandle {
	r.handleLock.Lock()
	h := r.handle
	r.handleLock.Unlock()
	return h
}

// pre060StateFilePath returns the path to our state file that would have been
// written pre v0.6.0
// COMPAT: Remove in 0.7.0
func (r *TaskRunner) pre060StateFilePath() string {
	// Get the MD5 of the task name
	hashVal := md5.Sum([]byte(r.task.Name))
	hashHex := hex.EncodeToString(hashVal[:])
	dirName := fmt.Sprintf("task-%s", hashHex)

	// Generate the path
	return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json")
}

// RestoreState is used to restore our state. If a non-empty string is returned
// the task is restarted with the string as the reason. This is useful for
// backwards incompatible upgrades that need to restart tasks with a new
// executor.
func (r *TaskRunner) RestoreState() (string, error) {
	// COMPAT: Remove in 0.7.0
	// 0.6.0 transitioned from individual state files to a single bolt-db.
	// The upgrade path is to:
	// Check if old state exists
	// If so, restore from that and delete old state
	// Restore using state database

	var snap taskRunnerState

	// Check if the old snapshot is there
	oldPath := r.pre060StateFilePath()
	if err := pre060RestoreState(oldPath, &snap); err == nil {
		// Delete the old state
		os.RemoveAll(oldPath)
	} else if !os.IsNotExist(err) {
		// Something corrupt in the old state file
		return "", err
	} else {
		// We are doing a normal restore
		err := r.stateDB.View(func(tx *bolt.Tx) error {
			bkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
			if err != nil {
				return fmt.Errorf("failed to get task bucket: %v", err)
			}

			if err := getObject(bkt, taskRunnerStateAllKey, &snap); err != nil {
				return fmt.Errorf("failed to read task runner state: %v", err)
			}
			return nil
		})
		if err != nil {
			return "", err
		}
	}

	// Restore fields from the snapshot
	r.artifactsDownloaded = snap.ArtifactDownloaded
	r.taskDirBuilt = snap.TaskDirBuilt
	r.payloadRendered = snap.PayloadRendered
	r.setCreatedResources(snap.CreatedResources)
	r.driverNet = snap.DriverNetwork

	if r.task.Vault != nil {
		// Read the token from the secret directory
		tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
		data, err := ioutil.ReadFile(tokenPath)
		if err != nil {
			if !os.IsNotExist(err) {
				return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
			}

			// Token file doesn't exist
		} else {
			// Store the recovered token
			r.recoveredVaultToken = string(data)
		}
	}

	// Restore the driver
	restartReason := ""
	if snap.HandleID != "" {
		d, err := r.createDriver()
		if err != nil {
			return "", err
		}

		// Add the restored network driver to the environment
		r.envBuilder.SetDriverNetwork(r.driverNet)

		// Open a connection to the driver handle
		ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
		handle, err := d.Open(ctx, snap.HandleID)

		// In the case it fails, we relaunch the task in the Run() method.
		if err != nil {
			r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
				r.task.Name, r.alloc.ID, err)
			return "", nil
		}

		if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) {
			restartReason = pre06ScriptCheckReason
		}

		if err := r.registerServices(d, handle, r.driverNet); err != nil {
			// Don't hard fail here as there's a chance this task
			// registered with Consul properly when it initial
			// started.
			r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v",
				r.task.Name, r.alloc.ID, err)
		}

		r.handleLock.Lock()
		r.handle = handle
		r.handleLock.Unlock()

		r.runningLock.Lock()
		r.running = true
		r.runningLock.Unlock()
	}
	return restartReason, nil
}

// ver06 is used for checking for pre-0.6 script checks
var ver06 = version.Must(version.NewVersion("0.6.0dev"))

// pre06ScriptCheckReason is the restart reason given when a pre-0.6 script
// check is found on an exec/java task.
const pre06ScriptCheckReason = "upgrading pre-0.6 script checks"

// pre06ScriptCheck returns true if the task uses the exec, java or
// mock_driver driver and either the given version cannot be parsed or it is
// prior to 0.6.0dev and one of the services has a script check.
func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool {
	if driver != "exec" && driver != "java" && driver != "mock_driver" {
		// Only the exec, java and mock_driver drivers are checked
		return false
	}
	v, err := version.NewVersion(ver)
	if err != nil {
		// Treat it as old
		return true
	}
	if !v.LessThan(ver06) {
		// >= 0.6.0dev
		return false
	}
	for _, service := range services {
		for _, check := range service.Checks {
			if check.Type == "script" {
				return true
			}
		}
	}
	return false
}

// SaveState is used to snapshot our state. Nothing is written when the task
// runner has been destroyed or when the snapshot hashes to the same value as
// the last persisted one.
func (r *TaskRunner) SaveState() error {
	r.destroyLock.Lock()
	defer r.destroyLock.Unlock()
	if r.destroy {
		// Don't save state if already destroyed
		return nil
	}

	r.persistLock.Lock()
	defer r.persistLock.Unlock()
	snap := taskRunnerState{
		Version:            r.config.Version.VersionNumber(),
		ArtifactDownloaded: r.artifactsDownloaded,
		TaskDirBuilt:       r.taskDirBuilt,
		PayloadRendered:    r.payloadRendered,
		CreatedResources:   r.getCreatedResources(),
	}

	// The handle ID stays empty when no handle exists yet.
	r.handleLock.Lock()
	if r.handle != nil {
		snap.HandleID = r.handle.ID()
	}
	r.handleLock.Unlock()

	r.driverNetLock.Lock()
	snap.DriverNetwork = r.driverNet.Copy()
	r.driverNetLock.Unlock()

	// If nothing has changed avoid the write
	h := snap.Hash()
	if bytes.Equal(h, r.persistedHash) {
		return nil
	}

	// Serialize the object
	var buf bytes.Buffer
	if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil {
		return fmt.Errorf("failed to serialize snapshot: %v", err)
	}

	// Start the transaction.
	return r.stateDB.Batch(func(tx *bolt.Tx) error {
		// Grab the task bucket
		taskBkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
		if err != nil {
			return fmt.Errorf("failed to retrieve allocation bucket: %v", err)
		}

		if err := putData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil {
			return fmt.Errorf("failed to write task_runner state: %v", err)
		}

		// Store the hash that was persisted
		tx.OnCommit(func() {
			r.persistedHash = h
		})

		return nil
	})
}

// DestroyState is used to cleanup after ourselves. It deletes the task's
// bucket from the state database while holding persistLock.
func (r *TaskRunner) DestroyState() error {
	r.persistLock.Lock()
	defer r.persistLock.Unlock()

	return r.stateDB.Update(func(tx *bolt.Tx) error {
		if err := deleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil {
			return fmt.Errorf("failed to delete task bucket: %v", err)
		}
		return nil
	})
}

// setState is used to update the state of the task runner. A failure to
// persist the state is logged and does not prevent the updater from being
// called.
func (r *TaskRunner) setState(state string, event *structs.TaskEvent, lazySync bool) {
	event.PopulateEventDisplayMessage()

	// Persist our state to disk.
	if err := r.SaveState(); err != nil {
		r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
	}

	// Indicate the task has been updated.
	r.updater(r.task.Name, state, event, lazySync)
}

// createDriver makes a driver for the task
func (r *TaskRunner) createDriver() (driver.Driver, error) {
	// Create a task-specific event emitter callback to expose minimal
	// state to drivers
	eventEmitter := func(m string, args ...interface{}) {
		msg := fmt.Sprintf(m, args...)
		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg), false)
	}

	driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, eventEmitter)
	d, err := driver.NewDriver(r.task.Driver, driverCtx)
	if err != nil {
		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
			r.task.Driver, r.alloc.ID, err)
	}

	// err is always nil at this point.
	return d, err
}

// Run is a long running routine used to manage the task. waitCh is closed
// when it returns.
func (r *TaskRunner) Run() {
	defer close(r.waitCh)
	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
		r.task.Name, r.alloc.ID)

	if err := r.validateTask(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask(),
			false)
		return
	}

	// Create a temporary driver so that we can determine the FSIsolation
	// required. run->startTask will create a new driver after environment
	// has been setup (env vars, templates, artifacts, secrets, etc).
	tmpDrv, err := r.createDriver()
	if err != nil {
		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
			false)
		return
	}

	// Build base task directory structure regardless of FS isolation abilities.
	// This needs to happen before we start the Vault manager and call prestart
	// as both those can write to the task directories
	if err := r.buildTaskDir(tmpDrv.FSIsolation()); err != nil {
		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
			false)
		return
	}

	// If there is no Vault policy leave the static future created in
	// NewTaskRunner
	if r.task.Vault != nil {
		// Start the go-routine to get a Vault token
		r.vaultFuture.Clear()
		go r.vaultManager(r.recoveredVaultToken)
	}

	// Start the run loop
	r.run()

	// Do any cleanup necessary
	r.postrun()

	return
}

// validateTask validates the fields of the task and returns an error if the
// task is invalid. A single failure is returned as-is; several failures are
// returned as one combined error.
func (r *TaskRunner) validateTask() error {
	var mErr multierror.Error

	// Validate the user.
	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
		}
	}

	// Validate the artifacts
	for i, artifact := range r.task.Artifacts {
		// Verify the artifact doesn't escape the task directory.
		if err := artifact.Validate(); err != nil {
			// If this error occurs there is potentially a server bug or
			// malicious, server spoofing.
			r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
				r.alloc.ID, r.task.Name, artifact, i, err)
			mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
		}
	}

	// Validate the Service names
	taskEnv := r.envBuilder.Build()
	for i, service := range r.task.Services {
		// Names are validated after environment interpolation.
		name := taskEnv.ReplaceEnv(service.Name)
		if err := service.ValidateName(name); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
		}
	}

	if len(mErr.Errors) == 1 {
		return mErr.Errors[0]
	}
	return mErr.ErrorOrNil()
}

// tokenFuture stores the Vault token and allows consumers to block till a valid
// token exists
type tokenFuture struct {
	// waiting holds the channels handed out by Wait that have not been
	// closed yet.
	waiting []chan struct{}
	token   string
	// set reports whether a token is currently set.
	set bool
	// m guards all of the fields above.
	m sync.Mutex
}

// NewTokenFuture returns a new token future without any token set
func NewTokenFuture() *tokenFuture {
	return &tokenFuture{}
}

// Wait returns a channel that can be waited on. When this channel unblocks, a
// valid token will be available via the Get method
func (f *tokenFuture) Wait() <-chan struct{} {
	f.m.Lock()
	defer f.m.Unlock()

	c := make(chan struct{})
	if f.set {
		// A token is already set, so hand back an already-closed channel.
		close(c)
		return c
	}

	f.waiting = append(f.waiting, c)
	return c
}

// Set sets the token value and unblocks any caller of Wait
func (f *tokenFuture) Set(token string) *tokenFuture {
	f.m.Lock()
	defer f.m.Unlock()

	f.set = true
	f.token = token
	for _, w := range f.waiting {
		close(w)
	}
	f.waiting = nil
	return f
}

// Clear clears the set vault token.
718 func (f *tokenFuture) Clear() *tokenFuture { 719 f.m.Lock() 720 defer f.m.Unlock() 721 722 f.token = "" 723 f.set = false 724 return f 725 } 726 727 // Get returns the set Vault token 728 func (f *tokenFuture) Get() string { 729 f.m.Lock() 730 defer f.m.Unlock() 731 return f.token 732 } 733 734 // vaultManager should be called in a go-routine and manages the derivation, 735 // renewal and handling of errors with the Vault token. The optional parameter 736 // allows setting the initial Vault token. This is useful when the Vault token 737 // is recovered off disk. 738 func (r *TaskRunner) vaultManager(token string) { 739 // Helper for stopping token renewal 740 stopRenewal := func() { 741 if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil { 742 r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err) 743 } 744 } 745 746 // updatedToken lets us store state between loops. If true, a new token 747 // has been retrieved and we need to apply the Vault change mode 748 var updatedToken bool 749 750 OUTER: 751 for { 752 // Check if we should exit 753 select { 754 case <-r.waitCh: 755 stopRenewal() 756 return 757 default: 758 } 759 760 // Clear the token 761 r.vaultFuture.Clear() 762 763 // Check if there already is a token which can be the case for 764 // restoring the TaskRunner 765 if token == "" { 766 // Get a token 767 var exit bool 768 token, exit = r.deriveVaultToken() 769 if exit { 770 // Exit the manager 771 return 772 } 773 774 // Write the token to disk 775 if err := r.writeToken(token); err != nil { 776 e := fmt.Errorf("failed to write Vault token to disk") 777 r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err) 778 r.Kill("vault", e.Error(), true) 779 return 780 } 781 } 782 783 // Start the renewal process 784 renewCh, err := r.vaultClient.RenewToken(token, 30) 785 786 // An error returned means the token is not being renewed 787 
if err != nil { 788 r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err) 789 token = "" 790 goto OUTER 791 } 792 793 // The Vault token is valid now, so set it 794 r.vaultFuture.Set(token) 795 796 if updatedToken { 797 switch r.task.Vault.ChangeMode { 798 case structs.VaultChangeModeSignal: 799 s, err := signals.Parse(r.task.Vault.ChangeSignal) 800 if err != nil { 801 e := fmt.Errorf("failed to parse signal: %v", err) 802 r.logger.Printf("[ERR] client: %v", err) 803 r.Kill("vault", e.Error(), true) 804 return 805 } 806 807 if err := r.Signal("vault", "new Vault token acquired", s); err != nil { 808 r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err) 809 r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true) 810 return 811 } 812 case structs.VaultChangeModeRestart: 813 const noFailure = false 814 r.Restart("vault", "new Vault token acquired", noFailure) 815 case structs.VaultChangeModeNoop: 816 fallthrough 817 default: 818 r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode) 819 } 820 821 // We have handled it 822 updatedToken = false 823 824 // Call the handler 825 r.updatedTokenHandler() 826 } 827 828 // Start watching for renewal errors 829 select { 830 case err := <-renewCh: 831 // Clear the token 832 token = "" 833 r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err) 834 stopRenewal() 835 836 // Check if we have to do anything 837 if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop { 838 updatedToken = true 839 } 840 case <-r.waitCh: 841 stopRenewal() 842 return 843 } 844 } 845 } 846 847 // deriveVaultToken derives the Vault token using exponential backoffs. It 848 // returns the Vault token and whether the manager should exit. 
849 func (r *TaskRunner) deriveVaultToken() (token string, exit bool) { 850 attempts := 0 851 for { 852 tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name}) 853 if err == nil { 854 return tokens[r.task.Name], false 855 } 856 857 // Check if this is a server side error 858 if structs.IsServerSide(err) { 859 r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v", 860 r.task.Name, r.alloc.ID, err) 861 r.Kill("vault", fmt.Sprintf("server error deriving vault token: %v", err), true) 862 return "", true 863 } 864 // Check if we can't recover from the error 865 if !structs.IsRecoverable(err) { 866 r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v", 867 r.task.Name, r.alloc.ID, err) 868 r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true) 869 return "", true 870 } 871 872 // Handle the retry case 873 backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline 874 if backoff > vaultBackoffLimit { 875 backoff = vaultBackoffLimit 876 } 877 r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v", 878 r.task.Name, r.alloc.ID, err, backoff) 879 880 attempts++ 881 882 // Wait till retrying 883 select { 884 case <-r.waitCh: 885 return "", true 886 case <-time.After(backoff): 887 } 888 } 889 } 890 891 // writeToken writes the given token to disk 892 func (r *TaskRunner) writeToken(token string) error { 893 tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile) 894 if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil { 895 return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err) 896 } 897 898 return nil 899 } 900 901 // updatedTokenHandler is called when a new Vault token is retrieved. Things 902 // that rely on the token should be updated here. 
903 func (r *TaskRunner) updatedTokenHandler() { 904 905 // Update the tasks environment 906 r.envBuilder.SetVaultToken(r.vaultFuture.Get(), r.task.Vault.Env) 907 908 if r.templateManager != nil { 909 r.templateManager.Stop() 910 911 // Create a new templateManager 912 var err error 913 r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{ 914 Hooks: r, 915 Templates: r.task.Templates, 916 ClientConfig: r.config, 917 VaultToken: r.vaultFuture.Get(), 918 TaskDir: r.taskDir.Dir, 919 EnvBuilder: r.envBuilder, 920 MaxTemplateEventRate: DefaultMaxTemplateEventRate, 921 }) 922 923 if err != nil { 924 err := fmt.Errorf("failed to build task's template manager: %v", err) 925 r.setState(structs.TaskStateDead, 926 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), 927 false) 928 r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err) 929 r.Kill("vault", err.Error(), true) 930 return 931 } 932 } 933 } 934 935 // prestart handles life-cycle tasks that occur before the task has started. 936 // Since it's run asynchronously with the main Run() loop the alloc & task are 937 // passed in to avoid racing with updates. 
938 func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) { 939 if task.Vault != nil { 940 // Wait for the token 941 r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID) 942 tokenCh := r.vaultFuture.Wait() 943 select { 944 case <-tokenCh: 945 case <-r.waitCh: 946 resultCh <- false 947 return 948 } 949 r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", task.Name, alloc.ID) 950 r.envBuilder.SetVaultToken(r.vaultFuture.Get(), task.Vault.Env) 951 } 952 953 // If the job is a dispatch job and there is a payload write it to disk 954 requirePayload := len(alloc.Job.Payload) != 0 && 955 (r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "") 956 if !r.payloadRendered && requirePayload { 957 renderTo := filepath.Join(r.taskDir.LocalDir, task.DispatchPayload.File) 958 decoded, err := snappy.Decode(nil, alloc.Job.Payload) 959 if err != nil { 960 r.setState( 961 structs.TaskStateDead, 962 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), 963 false) 964 resultCh <- false 965 return 966 } 967 968 if err := os.MkdirAll(filepath.Dir(renderTo), 07777); err != nil { 969 r.setState( 970 structs.TaskStateDead, 971 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), 972 false) 973 resultCh <- false 974 return 975 } 976 977 if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil { 978 r.setState( 979 structs.TaskStateDead, 980 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), 981 false) 982 resultCh <- false 983 return 984 } 985 986 r.payloadRendered = true 987 } 988 989 for { 990 r.persistLock.Lock() 991 downloaded := r.artifactsDownloaded 992 r.persistLock.Unlock() 993 994 // Download the task's artifacts 995 if !downloaded && len(task.Artifacts) > 0 { 996 r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts), 
false) 997 taskEnv := r.envBuilder.Build() 998 for _, artifact := range task.Artifacts { 999 if err := getter.GetArtifact(taskEnv, artifact, r.taskDir.Dir); err != nil { 1000 wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err) 1001 r.logger.Printf("[DEBUG] client: %v", wrapped) 1002 r.setState(structs.TaskStatePending, 1003 structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped), false) 1004 r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err)) 1005 goto RESTART 1006 } 1007 } 1008 1009 r.persistLock.Lock() 1010 r.artifactsDownloaded = true 1011 r.persistLock.Unlock() 1012 } 1013 1014 // We don't have to wait for any template 1015 if len(task.Templates) == 0 { 1016 // Send the start signal 1017 select { 1018 case r.startCh <- struct{}{}: 1019 default: 1020 } 1021 1022 resultCh <- true 1023 return 1024 } 1025 1026 // Build the template manager 1027 if r.templateManager == nil { 1028 var err error 1029 r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{ 1030 Hooks: r, 1031 Templates: r.task.Templates, 1032 ClientConfig: r.config, 1033 VaultToken: r.vaultFuture.Get(), 1034 TaskDir: r.taskDir.Dir, 1035 EnvBuilder: r.envBuilder, 1036 MaxTemplateEventRate: DefaultMaxTemplateEventRate, 1037 }) 1038 if err != nil { 1039 err := fmt.Errorf("failed to build task's template manager: %v", err) 1040 r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), false) 1041 r.logger.Printf("[ERR] client: alloc %q, task %q %v", alloc.ID, task.Name, err) 1042 resultCh <- false 1043 return 1044 } 1045 } 1046 1047 // Block for consul-template 1048 // TODO Hooks should register themselves as blocking and then we can 1049 // periodically enumerate what we are still blocked on 1050 select { 1051 case <-r.unblockCh: 1052 // Send the start signal 1053 select { 1054 case r.startCh <- struct{}{}: 1055 default: 1056 } 1057 1058 
resultCh <- true 1059 return 1060 case <-r.waitCh: 1061 // The run loop has exited so exit too 1062 resultCh <- false 1063 return 1064 } 1065 1066 RESTART: 1067 restart := r.shouldRestart() 1068 if !restart { 1069 resultCh <- false 1070 return 1071 } 1072 } 1073 } 1074 1075 // postrun is used to do any cleanup that is necessary after exiting the runloop 1076 func (r *TaskRunner) postrun() { 1077 // Stop the template manager 1078 if r.templateManager != nil { 1079 r.templateManager.Stop() 1080 } 1081 } 1082 1083 // run is the main run loop that handles starting the application, destroying 1084 // it, restarts and signals. 1085 func (r *TaskRunner) run() { 1086 // Predeclare things so we can jump to the RESTART 1087 var stopCollection chan struct{} 1088 var handleWaitCh chan *dstructs.WaitResult 1089 1090 // If we already have a handle, populate the stopCollection and handleWaitCh 1091 // to fix the invariant that it exists. 1092 handleEmpty := r.getHandle() == nil 1093 1094 if !handleEmpty { 1095 stopCollection = make(chan struct{}) 1096 go r.collectResourceUsageStats(stopCollection) 1097 handleWaitCh = r.handle.WaitCh() 1098 } 1099 1100 for { 1101 // Do the prestart activities 1102 prestartResultCh := make(chan bool, 1) 1103 go r.prestart(r.alloc, r.task, prestartResultCh) 1104 1105 WAIT: 1106 for { 1107 select { 1108 case success := <-prestartResultCh: 1109 if !success { 1110 r.cleanup() 1111 r.setState(structs.TaskStateDead, nil, false) 1112 return 1113 } 1114 case <-r.startCh: 1115 // Start the task if not yet started or it is being forced. This logic 1116 // is necessary because in the case of a restore the handle already 1117 // exists. 
1118 handleEmpty := r.getHandle() == nil 1119 if handleEmpty { 1120 startErr := r.startTask() 1121 r.restartTracker.SetStartError(startErr) 1122 if startErr != nil { 1123 r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr), true) 1124 goto RESTART 1125 } 1126 1127 // Mark the task as started 1128 r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted), false) 1129 r.runningLock.Lock() 1130 r.running = true 1131 r.runningLock.Unlock() 1132 1133 if stopCollection == nil { 1134 stopCollection = make(chan struct{}) 1135 go r.collectResourceUsageStats(stopCollection) 1136 } 1137 1138 handleWaitCh = r.handle.WaitCh() 1139 } 1140 1141 case waitRes := <-handleWaitCh: 1142 if waitRes == nil { 1143 panic("nil wait") 1144 } 1145 1146 r.runningLock.Lock() 1147 r.running = false 1148 r.runningLock.Unlock() 1149 1150 // Stop collection of the task's resource usage 1151 close(stopCollection) 1152 1153 // Log whether the task was successful or not. 
1154 r.restartTracker.SetWaitResult(waitRes) 1155 r.setState("", r.waitErrorToEvent(waitRes), true) 1156 if !waitRes.Successful() { 1157 r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes) 1158 } else { 1159 r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID) 1160 } 1161 1162 break WAIT 1163 case update := <-r.updateCh: 1164 if err := r.handleUpdate(update); err != nil { 1165 r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err) 1166 } 1167 1168 case se := <-r.signalCh: 1169 r.runningLock.Lock() 1170 running := r.running 1171 r.runningLock.Unlock() 1172 common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID) 1173 if !running { 1174 // Send no error 1175 r.logger.Printf("[DEBUG] client: skipping %s", common) 1176 se.result <- nil 1177 continue 1178 } 1179 1180 r.logger.Printf("[DEBUG] client: sending %s", common) 1181 r.setState(structs.TaskStateRunning, se.e, false) 1182 1183 res := r.handle.Signal(se.s) 1184 se.result <- res 1185 1186 case restartEvent := <-r.restartCh: 1187 r.runningLock.Lock() 1188 running := r.running 1189 r.runningLock.Unlock() 1190 common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID) 1191 if !running { 1192 r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common) 1193 continue 1194 } 1195 1196 r.logger.Printf("[DEBUG] client: restarting %s: %v", common, restartEvent.taskEvent.RestartReason) 1197 r.setState(structs.TaskStateRunning, restartEvent.taskEvent, false) 1198 r.killTask(nil) 1199 1200 close(stopCollection) 1201 1202 if handleWaitCh != nil { 1203 <-handleWaitCh 1204 } 1205 1206 r.restartTracker.SetRestartTriggered(restartEvent.failure) 1207 break WAIT 1208 1209 case <-r.destroyCh: 1210 r.runningLock.Lock() 1211 running := r.running 1212 r.runningLock.Unlock() 1213 if !running { 1214 r.cleanup() 1215 r.setState(structs.TaskStateDead, 
r.destroyEvent, false) 1216 return 1217 } 1218 1219 // Remove from consul before killing the task so that traffic 1220 // can be rerouted 1221 interpTask := interpolateServices(r.envBuilder.Build(), r.task) 1222 r.consul.RemoveTask(r.alloc.ID, interpTask) 1223 1224 // Delay actually killing the task if configured. See #244 1225 if r.task.ShutdownDelay > 0 { 1226 r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q", 1227 r.alloc.ID, r.task.Name, r.task.ShutdownDelay) 1228 <-time.After(r.task.ShutdownDelay) 1229 } 1230 1231 // Store the task event that provides context on the task 1232 // destroy. The Killed event is set from the alloc_runner and 1233 // doesn't add detail 1234 var killEvent *structs.TaskEvent 1235 if r.destroyEvent.Type != structs.TaskKilled { 1236 if r.destroyEvent.Type == structs.TaskKilling { 1237 killEvent = r.destroyEvent 1238 } else { 1239 r.setState(structs.TaskStateRunning, r.destroyEvent, false) 1240 } 1241 } 1242 1243 r.killTask(killEvent) 1244 close(stopCollection) 1245 1246 // Wait for handler to exit before calling cleanup 1247 <-handleWaitCh 1248 r.cleanup() 1249 1250 r.setState(structs.TaskStateDead, nil, false) 1251 return 1252 } 1253 } 1254 1255 RESTART: 1256 // shouldRestart will block if the task should restart after a delay. 1257 restart := r.shouldRestart() 1258 if !restart { 1259 r.cleanup() 1260 r.setState(structs.TaskStateDead, nil, false) 1261 return 1262 } 1263 1264 // Clear the handle so a new driver will be created. 1265 r.handleLock.Lock() 1266 r.handle = nil 1267 handleWaitCh = nil 1268 stopCollection = nil 1269 r.handleLock.Unlock() 1270 } 1271 } 1272 1273 // cleanup removes Consul entries and calls Driver.Cleanup when a task is 1274 // stopping. Errors are logged. 
1275 func (r *TaskRunner) cleanup() { 1276 // Remove from Consul 1277 interpTask := interpolateServices(r.envBuilder.Build(), r.task) 1278 r.consul.RemoveTask(r.alloc.ID, interpTask) 1279 1280 drv, err := r.createDriver() 1281 if err != nil { 1282 r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err) 1283 return 1284 } 1285 1286 res := r.getCreatedResources() 1287 1288 ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build()) 1289 attempts := 1 1290 var cleanupErr error 1291 for retry := true; retry; attempts++ { 1292 cleanupErr = drv.Cleanup(ctx, res) 1293 retry = structs.IsRecoverable(cleanupErr) 1294 1295 // Copy current createdResources state in case SaveState is 1296 // called between retries 1297 r.setCreatedResources(res) 1298 1299 // Retry 3 times with sleeps between 1300 if !retry || attempts > 3 { 1301 break 1302 } 1303 time.Sleep(time.Duration(attempts) * time.Second) 1304 } 1305 1306 if cleanupErr != nil { 1307 r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr) 1308 } 1309 return 1310 } 1311 1312 // shouldRestart returns if the task should restart. If the return value is 1313 // true, the task's restart policy has already been considered and any wait time 1314 // between restarts has been applied. 1315 func (r *TaskRunner) shouldRestart() bool { 1316 state, when := r.restartTracker.GetState() 1317 reason := r.restartTracker.GetReason() 1318 switch state { 1319 case structs.TaskNotRestarting, structs.TaskTerminated: 1320 r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID) 1321 if state == structs.TaskNotRestarting { 1322 r.setState(structs.TaskStateDead, 1323 structs.NewTaskEvent(structs.TaskNotRestarting). 
1324 SetRestartReason(reason).SetFailsTask(), 1325 false) 1326 } 1327 return false 1328 case structs.TaskRestarting: 1329 r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when) 1330 r.setState(structs.TaskStatePending, 1331 structs.NewTaskEvent(structs.TaskRestarting). 1332 SetRestartDelay(when). 1333 SetRestartReason(reason), 1334 false) 1335 default: 1336 r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state) 1337 return false 1338 } 1339 1340 // Unregister from Consul while waiting to restart. 1341 interpTask := interpolateServices(r.envBuilder.Build(), r.task) 1342 r.consul.RemoveTask(r.alloc.ID, interpTask) 1343 1344 // Sleep but watch for destroy events. 1345 select { 1346 case <-time.After(when): 1347 case <-r.destroyCh: 1348 } 1349 1350 // Destroyed while we were waiting to restart, so abort. 1351 r.destroyLock.Lock() 1352 destroyed := r.destroy 1353 r.destroyLock.Unlock() 1354 if destroyed { 1355 r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name) 1356 r.setState(structs.TaskStateDead, r.destroyEvent, false) 1357 return false 1358 } 1359 1360 return true 1361 } 1362 1363 // killTask kills the running task. A killing event can optionally be passed and 1364 // this event is used to mark the task as being killed. It provides a means to 1365 // store extra information. 
1366 func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) { 1367 r.runningLock.Lock() 1368 running := r.running 1369 r.runningLock.Unlock() 1370 if !running { 1371 return 1372 } 1373 1374 // Get the kill timeout 1375 timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout) 1376 1377 // Build the event 1378 var event *structs.TaskEvent 1379 if killingEvent != nil { 1380 event = killingEvent 1381 event.Type = structs.TaskKilling 1382 } else { 1383 event = structs.NewTaskEvent(structs.TaskKilling) 1384 } 1385 event.SetKillTimeout(timeout) 1386 1387 // Mark that we received the kill event 1388 r.setState(structs.TaskStateRunning, event, false) 1389 1390 handle := r.getHandle() 1391 1392 // Kill the task using an exponential backoff in-case of failures. 1393 destroySuccess, err := r.handleDestroy(handle) 1394 if !destroySuccess { 1395 // We couldn't successfully destroy the resource created. 1396 r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err) 1397 } 1398 1399 r.runningLock.Lock() 1400 r.running = false 1401 r.runningLock.Unlock() 1402 1403 // Store that the task has been destroyed and any associated error. 1404 r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err), true) 1405 } 1406 1407 // startTask creates the driver, task dir, and starts the task. 
1408 func (r *TaskRunner) startTask() error { 1409 // Create a driver 1410 drv, err := r.createDriver() 1411 if err != nil { 1412 return fmt.Errorf("failed to create driver of task %q for alloc %q: %v", 1413 r.task.Name, r.alloc.ID, err) 1414 } 1415 1416 // Run prestart 1417 ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build()) 1418 presp, err := drv.Prestart(ctx, r.task) 1419 1420 // Merge newly created resources into previously created resources 1421 if presp != nil { 1422 r.createdResourcesLock.Lock() 1423 r.createdResources.Merge(presp.CreatedResources) 1424 r.createdResourcesLock.Unlock() 1425 1426 // Set any network configuration returned by the driver 1427 r.envBuilder.SetDriverNetwork(presp.Network) 1428 } 1429 1430 if err != nil { 1431 wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v", 1432 r.task.Name, r.alloc.ID, err) 1433 r.logger.Printf("[WARN] client: error from prestart: %s", wrapped) 1434 return structs.WrapRecoverable(wrapped, err) 1435 } 1436 1437 // Create a new context for Start since the environment may have been updated. 
1438 ctx = driver.NewExecContext(r.taskDir, r.envBuilder.Build()) 1439 1440 // Start the job 1441 sresp, err := drv.Start(ctx, r.task) 1442 if err != nil { 1443 wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v", 1444 r.task.Name, r.alloc.ID, err) 1445 r.logger.Printf("[WARN] client: %s", wrapped) 1446 return structs.WrapRecoverable(wrapped, err) 1447 1448 } 1449 1450 // Log driver network information 1451 if sresp.Network != nil && sresp.Network.IP != "" { 1452 if sresp.Network.AutoAdvertise { 1453 r.logger.Printf("[INFO] client: alloc %s task %s auto-advertising detected IP %s", 1454 r.alloc.ID, r.task.Name, sresp.Network.IP) 1455 } else { 1456 r.logger.Printf("[TRACE] client: alloc %s task %s detected IP %s but not auto-advertising", 1457 r.alloc.ID, r.task.Name, sresp.Network.IP) 1458 } 1459 } 1460 1461 if sresp.Network == nil || sresp.Network.IP == "" { 1462 r.logger.Printf("[TRACE] client: alloc %s task %s could not detect a driver IP", r.alloc.ID, r.task.Name) 1463 } 1464 1465 // Update environment with the network defined by the driver's Start method. 1466 r.envBuilder.SetDriverNetwork(sresp.Network) 1467 1468 if err := r.registerServices(drv, sresp.Handle, sresp.Network); err != nil { 1469 // All IO is done asynchronously, so errors from registering 1470 // services are hard failures. 1471 r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err) 1472 1473 // Kill the started task 1474 if destroyed, err := r.handleDestroy(sresp.Handle); !destroyed { 1475 r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. 
Resources may be leaked: %v", 1476 r.task.Name, r.alloc.ID, err) 1477 } 1478 return structs.NewRecoverableError(err, false) 1479 } 1480 1481 r.handleLock.Lock() 1482 r.handle = sresp.Handle 1483 r.handleLock.Unlock() 1484 1485 // Need to persist the driver network between restarts 1486 r.driverNetLock.Lock() 1487 r.driverNet = sresp.Network 1488 r.driverNetLock.Unlock() 1489 1490 return nil 1491 } 1492 1493 // registerServices and checks with Consul. 1494 func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n *cstructs.DriverNetwork) error { 1495 var exec driver.ScriptExecutor 1496 if d.Abilities().Exec { 1497 // Allow set the script executor if the driver supports it 1498 exec = h 1499 } 1500 interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task) 1501 return r.consul.RegisterTask(r.alloc.ID, interpolatedTask, r, exec, n) 1502 } 1503 1504 // interpolateServices interpolates tags in a service and checks with values from the 1505 // task's environment. 
1506 func interpolateServices(taskEnv *env.TaskEnv, task *structs.Task) *structs.Task { 1507 taskCopy := task.Copy() 1508 for _, service := range taskCopy.Services { 1509 for _, check := range service.Checks { 1510 check.Name = taskEnv.ReplaceEnv(check.Name) 1511 check.Type = taskEnv.ReplaceEnv(check.Type) 1512 check.Command = taskEnv.ReplaceEnv(check.Command) 1513 check.Args = taskEnv.ParseAndReplace(check.Args) 1514 check.Path = taskEnv.ReplaceEnv(check.Path) 1515 check.Protocol = taskEnv.ReplaceEnv(check.Protocol) 1516 check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel) 1517 check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus) 1518 check.Method = taskEnv.ReplaceEnv(check.Method) 1519 if len(check.Header) > 0 { 1520 header := make(map[string][]string, len(check.Header)) 1521 for k, vs := range check.Header { 1522 newVals := make([]string, len(vs)) 1523 for i, v := range vs { 1524 newVals[i] = taskEnv.ReplaceEnv(v) 1525 } 1526 header[taskEnv.ReplaceEnv(k)] = newVals 1527 } 1528 check.Header = header 1529 } 1530 } 1531 service.Name = taskEnv.ReplaceEnv(service.Name) 1532 service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel) 1533 service.Tags = taskEnv.ParseAndReplace(service.Tags) 1534 } 1535 return taskCopy 1536 } 1537 1538 // buildTaskDir creates the task directory before driver.Prestart. It is safe 1539 // to call multiple times as its state is persisted. 1540 func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error { 1541 r.persistLock.Lock() 1542 built := r.taskDirBuilt 1543 r.persistLock.Unlock() 1544 1545 // We do not set the state again since this only occurs during restoration 1546 // and the task dir is already built. The reason we call Build again is to 1547 // ensure that the task dir invariants are still held. 
1548 if !built { 1549 r.setState(structs.TaskStatePending, 1550 structs.NewTaskEvent(structs.TaskSetup).SetMessage(structs.TaskBuildingTaskDir), 1551 false) 1552 } 1553 1554 chroot := config.DefaultChrootEnv 1555 if len(r.config.ChrootEnv) > 0 { 1556 chroot = r.config.ChrootEnv 1557 } 1558 if err := r.taskDir.Build(built, chroot, fsi); err != nil { 1559 return err 1560 } 1561 1562 // Mark task dir as successfully built 1563 r.persistLock.Lock() 1564 r.taskDirBuilt = true 1565 r.persistLock.Unlock() 1566 1567 // Set path and host related env vars 1568 driver.SetEnvvars(r.envBuilder, fsi, r.taskDir, r.config) 1569 return nil 1570 } 1571 1572 // collectResourceUsageStats starts collecting resource usage stats of a Task. 1573 // Collection ends when the passed channel is closed 1574 func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) { 1575 // start collecting the stats right away and then start collecting every 1576 // collection interval 1577 next := time.NewTimer(0) 1578 defer next.Stop() 1579 for { 1580 select { 1581 case <-next.C: 1582 next.Reset(r.config.StatsCollectionInterval) 1583 handle := r.getHandle() 1584 if handle == nil { 1585 continue 1586 } 1587 ru, err := handle.Stats() 1588 1589 if err != nil { 1590 // Check if the driver doesn't implement stats 1591 if err.Error() == driver.DriverStatsNotImplemented.Error() { 1592 r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID) 1593 return 1594 } 1595 1596 // We do not log when the plugin is shutdown as this is simply a 1597 // race between the stopCollection channel being closed and calling 1598 // Stats on the handle. 
1599 if !strings.Contains(err.Error(), "connection is shut down") { 1600 r.logger.Printf("[DEBUG] client: error fetching stats of task %v: %v", r.task.Name, err) 1601 } 1602 continue 1603 } 1604 1605 r.resourceUsageLock.Lock() 1606 r.resourceUsage = ru 1607 r.resourceUsageLock.Unlock() 1608 if ru != nil { 1609 r.emitStats(ru) 1610 } 1611 case <-stopCollection: 1612 return 1613 } 1614 } 1615 } 1616 1617 // LatestResourceUsage returns the last resource utilization datapoint collected 1618 func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage { 1619 r.resourceUsageLock.RLock() 1620 defer r.resourceUsageLock.RUnlock() 1621 r.runningLock.Lock() 1622 defer r.runningLock.Unlock() 1623 1624 // If the task is not running there can be no latest resource 1625 if !r.running { 1626 return nil 1627 } 1628 1629 return r.resourceUsage 1630 } 1631 1632 // handleUpdate takes an updated allocation and updates internal state to 1633 // reflect the new config for the task. 1634 func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { 1635 // Extract the task group from the alloc. 1636 tg := update.Job.LookupTaskGroup(update.TaskGroup) 1637 if tg == nil { 1638 return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup) 1639 } 1640 1641 // Extract the task. 1642 var updatedTask *structs.Task 1643 for _, t := range tg.Tasks { 1644 if t.Name == r.task.Name { 1645 updatedTask = t.Copy() 1646 break 1647 } 1648 } 1649 if updatedTask == nil { 1650 return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name) 1651 } 1652 1653 // Merge in the task resources 1654 updatedTask.Resources = update.TaskResources[updatedTask.Name] 1655 1656 // Interpolate the old task with the old env before updating the env as 1657 // updating services in Consul need both the old and new interpolations 1658 // to find differences. 
1659 oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), r.task) 1660 1661 // Now it's safe to update the environment 1662 r.envBuilder.UpdateTask(update, updatedTask) 1663 1664 var mErr multierror.Error 1665 r.handleLock.Lock() 1666 if r.handle != nil { 1667 drv, err := r.createDriver() 1668 if err != nil { 1669 // Something has really gone wrong; don't continue 1670 r.handleLock.Unlock() 1671 return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err) 1672 } 1673 1674 // Update will update resources and store the new kill timeout. 1675 if err := r.handle.Update(updatedTask); err != nil { 1676 mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err)) 1677 } 1678 1679 // Update services in Consul 1680 newInterpolatedTask := interpolateServices(r.envBuilder.Build(), updatedTask) 1681 if err := r.updateServices(drv, r.handle, oldInterpolatedTask, newInterpolatedTask); err != nil { 1682 mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err)) 1683 } 1684 } 1685 r.handleLock.Unlock() 1686 1687 // Update the restart policy. 1688 if r.restartTracker != nil { 1689 r.restartTracker.SetPolicy(tg.RestartPolicy) 1690 } 1691 1692 // Store the updated alloc. 1693 r.alloc = update 1694 r.task = updatedTask 1695 return mErr.ErrorOrNil() 1696 } 1697 1698 // updateServices and checks with Consul. Tasks must be interpolated! 1699 func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, oldTask, newTask *structs.Task) error { 1700 var exec driver.ScriptExecutor 1701 if d.Abilities().Exec { 1702 // Allow set the script executor if the driver supports it 1703 exec = h 1704 } 1705 r.driverNetLock.Lock() 1706 net := r.driverNet.Copy() 1707 r.driverNetLock.Unlock() 1708 return r.consul.UpdateTask(r.alloc.ID, oldTask, newTask, r, exec, net) 1709 } 1710 1711 // handleDestroy kills the task handle. 
In the case that killing fails, 1712 // handleDestroy will retry with an exponential backoff and will give up at a 1713 // given limit. It returns whether the task was destroyed and the error 1714 // associated with the last kill attempt. 1715 func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) { 1716 // Cap the number of times we attempt to kill the task. 1717 for i := 0; i < killFailureLimit; i++ { 1718 if err = handle.Kill(); err != nil { 1719 // Calculate the new backoff 1720 backoff := (1 << (2 * uint64(i))) * killBackoffBaseline 1721 if backoff > killBackoffLimit { 1722 backoff = killBackoffLimit 1723 } 1724 1725 r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v", 1726 r.task.Name, r.alloc.ID, backoff, err) 1727 time.Sleep(backoff) 1728 } else { 1729 // Kill was successful 1730 return true, nil 1731 } 1732 } 1733 return 1734 } 1735 1736 // Restart will restart the task. 1737 func (r *TaskRunner) Restart(source, reason string, failure bool) { 1738 reasonStr := fmt.Sprintf("%s: %s", source, reason) 1739 event := newTaskRestartEvent(reasonStr, failure) 1740 1741 select { 1742 case r.restartCh <- event: 1743 case <-r.waitCh: 1744 } 1745 } 1746 1747 // Signal will send a signal to the task 1748 func (r *TaskRunner) Signal(source, reason string, s os.Signal) error { 1749 1750 reasonStr := fmt.Sprintf("%s: %s", source, reason) 1751 event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr) 1752 1753 resCh := make(chan error) 1754 se := SignalEvent{ 1755 s: s, 1756 e: event, 1757 result: resCh, 1758 } 1759 1760 select { 1761 case r.signalCh <- se: 1762 case <-r.waitCh: 1763 } 1764 1765 return <-resCh 1766 } 1767 1768 // Kill will kill a task and store the error, no longer restarting the task. If 1769 // fail is set, the task is marked as having failed. 
1770 func (r *TaskRunner) Kill(source, reason string, fail bool) { 1771 reasonStr := fmt.Sprintf("%s: %s", source, reason) 1772 event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr) 1773 if fail { 1774 event.SetFailsTask() 1775 } 1776 1777 r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr) 1778 r.Destroy(event) 1779 } 1780 1781 func (r *TaskRunner) EmitEvent(source, message string) { 1782 event := structs.NewTaskEvent(source). 1783 SetMessage(message) 1784 r.setState("", event, false) 1785 r.logger.Printf("[DEBUG] client: event from %q for task %q in alloc %q: %v", 1786 source, r.task.Name, r.alloc.ID, message) 1787 } 1788 1789 // UnblockStart unblocks the starting of the task. It currently assumes only 1790 // consul-template will unblock 1791 func (r *TaskRunner) UnblockStart(source string) { 1792 r.unblockLock.Lock() 1793 defer r.unblockLock.Unlock() 1794 if r.unblocked { 1795 return 1796 } 1797 1798 r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source) 1799 r.unblocked = true 1800 close(r.unblockCh) 1801 } 1802 1803 // Helper function for converting a WaitResult into a TaskTerminated event. 1804 func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent { 1805 return structs.NewTaskEvent(structs.TaskTerminated). 1806 SetExitCode(res.ExitCode). 1807 SetSignal(res.Signal). 1808 SetExitMessage(res.Err) 1809 } 1810 1811 // Update is used to update the task of the context 1812 func (r *TaskRunner) Update(update *structs.Allocation) { 1813 select { 1814 case r.updateCh <- update: 1815 default: 1816 r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')", 1817 r.task.Name, r.alloc.ID) 1818 } 1819 } 1820 1821 // Destroy is used to indicate that the task context should be destroyed. The 1822 // event parameter provides a context for the destroy. 
1823 func (r *TaskRunner) Destroy(event *structs.TaskEvent) { 1824 r.destroyLock.Lock() 1825 defer r.destroyLock.Unlock() 1826 1827 if r.destroy { 1828 return 1829 } 1830 r.destroy = true 1831 r.destroyEvent = event 1832 close(r.destroyCh) 1833 } 1834 1835 // getCreatedResources returns the resources created by drivers. It will never 1836 // return nil. 1837 func (r *TaskRunner) getCreatedResources() *driver.CreatedResources { 1838 r.createdResourcesLock.Lock() 1839 if r.createdResources == nil { 1840 r.createdResources = driver.NewCreatedResources() 1841 } 1842 cr := r.createdResources.Copy() 1843 r.createdResourcesLock.Unlock() 1844 1845 return cr 1846 } 1847 1848 // setCreatedResources updates the resources created by drivers. If passed nil 1849 // it will set createdResources to an initialized struct. 1850 func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) { 1851 if cr == nil { 1852 cr = driver.NewCreatedResources() 1853 } 1854 r.createdResourcesLock.Lock() 1855 r.createdResources = cr.Copy() 1856 r.createdResourcesLock.Unlock() 1857 } 1858 1859 func (r *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) { 1860 if !r.config.DisableTaggedMetrics { 1861 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"}, 1862 float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels) 1863 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"}, 1864 float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels) 1865 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"}, 1866 float32(ru.ResourceUsage.MemoryStats.Cache), r.baseLabels) 1867 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"}, 1868 float32(ru.ResourceUsage.MemoryStats.Swap), r.baseLabels) 1869 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"}, 1870 float32(ru.ResourceUsage.MemoryStats.MaxUsage), r.baseLabels) 1871 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", 
"kernel_usage"}, 1872 float32(ru.ResourceUsage.MemoryStats.KernelUsage), r.baseLabels) 1873 metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"}, 1874 float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), r.baseLabels) 1875 } 1876 1877 if r.config.BackwardsCompatibleMetrics { 1878 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS)) 1879 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache)) 1880 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap)) 1881 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage)) 1882 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage)) 1883 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage)) 1884 } 1885 } 1886 1887 func (r *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) { 1888 if !r.config.DisableTaggedMetrics { 1889 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_percent"}, 1890 float32(ru.ResourceUsage.CpuStats.Percent), r.baseLabels) 1891 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "system"}, 1892 float32(ru.ResourceUsage.CpuStats.SystemMode), r.baseLabels) 1893 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "user"}, 1894 float32(ru.ResourceUsage.CpuStats.UserMode), r.baseLabels) 1895 
metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_time"}, 1896 float32(ru.ResourceUsage.CpuStats.ThrottledTime), r.baseLabels) 1897 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "throttled_periods"}, 1898 float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), r.baseLabels) 1899 metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"}, 1900 float32(ru.ResourceUsage.CpuStats.TotalTicks), r.baseLabels) 1901 } 1902 1903 if r.config.BackwardsCompatibleMetrics { 1904 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent)) 1905 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode)) 1906 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode)) 1907 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime)) 1908 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods)) 1909 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks)) 1910 } 1911 } 1912 1913 // emitStats emits resource usage stats of tasks to remote metrics collector 1914 // sinks 1915 func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { 1916 if !r.config.PublishAllocationMetrics { 1917 return 1918 } 1919 1920 // If the task is not running don't emit anything 1921 r.runningLock.Lock() 1922 running := r.running 1923 
r.runningLock.Unlock() 1924 if !running { 1925 return 1926 } 1927 1928 if ru.ResourceUsage.MemoryStats != nil { 1929 r.setGaugeForMemory(ru) 1930 } 1931 1932 if ru.ResourceUsage.CpuStats != nil { 1933 r.setGaugeForCPU(ru) 1934 } 1935 }