github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/client/task_runner.go

package client

import (
	"bytes"
	"crypto/md5"
	"encoding/hex"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	metrics "github.com/armon/go-metrics"
	"github.com/boltdb/bolt"
	"github.com/golang/snappy"
	"github.com/hashicorp/consul-template/signals"
	"github.com/hashicorp/go-multierror"
	version "github.com/hashicorp/go-version"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/driver"
	"github.com/hashicorp/nomad/client/getter"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/ugorji/go/codec"

	"github.com/hashicorp/nomad/client/driver/env"
	dstructs "github.com/hashicorp/nomad/client/driver/structs"
	cstructs "github.com/hashicorp/nomad/client/structs"
)

const (
	// killBackoffBaseline is the baseline time for exponential backoff while
	// killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the limit of the exponential backoff for killing
	// the task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is how many times we will attempt to kill a task before
	// giving up and potentially leaking resources.
	killFailureLimit = 5

	// vaultBackoffBaseline is the baseline time for exponential backoff when
	// attempting to retrieve a Vault token
	vaultBackoffBaseline = 5 * time.Second

	// vaultBackoffLimit is the limit of the exponential backoff when attempting
	// to retrieve a Vault token
	vaultBackoffLimit = 3 * time.Minute

	// vaultTokenFile is the name of the file holding the Vault token inside the
	// task's secret directory
	vaultTokenFile = "vault_token"
)

var (
	// taskRunnerStateAllKey holds all the task runner state. At the moment
	// there is no need to split it
	taskRunnerStateAllKey = []byte("simple-all")
)

// taskRestartEvent wraps a TaskEvent with additional metadata to control
// restart behavior.
type taskRestartEvent struct {
	// taskEvent to report
	taskEvent *structs.TaskEvent

	// if false, don't count against restart count
	failure bool
}

func newTaskRestartEvent(reason string, failure bool) *taskRestartEvent {
	return &taskRestartEvent{
		taskEvent: structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason),
		failure:   failure,
	}
}

// TaskRunner is used to wrap a task within an allocation and provide the execution context.
type TaskRunner struct {
	stateDB        *bolt.DB
	config         *config.Config
	updater        TaskStateUpdater
	logger         *log.Logger
	restartTracker *RestartTracker
	consul         ConsulServiceAPI

	// running marks whether the task is running
	running     bool
	runningLock sync.Mutex

	resourceUsage     *cstructs.TaskResourceUsage
	resourceUsageLock sync.RWMutex

	alloc   *structs.Allocation
	task    *structs.Task
	taskDir *allocdir.TaskDir

	// envBuilder is used to build the task's environment
	envBuilder *env.Builder

	// driverNet is the network information returned by the driver
	driverNet     *cstructs.DriverNetwork
	driverNetLock sync.Mutex

	// updateCh is used to receive updated versions of the allocation
	updateCh chan *structs.Allocation

	handle     driver.DriverHandle
	handleLock sync.Mutex

	// artifactsDownloaded tracks whether the task's artifacts have been
	// downloaded
	//
	// Must acquire persistLock when accessing
	artifactsDownloaded bool

	// taskDirBuilt tracks whether the task has built its directory.
	//
	// Must acquire persistLock when accessing
	taskDirBuilt bool

	// createdResources are all the resources created by the task driver
	// across all attempts to start the task.
	// Simple gets and sets should use {get,set}CreatedResources
	createdResources     *driver.CreatedResources
	createdResourcesLock sync.Mutex

	// payloadRendered tracks whether the payload has been rendered to disk
	payloadRendered bool

	// vaultFuture is the means to wait for and get a Vault token
	vaultFuture *tokenFuture

	// recoveredVaultToken is the token that was recovered through a restore
	recoveredVaultToken string

	// vaultClient is used to retrieve and renew any needed Vault token
	vaultClient vaultclient.VaultClient

	// templateManager is used to manage any consul-templates this task may have
	templateManager *TaskTemplateManager

	// startCh is used to trigger the start of the task
	startCh chan struct{}

	// unblockCh is used to unblock the starting of the task
	unblockCh   chan struct{}
	unblocked   bool
	unblockLock sync.Mutex

	// restartCh is used to restart a task
	restartCh chan *taskRestartEvent

	// signalCh is used to send a signal to a task
	signalCh chan SignalEvent

	destroy      bool
	destroyCh    chan struct{}
	destroyLock  sync.Mutex
	destroyEvent *structs.TaskEvent

	// waitCh closing marks the run loop as having exited
	waitCh chan struct{}

	// persistLock must be acquired when accessing fields stored by
	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
	// AllocRunner, so all state fields must be synchronized using this
	// lock.
	persistLock sync.Mutex

	// persistedHash is the hash of the last persisted snapshot. It is used to
	// detect if a new snapshot has to be written to disk.
	persistedHash []byte

	// baseLabels are used when emitting tagged metrics. All task runner metrics
	// will have these tags, and optionally more.
	baseLabels []metrics.Label
}

// taskRunnerState is used to snapshot the state of the task runner
type taskRunnerState struct {
	Version            string
	HandleID           string
	ArtifactDownloaded bool
	TaskDirBuilt       bool
	PayloadRendered    bool
	CreatedResources   *driver.CreatedResources
	DriverNetwork      *cstructs.DriverNetwork
}

func (s *taskRunnerState) Hash() []byte {
	h := md5.New()

	io.WriteString(h, s.Version)
	io.WriteString(h, s.HandleID)
	io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded))
	io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt))
	io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered))
	h.Write(s.CreatedResources.Hash())
	h.Write(s.DriverNetwork.Hash())

	return h.Sum(nil)
}

// TaskStateUpdater is used to signal that a task's state has changed. If lazySync
// is set the event won't be immediately pushed to the server.
type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent, lazySync bool)

// SignalEvent is a tuple of the signal and the event generating it
type SignalEvent struct {
	// s is the signal to be sent
	s os.Signal

	// e is the task event generating the signal
	e *structs.TaskEvent

	// result should be used to send back the result of the signal
	result chan<- error
}

// NewTaskRunner is used to create a new task context
func NewTaskRunner(logger *log.Logger, config *config.Config,
	stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir,
	alloc *structs.Allocation, task *structs.Task,
	vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI) *TaskRunner {

	// Merge in the task resources
	task.Resources = alloc.TaskResources[task.Name]

	// Build the restart tracker.
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		logger.Printf("[ERR] client: alloc %q for missing task group %q", alloc.ID, alloc.TaskGroup)
		return nil
	}
	restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)

	// Initialize the environment builder
	envBuilder := env.NewBuilder(config.Node, alloc, task, config.Region)

	tc := &TaskRunner{
		config:           config,
		stateDB:          stateDB,
		updater:          updater,
		logger:           logger,
		restartTracker:   restartTracker,
		alloc:            alloc,
		task:             task,
		taskDir:          taskDir,
		envBuilder:       envBuilder,
		createdResources: driver.NewCreatedResources(),
		consul:           consulClient,
		vaultClient:      vaultClient,
		vaultFuture:      NewTokenFuture().Set(""),
		updateCh:         make(chan *structs.Allocation, 64),
		destroyCh:        make(chan struct{}),
		waitCh:           make(chan struct{}),
		startCh:          make(chan struct{}, 1),
		unblockCh:        make(chan struct{}),
		restartCh:        make(chan *taskRestartEvent),
		signalCh:         make(chan SignalEvent),
	}

	tc.baseLabels = []metrics.Label{
		{
			Name:  "job",
			Value: tc.alloc.Job.Name,
		},
		{
			Name:  "task_group",
			Value: tc.alloc.TaskGroup,
		},
		{
			Name:  "alloc_id",
			Value: tc.alloc.ID,
		},
		{
			Name:  "task",
			Value: tc.task.Name,
		},
	}

	return tc
}

// MarkReceived marks the task as received.
func (r *TaskRunner) MarkReceived() {
	// We lazy sync this since there will be a follow up message almost
	// immediately.
	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived), true)
}

// WaitCh returns a channel to wait for termination
func (r *TaskRunner) WaitCh() <-chan struct{} {
	return r.waitCh
}

// getHandle returns the task's handle or nil
func (r *TaskRunner) getHandle() driver.DriverHandle {
	r.handleLock.Lock()
	h := r.handle
	r.handleLock.Unlock()
	return h
}

// pre060StateFilePath returns the path to our state file that would have been
// written pre v0.6.0
// COMPAT: Remove in 0.7.0
func (r *TaskRunner) pre060StateFilePath() string {
	// Get the MD5 of the task name
	hashVal := md5.Sum([]byte(r.task.Name))
	hashHex := hex.EncodeToString(hashVal[:])
	dirName := fmt.Sprintf("task-%s", hashHex)

	// Generate the path
	return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json")
}

// RestoreState is used to restore our state. If a non-empty string is returned
// the task is restarted with the string as the reason. This is useful for
// backwards incompatible upgrades that need to restart tasks with a new
// executor.
func (r *TaskRunner) RestoreState() (string, error) {
	// COMPAT: Remove in 0.7.0
	// 0.6.0 transitioned from individual state files to a single bolt-db.
	// The upgrade path is to:
	// Check if old state exists
	// If so, restore from that and delete old state
	// Restore using state database

	var snap taskRunnerState

	// Check if the old snapshot is there
	oldPath := r.pre060StateFilePath()
	if err := pre060RestoreState(oldPath, &snap); err == nil {
		// Delete the old state
		os.RemoveAll(oldPath)
	} else if !os.IsNotExist(err) {
		// Something corrupt in the old state file
		return "", err
	} else {
		// We are doing a normal restore
		err := r.stateDB.View(func(tx *bolt.Tx) error {
			bkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
			if err != nil {
				return fmt.Errorf("failed to get task bucket: %v", err)
			}

			if err := getObject(bkt, taskRunnerStateAllKey, &snap); err != nil {
				return fmt.Errorf("failed to read task runner state: %v", err)
			}
			return nil
		})
		if err != nil {
			return "", err
		}

	}

	// Restore fields from the snapshot
	r.artifactsDownloaded = snap.ArtifactDownloaded
	r.taskDirBuilt = snap.TaskDirBuilt
	r.payloadRendered = snap.PayloadRendered
	r.setCreatedResources(snap.CreatedResources)
	r.driverNet = snap.DriverNetwork

	if r.task.Vault != nil {
		// Read the token from the secret directory
		tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
		data, err := ioutil.ReadFile(tokenPath)
		if err != nil {
			if !os.IsNotExist(err) {
				return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
			}

			// Token file doesn't exist
		} else {
			// Store the recovered token
			r.recoveredVaultToken = string(data)
		}
	}

	// Restore the driver
	restartReason := ""
	if snap.HandleID != "" {
		d, err := r.createDriver()
		if err != nil {
			return "", err
		}

		// Add the restored network driver to the environment
		r.envBuilder.SetDriverNetwork(r.driverNet)

		// Open a connection to the driver handle
		ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
		handle, err := d.Open(ctx, snap.HandleID)

		// In the case it fails, we relaunch the task in the Run() method.
		if err != nil {
			r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
				r.task.Name, r.alloc.ID, err)
			return "", nil
		}

		if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) {
			restartReason = pre06ScriptCheckReason
		}

		if err := r.registerServices(d, handle, r.driverNet); err != nil {
			// Don't hard fail here as there's a chance this task
			// registered with Consul properly when it initially
			// started.
			r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v",
				r.task.Name, r.alloc.ID, err)
		}

		r.handleLock.Lock()
		r.handle = handle
		r.handleLock.Unlock()

		r.runningLock.Lock()
		r.running = true
		r.runningLock.Unlock()
	}
	return restartReason, nil
}

// ver06 is used for checking for pre-0.6 script checks
var ver06 = version.Must(version.NewVersion("0.6.0dev"))

// pre06ScriptCheckReason is the restart reason given when a pre-0.6 script
// check is found on an exec/java task.
const pre06ScriptCheckReason = "upgrading pre-0.6 script checks"

// pre06ScriptCheck returns true if version is prior to 0.6.0dev, has a script
// check, and uses exec or java drivers.
func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool {
	if driver != "exec" && driver != "java" && driver != "mock_driver" {
		// Only exec and java are affected
		return false
	}
	v, err := version.NewVersion(ver)
	if err != nil {
		// Treat it as old
		return true
	}
	if !v.LessThan(ver06) {
		// >= 0.6.0dev
		return false
	}
	for _, service := range services {
		for _, check := range service.Checks {
			if check.Type == "script" {
				return true
			}
		}
	}
	return false
}

// SaveState is used to snapshot our state
func (r *TaskRunner) SaveState() error {
	r.destroyLock.Lock()
	defer r.destroyLock.Unlock()
	if r.destroy {
		// Don't save state if already destroyed
		return nil
	}

	r.persistLock.Lock()
	defer r.persistLock.Unlock()
	snap := taskRunnerState{
		Version:            r.config.Version.VersionNumber(),
		ArtifactDownloaded: r.artifactsDownloaded,
		TaskDirBuilt:       r.taskDirBuilt,
		PayloadRendered:    r.payloadRendered,
		CreatedResources:   r.getCreatedResources(),
	}

	r.handleLock.Lock()
	if r.handle != nil {
		snap.HandleID = r.handle.ID()
	}
	r.handleLock.Unlock()

	r.driverNetLock.Lock()
	snap.DriverNetwork = r.driverNet.Copy()
	r.driverNetLock.Unlock()

	// If nothing has changed avoid the write
	h := snap.Hash()
	if bytes.Equal(h, r.persistedHash) {
		return nil
	}

	// Serialize the object
	var buf bytes.Buffer
	if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil {
		return fmt.Errorf("failed to serialize snapshot: %v", err)
	}

	// Start the transaction.
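	// Note: bolt's Batch may coalesce this write with other concurrent
	// callers into a single transaction and may invoke the function more
	// than once, so the persisted hash is only recorded via OnCommit once
	// the transaction actually commits.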
	return r.stateDB.Batch(func(tx *bolt.Tx) error {
		// Grab the task bucket
		taskBkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
		if err != nil {
			return fmt.Errorf("failed to retrieve allocation bucket: %v", err)
		}

		if err := putData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil {
			return fmt.Errorf("failed to write task_runner state: %v", err)
		}

		// Store the hash that was persisted
		tx.OnCommit(func() {
			r.persistedHash = h
		})

		return nil
	})
}

// DestroyState is used to cleanup after ourselves
func (r *TaskRunner) DestroyState() error {
	r.persistLock.Lock()
	defer r.persistLock.Unlock()

	return r.stateDB.Update(func(tx *bolt.Tx) error {
		if err := deleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil {
			return fmt.Errorf("failed to delete task bucket: %v", err)
		}
		return nil
	})
}

// setState is used to update the state of the task runner
func (r *TaskRunner) setState(state string, event *structs.TaskEvent, lazySync bool) {
	event.PopulateEventDisplayMessage()

	// Persist our state to disk.
	if err := r.SaveState(); err != nil {
		r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
	}

	// Indicate the task has been updated.
	r.updater(r.task.Name, state, event, lazySync)
}

// createDriver makes a driver for the task
func (r *TaskRunner) createDriver() (driver.Driver, error) {
	// Create a task-specific event emitter callback to expose minimal
	// state to drivers
	eventEmitter := func(m string, args ...interface{}) {
		msg := fmt.Sprintf(m, args...)
		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg), false)
	}

	driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, eventEmitter)
	d, err := driver.NewDriver(r.task.Driver, driverCtx)
	if err != nil {
		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
			r.task.Driver, r.alloc.ID, err)
	}

	return d, err
}

// Run is a long running routine used to manage the task
func (r *TaskRunner) Run() {
	defer close(r.waitCh)
	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
		r.task.Name, r.alloc.ID)

	if err := r.validateTask(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask(),
			false)
		return
	}

	// Create a temporary driver so that we can determine the FSIsolation
	// required. run->startTask will create a new driver after environment
	// has been setup (env vars, templates, artifacts, secrets, etc).
	tmpDrv, err := r.createDriver()
	if err != nil {
		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
			false)
		return
	}

	// Build base task directory structure regardless of FS isolation abilities.
	// This needs to happen before we start the Vault manager and call prestart
	// as both those can write to the task directories
	if err := r.buildTaskDir(tmpDrv.FSIsolation()); err != nil {
		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask(),
			false)
		return
	}

	// If there is no Vault policy leave the static future created in
	// NewTaskRunner
	if r.task.Vault != nil {
		// Start the go-routine to get a Vault token
		r.vaultFuture.Clear()
		go r.vaultManager(r.recoveredVaultToken)
	}

	// Start the run loop
	r.run()

	// Do any cleanup necessary
	r.postrun()

	return
}

// validateTask validates the fields of the task and returns an error if the
// task is invalid.
func (r *TaskRunner) validateTask() error {
	var mErr multierror.Error

	// Validate the user.
	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
		}
	}

	// Validate the artifacts
	for i, artifact := range r.task.Artifacts {
		// Verify the artifact doesn't escape the task directory.
		if err := artifact.Validate(); err != nil {
			// If this error occurs there is potentially a server bug or
			// malicious server spoofing.
			r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
				r.alloc.ID, r.task.Name, artifact, i, err)
			mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
		}
	}

	// Validate the Service names
	taskEnv := r.envBuilder.Build()
	for i, service := range r.task.Services {
		name := taskEnv.ReplaceEnv(service.Name)
		if err := service.ValidateName(name); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
		}
	}

	if len(mErr.Errors) == 1 {
		return mErr.Errors[0]
	}
	return mErr.ErrorOrNil()
}

// tokenFuture stores the Vault token and allows consumers to block till a valid
// token exists
type tokenFuture struct {
	waiting []chan struct{}
	token   string
	set     bool
	m       sync.Mutex
}

// NewTokenFuture returns a new token future without any token set
func NewTokenFuture() *tokenFuture {
	return &tokenFuture{}
}

// Wait returns a channel that can be waited on. When this channel unblocks, a
// valid token will be available via the Get method
func (f *tokenFuture) Wait() <-chan struct{} {
	f.m.Lock()
	defer f.m.Unlock()

	c := make(chan struct{})
	if f.set {
		close(c)
		return c
	}

	f.waiting = append(f.waiting, c)
	return c
}

// Set sets the token value and unblocks any caller of Wait
func (f *tokenFuture) Set(token string) *tokenFuture {
	f.m.Lock()
	defer f.m.Unlock()

	f.set = true
	f.token = token
	for _, w := range f.waiting {
		close(w)
	}
	f.waiting = nil
	return f
}

// Clear clears the set vault token.
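// After clearing, callers of Wait block again until Set is called.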
func (f *tokenFuture) Clear() *tokenFuture {
	f.m.Lock()
	defer f.m.Unlock()

	f.token = ""
	f.set = false
	return f
}

// Get returns the set Vault token
func (f *tokenFuture) Get() string {
	f.m.Lock()
	defer f.m.Unlock()
	return f.token
}

// vaultManager should be called in a go-routine and manages the derivation,
// renewal and handling of errors with the Vault token. The optional parameter
// allows setting the initial Vault token. This is useful when the Vault token
// is recovered off disk.
func (r *TaskRunner) vaultManager(token string) {
	// Helper for stopping token renewal
	stopRenewal := func() {
		if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
			r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
		}
	}

	// updatedToken lets us store state between loops. If true, a new token
	// has been retrieved and we need to apply the Vault change mode
	var updatedToken bool

OUTER:
	for {
		// Check if we should exit
		select {
		case <-r.waitCh:
			stopRenewal()
			return
		default:
		}

		// Clear the token
		r.vaultFuture.Clear()

		// Check if there already is a token which can be the case for
		// restoring the TaskRunner
		if token == "" {
			// Get a token
			var exit bool
			token, exit = r.deriveVaultToken()
			if exit {
				// Exit the manager
				return
			}

			// Write the token to disk
			if err := r.writeToken(token); err != nil {
				e := fmt.Errorf("failed to write Vault token to disk")
				r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
				r.Kill("vault", e.Error(), true)
				return
			}
		}

		// Start the renewal process
		renewCh, err := r.vaultClient.RenewToken(token, 30)

		// An error returned means the token is not being renewed
		if err != nil {
			r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
			token = ""
			goto OUTER
		}

		// The Vault token is valid now, so set it
		r.vaultFuture.Set(token)

		if updatedToken {
			switch r.task.Vault.ChangeMode {
			case structs.VaultChangeModeSignal:
				s, err := signals.Parse(r.task.Vault.ChangeSignal)
				if err != nil {
					e := fmt.Errorf("failed to parse signal: %v", err)
					r.logger.Printf("[ERR] client: %v", err)
					r.Kill("vault", e.Error(), true)
					return
				}

				if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
					r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
					r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
					return
				}
			case structs.VaultChangeModeRestart:
				const noFailure = false
				r.Restart("vault", "new Vault token acquired", noFailure)
			case structs.VaultChangeModeNoop:
				fallthrough
			default:
				r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
			}

			// We have handled it
			updatedToken = false

			// Call the handler
			r.updatedTokenHandler()
		}

		// Start watching for renewal errors
		select {
		case err := <-renewCh:
			// Clear the token
			token = ""
			r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
			stopRenewal()

			// Check if we have to do anything
			if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
				updatedToken = true
			}
		case <-r.waitCh:
			stopRenewal()
			return
		}
	}
}

// deriveVaultToken derives the Vault token using exponential backoffs. It
// returns the Vault token and whether the manager should exit.
func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
	attempts := 0
	for {
		tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
		if err == nil {
			return tokens[r.task.Name], false
		}

		// Check if we can't recover from the error
		if !structs.IsRecoverable(err) {
			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
				r.task.Name, r.alloc.ID, err)
			r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
			return "", true
		}

		// Handle the retry case
		backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
		if backoff > vaultBackoffLimit {
			backoff = vaultBackoffLimit
		}
		r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
			r.task.Name, r.alloc.ID, err, backoff)

		attempts++

		// Wait till retrying
		select {
		case <-r.waitCh:
			return "", true
		case <-time.After(backoff):
		}
	}
}

// writeToken writes the given token to disk
func (r *TaskRunner) writeToken(token string) error {
	tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
	if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
		return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
	}

	return nil
}

// updatedTokenHandler is called when a new Vault token is retrieved. Things
// that rely on the token should be updated here.
func (r *TaskRunner) updatedTokenHandler() {

	// Update the tasks environment
	r.envBuilder.SetVaultToken(r.vaultFuture.Get(), r.task.Vault.Env)

	if r.templateManager != nil {
		r.templateManager.Stop()

		// Create a new templateManager
		var err error
		r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
			Hooks:                r,
			Templates:            r.task.Templates,
			ClientConfig:         r.config,
			VaultToken:           r.vaultFuture.Get(),
			TaskDir:              r.taskDir.Dir,
			EnvBuilder:           r.envBuilder,
			MaxTemplateEventRate: DefaultMaxTemplateEventRate,
		})

		if err != nil {
			err := fmt.Errorf("failed to build task's template manager: %v", err)
			r.setState(structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
				false)
			r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
			r.Kill("vault", err.Error(), true)
			return
		}
	}
}

// prestart handles life-cycle tasks that occur before the task has started.
// Since it's run asynchronously with the main Run() loop the alloc & task are
// passed in to avoid racing with updates.
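// The result sent on resultCh reports whether prestart succeeded and the run
// loop should proceed to start the task.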
func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) {
	if task.Vault != nil {
		// Wait for the token
		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID)
		tokenCh := r.vaultFuture.Wait()
		select {
		case <-tokenCh:
		case <-r.waitCh:
			resultCh <- false
			return
		}
		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", task.Name, alloc.ID)
		r.envBuilder.SetVaultToken(r.vaultFuture.Get(), task.Vault.Env)
	}

	// If the job is a dispatch job and there is a payload write it to disk
	requirePayload := len(alloc.Job.Payload) != 0 &&
		(r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
	if !r.payloadRendered && requirePayload {
		renderTo := filepath.Join(r.taskDir.LocalDir, task.DispatchPayload.File)
		decoded, err := snappy.Decode(nil, alloc.Job.Payload)
		if err != nil {
			r.setState(
				structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
				false)
			resultCh <- false
			return
		}

		if err := os.MkdirAll(filepath.Dir(renderTo), 0777); err != nil {
			r.setState(
				structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
				false)
			resultCh <- false
			return
		}

		if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
			r.setState(
				structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(),
				false)
			resultCh <- false
			return
		}

		r.payloadRendered = true
	}

	for {
		r.persistLock.Lock()
		downloaded := r.artifactsDownloaded
		r.persistLock.Unlock()

		// Download the task's artifacts
		if !downloaded && len(task.Artifacts) > 0 {
			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts), false)
			taskEnv := r.envBuilder.Build()
			for _, artifact := range task.Artifacts {
				if err := getter.GetArtifact(taskEnv, artifact, r.taskDir.Dir); err != nil {
					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
					r.logger.Printf("[DEBUG] client: %v", wrapped)
					r.setState(structs.TaskStatePending,
						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped), false)
					r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
					goto RESTART
				}
			}

			r.persistLock.Lock()
			r.artifactsDownloaded = true
			r.persistLock.Unlock()
		}

		// We don't have to wait for any template
		if len(task.Templates) == 0 {
			// Send the start signal
			select {
			case r.startCh <- struct{}{}:
			default:
			}

			resultCh <- true
			return
		}

		// Build the template manager
		if r.templateManager == nil {
			var err error
			r.templateManager, err = NewTaskTemplateManager(&TaskTemplateManagerConfig{
				Hooks:                r,
				Templates:            r.task.Templates,
				ClientConfig:         r.config,
				VaultToken:           r.vaultFuture.Get(),
				TaskDir:              r.taskDir.Dir,
				EnvBuilder:           r.envBuilder,
				MaxTemplateEventRate: DefaultMaxTemplateEventRate,
			})
			if err != nil {
				err := fmt.Errorf("failed to build task's template manager: %v", err)
				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask(), false)
				r.logger.Printf("[ERR] client: alloc %q, task %q %v", alloc.ID, task.Name, err)
				resultCh <- false
				return
			}
		}

		// Block for consul-template
		// TODO Hooks should register themselves as blocking and then we can
		// periodically enumerate what we are still blocked on
		select {
		case <-r.unblockCh:
			// Send the start signal
			select {
			case r.startCh <- struct{}{}:
			default:
			}

			resultCh <- true
			return
		case <-r.waitCh:
			// The run loop has exited so exit too
			resultCh <- false
			return
		}

	RESTART:
		restart := r.shouldRestart()
		if !restart {
			resultCh <- false
			return
		}
	}
}

// postrun is used to do any cleanup that is necessary after exiting the runloop
func (r *TaskRunner) postrun() {
	// Stop the template manager
	if r.templateManager != nil {
		r.templateManager.Stop()
	}
}

// run is the main run loop that handles starting the application, destroying
// it, restarts and signals.
func (r *TaskRunner) run() {
	// Predeclare things so we can jump to the RESTART
	var stopCollection chan struct{}
	var handleWaitCh chan *dstructs.WaitResult

	// If we already have a handle, populate the stopCollection and handleWaitCh
	// to fix the invariant that it exists.
	handleEmpty := r.getHandle() == nil

	if !handleEmpty {
		stopCollection = make(chan struct{})
		go r.collectResourceUsageStats(stopCollection)
		handleWaitCh = r.handle.WaitCh()
	}

	for {
		// Do the prestart activities
		prestartResultCh := make(chan bool, 1)
		go r.prestart(r.alloc, r.task, prestartResultCh)

	WAIT:
		for {
			select {
			case success := <-prestartResultCh:
				if !success {
					r.cleanup()
					r.setState(structs.TaskStateDead, nil, false)
					return
				}
			case <-r.startCh:
				// Start the task if not yet started or it is being forced. This logic
				// is necessary because in the case of a restore the handle already
				// exists.
				handleEmpty := r.getHandle() == nil
				if handleEmpty {
					startErr := r.startTask()
					r.restartTracker.SetStartError(startErr)
					if startErr != nil {
						r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr), true)
						goto RESTART
					}

					// Mark the task as started
					r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted), false)
					r.runningLock.Lock()
					r.running = true
					r.runningLock.Unlock()

					if stopCollection == nil {
						stopCollection = make(chan struct{})
						go r.collectResourceUsageStats(stopCollection)
					}

					handleWaitCh = r.handle.WaitCh()
				}

			case waitRes := <-handleWaitCh:
				if waitRes == nil {
					panic("nil wait")
				}

				r.runningLock.Lock()
				r.running = false
				r.runningLock.Unlock()

				// Stop collection of the task's resource usage
				close(stopCollection)

				// Log whether the task was successful or not.
				r.restartTracker.SetWaitResult(waitRes)
				r.setState("", r.waitErrorToEvent(waitRes), true)
				if !waitRes.Successful() {
					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
				} else {
					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
				}

				break WAIT
			case update := <-r.updateCh:
				if err := r.handleUpdate(update); err != nil {
					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
				}

			case se := <-r.signalCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
				if !running {
					// Send no error
					r.logger.Printf("[DEBUG] client: skipping %s", common)
					se.result <- nil
					continue
				}

				r.logger.Printf("[DEBUG] client: sending %s", common)
				r.setState(structs.TaskStateRunning, se.e, false)

				res := r.handle.Signal(se.s)
				se.result <- res

			case restartEvent := <-r.restartCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
				if !running {
					r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
					continue
				}

				r.logger.Printf("[DEBUG] client: restarting %s: %v", common, restartEvent.taskEvent.RestartReason)
				r.setState(structs.TaskStateRunning, restartEvent.taskEvent, false)
				r.killTask(nil)

				close(stopCollection)

				if handleWaitCh != nil {
					<-handleWaitCh
				}

				r.restartTracker.SetRestartTriggered(restartEvent.failure)
				break WAIT

			case <-r.destroyCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				if !running {
					r.cleanup()
					r.setState(structs.TaskStateDead, r.destroyEvent, false)
					return
				}

				// Remove from consul before killing the task so that traffic
				// can be rerouted
				interpTask := interpolateServices(r.envBuilder.Build(), r.task)
				r.consul.RemoveTask(r.alloc.ID, interpTask)

				// Delay actually killing the task if configured. See #244
				if r.task.ShutdownDelay > 0 {
					r.logger.Printf("[DEBUG] client: delaying shutdown of alloc %q task %q for %q",
						r.alloc.ID, r.task.Name, r.task.ShutdownDelay)
					<-time.After(r.task.ShutdownDelay)
				}

				// Store the task event that provides context on the task
				// destroy. The Killed event is set from the alloc_runner and
				// doesn't add detail
				var killEvent *structs.TaskEvent
				if r.destroyEvent.Type != structs.TaskKilled {
					if r.destroyEvent.Type == structs.TaskKilling {
						killEvent = r.destroyEvent
					} else {
						r.setState(structs.TaskStateRunning, r.destroyEvent, false)
					}
				}

				r.killTask(killEvent)
				close(stopCollection)

				// Wait for handler to exit before calling cleanup
				<-handleWaitCh
				r.cleanup()

				r.setState(structs.TaskStateDead, nil, false)
				return
			}
		}

	RESTART:
		// shouldRestart will block if the task should restart after a delay.
		restart := r.shouldRestart()
		if !restart {
			r.cleanup()
			r.setState(structs.TaskStateDead, nil, false)
			return
		}

		// Clear the handle so a new driver will be created.
		r.handleLock.Lock()
		r.handle = nil
		handleWaitCh = nil
		stopCollection = nil
		r.handleLock.Unlock()
	}
}

// cleanup removes Consul entries and calls Driver.Cleanup when a task is
// stopping. Errors are logged.
func (r *TaskRunner) cleanup() {
	// Remove from Consul
	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
	r.consul.RemoveTask(r.alloc.ID, interpTask)

	drv, err := r.createDriver()
	if err != nil {
		r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
		return
	}

	res := r.getCreatedResources()

	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
	attempts := 1
	var cleanupErr error
	for retry := true; retry; attempts++ {
		cleanupErr = drv.Cleanup(ctx, res)
		retry = structs.IsRecoverable(cleanupErr)

		// Copy current createdResources state in case SaveState is
		// called between retries
		r.setCreatedResources(res)

		// Retry 3 times with sleeps between
		if !retry || attempts > 3 {
			break
		}
		time.Sleep(time.Duration(attempts) * time.Second)
	}

	if cleanupErr != nil {
		r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
	}
	return
}

// shouldRestart returns whether the task should restart. If the return value is
// true, the task's restart policy has already been considered and any wait time
// between restarts has been applied.
func (r *TaskRunner) shouldRestart() bool {
	state, when := r.restartTracker.GetState()
	reason := r.restartTracker.GetReason()
	switch state {
	case structs.TaskNotRestarting, structs.TaskTerminated:
		r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
		if state == structs.TaskNotRestarting {
			r.setState(structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskNotRestarting).
					SetRestartReason(reason).SetFailsTask(),
				false)
		}
		return false
	case structs.TaskRestarting:
		r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
		r.setState(structs.TaskStatePending,
			structs.NewTaskEvent(structs.TaskRestarting).
				SetRestartDelay(when).
				SetRestartReason(reason),
			false)
	default:
		r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
		return false
	}

	// Unregister from Consul while waiting to restart.
	interpTask := interpolateServices(r.envBuilder.Build(), r.task)
	r.consul.RemoveTask(r.alloc.ID, interpTask)

	// Sleep but watch for destroy events.
	select {
	case <-time.After(when):
	case <-r.destroyCh:
	}

	// Destroyed while we were waiting to restart, so abort.
	r.destroyLock.Lock()
	destroyed := r.destroy
	r.destroyLock.Unlock()
	if destroyed {
		r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
		r.setState(structs.TaskStateDead, r.destroyEvent, false)
		return false
	}

	return true
}

// killTask kills the running task. A killing event can optionally be passed and
// this event is used to mark the task as being killed. It provides a means to
// store extra information.
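// killTask is a no-op if the task is not currently running.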
func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
	r.runningLock.Lock()
	running := r.running
	r.runningLock.Unlock()
	if !running {
		return
	}

	// Get the kill timeout
	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)

	// Build the event
	var event *structs.TaskEvent
	if killingEvent != nil {
		event = killingEvent
		event.Type = structs.TaskKilling
	} else {
		event = structs.NewTaskEvent(structs.TaskKilling)
	}
	event.SetKillTimeout(timeout)

	// Mark that we received the kill event
	r.setState(structs.TaskStateRunning, event, false)

	handle := r.getHandle()

	// Kill the task using an exponential backoff in case of failures.
	destroySuccess, err := r.handleDestroy(handle)
	if !destroySuccess {
		// We couldn't successfully destroy the resource created.
		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
	}

	r.runningLock.Lock()
	r.running = false
	r.runningLock.Unlock()

	// Store that the task has been destroyed and any associated error.
	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err), true)
}

// startTask creates the driver, task dir, and starts the task.
func (r *TaskRunner) startTask() error {
	// Create a driver
	drv, err := r.createDriver()
	if err != nil {
		return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
			r.task.Name, r.alloc.ID, err)
	}

	// Run prestart
	ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
	presp, err := drv.Prestart(ctx, r.task)

	// Merge newly created resources into previously created resources
	if presp != nil {
		r.createdResourcesLock.Lock()
		r.createdResources.Merge(presp.CreatedResources)
		r.createdResourcesLock.Unlock()

		// Set any network configuration returned by the driver
		r.envBuilder.SetDriverNetwork(presp.Network)
	}

	if err != nil {
		wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
			r.task.Name, r.alloc.ID, err)
		r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
		return structs.WrapRecoverable(wrapped, err)
	}

	// Create a new context for Start since the environment may have been updated.
	ctx = driver.NewExecContext(r.taskDir, r.envBuilder.Build())

	// Start the job
	sresp, err := drv.Start(ctx, r.task)
	if err != nil {
		wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
			r.task.Name, r.alloc.ID, err)
		r.logger.Printf("[WARN] client: %s", wrapped)
		return structs.WrapRecoverable(wrapped, err)

	}

	// Log driver network information
	if sresp.Network != nil && sresp.Network.IP != "" {
		if sresp.Network.AutoAdvertise {
			r.logger.Printf("[INFO] client: alloc %s task %s auto-advertising detected IP %s",
				r.alloc.ID, r.task.Name, sresp.Network.IP)
		} else {
			r.logger.Printf("[TRACE] client: alloc %s task %s detected IP %s but not auto-advertising",
				r.alloc.ID, r.task.Name, sresp.Network.IP)
		}
	}

	if sresp.Network == nil || sresp.Network.IP == "" {
		r.logger.Printf("[TRACE] client: alloc %s task %s could not detect a driver IP", r.alloc.ID, r.task.Name)
	}

	// Update environment with the network defined by the driver's Start method.
	r.envBuilder.SetDriverNetwork(sresp.Network)

	if err := r.registerServices(drv, sresp.Handle, sresp.Network); err != nil {
		// All IO is done asynchronously, so errors from registering
		// services are hard failures.
		r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err)

		// Kill the started task
		if destroyed, err := r.handleDestroy(sresp.Handle); !destroyed {
			r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. Resources may be leaked: %v",
				r.task.Name, r.alloc.ID, err)
		}
		return structs.NewRecoverableError(err, false)
	}

	r.handleLock.Lock()
	r.handle = sresp.Handle
	r.handleLock.Unlock()

	// Need to persist the driver network between restarts
	r.driverNetLock.Lock()
	r.driverNet = sresp.Network
	r.driverNetLock.Unlock()

	return nil
}

// registerServices and checks with Consul.
func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n *cstructs.DriverNetwork) error {
	var exec driver.ScriptExecutor
	if d.Abilities().Exec {
		// Only set the script executor if the driver supports it
		exec = h
	}
	interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
	return r.consul.RegisterTask(r.alloc.ID, interpolatedTask, r, exec, n)
}

// interpolateServices interpolates tags in a service and checks with values from the
// task's environment.
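// The task is copied first so interpolation never mutates the original task.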
func interpolateServices(taskEnv *env.TaskEnv, task *structs.Task) *structs.Task {
	taskCopy := task.Copy()
	for _, service := range taskCopy.Services {
		for _, check := range service.Checks {
			check.Name = taskEnv.ReplaceEnv(check.Name)
			check.Type = taskEnv.ReplaceEnv(check.Type)
			check.Command = taskEnv.ReplaceEnv(check.Command)
			check.Args = taskEnv.ParseAndReplace(check.Args)
			check.Path = taskEnv.ReplaceEnv(check.Path)
			check.Protocol = taskEnv.ReplaceEnv(check.Protocol)
			check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel)
			check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus)
			check.Method = taskEnv.ReplaceEnv(check.Method)
			if len(check.Header) > 0 {
				header := make(map[string][]string, len(check.Header))
				for k, vs := range check.Header {
					newVals := make([]string, len(vs))
					for i, v := range vs {
						newVals[i] = taskEnv.ReplaceEnv(v)
					}
					header[taskEnv.ReplaceEnv(k)] = newVals
				}
				check.Header = header
			}
		}
		service.Name = taskEnv.ReplaceEnv(service.Name)
		service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel)
		service.Tags = taskEnv.ParseAndReplace(service.Tags)
	}
	return taskCopy
}

// buildTaskDir creates the task directory before driver.Prestart. It is safe
// to call multiple times as its state is persisted.
func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
	r.persistLock.Lock()
	built := r.taskDirBuilt
	r.persistLock.Unlock()

	// We do not set the state again since this only occurs during restoration
	// and the task dir is already built. The reason we call Build again is to
	// ensure that the task dir invariants are still held.
	if !built {
		r.setState(structs.TaskStatePending,
			structs.NewTaskEvent(structs.TaskSetup).SetMessage(structs.TaskBuildingTaskDir),
			false)
	}

	chroot := config.DefaultChrootEnv
	if len(r.config.ChrootEnv) > 0 {
		chroot = r.config.ChrootEnv
	}
	if err := r.taskDir.Build(built, chroot, fsi); err != nil {
		return err
	}

	// Mark task dir as successfully built
	r.persistLock.Lock()
	r.taskDirBuilt = true
	r.persistLock.Unlock()

	// Set path and host related env vars
	driver.SetEnvvars(r.envBuilder, fsi, r.taskDir, r.config)
	return nil
}

// collectResourceUsageStats starts collecting resource usage stats of a Task.
// Collection ends when the passed channel is closed
func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
	// start collecting the stats right away and then start collecting every
	// collection interval
	next := time.NewTimer(0)
	defer next.Stop()
	for {
		select {
		case <-next.C:
			next.Reset(r.config.StatsCollectionInterval)
			handle := r.getHandle()
			if handle == nil {
				continue
			}
			ru, err := handle.Stats()

			if err != nil {
				// Check if the driver doesn't implement stats
				if err.Error() == driver.DriverStatsNotImplemented.Error() {
					r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
					return
				}

				// We do not log when the plugin is shutdown as this is simply a
				// race between the stopCollection channel being closed and calling
				// Stats on the handle.
				if !strings.Contains(err.Error(), "connection is shut down") {
					r.logger.Printf("[WARN] client: error fetching stats of task %v: %v", r.task.Name, err)
				}
				continue
			}

			r.resourceUsageLock.Lock()
			r.resourceUsage = ru
			r.resourceUsageLock.Unlock()
			if ru != nil {
				r.emitStats(ru)
			}
		case <-stopCollection:
			return
		}
	}
}

// LatestResourceUsage returns the last resource utilization datapoint collected
func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
	r.resourceUsageLock.RLock()
	defer r.resourceUsageLock.RUnlock()
	r.runningLock.Lock()
	defer r.runningLock.Unlock()

	// If the task is not running there can be no latest resource
	if !r.running {
		return nil
	}

	return r.resourceUsage
}

// handleUpdate takes an updated allocation and updates internal state to
// reflect the new config for the task.
func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
	// Extract the task group from the alloc.
	tg := update.Job.LookupTaskGroup(update.TaskGroup)
	if tg == nil {
		return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
	}

	// Extract the task.
	var updatedTask *structs.Task
	for _, t := range tg.Tasks {
		if t.Name == r.task.Name {
			updatedTask = t.Copy()
			break
		}
	}
	if updatedTask == nil {
		return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
	}

	// Merge in the task resources
	updatedTask.Resources = update.TaskResources[updatedTask.Name]

	// Interpolate the old task with the old env before updating the env as
	// updating services in Consul needs both the old and new interpolations
	// to find differences.
	oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)

	// Now it's safe to update the environment
	r.envBuilder.UpdateTask(update, updatedTask)

	var mErr multierror.Error
	r.handleLock.Lock()
	if r.handle != nil {
		drv, err := r.createDriver()
		if err != nil {
			// Something has really gone wrong; don't continue
			r.handleLock.Unlock()
			return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err)
		}

		// Update will update resources and store the new kill timeout.
		if err := r.handle.Update(updatedTask); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
		}

		// Update services in Consul
		newInterpolatedTask := interpolateServices(r.envBuilder.Build(), updatedTask)
		if err := r.updateServices(drv, r.handle, oldInterpolatedTask, newInterpolatedTask); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err))
		}
	}
	r.handleLock.Unlock()

	// Update the restart policy.
	if r.restartTracker != nil {
		r.restartTracker.SetPolicy(tg.RestartPolicy)
	}

	// Store the updated alloc.
	r.alloc = update
	r.task = updatedTask
	return mErr.ErrorOrNil()
}

// updateServices and checks with Consul. Tasks must be interpolated!
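// Both the old and new task are required so the Consul client can diff their
// services and checks.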
func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, oldTask, newTask *structs.Task) error {
	var exec driver.ScriptExecutor
	if d.Abilities().Exec {
		// Only set the script executor if the driver supports it
		exec = h
	}
	r.driverNetLock.Lock()
	net := r.driverNet.Copy()
	r.driverNetLock.Unlock()
	return r.consul.UpdateTask(r.alloc.ID, oldTask, newTask, r, exec, net)
}

// handleDestroy kills the task handle. In the case that killing fails,
// handleDestroy will retry with an exponential backoff and will give up at a
// given limit. It returns whether the task was destroyed and the error
// associated with the last kill attempt.
func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) {
	// Cap the number of times we attempt to kill the task.
	for i := 0; i < killFailureLimit; i++ {
		if err = handle.Kill(); err != nil {
			// Calculate the new backoff
			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
			if backoff > killBackoffLimit {
				backoff = killBackoffLimit
			}

			r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
				r.task.Name, r.alloc.ID, backoff, err)
			time.Sleep(backoff)
		} else {
			// Kill was successful
			return true, nil
		}
	}
	return
}

// Restart will restart the task.
func (r *TaskRunner) Restart(source, reason string, failure bool) {
	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := newTaskRestartEvent(reasonStr, failure)

	select {
	case r.restartCh <- event:
	case <-r.waitCh:
	}
}

// Signal will send a signal to the task
func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {

	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)

	resCh := make(chan error)
	se := SignalEvent{
		s:      s,
		e:      event,
		result: resCh,
	}

	select {
	case r.signalCh <- se:
	case <-r.waitCh:
	}

	return <-resCh
}

// Kill will kill a task and store the error, no longer restarting the task. If
// fail is set, the task is marked as having failed.
func (r *TaskRunner) Kill(source, reason string, fail bool) {
	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
	if fail {
		event.SetFailsTask()
	}

	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
	r.Destroy(event)
}

func (r *TaskRunner) EmitEvent(source, message string) {
	event := structs.NewTaskEvent(source).
		SetMessage(message)
	r.setState("", event, false)
	r.logger.Printf("[DEBUG] client: event from %q for task %q in alloc %q: %v",
		source, r.task.Name, r.alloc.ID, message)
}

// UnblockStart unblocks the starting of the task. It currently assumes only
1782 // UnblockStart unblocks the starting of the task. It currently assumes only
1783 // consul-template will unblock it.
1784 func (r *TaskRunner) UnblockStart(source string) {
1785     r.unblockLock.Lock()
1786     defer r.unblockLock.Unlock()
1787     if r.unblocked {
1788         return
1789     }
1790 
1791     r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
1792     r.unblocked = true
1793     close(r.unblockCh)
1794 }
1795 
1796 // waitErrorToEvent converts a WaitResult into a TaskTerminated event.
1797 func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
1798     return structs.NewTaskEvent(structs.TaskTerminated).
1799         SetExitCode(res.ExitCode).
1800         SetSignal(res.Signal).
1801         SetExitMessage(res.Err)
1802 }
1803 
1804 // Update is used to deliver an updated allocation to the task runner.
1805 func (r *TaskRunner) Update(update *structs.Allocation) {
1806     select {
1807     case r.updateCh <- update:
1808     default:
1809         r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
1810             r.task.Name, r.alloc.ID)
1811     }
1812 }
1813 
1814 // Destroy is used to indicate that the task context should be destroyed. The
1815 // event parameter provides a context for the destroy.
1816 func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
1817     r.destroyLock.Lock()
1818     defer r.destroyLock.Unlock()
1819 
1820     if r.destroy {
1821         return
1822     }
1823     r.destroy = true
1824     r.destroyEvent = event
1825     close(r.destroyCh)
1826 }
1827 
1828 // getCreatedResources returns the resources created by drivers. It will never
1829 // return nil.
1830 func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
1831     r.createdResourcesLock.Lock()
1832     if r.createdResources == nil {
1833         r.createdResources = driver.NewCreatedResources()
1834     }
1835     cr := r.createdResources.Copy()
1836     r.createdResourcesLock.Unlock()
1837 
1838     return cr
1839 }
1840 
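// Illustrative sketch (not part of the upstream file): getCreatedResources
// returns a copy, so a caller may mutate the result without holding
// createdResourcesLock, e.g. when cleaning up after a failed start attempt:
//
//	// names are hypothetical
//	res := r.getCreatedResources()
//	// ... attempt cleanup, pruning entries from res that were released ...
//	r.setCreatedResources(res) // persist whatever is still outstanding
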
1841 // setCreatedResources updates the resources created by drivers. If passed nil
1842 // it will set createdResources to an initialized struct.
1843 func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
1844     if cr == nil {
1845         cr = driver.NewCreatedResources()
1846     }
1847     r.createdResourcesLock.Lock()
1848     r.createdResources = cr.Copy()
1849     r.createdResourcesLock.Unlock()
1850 }
1851 
1852 func (r *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) {
1853     if !r.config.DisableTaggedMetrics {
1854         metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"},
1855             float32(ru.ResourceUsage.MemoryStats.RSS), r.baseLabels)
1858         metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "cache"},
1859             float32(ru.ResourceUsage.MemoryStats.Cache), r.baseLabels)
1860         metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "swap"},
1861             float32(ru.ResourceUsage.MemoryStats.Swap), r.baseLabels)
1862         metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "max_usage"},
1863             float32(ru.ResourceUsage.MemoryStats.MaxUsage), r.baseLabels)
1864         metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_usage"},
1865             float32(ru.ResourceUsage.MemoryStats.KernelUsage), r.baseLabels)
1866         metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"},
1867             float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), r.baseLabels)
1868     }
1869 
1870     if r.config.BackwardsCompatibleMetrics {
1871         metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
1872         metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
1873         metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
1874         metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
1875         metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
1876         metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
1877     }
1878 }
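
// Illustrative sketch (not part of the upstream file): the original listing
// emitted the "rss" gauge twice on lines 1856-1857; the duplicate call has
// been dropped above. With tagged metrics enabled, the gauges surface as keys
// such as "client.allocs.memory.rss" (typically prefixed by the agent's
// configured metrics prefix, e.g. "nomad.") carrying r.baseLabels as tags,
// whereas the backwards-compatible form encodes the identifiers into the key:
//
//	client.allocs.<job>.<task_group>.<alloc_id>.<task>.memory.rss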
"total_ticks"}, 1893 float32(ru.ResourceUsage.CpuStats.TotalTicks), r.baseLabels) 1894 } 1895 1896 if r.config.BackwardsCompatibleMetrics { 1897 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent)) 1898 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode)) 1899 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode)) 1900 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime)) 1901 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods)) 1902 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks)) 1903 } 1904 } 1905 1906 // emitStats emits resource usage stats of tasks to remote metrics collector 1907 // sinks 1908 func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { 1909 if !r.config.PublishAllocationMetrics { 1910 return 1911 } 1912 1913 // If the task is not running don't emit anything 1914 r.runningLock.Lock() 1915 running := r.running 1916 r.runningLock.Unlock() 1917 if !running { 1918 return 1919 } 1920 1921 if ru.ResourceUsage.MemoryStats != nil { 1922 r.setGaugeForMemory(ru) 1923 } 1924 1925 if ru.ResourceUsage.CpuStats != nil { 1926 r.setGaugeForCPU(ru) 1927 } 1928 }