github.com/ncodes/nomad@v0.5.7-0.20170403112158-97adf4a74fb3/client/task_runner.go (about) 1 package client 2 3 import ( 4 "crypto/md5" 5 "encoding/hex" 6 "fmt" 7 "io/ioutil" 8 "log" 9 "os" 10 "path/filepath" 11 "strconv" 12 "strings" 13 "sync" 14 "time" 15 16 "github.com/armon/go-metrics" 17 "github.com/golang/snappy" 18 "github.com/hashicorp/consul-template/signals" 19 "github.com/hashicorp/go-multierror" 20 "github.com/ncodes/nomad/client/allocdir" 21 "github.com/ncodes/nomad/client/config" 22 "github.com/ncodes/nomad/client/driver" 23 "github.com/ncodes/nomad/client/getter" 24 "github.com/ncodes/nomad/client/vaultclient" 25 "github.com/ncodes/nomad/nomad/structs" 26 27 "github.com/ncodes/nomad/client/driver/env" 28 dstructs "github.com/ncodes/nomad/client/driver/structs" 29 cstructs "github.com/ncodes/nomad/client/structs" 30 ) 31 32 const ( 33 // killBackoffBaseline is the baseline time for exponential backoff while 34 // killing a task. 35 killBackoffBaseline = 5 * time.Second 36 37 // killBackoffLimit is the limit of the exponential backoff for killing 38 // the task. 39 killBackoffLimit = 2 * time.Minute 40 41 // killFailureLimit is how many times we will attempt to kill a task before 42 // giving up and potentially leaking resources. 43 killFailureLimit = 5 44 45 // vaultBackoffBaseline is the baseline time for exponential backoff when 46 // attempting to retrieve a Vault token 47 vaultBackoffBaseline = 5 * time.Second 48 49 // vaultBackoffLimit is the limit of the exponential backoff when attempting 50 // to retrieve a Vault token 51 vaultBackoffLimit = 3 * time.Minute 52 53 // vaultTokenFile is the name of the file holding the Vault token inside the 54 // task's secret directory 55 vaultTokenFile = "vault_token" 56 ) 57 58 // TaskRunner is used to wrap a task within an allocation and provide the execution context. 59 type TaskRunner struct { 60 config *config.Config 61 updater TaskStateUpdater 62 logger *log.Logger 63 alloc *structs.Allocation 64 restartTracker *RestartTracker 65 66 // running marks whether the task is running 67 running bool 68 runningLock sync.Mutex 69 70 resourceUsage *cstructs.TaskResourceUsage 71 resourceUsageLock sync.RWMutex 72 73 task *structs.Task 74 taskDir *allocdir.TaskDir 75 76 // taskEnv is the environment variables of the task 77 taskEnv *env.TaskEnvironment 78 taskEnvLock sync.Mutex 79 80 // updateCh is used to receive updated versions of the allocation 81 updateCh chan *structs.Allocation 82 83 handle driver.DriverHandle 84 handleLock sync.Mutex 85 86 // artifactsDownloaded tracks whether the tasks artifacts have been 87 // downloaded 88 // 89 // Must acquire persistLock when accessing 90 artifactsDownloaded bool 91 92 // taskDirBuilt tracks whether the task has built its directory. 93 // 94 // Must acquire persistLock when accessing 95 taskDirBuilt bool 96 97 // createdResources are all the resources created by the task driver 98 // across all attempts to start the task. 
	// Simple gets and sets should use {get,set}CreatedResources
	createdResources     *driver.CreatedResources
	createdResourcesLock sync.Mutex

	// payloadRendered tracks whether the payload has been rendered to disk
	payloadRendered bool

	// vaultFuture is the means to wait for and get a Vault token
	vaultFuture *tokenFuture

	// recoveredVaultToken is the token that was recovered through a restore
	recoveredVaultToken string

	// vaultClient is used to retrieve and renew any needed Vault token
	vaultClient vaultclient.VaultClient

	// templateManager is used to manage any consul-templates this task may have
	templateManager *TaskTemplateManager

	// startCh is used to trigger the start of the task
	startCh chan struct{}

	// unblockCh is used to unblock the starting of the task
	unblockCh   chan struct{}
	unblocked   bool
	unblockLock sync.Mutex

	// restartCh is used to restart a task
	restartCh chan *structs.TaskEvent

	// signalCh is used to send a signal to a task
	signalCh chan SignalEvent

	destroy      bool
	destroyCh    chan struct{}
	destroyLock  sync.Mutex
	destroyEvent *structs.TaskEvent

	// waitCh closing marks the run loop as having exited
	waitCh chan struct{}

	// persistLock must be acquired when accessing fields stored by
	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
	// AllocRunner, so all state fields must be synchronized using this
	// lock.
	persistLock sync.Mutex

	// taskRunnerPlus contains unofficial features
	taskRunnerPlus *TaskRunnerPlus
}

// taskRunnerState is used to snapshot the state of the task runner
type taskRunnerState struct {
	Version            string
	Task               *structs.Task
	HandleID           string
	ArtifactDownloaded bool
	TaskDirBuilt       bool
	CreatedResources   *driver.CreatedResources
	PayloadRendered    bool
}

// TaskStateUpdater is used to signal that a task's state has changed.
type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent)

// SignalEvent is a tuple of the signal and the event generating it
type SignalEvent struct {
	// s is the signal to be sent
	s os.Signal

	// e is the task event generating the signal
	e *structs.TaskEvent

	// result should be used to send back the result of the signal
	result chan<- error
}

// NewTaskRunner is used to create a new task context
func NewTaskRunner(logger *log.Logger, config *config.Config,
	updater TaskStateUpdater, taskDir *allocdir.TaskDir,
	alloc *structs.Allocation, task *structs.Task,
	vaultClient vaultclient.VaultClient) *TaskRunner {

	// Merge in the task resources
	task.Resources = alloc.TaskResources[task.Name]

	// Build the restart tracker.
186 tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) 187 if tg == nil { 188 logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup) 189 return nil 190 } 191 restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type) 192 193 tc := &TaskRunner{ 194 config: config, 195 updater: updater, 196 logger: logger, 197 restartTracker: restartTracker, 198 alloc: alloc, 199 task: task, 200 taskDir: taskDir, 201 createdResources: driver.NewCreatedResources(), 202 vaultClient: vaultClient, 203 vaultFuture: NewTokenFuture().Set(""), 204 updateCh: make(chan *structs.Allocation, 64), 205 destroyCh: make(chan struct{}), 206 waitCh: make(chan struct{}), 207 startCh: make(chan struct{}, 1), 208 unblockCh: make(chan struct{}), 209 restartCh: make(chan *structs.TaskEvent), 210 signalCh: make(chan SignalEvent), 211 taskRunnerPlus: NewTaskRunnerPlus(logger, task.Env), 212 } 213 214 return tc 215 } 216 217 // MarkReceived marks the task as received. 218 func (r *TaskRunner) MarkReceived() { 219 r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived)) 220 } 221 222 // WaitCh returns a channel to wait for termination 223 func (r *TaskRunner) WaitCh() <-chan struct{} { 224 return r.waitCh 225 } 226 227 // stateFilePath returns the path to our state file 228 func (r *TaskRunner) stateFilePath() string { 229 // Get the MD5 of the task name 230 hashVal := md5.Sum([]byte(r.task.Name)) 231 hashHex := hex.EncodeToString(hashVal[:]) 232 dirName := fmt.Sprintf("task-%s", hashHex) 233 234 // Generate the path 235 path := filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, 236 dirName, "state.json") 237 return path 238 } 239 240 // RestoreState is used to restore our state 241 func (r *TaskRunner) RestoreState() error { 242 // Load the snapshot 243 var snap taskRunnerState 244 if err := restoreState(r.stateFilePath(), &snap); err != nil { 245 return err 246 } 247 248 // Restore fields 249 if snap.Task == nil { 250 return fmt.Errorf("task runner snapshot includes nil Task") 251 } else { 252 r.task = snap.Task 253 } 254 r.artifactsDownloaded = snap.ArtifactDownloaded 255 r.taskDirBuilt = snap.TaskDirBuilt 256 r.payloadRendered = snap.PayloadRendered 257 258 r.setCreatedResources(snap.CreatedResources) 259 260 if err := r.setTaskEnv(); err != nil { 261 return fmt.Errorf("client: failed to create task environment for task %q in allocation %q: %v", 262 r.task.Name, r.alloc.ID, err) 263 } 264 265 if r.task.Vault != nil { 266 // Read the token from the secret directory 267 tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile) 268 data, err := ioutil.ReadFile(tokenPath) 269 if err != nil { 270 if !os.IsNotExist(err) { 271 return fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err) 272 } 273 274 // Token file doesn't exist 275 } else { 276 // Store the recovered token 277 r.recoveredVaultToken = string(data) 278 } 279 } 280 281 // Restore the driver 282 if snap.HandleID != "" { 283 d, err := r.createDriver() 284 if err != nil { 285 return err 286 } 287 288 ctx := driver.NewExecContext(r.taskDir) 289 handle, err := d.Open(ctx, snap.HandleID) 290 291 // In the case it fails, we relaunch the task in the Run() method. 
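		// (A nil handle simply means run() will see handleEmpty and call
		// startTask again; the createdResources restored above remain
		// available to cleanup.)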
292 if err != nil { 293 r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v", 294 r.task.Name, r.alloc.ID, err) 295 return nil 296 } 297 r.handleLock.Lock() 298 r.handle = handle 299 r.handleLock.Unlock() 300 301 r.runningLock.Lock() 302 r.running = true 303 r.runningLock.Unlock() 304 } 305 return nil 306 } 307 308 // SaveState is used to snapshot our state 309 func (r *TaskRunner) SaveState() error { 310 r.persistLock.Lock() 311 defer r.persistLock.Unlock() 312 313 snap := taskRunnerState{ 314 Task: r.task, 315 Version: r.config.Version, 316 ArtifactDownloaded: r.artifactsDownloaded, 317 TaskDirBuilt: r.taskDirBuilt, 318 PayloadRendered: r.payloadRendered, 319 CreatedResources: r.getCreatedResources(), 320 } 321 322 r.handleLock.Lock() 323 if r.handle != nil { 324 snap.HandleID = r.handle.ID() 325 } 326 r.handleLock.Unlock() 327 return persistState(r.stateFilePath(), &snap) 328 } 329 330 // DestroyState is used to cleanup after ourselves 331 func (r *TaskRunner) DestroyState() error { 332 r.persistLock.Lock() 333 defer r.persistLock.Unlock() 334 335 return os.RemoveAll(r.stateFilePath()) 336 } 337 338 // setState is used to update the state of the task runner 339 func (r *TaskRunner) setState(state string, event *structs.TaskEvent) { 340 // Persist our state to disk. 341 if err := r.SaveState(); err != nil { 342 r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err) 343 } 344 345 // Indicate the task has been updated. 346 r.updater(r.task.Name, state, event) 347 } 348 349 // setTaskEnv sets the task environment. It returns an error if it could not be 350 // created. 351 func (r *TaskRunner) setTaskEnv() error { 352 r.taskEnvLock.Lock() 353 defer r.taskEnvLock.Unlock() 354 355 taskEnv, err := driver.GetTaskEnv(r.taskDir, r.config.Node, 356 r.task.Copy(), r.alloc, r.config, r.vaultFuture.Get()) 357 if err != nil { 358 return err 359 } 360 r.taskEnv = taskEnv 361 return nil 362 } 363 364 // getTaskEnv returns the task environment 365 func (r *TaskRunner) getTaskEnv() *env.TaskEnvironment { 366 r.taskEnvLock.Lock() 367 defer r.taskEnvLock.Unlock() 368 return r.taskEnv 369 } 370 371 // createDriver makes a driver for the task 372 func (r *TaskRunner) createDriver() (driver.Driver, error) { 373 env := r.getTaskEnv() 374 if env == nil { 375 return nil, fmt.Errorf("task environment not made for task %q in allocation %q", r.task.Name, r.alloc.ID) 376 } 377 378 // Create a task-specific event emitter callback to expose minimal 379 // state to drivers 380 eventEmitter := func(m string, args ...interface{}) { 381 msg := fmt.Sprintf(m, args...) 
		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg))
	}

	driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, env, eventEmitter)
	driver, err := driver.NewDriver(r.task.Driver, driverCtx)
	if err != nil {
		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
			r.task.Driver, r.alloc.ID, err)
	}
	return driver, err
}

// Run is a long-running routine used to manage the task
func (r *TaskRunner) Run() {
	defer close(r.waitCh)
	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
		r.task.Name, r.alloc.ID)

	// Create the initial environment; this will be recreated if a Vault token
	// is needed
	if err := r.setTaskEnv(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err))
		return
	}

	if err := r.validateTask(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask())
		return
	}

	// Create a driver so that we can determine the FSIsolation required
	drv, err := r.createDriver()
	if err != nil {
		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
		return
	}

	// Build the base task directory structure regardless of FS isolation abilities.
	// This needs to happen before we start the Vault manager and call prestart,
	// as both of those can write to the task directories.
	if err := r.buildTaskDir(drv.FSIsolation()); err != nil {
		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
		return
	}

	// If there is no Vault policy, leave the static future created in
	// NewTaskRunner.
	if r.task.Vault != nil {
		// Start the goroutine that retrieves a Vault token
		r.vaultFuture.Clear()
		go r.vaultManager(r.recoveredVaultToken)
	}

	// Start the run loop
	r.run()

	// Do any cleanup necessary
	r.postrun()

	return
}

// validateTask validates the fields of the task and returns an error if the
// task is invalid.
func (r *TaskRunner) validateTask() error {
	var mErr multierror.Error

	// Validate the user.
	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
		}
	}

	// Validate the artifacts
	for i, artifact := range r.task.Artifacts {
		// Verify the artifact doesn't escape the task directory.
		if err := artifact.Validate(); err != nil {
			// If this error occurs there is potentially a server bug or
			// malicious server spoofing.
475 r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v", 476 r.alloc.ID, r.task.Name, artifact, i, err) 477 mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err)) 478 } 479 } 480 481 // Validate the Service names 482 for i, service := range r.task.Services { 483 name := r.taskEnv.ReplaceEnv(service.Name) 484 if err := service.ValidateName(name); err != nil { 485 mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err)) 486 } 487 } 488 489 if len(mErr.Errors) == 1 { 490 return mErr.Errors[0] 491 } 492 return mErr.ErrorOrNil() 493 } 494 495 // tokenFuture stores the Vault token and allows consumers to block till a valid 496 // token exists 497 type tokenFuture struct { 498 waiting []chan struct{} 499 token string 500 set bool 501 m sync.Mutex 502 } 503 504 // NewTokenFuture returns a new token future without any token set 505 func NewTokenFuture() *tokenFuture { 506 return &tokenFuture{} 507 } 508 509 // Wait returns a channel that can be waited on. When this channel unblocks, a 510 // valid token will be available via the Get method 511 func (f *tokenFuture) Wait() <-chan struct{} { 512 f.m.Lock() 513 defer f.m.Unlock() 514 515 c := make(chan struct{}) 516 if f.set { 517 close(c) 518 return c 519 } 520 521 f.waiting = append(f.waiting, c) 522 return c 523 } 524 525 // Set sets the token value and unblocks any caller of Wait 526 func (f *tokenFuture) Set(token string) *tokenFuture { 527 f.m.Lock() 528 defer f.m.Unlock() 529 530 f.set = true 531 f.token = token 532 for _, w := range f.waiting { 533 close(w) 534 } 535 f.waiting = nil 536 return f 537 } 538 539 // Clear clears the set vault token. 540 func (f *tokenFuture) Clear() *tokenFuture { 541 f.m.Lock() 542 defer f.m.Unlock() 543 544 f.token = "" 545 f.set = false 546 return f 547 } 548 549 // Get returns the set Vault token 550 func (f *tokenFuture) Get() string { 551 f.m.Lock() 552 defer f.m.Unlock() 553 return f.token 554 } 555 556 // vaultManager should be called in a go-routine and manages the derivation, 557 // renewal and handling of errors with the Vault token. The optional parameter 558 // allows setting the initial Vault token. This is useful when the Vault token 559 // is recovered off disk. 560 func (r *TaskRunner) vaultManager(token string) { 561 // Helper for stopping token renewal 562 stopRenewal := func() { 563 if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil { 564 r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err) 565 } 566 } 567 568 // updatedToken lets us store state between loops. 
If true, a new token 569 // has been retrieved and we need to apply the Vault change mode 570 var updatedToken bool 571 572 OUTER: 573 for { 574 // Check if we should exit 575 select { 576 case <-r.waitCh: 577 stopRenewal() 578 return 579 default: 580 } 581 582 // Clear the token 583 r.vaultFuture.Clear() 584 585 // Check if there already is a token which can be the case for 586 // restoring the TaskRunner 587 if token == "" { 588 // Get a token 589 var exit bool 590 token, exit = r.deriveVaultToken() 591 if exit { 592 // Exit the manager 593 return 594 } 595 596 // Write the token to disk 597 if err := r.writeToken(token); err != nil { 598 e := fmt.Errorf("failed to write Vault token to disk") 599 r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err) 600 r.Kill("vault", e.Error(), true) 601 return 602 } 603 } 604 605 // Start the renewal process 606 renewCh, err := r.vaultClient.RenewToken(token, 30) 607 608 // An error returned means the token is not being renewed 609 if err != nil { 610 r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err) 611 token = "" 612 goto OUTER 613 } 614 615 // The Vault token is valid now, so set it 616 r.vaultFuture.Set(token) 617 618 if updatedToken { 619 switch r.task.Vault.ChangeMode { 620 case structs.VaultChangeModeSignal: 621 s, err := signals.Parse(r.task.Vault.ChangeSignal) 622 if err != nil { 623 e := fmt.Errorf("failed to parse signal: %v", err) 624 r.logger.Printf("[ERR] client: %v", err) 625 r.Kill("vault", e.Error(), true) 626 return 627 } 628 629 if err := r.Signal("vault", "new Vault token acquired", s); err != nil { 630 r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err) 631 r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true) 632 return 633 } 634 case structs.VaultChangeModeRestart: 635 r.Restart("vault", "new Vault token acquired") 636 case structs.VaultChangeModeNoop: 637 fallthrough 638 default: 639 r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode) 640 } 641 642 // We have handled it 643 updatedToken = false 644 645 // Call the handler 646 r.updatedTokenHandler() 647 } 648 649 // Start watching for renewal errors 650 select { 651 case err := <-renewCh: 652 // Clear the token 653 token = "" 654 r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err) 655 stopRenewal() 656 657 // Check if we have to do anything 658 if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop { 659 updatedToken = true 660 } 661 case <-r.waitCh: 662 stopRenewal() 663 return 664 } 665 } 666 } 667 668 // deriveVaultToken derives the Vault token using exponential backoffs. It 669 // returns the Vault token and whether the manager should exit. 
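// With vaultBackoffBaseline = 5s and vaultBackoffLimit = 3m, the backoff
// formula below (4^attempts * 5s, capped at the limit) retries after roughly
// 5s, 20s, 80s and then a flat 3m between attempts.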
670 func (r *TaskRunner) deriveVaultToken() (token string, exit bool) { 671 attempts := 0 672 for { 673 tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name}) 674 if err == nil { 675 return tokens[r.task.Name], false 676 } 677 678 // Check if we can't recover from the error 679 if !structs.IsRecoverable(err) { 680 r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v", 681 r.task.Name, r.alloc.ID, err) 682 r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true) 683 return "", true 684 } 685 686 // Handle the retry case 687 backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline 688 if backoff > vaultBackoffLimit { 689 backoff = vaultBackoffLimit 690 } 691 r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v", 692 r.task.Name, r.alloc.ID, err, backoff) 693 694 attempts++ 695 696 // Wait till retrying 697 select { 698 case <-r.waitCh: 699 return "", true 700 case <-time.After(backoff): 701 } 702 } 703 } 704 705 // writeToken writes the given token to disk 706 func (r *TaskRunner) writeToken(token string) error { 707 tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile) 708 if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil { 709 return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err) 710 } 711 712 return nil 713 } 714 715 // updatedTokenHandler is called when a new Vault token is retrieved. Things 716 // that rely on the token should be updated here. 717 func (r *TaskRunner) updatedTokenHandler() { 718 719 // Update the tasks environment 720 if err := r.setTaskEnv(); err != nil { 721 r.setState( 722 structs.TaskStateDead, 723 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask()) 724 return 725 } 726 727 if r.templateManager != nil { 728 r.templateManager.Stop() 729 730 // Create a new templateManager 731 var err error 732 r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates, 733 r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.getTaskEnv()) 734 if err != nil { 735 err := fmt.Errorf("failed to build task's template manager: %v", err) 736 r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask()) 737 r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err) 738 r.Kill("vault", err.Error(), true) 739 return 740 } 741 } 742 } 743 744 // prestart handles life-cycle tasks that occur before the task has started. 
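// In order, prestart waits for a Vault token when the task uses Vault, rebuilds
// the task environment, renders the dispatch payload (if any) into the local
// directory, downloads artifacts (consulting the restart policy on failure),
// and then either signals startCh directly or builds the template manager and
// blocks until consul-template unblocks the start.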
745 func (r *TaskRunner) prestart(resultCh chan bool) { 746 if r.task.Vault != nil { 747 // Wait for the token 748 r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID) 749 tokenCh := r.vaultFuture.Wait() 750 select { 751 case <-tokenCh: 752 case <-r.waitCh: 753 resultCh <- false 754 return 755 } 756 r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID) 757 } 758 759 if err := r.setTaskEnv(); err != nil { 760 r.setState( 761 structs.TaskStateDead, 762 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask()) 763 resultCh <- false 764 return 765 } 766 767 // If the job is a dispatch job and there is a payload write it to disk 768 requirePayload := len(r.alloc.Job.Payload) != 0 && 769 (r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "") 770 if !r.payloadRendered && requirePayload { 771 renderTo := filepath.Join(r.taskDir.LocalDir, r.task.DispatchPayload.File) 772 decoded, err := snappy.Decode(nil, r.alloc.Job.Payload) 773 if err != nil { 774 r.setState( 775 structs.TaskStateDead, 776 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask()) 777 resultCh <- false 778 return 779 } 780 781 if err := os.MkdirAll(filepath.Dir(renderTo), 07777); err != nil { 782 r.setState( 783 structs.TaskStateDead, 784 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask()) 785 resultCh <- false 786 return 787 } 788 789 if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil { 790 r.setState( 791 structs.TaskStateDead, 792 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask()) 793 resultCh <- false 794 return 795 } 796 797 r.payloadRendered = true 798 } 799 800 for { 801 r.persistLock.Lock() 802 downloaded := r.artifactsDownloaded 803 r.persistLock.Unlock() 804 805 // Download the task's artifacts 806 if !downloaded && len(r.task.Artifacts) > 0 { 807 r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts)) 808 for _, artifact := range r.task.Artifacts { 809 if err := getter.GetArtifact(r.getTaskEnv(), artifact, r.taskDir.Dir); err != nil { 810 wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err) 811 r.logger.Printf("[DEBUG] client: %v", wrapped) 812 r.setState(structs.TaskStatePending, 813 structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped)) 814 r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err)) 815 goto RESTART 816 } 817 } 818 819 r.persistLock.Lock() 820 r.artifactsDownloaded = true 821 r.persistLock.Unlock() 822 } 823 824 // We don't have to wait for any template 825 if len(r.task.Templates) == 0 { 826 // Send the start signal 827 select { 828 case r.startCh <- struct{}{}: 829 default: 830 } 831 832 resultCh <- true 833 return 834 } 835 836 // Build the template manager 837 if r.templateManager == nil { 838 var err error 839 r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates, 840 r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.getTaskEnv()) 841 if err != nil { 842 err := fmt.Errorf("failed to build task's template manager: %v", err) 843 r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask()) 844 r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err) 845 resultCh <- false 846 return 847 } 848 } 849 850 // Block for consul-template 851 // TODO Hooks 
		// should register themselves as blocking and then we can
		// periodically enumerate what we are still blocked on
		select {
		case <-r.unblockCh:
			// Send the start signal
			select {
			case r.startCh <- struct{}{}:
			default:
			}

			resultCh <- true
			return
		case <-r.waitCh:
			// The run loop has exited so exit too
			resultCh <- false
			return
		}

	RESTART:
		restart := r.shouldRestart()
		if !restart {
			resultCh <- false
			return
		}
	}
}

// postrun is used to do any cleanup that is necessary after exiting the run loop
func (r *TaskRunner) postrun() {

	// Stop the template manager
	if r.templateManager != nil {
		r.templateManager.Stop()
	}

	// Unofficial Feature: Forcefully stop the associated container if still running
	if err := r.taskRunnerPlus.stopContainer(); err != nil {
		r.logger.Printf("[DEBUG] %s", err.Error())
	}
}

// run is the main run loop that handles starting the application, destroying
// it, restarts and signals.
func (r *TaskRunner) run() {
	// Predeclare things so we can jump to the RESTART
	var stopCollection chan struct{}
	var handleWaitCh chan *dstructs.WaitResult

	// If we already have a handle, populate the stopCollection and handleWaitCh
	// to fix the invariant that it exists.
	r.handleLock.Lock()
	handleEmpty := r.handle == nil
	r.handleLock.Unlock()

	if !handleEmpty {
		stopCollection = make(chan struct{})
		go r.collectResourceUsageStats(stopCollection)
		handleWaitCh = r.handle.WaitCh()
	}

	for {
		// Do the prestart activities
		prestartResultCh := make(chan bool, 1)
		go r.prestart(prestartResultCh)

	WAIT:
		for {
			select {
			case success := <-prestartResultCh:
				if !success {
					r.cleanup()
					r.setState(structs.TaskStateDead, nil)
					return
				}
			case <-r.startCh:
				// Start the task if not yet started or it is being forced. This logic
				// is necessary because in the case of a restore the handle already
				// exists.
				r.handleLock.Lock()
				handleEmpty := r.handle == nil
				r.handleLock.Unlock()
				if handleEmpty {
					startErr := r.startTask()
					r.restartTracker.SetStartError(startErr)
					if startErr != nil {
						r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr))
						goto RESTART
					}

					// Mark the task as started
					r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
					r.runningLock.Lock()
					r.running = true
					r.runningLock.Unlock()

					if stopCollection == nil {
						stopCollection = make(chan struct{})
						go r.collectResourceUsageStats(stopCollection)
					}

					handleWaitCh = r.handle.WaitCh()
				}

			case waitRes := <-handleWaitCh:
				if waitRes == nil {
					panic("nil wait")
				}

				r.runningLock.Lock()
				r.running = false
				r.runningLock.Unlock()

				// Stop collection of the task's resource usage
				close(stopCollection)

				// Log whether the task was successful or not.
967 r.restartTracker.SetWaitResult(waitRes) 968 r.setState("", r.waitErrorToEvent(waitRes)) 969 if !waitRes.Successful() { 970 r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes) 971 } else { 972 r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID) 973 } 974 975 break WAIT 976 case update := <-r.updateCh: 977 if err := r.handleUpdate(update); err != nil { 978 r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err) 979 } 980 981 case se := <-r.signalCh: 982 r.runningLock.Lock() 983 running := r.running 984 r.runningLock.Unlock() 985 common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID) 986 if !running { 987 // Send no error 988 r.logger.Printf("[DEBUG] client: skipping %s", common) 989 se.result <- nil 990 continue 991 } 992 993 r.logger.Printf("[DEBUG] client: sending %s", common) 994 r.setState(structs.TaskStateRunning, se.e) 995 996 res := r.handle.Signal(se.s) 997 se.result <- res 998 999 case event := <-r.restartCh: 1000 r.runningLock.Lock() 1001 running := r.running 1002 r.runningLock.Unlock() 1003 common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID) 1004 if !running { 1005 r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common) 1006 continue 1007 } 1008 1009 r.logger.Printf("[DEBUG] client: restarting %s: %v", common, event.RestartReason) 1010 r.setState(structs.TaskStateRunning, event) 1011 r.killTask(nil) 1012 1013 close(stopCollection) 1014 1015 if handleWaitCh != nil { 1016 <-handleWaitCh 1017 } 1018 1019 // Since the restart isn't from a failure, restart immediately 1020 // and don't count against the restart policy 1021 r.restartTracker.SetRestartTriggered() 1022 break WAIT 1023 1024 case <-r.destroyCh: 1025 1026 r.runningLock.Lock() 1027 running := r.running 1028 r.runningLock.Unlock() 1029 if !running { 1030 r.cleanup() 1031 r.setState(structs.TaskStateDead, r.destroyEvent) 1032 return 1033 } 1034 1035 // Store the task event that provides context on the task 1036 // destroy. The Killed event is set from the alloc_runner and 1037 // doesn't add detail 1038 var killEvent *structs.TaskEvent 1039 if r.destroyEvent.Type != structs.TaskKilled { 1040 if r.destroyEvent.Type == structs.TaskKilling { 1041 killEvent = r.destroyEvent 1042 } else { 1043 r.setState(structs.TaskStateRunning, r.destroyEvent) 1044 } 1045 } 1046 1047 r.killTask(killEvent) 1048 close(stopCollection) 1049 1050 // Wait for handler to exit before calling cleanup 1051 <-handleWaitCh 1052 r.cleanup() 1053 1054 r.setState(structs.TaskStateDead, nil) 1055 return 1056 } 1057 } 1058 1059 RESTART: 1060 restart := r.shouldRestart() 1061 if !restart { 1062 r.cleanup() 1063 r.setState(structs.TaskStateDead, nil) 1064 return 1065 } 1066 1067 // Clear the handle so a new driver will be created. 1068 r.handleLock.Lock() 1069 r.handle = nil 1070 handleWaitCh = nil 1071 stopCollection = nil 1072 r.handleLock.Unlock() 1073 } 1074 } 1075 1076 // cleanup calls Driver.Cleanup when a task is stopping. Errors are logged. 
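// Cleanup is attempted up to four times while the returned error is
// recoverable, with 1s, 2s and 3s pauses between attempts; the createdResources
// copy is stored back after every attempt so a concurrent SaveState persists
// the latest state.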
1077 func (r *TaskRunner) cleanup() { 1078 1079 drv, err := r.createDriver() 1080 if err != nil { 1081 r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err) 1082 return 1083 } 1084 1085 res := r.getCreatedResources() 1086 1087 ctx := driver.NewExecContext(r.taskDir) 1088 attempts := 1 1089 var cleanupErr error 1090 for retry := true; retry; attempts++ { 1091 cleanupErr = drv.Cleanup(ctx, res) 1092 retry = structs.IsRecoverable(cleanupErr) 1093 1094 // Copy current createdResources state in case SaveState is 1095 // called between retries 1096 r.setCreatedResources(res) 1097 1098 // Retry 3 times with sleeps between 1099 if !retry || attempts > 3 { 1100 break 1101 } 1102 time.Sleep(time.Duration(attempts) * time.Second) 1103 } 1104 1105 if cleanupErr != nil { 1106 r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr) 1107 } 1108 1109 // Unofficial Feature: Forcefully stop the associated container 1110 if err := r.taskRunnerPlus.stopContainer(); err != nil { 1111 r.logger.Printf("[DEBUG] %s", err.Error()) 1112 } 1113 1114 return 1115 } 1116 1117 // shouldRestart returns if the task should restart. If the return value is 1118 // true, the task's restart policy has already been considered and any wait time 1119 // between restarts has been applied. 1120 func (r *TaskRunner) shouldRestart() bool { 1121 state, when := r.restartTracker.GetState() 1122 reason := r.restartTracker.GetReason() 1123 switch state { 1124 case structs.TaskNotRestarting, structs.TaskTerminated: 1125 r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID) 1126 if state == structs.TaskNotRestarting { 1127 r.setState(structs.TaskStateDead, 1128 structs.NewTaskEvent(structs.TaskNotRestarting). 1129 SetRestartReason(reason).SetFailsTask()) 1130 } 1131 return false 1132 case structs.TaskRestarting: 1133 1134 // Unofficial Feature: Forcefully stop the associated container 1135 if err := r.taskRunnerPlus.stopContainer(); err != nil { 1136 r.logger.Printf("[DEBUG] %s", err.Error()) 1137 } 1138 1139 r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when) 1140 r.setState(structs.TaskStatePending, 1141 structs.NewTaskEvent(structs.TaskRestarting). 1142 SetRestartDelay(when). 1143 SetRestartReason(reason)) 1144 default: 1145 r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state) 1146 return false 1147 } 1148 1149 // Sleep but watch for destroy events. 1150 select { 1151 case <-time.After(when): 1152 case <-r.destroyCh: 1153 } 1154 1155 // Destroyed while we were waiting to restart, so abort. 1156 r.destroyLock.Lock() 1157 destroyed := r.destroy 1158 r.destroyLock.Unlock() 1159 if destroyed { 1160 r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name) 1161 r.setState(structs.TaskStateDead, r.destroyEvent) 1162 return false 1163 } 1164 1165 return true 1166 } 1167 1168 // killTask kills the running task. A killing event can optionally be passed and 1169 // this event is used to mark the task as being killed. It provides a means to 1170 // store extra information. 
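// Concretely: a TaskKilling event carrying the computed kill timeout is
// emitted, the handle is destroyed via handleDestroy (exponential backoff, at
// most killFailureLimit attempts), the task is marked as not running, and the
// unofficial taskRunnerPlus hooks (gRPC signal, container stop) run before the
// final TaskKilled event is stored.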
func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
	r.runningLock.Lock()
	running := r.running
	r.runningLock.Unlock()
	if !running {
		return
	}

	// Get the kill timeout
	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)

	// Build the event
	var event *structs.TaskEvent
	if killingEvent != nil {
		event = killingEvent
		event.Type = structs.TaskKilling
	} else {
		event = structs.NewTaskEvent(structs.TaskKilling)
	}
	event.SetKillTimeout(timeout)

	// Mark that we received the kill event
	r.setState(structs.TaskStateRunning, event)

	// Kill the task using an exponential backoff in case of failures.
	destroySuccess, err := r.handleDestroy()
	if !destroySuccess {
		// We couldn't successfully destroy the resource created.
		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
	}

	r.runningLock.Lock()
	r.running = false
	r.runningLock.Unlock()

	// TODO: Send GRPC signal to container
	r.taskRunnerPlus.SendGRPCSignal(5 * time.Second)

	// Unofficial Feature: Forcefully stop the associated container if still running
	if err := r.taskRunnerPlus.stopContainer(); err != nil {
		r.logger.Printf("[DEBUG] %s", err.Error())
	}

	// Store that the task has been destroyed and any associated error.
	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))
}

// startTask creates the driver, task dir, and starts the task.
func (r *TaskRunner) startTask() error {

	// Create a driver
	drv, err := r.createDriver()
	if err != nil {
		return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
			r.task.Name, r.alloc.ID, err)
	}

	// Since the raw_exec driver has no resource allocation support, we
	// introduce a memory availability check before the task is started.
	// This fires a SetupFailure event if the memory requirement is not met.
	if r.task.Driver == "raw_exec" {
		expectedMemStr := r.getTaskEnv().Env[r.taskRunnerPlus.MemoryAllocEnvKey]
		if len(expectedMemStr) > 0 {
			expectedMem, _ := strconv.Atoi(expectedMemStr)
			err := r.taskRunnerPlus.KillOnLowMemory(expectedMem, func() error {
				wrapped := fmt.Errorf("insufficient memory")
				r.killTask(structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(wrapped))
				r.logger.Printf("[DEBUG] client: insufficient memory for raw_exec task.
Task will be killed") 1239 return wrapped 1240 }) 1241 if err != nil { 1242 return err 1243 } 1244 } 1245 } 1246 1247 // Run prestart 1248 ctx := driver.NewExecContext(r.taskDir) 1249 res, err := drv.Prestart(ctx, r.task) 1250 1251 // Merge newly created resources into previously created resources 1252 r.createdResourcesLock.Lock() 1253 r.createdResources.Merge(res) 1254 r.createdResourcesLock.Unlock() 1255 1256 if err != nil { 1257 wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v", 1258 r.task.Name, r.alloc.ID, err) 1259 r.logger.Printf("[WARN] client: error from prestart: %s", wrapped) 1260 return structs.WrapRecoverable(wrapped, err) 1261 } 1262 1263 // Start the job 1264 handle, err := drv.Start(ctx, r.task) 1265 if err != nil { 1266 wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v", 1267 r.task.Name, r.alloc.ID, err) 1268 r.logger.Printf("[WARN] client: %s", wrapped) 1269 return structs.WrapRecoverable(wrapped, err) 1270 1271 } 1272 1273 r.handleLock.Lock() 1274 r.handle = handle 1275 r.handleLock.Unlock() 1276 return nil 1277 } 1278 1279 // buildTaskDir creates the task directory before driver.Prestart. It is safe 1280 // to call multiple times as its state is persisted. 1281 func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error { 1282 r.persistLock.Lock() 1283 built := r.taskDirBuilt 1284 r.persistLock.Unlock() 1285 1286 // We do not set the state again since this only occurs during restoration 1287 // and the task dir is already built. The reason we call Build again is to 1288 // ensure that the task dir invariants are still held. 1289 if !built { 1290 r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskSetup). 1291 SetMessage(structs.TaskBuildingTaskDir)) 1292 } 1293 1294 chroot := config.DefaultChrootEnv 1295 if len(r.config.ChrootEnv) > 0 { 1296 chroot = r.config.ChrootEnv 1297 } 1298 if err := r.taskDir.Build(built, chroot, fsi); err != nil { 1299 return err 1300 } 1301 1302 // Mark task dir as successfully built 1303 r.persistLock.Lock() 1304 r.taskDirBuilt = true 1305 r.persistLock.Unlock() 1306 return nil 1307 } 1308 1309 // collectResourceUsageStats starts collecting resource usage stats of a Task. 1310 // Collection ends when the passed channel is closed 1311 func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) { 1312 // start collecting the stats right away and then start collecting every 1313 // collection interval 1314 next := time.NewTimer(0) 1315 defer next.Stop() 1316 for { 1317 select { 1318 case <-next.C: 1319 next.Reset(r.config.StatsCollectionInterval) 1320 if r.handle == nil { 1321 continue 1322 } 1323 ru, err := r.handle.Stats() 1324 1325 if err != nil { 1326 // Check if the driver doesn't implement stats 1327 if err.Error() == driver.DriverStatsNotImplemented.Error() { 1328 r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID) 1329 return 1330 } 1331 1332 // We do not log when the plugin is shutdown as this is simply a 1333 // race between the stopCollection channel being closed and calling 1334 // Stats on the handle. 
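				// (The string being matched is the text of net/rpc's
				// ErrShutdown, which is what the plugin-backed driver handle
				// typically returns once the executor has gone away.)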
1335 if !strings.Contains(err.Error(), "connection is shut down") { 1336 r.logger.Printf("[WARN] client: error fetching stats of task %v: %v", r.task.Name, err) 1337 } 1338 continue 1339 } 1340 1341 r.resourceUsageLock.Lock() 1342 r.resourceUsage = ru 1343 r.resourceUsageLock.Unlock() 1344 if ru != nil { 1345 r.emitStats(ru) 1346 } 1347 case <-stopCollection: 1348 return 1349 } 1350 } 1351 } 1352 1353 // LatestResourceUsage returns the last resource utilization datapoint collected 1354 func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage { 1355 r.resourceUsageLock.RLock() 1356 defer r.resourceUsageLock.RUnlock() 1357 r.runningLock.Lock() 1358 defer r.runningLock.Unlock() 1359 1360 // If the task is not running there can be no latest resource 1361 if !r.running { 1362 return nil 1363 } 1364 1365 return r.resourceUsage 1366 } 1367 1368 // handleUpdate takes an updated allocation and updates internal state to 1369 // reflect the new config for the task. 1370 func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { 1371 // Extract the task group from the alloc. 1372 tg := update.Job.LookupTaskGroup(update.TaskGroup) 1373 if tg == nil { 1374 return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup) 1375 } 1376 1377 // Extract the task. 1378 var updatedTask *structs.Task 1379 for _, t := range tg.Tasks { 1380 if t.Name == r.task.Name { 1381 updatedTask = t.Copy() 1382 } 1383 } 1384 if updatedTask == nil { 1385 return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name) 1386 } 1387 1388 // Merge in the task resources 1389 updatedTask.Resources = update.TaskResources[updatedTask.Name] 1390 1391 // Update will update resources and store the new kill timeout. 1392 var mErr multierror.Error 1393 r.handleLock.Lock() 1394 if r.handle != nil { 1395 if err := r.handle.Update(updatedTask); err != nil { 1396 mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err)) 1397 } 1398 } 1399 r.handleLock.Unlock() 1400 1401 // Update the restart policy. 1402 if r.restartTracker != nil { 1403 r.restartTracker.SetPolicy(tg.RestartPolicy) 1404 } 1405 1406 // Store the updated alloc. 1407 r.alloc = update 1408 r.task = updatedTask 1409 return mErr.ErrorOrNil() 1410 } 1411 1412 // handleDestroy kills the task handle. In the case that killing fails, 1413 // handleDestroy will retry with an exponential backoff and will give up at a 1414 // given limit. It returns whether the task was destroyed and the error 1415 // associated with the last kill attempt. 1416 func (r *TaskRunner) handleDestroy() (destroyed bool, err error) { 1417 // Cap the number of times we attempt to kill the task. 1418 for i := 0; i < killFailureLimit; i++ { 1419 if err = r.handle.Kill(); err != nil { 1420 // Calculate the new backoff 1421 backoff := (1 << (2 * uint64(i))) * killBackoffBaseline 1422 if backoff > killBackoffLimit { 1423 backoff = killBackoffLimit 1424 } 1425 1426 r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. 
Retrying in %v: %v", 1427 r.task.Name, r.alloc.ID, backoff, err) 1428 time.Sleep(time.Duration(backoff)) 1429 } else { 1430 // Kill was successful 1431 return true, nil 1432 } 1433 } 1434 return 1435 } 1436 1437 // Restart will restart the task 1438 func (r *TaskRunner) Restart(source, reason string) { 1439 reasonStr := fmt.Sprintf("%s: %s", source, reason) 1440 event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr) 1441 1442 select { 1443 case r.restartCh <- event: 1444 case <-r.waitCh: 1445 } 1446 } 1447 1448 // Signal will send a signal to the task 1449 func (r *TaskRunner) Signal(source, reason string, s os.Signal) error { 1450 1451 reasonStr := fmt.Sprintf("%s: %s", source, reason) 1452 event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr) 1453 1454 resCh := make(chan error) 1455 se := SignalEvent{ 1456 s: s, 1457 e: event, 1458 result: resCh, 1459 } 1460 1461 select { 1462 case r.signalCh <- se: 1463 case <-r.waitCh: 1464 } 1465 1466 return <-resCh 1467 } 1468 1469 // Kill will kill a task and store the error, no longer restarting the task. If 1470 // fail is set, the task is marked as having failed. 1471 func (r *TaskRunner) Kill(source, reason string, fail bool) { 1472 reasonStr := fmt.Sprintf("%s: %s", source, reason) 1473 event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr) 1474 if fail { 1475 event.SetFailsTask() 1476 } 1477 1478 r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr) 1479 r.Destroy(event) 1480 } 1481 1482 // UnblockStart unblocks the starting of the task. It currently assumes only 1483 // consul-template will unblock 1484 func (r *TaskRunner) UnblockStart(source string) { 1485 r.unblockLock.Lock() 1486 defer r.unblockLock.Unlock() 1487 if r.unblocked { 1488 return 1489 } 1490 1491 r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source) 1492 r.unblocked = true 1493 close(r.unblockCh) 1494 } 1495 1496 // Helper function for converting a WaitResult into a TaskTerminated event. 1497 func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent { 1498 return structs.NewTaskEvent(structs.TaskTerminated). 1499 SetExitCode(res.ExitCode). 1500 SetSignal(res.Signal). 1501 SetExitMessage(res.Err) 1502 } 1503 1504 // Update is used to update the task of the context 1505 func (r *TaskRunner) Update(update *structs.Allocation) { 1506 select { 1507 case r.updateCh <- update: 1508 default: 1509 r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')", 1510 r.task.Name, r.alloc.ID) 1511 } 1512 } 1513 1514 // Destroy is used to indicate that the task context should be destroyed. The 1515 // event parameter provides a context for the destroy. 1516 func (r *TaskRunner) Destroy(event *structs.TaskEvent) { 1517 r.destroyLock.Lock() 1518 defer r.destroyLock.Unlock() 1519 1520 if r.destroy { 1521 return 1522 } 1523 r.destroy = true 1524 r.destroyEvent = event 1525 close(r.destroyCh) 1526 } 1527 1528 // getCreatedResources returns the resources created by drivers. It will never 1529 // return nil. 
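// The value handed back is a copy taken under createdResourcesLock, so callers
// such as SaveState and cleanup can use it without racing against startTask
// merging in newly created resources.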
1530 func (r *TaskRunner) getCreatedResources() *driver.CreatedResources { 1531 r.createdResourcesLock.Lock() 1532 if r.createdResources == nil { 1533 r.createdResources = driver.NewCreatedResources() 1534 } 1535 cr := r.createdResources.Copy() 1536 r.createdResourcesLock.Unlock() 1537 1538 return cr 1539 } 1540 1541 // setCreatedResources updates the resources created by drivers. If passed nil 1542 // it will set createdResources to an initialized struct. 1543 func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) { 1544 if cr == nil { 1545 cr = driver.NewCreatedResources() 1546 } 1547 r.createdResourcesLock.Lock() 1548 r.createdResources = cr.Copy() 1549 r.createdResourcesLock.Unlock() 1550 } 1551 1552 // emitStats emits resource usage stats of tasks to remote metrics collector 1553 // sinks 1554 func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { 1555 if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics { 1556 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS)) 1557 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache)) 1558 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap)) 1559 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage)) 1560 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage)) 1561 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage)) 1562 } 1563 1564 if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics { 1565 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent)) 1566 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode)) 1567 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode)) 1568 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime)) 1569 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods)) 1570 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks)) 1571 } 1572 }
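
// runOneTask is an illustrative sketch and is not referenced by this package:
// it shows how a caller such as the alloc runner can drive a TaskRunner using
// only the exported API above. The channel wiring here (updates, done) is an
// assumption made for the example, not how client/alloc_runner.go is actually
// structured.
func runOneTask(logger *log.Logger, cfg *config.Config, updater TaskStateUpdater,
	taskDir *allocdir.TaskDir, alloc *structs.Allocation, task *structs.Task,
	vc vaultclient.VaultClient, updates <-chan *structs.Allocation, done <-chan struct{}) {

	tr := NewTaskRunner(logger, cfg, updater, taskDir, alloc, task, vc)
	if tr == nil {
		return // the alloc's job did not contain the expected task group
	}
	tr.MarkReceived() // report the initial pending state
	go tr.Run()       // the run loop owns the task from here on

	for {
		select {
		case update := <-updates:
			tr.Update(update) // non-blocking; the update is dropped if the buffer is full
		case <-done:
			// Ask the task to stop; Run() cleans up and closes WaitCh when finished.
			tr.Destroy(structs.NewTaskEvent(structs.TaskKilled))
			<-tr.WaitCh()
			return
		case <-tr.WaitCh():
			return // the run loop exited on its own
		}
	}
}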