github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/client/task_runner.go

package client

import (
    "crypto/md5"
    "encoding/hex"
    "fmt"
    "io/ioutil"
    "log"
    "os"
    "path/filepath"
    "strings"
    "sync"
    "time"

    "github.com/armon/go-metrics"
    "github.com/hashicorp/consul-template/signals"
    "github.com/hashicorp/go-multierror"
    "github.com/hashicorp/nomad/client/config"
    "github.com/hashicorp/nomad/client/driver"
    "github.com/hashicorp/nomad/client/getter"
    "github.com/hashicorp/nomad/client/vaultclient"
    "github.com/hashicorp/nomad/nomad/structs"

    "github.com/hashicorp/nomad/client/driver/env"
    dstructs "github.com/hashicorp/nomad/client/driver/structs"
    cstructs "github.com/hashicorp/nomad/client/structs"
)

const (
    // killBackoffBaseline is the baseline time for exponential backoff while
    // killing a task.
    killBackoffBaseline = 5 * time.Second

    // killBackoffLimit is the limit of the exponential backoff for killing
    // the task.
    killBackoffLimit = 2 * time.Minute

    // killFailureLimit is how many times we will attempt to kill a task before
    // giving up and potentially leaking resources.
    killFailureLimit = 5

    // vaultBackoffBaseline is the baseline time for exponential backoff when
    // attempting to retrieve a Vault token
    vaultBackoffBaseline = 5 * time.Second

    // vaultBackoffLimit is the limit of the exponential backoff when attempting
    // to retrieve a Vault token
    vaultBackoffLimit = 3 * time.Minute

    // vaultTokenFile is the name of the file holding the Vault token inside the
    // task's secret directory
    vaultTokenFile = "vault_token"
)

// TaskRunner is used to wrap a task within an allocation and provide the
// execution context.
type TaskRunner struct {
    config         *config.Config
    updater        TaskStateUpdater
    logger         *log.Logger
    ctx            *driver.ExecContext
    alloc          *structs.Allocation
    restartTracker *RestartTracker

    // running marks whether the task is running
    running     bool
    runningLock sync.Mutex

    resourceUsage     *cstructs.TaskResourceUsage
    resourceUsageLock sync.RWMutex

    task    *structs.Task
    taskDir string

    // taskEnv is the environment variables of the task
    taskEnv     *env.TaskEnvironment
    taskEnvLock sync.Mutex

    // updateCh is used to receive updated versions of the allocation
    updateCh chan *structs.Allocation

    handle     driver.DriverHandle
    handleLock sync.Mutex

    // artifactsDownloaded tracks whether the tasks artifacts have been
    // downloaded
    artifactsDownloaded bool

    // vaultFuture is the means to wait for and get a Vault token
    vaultFuture *tokenFuture

    // recoveredVaultToken is the token that was recovered through a restore
    recoveredVaultToken string

    // vaultClient is used to retrieve and renew any needed Vault token
    vaultClient vaultclient.VaultClient

    // templateManager is used to manage any consul-templates this task may have
    templateManager *TaskTemplateManager

    // startCh is used to trigger the start of the task
    startCh chan struct{}

    // unblockCh is used to unblock the starting of the task
    unblockCh   chan struct{}
    unblocked   bool
    unblockLock sync.Mutex

    // restartCh is used to restart a task
    restartCh chan *structs.TaskEvent

    // signalCh is used to send a signal to a task
    signalCh chan SignalEvent

    destroy      bool
    destroyCh    chan struct{}
    destroyLock  sync.Mutex
    destroyEvent *structs.TaskEvent

    // waitCh closing marks the run loop as having exited
    waitCh chan struct{}

    // serialize SaveState calls
    persistLock sync.Mutex
}

// taskRunnerState is used to snapshot the state of the task runner
type taskRunnerState struct {
    Version            string
    Task               *structs.Task
    HandleID           string
    ArtifactDownloaded bool
}

// TaskStateUpdater is used to signal that tasks state has changed.
type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent)

// SignalEvent is a tuple of the signal and the event generating it
type SignalEvent struct {
    // s is the signal to be sent
    s os.Signal

    // e is the task event generating the signal
    e *structs.TaskEvent

    // result should be used to send back the result of the signal
    result chan<- error
}
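
// A TaskRunner is driven by its owner (the allocation runner) through the
// exported methods below. A minimal sketch of that lifecycle, using only the
// API in this file (the surrounding wiring and variable names are assumed,
// not shown here):
//
//    tr := NewTaskRunner(logger, conf, updater, ctx, alloc, task, vaultClient)
//    tr.MarkReceived()   // report the task as pending/received
//    go tr.Run()         // run loop: prestart, start, restarts, signals
//    tr.Update(newAlloc) // push updated allocations (non-blocking)
//    tr.Destroy(event)   // request teardown; Kill() wraps this with an event
//    <-tr.WaitCh()       // closed once the run loop has exited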

// NewTaskRunner is used to create a new task context
func NewTaskRunner(logger *log.Logger, config *config.Config,
    updater TaskStateUpdater, ctx *driver.ExecContext,
    alloc *structs.Allocation, task *structs.Task,
    vaultClient vaultclient.VaultClient) *TaskRunner {

    // Merge in the task resources
    task.Resources = alloc.TaskResources[task.Name]

    // Build the restart tracker.
    tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
    if tg == nil {
        logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup)
        return nil
    }
    restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)

    // Get the task directory
    taskDir, ok := ctx.AllocDir.TaskDirs[task.Name]
    if !ok {
        logger.Printf("[ERR] client: task directory for alloc %q task %q couldn't be found", alloc.ID, task.Name)
        return nil
    }

    tc := &TaskRunner{
        config:         config,
        updater:        updater,
        logger:         logger,
        restartTracker: restartTracker,
        ctx:            ctx,
        alloc:          alloc,
        task:           task,
        taskDir:        taskDir,
        vaultClient:    vaultClient,
        vaultFuture:    NewTokenFuture().Set(""),
        updateCh:       make(chan *structs.Allocation, 64),
        destroyCh:      make(chan struct{}),
        waitCh:         make(chan struct{}),
        startCh:        make(chan struct{}, 1),
        unblockCh:      make(chan struct{}),
        restartCh:      make(chan *structs.TaskEvent),
        signalCh:       make(chan SignalEvent),
    }

    return tc
}

// MarkReceived marks the task as received.
func (r *TaskRunner) MarkReceived() {
    r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived))
}

// WaitCh returns a channel to wait for termination
func (r *TaskRunner) WaitCh() <-chan struct{} {
    return r.waitCh
}

// stateFilePath returns the path to our state file
func (r *TaskRunner) stateFilePath() string {
    // Get the MD5 of the task name
    hashVal := md5.Sum([]byte(r.task.Name))
    hashHex := hex.EncodeToString(hashVal[:])
    dirName := fmt.Sprintf("task-%s", hashHex)

    // Generate the path
    path := filepath.Join(r.config.StateDir, "alloc", r.alloc.ID,
        dirName, "state.json")
    return path
}

// RestoreState is used to restore our state
func (r *TaskRunner) RestoreState() error {
    // Load the snapshot
    var snap taskRunnerState
    if err := restoreState(r.stateFilePath(), &snap); err != nil {
        return err
    }

    // Restore fields
    if snap.Task == nil {
        return fmt.Errorf("task runner snapshot includes nil Task")
    } else {
        r.task = snap.Task
    }
    r.artifactsDownloaded = snap.ArtifactDownloaded

    if err := r.setTaskEnv(); err != nil {
        return fmt.Errorf("client: failed to create task environment for task %q in allocation %q: %v",
            r.task.Name, r.alloc.ID, err)
    }

    if r.task.Vault != nil {
        secretDir, err := r.ctx.AllocDir.GetSecretDir(r.task.Name)
        if err != nil {
            return fmt.Errorf("failed to determine task %s secret dir in alloc %q: %v", r.task.Name, r.alloc.ID, err)
        }

        // Read the token from the secret directory
        tokenPath := filepath.Join(secretDir, vaultTokenFile)
        data, err := ioutil.ReadFile(tokenPath)
        if err != nil {
            if !os.IsNotExist(err) {
                return fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
            }

            // Token file doesn't exist
        } else {
            // Store the recovered token
            r.recoveredVaultToken = string(data)
        }
    }

    // Restore the driver
    if snap.HandleID != "" {
        driver, err := r.createDriver()
        if err != nil {
            return err
        }

        handle, err := driver.Open(r.ctx, snap.HandleID)

        // In the case it fails, we relaunch the task in the Run() method.
        if err != nil {
            r.logger.Printf("[ERR] client: failed to open handle to task '%s' for alloc '%s': %v",
                r.task.Name, r.alloc.ID, err)
            return nil
        }
        r.handleLock.Lock()
        r.handle = handle
        r.handleLock.Unlock()

        r.runningLock.Lock()
        r.running = true
        r.runningLock.Unlock()
    }
    return nil
}

// SaveState is used to snapshot our state
func (r *TaskRunner) SaveState() error {
    r.persistLock.Lock()
    defer r.persistLock.Unlock()

    snap := taskRunnerState{
        Task:               r.task,
        Version:            r.config.Version,
        ArtifactDownloaded: r.artifactsDownloaded,
    }
    r.handleLock.Lock()
    if r.handle != nil {
        snap.HandleID = r.handle.ID()
    }
    r.handleLock.Unlock()
    return persistState(r.stateFilePath(), &snap)
}

// DestroyState is used to cleanup after ourselves
func (r *TaskRunner) DestroyState() error {
    return os.RemoveAll(r.stateFilePath())
}

// setState is used to update the state of the task runner
func (r *TaskRunner) setState(state string, event *structs.TaskEvent) {
    // Persist our state to disk.
    if err := r.SaveState(); err != nil {
        r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
    }

    // Indicate the task has been updated.
    r.updater(r.task.Name, state, event)
}

// setTaskEnv sets the task environment. It returns an error if it could not be
// created.
func (r *TaskRunner) setTaskEnv() error {
    r.taskEnvLock.Lock()
    defer r.taskEnvLock.Unlock()

    taskEnv, err := driver.GetTaskEnv(r.ctx.AllocDir, r.config.Node, r.task.Copy(), r.alloc, r.vaultFuture.Get())
    if err != nil {
        return err
    }
    r.taskEnv = taskEnv
    return nil
}

// getTaskEnv returns the task environment
func (r *TaskRunner) getTaskEnv() *env.TaskEnvironment {
    r.taskEnvLock.Lock()
    defer r.taskEnvLock.Unlock()
    return r.taskEnv
}

// createDriver makes a driver for the task
func (r *TaskRunner) createDriver() (driver.Driver, error) {
    env := r.getTaskEnv()
    if env == nil {
        return nil, fmt.Errorf("task environment not made for task %q in allocation %q", r.task.Name, r.alloc.ID)
    }

    driverCtx := driver.NewDriverContext(r.task.Name, r.config, r.config.Node, r.logger, env)
    driver, err := driver.NewDriver(r.task.Driver, driverCtx)
    if err != nil {
        return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
            r.task.Driver, r.alloc.ID, err)
    }
    return driver, err
}

// Run is a long running routine used to manage the task
func (r *TaskRunner) Run() {
    defer close(r.waitCh)
    r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
        r.task.Name, r.alloc.ID)

    // Create the initial environment, this will be recreated if a Vault token
    // is needed
    if err := r.setTaskEnv(); err != nil {
        r.setState(
            structs.TaskStateDead,
            structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err))
        return
    }

    if err := r.validateTask(); err != nil {
        r.setState(
            structs.TaskStateDead,
            structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask())
        return
    }

    // If there is no Vault policy leave the static future created in
    // NewTaskRunner
    if r.task.Vault != nil {
        // Start the go-routine to get a Vault token
        r.vaultFuture.Clear()
        go r.vaultManager(r.recoveredVaultToken)
    }

    // Start the run loop
    r.run()

    // Do any cleanup necessary
    r.postrun()

    return
}

// validateTask validates the fields of the task and returns an error if the
// task is invalid.
func (r *TaskRunner) validateTask() error {
    var mErr multierror.Error

    // Validate the user.
    unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
    checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
    if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
        if _, unallowed := unallowedUsers[r.task.User]; unallowed {
            mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
        }
    }

    // Validate the artifacts
    for i, artifact := range r.task.Artifacts {
        // Verify the artifact doesn't escape the task directory.
        if err := artifact.Validate(); err != nil {
            // If this error occurs there is potentially a server bug or
            // malicious server spoofing.
            r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
                r.alloc.ID, r.task.Name, artifact, i, err)
            mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
        }
    }

    // Validate the Service names
    for i, service := range r.task.Services {
        name := r.taskEnv.ReplaceEnv(service.Name)
        if err := service.ValidateName(name); err != nil {
            mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
        }
    }

    if len(mErr.Errors) == 1 {
        return mErr.Errors[0]
    }
    return mErr.ErrorOrNil()
}

// tokenFuture stores the Vault token and allows consumers to block until a
// valid token exists
type tokenFuture struct {
    waiting []chan struct{}
    token   string
    set     bool
    m       sync.Mutex
}

// NewTokenFuture returns a new token future without any token set
func NewTokenFuture() *tokenFuture {
    return &tokenFuture{}
}

// Wait returns a channel that can be waited on. When this channel unblocks, a
// valid token will be available via the Get method
func (f *tokenFuture) Wait() <-chan struct{} {
    f.m.Lock()
    defer f.m.Unlock()

    c := make(chan struct{})
    if f.set {
        close(c)
        return c
    }

    f.waiting = append(f.waiting, c)
    return c
}

// Set sets the token value and unblocks any caller of Wait
func (f *tokenFuture) Set(token string) *tokenFuture {
    f.m.Lock()
    defer f.m.Unlock()

    f.set = true
    f.token = token
    for _, w := range f.waiting {
        close(w)
    }
    f.waiting = nil
    return f
}

// Clear clears the set vault token.
func (f *tokenFuture) Clear() *tokenFuture {
    f.m.Lock()
    defer f.m.Unlock()

    f.token = ""
    f.set = false
    return f
}

// Get returns the set Vault token
func (f *tokenFuture) Get() string {
    f.m.Lock()
    defer f.m.Unlock()
    return f.token
}
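
// Rough sequence of how the future is used by its two sides (illustrative
// only; see vaultManager and prestart below for the real call sites):
//
//    f := NewTokenFuture().Set("") // static future when no Vault policy is set
//    f.Clear()                     // producer: invalidate before (re)deriving
//    go func() { f.Set(token) }()  // producer: publish once renewal is running
//    <-f.Wait()                    // consumer: blocks until Set has been called
//    _ = f.Get()                   // consumer: returns the current token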

// vaultManager should be called in a go-routine and manages the derivation,
// renewal and handling of errors with the Vault token. The optional parameter
// allows setting the initial Vault token. This is useful when the Vault token
// is recovered off disk.
func (r *TaskRunner) vaultManager(token string) {
    // updatedToken lets us store state between loops. If true, a new token
    // has been retrieved and we need to apply the Vault change mode
    var updatedToken bool

OUTER:
    for {
        // Check if we should exit
        select {
        case <-r.waitCh:
            return
        default:
        }

        // Clear the token
        r.vaultFuture.Clear()

        // Check if there already is a token which can be the case for
        // restoring the TaskRunner
        if token == "" {
            // Get a token
            var exit bool
            token, exit = r.deriveVaultToken()
            if exit {
                // Exit the manager
                return
            }

            // Write the token to disk
            if err := r.writeToken(token); err != nil {
                e := fmt.Errorf("failed to write Vault token to disk")
                r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
                r.Kill("vault", e.Error(), true)
                return
            }
        }

        // Start the renewal process
        renewCh, err := r.vaultClient.RenewToken(token, 30)

        // An error returned means the token is not being renewed
        if err != nil {
            r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
            token = ""
            goto OUTER
        }

        // The Vault token is valid now, so set it
        r.vaultFuture.Set(token)

        if updatedToken {
            switch r.task.Vault.ChangeMode {
            case structs.VaultChangeModeSignal:
                s, err := signals.Parse(r.task.Vault.ChangeSignal)
                if err != nil {
                    e := fmt.Errorf("failed to parse signal: %v", err)
                    r.logger.Printf("[ERR] client: %v", e)
                    r.Kill("vault", e.Error(), true)
                    return
                }

                if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
                    r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
                    r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
                    return
                }
            case structs.VaultChangeModeRestart:
                r.Restart("vault", "new Vault token acquired")
            case structs.VaultChangeModeNoop:
                fallthrough
            default:
                r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
            }

            // We have handled it
            updatedToken = false

            // Call the handler
            r.updatedTokenHandler()
        }

        // Start watching for renewal errors
        select {
        case err := <-renewCh:
            // Clear the token
            token = ""
            r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)

            // Check if we have to do anything
            if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
                updatedToken = true
            }
        case <-r.waitCh:
            return
        }
    }
}
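
// Both retry paths in this file share the same backoff shape,
// (1 << (2 * attempt)) * baseline, capped at a limit, so the delays grow
// roughly as follows (worked out from the constants at the top of the file):
//
//    deriveVaultToken: 5s, 20s, 80s, then capped at vaultBackoffLimit (3m)
//    handleDestroy:    5s, 20s, 80s, then capped at killBackoffLimit (2m),
//                      for at most killFailureLimit (5) attempts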

// deriveVaultToken derives the Vault token using exponential backoffs. It
// returns the Vault token and whether the manager should exit.
func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
    attempts := 0
    for {
        tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
        if err == nil {
            return tokens[r.task.Name], false
        }

        // Check if we can't recover from the error
        if rerr, ok := err.(*structs.RecoverableError); !ok || !rerr.Recoverable {
            r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
                r.task.Name, r.alloc.ID, err)
            r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
            return "", true
        }

        // Handle the retry case
        backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
        if backoff > vaultBackoffLimit {
            backoff = vaultBackoffLimit
        }
        r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
            r.task.Name, r.alloc.ID, err, backoff)

        attempts++

        // Wait before retrying
        select {
        case <-r.waitCh:
            return "", true
        case <-time.After(backoff):
        }
    }
}

// writeToken writes the given token to disk
func (r *TaskRunner) writeToken(token string) error {
    // Write the token to disk
    secretDir, err := r.ctx.AllocDir.GetSecretDir(r.task.Name)
    if err != nil {
        return fmt.Errorf("failed to determine task %s secret dir in alloc %q: %v", r.task.Name, r.alloc.ID, err)
    }

    // Write the token to the file system
    tokenPath := filepath.Join(secretDir, vaultTokenFile)
    if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
        return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
    }

    return nil
}

// updatedTokenHandler is called when a new Vault token is retrieved. Things
// that rely on the token should be updated here.
func (r *TaskRunner) updatedTokenHandler() {

    // Update the task's environment
    if err := r.setTaskEnv(); err != nil {
        r.setState(
            structs.TaskStateDead,
            structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
        return
    }

    if r.templateManager != nil {
        r.templateManager.Stop()

        // Create a new templateManager
        var err error
        r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
            r.config, r.vaultFuture.Get(), r.taskDir, r.getTaskEnv())
        if err != nil {
            err := fmt.Errorf("failed to build task's template manager: %v", err)
            r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
            r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
            r.Kill("vault", err.Error(), true)
            return
        }
    }
}
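
// Summary of the change modes handled by vaultManager when a replacement
// token arrives: "signal" parses Vault.ChangeSignal via signals.Parse and
// delivers it to the task, "restart" restarts the task, and "noop" (or any
// unknown value) only logs. In all non-noop cases updatedTokenHandler then
// rebuilds the task environment and, if present, the template manager so the
// new token is visible to consul-template.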

// prestart handles life-cycle tasks that occur before the task has started.
func (r *TaskRunner) prestart(resultCh chan bool) {

    if r.task.Vault != nil {
        // Wait for the token
        r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
        tokenCh := r.vaultFuture.Wait()
        select {
        case <-tokenCh:
        case <-r.waitCh:
            resultCh <- false
            return
        }
        r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
    }

    if err := r.setTaskEnv(); err != nil {
        r.setState(
            structs.TaskStateDead,
            structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
        resultCh <- false
        return
    }

    for {
        // Download the task's artifacts
        if !r.artifactsDownloaded && len(r.task.Artifacts) > 0 {
            r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
            for _, artifact := range r.task.Artifacts {
                if err := getter.GetArtifact(r.getTaskEnv(), artifact, r.taskDir); err != nil {
                    wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
                    r.setState(structs.TaskStatePending,
                        structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped))
                    r.restartTracker.SetStartError(structs.NewRecoverableError(wrapped, true))
                    goto RESTART
                }
            }

            r.artifactsDownloaded = true
        }

        // We don't have to wait for any template
        if len(r.task.Templates) == 0 {
            // Send the start signal
            select {
            case r.startCh <- struct{}{}:
            default:
            }

            resultCh <- true
            return
        }

        // Build the template manager
        if r.templateManager == nil {
            var err error
            r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
                r.config, r.vaultFuture.Get(), r.taskDir, r.getTaskEnv())
            if err != nil {
                err := fmt.Errorf("failed to build task's template manager: %v", err)
                r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
                r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
                resultCh <- false
                return
            }
        }

        // Block for consul-template
        // TODO Hooks should register themselves as blocking and then we can
        // periodically enumerate what we are still blocked on
        select {
        case <-r.unblockCh:
            // Send the start signal
            select {
            case r.startCh <- struct{}{}:
            default:
            }

            resultCh <- true
            return
        case <-r.waitCh:
            // The run loop has exited so exit too
            resultCh <- false
            return
        }

    RESTART:
        restart := r.shouldRestart()
        if !restart {
            resultCh <- false
            return
        }
    }
}

// postrun is used to do any cleanup that is necessary after exiting the runloop
func (r *TaskRunner) postrun() {
    // Stop the template manager
    if r.templateManager != nil {
        r.templateManager.Stop()
    }
}
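
// The run loop below multiplexes a handful of event sources inside its WAIT
// select: the prestart result, startCh (start or re-start the driver handle),
// the handle's WaitCh (task exit), updateCh (allocation updates), signalCh,
// restartCh, and destroyCh. Every path that terminates the loop funnels
// through setState(structs.TaskStateDead, ...) before returning.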

// run is the main run loop that handles starting the application, destroying
// it, restarts and signals.
func (r *TaskRunner) run() {
    // Predeclare things so we can jump to the RESTART
    var stopCollection chan struct{}
    var handleWaitCh chan *dstructs.WaitResult

    for {
        // Do the prestart activities
        prestartResultCh := make(chan bool, 1)
        go r.prestart(prestartResultCh)

    WAIT:
        for {
            select {
            case success := <-prestartResultCh:
                if !success {
                    r.setState(structs.TaskStateDead, nil)
                    return
                }
            case <-r.startCh:
                // Start the task if not yet started or it is being forced. This logic
                // is necessary because in the case of a restore the handle already
                // exists.
                r.handleLock.Lock()
                handleEmpty := r.handle == nil
                r.handleLock.Unlock()

                if handleEmpty {
                    startErr := r.startTask()
                    r.restartTracker.SetStartError(startErr)
                    if startErr != nil {
                        r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr))
                        goto RESTART
                    }

                    // Mark the task as started
                    r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
                    r.runningLock.Lock()
                    r.running = true
                    r.runningLock.Unlock()
                }

                if stopCollection == nil {
                    stopCollection = make(chan struct{})
                    go r.collectResourceUsageStats(stopCollection)
                }

                handleWaitCh = r.handle.WaitCh()

            case waitRes := <-handleWaitCh:
                if waitRes == nil {
                    panic("nil wait")
                }

                r.runningLock.Lock()
                r.running = false
                r.runningLock.Unlock()

                // Stop collection of the task's resource usage
                close(stopCollection)

                // Log whether the task was successful or not.
                r.restartTracker.SetWaitResult(waitRes)
                r.setState("", r.waitErrorToEvent(waitRes))
                if !waitRes.Successful() {
                    r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
                } else {
                    r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
                }

                break WAIT
            case update := <-r.updateCh:
                if err := r.handleUpdate(update); err != nil {
                    r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
                }

            case se := <-r.signalCh:
                r.logger.Printf("[DEBUG] client: task being signalled with %v: %s", se.s, se.e.TaskSignalReason)
                r.setState(structs.TaskStateRunning, se.e)

                res := r.handle.Signal(se.s)
                se.result <- res

            case event := <-r.restartCh:
                r.logger.Printf("[DEBUG] client: task being restarted: %s", event.RestartReason)
                r.setState(structs.TaskStateRunning, event)
                r.killTask(nil)

                close(stopCollection)

                if handleWaitCh != nil {
                    <-handleWaitCh
                }

                // Since the restart isn't from a failure, restart immediately
                // and don't count against the restart policy
                r.restartTracker.SetRestartTriggered()
                break WAIT

            case <-r.destroyCh:
                r.runningLock.Lock()
                running := r.running
                r.runningLock.Unlock()
                if !running {
                    r.setState(structs.TaskStateDead, r.destroyEvent)
                    return
                }

                // Store the task event that provides context on the task
                // destroy. The Killed event is set from the alloc_runner and
                // doesn't add detail
                var killEvent *structs.TaskEvent
                if r.destroyEvent.Type != structs.TaskKilled {
                    if r.destroyEvent.Type == structs.TaskKilling {
                        killEvent = r.destroyEvent
                    } else {
                        r.setState(structs.TaskStateRunning, r.destroyEvent)
                    }
                }

                r.killTask(killEvent)
                close(stopCollection)
                r.setState(structs.TaskStateDead, nil)
                return
            }
        }

    RESTART:
        restart := r.shouldRestart()
        if !restart {
            r.setState(structs.TaskStateDead, nil)
            return
        }

        // Clear the handle so a new driver will be created.
        r.handleLock.Lock()
        r.handle = nil
        handleWaitCh = nil
        stopCollection = nil
        r.handleLock.Unlock()
    }
}

// shouldRestart returns if the task should restart. If the return value is
// true, the task's restart policy has already been considered and any wait time
// between restarts has been applied.
func (r *TaskRunner) shouldRestart() bool {
    state, when := r.restartTracker.GetState()
    reason := r.restartTracker.GetReason()
    switch state {
    case structs.TaskNotRestarting, structs.TaskTerminated:
        r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
        if state == structs.TaskNotRestarting {
            r.setState(structs.TaskStateDead,
                structs.NewTaskEvent(structs.TaskNotRestarting).
                    SetRestartReason(reason).SetFailsTask())
        }
        return false
    case structs.TaskRestarting:
        r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
        r.setState(structs.TaskStatePending,
            structs.NewTaskEvent(structs.TaskRestarting).
                SetRestartDelay(when).
                SetRestartReason(reason))
    default:
        r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
        return false
    }

    // Sleep but watch for destroy events.
    select {
    case <-time.After(when):
    case <-r.destroyCh:
    }

    // Destroyed while we were waiting to restart, so abort.
    r.destroyLock.Lock()
    destroyed := r.destroy
    r.destroyLock.Unlock()
    if destroyed {
        r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
        r.setState(structs.TaskStateDead, r.destroyEvent)
        return false
    }

    return true
}

// killTask kills the running task. A killing event can optionally be passed and
// this event is used to mark the task as being killed. It provides a means to
// store extra information.
func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
    r.runningLock.Lock()
    running := r.running
    r.runningLock.Unlock()
    if !running {
        return
    }

    // Get the kill timeout
    timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)

    // Build the event
    var event *structs.TaskEvent
    if killingEvent != nil {
        event = killingEvent
        event.Type = structs.TaskKilling
    } else {
        event = structs.NewTaskEvent(structs.TaskKilling)
    }
    event.SetKillTimeout(timeout)

    // Mark that we received the kill event
    r.setState(structs.TaskStateRunning, event)

    // Kill the task using an exponential backoff in case of failures.
    destroySuccess, err := r.handleDestroy()
    if !destroySuccess {
        // We couldn't successfully destroy the resource created.
        r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
    }

    r.runningLock.Lock()
    r.running = false
    r.runningLock.Unlock()

    // Store that the task has been destroyed and any associated error.
    r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))
}

// startTask creates the driver and starts the task.
func (r *TaskRunner) startTask() error {
    // Create a driver
    driver, err := r.createDriver()
    if err != nil {
        return fmt.Errorf("failed to create driver of task '%s' for alloc '%s': %v",
            r.task.Name, r.alloc.ID, err)
    }

    // Start the job
    handle, err := driver.Start(r.ctx, r.task)
    if err != nil {
        wrapped := fmt.Errorf("failed to start task '%s' for alloc '%s': %v",
            r.task.Name, r.alloc.ID, err)

        r.logger.Printf("[INFO] client: %v", wrapped)

        if rerr, ok := err.(*structs.RecoverableError); ok {
            return structs.NewRecoverableError(wrapped, rerr.Recoverable)
        }

        return wrapped
    }

    r.handleLock.Lock()
    r.handle = handle
    r.handleLock.Unlock()
    return nil
}

// collectResourceUsageStats starts collecting resource usage stats of a Task.
// Collection ends when the passed channel is closed
func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
    // start collecting the stats right away and then start collecting every
    // collection interval
    next := time.NewTimer(0)
    defer next.Stop()
    for {
        select {
        case <-next.C:
            next.Reset(r.config.StatsCollectionInterval)
            if r.handle == nil {
                continue
            }
            ru, err := r.handle.Stats()

            if err != nil {
                // We do not log when the plugin is shutdown as this is simply a
                // race between the stopCollection channel being closed and calling
                // Stats on the handle.
                if !strings.Contains(err.Error(), "connection is shut down") {
                    r.logger.Printf("[WARN] client: error fetching stats of task %v: %v", r.task.Name, err)
                }
                continue
            }

            r.resourceUsageLock.Lock()
            r.resourceUsage = ru
            r.resourceUsageLock.Unlock()
            if ru != nil {
                r.emitStats(ru)
            }
        case <-stopCollection:
            return
        }
    }
}

// LatestResourceUsage returns the last resource utilization datapoint collected
func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
    r.resourceUsageLock.RLock()
    defer r.resourceUsageLock.RUnlock()
    r.runningLock.Lock()
    defer r.runningLock.Unlock()

    // If the task is not running there can be no latest resource
    if !r.running {
        return nil
    }

    return r.resourceUsage
}
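
// Note on in-place updates: handleUpdate below only pushes the merged
// resources and kill timeout to a running driver handle (via handle.Update)
// and refreshes the restart policy; it stores the updated alloc and task but
// never stops or restarts the running task itself.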

// handleUpdate takes an updated allocation and updates internal state to
// reflect the new config for the task.
func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
    // Extract the task group from the alloc.
    tg := update.Job.LookupTaskGroup(update.TaskGroup)
    if tg == nil {
        return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
    }

    // Extract the task.
    var updatedTask *structs.Task
    for _, t := range tg.Tasks {
        if t.Name == r.task.Name {
            updatedTask = t.Copy()
        }
    }
    if updatedTask == nil {
        return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
    }

    // Merge in the task resources
    updatedTask.Resources = update.TaskResources[updatedTask.Name]

    // Update will update resources and store the new kill timeout.
    var mErr multierror.Error
    r.handleLock.Lock()
    if r.handle != nil {
        if err := r.handle.Update(updatedTask); err != nil {
            mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
        }
    }
    r.handleLock.Unlock()

    // Update the restart policy.
    if r.restartTracker != nil {
        r.restartTracker.SetPolicy(tg.RestartPolicy)
    }

    // Store the updated alloc.
    r.alloc = update
    r.task = updatedTask
    return mErr.ErrorOrNil()
}

// handleDestroy kills the task handle. In the case that killing fails,
// handleDestroy will retry with an exponential backoff and will give up at a
// given limit. It returns whether the task was destroyed and the error
// associated with the last kill attempt.
func (r *TaskRunner) handleDestroy() (destroyed bool, err error) {
    // Cap the number of times we attempt to kill the task.
    for i := 0; i < killFailureLimit; i++ {
        if err = r.handle.Kill(); err != nil {
            // Calculate the new backoff
            backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
            if backoff > killBackoffLimit {
                backoff = killBackoffLimit
            }

            r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
                r.task.Name, r.alloc.ID, backoff, err)
            time.Sleep(time.Duration(backoff))
        } else {
            // Kill was successful
            return true, nil
        }
    }
    return
}

// Restart will restart the task
func (r *TaskRunner) Restart(source, reason string) {

    reasonStr := fmt.Sprintf("%s: %s", source, reason)
    event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr)

    r.logger.Printf("[DEBUG] client: restarting task %v for alloc %q: %v",
        r.task.Name, r.alloc.ID, reasonStr)

    r.runningLock.Lock()
    running := r.running
    r.runningLock.Unlock()

    // Drop the restart event
    if !running {
        r.logger.Printf("[DEBUG] client: skipping restart since task isn't running")
        return
    }

    select {
    case r.restartCh <- event:
    case <-r.waitCh:
    }
}

// Signal will send a signal to the task
func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {

    reasonStr := fmt.Sprintf("%s: %s", source, reason)
    event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)

    r.logger.Printf("[DEBUG] client: sending signal %v to task %v for alloc %q", s, r.task.Name, r.alloc.ID)

    r.runningLock.Lock()
    running := r.running
    r.runningLock.Unlock()

    // Drop the signal event
    if !running {
        r.logger.Printf("[DEBUG] client: skipping signal since task isn't running")
        return nil
    }

    resCh := make(chan error)
    se := SignalEvent{
        s:      s,
        e:      event,
        result: resCh,
    }
    select {
    case r.signalCh <- se:
    case <-r.waitCh:
    }

    return <-resCh
}
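
// Kill and Destroy are the programmatic teardown entry points. Kill wraps
// Destroy with a TaskKilling event (optionally marking the task as failed),
// and Destroy is idempotent: it records the event and closes destroyCh once,
// which the run loop above reacts to. For example, the Vault manager in this
// file uses:
//
//    r.Kill("vault", "failed to write Vault token to disk", true)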

// Kill will kill a task and store the error, no longer restarting the task. If
// fail is set, the task is marked as having failed.
func (r *TaskRunner) Kill(source, reason string, fail bool) {
    reasonStr := fmt.Sprintf("%s: %s", source, reason)
    event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
    if fail {
        event.SetFailsTask()
    }

    r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
    r.Destroy(event)
}

// UnblockStart unblocks the starting of the task. It currently assumes only
// consul-template will unblock
func (r *TaskRunner) UnblockStart(source string) {
    r.unblockLock.Lock()
    defer r.unblockLock.Unlock()
    if r.unblocked {
        return
    }

    r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
    r.unblocked = true
    close(r.unblockCh)
}

// Helper function for converting a WaitResult into a TaskTerminated event.
func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
    return structs.NewTaskEvent(structs.TaskTerminated).
        SetExitCode(res.ExitCode).
        SetSignal(res.Signal).
        SetExitMessage(res.Err)
}

// Update is used to update the task of the context
func (r *TaskRunner) Update(update *structs.Allocation) {
    select {
    case r.updateCh <- update:
    default:
        r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
            r.task.Name, r.alloc.ID)
    }
}

// Destroy is used to indicate that the task context should be destroyed. The
// event parameter provides a context for the destroy.
func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
    r.destroyLock.Lock()
    defer r.destroyLock.Unlock()

    if r.destroy {
        return
    }
    r.destroy = true
    r.destroyEvent = event
    close(r.destroyCh)
}

// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
    if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics {
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
    }

    if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics {
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
    }
}