github.com/taylorchu/nomad@v0.5.3-rc1.0.20170407200202-db11e7dd7b55/client/task_runner.go

package client

import (
	"crypto/md5"
	"encoding/hex"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	"github.com/golang/snappy"
	"github.com/hashicorp/consul-template/signals"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/driver"
	"github.com/hashicorp/nomad/client/getter"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/nomad/structs"

	"github.com/hashicorp/nomad/client/driver/env"
	dstructs "github.com/hashicorp/nomad/client/driver/structs"
	cstructs "github.com/hashicorp/nomad/client/structs"
)

const (
	// killBackoffBaseline is the baseline time for exponential backoff while
	// killing a task.
	killBackoffBaseline = 5 * time.Second

	// killBackoffLimit is the limit of the exponential backoff for killing
	// the task.
	killBackoffLimit = 2 * time.Minute

	// killFailureLimit is how many times we will attempt to kill a task before
	// giving up and potentially leaking resources.
	killFailureLimit = 5

	// vaultBackoffBaseline is the baseline time for exponential backoff when
	// attempting to retrieve a Vault token
	vaultBackoffBaseline = 5 * time.Second

	// vaultBackoffLimit is the limit of the exponential backoff when attempting
	// to retrieve a Vault token
	vaultBackoffLimit = 3 * time.Minute

	// vaultTokenFile is the name of the file holding the Vault token inside the
	// task's secret directory
	vaultTokenFile = "vault_token"
)

// TaskRunner is used to wrap a task within an allocation and provide the execution context.
type TaskRunner struct {
	config         *config.Config
	updater        TaskStateUpdater
	logger         *log.Logger
	alloc          *structs.Allocation
	restartTracker *RestartTracker

	// running marks whether the task is running
	running     bool
	runningLock sync.Mutex

	resourceUsage     *cstructs.TaskResourceUsage
	resourceUsageLock sync.RWMutex

	task    *structs.Task
	taskDir *allocdir.TaskDir

	// taskEnv holds the task's environment variables
	taskEnv     *env.TaskEnvironment
	taskEnvLock sync.Mutex

	// updateCh is used to receive updated versions of the allocation
	updateCh chan *structs.Allocation

	handle     driver.DriverHandle
	handleLock sync.Mutex

	// artifactsDownloaded tracks whether the task's artifacts have been
	// downloaded
	//
	// Must acquire persistLock when accessing
	artifactsDownloaded bool

	// taskDirBuilt tracks whether the task has built its directory.
	//
	// Must acquire persistLock when accessing
	taskDirBuilt bool

	// createdResources are all the resources created by the task driver
	// across all attempts to start the task.
	// Simple gets and sets should use {get,set}CreatedResources
	createdResources     *driver.CreatedResources
	createdResourcesLock sync.Mutex

	// payloadRendered tracks whether the payload has been rendered to disk
	payloadRendered bool

	// vaultFuture is the means to wait for and get a Vault token
	vaultFuture *tokenFuture

	// recoveredVaultToken is the token that was recovered through a restore
	recoveredVaultToken string

	// vaultClient is used to retrieve and renew any needed Vault token
	vaultClient vaultclient.VaultClient

	// templateManager is used to manage any consul-templates this task may have
	templateManager *TaskTemplateManager

	// startCh is used to trigger the start of the task
	startCh chan struct{}

	// unblockCh is used to unblock the starting of the task
	unblockCh   chan struct{}
	unblocked   bool
	unblockLock sync.Mutex

	// restartCh is used to restart a task
	restartCh chan *structs.TaskEvent

	// signalCh is used to send a signal to a task
	signalCh chan SignalEvent

	destroy      bool
	destroyCh    chan struct{}
	destroyLock  sync.Mutex
	destroyEvent *structs.TaskEvent

	// waitCh closing marks the run loop as having exited
	waitCh chan struct{}

	// persistLock must be acquired when accessing fields stored by
	// SaveState. SaveState is called asynchronously to TaskRunner.Run by
	// AllocRunner, so all state fields must be synchronized using this
	// lock.
	persistLock sync.Mutex
}

// taskRunnerState is used to snapshot the state of the task runner
type taskRunnerState struct {
	Version            string
	Task               *structs.Task
	HandleID           string
	ArtifactDownloaded bool
	TaskDirBuilt       bool
	CreatedResources   *driver.CreatedResources
	PayloadRendered    bool
}

// TaskStateUpdater is used to signal that a task's state has changed.
type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent)

// SignalEvent is a tuple of the signal and the event generating it
type SignalEvent struct {
	// s is the signal to be sent
	s os.Signal

	// e is the task event generating the signal
	e *structs.TaskEvent

	// result should be used to send back the result of the signal
	result chan<- error
}

// NewTaskRunner is used to create a new task context
func NewTaskRunner(logger *log.Logger, config *config.Config,
	updater TaskStateUpdater, taskDir *allocdir.TaskDir,
	alloc *structs.Allocation, task *structs.Task,
	vaultClient vaultclient.VaultClient) *TaskRunner {

	// Merge in the task resources
	task.Resources = alloc.TaskResources[task.Name]

	// Build the restart tracker.
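	// The restart policy lives on the task group, so the group must be
	// resolved before the tracker can be created.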
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup)
		return nil
	}
	restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)

	tc := &TaskRunner{
		config:           config,
		updater:          updater,
		logger:           logger,
		restartTracker:   restartTracker,
		alloc:            alloc,
		task:             task,
		taskDir:          taskDir,
		createdResources: driver.NewCreatedResources(),
		vaultClient:      vaultClient,
		vaultFuture:      NewTokenFuture().Set(""),
		updateCh:         make(chan *structs.Allocation, 64),
		destroyCh:        make(chan struct{}),
		waitCh:           make(chan struct{}),
		startCh:          make(chan struct{}, 1),
		unblockCh:        make(chan struct{}),
		restartCh:        make(chan *structs.TaskEvent),
		signalCh:         make(chan SignalEvent),
	}

	return tc
}

// MarkReceived marks the task as received.
func (r *TaskRunner) MarkReceived() {
	r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived))
}

// WaitCh returns a channel to wait for termination
func (r *TaskRunner) WaitCh() <-chan struct{} {
	return r.waitCh
}

// stateFilePath returns the path to our state file
func (r *TaskRunner) stateFilePath() string {
	// Get the MD5 of the task name
	hashVal := md5.Sum([]byte(r.task.Name))
	hashHex := hex.EncodeToString(hashVal[:])
	dirName := fmt.Sprintf("task-%s", hashHex)

	// Generate the path
	path := filepath.Join(r.config.StateDir, "alloc", r.alloc.ID,
		dirName, "state.json")
	return path
}

// RestoreState is used to restore our state
func (r *TaskRunner) RestoreState() error {
	// Load the snapshot
	var snap taskRunnerState
	if err := restoreState(r.stateFilePath(), &snap); err != nil {
		return err
	}

	// Restore fields
	if snap.Task == nil {
		return fmt.Errorf("task runner snapshot includes nil Task")
	} else {
		r.task = snap.Task
	}
	r.artifactsDownloaded = snap.ArtifactDownloaded
	r.taskDirBuilt = snap.TaskDirBuilt
	r.payloadRendered = snap.PayloadRendered

	r.setCreatedResources(snap.CreatedResources)

	if err := r.setTaskEnv(); err != nil {
		return fmt.Errorf("client: failed to create task environment for task %q in allocation %q: %v",
			r.task.Name, r.alloc.ID, err)
	}

	if r.task.Vault != nil {
		// Read the token from the secret directory
		tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
		data, err := ioutil.ReadFile(tokenPath)
		if err != nil {
			if !os.IsNotExist(err) {
				return fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
			}

			// Token file doesn't exist
		} else {
			// Store the recovered token
			r.recoveredVaultToken = string(data)
		}
	}

	// Restore the driver
	if snap.HandleID != "" {
		d, err := r.createDriver()
		if err != nil {
			return err
		}

		ctx := driver.NewExecContext(r.taskDir)
		handle, err := d.Open(ctx, snap.HandleID)

		// In the case it fails, we relaunch the task in the Run() method.
		if err != nil {
			r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
				r.task.Name, r.alloc.ID, err)
			return nil
		}
		r.handleLock.Lock()
		r.handle = handle
		r.handleLock.Unlock()

		r.runningLock.Lock()
		r.running = true
		r.runningLock.Unlock()
	}
	return nil
}

// SaveState is used to snapshot our state
func (r *TaskRunner) SaveState() error {
	r.persistLock.Lock()
	defer r.persistLock.Unlock()

	snap := taskRunnerState{
		Task:               r.task,
		Version:            r.config.Version,
		ArtifactDownloaded: r.artifactsDownloaded,
		TaskDirBuilt:       r.taskDirBuilt,
		PayloadRendered:    r.payloadRendered,
		CreatedResources:   r.getCreatedResources(),
	}

	r.handleLock.Lock()
	if r.handle != nil {
		snap.HandleID = r.handle.ID()
	}
	r.handleLock.Unlock()
	return persistState(r.stateFilePath(), &snap)
}

// DestroyState is used to clean up after ourselves
func (r *TaskRunner) DestroyState() error {
	r.persistLock.Lock()
	defer r.persistLock.Unlock()

	return os.RemoveAll(r.stateFilePath())
}

// setState is used to update the state of the task runner
func (r *TaskRunner) setState(state string, event *structs.TaskEvent) {
	// Persist our state to disk.
	if err := r.SaveState(); err != nil {
		r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
	}

	// Indicate the task has been updated.
	r.updater(r.task.Name, state, event)
}

// setTaskEnv sets the task environment. It returns an error if it could not be
// created.
func (r *TaskRunner) setTaskEnv() error {
	r.taskEnvLock.Lock()
	defer r.taskEnvLock.Unlock()

	taskEnv, err := driver.GetTaskEnv(r.taskDir, r.config.Node,
		r.task.Copy(), r.alloc, r.config, r.vaultFuture.Get())
	if err != nil {
		return err
	}
	r.taskEnv = taskEnv
	return nil
}

// getTaskEnv returns the task environment
func (r *TaskRunner) getTaskEnv() *env.TaskEnvironment {
	r.taskEnvLock.Lock()
	defer r.taskEnvLock.Unlock()
	return r.taskEnv
}

// createDriver makes a driver for the task
func (r *TaskRunner) createDriver() (driver.Driver, error) {
	env := r.getTaskEnv()
	if env == nil {
		return nil, fmt.Errorf("task environment not made for task %q in allocation %q", r.task.Name, r.alloc.ID)
	}

	// Create a task-specific event emitter callback to expose minimal
	// state to drivers
	eventEmitter := func(m string, args ...interface{}) {
		msg := fmt.Sprintf(m, args...)
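		// Surface the driver message both in the client log and as a task
		// event so it is visible in the task's state.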
		r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg))
	}

	driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, env, eventEmitter)
	driver, err := driver.NewDriver(r.task.Driver, driverCtx)
	if err != nil {
		return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
			r.task.Driver, r.alloc.ID, err)
	}
	return driver, err
}

// Run is a long running routine used to manage the task
func (r *TaskRunner) Run() {
	defer close(r.waitCh)
	r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
		r.task.Name, r.alloc.ID)

	// Create the initial environment; this will be recreated if a Vault token
	// is needed
	if err := r.setTaskEnv(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err))
		return
	}

	if err := r.validateTask(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask())
		return
	}

	// Create a driver so that we can determine the FSIsolation required
	drv, err := r.createDriver()
	if err != nil {
		e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
		return
	}

	// Build base task directory structure regardless of FS isolation abilities.
	// This needs to happen before we start the Vault manager and call prestart
	// as both of those can write to the task directories
	if err := r.buildTaskDir(drv.FSIsolation()); err != nil {
		e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
		return
	}

	// If there is no Vault policy, leave the static future created in
	// NewTaskRunner
	if r.task.Vault != nil {
		// Start the go-routine to get a Vault token
		r.vaultFuture.Clear()
		go r.vaultManager(r.recoveredVaultToken)
	}

	// Start the run loop
	r.run()

	// Do any cleanup necessary
	r.postrun()

	return
}

// validateTask validates the fields of the task and returns an error if the
// task is invalid.
func (r *TaskRunner) validateTask() error {
	var mErr multierror.Error

	// Validate the user.
	unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
	checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
	if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
		if _, unallowed := unallowedUsers[r.task.User]; unallowed {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
		}
	}

	// Validate the artifacts
	for i, artifact := range r.task.Artifacts {
		// Verify the artifact doesn't escape the task directory.
		if err := artifact.Validate(); err != nil {
			// If this error occurs there is potentially a server bug or
			// malicious server spoofing.
			r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
				r.alloc.ID, r.task.Name, artifact, i, err)
			mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
		}
	}

	// Validate the Service names
	for i, service := range r.task.Services {
		name := r.taskEnv.ReplaceEnv(service.Name)
		if err := service.ValidateName(name); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
		}
	}

	if len(mErr.Errors) == 1 {
		return mErr.Errors[0]
	}
	return mErr.ErrorOrNil()
}

// tokenFuture stores the Vault token and allows consumers to block until a
// valid token exists
type tokenFuture struct {
	waiting []chan struct{}
	token   string
	set     bool
	m       sync.Mutex
}

// NewTokenFuture returns a new token future without any token set
func NewTokenFuture() *tokenFuture {
	return &tokenFuture{}
}

// Wait returns a channel that can be waited on. When this channel unblocks, a
// valid token will be available via the Get method
func (f *tokenFuture) Wait() <-chan struct{} {
	f.m.Lock()
	defer f.m.Unlock()

	c := make(chan struct{})
	if f.set {
		close(c)
		return c
	}

	f.waiting = append(f.waiting, c)
	return c
}

// Set sets the token value and unblocks any caller of Wait
func (f *tokenFuture) Set(token string) *tokenFuture {
	f.m.Lock()
	defer f.m.Unlock()

	f.set = true
	f.token = token
	for _, w := range f.waiting {
		close(w)
	}
	f.waiting = nil
	return f
}

// Clear clears the set Vault token.
func (f *tokenFuture) Clear() *tokenFuture {
	f.m.Lock()
	defer f.m.Unlock()

	f.token = ""
	f.set = false
	return f
}

// Get returns the set Vault token
func (f *tokenFuture) Get() string {
	f.m.Lock()
	defer f.m.Unlock()
	return f.token
}

// vaultManager should be called in a go-routine and manages the derivation,
// renewal and handling of errors with the Vault token. The optional parameter
// allows setting the initial Vault token. This is useful when the Vault token
// is recovered off disk.
func (r *TaskRunner) vaultManager(token string) {
	// Helper for stopping token renewal
	stopRenewal := func() {
		if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
			r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
		}
	}

	// updatedToken lets us store state between loops. If true, a new token
	// has been retrieved and we need to apply the Vault change mode
	var updatedToken bool

OUTER:
	for {
		// Check if we should exit
		select {
		case <-r.waitCh:
			stopRenewal()
			return
		default:
		}

		// Clear the token
		r.vaultFuture.Clear()

		// Check if there already is a token which can be the case for
		// restoring the TaskRunner
		if token == "" {
			// Get a token
			var exit bool
			token, exit = r.deriveVaultToken()
			if exit {
				// Exit the manager
				return
			}

			// Write the token to disk
			if err := r.writeToken(token); err != nil {
				e := fmt.Errorf("failed to write Vault token to disk")
				r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
				r.Kill("vault", e.Error(), true)
				return
			}
		}

		// Start the renewal process
		renewCh, err := r.vaultClient.RenewToken(token, 30)

		// An error returned means the token is not being renewed
		if err != nil {
			r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
			token = ""
			goto OUTER
		}

		// The Vault token is valid now, so set it
		r.vaultFuture.Set(token)

		if updatedToken {
			switch r.task.Vault.ChangeMode {
			case structs.VaultChangeModeSignal:
				s, err := signals.Parse(r.task.Vault.ChangeSignal)
				if err != nil {
					e := fmt.Errorf("failed to parse signal: %v", err)
					r.logger.Printf("[ERR] client: %v", err)
					r.Kill("vault", e.Error(), true)
					return
				}

				if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
					r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
					r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
					return
				}
			case structs.VaultChangeModeRestart:
				r.Restart("vault", "new Vault token acquired")
			case structs.VaultChangeModeNoop:
				fallthrough
			default:
				r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
			}

			// We have handled it
			updatedToken = false

			// Call the handler
			r.updatedTokenHandler()
		}

		// Start watching for renewal errors
		select {
		case err := <-renewCh:
			// Clear the token
			token = ""
			r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
			stopRenewal()

			// Check if we have to do anything
			if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
				updatedToken = true
			}
		case <-r.waitCh:
			stopRenewal()
			return
		}
	}
}

// deriveVaultToken derives the Vault token using exponential backoffs. It
// returns the Vault token and whether the manager should exit.
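// The retry delay grows as (1 << (2 * attempts)) * vaultBackoffBaseline, i.e.
// 5s, 20s, 80s, ..., and is capped at vaultBackoffLimit (3 minutes).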
func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
	attempts := 0
	for {
		tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
		if err == nil {
			return tokens[r.task.Name], false
		}

		// Check if we can't recover from the error
		if !structs.IsRecoverable(err) {
			r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
				r.task.Name, r.alloc.ID, err)
			r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
			return "", true
		}

		// Handle the retry case
		backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
		if backoff > vaultBackoffLimit {
			backoff = vaultBackoffLimit
		}
		r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
			r.task.Name, r.alloc.ID, err, backoff)

		attempts++

		// Wait until retrying
		select {
		case <-r.waitCh:
			return "", true
		case <-time.After(backoff):
		}
	}
}

// writeToken writes the given token to disk
func (r *TaskRunner) writeToken(token string) error {
	tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
	if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
		return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
	}

	return nil
}

// updatedTokenHandler is called when a new Vault token is retrieved. Things
// that rely on the token should be updated here.
func (r *TaskRunner) updatedTokenHandler() {

	// Update the task's environment
	if err := r.setTaskEnv(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
		return
	}

	if r.templateManager != nil {
		r.templateManager.Stop()

		// Create a new templateManager
		var err error
		r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
			r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.getTaskEnv())
		if err != nil {
			err := fmt.Errorf("failed to build task's template manager: %v", err)
			r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
			r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
			r.Kill("vault", err.Error(), true)
			return
		}
	}
}

// prestart handles life-cycle tasks that occur before the task has started.
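// It blocks until a Vault token is available (when one is required), renders
// any dispatch payload, downloads artifacts, and builds the template manager,
// then reports on resultCh whether the task may proceed to start.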
func (r *TaskRunner) prestart(resultCh chan bool) {
	if r.task.Vault != nil {
		// Wait for the token
		r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
		tokenCh := r.vaultFuture.Wait()
		select {
		case <-tokenCh:
		case <-r.waitCh:
			resultCh <- false
			return
		}
		r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
	}

	if err := r.setTaskEnv(); err != nil {
		r.setState(
			structs.TaskStateDead,
			structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
		resultCh <- false
		return
	}

	// If the job is a dispatch job and there is a payload, write it to disk
	requirePayload := len(r.alloc.Job.Payload) != 0 &&
		(r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
	if !r.payloadRendered && requirePayload {
		renderTo := filepath.Join(r.taskDir.LocalDir, r.task.DispatchPayload.File)
		decoded, err := snappy.Decode(nil, r.alloc.Job.Payload)
		if err != nil {
			r.setState(
				structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
			resultCh <- false
			return
		}

		if err := os.MkdirAll(filepath.Dir(renderTo), 07777); err != nil {
			r.setState(
				structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
			resultCh <- false
			return
		}

		if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
			r.setState(
				structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
			resultCh <- false
			return
		}

		r.payloadRendered = true
	}

	for {
		r.persistLock.Lock()
		downloaded := r.artifactsDownloaded
		r.persistLock.Unlock()

		// Download the task's artifacts
		if !downloaded && len(r.task.Artifacts) > 0 {
			r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
			for _, artifact := range r.task.Artifacts {
				if err := getter.GetArtifact(r.getTaskEnv(), artifact, r.taskDir.Dir); err != nil {
					wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
					r.logger.Printf("[DEBUG] client: %v", wrapped)
					r.setState(structs.TaskStatePending,
						structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped))
					r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
					goto RESTART
				}
			}

			r.persistLock.Lock()
			r.artifactsDownloaded = true
			r.persistLock.Unlock()
		}

		// We don't have to wait for any template
		if len(r.task.Templates) == 0 {
			// Send the start signal
			select {
			case r.startCh <- struct{}{}:
			default:
			}

			resultCh <- true
			return
		}

		// Build the template manager
		if r.templateManager == nil {
			var err error
			r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
				r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.getTaskEnv())
			if err != nil {
				err := fmt.Errorf("failed to build task's template manager: %v", err)
				r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
				r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
				resultCh <- false
				return
			}
		}

		// Block for consul-template
		// TODO Hooks should register themselves as blocking and then we can
		// periodically enumerate what we are still blocked on
		select {
		case <-r.unblockCh:
			// Send the start signal
			select {
			case r.startCh <- struct{}{}:
			default:
			}

			resultCh <- true
			return
		case <-r.waitCh:
			// The run loop has exited so exit too
			resultCh <- false
			return
		}

	RESTART:
		restart := r.shouldRestart()
		if !restart {
			resultCh <- false
			return
		}
	}
}

// postrun is used to do any cleanup that is necessary after exiting the run loop
func (r *TaskRunner) postrun() {
	// Stop the template manager
	if r.templateManager != nil {
		r.templateManager.Stop()
	}
}

// run is the main run loop that handles starting the application, destroying
// it, restarts and signals.
func (r *TaskRunner) run() {
	// Predeclare things so we can jump to the RESTART
	var stopCollection chan struct{}
	var handleWaitCh chan *dstructs.WaitResult

	// If we already have a handle, populate the stopCollection and handleWaitCh
	// to fix the invariant that it exists.
	r.handleLock.Lock()
	handleEmpty := r.handle == nil
	r.handleLock.Unlock()

	if !handleEmpty {
		stopCollection = make(chan struct{})
		go r.collectResourceUsageStats(stopCollection)
		handleWaitCh = r.handle.WaitCh()
	}

	for {
		// Do the prestart activities
		prestartResultCh := make(chan bool, 1)
		go r.prestart(prestartResultCh)

	WAIT:
		for {
			select {
			case success := <-prestartResultCh:
				if !success {
					r.cleanup()
					r.setState(structs.TaskStateDead, nil)
					return
				}
			case <-r.startCh:
				// Start the task if not yet started or it is being forced. This logic
				// is necessary because in the case of a restore the handle already
				// exists.
				r.handleLock.Lock()
				handleEmpty := r.handle == nil
				r.handleLock.Unlock()
				if handleEmpty {
					startErr := r.startTask()
					r.restartTracker.SetStartError(startErr)
					if startErr != nil {
						r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr))
						goto RESTART
					}

					// Mark the task as started
					r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
					r.runningLock.Lock()
					r.running = true
					r.runningLock.Unlock()

					if stopCollection == nil {
						stopCollection = make(chan struct{})
						go r.collectResourceUsageStats(stopCollection)
					}

					handleWaitCh = r.handle.WaitCh()
				}

			case waitRes := <-handleWaitCh:
				if waitRes == nil {
					panic("nil wait")
				}

				r.runningLock.Lock()
				r.running = false
				r.runningLock.Unlock()

				// Stop collection of the task's resource usage
				close(stopCollection)

				// Log whether the task was successful or not.
				r.restartTracker.SetWaitResult(waitRes)
				r.setState("", r.waitErrorToEvent(waitRes))
				if !waitRes.Successful() {
					r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
				} else {
					r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
				}

				break WAIT
			case update := <-r.updateCh:
				if err := r.handleUpdate(update); err != nil {
					r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
				}

			case se := <-r.signalCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
				if !running {
					// Send no error
					r.logger.Printf("[DEBUG] client: skipping %s", common)
					se.result <- nil
					continue
				}

				r.logger.Printf("[DEBUG] client: sending %s", common)
				r.setState(structs.TaskStateRunning, se.e)

				res := r.handle.Signal(se.s)
				se.result <- res

			case event := <-r.restartCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
				if !running {
					r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
					continue
				}

				r.logger.Printf("[DEBUG] client: restarting %s: %v", common, event.RestartReason)
				r.setState(structs.TaskStateRunning, event)
				r.killTask(nil)

				close(stopCollection)

				if handleWaitCh != nil {
					<-handleWaitCh
				}

				// Since the restart isn't from a failure, restart immediately
				// and don't count against the restart policy
				r.restartTracker.SetRestartTriggered()
				break WAIT

			case <-r.destroyCh:
				r.runningLock.Lock()
				running := r.running
				r.runningLock.Unlock()
				if !running {
					r.cleanup()
					r.setState(structs.TaskStateDead, r.destroyEvent)
					return
				}

				// Store the task event that provides context on the task
				// destroy. The Killed event is set from the alloc_runner and
				// doesn't add detail
				var killEvent *structs.TaskEvent
				if r.destroyEvent.Type != structs.TaskKilled {
					if r.destroyEvent.Type == structs.TaskKilling {
						killEvent = r.destroyEvent
					} else {
						r.setState(structs.TaskStateRunning, r.destroyEvent)
					}
				}

				r.killTask(killEvent)
				close(stopCollection)

				// Wait for handler to exit before calling cleanup
				<-handleWaitCh
				r.cleanup()

				r.setState(structs.TaskStateDead, nil)
				return
			}
		}

	RESTART:
		restart := r.shouldRestart()
		if !restart {
			r.cleanup()
			r.setState(structs.TaskStateDead, nil)
			return
		}

		// Clear the handle so a new driver will be created.
		r.handleLock.Lock()
		r.handle = nil
		handleWaitCh = nil
		stopCollection = nil
		r.handleLock.Unlock()
	}
}

// cleanup calls Driver.Cleanup when a task is stopping. Errors are logged.
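// Recoverable cleanup errors are retried up to three times with a linearly
// increasing sleep between attempts.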
func (r *TaskRunner) cleanup() {
	drv, err := r.createDriver()
	if err != nil {
		r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
		return
	}

	res := r.getCreatedResources()

	ctx := driver.NewExecContext(r.taskDir)
	attempts := 1
	var cleanupErr error
	for retry := true; retry; attempts++ {
		cleanupErr = drv.Cleanup(ctx, res)
		retry = structs.IsRecoverable(cleanupErr)

		// Copy current createdResources state in case SaveState is
		// called between retries
		r.setCreatedResources(res)

		// Retry 3 times with sleeps between
		if !retry || attempts > 3 {
			break
		}
		time.Sleep(time.Duration(attempts) * time.Second)
	}

	if cleanupErr != nil {
		r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
	}
	return
}

// shouldRestart returns whether the task should restart. If the return value is
// true, the task's restart policy has already been considered and any wait time
// between restarts has been applied.
func (r *TaskRunner) shouldRestart() bool {
	state, when := r.restartTracker.GetState()
	reason := r.restartTracker.GetReason()
	switch state {
	case structs.TaskNotRestarting, structs.TaskTerminated:
		r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
		if state == structs.TaskNotRestarting {
			r.setState(structs.TaskStateDead,
				structs.NewTaskEvent(structs.TaskNotRestarting).
					SetRestartReason(reason).SetFailsTask())
		}
		return false
	case structs.TaskRestarting:
		r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
		r.setState(structs.TaskStatePending,
			structs.NewTaskEvent(structs.TaskRestarting).
				SetRestartDelay(when).
				SetRestartReason(reason))
	default:
		r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
		return false
	}

	// Sleep but watch for destroy events.
	select {
	case <-time.After(when):
	case <-r.destroyCh:
	}

	// Destroyed while we were waiting to restart, so abort.
	r.destroyLock.Lock()
	destroyed := r.destroy
	r.destroyLock.Unlock()
	if destroyed {
		r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
		r.setState(structs.TaskStateDead, r.destroyEvent)
		return false
	}

	return true
}

// killTask kills the running task. A killing event can optionally be passed and
// this event is used to mark the task as being killed. It provides a means to
// store extra information.
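// The task is marked TaskKilling with the computed kill timeout, the handle is
// destroyed via handleDestroy's backoff, and a TaskKilled event records any
// error from the final attempt.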
func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
	r.runningLock.Lock()
	running := r.running
	r.runningLock.Unlock()
	if !running {
		return
	}

	// Get the kill timeout
	timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)

	// Build the event
	var event *structs.TaskEvent
	if killingEvent != nil {
		event = killingEvent
		event.Type = structs.TaskKilling
	} else {
		event = structs.NewTaskEvent(structs.TaskKilling)
	}
	event.SetKillTimeout(timeout)

	// Mark that we received the kill event
	r.setState(structs.TaskStateRunning, event)

	// Kill the task using an exponential backoff in case of failures.
	destroySuccess, err := r.handleDestroy()
	if !destroySuccess {
		// We couldn't successfully destroy the resource created.
		r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
	}

	r.runningLock.Lock()
	r.running = false
	r.runningLock.Unlock()

	// Store that the task has been destroyed and any associated error.
	r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))
}

// startTask creates the driver, task dir, and starts the task.
func (r *TaskRunner) startTask() error {
	// Create a driver
	drv, err := r.createDriver()
	if err != nil {
		return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
			r.task.Name, r.alloc.ID, err)
	}

	// Run prestart
	ctx := driver.NewExecContext(r.taskDir)
	res, err := drv.Prestart(ctx, r.task)

	// Merge newly created resources into previously created resources
	r.createdResourcesLock.Lock()
	r.createdResources.Merge(res)
	r.createdResourcesLock.Unlock()

	if err != nil {
		wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
			r.task.Name, r.alloc.ID, err)
		r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
		return structs.WrapRecoverable(wrapped, err)
	}

	// Start the job
	handle, err := drv.Start(ctx, r.task)
	if err != nil {
		wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
			r.task.Name, r.alloc.ID, err)
		r.logger.Printf("[WARN] client: %s", wrapped)
		return structs.WrapRecoverable(wrapped, err)
	}

	r.handleLock.Lock()
	r.handle = handle
	r.handleLock.Unlock()
	return nil
}

// buildTaskDir creates the task directory before driver.Prestart. It is safe
// to call multiple times as its state is persisted.
func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
	r.persistLock.Lock()
	built := r.taskDirBuilt
	r.persistLock.Unlock()

	// We do not set the state again since this only occurs during restoration
	// and the task dir is already built. The reason we call Build again is to
	// ensure that the task dir invariants are still held.
	if !built {
		r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskSetup).
			SetMessage(structs.TaskBuildingTaskDir))
	}

	chroot := config.DefaultChrootEnv
	if len(r.config.ChrootEnv) > 0 {
		chroot = r.config.ChrootEnv
	}
	if err := r.taskDir.Build(built, chroot, fsi); err != nil {
		return err
	}

	// Mark task dir as successfully built
	r.persistLock.Lock()
	r.taskDirBuilt = true
	r.persistLock.Unlock()
	return nil
}

// collectResourceUsageStats starts collecting resource usage stats of a Task.
// Collection ends when the passed channel is closed
func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
	// Start collecting stats right away and then on every
	// collection interval
	next := time.NewTimer(0)
	defer next.Stop()
	for {
		select {
		case <-next.C:
			next.Reset(r.config.StatsCollectionInterval)
			if r.handle == nil {
				continue
			}
			ru, err := r.handle.Stats()

			if err != nil {
				// Check if the driver doesn't implement stats
				if err.Error() == driver.DriverStatsNotImplemented.Error() {
					r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
					return
				}

				// We do not log when the plugin is shut down as this is simply a
				// race between the stopCollection channel being closed and calling
				// Stats on the handle.
				if !strings.Contains(err.Error(), "connection is shut down") {
					r.logger.Printf("[WARN] client: error fetching stats of task %v: %v", r.task.Name, err)
				}
				continue
			}

			r.resourceUsageLock.Lock()
			r.resourceUsage = ru
			r.resourceUsageLock.Unlock()
			if ru != nil {
				r.emitStats(ru)
			}
		case <-stopCollection:
			return
		}
	}
}

// LatestResourceUsage returns the last resource utilization datapoint collected
func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
	r.resourceUsageLock.RLock()
	defer r.resourceUsageLock.RUnlock()
	r.runningLock.Lock()
	defer r.runningLock.Unlock()

	// If the task is not running there can be no latest resource
	if !r.running {
		return nil
	}

	return r.resourceUsage
}

// handleUpdate takes an updated allocation and updates internal state to
// reflect the new config for the task.
func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
	// Extract the task group from the alloc.
	tg := update.Job.LookupTaskGroup(update.TaskGroup)
	if tg == nil {
		return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
	}

	// Extract the task.
	var updatedTask *structs.Task
	for _, t := range tg.Tasks {
		if t.Name == r.task.Name {
			updatedTask = t.Copy()
		}
	}
	if updatedTask == nil {
		return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
	}

	// Merge in the task resources
	updatedTask.Resources = update.TaskResources[updatedTask.Name]

	// Update will update resources and store the new kill timeout.
	var mErr multierror.Error
	r.handleLock.Lock()
	if r.handle != nil {
		if err := r.handle.Update(updatedTask); err != nil {
			mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
		}
	}
	r.handleLock.Unlock()

	// Update the restart policy.
	if r.restartTracker != nil {
		r.restartTracker.SetPolicy(tg.RestartPolicy)
	}

	// Store the updated alloc.
	r.alloc = update
	r.task = updatedTask
	return mErr.ErrorOrNil()
}

// handleDestroy kills the task handle. In the case that killing fails,
// handleDestroy will retry with an exponential backoff and will give up at a
// given limit. It returns whether the task was destroyed and the error
// associated with the last kill attempt.
func (r *TaskRunner) handleDestroy() (destroyed bool, err error) {
	// Cap the number of times we attempt to kill the task.
	for i := 0; i < killFailureLimit; i++ {
		if err = r.handle.Kill(); err != nil {
			// Calculate the new backoff
			backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
			if backoff > killBackoffLimit {
				backoff = killBackoffLimit
			}

			r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
				r.task.Name, r.alloc.ID, backoff, err)
			time.Sleep(time.Duration(backoff))
		} else {
			// Kill was successful
			return true, nil
		}
	}
	return
}

// Restart will restart the task
func (r *TaskRunner) Restart(source, reason string) {
	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr)

	select {
	case r.restartCh <- event:
	case <-r.waitCh:
	}
}

// Signal will send a signal to the task
func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {

	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)

	resCh := make(chan error)
	se := SignalEvent{
		s:      s,
		e:      event,
		result: resCh,
	}

	select {
	case r.signalCh <- se:
	case <-r.waitCh:
	}

	return <-resCh
}

// Kill will kill a task and store the error, no longer restarting the task. If
// fail is set, the task is marked as having failed.
func (r *TaskRunner) Kill(source, reason string, fail bool) {
	reasonStr := fmt.Sprintf("%s: %s", source, reason)
	event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
	if fail {
		event.SetFailsTask()
	}

	r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
	r.Destroy(event)
}

// UnblockStart unblocks the starting of the task. It currently assumes only
// consul-template will unblock
func (r *TaskRunner) UnblockStart(source string) {
	r.unblockLock.Lock()
	defer r.unblockLock.Unlock()
	if r.unblocked {
		return
	}

	r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
	r.unblocked = true
	close(r.unblockCh)
}

// Helper function for converting a WaitResult into a TaskTerminated event.
func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
	return structs.NewTaskEvent(structs.TaskTerminated).
		SetExitCode(res.ExitCode).
		SetSignal(res.Signal).
		SetExitMessage(res.Err)
}

// Update is used to update the task of the context
func (r *TaskRunner) Update(update *structs.Allocation) {
	select {
	case r.updateCh <- update:
	default:
		r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
			r.task.Name, r.alloc.ID)
	}
}

// Destroy is used to indicate that the task context should be destroyed. The
// event parameter provides a context for the destroy.
func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
	r.destroyLock.Lock()
	defer r.destroyLock.Unlock()

	if r.destroy {
		return
	}
	r.destroy = true
	r.destroyEvent = event
	close(r.destroyCh)
}

// getCreatedResources returns the resources created by drivers. It will never
// return nil.
func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
	r.createdResourcesLock.Lock()
	if r.createdResources == nil {
		r.createdResources = driver.NewCreatedResources()
	}
	cr := r.createdResources.Copy()
	r.createdResourcesLock.Unlock()

	return cr
}

// setCreatedResources updates the resources created by drivers. If passed nil
// it will set createdResources to an initialized struct.
func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
	if cr == nil {
		cr = driver.NewCreatedResources()
	}
	r.createdResourcesLock.Lock()
	r.createdResources = cr.Copy()
	r.createdResourcesLock.Unlock()
}

// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
	if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics {
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
	}

	if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics {
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
		metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
	}
}