github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/client/task_runner.go

package client

import (
    "bytes"
    "crypto/md5"
    "encoding/hex"
    "fmt"
    "io"
    "io/ioutil"
    "log"
    "os"
    "path/filepath"
    "strings"
    "sync"
    "time"

    "github.com/armon/go-metrics"
    "github.com/boltdb/bolt"
    "github.com/golang/snappy"
    "github.com/hashicorp/consul-template/signals"
    "github.com/hashicorp/go-multierror"
    version "github.com/hashicorp/go-version"
    "github.com/hashicorp/nomad/client/allocdir"
    "github.com/hashicorp/nomad/client/config"
    "github.com/hashicorp/nomad/client/driver"
    "github.com/hashicorp/nomad/client/getter"
    "github.com/hashicorp/nomad/client/vaultclient"
    "github.com/hashicorp/nomad/nomad/structs"
    "github.com/ugorji/go/codec"

    "github.com/hashicorp/nomad/client/driver/env"
    dstructs "github.com/hashicorp/nomad/client/driver/structs"
    cstructs "github.com/hashicorp/nomad/client/structs"
)

const (
    // killBackoffBaseline is the baseline time for exponential backoff while
    // killing a task.
    killBackoffBaseline = 5 * time.Second

    // killBackoffLimit is the limit of the exponential backoff for killing
    // the task.
    killBackoffLimit = 2 * time.Minute

    // killFailureLimit is how many times we will attempt to kill a task before
    // giving up and potentially leaking resources.
    killFailureLimit = 5

    // vaultBackoffBaseline is the baseline time for exponential backoff when
    // attempting to retrieve a Vault token
    vaultBackoffBaseline = 5 * time.Second

    // vaultBackoffLimit is the limit of the exponential backoff when attempting
    // to retrieve a Vault token
    vaultBackoffLimit = 3 * time.Minute

    // vaultTokenFile is the name of the file holding the Vault token inside the
    // task's secret directory
    vaultTokenFile = "vault_token"
)

var (
    // taskRunnerStateAllKey holds all of the task runner's state. At the moment
    // there is no need to split it.
    taskRunnerStateAllKey = []byte("simple-all")
)

// TaskRunner is used to wrap a task within an allocation and provide the execution context.
type TaskRunner struct {
    stateDB        *bolt.DB
    config         *config.Config
    updater        TaskStateUpdater
    logger         *log.Logger
    alloc          *structs.Allocation
    restartTracker *RestartTracker
    consul         ConsulServiceAPI

    // running marks whether the task is running
    running     bool
    runningLock sync.Mutex

    resourceUsage     *cstructs.TaskResourceUsage
    resourceUsageLock sync.RWMutex

    task    *structs.Task
    taskDir *allocdir.TaskDir

    // envBuilder is used to build the task's environment
    envBuilder *env.Builder

    // driverNet is the network information returned by the driver
    driverNet     *cstructs.DriverNetwork
    driverNetLock sync.Mutex

    // updateCh is used to receive updated versions of the allocation
    updateCh chan *structs.Allocation

    handle     driver.DriverHandle
    handleLock sync.Mutex

    // artifactsDownloaded tracks whether the task's artifacts have been
    // downloaded
    //
    // Must acquire persistLock when accessing
    artifactsDownloaded bool

    // taskDirBuilt tracks whether the task has built its directory.
    //
    // Must acquire persistLock when accessing
    taskDirBuilt bool

    // createdResources are all the resources created by the task driver
    // across all attempts to start the task.
    // Simple gets and sets should use {get,set}CreatedResources
    createdResources     *driver.CreatedResources
    createdResourcesLock sync.Mutex

    // payloadRendered tracks whether the payload has been rendered to disk
    payloadRendered bool

    // vaultFuture is the means to wait for and get a Vault token
    vaultFuture *tokenFuture

    // recoveredVaultToken is the token that was recovered through a restore
    recoveredVaultToken string

    // vaultClient is used to retrieve and renew any needed Vault token
    vaultClient vaultclient.VaultClient

    // templateManager is used to manage any consul-templates this task may have
    templateManager *TaskTemplateManager

    // startCh is used to trigger the start of the task
    startCh chan struct{}

    // unblockCh is used to unblock the starting of the task
    unblockCh   chan struct{}
    unblocked   bool
    unblockLock sync.Mutex

    // restartCh is used to restart a task
    restartCh chan *structs.TaskEvent

    // signalCh is used to send a signal to a task
    signalCh chan SignalEvent

    destroy      bool
    destroyCh    chan struct{}
    destroyLock  sync.Mutex
    destroyEvent *structs.TaskEvent

    // waitCh closing marks the run loop as having exited
    waitCh chan struct{}

    // persistLock must be acquired when accessing fields stored by
    // SaveState. SaveState is called asynchronously to TaskRunner.Run by
    // AllocRunner, so all state fields must be synchronized using this
    // lock.
    persistLock sync.Mutex

    // persistedHash is the hash of the last persisted snapshot. It is used to
    // detect if a new snapshot has to be written to disk.
    persistedHash []byte
}

// taskRunnerState is used to snapshot the state of the task runner
type taskRunnerState struct {
    Version            string
    HandleID           string
    ArtifactDownloaded bool
    TaskDirBuilt       bool
    PayloadRendered    bool
    CreatedResources   *driver.CreatedResources
    DriverNetwork      *cstructs.DriverNetwork
}

func (s *taskRunnerState) Hash() []byte {
    h := md5.New()

    io.WriteString(h, s.Version)
    io.WriteString(h, s.HandleID)
    io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded))
    io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt))
    io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered))
    h.Write(s.CreatedResources.Hash())
    h.Write(s.DriverNetwork.Hash())

    return h.Sum(nil)
}

// TaskStateUpdater is used to signal that a task's state has changed.
type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent)

// SignalEvent is a tuple of the signal and the event generating it
type SignalEvent struct {
    // s is the signal to be sent
    s os.Signal

    // e is the task event generating the signal
    e *structs.TaskEvent

    // result should be used to send back the result of the signal
    result chan<- error
}
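
// Example (editor's sketch, not in the upstream source): the shape of a
// TaskStateUpdater callback. The AllocRunner normally supplies one; any
// function with this signature can observe task state transitions, e.g. for
// logging in tests. The helper name is hypothetical.
func exampleLoggingUpdater(logger *log.Logger) TaskStateUpdater {
    return func(taskName, state string, event *structs.TaskEvent) {
        // event may be nil when only the state string changes.
        if event != nil {
            logger.Printf("[DEBUG] example: task %q -> %s (%s)", taskName, state, event.Type)
            return
        }
        logger.Printf("[DEBUG] example: task %q -> %s", taskName, state)
    }
}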

// NewTaskRunner is used to create a new task context
func NewTaskRunner(logger *log.Logger, config *config.Config,
    stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir,
    alloc *structs.Allocation, task *structs.Task,
    vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI) *TaskRunner {

    // Merge in the task resources
    task.Resources = alloc.TaskResources[task.Name]

    // Build the restart tracker.
    tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
    if tg == nil {
        logger.Printf("[ERR] client: alloc '%s' references missing task group '%s'", alloc.ID, alloc.TaskGroup)
        return nil
    }
    restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)

    // Initialize the environment builder
    envBuilder := env.NewBuilder(config.Node, alloc, task, config.Region)

    tc := &TaskRunner{
        config:           config,
        stateDB:          stateDB,
        updater:          updater,
        logger:           logger,
        restartTracker:   restartTracker,
        alloc:            alloc,
        task:             task,
        taskDir:          taskDir,
        envBuilder:       envBuilder,
        createdResources: driver.NewCreatedResources(),
        consul:           consulClient,
        vaultClient:      vaultClient,
        vaultFuture:      NewTokenFuture().Set(""),
        updateCh:         make(chan *structs.Allocation, 64),
        destroyCh:        make(chan struct{}),
        waitCh:           make(chan struct{}),
        startCh:          make(chan struct{}, 1),
        unblockCh:        make(chan struct{}),
        restartCh:        make(chan *structs.TaskEvent),
        signalCh:         make(chan SignalEvent),
    }

    return tc
}

// MarkReceived marks the task as received.
func (r *TaskRunner) MarkReceived() {
    r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived))
}

// WaitCh returns a channel to wait for termination
func (r *TaskRunner) WaitCh() <-chan struct{} {
    return r.waitCh
}

// getHandle returns the task's handle or nil
func (r *TaskRunner) getHandle() driver.DriverHandle {
    r.handleLock.Lock()
    h := r.handle
    r.handleLock.Unlock()
    return h
}

// pre060StateFilePath returns the path to our state file that would have been
// written pre v0.6.0
// COMPAT: Remove in 0.7.0
func (r *TaskRunner) pre060StateFilePath() string {
    // Get the MD5 of the task name
    hashVal := md5.Sum([]byte(r.task.Name))
    hashHex := hex.EncodeToString(hashVal[:])
    dirName := fmt.Sprintf("task-%s", hashHex)

    // Generate the path
    return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json")
}

// RestoreState is used to restore our state. If a non-empty string is returned
// the task is restarted with the string as the reason. This is useful for
// backwards incompatible upgrades that need to restart tasks with a new
// executor.
func (r *TaskRunner) RestoreState() (string, error) {
    // COMPAT: Remove in 0.7.0
    // 0.6.0 transitioned from individual state files to a single BoltDB.
    // The upgrade path is to:
    //   Check if old state exists
    //   If so, restore from that and delete old state
    //   Restore using state database

    var snap taskRunnerState

    // Check if the old snapshot is there
    oldPath := r.pre060StateFilePath()
    if err := pre060RestoreState(oldPath, &snap); err == nil {
        // Delete the old state
        os.RemoveAll(oldPath)
    } else if !os.IsNotExist(err) {
        // Something corrupt in the old state file
        return "", err
    } else {
        // We are doing a normal restore
        err := r.stateDB.View(func(tx *bolt.Tx) error {
            bkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
            if err != nil {
                return fmt.Errorf("failed to get task bucket: %v", err)
            }

            if err := getObject(bkt, taskRunnerStateAllKey, &snap); err != nil {
                return fmt.Errorf("failed to read task runner state: %v", err)
            }
            return nil
        })
        if err != nil {
            return "", err
        }

    }

    // Restore fields from the snapshot
    r.artifactsDownloaded = snap.ArtifactDownloaded
    r.taskDirBuilt = snap.TaskDirBuilt
    r.payloadRendered = snap.PayloadRendered
    r.setCreatedResources(snap.CreatedResources)
    r.driverNet = snap.DriverNetwork

    if r.task.Vault != nil {
        // Read the token from the secret directory
        tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
        data, err := ioutil.ReadFile(tokenPath)
        if err != nil {
            if !os.IsNotExist(err) {
                return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
            }

            // Token file doesn't exist
        } else {
            // Store the recovered token
            r.recoveredVaultToken = string(data)
        }
    }

    // Restore the driver
    restartReason := ""
    if snap.HandleID != "" {
        d, err := r.createDriver()
        if err != nil {
            return "", err
        }

        // Add the restored network driver to the environment
        r.envBuilder.SetDriverNetwork(r.driverNet)

        // Open a connection to the driver handle
        ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
        handle, err := d.Open(ctx, snap.HandleID)

        // In the case it fails, we relaunch the task in the Run() method.
        if err != nil {
            r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v",
                r.task.Name, r.alloc.ID, err)
            return "", nil
        }

        if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) {
            restartReason = pre06ScriptCheckReason
        }

        if err := r.registerServices(d, handle, r.driverNet); err != nil {
            // Don't hard fail here as there's a chance this task
            // registered with Consul properly when it initially
            // started.
            r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v",
                r.task.Name, r.alloc.ID, err)
        }

        r.handleLock.Lock()
        r.handle = handle
        r.handleLock.Unlock()

        r.runningLock.Lock()
        r.running = true
        r.runningLock.Unlock()
    }
    return restartReason, nil
}

// ver06 is used for checking for pre-0.6 script checks
var ver06 = version.Must(version.NewVersion("0.6.0dev"))

// pre06ScriptCheckReason is the restart reason given when a pre-0.6 script
// check is found on an exec/java task.
const pre06ScriptCheckReason = "upgrading pre-0.6 script checks"

// pre06ScriptCheck returns true if version is prior to 0.6.0dev, has a script
// check, and uses exec or java drivers.
func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool {
    if driver != "exec" && driver != "java" && driver != "mock_driver" {
        // Only exec and java are affected
        return false
    }
    v, err := version.NewVersion(ver)
    if err != nil {
        // Treat it as old
        return true
    }
    if !v.LessThan(ver06) {
        // >= 0.6.0dev
        return false
    }
    for _, service := range services {
        for _, check := range service.Checks {
            if check.Type == "script" {
                return true
            }
        }
    }
    return false
}

// SaveState is used to snapshot our state
func (r *TaskRunner) SaveState() error {
    r.destroyLock.Lock()
    defer r.destroyLock.Unlock()
    if r.destroy {
        // Don't save state if already destroyed
        return nil
    }

    r.persistLock.Lock()
    defer r.persistLock.Unlock()
    snap := taskRunnerState{
        Version:            r.config.Version,
        ArtifactDownloaded: r.artifactsDownloaded,
        TaskDirBuilt:       r.taskDirBuilt,
        PayloadRendered:    r.payloadRendered,
        CreatedResources:   r.getCreatedResources(),
    }

    r.handleLock.Lock()
    if r.handle != nil {
        snap.HandleID = r.handle.ID()
    }
    r.handleLock.Unlock()

    r.driverNetLock.Lock()
    snap.DriverNetwork = r.driverNet.Copy()
    r.driverNetLock.Unlock()

    // If nothing has changed avoid the write
    h := snap.Hash()
    if bytes.Equal(h, r.persistedHash) {
        return nil
    }

    // Serialize the object
    var buf bytes.Buffer
    if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil {
        return fmt.Errorf("failed to serialize snapshot: %v", err)
    }

    // Start the transaction.
    return r.stateDB.Batch(func(tx *bolt.Tx) error {
        // Grab the task bucket
        taskBkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name)
        if err != nil {
            return fmt.Errorf("failed to retrieve allocation bucket: %v", err)
        }

        if err := putData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil {
            return fmt.Errorf("failed to write task_runner state: %v", err)
        }

        // Store the hash that was persisted
        tx.OnCommit(func() {
            r.persistedHash = h
        })

        return nil
    })
}

// DestroyState is used to cleanup after ourselves
func (r *TaskRunner) DestroyState() error {
    r.persistLock.Lock()
    defer r.persistLock.Unlock()

    return r.stateDB.Update(func(tx *bolt.Tx) error {
        if err := deleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil {
            return fmt.Errorf("failed to delete task bucket: %v", err)
        }
        return nil
    })
}

// setState is used to update the state of the task runner
func (r *TaskRunner) setState(state string, event *structs.TaskEvent) {
    // Persist our state to disk.
    if err := r.SaveState(); err != nil {
        r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err)
    }

    // Indicate the task has been updated.
    r.updater(r.task.Name, state, event)
}

// createDriver makes a driver for the task
func (r *TaskRunner) createDriver() (driver.Driver, error) {
    // Create a task-specific event emitter callback to expose minimal
    // state to drivers
    eventEmitter := func(m string, args ...interface{}) {
        msg := fmt.Sprintf(m, args...)
        r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg)
        r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg))
    }

    driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, eventEmitter)
    d, err := driver.NewDriver(r.task.Driver, driverCtx)
    if err != nil {
        return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v",
            r.task.Driver, r.alloc.ID, err)
    }

    return d, err
}

// Run is a long running routine used to manage the task
func (r *TaskRunner) Run() {
    defer close(r.waitCh)
    r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')",
        r.task.Name, r.alloc.ID)

    if err := r.validateTask(); err != nil {
        r.setState(
            structs.TaskStateDead,
            structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask())
        return
    }

    // Create a temporary driver so that we can determine the FSIsolation
    // required. run->startTask will create a new driver after the environment
    // has been set up (env vars, templates, artifacts, secrets, etc).
    tmpDrv, err := r.createDriver()
    if err != nil {
        e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err)
        r.setState(
            structs.TaskStateDead,
            structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
        return
    }

    // Build base task directory structure regardless of FS isolation abilities.
    // This needs to happen before we start the Vault manager and call prestart
    // as both those can write to the task directories
    if err := r.buildTaskDir(tmpDrv.FSIsolation()); err != nil {
        e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err)
        r.setState(
            structs.TaskStateDead,
            structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask())
        return
    }

    // If there is no Vault policy leave the static future created in
    // NewTaskRunner
    if r.task.Vault != nil {
        // Start the go-routine to get a Vault token
        r.vaultFuture.Clear()
        go r.vaultManager(r.recoveredVaultToken)
    }

    // Start the run loop
    r.run()

    // Do any cleanup necessary
    r.postrun()

    return
}

// validateTask validates the fields of the task and returns an error if the
// task is invalid.
func (r *TaskRunner) validateTask() error {
    var mErr multierror.Error

    // Validate the user.
    unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist)
    checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers)
    if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch {
        if _, unallowed := unallowedUsers[r.task.User]; unallowed {
            mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User))
        }
    }

    // Validate the artifacts
    for i, artifact := range r.task.Artifacts {
        // Verify the artifact doesn't escape the task directory.
        if err := artifact.Validate(); err != nil {
            // If this error occurs there is potentially a server bug or
            // malicious server spoofing.
            r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v",
                r.alloc.ID, r.task.Name, artifact, i, err)
            mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err))
        }
    }

    // Validate the Service names
    taskEnv := r.envBuilder.Build()
    for i, service := range r.task.Services {
        name := taskEnv.ReplaceEnv(service.Name)
        if err := service.ValidateName(name); err != nil {
            mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err))
        }
    }

    if len(mErr.Errors) == 1 {
        return mErr.Errors[0]
    }
    return mErr.ErrorOrNil()
}

// tokenFuture stores the Vault token and allows consumers to block until a
// valid token exists
type tokenFuture struct {
    waiting []chan struct{}
    token   string
    set     bool
    m       sync.Mutex
}

// NewTokenFuture returns a new token future without any token set
func NewTokenFuture() *tokenFuture {
    return &tokenFuture{}
}

// Wait returns a channel that can be waited on. When this channel unblocks, a
// valid token will be available via the Get method
func (f *tokenFuture) Wait() <-chan struct{} {
    f.m.Lock()
    defer f.m.Unlock()

    c := make(chan struct{})
    if f.set {
        close(c)
        return c
    }

    f.waiting = append(f.waiting, c)
    return c
}

// Set sets the token value and unblocks any caller of Wait
func (f *tokenFuture) Set(token string) *tokenFuture {
    f.m.Lock()
    defer f.m.Unlock()

    f.set = true
    f.token = token
    for _, w := range f.waiting {
        close(w)
    }
    f.waiting = nil
    return f
}

// Clear clears the set vault token.
func (f *tokenFuture) Clear() *tokenFuture {
    f.m.Lock()
    defer f.m.Unlock()

    f.token = ""
    f.set = false
    return f
}

// Get returns the set Vault token
func (f *tokenFuture) Get() string {
    f.m.Lock()
    defer f.m.Unlock()
    return f.token
}
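
// Example (editor's sketch, not in the upstream source): how a consumer of
// tokenFuture can block for a token without waiting forever. Wait returns a
// channel that is closed once Set is called, at which point Get returns the
// token. The helper name and the timeout are arbitrary.
func exampleWaitForToken(f *tokenFuture, timeout time.Duration) (string, bool) {
    select {
    case <-f.Wait():
        // A valid token is now available.
        return f.Get(), true
    case <-time.After(timeout):
        // Gave up waiting; the manager may still set a token later.
        return "", false
    }
}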

// vaultManager should be called in a go-routine and manages the derivation,
// renewal and handling of errors with the Vault token. The optional parameter
// allows setting the initial Vault token. This is useful when the Vault token
// is recovered off disk.
func (r *TaskRunner) vaultManager(token string) {
    // Helper for stopping token renewal
    stopRenewal := func() {
        if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil {
            r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err)
        }
    }

    // updatedToken lets us store state between loops. If true, a new token
    // has been retrieved and we need to apply the Vault change mode
    var updatedToken bool

OUTER:
    for {
        // Check if we should exit
        select {
        case <-r.waitCh:
            stopRenewal()
            return
        default:
        }

        // Clear the token
        r.vaultFuture.Clear()

        // Check if there already is a token, which can be the case when
        // restoring the TaskRunner
        if token == "" {
            // Get a token
            var exit bool
            token, exit = r.deriveVaultToken()
            if exit {
                // Exit the manager
                return
            }

            // Write the token to disk
            if err := r.writeToken(token); err != nil {
                e := fmt.Errorf("failed to write Vault token to disk")
                r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err)
                r.Kill("vault", e.Error(), true)
                return
            }
        }

        // Start the renewal process
        renewCh, err := r.vaultClient.RenewToken(token, 30)

        // An error returned means the token is not being renewed
        if err != nil {
            r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
            token = ""
            goto OUTER
        }

        // The Vault token is valid now, so set it
        r.vaultFuture.Set(token)

        if updatedToken {
            switch r.task.Vault.ChangeMode {
            case structs.VaultChangeModeSignal:
                s, err := signals.Parse(r.task.Vault.ChangeSignal)
                if err != nil {
                    e := fmt.Errorf("failed to parse signal: %v", err)
                    r.logger.Printf("[ERR] client: %v", err)
                    r.Kill("vault", e.Error(), true)
                    return
                }

                if err := r.Signal("vault", "new Vault token acquired", s); err != nil {
                    r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err)
                    r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true)
                    return
                }
            case structs.VaultChangeModeRestart:
                r.Restart("vault", "new Vault token acquired")
            case structs.VaultChangeModeNoop:
                fallthrough
            default:
                r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode)
            }

            // We have handled it
            updatedToken = false

            // Call the handler
            r.updatedTokenHandler()
        }

        // Start watching for renewal errors
        select {
        case err := <-renewCh:
            // Clear the token
            token = ""
            r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err)
            stopRenewal()

            // Check if we have to do anything
            if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop {
                updatedToken = true
            }
        case <-r.waitCh:
            stopRenewal()
            return
        }
    }
}

// deriveVaultToken derives the Vault token using exponential backoffs. It
// returns the Vault token and whether the manager should exit.
func (r *TaskRunner) deriveVaultToken() (token string, exit bool) {
    attempts := 0
    for {
        tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name})
        if err == nil {
            return tokens[r.task.Name], false
        }

        // Check if we can't recover from the error
        if !structs.IsRecoverable(err) {
            r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v",
                r.task.Name, r.alloc.ID, err)
            r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true)
            return "", true
        }

        // Handle the retry case
        backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline
        if backoff > vaultBackoffLimit {
            backoff = vaultBackoffLimit
        }
        r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v",
            r.task.Name, r.alloc.ID, err, backoff)

        attempts++

        // Wait until retrying
        select {
        case <-r.waitCh:
            return "", true
        case <-time.After(backoff):
        }
    }
}
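
// Example (editor's sketch, not in the upstream source): the capped
// exponential backoff used above, factored into a standalone helper. With
// vaultBackoffBaseline = 5s and vaultBackoffLimit = 3m the schedule is
// 5s, 20s, 80s, then 3m for every later attempt, since 320s exceeds the
// limit and is clamped.
func exampleVaultBackoff(attempt int) time.Duration {
    backoff := time.Duration(1<<(2*uint(attempt))) * vaultBackoffBaseline
    if backoff > vaultBackoffLimit {
        backoff = vaultBackoffLimit
    }
    return backoff
}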

// writeToken writes the given token to disk
func (r *TaskRunner) writeToken(token string) error {
    tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile)
    if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil {
        return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err)
    }

    return nil
}

// updatedTokenHandler is called when a new Vault token is retrieved. Things
// that rely on the token should be updated here.
func (r *TaskRunner) updatedTokenHandler() {

    // Update the task's environment
    r.envBuilder.SetVaultToken(r.vaultFuture.Get(), r.task.Vault.Env)

    if r.templateManager != nil {
        r.templateManager.Stop()

        // Create a new templateManager
        var err error
        r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
            r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.envBuilder)
        if err != nil {
            err := fmt.Errorf("failed to build task's template manager: %v", err)
            r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
            r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
            r.Kill("vault", err.Error(), true)
            return
        }
    }
}

// prestart handles life-cycle tasks that occur before the task has started.
// Since it's run asynchronously with the main Run() loop the alloc & task are
// passed in to avoid racing with updates.
func (r *TaskRunner) prestart(alloc *structs.Allocation, task *structs.Task, resultCh chan bool) {
    if task.Vault != nil {
        // Wait for the token
        r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", task.Name, alloc.ID)
        tokenCh := r.vaultFuture.Wait()
        select {
        case <-tokenCh:
        case <-r.waitCh:
            resultCh <- false
            return
        }
        r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", task.Name, alloc.ID)
        r.envBuilder.SetVaultToken(r.vaultFuture.Get(), task.Vault.Env)
    }

    // If the job is a dispatch job and there is a payload, write it to disk
    requirePayload := len(alloc.Job.Payload) != 0 &&
        (r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
    if !r.payloadRendered && requirePayload {
        renderTo := filepath.Join(r.taskDir.LocalDir, task.DispatchPayload.File)
        decoded, err := snappy.Decode(nil, alloc.Job.Payload)
        if err != nil {
            r.setState(
                structs.TaskStateDead,
                structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
            resultCh <- false
            return
        }

        if err := os.MkdirAll(filepath.Dir(renderTo), 07777); err != nil {
            r.setState(
                structs.TaskStateDead,
                structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
            resultCh <- false
            return
        }

        if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
            r.setState(
                structs.TaskStateDead,
                structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
            resultCh <- false
            return
        }

        r.payloadRendered = true
    }

    for {
        r.persistLock.Lock()
        downloaded := r.artifactsDownloaded
        r.persistLock.Unlock()

        // Download the task's artifacts
        if !downloaded && len(task.Artifacts) > 0 {
            r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
            taskEnv := r.envBuilder.Build()
            for _, artifact := range task.Artifacts {
                if err := getter.GetArtifact(taskEnv, artifact, r.taskDir.Dir); err != nil {
                    wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
                    r.logger.Printf("[DEBUG] client: %v", wrapped)
                    r.setState(structs.TaskStatePending,
                        structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped))
                    r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
                    goto RESTART
                }
            }

            r.persistLock.Lock()
            r.artifactsDownloaded = true
            r.persistLock.Unlock()
        }

        // We don't have to wait for any template
        if len(task.Templates) == 0 {
            // Send the start signal
            select {
            case r.startCh <- struct{}{}:
            default:
            }

            resultCh <- true
            return
        }

        // Build the template manager
        if r.templateManager == nil {
            var err error
            r.templateManager, err = NewTaskTemplateManager(r, task.Templates,
                r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.envBuilder)
            if err != nil {
                err := fmt.Errorf("failed to build task's template manager: %v", err)
                r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
                r.logger.Printf("[ERR] client: alloc %q, task %q %v", alloc.ID, task.Name, err)
                resultCh <- false
                return
            }
        }

        // Block for consul-template
        // TODO Hooks should register themselves as blocking and then we can
        // periodically enumerate what we are still blocked on
        select {
        case <-r.unblockCh:
            // Send the start signal
            select {
            case r.startCh <- struct{}{}:
            default:
            }

            resultCh <- true
            return
        case <-r.waitCh:
            // The run loop has exited so exit too
            resultCh <- false
            return
        }

    RESTART:
        restart := r.shouldRestart()
        if !restart {
            resultCh <- false
            return
        }
    }
}

// postrun is used to do any cleanup that is necessary after exiting the runloop
func (r *TaskRunner) postrun() {
    // Stop the template manager
    if r.templateManager != nil {
        r.templateManager.Stop()
    }
}
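
// Example (editor's sketch, not in the upstream source): the dispatch payload
// carried on alloc.Job.Payload is Snappy-compressed, which is why prestart
// calls snappy.Decode before rendering the file. A round trip with the same
// library looks like this; the helper name is hypothetical.
func examplePayloadRoundTrip(raw []byte) ([]byte, error) {
    // Compress, as the payload producer would before storing it on the job.
    compressed := snappy.Encode(nil, raw)

    // Decompress, as prestart does before writing the payload to disk.
    return snappy.Decode(nil, compressed)
}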

// run is the main run loop that handles starting the application, destroying
// it, restarts and signals.
func (r *TaskRunner) run() {
    // Predeclare things so we can jump to the RESTART
    var stopCollection chan struct{}
    var handleWaitCh chan *dstructs.WaitResult

    // If we already have a handle, populate the stopCollection and handleWaitCh
    // to fix the invariant that it exists.
    handleEmpty := r.getHandle() == nil

    if !handleEmpty {
        stopCollection = make(chan struct{})
        go r.collectResourceUsageStats(stopCollection)
        handleWaitCh = r.handle.WaitCh()
    }

    for {
        // Do the prestart activities
        prestartResultCh := make(chan bool, 1)
        go r.prestart(r.alloc, r.task, prestartResultCh)

    WAIT:
        for {
            select {
            case success := <-prestartResultCh:
                if !success {
                    r.cleanup()
                    r.setState(structs.TaskStateDead, nil)
                    return
                }
            case <-r.startCh:
                // Start the task if not yet started or it is being forced. This logic
                // is necessary because in the case of a restore the handle already
                // exists.
                handleEmpty := r.getHandle() == nil
                if handleEmpty {
                    startErr := r.startTask()
                    r.restartTracker.SetStartError(startErr)
                    if startErr != nil {
                        r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr))
                        goto RESTART
                    }

                    // Mark the task as started
                    r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
                    r.runningLock.Lock()
                    r.running = true
                    r.runningLock.Unlock()

                    if stopCollection == nil {
                        stopCollection = make(chan struct{})
                        go r.collectResourceUsageStats(stopCollection)
                    }

                    handleWaitCh = r.handle.WaitCh()
                }

            case waitRes := <-handleWaitCh:
                if waitRes == nil {
                    panic("nil wait")
                }

                r.runningLock.Lock()
                r.running = false
                r.runningLock.Unlock()

                // Stop collection of the task's resource usage
                close(stopCollection)

                // Log whether the task was successful or not.
                r.restartTracker.SetWaitResult(waitRes)
                r.setState("", r.waitErrorToEvent(waitRes))
                if !waitRes.Successful() {
                    r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes)
                } else {
                    r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID)
                }

                break WAIT
            case update := <-r.updateCh:
                if err := r.handleUpdate(update); err != nil {
                    r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err)
                }

            case se := <-r.signalCh:
                r.runningLock.Lock()
                running := r.running
                r.runningLock.Unlock()
                common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID)
                if !running {
                    // Send no error
                    r.logger.Printf("[DEBUG] client: skipping %s", common)
                    se.result <- nil
                    continue
                }

                r.logger.Printf("[DEBUG] client: sending %s", common)
                r.setState(structs.TaskStateRunning, se.e)

                res := r.handle.Signal(se.s)
                se.result <- res

            case event := <-r.restartCh:
                r.runningLock.Lock()
                running := r.running
                r.runningLock.Unlock()
                common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID)
                if !running {
                    r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common)
                    continue
                }

                r.logger.Printf("[DEBUG] client: restarting %s: %v", common, event.RestartReason)
                r.setState(structs.TaskStateRunning, event)
                r.killTask(nil)

                close(stopCollection)

                if handleWaitCh != nil {
                    <-handleWaitCh
                }

                // Since the restart isn't from a failure, restart immediately
                // and don't count against the restart policy
                r.restartTracker.SetRestartTriggered()
                break WAIT

            case <-r.destroyCh:
                r.runningLock.Lock()
                running := r.running
                r.runningLock.Unlock()
                if !running {
                    r.cleanup()
                    r.setState(structs.TaskStateDead, r.destroyEvent)
                    return
                }

                // Remove from consul before killing the task so that traffic
                // can be rerouted
                interpTask := interpolateServices(r.envBuilder.Build(), r.task)
                r.consul.RemoveTask(r.alloc.ID, interpTask)

                // Store the task event that provides context on the task
                // destroy. The Killed event is set from the alloc_runner and
                // doesn't add detail
                var killEvent *structs.TaskEvent
                if r.destroyEvent.Type != structs.TaskKilled {
                    if r.destroyEvent.Type == structs.TaskKilling {
                        killEvent = r.destroyEvent
                    } else {
                        r.setState(structs.TaskStateRunning, r.destroyEvent)
                    }
                }

                r.killTask(killEvent)
                close(stopCollection)

                // Wait for handler to exit before calling cleanup
                <-handleWaitCh
                r.cleanup()

                r.setState(structs.TaskStateDead, nil)
                return
            }
        }

    RESTART:
        // shouldRestart will block if the task should restart after a delay.
        restart := r.shouldRestart()
        if !restart {
            r.cleanup()
            r.setState(structs.TaskStateDead, nil)
            return
        }

        // Clear the handle so a new driver will be created.
        r.handleLock.Lock()
        r.handle = nil
        handleWaitCh = nil
        stopCollection = nil
        r.handleLock.Unlock()
    }
}

// cleanup removes Consul entries and calls Driver.Cleanup when a task is
// stopping. Errors are logged.
func (r *TaskRunner) cleanup() {
    // Remove from Consul
    interpTask := interpolateServices(r.envBuilder.Build(), r.task)
    r.consul.RemoveTask(r.alloc.ID, interpTask)

    drv, err := r.createDriver()
    if err != nil {
        r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err)
        return
    }

    res := r.getCreatedResources()

    ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
    attempts := 1
    var cleanupErr error
    for retry := true; retry; attempts++ {
        cleanupErr = drv.Cleanup(ctx, res)
        retry = structs.IsRecoverable(cleanupErr)

        // Copy current createdResources state in case SaveState is
        // called between retries
        r.setCreatedResources(res)

        // Retry 3 times with sleeps between
        if !retry || attempts > 3 {
            break
        }
        time.Sleep(time.Duration(attempts) * time.Second)
    }

    if cleanupErr != nil {
        r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr)
    }
    return
}

// shouldRestart returns whether the task should restart. If the return value
// is true, the task's restart policy has already been considered and any wait
// time between restarts has been applied.
func (r *TaskRunner) shouldRestart() bool {
    state, when := r.restartTracker.GetState()
    reason := r.restartTracker.GetReason()
    switch state {
    case structs.TaskNotRestarting, structs.TaskTerminated:
        r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID)
        if state == structs.TaskNotRestarting {
            r.setState(structs.TaskStateDead,
                structs.NewTaskEvent(structs.TaskNotRestarting).
                    SetRestartReason(reason).SetFailsTask())
        }
        return false
    case structs.TaskRestarting:
        r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when)
        r.setState(structs.TaskStatePending,
            structs.NewTaskEvent(structs.TaskRestarting).
                SetRestartDelay(when).
                SetRestartReason(reason))
    default:
        r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state)
        return false
    }

    // Unregister from Consul while waiting to restart.
    interpTask := interpolateServices(r.envBuilder.Build(), r.task)
    r.consul.RemoveTask(r.alloc.ID, interpTask)

    // Sleep but watch for destroy events.
    select {
    case <-time.After(when):
    case <-r.destroyCh:
    }

    // Destroyed while we were waiting to restart, so abort.
    r.destroyLock.Lock()
    destroyed := r.destroy
    r.destroyLock.Unlock()
    if destroyed {
        r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name)
        r.setState(structs.TaskStateDead, r.destroyEvent)
        return false
    }

    return true
}
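
// Example (editor's sketch, not in the upstream source): the select in
// shouldRestart is a small reusable pattern: sleep for the restart delay, but
// wake immediately if the runner is being destroyed. Factored out, it returns
// true when the full delay elapsed and false when the stop channel fired first.
func exampleSleepOrStop(delay time.Duration, stop <-chan struct{}) bool {
    select {
    case <-time.After(delay):
        return true
    case <-stop:
        return false
    }
}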

// killTask kills the running task. A killing event can optionally be passed and
// this event is used to mark the task as being killed. It provides a means to
// store extra information.
func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) {
    r.runningLock.Lock()
    running := r.running
    r.runningLock.Unlock()
    if !running {
        return
    }

    // Get the kill timeout
    timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout)

    // Build the event
    var event *structs.TaskEvent
    if killingEvent != nil {
        event = killingEvent
        event.Type = structs.TaskKilling
    } else {
        event = structs.NewTaskEvent(structs.TaskKilling)
    }
    event.SetKillTimeout(timeout)

    // Mark that we received the kill event
    r.setState(structs.TaskStateRunning, event)

    handle := r.getHandle()

    // Kill the task using an exponential backoff in case of failures.
    destroySuccess, err := r.handleDestroy(handle)
    if !destroySuccess {
        // We couldn't successfully destroy the resource created.
        r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err)
    }

    r.runningLock.Lock()
    r.running = false
    r.runningLock.Unlock()

    // Store that the task has been destroyed and any associated error.
    r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err))
}

// startTask creates the driver, task dir, and starts the task.
func (r *TaskRunner) startTask() error {
    // Create a driver
    drv, err := r.createDriver()
    if err != nil {
        return fmt.Errorf("failed to create driver of task %q for alloc %q: %v",
            r.task.Name, r.alloc.ID, err)
    }

    // Run prestart
    ctx := driver.NewExecContext(r.taskDir, r.envBuilder.Build())
    presp, err := drv.Prestart(ctx, r.task)

    // Merge newly created resources into previously created resources
    if presp != nil {
        r.createdResourcesLock.Lock()
        r.createdResources.Merge(presp.CreatedResources)
        r.createdResourcesLock.Unlock()

        // Set any network configuration returned by the driver
        r.envBuilder.SetDriverNetwork(presp.Network)
    }

    if err != nil {
        wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v",
            r.task.Name, r.alloc.ID, err)
        r.logger.Printf("[WARN] client: error from prestart: %s", wrapped)
        return structs.WrapRecoverable(wrapped, err)
    }

    // Create a new context for Start since the environment may have been updated.
    ctx = driver.NewExecContext(r.taskDir, r.envBuilder.Build())

    // Start the job
    sresp, err := drv.Start(ctx, r.task)
    if err != nil {
        wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v",
            r.task.Name, r.alloc.ID, err)
        r.logger.Printf("[WARN] client: %s", wrapped)
        return structs.WrapRecoverable(wrapped, err)

    }

    // Update environment with the network defined by the driver's Start method.
    r.envBuilder.SetDriverNetwork(sresp.Network)

    if err := r.registerServices(drv, sresp.Handle, sresp.Network); err != nil {
        // All IO is done asynchronously, so errors from registering
        // services are hard failures.
        r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err)

        // Kill the started task
        if destroyed, err := r.handleDestroy(sresp.Handle); !destroyed {
            r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. Resources may be leaked: %v",
                r.task.Name, r.alloc.ID, err)
        }
        return structs.NewRecoverableError(err, false)
    }

    r.handleLock.Lock()
    r.handle = sresp.Handle
    r.handleLock.Unlock()

    // Need to persist the driver network between restarts
    r.driverNetLock.Lock()
    r.driverNet = sresp.Network
    r.driverNetLock.Unlock()

    return nil
}

// registerServices and checks with Consul.
func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n *cstructs.DriverNetwork) error {
    var exec driver.ScriptExecutor
    if d.Abilities().Exec {
        // Allow setting the script executor if the driver supports it
        exec = h
    }
    interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task)
    return r.consul.RegisterTask(r.alloc.ID, interpolatedTask, exec, n)
}

// interpolateServices interpolates a task's services and checks with values
// from the task's environment.
func interpolateServices(taskEnv *env.TaskEnv, task *structs.Task) *structs.Task {
    taskCopy := task.Copy()
    for _, service := range taskCopy.Services {
        for _, check := range service.Checks {
            check.Name = taskEnv.ReplaceEnv(check.Name)
            check.Type = taskEnv.ReplaceEnv(check.Type)
            check.Command = taskEnv.ReplaceEnv(check.Command)
            check.Args = taskEnv.ParseAndReplace(check.Args)
            check.Path = taskEnv.ReplaceEnv(check.Path)
            check.Protocol = taskEnv.ReplaceEnv(check.Protocol)
            check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel)
            check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus)
        }
        service.Name = taskEnv.ReplaceEnv(service.Name)
        service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel)
        service.Tags = taskEnv.ParseAndReplace(service.Tags)
    }
    return taskCopy
}

// buildTaskDir creates the task directory before driver.Prestart. It is safe
// to call multiple times as its state is persisted.
func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error {
    r.persistLock.Lock()
    built := r.taskDirBuilt
    r.persistLock.Unlock()

    // We do not set the state again since this only occurs during restoration
    // and the task dir is already built. The reason we call Build again is to
    // ensure that the task dir invariants are still held.
    if !built {
        r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskSetup).
            SetMessage(structs.TaskBuildingTaskDir))
    }

    chroot := config.DefaultChrootEnv
    if len(r.config.ChrootEnv) > 0 {
        chroot = r.config.ChrootEnv
    }
    if err := r.taskDir.Build(built, chroot, fsi); err != nil {
        return err
    }

    // Mark task dir as successfully built
    r.persistLock.Lock()
    r.taskDirBuilt = true
    r.persistLock.Unlock()

    // Set path and host related env vars
    driver.SetEnvvars(r.envBuilder, fsi, r.taskDir, r.config)
    return nil
}

// collectResourceUsageStats starts collecting resource usage stats of a Task.
// Collection ends when the passed channel is closed
func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) {
    // start collecting the stats right away and then start collecting every
    // collection interval
    next := time.NewTimer(0)
    defer next.Stop()
    for {
        select {
        case <-next.C:
            next.Reset(r.config.StatsCollectionInterval)
            handle := r.getHandle()
            if handle == nil {
                continue
            }
            ru, err := handle.Stats()

            if err != nil {
                // Check if the driver doesn't implement stats
                if err.Error() == driver.DriverStatsNotImplemented.Error() {
                    r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID)
                    return
                }

                // We do not log when the plugin is shutdown as this is simply a
                // race between the stopCollection channel being closed and calling
                // Stats on the handle.
                if !strings.Contains(err.Error(), "connection is shut down") {
                    r.logger.Printf("[WARN] client: error fetching stats of task %v: %v", r.task.Name, err)
                }
                continue
            }

            r.resourceUsageLock.Lock()
            r.resourceUsage = ru
            r.resourceUsageLock.Unlock()
            if ru != nil {
                r.emitStats(ru)
            }
        case <-stopCollection:
            return
        }
    }
}

// LatestResourceUsage returns the last resource utilization datapoint collected
func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage {
    r.resourceUsageLock.RLock()
    defer r.resourceUsageLock.RUnlock()
    r.runningLock.Lock()
    defer r.runningLock.Unlock()

    // If the task is not running there can be no latest resource
    if !r.running {
        return nil
    }

    return r.resourceUsage
}

// handleUpdate takes an updated allocation and updates internal state to
// reflect the new config for the task.
func (r *TaskRunner) handleUpdate(update *structs.Allocation) error {
    // Extract the task group from the alloc.
    tg := update.Job.LookupTaskGroup(update.TaskGroup)
    if tg == nil {
        return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup)
    }

    // Extract the task.
    var updatedTask *structs.Task
    for _, t := range tg.Tasks {
        if t.Name == r.task.Name {
            updatedTask = t.Copy()
        }
    }
    if updatedTask == nil {
        return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name)
    }

    // Merge in the task resources
    updatedTask.Resources = update.TaskResources[updatedTask.Name]

    // Update the task's environment for interpolating in services/checks
    r.envBuilder.UpdateTask(update, updatedTask)

    var mErr multierror.Error
    r.handleLock.Lock()
    if r.handle != nil {
        drv, err := r.createDriver()
        if err != nil {
            // Something has really gone wrong; don't continue
            r.handleLock.Unlock()
            return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err)
        }

        // Update will update resources and store the new kill timeout.
        if err := r.handle.Update(updatedTask); err != nil {
            mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err))
        }

        // Update services in Consul
        if err := r.updateServices(drv, r.handle, r.task, updatedTask); err != nil {
            mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err))
        }
    }
    r.handleLock.Unlock()

    // Update the restart policy.
    if r.restartTracker != nil {
        r.restartTracker.SetPolicy(tg.RestartPolicy)
    }

    // Store the updated alloc.
    r.alloc = update
    r.task = updatedTask
    return mErr.ErrorOrNil()
}

// updateServices and checks with Consul.
func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, old, new *structs.Task) error {
    var exec driver.ScriptExecutor
    if d.Abilities().Exec {
        // Allow setting the script executor if the driver supports it
        exec = h
    }
    newInterpolatedTask := interpolateServices(r.envBuilder.Build(), new)
    oldInterpolatedTask := interpolateServices(r.envBuilder.Build(), old)
    r.driverNetLock.Lock()
    net := r.driverNet.Copy()
    r.driverNetLock.Unlock()
    return r.consul.UpdateTask(r.alloc.ID, oldInterpolatedTask, newInterpolatedTask, exec, net)
}

// handleDestroy kills the task handle. In the case that killing fails,
// handleDestroy will retry with an exponential backoff and will give up at a
// given limit. It returns whether the task was destroyed and the error
// associated with the last kill attempt.
func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) {
    // Cap the number of times we attempt to kill the task.
    for i := 0; i < killFailureLimit; i++ {
        if err = handle.Kill(); err != nil {
            // Calculate the new backoff
            backoff := (1 << (2 * uint64(i))) * killBackoffBaseline
            if backoff > killBackoffLimit {
                backoff = killBackoffLimit
            }

            r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v",
                r.task.Name, r.alloc.ID, backoff, err)
            time.Sleep(time.Duration(backoff))
        } else {
            // Kill was successful
            return true, nil
        }
    }
    return
}

// Restart will restart the task
func (r *TaskRunner) Restart(source, reason string) {
    reasonStr := fmt.Sprintf("%s: %s", source, reason)
    event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr)

    select {
    case r.restartCh <- event:
    case <-r.waitCh:
    }
}

// Signal will send a signal to the task
func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {

    reasonStr := fmt.Sprintf("%s: %s", source, reason)
    event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr)

    resCh := make(chan error)
    se := SignalEvent{
        s:      s,
        e:      event,
        result: resCh,
    }

    select {
    case r.signalCh <- se:
    case <-r.waitCh:
    }

    return <-resCh
}
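
// Example (editor's sketch, not in the upstream source): how a caller might
// combine the public Signal and Restart APIs. The helper name and the choice
// of os.Interrupt are arbitrary; Signal blocks until the run loop has handled
// (or skipped) the signal and returns the driver's error, while Restart only
// queues the restart event.
func exampleReloadOrRestart(r *TaskRunner) error {
    if err := r.Signal("example", "configuration changed", os.Interrupt); err != nil {
        // Fall back to a full restart if the signal could not be delivered.
        r.Restart("example", "signal failed, restarting instead")
        return err
    }
    return nil
}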

// Kill will kill a task and store the error, no longer restarting the task. If
// fail is set, the task is marked as having failed.
func (r *TaskRunner) Kill(source, reason string, fail bool) {
    reasonStr := fmt.Sprintf("%s: %s", source, reason)
    event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr)
    if fail {
        event.SetFailsTask()
    }

    r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr)
    r.Destroy(event)
}

// UnblockStart unblocks the starting of the task. It currently assumes only
// consul-template will unblock
func (r *TaskRunner) UnblockStart(source string) {
    r.unblockLock.Lock()
    defer r.unblockLock.Unlock()
    if r.unblocked {
        return
    }

    r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source)
    r.unblocked = true
    close(r.unblockCh)
}

// Helper function for converting a WaitResult into a TaskTerminated event.
func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent {
    return structs.NewTaskEvent(structs.TaskTerminated).
        SetExitCode(res.ExitCode).
        SetSignal(res.Signal).
        SetExitMessage(res.Err)
}

// Update is used to update the task of the context
func (r *TaskRunner) Update(update *structs.Allocation) {
    select {
    case r.updateCh <- update:
    default:
        r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')",
            r.task.Name, r.alloc.ID)
    }
}

// Destroy is used to indicate that the task context should be destroyed. The
// event parameter provides a context for the destroy.
func (r *TaskRunner) Destroy(event *structs.TaskEvent) {
    r.destroyLock.Lock()
    defer r.destroyLock.Unlock()

    if r.destroy {
        return
    }
    r.destroy = true
    r.destroyEvent = event
    close(r.destroyCh)
}

// getCreatedResources returns the resources created by drivers. It will never
// return nil.
func (r *TaskRunner) getCreatedResources() *driver.CreatedResources {
    r.createdResourcesLock.Lock()
    if r.createdResources == nil {
        r.createdResources = driver.NewCreatedResources()
    }
    cr := r.createdResources.Copy()
    r.createdResourcesLock.Unlock()

    return cr
}

// setCreatedResources updates the resources created by drivers. If passed nil
// it will set createdResources to an initialized struct.
func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) {
    if cr == nil {
        cr = driver.NewCreatedResources()
    }
    r.createdResourcesLock.Lock()
    r.createdResources = cr.Copy()
    r.createdResourcesLock.Unlock()
}

// emitStats emits resource usage stats of tasks to remote metrics collector
// sinks
func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
    if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics {
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage))
    }

    if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics {
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods))
        metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks))
    }
}
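
// Example (editor's sketch, not in the upstream source): the gauges above go
// to whatever sink the global go-metrics instance was configured with.
// Assuming the armon/go-metrics API, a test or experiment could route them to
// an in-memory sink like this before exercising emitStats; the service name
// is arbitrary.
func exampleInitInmemMetrics() error {
    // Aggregate into 10-second intervals and retain one minute of data.
    sink := metrics.NewInmemSink(10*time.Second, time.Minute)
    _, err := metrics.NewGlobal(metrics.DefaultConfig("nomad-client-example"), sink)
    return err
}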