github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/client/task_runner.go

package client

import (
    "bytes"
    "crypto/md5"
    "encoding/hex"
    "fmt"
    "io"
    "io/ioutil"
    "log"
    "os"
    "path/filepath"
    "strings"
    "sync"
    "time"

    "github.com/armon/go-metrics"
    "github.com/boltdb/bolt"
    "github.com/golang/snappy"
    "github.com/hashicorp/consul-template/signals"
    "github.com/hashicorp/go-multierror"
    version "github.com/hashicorp/go-version"
    "github.com/hashicorp/nomad/client/allocdir"
    "github.com/hashicorp/nomad/client/config"
    "github.com/hashicorp/nomad/client/driver"
    "github.com/hashicorp/nomad/client/getter"
    "github.com/hashicorp/nomad/client/vaultclient"
    "github.com/hashicorp/nomad/nomad/structs"
    "github.com/ugorji/go/codec"

    "github.com/hashicorp/nomad/client/driver/env"
    dstructs "github.com/hashicorp/nomad/client/driver/structs"
    cstructs "github.com/hashicorp/nomad/client/structs"
)

const (
    // killBackoffBaseline is the baseline time for exponential backoff while
    // killing a task.
    killBackoffBaseline = 5 * time.Second

    // killBackoffLimit is the limit of the exponential backoff for killing
    // the task.
    killBackoffLimit = 2 * time.Minute

    // killFailureLimit is how many times we will attempt to kill a task before
    // giving up and potentially leaking resources.
    killFailureLimit = 5

    // vaultBackoffBaseline is the baseline time for exponential backoff when
    // attempting to retrieve a Vault token.
    vaultBackoffBaseline = 5 * time.Second

    // vaultBackoffLimit is the limit of the exponential backoff when attempting
    // to retrieve a Vault token.
    vaultBackoffLimit = 3 * time.Minute

    // vaultTokenFile is the name of the file holding the Vault token inside the
    // task's secret directory.
    vaultTokenFile = "vault_token"
)

var (
    // taskRunnerStateAllKey holds all the task runner's state. At the moment
    // there is no need to split it.
    taskRunnerStateAllKey = []byte("simple-all")
)

// TaskRunner is used to wrap a task within an allocation and provide the execution context.
type TaskRunner struct {
    stateDB        *bolt.DB
    config         *config.Config
    updater        TaskStateUpdater
    logger         *log.Logger
    alloc          *structs.Allocation
    restartTracker *RestartTracker
    consul         ConsulServiceAPI

    // running marks whether the task is running
    running     bool
    runningLock sync.Mutex

    resourceUsage     *cstructs.TaskResourceUsage
    resourceUsageLock sync.RWMutex

    task    *structs.Task
    taskDir *allocdir.TaskDir

    // taskEnv is the environment variables of the task
    taskEnv     *env.TaskEnvironment
    taskEnvLock sync.Mutex

    // updateCh is used to receive updated versions of the allocation
    updateCh chan *structs.Allocation

    handle     driver.DriverHandle
    handleLock sync.Mutex

    // artifactsDownloaded tracks whether the task's artifacts have been
    // downloaded
    //
    // Must acquire persistLock when accessing
    artifactsDownloaded bool

    // taskDirBuilt tracks whether the task has built its directory.
    //
    // Must acquire persistLock when accessing
    taskDirBuilt bool

    // createdResources are all the resources created by the task driver
    // across all attempts to start the task.
111 // Simple gets and sets should use {get,set}CreatedResources 112 createdResources *driver.CreatedResources 113 createdResourcesLock sync.Mutex 114 115 // payloadRendered tracks whether the payload has been rendered to disk 116 payloadRendered bool 117 118 // vaultFuture is the means to wait for and get a Vault token 119 vaultFuture *tokenFuture 120 121 // recoveredVaultToken is the token that was recovered through a restore 122 recoveredVaultToken string 123 124 // vaultClient is used to retrieve and renew any needed Vault token 125 vaultClient vaultclient.VaultClient 126 127 // templateManager is used to manage any consul-templates this task may have 128 templateManager *TaskTemplateManager 129 130 // startCh is used to trigger the start of the task 131 startCh chan struct{} 132 133 // unblockCh is used to unblock the starting of the task 134 unblockCh chan struct{} 135 unblocked bool 136 unblockLock sync.Mutex 137 138 // restartCh is used to restart a task 139 restartCh chan *structs.TaskEvent 140 141 // signalCh is used to send a signal to a task 142 signalCh chan SignalEvent 143 144 destroy bool 145 destroyCh chan struct{} 146 destroyLock sync.Mutex 147 destroyEvent *structs.TaskEvent 148 149 // waitCh closing marks the run loop as having exited 150 waitCh chan struct{} 151 152 // persistLock must be acquired when accessing fields stored by 153 // SaveState. SaveState is called asynchronously to TaskRunner.Run by 154 // AllocRunner, so all state fields must be synchronized using this 155 // lock. 156 persistLock sync.Mutex 157 158 // persistedHash is the hash of the last persisted snapshot. It is used to 159 // detect if a new snapshot has to be writen to disk. 160 persistedHash []byte 161 } 162 163 // taskRunnerState is used to snapshot the state of the task runner 164 type taskRunnerState struct { 165 Version string 166 HandleID string 167 ArtifactDownloaded bool 168 TaskDirBuilt bool 169 PayloadRendered bool 170 CreatedResources *driver.CreatedResources 171 } 172 173 func (s *taskRunnerState) Hash() []byte { 174 h := md5.New() 175 176 io.WriteString(h, s.Version) 177 io.WriteString(h, s.HandleID) 178 io.WriteString(h, fmt.Sprintf("%v", s.ArtifactDownloaded)) 179 io.WriteString(h, fmt.Sprintf("%v", s.TaskDirBuilt)) 180 io.WriteString(h, fmt.Sprintf("%v", s.PayloadRendered)) 181 h.Write(s.CreatedResources.Hash()) 182 183 return h.Sum(nil) 184 } 185 186 // TaskStateUpdater is used to signal that tasks state has changed. 187 type TaskStateUpdater func(taskName, state string, event *structs.TaskEvent) 188 189 // SignalEvent is a tuple of the signal and the event generating it 190 type SignalEvent struct { 191 // s is the signal to be sent 192 s os.Signal 193 194 // e is the task event generating the signal 195 e *structs.TaskEvent 196 197 // result should be used to send back the result of the signal 198 result chan<- error 199 } 200 201 // NewTaskRunner is used to create a new task context 202 func NewTaskRunner(logger *log.Logger, config *config.Config, 203 stateDB *bolt.DB, updater TaskStateUpdater, taskDir *allocdir.TaskDir, 204 alloc *structs.Allocation, task *structs.Task, 205 vaultClient vaultclient.VaultClient, consulClient ConsulServiceAPI) *TaskRunner { 206 207 // Merge in the task resources 208 task.Resources = alloc.TaskResources[task.Name] 209 210 // Build the restart tracker. 
    tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
    if tg == nil {
        logger.Printf("[ERR] client: alloc '%s' missing task group '%s'", alloc.ID, alloc.TaskGroup)
        return nil
    }
    restartTracker := newRestartTracker(tg.RestartPolicy, alloc.Job.Type)

    tc := &TaskRunner{
        config:           config,
        stateDB:          stateDB,
        updater:          updater,
        logger:           logger,
        restartTracker:   restartTracker,
        alloc:            alloc,
        task:             task,
        taskDir:          taskDir,
        createdResources: driver.NewCreatedResources(),
        consul:           consulClient,
        vaultClient:      vaultClient,
        vaultFuture:      NewTokenFuture().Set(""),
        updateCh:         make(chan *structs.Allocation, 64),
        destroyCh:        make(chan struct{}),
        waitCh:           make(chan struct{}),
        startCh:          make(chan struct{}, 1),
        unblockCh:        make(chan struct{}),
        restartCh:        make(chan *structs.TaskEvent),
        signalCh:         make(chan SignalEvent),
    }

    return tc
}

// MarkReceived marks the task as received.
func (r *TaskRunner) MarkReceived() {
    r.updater(r.task.Name, structs.TaskStatePending, structs.NewTaskEvent(structs.TaskReceived))
}

// WaitCh returns a channel to wait for termination
func (r *TaskRunner) WaitCh() <-chan struct{} {
    return r.waitCh
}

// pre060StateFilePath returns the path to our state file that would have been
// written pre v0.6.0
// COMPAT: Remove in 0.7.0
func (r *TaskRunner) pre060StateFilePath() string {
    // Get the MD5 of the task name
    hashVal := md5.Sum([]byte(r.task.Name))
    hashHex := hex.EncodeToString(hashVal[:])
    dirName := fmt.Sprintf("task-%s", hashHex)

    // Generate the path
    return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, dirName, "state.json")
}

// RestoreState is used to restore our state. If a non-empty string is returned
// the task is restarted with the string as the reason. This is useful for
// backwards incompatible upgrades that need to restart tasks with a new
// executor.
func (r *TaskRunner) RestoreState() (string, error) {
    // COMPAT: Remove in 0.7.0
    // 0.6.0 transitioned from individual state files to a single bolt-db.
273 // The upgrade path is to: 274 // Check if old state exists 275 // If so, restore from that and delete old state 276 // Restore using state database 277 278 var snap taskRunnerState 279 280 // Check if the old snapshot is there 281 oldPath := r.pre060StateFilePath() 282 if err := pre060RestoreState(oldPath, &snap); err == nil { 283 // Delete the old state 284 os.RemoveAll(oldPath) 285 } else if !os.IsNotExist(err) { 286 // Something corrupt in the old state file 287 return "", err 288 } else { 289 // We are doing a normal restore 290 err := r.stateDB.View(func(tx *bolt.Tx) error { 291 bkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name) 292 if err != nil { 293 return fmt.Errorf("failed to get task bucket: %v", err) 294 } 295 296 if err := getObject(bkt, taskRunnerStateAllKey, &snap); err != nil { 297 return fmt.Errorf("failed to read task runner state: %v", err) 298 } 299 return nil 300 }) 301 if err != nil { 302 return "", err 303 } 304 305 } 306 307 // Restore fields from the snapshot 308 r.artifactsDownloaded = snap.ArtifactDownloaded 309 r.taskDirBuilt = snap.TaskDirBuilt 310 r.payloadRendered = snap.PayloadRendered 311 r.setCreatedResources(snap.CreatedResources) 312 313 if err := r.setTaskEnv(); err != nil { 314 return "", fmt.Errorf("client: failed to create task environment for task %q in allocation %q: %v", 315 r.task.Name, r.alloc.ID, err) 316 } 317 318 if r.task.Vault != nil { 319 // Read the token from the secret directory 320 tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile) 321 data, err := ioutil.ReadFile(tokenPath) 322 if err != nil { 323 if !os.IsNotExist(err) { 324 return "", fmt.Errorf("failed to read token for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err) 325 } 326 327 // Token file doesn't exist 328 } else { 329 // Store the recovered token 330 r.recoveredVaultToken = string(data) 331 } 332 } 333 334 // Restore the driver 335 restartReason := "" 336 if snap.HandleID != "" { 337 d, err := r.createDriver() 338 if err != nil { 339 return "", err 340 } 341 342 ctx := driver.NewExecContext(r.taskDir) 343 handle, err := d.Open(ctx, snap.HandleID) 344 345 // In the case it fails, we relaunch the task in the Run() method. 346 if err != nil { 347 r.logger.Printf("[ERR] client: failed to open handle to task %q for alloc %q: %v", 348 r.task.Name, r.alloc.ID, err) 349 return "", nil 350 } 351 352 if pre06ScriptCheck(snap.Version, r.task.Driver, r.task.Services) { 353 restartReason = pre06ScriptCheckReason 354 } 355 356 if err := r.registerServices(d, handle); err != nil { 357 // Don't hard fail here as there's a chance this task 358 // registered with Consul properly when it initial 359 // started. 360 r.logger.Printf("[WARN] client: failed to register services and checks with consul for task %q in alloc %q: %v", 361 r.task.Name, r.alloc.ID, err) 362 } 363 364 r.handleLock.Lock() 365 r.handle = handle 366 r.handleLock.Unlock() 367 368 r.runningLock.Lock() 369 r.running = true 370 r.runningLock.Unlock() 371 } 372 return restartReason, nil 373 } 374 375 // ver06 is used for checking for pre-0.6 script checks 376 var ver06 = version.Must(version.NewVersion("0.6.0dev")) 377 378 // pre06ScriptCheckReason is the restart reason given when a pre-0.6 script 379 // check is found on an exec/java task. 380 const pre06ScriptCheckReason = "upgrading pre-0.6 script checks" 381 382 // pre06ScriptCheck returns true if version is prior to 0.6.0dev, has a script 383 // check, and uses exec or java drivers. 
384 func pre06ScriptCheck(ver, driver string, services []*structs.Service) bool { 385 if driver != "exec" && driver != "java" && driver != "mock_driver" { 386 // Only exec and java are affected 387 return false 388 } 389 v, err := version.NewVersion(ver) 390 if err != nil { 391 // Treat it as old 392 return true 393 } 394 if !v.LessThan(ver06) { 395 // >= 0.6.0dev 396 return false 397 } 398 for _, service := range services { 399 for _, check := range service.Checks { 400 if check.Type == "script" { 401 return true 402 } 403 } 404 } 405 return false 406 } 407 408 // SaveState is used to snapshot our state 409 func (r *TaskRunner) SaveState() error { 410 r.persistLock.Lock() 411 defer r.persistLock.Unlock() 412 snap := taskRunnerState{ 413 Version: r.config.Version, 414 ArtifactDownloaded: r.artifactsDownloaded, 415 TaskDirBuilt: r.taskDirBuilt, 416 PayloadRendered: r.payloadRendered, 417 CreatedResources: r.getCreatedResources(), 418 } 419 420 r.handleLock.Lock() 421 if r.handle != nil { 422 snap.HandleID = r.handle.ID() 423 } 424 r.handleLock.Unlock() 425 426 // If nothing has changed avoid the write 427 h := snap.Hash() 428 if bytes.Equal(h, r.persistedHash) { 429 return nil 430 } 431 432 // Serialize the object 433 var buf bytes.Buffer 434 if err := codec.NewEncoder(&buf, structs.MsgpackHandle).Encode(&snap); err != nil { 435 return fmt.Errorf("failed to serialize snapshot: %v", err) 436 } 437 438 // Start the transaction. 439 return r.stateDB.Batch(func(tx *bolt.Tx) error { 440 // Grab the task bucket 441 taskBkt, err := getTaskBucket(tx, r.alloc.ID, r.task.Name) 442 if err != nil { 443 return fmt.Errorf("failed to retrieve allocation bucket: %v", err) 444 } 445 446 if err := putData(taskBkt, taskRunnerStateAllKey, buf.Bytes()); err != nil { 447 return fmt.Errorf("failed to write task_runner state: %v", err) 448 } 449 450 // Store the hash that was persisted 451 tx.OnCommit(func() { 452 r.persistedHash = h 453 }) 454 455 return nil 456 }) 457 } 458 459 // DestroyState is used to cleanup after ourselves 460 func (r *TaskRunner) DestroyState() error { 461 r.persistLock.Lock() 462 defer r.persistLock.Unlock() 463 464 return r.stateDB.Update(func(tx *bolt.Tx) error { 465 if err := deleteTaskBucket(tx, r.alloc.ID, r.task.Name); err != nil { 466 return fmt.Errorf("failed to delete task bucket: %v", err) 467 } 468 return nil 469 }) 470 } 471 472 // setState is used to update the state of the task runner 473 func (r *TaskRunner) setState(state string, event *structs.TaskEvent) { 474 // Persist our state to disk. 475 if err := r.SaveState(); err != nil { 476 r.logger.Printf("[ERR] client: failed to save state of Task Runner for task %q: %v", r.task.Name, err) 477 } 478 479 // Indicate the task has been updated. 480 r.updater(r.task.Name, state, event) 481 } 482 483 // setTaskEnv sets the task environment. It returns an error if it could not be 484 // created. 
485 func (r *TaskRunner) setTaskEnv() error { 486 r.taskEnvLock.Lock() 487 defer r.taskEnvLock.Unlock() 488 489 taskEnv, err := driver.GetTaskEnv(r.taskDir, r.config.Node, 490 r.task.Copy(), r.alloc, r.config, r.vaultFuture.Get()) 491 if err != nil { 492 return err 493 } 494 r.taskEnv = taskEnv 495 return nil 496 } 497 498 // getTaskEnv returns the task environment 499 func (r *TaskRunner) getTaskEnv() *env.TaskEnvironment { 500 r.taskEnvLock.Lock() 501 defer r.taskEnvLock.Unlock() 502 return r.taskEnv 503 } 504 505 // createDriver makes a driver for the task 506 func (r *TaskRunner) createDriver() (driver.Driver, error) { 507 env := r.getTaskEnv() 508 if env == nil { 509 return nil, fmt.Errorf("task environment not made for task %q in allocation %q", r.task.Name, r.alloc.ID) 510 } 511 512 // Create a task-specific event emitter callback to expose minimal 513 // state to drivers 514 eventEmitter := func(m string, args ...interface{}) { 515 msg := fmt.Sprintf(m, args...) 516 r.logger.Printf("[DEBUG] client: driver event for alloc %q: %s", r.alloc.ID, msg) 517 r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDriverMessage).SetDriverMessage(msg)) 518 } 519 520 driverCtx := driver.NewDriverContext(r.task.Name, r.alloc.ID, r.config, r.config.Node, r.logger, env, eventEmitter) 521 driver, err := driver.NewDriver(r.task.Driver, driverCtx) 522 if err != nil { 523 return nil, fmt.Errorf("failed to create driver '%s' for alloc %s: %v", 524 r.task.Driver, r.alloc.ID, err) 525 } 526 return driver, err 527 } 528 529 // Run is a long running routine used to manage the task 530 func (r *TaskRunner) Run() { 531 defer close(r.waitCh) 532 r.logger.Printf("[DEBUG] client: starting task context for '%s' (alloc '%s')", 533 r.task.Name, r.alloc.ID) 534 535 // Create the initial environment, this will be recreated if a Vault token 536 // is needed 537 if err := r.setTaskEnv(); err != nil { 538 r.setState( 539 structs.TaskStateDead, 540 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err)) 541 return 542 } 543 544 if err := r.validateTask(); err != nil { 545 r.setState( 546 structs.TaskStateDead, 547 structs.NewTaskEvent(structs.TaskFailedValidation).SetValidationError(err).SetFailsTask()) 548 return 549 } 550 551 // Create a driver so that we can determine the FSIsolation required 552 drv, err := r.createDriver() 553 if err != nil { 554 e := fmt.Errorf("failed to create driver of task %q for alloc %q: %v", r.task.Name, r.alloc.ID, err) 555 r.setState( 556 structs.TaskStateDead, 557 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask()) 558 return 559 } 560 561 // Build base task directory structure regardless of FS isolation abilities. 
562 // This needs to happen before we start the Vault manager and call prestart 563 // as both those can write to the task directories 564 if err := r.buildTaskDir(drv.FSIsolation()); err != nil { 565 e := fmt.Errorf("failed to build task directory for %q: %v", r.task.Name, err) 566 r.setState( 567 structs.TaskStateDead, 568 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(e).SetFailsTask()) 569 return 570 } 571 572 // If there is no Vault policy leave the static future created in 573 // NewTaskRunner 574 if r.task.Vault != nil { 575 // Start the go-routine to get a Vault token 576 r.vaultFuture.Clear() 577 go r.vaultManager(r.recoveredVaultToken) 578 } 579 580 // Start the run loop 581 r.run() 582 583 // Do any cleanup necessary 584 r.postrun() 585 586 return 587 } 588 589 // validateTask validates the fields of the task and returns an error if the 590 // task is invalid. 591 func (r *TaskRunner) validateTask() error { 592 var mErr multierror.Error 593 594 // Validate the user. 595 unallowedUsers := r.config.ReadStringListToMapDefault("user.blacklist", config.DefaultUserBlacklist) 596 checkDrivers := r.config.ReadStringListToMapDefault("user.checked_drivers", config.DefaultUserCheckedDrivers) 597 if _, driverMatch := checkDrivers[r.task.Driver]; driverMatch { 598 if _, unallowed := unallowedUsers[r.task.User]; unallowed { 599 mErr.Errors = append(mErr.Errors, fmt.Errorf("running as user %q is disallowed", r.task.User)) 600 } 601 } 602 603 // Validate the artifacts 604 for i, artifact := range r.task.Artifacts { 605 // Verify the artifact doesn't escape the task directory. 606 if err := artifact.Validate(); err != nil { 607 // If this error occurs there is potentially a server bug or 608 // mallicious, server spoofing. 609 r.logger.Printf("[ERR] client: allocation %q, task %v, artifact %#v (%v) fails validation: %v", 610 r.alloc.ID, r.task.Name, artifact, i, err) 611 mErr.Errors = append(mErr.Errors, fmt.Errorf("artifact (%d) failed validation: %v", i, err)) 612 } 613 } 614 615 // Validate the Service names 616 for i, service := range r.task.Services { 617 name := r.taskEnv.ReplaceEnv(service.Name) 618 if err := service.ValidateName(name); err != nil { 619 mErr.Errors = append(mErr.Errors, fmt.Errorf("service (%d) failed validation: %v", i, err)) 620 } 621 } 622 623 if len(mErr.Errors) == 1 { 624 return mErr.Errors[0] 625 } 626 return mErr.ErrorOrNil() 627 } 628 629 // tokenFuture stores the Vault token and allows consumers to block till a valid 630 // token exists 631 type tokenFuture struct { 632 waiting []chan struct{} 633 token string 634 set bool 635 m sync.Mutex 636 } 637 638 // NewTokenFuture returns a new token future without any token set 639 func NewTokenFuture() *tokenFuture { 640 return &tokenFuture{} 641 } 642 643 // Wait returns a channel that can be waited on. When this channel unblocks, a 644 // valid token will be available via the Get method 645 func (f *tokenFuture) Wait() <-chan struct{} { 646 f.m.Lock() 647 defer f.m.Unlock() 648 649 c := make(chan struct{}) 650 if f.set { 651 close(c) 652 return c 653 } 654 655 f.waiting = append(f.waiting, c) 656 return c 657 } 658 659 // Set sets the token value and unblocks any caller of Wait 660 func (f *tokenFuture) Set(token string) *tokenFuture { 661 f.m.Lock() 662 defer f.m.Unlock() 663 664 f.set = true 665 f.token = token 666 for _, w := range f.waiting { 667 close(w) 668 } 669 f.waiting = nil 670 return f 671 } 672 673 // Clear clears the set vault token. 
674 func (f *tokenFuture) Clear() *tokenFuture { 675 f.m.Lock() 676 defer f.m.Unlock() 677 678 f.token = "" 679 f.set = false 680 return f 681 } 682 683 // Get returns the set Vault token 684 func (f *tokenFuture) Get() string { 685 f.m.Lock() 686 defer f.m.Unlock() 687 return f.token 688 } 689 690 // vaultManager should be called in a go-routine and manages the derivation, 691 // renewal and handling of errors with the Vault token. The optional parameter 692 // allows setting the initial Vault token. This is useful when the Vault token 693 // is recovered off disk. 694 func (r *TaskRunner) vaultManager(token string) { 695 // Helper for stopping token renewal 696 stopRenewal := func() { 697 if err := r.vaultClient.StopRenewToken(r.vaultFuture.Get()); err != nil { 698 r.logger.Printf("[WARN] client: failed to stop token renewal for task %v in alloc %q: %v", r.task.Name, r.alloc.ID, err) 699 } 700 } 701 702 // updatedToken lets us store state between loops. If true, a new token 703 // has been retrieved and we need to apply the Vault change mode 704 var updatedToken bool 705 706 OUTER: 707 for { 708 // Check if we should exit 709 select { 710 case <-r.waitCh: 711 stopRenewal() 712 return 713 default: 714 } 715 716 // Clear the token 717 r.vaultFuture.Clear() 718 719 // Check if there already is a token which can be the case for 720 // restoring the TaskRunner 721 if token == "" { 722 // Get a token 723 var exit bool 724 token, exit = r.deriveVaultToken() 725 if exit { 726 // Exit the manager 727 return 728 } 729 730 // Write the token to disk 731 if err := r.writeToken(token); err != nil { 732 e := fmt.Errorf("failed to write Vault token to disk") 733 r.logger.Printf("[ERR] client: %v for task %v on alloc %q: %v", e, r.task.Name, r.alloc.ID, err) 734 r.Kill("vault", e.Error(), true) 735 return 736 } 737 } 738 739 // Start the renewal process 740 renewCh, err := r.vaultClient.RenewToken(token, 30) 741 742 // An error returned means the token is not being renewed 743 if err != nil { 744 r.logger.Printf("[ERR] client: failed to start renewal of Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err) 745 token = "" 746 goto OUTER 747 } 748 749 // The Vault token is valid now, so set it 750 r.vaultFuture.Set(token) 751 752 if updatedToken { 753 switch r.task.Vault.ChangeMode { 754 case structs.VaultChangeModeSignal: 755 s, err := signals.Parse(r.task.Vault.ChangeSignal) 756 if err != nil { 757 e := fmt.Errorf("failed to parse signal: %v", err) 758 r.logger.Printf("[ERR] client: %v", err) 759 r.Kill("vault", e.Error(), true) 760 return 761 } 762 763 if err := r.Signal("vault", "new Vault token acquired", s); err != nil { 764 r.logger.Printf("[ERR] client: failed to send signal to task %v for alloc %q: %v", r.task.Name, r.alloc.ID, err) 765 r.Kill("vault", fmt.Sprintf("failed to send signal to task: %v", err), true) 766 return 767 } 768 case structs.VaultChangeModeRestart: 769 r.Restart("vault", "new Vault token acquired") 770 case structs.VaultChangeModeNoop: 771 fallthrough 772 default: 773 r.logger.Printf("[ERR] client: Invalid Vault change mode: %q", r.task.Vault.ChangeMode) 774 } 775 776 // We have handled it 777 updatedToken = false 778 779 // Call the handler 780 r.updatedTokenHandler() 781 } 782 783 // Start watching for renewal errors 784 select { 785 case err := <-renewCh: 786 // Clear the token 787 token = "" 788 r.logger.Printf("[ERR] client: failed to renew Vault token for task %v on alloc %q: %v", r.task.Name, r.alloc.ID, err) 789 stopRenewal() 790 791 // Check if we 
have to do anything 792 if r.task.Vault.ChangeMode != structs.VaultChangeModeNoop { 793 updatedToken = true 794 } 795 case <-r.waitCh: 796 stopRenewal() 797 return 798 } 799 } 800 } 801 802 // deriveVaultToken derives the Vault token using exponential backoffs. It 803 // returns the Vault token and whether the manager should exit. 804 func (r *TaskRunner) deriveVaultToken() (token string, exit bool) { 805 attempts := 0 806 for { 807 tokens, err := r.vaultClient.DeriveToken(r.alloc, []string{r.task.Name}) 808 if err == nil { 809 return tokens[r.task.Name], false 810 } 811 812 // Check if we can't recover from the error 813 if !structs.IsRecoverable(err) { 814 r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v", 815 r.task.Name, r.alloc.ID, err) 816 r.Kill("vault", fmt.Sprintf("failed to derive token: %v", err), true) 817 return "", true 818 } 819 820 // Handle the retry case 821 backoff := (1 << (2 * uint64(attempts))) * vaultBackoffBaseline 822 if backoff > vaultBackoffLimit { 823 backoff = vaultBackoffLimit 824 } 825 r.logger.Printf("[ERR] client: failed to derive Vault token for task %v on alloc %q: %v; retrying in %v", 826 r.task.Name, r.alloc.ID, err, backoff) 827 828 attempts++ 829 830 // Wait till retrying 831 select { 832 case <-r.waitCh: 833 return "", true 834 case <-time.After(backoff): 835 } 836 } 837 } 838 839 // writeToken writes the given token to disk 840 func (r *TaskRunner) writeToken(token string) error { 841 tokenPath := filepath.Join(r.taskDir.SecretsDir, vaultTokenFile) 842 if err := ioutil.WriteFile(tokenPath, []byte(token), 0777); err != nil { 843 return fmt.Errorf("failed to save Vault tokens to secret dir for task %q in alloc %q: %v", r.task.Name, r.alloc.ID, err) 844 } 845 846 return nil 847 } 848 849 // updatedTokenHandler is called when a new Vault token is retrieved. Things 850 // that rely on the token should be updated here. 851 func (r *TaskRunner) updatedTokenHandler() { 852 853 // Update the tasks environment 854 if err := r.setTaskEnv(); err != nil { 855 r.setState( 856 structs.TaskStateDead, 857 structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask()) 858 return 859 } 860 861 if r.templateManager != nil { 862 r.templateManager.Stop() 863 864 // Create a new templateManager 865 var err error 866 r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates, 867 r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.getTaskEnv()) 868 if err != nil { 869 err := fmt.Errorf("failed to build task's template manager: %v", err) 870 r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask()) 871 r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err) 872 r.Kill("vault", err.Error(), true) 873 return 874 } 875 } 876 } 877 878 // prestart handles life-cycle tasks that occur before the task has started. 
func (r *TaskRunner) prestart(resultCh chan bool) {
    if r.task.Vault != nil {
        // Wait for the token
        r.logger.Printf("[DEBUG] client: waiting for Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
        tokenCh := r.vaultFuture.Wait()
        select {
        case <-tokenCh:
        case <-r.waitCh:
            resultCh <- false
            return
        }
        r.logger.Printf("[DEBUG] client: retrieved Vault token for task %v in alloc %q", r.task.Name, r.alloc.ID)
    }

    if err := r.setTaskEnv(); err != nil {
        r.setState(
            structs.TaskStateDead,
            structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
        resultCh <- false
        return
    }

    // If the job is a dispatch job and there is a payload, write it to disk
    requirePayload := len(r.alloc.Job.Payload) != 0 &&
        (r.task.DispatchPayload != nil && r.task.DispatchPayload.File != "")
    if !r.payloadRendered && requirePayload {
        renderTo := filepath.Join(r.taskDir.LocalDir, r.task.DispatchPayload.File)
        decoded, err := snappy.Decode(nil, r.alloc.Job.Payload)
        if err != nil {
            r.setState(
                structs.TaskStateDead,
                structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
            resultCh <- false
            return
        }

        if err := os.MkdirAll(filepath.Dir(renderTo), 0777); err != nil {
            r.setState(
                structs.TaskStateDead,
                structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
            resultCh <- false
            return
        }

        if err := ioutil.WriteFile(renderTo, decoded, 0777); err != nil {
            r.setState(
                structs.TaskStateDead,
                structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
            resultCh <- false
            return
        }

        r.payloadRendered = true
    }

    for {
        r.persistLock.Lock()
        downloaded := r.artifactsDownloaded
        r.persistLock.Unlock()

        // Download the task's artifacts
        if !downloaded && len(r.task.Artifacts) > 0 {
            r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskDownloadingArtifacts))
            for _, artifact := range r.task.Artifacts {
                if err := getter.GetArtifact(r.getTaskEnv(), artifact, r.taskDir.Dir); err != nil {
                    wrapped := fmt.Errorf("failed to download artifact %q: %v", artifact.GetterSource, err)
                    r.logger.Printf("[DEBUG] client: %v", wrapped)
                    r.setState(structs.TaskStatePending,
                        structs.NewTaskEvent(structs.TaskArtifactDownloadFailed).SetDownloadError(wrapped))
                    r.restartTracker.SetStartError(structs.WrapRecoverable(wrapped.Error(), err))
                    goto RESTART
                }
            }

            r.persistLock.Lock()
            r.artifactsDownloaded = true
            r.persistLock.Unlock()
        }

        // We don't have to wait for any template
        if len(r.task.Templates) == 0 {
            // Send the start signal
            select {
            case r.startCh <- struct{}{}:
            default:
            }

            resultCh <- true
            return
        }

        // Build the template manager
        if r.templateManager == nil {
            var err error
            r.templateManager, err = NewTaskTemplateManager(r, r.task.Templates,
                r.config, r.vaultFuture.Get(), r.taskDir.Dir, r.getTaskEnv())
            if err != nil {
                err := fmt.Errorf("failed to build task's template manager: %v", err)
                r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskSetupFailure).SetSetupError(err).SetFailsTask())
                r.logger.Printf("[ERR] client: alloc %q, task %q %v", r.alloc.ID, r.task.Name, err)
                resultCh <- false
                return
            }
        }

        // Block for consul-template
        // TODO Hooks should register themselves as blocking and then we can
        // periodically enumerate what we are still blocked on
        select {
        case <-r.unblockCh:
            // Send the start signal
            select {
            case r.startCh <- struct{}{}:
            default:
            }

            resultCh <- true
            return
        case <-r.waitCh:
            // The run loop has exited so exit too
            resultCh <- false
            return
        }

    RESTART:
        restart := r.shouldRestart()
        if !restart {
            resultCh <- false
            return
        }
    }
}

// postrun is used to do any cleanup that is necessary after exiting the runloop
func (r *TaskRunner) postrun() {
    // Stop the template manager
    if r.templateManager != nil {
        r.templateManager.Stop()
    }
}

// run is the main run loop that handles starting the application, destroying
// it, restarts and signals.
func (r *TaskRunner) run() {
    // Predeclare things so we can jump to the RESTART
    var stopCollection chan struct{}
    var handleWaitCh chan *dstructs.WaitResult

    // If we already have a handle, populate the stopCollection and handleWaitCh
    // to fix the invariant that it exists.
    r.handleLock.Lock()
    handleEmpty := r.handle == nil
    r.handleLock.Unlock()

    if !handleEmpty {
        stopCollection = make(chan struct{})
        go r.collectResourceUsageStats(stopCollection)
        handleWaitCh = r.handle.WaitCh()
    }

    for {
        // Do the prestart activities
        prestartResultCh := make(chan bool, 1)
        go r.prestart(prestartResultCh)

    WAIT:
        for {
            select {
            case success := <-prestartResultCh:
                if !success {
                    r.cleanup()
                    r.setState(structs.TaskStateDead, nil)
                    return
                }
            case <-r.startCh:
                // Start the task if not yet started or it is being forced. This logic
                // is necessary because in the case of a restore the handle already
                // exists.
                r.handleLock.Lock()
                handleEmpty := r.handle == nil
                r.handleLock.Unlock()
                if handleEmpty {
                    startErr := r.startTask()
                    r.restartTracker.SetStartError(startErr)
                    if startErr != nil {
                        r.setState("", structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(startErr))
                        goto RESTART
                    }

                    // Mark the task as started
                    r.setState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
                    r.runningLock.Lock()
                    r.running = true
                    r.runningLock.Unlock()

                    if stopCollection == nil {
                        stopCollection = make(chan struct{})
                        go r.collectResourceUsageStats(stopCollection)
                    }

                    handleWaitCh = r.handle.WaitCh()
                }

            case waitRes := <-handleWaitCh:
                if waitRes == nil {
                    panic("nil wait")
                }

                r.runningLock.Lock()
                r.running = false
                r.runningLock.Unlock()

                // Stop collection of the task's resource usage
                close(stopCollection)

                // Log whether the task was successful or not.
1095 r.restartTracker.SetWaitResult(waitRes) 1096 r.setState("", r.waitErrorToEvent(waitRes)) 1097 if !waitRes.Successful() { 1098 r.logger.Printf("[INFO] client: task %q for alloc %q failed: %v", r.task.Name, r.alloc.ID, waitRes) 1099 } else { 1100 r.logger.Printf("[INFO] client: task %q for alloc %q completed successfully", r.task.Name, r.alloc.ID) 1101 } 1102 1103 break WAIT 1104 case update := <-r.updateCh: 1105 if err := r.handleUpdate(update); err != nil { 1106 r.logger.Printf("[ERR] client: update to task %q failed: %v", r.task.Name, err) 1107 } 1108 1109 case se := <-r.signalCh: 1110 r.runningLock.Lock() 1111 running := r.running 1112 r.runningLock.Unlock() 1113 common := fmt.Sprintf("signal %v to task %v for alloc %q", se.s, r.task.Name, r.alloc.ID) 1114 if !running { 1115 // Send no error 1116 r.logger.Printf("[DEBUG] client: skipping %s", common) 1117 se.result <- nil 1118 continue 1119 } 1120 1121 r.logger.Printf("[DEBUG] client: sending %s", common) 1122 r.setState(structs.TaskStateRunning, se.e) 1123 1124 res := r.handle.Signal(se.s) 1125 se.result <- res 1126 1127 case event := <-r.restartCh: 1128 r.runningLock.Lock() 1129 running := r.running 1130 r.runningLock.Unlock() 1131 common := fmt.Sprintf("task %v for alloc %q", r.task.Name, r.alloc.ID) 1132 if !running { 1133 r.logger.Printf("[DEBUG] client: skipping restart of %v: task isn't running", common) 1134 continue 1135 } 1136 1137 r.logger.Printf("[DEBUG] client: restarting %s: %v", common, event.RestartReason) 1138 r.setState(structs.TaskStateRunning, event) 1139 r.killTask(nil) 1140 1141 close(stopCollection) 1142 1143 if handleWaitCh != nil { 1144 <-handleWaitCh 1145 } 1146 1147 // Since the restart isn't from a failure, restart immediately 1148 // and don't count against the restart policy 1149 r.restartTracker.SetRestartTriggered() 1150 break WAIT 1151 1152 case <-r.destroyCh: 1153 r.runningLock.Lock() 1154 running := r.running 1155 r.runningLock.Unlock() 1156 if !running { 1157 r.cleanup() 1158 r.setState(structs.TaskStateDead, r.destroyEvent) 1159 return 1160 } 1161 1162 // Remove from consul before killing the task so that traffic 1163 // can be rerouted 1164 r.consul.RemoveTask(r.alloc.ID, r.task) 1165 1166 // Store the task event that provides context on the task 1167 // destroy. The Killed event is set from the alloc_runner and 1168 // doesn't add detail 1169 var killEvent *structs.TaskEvent 1170 if r.destroyEvent.Type != structs.TaskKilled { 1171 if r.destroyEvent.Type == structs.TaskKilling { 1172 killEvent = r.destroyEvent 1173 } else { 1174 r.setState(structs.TaskStateRunning, r.destroyEvent) 1175 } 1176 } 1177 1178 r.killTask(killEvent) 1179 close(stopCollection) 1180 1181 // Wait for handler to exit before calling cleanup 1182 <-handleWaitCh 1183 r.cleanup() 1184 1185 r.setState(structs.TaskStateDead, nil) 1186 return 1187 } 1188 } 1189 1190 RESTART: 1191 // shouldRestart will block if the task should restart after a delay. 1192 restart := r.shouldRestart() 1193 if !restart { 1194 r.cleanup() 1195 r.setState(structs.TaskStateDead, nil) 1196 return 1197 } 1198 1199 // Clear the handle so a new driver will be created. 1200 r.handleLock.Lock() 1201 r.handle = nil 1202 handleWaitCh = nil 1203 stopCollection = nil 1204 r.handleLock.Unlock() 1205 } 1206 } 1207 1208 // cleanup removes Consul entries and calls Driver.Cleanup when a task is 1209 // stopping. Errors are logged. 
1210 func (r *TaskRunner) cleanup() { 1211 // Remove from Consul 1212 r.consul.RemoveTask(r.alloc.ID, r.task) 1213 1214 drv, err := r.createDriver() 1215 if err != nil { 1216 r.logger.Printf("[ERR] client: error creating driver to cleanup resources: %v", err) 1217 return 1218 } 1219 1220 res := r.getCreatedResources() 1221 1222 ctx := driver.NewExecContext(r.taskDir) 1223 attempts := 1 1224 var cleanupErr error 1225 for retry := true; retry; attempts++ { 1226 cleanupErr = drv.Cleanup(ctx, res) 1227 retry = structs.IsRecoverable(cleanupErr) 1228 1229 // Copy current createdResources state in case SaveState is 1230 // called between retries 1231 r.setCreatedResources(res) 1232 1233 // Retry 3 times with sleeps between 1234 if !retry || attempts > 3 { 1235 break 1236 } 1237 time.Sleep(time.Duration(attempts) * time.Second) 1238 } 1239 1240 if cleanupErr != nil { 1241 r.logger.Printf("[ERR] client: error cleaning up resources for task %q after %d attempts: %v", r.task.Name, attempts, cleanupErr) 1242 } 1243 return 1244 } 1245 1246 // shouldRestart returns if the task should restart. If the return value is 1247 // true, the task's restart policy has already been considered and any wait time 1248 // between restarts has been applied. 1249 func (r *TaskRunner) shouldRestart() bool { 1250 state, when := r.restartTracker.GetState() 1251 reason := r.restartTracker.GetReason() 1252 switch state { 1253 case structs.TaskNotRestarting, structs.TaskTerminated: 1254 r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID) 1255 if state == structs.TaskNotRestarting { 1256 r.setState(structs.TaskStateDead, 1257 structs.NewTaskEvent(structs.TaskNotRestarting). 1258 SetRestartReason(reason).SetFailsTask()) 1259 } 1260 return false 1261 case structs.TaskRestarting: 1262 r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when) 1263 r.setState(structs.TaskStatePending, 1264 structs.NewTaskEvent(structs.TaskRestarting). 1265 SetRestartDelay(when). 1266 SetRestartReason(reason)) 1267 default: 1268 r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state) 1269 return false 1270 } 1271 1272 // Unregister from Consul while waiting to restart. 1273 r.consul.RemoveTask(r.alloc.ID, r.task) 1274 1275 // Sleep but watch for destroy events. 1276 select { 1277 case <-time.After(when): 1278 case <-r.destroyCh: 1279 } 1280 1281 // Destroyed while we were waiting to restart, so abort. 1282 r.destroyLock.Lock() 1283 destroyed := r.destroy 1284 r.destroyLock.Unlock() 1285 if destroyed { 1286 r.logger.Printf("[DEBUG] client: Not restarting task: %v because it has been destroyed", r.task.Name) 1287 r.setState(structs.TaskStateDead, r.destroyEvent) 1288 return false 1289 } 1290 1291 return true 1292 } 1293 1294 // killTask kills the running task. A killing event can optionally be passed and 1295 // this event is used to mark the task as being killed. It provides a means to 1296 // store extra information. 
1297 func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) { 1298 r.runningLock.Lock() 1299 running := r.running 1300 r.runningLock.Unlock() 1301 if !running { 1302 return 1303 } 1304 1305 // Get the kill timeout 1306 timeout := driver.GetKillTimeout(r.task.KillTimeout, r.config.MaxKillTimeout) 1307 1308 // Build the event 1309 var event *structs.TaskEvent 1310 if killingEvent != nil { 1311 event = killingEvent 1312 event.Type = structs.TaskKilling 1313 } else { 1314 event = structs.NewTaskEvent(structs.TaskKilling) 1315 } 1316 event.SetKillTimeout(timeout) 1317 1318 // Mark that we received the kill event 1319 r.setState(structs.TaskStateRunning, event) 1320 1321 r.handleLock.Lock() 1322 handle := r.handle 1323 r.handleLock.Unlock() 1324 1325 // Kill the task using an exponential backoff in-case of failures. 1326 destroySuccess, err := r.handleDestroy(handle) 1327 if !destroySuccess { 1328 // We couldn't successfully destroy the resource created. 1329 r.logger.Printf("[ERR] client: failed to kill task %q. Resources may have been leaked: %v", r.task.Name, err) 1330 } 1331 1332 r.runningLock.Lock() 1333 r.running = false 1334 r.runningLock.Unlock() 1335 1336 // Store that the task has been destroyed and any associated error. 1337 r.setState("", structs.NewTaskEvent(structs.TaskKilled).SetKillError(err)) 1338 } 1339 1340 // startTask creates the driver, task dir, and starts the task. 1341 func (r *TaskRunner) startTask() error { 1342 // Create a driver 1343 drv, err := r.createDriver() 1344 if err != nil { 1345 return fmt.Errorf("failed to create driver of task %q for alloc %q: %v", 1346 r.task.Name, r.alloc.ID, err) 1347 } 1348 1349 // Run prestart 1350 ctx := driver.NewExecContext(r.taskDir) 1351 res, err := drv.Prestart(ctx, r.task) 1352 1353 // Merge newly created resources into previously created resources 1354 r.createdResourcesLock.Lock() 1355 r.createdResources.Merge(res) 1356 r.createdResourcesLock.Unlock() 1357 1358 if err != nil { 1359 wrapped := fmt.Sprintf("failed to initialize task %q for alloc %q: %v", 1360 r.task.Name, r.alloc.ID, err) 1361 r.logger.Printf("[WARN] client: error from prestart: %s", wrapped) 1362 return structs.WrapRecoverable(wrapped, err) 1363 } 1364 1365 // Start the job 1366 handle, err := drv.Start(ctx, r.task) 1367 if err != nil { 1368 wrapped := fmt.Sprintf("failed to start task %q for alloc %q: %v", 1369 r.task.Name, r.alloc.ID, err) 1370 r.logger.Printf("[WARN] client: %s", wrapped) 1371 return structs.WrapRecoverable(wrapped, err) 1372 1373 } 1374 1375 if err := r.registerServices(drv, handle); err != nil { 1376 // All IO is done asynchronously, so errors from registering 1377 // services are hard failures. 1378 r.logger.Printf("[ERR] client: failed to register services and checks for task %q alloc %q: %v", r.task.Name, r.alloc.ID, err) 1379 1380 // Kill the started task 1381 if destroyed, err := r.handleDestroy(handle); !destroyed { 1382 r.logger.Printf("[ERR] client: failed to kill task %q alloc %q. Resources may be leaked: %v", 1383 r.task.Name, r.alloc.ID, err) 1384 } 1385 return structs.NewRecoverableError(err, false) 1386 } 1387 1388 r.handleLock.Lock() 1389 r.handle = handle 1390 r.handleLock.Unlock() 1391 1392 return nil 1393 } 1394 1395 // registerServices and checks with Consul. 
1396 func (r *TaskRunner) registerServices(d driver.Driver, h driver.ScriptExecutor) error { 1397 var exec driver.ScriptExecutor 1398 if d.Abilities().Exec { 1399 // Allow set the script executor if the driver supports it 1400 exec = h 1401 } 1402 interpolateServices(r.getTaskEnv(), r.task) 1403 return r.consul.RegisterTask(r.alloc.ID, r.task, exec) 1404 } 1405 1406 // interpolateServices interpolates tags in a service and checks with values from the 1407 // task's environment. 1408 func interpolateServices(taskEnv *env.TaskEnvironment, task *structs.Task) { 1409 for _, service := range task.Services { 1410 for _, check := range service.Checks { 1411 check.Name = taskEnv.ReplaceEnv(check.Name) 1412 check.Type = taskEnv.ReplaceEnv(check.Type) 1413 check.Command = taskEnv.ReplaceEnv(check.Command) 1414 check.Args = taskEnv.ParseAndReplace(check.Args) 1415 check.Path = taskEnv.ReplaceEnv(check.Path) 1416 check.Protocol = taskEnv.ReplaceEnv(check.Protocol) 1417 check.PortLabel = taskEnv.ReplaceEnv(check.PortLabel) 1418 check.InitialStatus = taskEnv.ReplaceEnv(check.InitialStatus) 1419 } 1420 service.Name = taskEnv.ReplaceEnv(service.Name) 1421 service.PortLabel = taskEnv.ReplaceEnv(service.PortLabel) 1422 service.Tags = taskEnv.ParseAndReplace(service.Tags) 1423 } 1424 } 1425 1426 // buildTaskDir creates the task directory before driver.Prestart. It is safe 1427 // to call multiple times as its state is persisted. 1428 func (r *TaskRunner) buildTaskDir(fsi cstructs.FSIsolation) error { 1429 r.persistLock.Lock() 1430 built := r.taskDirBuilt 1431 r.persistLock.Unlock() 1432 1433 // We do not set the state again since this only occurs during restoration 1434 // and the task dir is already built. The reason we call Build again is to 1435 // ensure that the task dir invariants are still held. 1436 if !built { 1437 r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskSetup). 1438 SetMessage(structs.TaskBuildingTaskDir)) 1439 } 1440 1441 chroot := config.DefaultChrootEnv 1442 if len(r.config.ChrootEnv) > 0 { 1443 chroot = r.config.ChrootEnv 1444 } 1445 if err := r.taskDir.Build(built, chroot, fsi); err != nil { 1446 return err 1447 } 1448 1449 // Mark task dir as successfully built 1450 r.persistLock.Lock() 1451 r.taskDirBuilt = true 1452 r.persistLock.Unlock() 1453 return nil 1454 } 1455 1456 // collectResourceUsageStats starts collecting resource usage stats of a Task. 1457 // Collection ends when the passed channel is closed 1458 func (r *TaskRunner) collectResourceUsageStats(stopCollection <-chan struct{}) { 1459 // start collecting the stats right away and then start collecting every 1460 // collection interval 1461 next := time.NewTimer(0) 1462 defer next.Stop() 1463 for { 1464 select { 1465 case <-next.C: 1466 next.Reset(r.config.StatsCollectionInterval) 1467 if r.handle == nil { 1468 continue 1469 } 1470 ru, err := r.handle.Stats() 1471 1472 if err != nil { 1473 // Check if the driver doesn't implement stats 1474 if err.Error() == driver.DriverStatsNotImplemented.Error() { 1475 r.logger.Printf("[DEBUG] client: driver for task %q in allocation %q doesn't support stats", r.task.Name, r.alloc.ID) 1476 return 1477 } 1478 1479 // We do not log when the plugin is shutdown as this is simply a 1480 // race between the stopCollection channel being closed and calling 1481 // Stats on the handle. 
1482 if !strings.Contains(err.Error(), "connection is shut down") { 1483 r.logger.Printf("[WARN] client: error fetching stats of task %v: %v", r.task.Name, err) 1484 } 1485 continue 1486 } 1487 1488 r.resourceUsageLock.Lock() 1489 r.resourceUsage = ru 1490 r.resourceUsageLock.Unlock() 1491 if ru != nil { 1492 r.emitStats(ru) 1493 } 1494 case <-stopCollection: 1495 return 1496 } 1497 } 1498 } 1499 1500 // LatestResourceUsage returns the last resource utilization datapoint collected 1501 func (r *TaskRunner) LatestResourceUsage() *cstructs.TaskResourceUsage { 1502 r.resourceUsageLock.RLock() 1503 defer r.resourceUsageLock.RUnlock() 1504 r.runningLock.Lock() 1505 defer r.runningLock.Unlock() 1506 1507 // If the task is not running there can be no latest resource 1508 if !r.running { 1509 return nil 1510 } 1511 1512 return r.resourceUsage 1513 } 1514 1515 // handleUpdate takes an updated allocation and updates internal state to 1516 // reflect the new config for the task. 1517 func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { 1518 // Extract the task group from the alloc. 1519 tg := update.Job.LookupTaskGroup(update.TaskGroup) 1520 if tg == nil { 1521 return fmt.Errorf("alloc '%s' missing task group '%s'", update.ID, update.TaskGroup) 1522 } 1523 1524 // Extract the task. 1525 var updatedTask *structs.Task 1526 for _, t := range tg.Tasks { 1527 if t.Name == r.task.Name { 1528 updatedTask = t.Copy() 1529 } 1530 } 1531 if updatedTask == nil { 1532 return fmt.Errorf("task group %q doesn't contain task %q", tg.Name, r.task.Name) 1533 } 1534 1535 // Merge in the task resources 1536 updatedTask.Resources = update.TaskResources[updatedTask.Name] 1537 1538 var mErr multierror.Error 1539 r.handleLock.Lock() 1540 if r.handle != nil { 1541 drv, err := r.createDriver() 1542 if err != nil { 1543 // Something has really gone wrong; don't continue 1544 r.handleLock.Unlock() 1545 return fmt.Errorf("error accessing driver when updating task %q: %v", r.task.Name, err) 1546 } 1547 1548 // Update will update resources and store the new kill timeout. 1549 if err := r.handle.Update(updatedTask); err != nil { 1550 mErr.Errors = append(mErr.Errors, fmt.Errorf("updating task resources failed: %v", err)) 1551 } 1552 1553 if err := r.updateServices(drv, r.handle, r.task, updatedTask); err != nil { 1554 mErr.Errors = append(mErr.Errors, fmt.Errorf("error updating services and checks in Consul: %v", err)) 1555 } 1556 } 1557 r.handleLock.Unlock() 1558 1559 // Update the restart policy. 1560 if r.restartTracker != nil { 1561 r.restartTracker.SetPolicy(tg.RestartPolicy) 1562 } 1563 1564 // Store the updated alloc. 1565 r.alloc = update 1566 r.task = updatedTask 1567 return mErr.ErrorOrNil() 1568 } 1569 1570 // updateServices and checks with Consul. 1571 func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, old, new *structs.Task) error { 1572 var exec driver.ScriptExecutor 1573 if d.Abilities().Exec { 1574 // Allow set the script executor if the driver supports it 1575 exec = h 1576 } 1577 interpolateServices(r.getTaskEnv(), r.task) 1578 return r.consul.UpdateTask(r.alloc.ID, old, new, exec) 1579 } 1580 1581 // handleDestroy kills the task handle. In the case that killing fails, 1582 // handleDestroy will retry with an exponential backoff and will give up at a 1583 // given limit. It returns whether the task was destroyed and the error 1584 // associated with the last kill attempt. 
1585 func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, err error) { 1586 // Cap the number of times we attempt to kill the task. 1587 for i := 0; i < killFailureLimit; i++ { 1588 if err = handle.Kill(); err != nil { 1589 // Calculate the new backoff 1590 backoff := (1 << (2 * uint64(i))) * killBackoffBaseline 1591 if backoff > killBackoffLimit { 1592 backoff = killBackoffLimit 1593 } 1594 1595 r.logger.Printf("[ERR] client: failed to kill task '%s' for alloc %q. Retrying in %v: %v", 1596 r.task.Name, r.alloc.ID, backoff, err) 1597 time.Sleep(time.Duration(backoff)) 1598 } else { 1599 // Kill was successful 1600 return true, nil 1601 } 1602 } 1603 return 1604 } 1605 1606 // Restart will restart the task 1607 func (r *TaskRunner) Restart(source, reason string) { 1608 reasonStr := fmt.Sprintf("%s: %s", source, reason) 1609 event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr) 1610 1611 select { 1612 case r.restartCh <- event: 1613 case <-r.waitCh: 1614 } 1615 } 1616 1617 // Signal will send a signal to the task 1618 func (r *TaskRunner) Signal(source, reason string, s os.Signal) error { 1619 1620 reasonStr := fmt.Sprintf("%s: %s", source, reason) 1621 event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetTaskSignalReason(reasonStr) 1622 1623 resCh := make(chan error) 1624 se := SignalEvent{ 1625 s: s, 1626 e: event, 1627 result: resCh, 1628 } 1629 1630 select { 1631 case r.signalCh <- se: 1632 case <-r.waitCh: 1633 } 1634 1635 return <-resCh 1636 } 1637 1638 // Kill will kill a task and store the error, no longer restarting the task. If 1639 // fail is set, the task is marked as having failed. 1640 func (r *TaskRunner) Kill(source, reason string, fail bool) { 1641 reasonStr := fmt.Sprintf("%s: %s", source, reason) 1642 event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr) 1643 if fail { 1644 event.SetFailsTask() 1645 } 1646 1647 r.logger.Printf("[DEBUG] client: killing task %v for alloc %q: %v", r.task.Name, r.alloc.ID, reasonStr) 1648 r.Destroy(event) 1649 } 1650 1651 // UnblockStart unblocks the starting of the task. It currently assumes only 1652 // consul-template will unblock 1653 func (r *TaskRunner) UnblockStart(source string) { 1654 r.unblockLock.Lock() 1655 defer r.unblockLock.Unlock() 1656 if r.unblocked { 1657 return 1658 } 1659 1660 r.logger.Printf("[DEBUG] client: unblocking task %v for alloc %q: %v", r.task.Name, r.alloc.ID, source) 1661 r.unblocked = true 1662 close(r.unblockCh) 1663 } 1664 1665 // Helper function for converting a WaitResult into a TaskTerminated event. 1666 func (r *TaskRunner) waitErrorToEvent(res *dstructs.WaitResult) *structs.TaskEvent { 1667 return structs.NewTaskEvent(structs.TaskTerminated). 1668 SetExitCode(res.ExitCode). 1669 SetSignal(res.Signal). 1670 SetExitMessage(res.Err) 1671 } 1672 1673 // Update is used to update the task of the context 1674 func (r *TaskRunner) Update(update *structs.Allocation) { 1675 select { 1676 case r.updateCh <- update: 1677 default: 1678 r.logger.Printf("[ERR] client: dropping task update '%s' (alloc '%s')", 1679 r.task.Name, r.alloc.ID) 1680 } 1681 } 1682 1683 // Destroy is used to indicate that the task context should be destroyed. The 1684 // event parameter provides a context for the destroy. 
1685 func (r *TaskRunner) Destroy(event *structs.TaskEvent) { 1686 r.destroyLock.Lock() 1687 defer r.destroyLock.Unlock() 1688 1689 if r.destroy { 1690 return 1691 } 1692 r.destroy = true 1693 r.destroyEvent = event 1694 close(r.destroyCh) 1695 } 1696 1697 // getCreatedResources returns the resources created by drivers. It will never 1698 // return nil. 1699 func (r *TaskRunner) getCreatedResources() *driver.CreatedResources { 1700 r.createdResourcesLock.Lock() 1701 if r.createdResources == nil { 1702 r.createdResources = driver.NewCreatedResources() 1703 } 1704 cr := r.createdResources.Copy() 1705 r.createdResourcesLock.Unlock() 1706 1707 return cr 1708 } 1709 1710 // setCreatedResources updates the resources created by drivers. If passed nil 1711 // it will set createdResources to an initialized struct. 1712 func (r *TaskRunner) setCreatedResources(cr *driver.CreatedResources) { 1713 if cr == nil { 1714 cr = driver.NewCreatedResources() 1715 } 1716 r.createdResourcesLock.Lock() 1717 r.createdResources = cr.Copy() 1718 r.createdResourcesLock.Unlock() 1719 } 1720 1721 // emitStats emits resource usage stats of tasks to remote metrics collector 1722 // sinks 1723 func (r *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { 1724 if ru.ResourceUsage.MemoryStats != nil && r.config.PublishAllocationMetrics { 1725 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS)) 1726 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache)) 1727 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap)) 1728 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage)) 1729 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage)) 1730 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage)) 1731 } 1732 1733 if ru.ResourceUsage.CpuStats != nil && r.config.PublishAllocationMetrics { 1734 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "total_percent"}, float32(ru.ResourceUsage.CpuStats.Percent)) 1735 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "system"}, float32(ru.ResourceUsage.CpuStats.SystemMode)) 1736 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "user"}, float32(ru.ResourceUsage.CpuStats.UserMode)) 1737 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_time"}, float32(ru.ResourceUsage.CpuStats.ThrottledTime)) 1738 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, "cpu", "throttled_periods"}, float32(ru.ResourceUsage.CpuStats.ThrottledPeriods)) 1739 metrics.SetGauge([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, r.alloc.ID, r.task.Name, 
"cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks)) 1740 } 1741 }