// Source: github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/client/alloc_runner.go

package client

import (
	"fmt"
	"log"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/boltdb/bolt"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/client/allocdir"
	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/vaultclient"
	"github.com/hashicorp/nomad/nomad/structs"

	cstructs "github.com/hashicorp/nomad/client/structs"
)

const (
	// taskReceivedSyncLimit is how long the client will wait before sending
	// that a task was received to the server. The client does not immediately
	// send that the task was received to the server because another transition
	// to running or failed is likely to occur immediately after and a single
	// update will transfer all past state information. If no other transition
	// has occurred up to this limit, we will send to the server.
	taskReceivedSyncLimit = 30 * time.Second
)

var (
	// The following are the key paths written to the state database. Each key
	// lives inside the allocation's bucket and holds one independently
	// written piece of alloc runner state.
	allocRunnerStateImmutableKey = []byte("immutable")
	allocRunnerStateMutableKey   = []byte("mutable")
	allocRunnerStateAllocDirKey  = []byte("alloc-dir")
)

// AllocStateUpdater is used to update the status of an allocation
type AllocStateUpdater func(alloc *structs.Allocation)

// AllocStatsReporter exposes the most recent resource usage of an allocation.
// An empty taskFilter requests stats for every task; otherwise only the named
// task is reported.
type AllocStatsReporter interface {
	LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error)
}

// AllocRunner is used to wrap an allocation and provide the execution context.
type AllocRunner struct {
	config  *config.Config
	updater AllocStateUpdater // called with a copy of the alloc whenever status is synced
	logger  *log.Logger

	// alloc and the explicit client status fields are read and written under
	// allocLock.
	alloc                  *structs.Allocation
	allocClientStatus      string // Explicit status of allocation. Set when there are failures
	allocClientDescription string
	allocLock              sync.Mutex

	// dirtyCh is a one-slot signal that state changed and should be synced.
	// Senders never block; a pending signal absorbs further ones.
	dirtyCh chan struct{}

	// allocDir is the allocation directory. It is created in Run, or
	// populated by RestoreState.
	allocDir     *allocdir.AllocDir
	allocDirLock sync.Mutex

	// tasks maps task name to its runner. restored holds the names of tasks
	// whose runners were created by RestoreState so that Run does not create
	// them again.
	tasks      map[string]*TaskRunner
	taskStates map[string]*structs.TaskState
	restored   map[string]struct{}
	taskLock   sync.RWMutex

	// taskStatusLock is held while taskStates is read by Alloc or mutated by
	// setTaskState.
	taskStatusLock sync.RWMutex

	// updateCh carries allocation updates from Update to the Run loop.
	updateCh chan *structs.Allocation

	vaultClient  vaultclient.VaultClient
	consulClient ConsulServiceAPI

	// otherAllocDir is the previous allocation's directory as set through
	// SetPreviousAllocDir. Run moves its contents into allocDir and then
	// destroys it.
	otherAllocDir *allocdir.AllocDir

	// destroy records that destroyCh has been closed; destroyLock makes
	// Destroy safe to call more than once. waitCh is closed when Run returns.
	destroy     bool
	destroyCh   chan struct{}
	destroyLock sync.Mutex
	waitCh      chan struct{}

	// State related fields
	// stateDB is used to store the alloc runners state
	stateDB *bolt.DB

	// immutablePersisted and allocDirPersisted are used to track whether the
	// immutable data and the alloc dir have been persisted. Once persisted we
	// can lower write volume by not re-writing these values
	immutablePersisted bool
	allocDirPersisted  bool
}

// COMPAT: Remove in 0.7.0
// allocRunnerState is used to snapshot the state of the alloc runner. It is
// the pre v0.6.0 on-disk format and is only read, never written, by
// RestoreState.
type allocRunnerState struct {
	Version                string
	Alloc                  *structs.Allocation
	AllocDir               *allocdir.AllocDir
	AllocClientStatus      string
	AllocClientDescription string

	// COMPAT: Remove in 0.7.0: removing will break upgrading directly from
	// 0.5.2, so don't remove in the 0.6 series.
	// Context is deprecated and only used to migrate from older releases.
	// It will be removed in the future.
	Context *struct {
		AllocID  string // unused; included for completeness
		AllocDir struct {
			AllocDir  string
			SharedDir string // unused; included for completeness
			TaskDirs  map[string]string
		}
	} `json:"Context,omitempty"`
}

// allocRunnerImmutableState is state that only has to be written once as it
// doesn't change over the life-cycle of the alloc_runner.
116 type allocRunnerImmutableState struct { 117 Version string 118 Alloc *structs.Allocation 119 } 120 121 // allocRunnerMutableState is state that has to be written on each save as it 122 // changes over the life-cycle of the alloc_runner. 123 type allocRunnerMutableState struct { 124 AllocClientStatus string 125 AllocClientDescription string 126 TaskStates map[string]*structs.TaskState 127 } 128 129 // NewAllocRunner is used to create a new allocation context 130 func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB, updater AllocStateUpdater, 131 alloc *structs.Allocation, vaultClient vaultclient.VaultClient, 132 consulClient ConsulServiceAPI) *AllocRunner { 133 134 ar := &AllocRunner{ 135 config: config, 136 stateDB: stateDB, 137 updater: updater, 138 logger: logger, 139 alloc: alloc, 140 dirtyCh: make(chan struct{}, 1), 141 tasks: make(map[string]*TaskRunner), 142 taskStates: copyTaskStates(alloc.TaskStates), 143 restored: make(map[string]struct{}), 144 updateCh: make(chan *structs.Allocation, 64), 145 destroyCh: make(chan struct{}), 146 waitCh: make(chan struct{}), 147 vaultClient: vaultClient, 148 consulClient: consulClient, 149 } 150 return ar 151 } 152 153 // pre060StateFilePath returns the path to our state file that would have been 154 // written pre v0.6.0 155 // COMPAT: Remove in 0.7.0 156 func (r *AllocRunner) pre060StateFilePath() string { 157 r.allocLock.Lock() 158 defer r.allocLock.Unlock() 159 path := filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, "state.json") 160 return path 161 } 162 163 // RestoreState is used to restore the state of the alloc runner 164 func (r *AllocRunner) RestoreState() error { 165 166 // COMPAT: Remove in 0.7.0 167 // Check if the old snapshot is there 168 oldPath := r.pre060StateFilePath() 169 var snap allocRunnerState 170 var upgrading bool 171 if err := pre060RestoreState(oldPath, &snap); err == nil { 172 // Restore fields 173 r.logger.Printf("[INFO] client: restoring pre v0.6.0 alloc 
runner state for alloc %q", r.alloc.ID) 174 r.alloc = snap.Alloc 175 r.allocDir = snap.AllocDir 176 r.allocClientStatus = snap.AllocClientStatus 177 r.allocClientDescription = snap.AllocClientDescription 178 179 if r.alloc != nil { 180 r.taskStates = snap.Alloc.TaskStates 181 } 182 183 // COMPAT: Remove in 0.7.0 184 // #2132 Upgrade path: if snap.AllocDir is nil, try to convert old 185 // Context struct to new AllocDir struct 186 if snap.AllocDir == nil && snap.Context != nil { 187 r.logger.Printf("[DEBUG] client: migrating state snapshot for alloc %q", r.alloc.ID) 188 r.allocDir = allocdir.NewAllocDir(r.logger, snap.Context.AllocDir.AllocDir) 189 for taskName := range snap.Context.AllocDir.TaskDirs { 190 r.allocDir.NewTaskDir(taskName) 191 } 192 } 193 194 // Delete the old state 195 os.RemoveAll(oldPath) 196 upgrading = true 197 } else if !os.IsNotExist(err) { 198 // Something corrupt in the old state file 199 return err 200 } else { 201 // We are doing a normal restore 202 err := r.stateDB.View(func(tx *bolt.Tx) error { 203 bkt, err := getAllocationBucket(tx, r.alloc.ID) 204 if err != nil { 205 return fmt.Errorf("failed to get allocation bucket: %v", err) 206 } 207 208 // Get the state objects 209 var mutable allocRunnerMutableState 210 var immutable allocRunnerImmutableState 211 var allocDir allocdir.AllocDir 212 213 if err := getObject(bkt, allocRunnerStateImmutableKey, &immutable); err != nil { 214 return fmt.Errorf("failed to read alloc runner immutable state: %v", err) 215 } 216 if err := getObject(bkt, allocRunnerStateMutableKey, &mutable); err != nil { 217 return fmt.Errorf("failed to read alloc runner mutable state: %v", err) 218 } 219 if err := getObject(bkt, allocRunnerStateAllocDirKey, &allocDir); err != nil { 220 return fmt.Errorf("failed to read alloc runner alloc_dir state: %v", err) 221 } 222 223 // Populate the fields 224 r.alloc = immutable.Alloc 225 r.allocDir = &allocDir 226 r.allocClientStatus = mutable.AllocClientStatus 227 
r.allocClientDescription = mutable.AllocClientDescription 228 r.taskStates = mutable.TaskStates 229 r.alloc.ClientStatus = getClientStatus(r.taskStates) 230 return nil 231 }) 232 233 if err != nil { 234 return fmt.Errorf("failed to read allocation state: %v", err) 235 } 236 } 237 238 var snapshotErrors multierror.Error 239 if r.alloc == nil { 240 snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil allocation")) 241 } 242 if r.allocDir == nil { 243 snapshotErrors.Errors = append(snapshotErrors.Errors, fmt.Errorf("alloc_runner snapshot includes a nil alloc dir")) 244 } 245 if e := snapshotErrors.ErrorOrNil(); e != nil { 246 return e 247 } 248 249 tg := r.alloc.Job.LookupTaskGroup(r.alloc.TaskGroup) 250 if tg == nil { 251 return fmt.Errorf("restored allocation doesn't contain task group %q", r.alloc.TaskGroup) 252 } 253 254 // Restore the task runners 255 var mErr multierror.Error 256 for _, task := range tg.Tasks { 257 name := task.Name 258 state := r.taskStates[name] 259 260 // Mark the task as restored. 261 r.restored[name] = struct{}{} 262 263 td, ok := r.allocDir.TaskDirs[name] 264 if !ok { 265 err := fmt.Errorf("failed to find task dir metadata for alloc %q task %q", 266 r.alloc.ID, name) 267 r.logger.Printf("[ERR] client: %v", err) 268 return err 269 } 270 271 tr := NewTaskRunner(r.logger, r.config, r.stateDB, r.setTaskState, td, r.Alloc(), task, r.vaultClient, r.consulClient) 272 r.tasks[name] = tr 273 274 // Skip tasks in terminal states. 275 if state.State == structs.TaskStateDead { 276 continue 277 } 278 279 if restartReason, err := tr.RestoreState(); err != nil { 280 r.logger.Printf("[ERR] client: failed to restore state for alloc %s task %q: %v", r.alloc.ID, name, err) 281 mErr.Errors = append(mErr.Errors, err) 282 } else if !r.alloc.TerminalStatus() { 283 // Only start if the alloc isn't in a terminal status. 
284 go tr.Run() 285 286 if upgrading { 287 if err := tr.SaveState(); err != nil { 288 r.logger.Printf("[WARN] client: initial save state for alloc %s task %s failed: %v", r.alloc.ID, name, err) 289 } 290 } 291 292 // Restart task runner if RestoreState gave a reason 293 if restartReason != "" { 294 r.logger.Printf("[INFO] client: restarting alloc %s task %s: %v", r.alloc.ID, name, restartReason) 295 tr.Restart("upgrade", restartReason) 296 } 297 } 298 } 299 300 return mErr.ErrorOrNil() 301 } 302 303 // SaveState is used to snapshot the state of the alloc runner 304 // if the fullSync is marked as false only the state of the Alloc Runner 305 // is snapshotted. If fullSync is marked as true, we snapshot 306 // all the Task Runners associated with the Alloc 307 func (r *AllocRunner) SaveState() error { 308 if err := r.saveAllocRunnerState(); err != nil { 309 return err 310 } 311 312 // Save state for each task 313 runners := r.getTaskRunners() 314 var mErr multierror.Error 315 for _, tr := range runners { 316 if err := r.saveTaskRunnerState(tr); err != nil { 317 mErr.Errors = append(mErr.Errors, err) 318 } 319 } 320 return mErr.ErrorOrNil() 321 } 322 323 func (r *AllocRunner) saveAllocRunnerState() error { 324 // Grab all the relevant data 325 alloc := r.Alloc() 326 327 r.allocLock.Lock() 328 allocClientStatus := r.allocClientStatus 329 allocClientDescription := r.allocClientDescription 330 r.allocLock.Unlock() 331 332 r.allocDirLock.Lock() 333 allocDir := r.allocDir 334 r.allocDirLock.Unlock() 335 336 // Start the transaction. 
337 return r.stateDB.Batch(func(tx *bolt.Tx) error { 338 339 // Grab the allocation bucket 340 allocBkt, err := getAllocationBucket(tx, r.alloc.ID) 341 if err != nil { 342 return fmt.Errorf("failed to retrieve allocation bucket: %v", err) 343 } 344 345 // Write the immutable data 346 if !r.immutablePersisted { 347 immutable := &allocRunnerImmutableState{ 348 Alloc: alloc, 349 Version: r.config.Version, 350 } 351 352 if err := putObject(allocBkt, allocRunnerStateImmutableKey, &immutable); err != nil { 353 return fmt.Errorf("failed to write alloc_runner immutable state: %v", err) 354 } 355 356 tx.OnCommit(func() { 357 r.immutablePersisted = true 358 }) 359 } 360 361 // Write the alloc dir data if it hasn't been written before and it exists. 362 if !r.allocDirPersisted && r.allocDir != nil { 363 if err := putObject(allocBkt, allocRunnerStateAllocDirKey, allocDir); err != nil { 364 return fmt.Errorf("failed to write alloc_runner allocDir state: %v", err) 365 } 366 367 tx.OnCommit(func() { 368 r.allocDirPersisted = true 369 }) 370 } 371 372 // Write the mutable state every time 373 mutable := &allocRunnerMutableState{ 374 AllocClientStatus: allocClientStatus, 375 AllocClientDescription: allocClientDescription, 376 TaskStates: alloc.TaskStates, 377 } 378 379 if err := putObject(allocBkt, allocRunnerStateMutableKey, &mutable); err != nil { 380 return fmt.Errorf("failed to write alloc_runner mutable state: %v", err) 381 } 382 383 return nil 384 }) 385 } 386 387 func (r *AllocRunner) saveTaskRunnerState(tr *TaskRunner) error { 388 if err := tr.SaveState(); err != nil { 389 return fmt.Errorf("failed to save state for alloc %s task '%s': %v", 390 r.alloc.ID, tr.task.Name, err) 391 } 392 return nil 393 } 394 395 // DestroyState is used to cleanup after ourselves 396 func (r *AllocRunner) DestroyState() error { 397 return r.stateDB.Update(func(tx *bolt.Tx) error { 398 if err := deleteAllocationBucket(tx, r.alloc.ID); err != nil { 399 return fmt.Errorf("failed to delete 
allocation bucket: %v", err) 400 } 401 return nil 402 }) 403 } 404 405 // DestroyContext is used to destroy the context 406 func (r *AllocRunner) DestroyContext() error { 407 return r.allocDir.Destroy() 408 } 409 410 // GetAllocDir returns the alloc dir for the alloc runner 411 func (r *AllocRunner) GetAllocDir() *allocdir.AllocDir { 412 return r.allocDir 413 } 414 415 // copyTaskStates returns a copy of the passed task states. 416 func copyTaskStates(states map[string]*structs.TaskState) map[string]*structs.TaskState { 417 copy := make(map[string]*structs.TaskState, len(states)) 418 for task, state := range states { 419 copy[task] = state.Copy() 420 } 421 return copy 422 } 423 424 // Alloc returns the associated allocation 425 func (r *AllocRunner) Alloc() *structs.Allocation { 426 r.allocLock.Lock() 427 428 // Clear the job before copying 429 job := r.alloc.Job 430 431 // Since we are clearing the job, anything that access the alloc.Job field 432 // must acquire the lock or access it via this method. 433 r.alloc.Job = nil 434 435 alloc := r.alloc.Copy() 436 437 // Restore 438 r.alloc.Job = job 439 alloc.Job = job 440 441 // The status has explicitly been set. 
442 if r.allocClientStatus != "" || r.allocClientDescription != "" { 443 alloc.ClientStatus = r.allocClientStatus 444 alloc.ClientDescription = r.allocClientDescription 445 446 // Copy over the task states so we don't lose them 447 r.taskStatusLock.RLock() 448 alloc.TaskStates = copyTaskStates(r.taskStates) 449 r.taskStatusLock.RUnlock() 450 451 r.allocLock.Unlock() 452 return alloc 453 } 454 r.allocLock.Unlock() 455 456 // Scan the task states to determine the status of the alloc 457 r.taskStatusLock.RLock() 458 alloc.TaskStates = copyTaskStates(r.taskStates) 459 alloc.ClientStatus = getClientStatus(r.taskStates) 460 r.taskStatusLock.RUnlock() 461 462 return alloc 463 } 464 465 // getClientStatus takes in the task states for a given allocation and computes 466 // the client status 467 func getClientStatus(taskStates map[string]*structs.TaskState) string { 468 var pending, running, dead, failed bool 469 for _, state := range taskStates { 470 switch state.State { 471 case structs.TaskStateRunning: 472 running = true 473 case structs.TaskStatePending: 474 pending = true 475 case structs.TaskStateDead: 476 if state.Failed { 477 failed = true 478 } else { 479 dead = true 480 } 481 } 482 } 483 484 // Determine the alloc status 485 if failed { 486 return structs.AllocClientStatusFailed 487 } else if running { 488 return structs.AllocClientStatusRunning 489 } else if pending { 490 return structs.AllocClientStatusPending 491 } else if dead { 492 return structs.AllocClientStatusComplete 493 } 494 495 return "" 496 } 497 498 // dirtySyncState is used to watch for state being marked dirty to sync 499 func (r *AllocRunner) dirtySyncState() { 500 for { 501 select { 502 case <-r.dirtyCh: 503 r.syncStatus() 504 case <-r.destroyCh: 505 return 506 } 507 } 508 } 509 510 // syncStatus is used to run and sync the status when it changes 511 func (r *AllocRunner) syncStatus() error { 512 // Get a copy of our alloc, update status server side and sync to disk 513 alloc := r.Alloc() 514 
r.updater(alloc) 515 return r.saveAllocRunnerState() 516 } 517 518 // setStatus is used to update the allocation status 519 func (r *AllocRunner) setStatus(status, desc string) { 520 r.allocLock.Lock() 521 r.allocClientStatus = status 522 r.allocClientDescription = desc 523 r.allocLock.Unlock() 524 select { 525 case r.dirtyCh <- struct{}{}: 526 default: 527 } 528 } 529 530 // setTaskState is used to set the status of a task. If state is empty then the 531 // event is appended but not synced with the server. The event may be omitted 532 func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEvent) { 533 r.taskStatusLock.Lock() 534 defer r.taskStatusLock.Unlock() 535 taskState, ok := r.taskStates[taskName] 536 if !ok { 537 taskState = &structs.TaskState{} 538 r.taskStates[taskName] = taskState 539 } 540 541 // Set the tasks state. 542 if event != nil { 543 if event.FailsTask { 544 taskState.Failed = true 545 } 546 r.appendTaskEvent(taskState, event) 547 } 548 549 if state == "" { 550 return 551 } 552 553 switch state { 554 case structs.TaskStateRunning: 555 // Capture the start time if it is just starting 556 if taskState.State != structs.TaskStateRunning { 557 taskState.StartedAt = time.Now().UTC() 558 } 559 case structs.TaskStateDead: 560 // Capture the finished time. If it has never started there is no finish 561 // time 562 if !taskState.StartedAt.IsZero() { 563 taskState.FinishedAt = time.Now().UTC() 564 } 565 566 // Find all tasks that are not the one that is dead and check if the one 567 // that is dead is a leader 568 var otherTaskRunners []*TaskRunner 569 var otherTaskNames []string 570 leader := false 571 for task, tr := range r.tasks { 572 if task != taskName { 573 otherTaskRunners = append(otherTaskRunners, tr) 574 otherTaskNames = append(otherTaskNames, task) 575 } else if tr.task.Leader { 576 leader = true 577 } 578 } 579 580 // If the task failed, we should kill all the other tasks in the task group. 
581 if taskState.Failed { 582 for _, tr := range otherTaskRunners { 583 tr.Destroy(structs.NewTaskEvent(structs.TaskSiblingFailed).SetFailedSibling(taskName)) 584 } 585 if len(otherTaskRunners) > 0 { 586 r.logger.Printf("[DEBUG] client: task %q failed, destroying other tasks in task group: %v", taskName, otherTaskNames) 587 } 588 } else if leader { 589 // If the task was a leader task we should kill all the other tasks. 590 for _, tr := range otherTaskRunners { 591 tr.Destroy(structs.NewTaskEvent(structs.TaskLeaderDead)) 592 } 593 if len(otherTaskRunners) > 0 { 594 r.logger.Printf("[DEBUG] client: leader task %q is dead, destroying other tasks in task group: %v", taskName, otherTaskNames) 595 } 596 } 597 } 598 599 // Store the new state 600 taskState.State = state 601 602 select { 603 case r.dirtyCh <- struct{}{}: 604 default: 605 } 606 } 607 608 // appendTaskEvent updates the task status by appending the new event. 609 func (r *AllocRunner) appendTaskEvent(state *structs.TaskState, event *structs.TaskEvent) { 610 capacity := 10 611 if state.Events == nil { 612 state.Events = make([]*structs.TaskEvent, 0, capacity) 613 } 614 615 // If we hit capacity, then shift it. 616 if len(state.Events) == capacity { 617 old := state.Events 618 state.Events = make([]*structs.TaskEvent, 0, capacity) 619 state.Events = append(state.Events, old[1:]...) 
620 } 621 622 state.Events = append(state.Events, event) 623 } 624 625 // Run is a long running goroutine used to manage an allocation 626 func (r *AllocRunner) Run() { 627 defer close(r.waitCh) 628 go r.dirtySyncState() 629 630 // Find the task group to run in the allocation 631 alloc := r.Alloc() 632 tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) 633 if tg == nil { 634 r.logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup) 635 r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("missing task group '%s'", alloc.TaskGroup)) 636 return 637 } 638 639 // Create the execution context 640 r.allocDirLock.Lock() 641 if r.allocDir == nil { 642 // Build allocation directory 643 r.allocDir = allocdir.NewAllocDir(r.logger, filepath.Join(r.config.AllocDir, r.alloc.ID)) 644 if err := r.allocDir.Build(); err != nil { 645 r.logger.Printf("[WARN] client: failed to build task directories: %v", err) 646 r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("failed to build task dirs for '%s'", alloc.TaskGroup)) 647 r.allocDirLock.Unlock() 648 return 649 } 650 651 if r.otherAllocDir != nil { 652 if err := r.allocDir.Move(r.otherAllocDir, tg.Tasks); err != nil { 653 r.logger.Printf("[ERROR] client: failed to move alloc dir into alloc %q: %v", r.alloc.ID, err) 654 } 655 if err := r.otherAllocDir.Destroy(); err != nil { 656 r.logger.Printf("[ERROR] client: error destroying allocdir %v: %v", r.otherAllocDir.AllocDir, err) 657 } 658 } 659 } 660 r.allocDirLock.Unlock() 661 662 // Check if the allocation is in a terminal status. In this case, we don't 663 // start any of the task runners and directly wait for the destroy signal to 664 // clean up the allocation. 
665 if alloc.TerminalStatus() { 666 r.logger.Printf("[DEBUG] client: alloc %q in terminal status, waiting for destroy", r.alloc.ID) 667 r.handleDestroy() 668 r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID) 669 return 670 } 671 672 // Start the task runners 673 r.logger.Printf("[DEBUG] client: starting task runners for alloc '%s'", r.alloc.ID) 674 r.taskLock.Lock() 675 for _, task := range tg.Tasks { 676 if _, ok := r.restored[task.Name]; ok { 677 continue 678 } 679 680 r.allocDirLock.Lock() 681 taskdir := r.allocDir.NewTaskDir(task.Name) 682 r.allocDirLock.Unlock() 683 684 tr := NewTaskRunner(r.logger, r.config, r.stateDB, r.setTaskState, taskdir, r.Alloc(), task.Copy(), r.vaultClient, r.consulClient) 685 r.tasks[task.Name] = tr 686 tr.MarkReceived() 687 688 go tr.Run() 689 } 690 r.taskLock.Unlock() 691 692 // taskDestroyEvent contains an event that caused the destroyment of a task 693 // in the allocation. 694 var taskDestroyEvent *structs.TaskEvent 695 696 OUTER: 697 // Wait for updates 698 for { 699 select { 700 case update := <-r.updateCh: 701 // Store the updated allocation. 
702 r.allocLock.Lock() 703 r.alloc = update 704 r.allocLock.Unlock() 705 706 // Check if we're in a terminal status 707 if update.TerminalStatus() { 708 taskDestroyEvent = structs.NewTaskEvent(structs.TaskKilled) 709 break OUTER 710 } 711 712 // Update the task groups 713 runners := r.getTaskRunners() 714 for _, tr := range runners { 715 tr.Update(update) 716 } 717 718 if err := r.syncStatus(); err != nil { 719 r.logger.Printf("[WARN] client: failed to sync status upon receiving alloc update: %v", err) 720 } 721 case <-r.destroyCh: 722 taskDestroyEvent = structs.NewTaskEvent(structs.TaskKilled) 723 break OUTER 724 } 725 } 726 727 // Kill the task runners 728 r.destroyTaskRunners(taskDestroyEvent) 729 730 // Block until we should destroy the state of the alloc 731 r.handleDestroy() 732 r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID) 733 } 734 735 // SetPreviousAllocDir sets the previous allocation directory of the current 736 // allocation 737 func (r *AllocRunner) SetPreviousAllocDir(allocDir *allocdir.AllocDir) { 738 r.otherAllocDir = allocDir 739 } 740 741 // destroyTaskRunners destroys the task runners, waits for them to terminate and 742 // then saves state. 743 func (r *AllocRunner) destroyTaskRunners(destroyEvent *structs.TaskEvent) { 744 // Destroy each sub-task 745 runners := r.getTaskRunners() 746 for _, tr := range runners { 747 tr.Destroy(destroyEvent) 748 } 749 750 // Wait for termination of the task runners 751 for _, tr := range runners { 752 <-tr.WaitCh() 753 } 754 } 755 756 // handleDestroy blocks till the AllocRunner should be destroyed and does the 757 // necessary cleanup. 758 func (r *AllocRunner) handleDestroy() { 759 // Final state sync. We do this to ensure that the server has the correct 760 // state as we wait for a destroy. 
761 r.syncStatus() 762 763 for { 764 select { 765 case <-r.destroyCh: 766 if err := r.DestroyContext(); err != nil { 767 r.logger.Printf("[ERR] client: failed to destroy context for alloc '%s': %v", 768 r.alloc.ID, err) 769 } 770 if err := r.DestroyState(); err != nil { 771 r.logger.Printf("[ERR] client: failed to destroy state for alloc '%s': %v", 772 r.alloc.ID, err) 773 } 774 775 return 776 case <-r.updateCh: 777 r.logger.Printf("[ERR] client: dropping update to terminal alloc '%s'", r.alloc.ID) 778 } 779 } 780 } 781 782 // Update is used to update the allocation of the context 783 func (r *AllocRunner) Update(update *structs.Allocation) { 784 select { 785 case r.updateCh <- update: 786 default: 787 r.logger.Printf("[ERR] client: dropping update to alloc '%s'", update.ID) 788 } 789 } 790 791 // StatsReporter returns an interface to query resource usage statistics of an 792 // allocation 793 func (r *AllocRunner) StatsReporter() AllocStatsReporter { 794 return r 795 } 796 797 // getTaskRunners is a helper that returns a copy of the task runners list using 798 // the taskLock. 799 func (r *AllocRunner) getTaskRunners() []*TaskRunner { 800 // Get the task runners 801 r.taskLock.RLock() 802 defer r.taskLock.RUnlock() 803 runners := make([]*TaskRunner, 0, len(r.tasks)) 804 for _, tr := range r.tasks { 805 runners = append(runners, tr) 806 } 807 return runners 808 } 809 810 // LatestAllocStats returns the latest allocation stats. If the optional taskFilter is set 811 // the allocation stats will only include the given task. 
812 func (r *AllocRunner) LatestAllocStats(taskFilter string) (*cstructs.AllocResourceUsage, error) { 813 astat := &cstructs.AllocResourceUsage{ 814 Tasks: make(map[string]*cstructs.TaskResourceUsage), 815 } 816 817 var flat []*cstructs.TaskResourceUsage 818 if taskFilter != "" { 819 r.taskLock.RLock() 820 tr, ok := r.tasks[taskFilter] 821 r.taskLock.RUnlock() 822 if !ok { 823 return nil, fmt.Errorf("allocation %q has no task %q", r.alloc.ID, taskFilter) 824 } 825 l := tr.LatestResourceUsage() 826 if l != nil { 827 astat.Tasks[taskFilter] = l 828 flat = []*cstructs.TaskResourceUsage{l} 829 astat.Timestamp = l.Timestamp 830 } 831 } else { 832 // Get the task runners 833 runners := r.getTaskRunners() 834 for _, tr := range runners { 835 l := tr.LatestResourceUsage() 836 if l != nil { 837 astat.Tasks[tr.task.Name] = l 838 flat = append(flat, l) 839 if l.Timestamp > astat.Timestamp { 840 astat.Timestamp = l.Timestamp 841 } 842 } 843 } 844 } 845 846 astat.ResourceUsage = sumTaskResourceUsage(flat) 847 return astat, nil 848 } 849 850 // sumTaskResourceUsage takes a set of task resources and sums their resources 851 func sumTaskResourceUsage(usages []*cstructs.TaskResourceUsage) *cstructs.ResourceUsage { 852 summed := &cstructs.ResourceUsage{ 853 MemoryStats: &cstructs.MemoryStats{}, 854 CpuStats: &cstructs.CpuStats{}, 855 } 856 for _, usage := range usages { 857 summed.Add(usage.ResourceUsage) 858 } 859 return summed 860 } 861 862 // shouldUpdate takes the AllocModifyIndex of an allocation sent from the server and 863 // checks if the current running allocation is behind and should be updated. 
864 func (r *AllocRunner) shouldUpdate(serverIndex uint64) bool { 865 r.allocLock.Lock() 866 defer r.allocLock.Unlock() 867 return r.alloc.AllocModifyIndex < serverIndex 868 } 869 870 // Destroy is used to indicate that the allocation context should be destroyed 871 func (r *AllocRunner) Destroy() { 872 r.destroyLock.Lock() 873 defer r.destroyLock.Unlock() 874 875 if r.destroy { 876 return 877 } 878 r.destroy = true 879 close(r.destroyCh) 880 } 881 882 // WaitCh returns a channel to wait for termination 883 func (r *AllocRunner) WaitCh() <-chan struct{} { 884 return r.waitCh 885 }