github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/client/alloc_runner.go

github.com/kardianos/nomad@v0.1.3-0.20151022182107-b13df73ee850/client/alloc_runner.go (about)

     1  package client
     2  
     3  import (
     4  	"encoding/json"
     5  	"fmt"
     6  	"log"
     7  	"os"
     8  	"path/filepath"
     9  	"sync"
    10  	"time"
    11  
    12  	"github.com/hashicorp/go-multierror"
    13  	"github.com/hashicorp/nomad/client/allocdir"
    14  	"github.com/hashicorp/nomad/client/config"
    15  	"github.com/hashicorp/nomad/client/driver"
    16  	"github.com/hashicorp/nomad/nomad/structs"
    17  )
    18  
const (
	// allocSyncRetryIntv is the base interval on which we retry updating
	// the status of the allocation after a failed sync. retrySyncState
	// adds a random stagger on top of this value before each retry.
	allocSyncRetryIntv = 15 * time.Second
)
    24  
// taskStatus is used to track the status of a task. AllocRunner keeps one
// entry per task name; the entries are included in the runner snapshot and
// JSON-encoded into the allocation's client description by syncStatus.
type taskStatus struct {
	// Status is the task's status; syncStatus compares it against the
	// structs.AllocClientStatus* constants to derive the alloc status.
	Status string
	// Description is the free-form text reported alongside the status.
	Description string
}
    30  
// AllocStateUpdater is used to update the status of an allocation. The
// AllocRunner calls it from syncStatus with its current allocation; a non-nil
// error causes the sync to be retried by retrySyncState.
type AllocStateUpdater func(alloc *structs.Allocation) error
    33  
// AllocRunner is used to wrap an allocation and provide the execution context.
// It owns one TaskRunner per task, aggregates their statuses into the
// allocation's client status, and reports that status through the updater.
type AllocRunner struct {
	// config supplies StateDir and AllocDir and is handed to each TaskRunner.
	config *config.Config
	// updater is invoked by syncStatus to report the allocation status.
	updater AllocStateUpdater
	logger  *log.Logger

	// alloc is the allocation being run; replaced by setAlloc/RestoreState.
	alloc *structs.Allocation

	// dirtyCh (buffered, size 1) signals dirtySyncState that the status
	// changed and needs to be synced.
	dirtyCh chan struct{}

	// ctx is the execution context shared by the task runners. It is
	// created in Run unless it was already restored from a snapshot.
	ctx *driver.ExecContext
	// tasks maps task name to its runner; Run and SaveState access it
	// under taskLock.
	tasks    map[string]*TaskRunner
	taskLock sync.RWMutex

	// taskStatus maps task name to its last reported status; guarded by
	// taskStatusLock in setTaskStatus, syncStatus and SaveState.
	taskStatus     map[string]taskStatus
	taskStatusLock sync.RWMutex

	// updateCh (buffered, size 8) carries allocation updates from Update
	// to the Run loop.
	updateCh chan *structs.Allocation

	// destroy records that Destroy was called; destroyCh is closed at
	// that point. destroyLock serializes Destroy calls.
	destroy     bool
	destroyCh   chan struct{}
	destroyLock sync.Mutex
	// waitCh is closed when Run returns.
	waitCh chan struct{}
}
    58  
// allocRunnerState is used to snapshot the state of the alloc runner. It is
// the value written by SaveState and read back by RestoreState.
type allocRunnerState struct {
	// Alloc is the allocation the runner was managing.
	Alloc *structs.Allocation
	// TaskStatus holds the last known status of each task, keyed by name.
	TaskStatus map[string]taskStatus
	// Context is the execution context shared by the task runners.
	Context *driver.ExecContext
}
    65  
    66  // NewAllocRunner is used to create a new allocation context
    67  func NewAllocRunner(logger *log.Logger, config *config.Config, updater AllocStateUpdater, alloc *structs.Allocation) *AllocRunner {
    68  	ar := &AllocRunner{
    69  		config:     config,
    70  		updater:    updater,
    71  		logger:     logger,
    72  		alloc:      alloc,
    73  		dirtyCh:    make(chan struct{}, 1),
    74  		tasks:      make(map[string]*TaskRunner),
    75  		taskStatus: make(map[string]taskStatus),
    76  		updateCh:   make(chan *structs.Allocation, 8),
    77  		destroyCh:  make(chan struct{}),
    78  		waitCh:     make(chan struct{}),
    79  	}
    80  	return ar
    81  }
    82  
    83  // stateFilePath returns the path to our state file
    84  func (r *AllocRunner) stateFilePath() string {
    85  	return filepath.Join(r.config.StateDir, "alloc", r.alloc.ID, "state.json")
    86  }
    87  
    88  // RestoreState is used to restore the state of the alloc runner
    89  func (r *AllocRunner) RestoreState() error {
    90  	// Load the snapshot
    91  	var snap allocRunnerState
    92  	if err := restoreState(r.stateFilePath(), &snap); err != nil {
    93  		return err
    94  	}
    95  
    96  	// Restore fields
    97  	r.alloc = snap.Alloc
    98  	r.taskStatus = snap.TaskStatus
    99  	r.ctx = snap.Context
   100  
   101  	// Restore the task runners
   102  	var mErr multierror.Error
   103  	for name := range r.taskStatus {
   104  		task := &structs.Task{Name: name}
   105  		tr := NewTaskRunner(r.logger, r.config, r.setTaskStatus, r.ctx, r.alloc.ID, task)
   106  		r.tasks[name] = tr
   107  		if err := tr.RestoreState(); err != nil {
   108  			r.logger.Printf("[ERR] client: failed to restore state for alloc %s task '%s': %v", r.alloc.ID, name, err)
   109  			mErr.Errors = append(mErr.Errors, err)
   110  		} else {
   111  			go tr.Run()
   112  		}
   113  	}
   114  	return mErr.ErrorOrNil()
   115  }
   116  
   117  // SaveState is used to snapshot our state
   118  func (r *AllocRunner) SaveState() error {
   119  	r.taskStatusLock.RLock()
   120  	snap := allocRunnerState{
   121  		Alloc:      r.alloc,
   122  		TaskStatus: r.taskStatus,
   123  		Context:    r.ctx,
   124  	}
   125  	err := persistState(r.stateFilePath(), &snap)
   126  	r.taskStatusLock.RUnlock()
   127  	if err != nil {
   128  		return err
   129  	}
   130  
   131  	// Save state for each task
   132  	r.taskLock.RLock()
   133  	defer r.taskLock.RUnlock()
   134  	var mErr multierror.Error
   135  	for name, tr := range r.tasks {
   136  		if err := tr.SaveState(); err != nil {
   137  			r.logger.Printf("[ERR] client: failed to save state for alloc %s task '%s': %v",
   138  				r.alloc.ID, name, err)
   139  			mErr.Errors = append(mErr.Errors, err)
   140  		}
   141  	}
   142  	return mErr.ErrorOrNil()
   143  }
   144  
   145  // DestroyState is used to cleanup after ourselves
   146  func (r *AllocRunner) DestroyState() error {
   147  	return os.RemoveAll(filepath.Dir(r.stateFilePath()))
   148  }
   149  
   150  // DestroyContext is used to destroy the context
   151  func (r *AllocRunner) DestroyContext() error {
   152  	return r.ctx.AllocDir.Destroy()
   153  }
   154  
   155  // Alloc returns the associated allocation
   156  func (r *AllocRunner) Alloc() *structs.Allocation {
   157  	return r.alloc
   158  }
   159  
   160  // setAlloc is used to update the allocation of the runner
   161  // we preserve the existing client status and description
   162  func (r *AllocRunner) setAlloc(alloc *structs.Allocation) {
   163  	if r.alloc != nil {
   164  		alloc.ClientStatus = r.alloc.ClientStatus
   165  		alloc.ClientDescription = r.alloc.ClientDescription
   166  	}
   167  	r.alloc = alloc
   168  }
   169  
   170  // dirtySyncState is used to watch for state being marked dirty to sync
   171  func (r *AllocRunner) dirtySyncState() {
   172  	for {
   173  		select {
   174  		case <-r.dirtyCh:
   175  			r.retrySyncState(r.destroyCh)
   176  		case <-r.destroyCh:
   177  			return
   178  		}
   179  	}
   180  }
   181  
   182  // retrySyncState is used to retry the state sync until success
   183  func (r *AllocRunner) retrySyncState(stopCh chan struct{}) {
   184  	for {
   185  		err := r.syncStatus()
   186  		if err == nil {
   187  			return
   188  		}
   189  		select {
   190  		case <-time.After(allocSyncRetryIntv + randomStagger(allocSyncRetryIntv)):
   191  		case <-stopCh:
   192  			return
   193  		}
   194  	}
   195  }
   196  
   197  // syncStatus is used to run and sync the status when it changes
   198  func (r *AllocRunner) syncStatus() error {
   199  	// Scan the task status to termine the status of the alloc
   200  	var pending, running, dead, failed bool
   201  	r.taskStatusLock.RLock()
   202  	pending = len(r.taskStatus) < len(r.tasks)
   203  	for _, status := range r.taskStatus {
   204  		switch status.Status {
   205  		case structs.AllocClientStatusRunning:
   206  			running = true
   207  		case structs.AllocClientStatusDead:
   208  			dead = true
   209  		case structs.AllocClientStatusFailed:
   210  			failed = true
   211  		}
   212  	}
   213  	if len(r.taskStatus) > 0 {
   214  		taskDesc, _ := json.Marshal(r.taskStatus)
   215  		r.alloc.ClientDescription = string(taskDesc)
   216  	}
   217  	r.taskStatusLock.RUnlock()
   218  
   219  	// Determine the alloc status
   220  	if failed {
   221  		r.alloc.ClientStatus = structs.AllocClientStatusFailed
   222  	} else if running {
   223  		r.alloc.ClientStatus = structs.AllocClientStatusRunning
   224  	} else if dead && !pending {
   225  		r.alloc.ClientStatus = structs.AllocClientStatusDead
   226  	}
   227  
   228  	// Attempt to update the status
   229  	if err := r.updater(r.alloc); err != nil {
   230  		r.logger.Printf("[ERR] client: failed to update alloc '%s' status to %s: %s",
   231  			r.alloc.ID, r.alloc.ClientStatus, err)
   232  		return err
   233  	}
   234  	return nil
   235  }
   236  
   237  // setStatus is used to update the allocation status
   238  func (r *AllocRunner) setStatus(status, desc string) {
   239  	r.alloc.ClientStatus = status
   240  	r.alloc.ClientDescription = desc
   241  	select {
   242  	case r.dirtyCh <- struct{}{}:
   243  	default:
   244  	}
   245  }
   246  
   247  // setTaskStatus is used to set the status of a task
   248  func (r *AllocRunner) setTaskStatus(taskName, status, desc string) {
   249  	r.taskStatusLock.Lock()
   250  	r.taskStatus[taskName] = taskStatus{
   251  		Status:      status,
   252  		Description: desc,
   253  	}
   254  	r.taskStatusLock.Unlock()
   255  	select {
   256  	case r.dirtyCh <- struct{}{}:
   257  	default:
   258  	}
   259  }
   260  
// Run is a long running goroutine used to manage an allocation. It starts a
// task runner for every task of the allocation's task group that was not
// already restored, then applies updates received on updateCh. On a terminal
// update or on Destroy it destroys the task runners, waits for them to exit,
// performs a final status sync and, if the runner was destroyed, removes its
// context and persisted state. waitCh is closed when Run returns, including
// on the early-return paths.
func (r *AllocRunner) Run() {
	defer close(r.waitCh)
	// Background loop that syncs the status whenever setStatus or
	// setTaskStatus mark the runner dirty. It only exits once destroyCh is
	// closed, so it outlives the early returns below.
	go r.dirtySyncState()

	// Check if the allocation is in a terminal status
	alloc := r.alloc
	if alloc.TerminalStatus() {
		r.logger.Printf("[DEBUG] client: aborting runner for alloc '%s', terminal status", r.alloc.ID)
		return
	}
	r.logger.Printf("[DEBUG] client: starting runner for alloc '%s'", r.alloc.ID)

	// Find the task group to run in the allocation
	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		r.logger.Printf("[ERR] client: alloc '%s' for missing task group '%s'", alloc.ID, alloc.TaskGroup)
		r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("missing task group '%s'", alloc.TaskGroup))
		return
	}

	// Create the execution context, unless one was restored from a snapshot
	if r.ctx == nil {
		allocDir := allocdir.NewAllocDir(filepath.Join(r.config.AllocDir, r.alloc.ID))
		if err := allocDir.Build(tg.Tasks); err != nil {
			r.logger.Printf("[WARN] client: failed to build task directories: %v", err)
			r.setStatus(structs.AllocClientStatusFailed, fmt.Sprintf("failed to build task dirs for '%s'", alloc.TaskGroup))
			return
		}
		r.ctx = driver.NewExecContext(allocDir)
	}

	// Start the task runners
	r.taskLock.Lock()
	for _, task := range tg.Tasks {
		// Skip tasks that were restored
		if _, ok := r.tasks[task.Name]; ok {
			continue
		}

		// Merge in the task resources
		task.Resources = alloc.TaskResources[task.Name]

		tr := NewTaskRunner(r.logger, r.config, r.setTaskStatus, r.ctx, r.alloc.ID, task)
		r.tasks[task.Name] = tr
		go tr.Run()
	}
	r.taskLock.Unlock()

OUTER:
	// Wait for updates until a terminal update arrives or we are destroyed
	for {
		select {
		case update := <-r.updateCh:
			// Check if we're in a terminal status. The terminal update
			// becomes the runner's allocation (client status and
			// description are preserved by setAlloc).
			if update.TerminalStatus() {
				r.setAlloc(update)
				break OUTER
			}

			// Update the task groups. Note that non-terminal updates only
			// refresh task resources; r.alloc itself is not replaced.
			r.taskLock.RLock()
			for _, task := range tg.Tasks {
				// NOTE(review): assumes every task in tg.Tasks has a
				// runner in r.tasks (populated above) — confirm.
				tr := r.tasks[task.Name]

				// Merge in the task resources
				task.Resources = update.TaskResources[task.Name]
				tr.Update(task)
			}
			r.taskLock.RUnlock()

		case <-r.destroyCh:
			break OUTER
		}
	}

	// Destroy each sub-task. The read lock is released by the defer, so it
	// stays held for the rest of Run, including the final sync below.
	r.taskLock.RLock()
	defer r.taskLock.RUnlock()
	for _, tr := range r.tasks {
		tr.Destroy()
	}

	// Wait for termination of the task runners
	for _, tr := range r.tasks {
		<-tr.WaitCh()
	}

	// Final state sync. With a nil stop channel this retries until the
	// sync succeeds.
	r.retrySyncState(nil)

	// Check if we should destroy our state
	if r.destroy {
		if err := r.DestroyContext(); err != nil {
			r.logger.Printf("[ERR] client: failed to destroy context for alloc '%s': %v",
				r.alloc.ID, err)
		}
		if err := r.DestroyState(); err != nil {
			r.logger.Printf("[ERR] client: failed to destroy state for alloc '%s': %v",
				r.alloc.ID, err)
		}
	}
	r.logger.Printf("[DEBUG] client: terminating runner for alloc '%s'", r.alloc.ID)
}
   365  
   366  // Update is used to update the allocation of the context
   367  func (r *AllocRunner) Update(update *structs.Allocation) {
   368  	select {
   369  	case r.updateCh <- update:
   370  	default:
   371  		r.logger.Printf("[ERR] client: dropping update to alloc '%s'", update.ID)
   372  	}
   373  }
   374  
   375  // Destroy is used to indicate that the allocation context should be destroyed
   376  func (r *AllocRunner) Destroy() {
   377  	r.destroyLock.Lock()
   378  	defer r.destroyLock.Unlock()
   379  
   380  	if r.destroy {
   381  		return
   382  	}
   383  	r.destroy = true
   384  	close(r.destroyCh)
   385  }
   386  
   387  // WaitCh returns a channel to wait for termination
   388  func (r *AllocRunner) WaitCh() <-chan struct{} {
   389  	return r.waitCh
   390  }