package command

import (
	"fmt"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)

const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server.
	updateWait = time.Second
)

// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status string                 // evaluation status (e.g. pending/complete/failed)
	desc   string                 // human-readable status description
	node   string                 // ID of the node that triggered the evaluation, if any
	job    string                 // ID of the job that triggered the evaluation, if any
	allocs map[string]*allocState // allocations placed by the evaluation, keyed by alloc ID
	wait   time.Duration          // delay before a chained follow-up evaluation runs
	index  uint64                 // create index of the evaluation
}

// newEvalState creates and initializes a new monitorState.
// The status starts as pending and the alloc map is pre-made so
// callers may insert into it without a nil check.
func newEvalState() *evalState {
	return &evalState{
		status: structs.EvalStatusPending,
		allocs: make(map[string]*allocState),
	}
}

// allocState is used to track the state of an allocation
type allocState struct {
	id          string // allocation ID
	group       string // task group the allocation belongs to
	node        string // node the allocation was placed on
	desired     string // desired status set by the scheduler
	desiredDesc string // description accompanying the desired status
	client      string // status reported by the client
	clientDesc  string // description accompanying the client status
	index       uint64 // create index of the allocation

	// full is the allocation struct with full details. This
	// must be queried for explicitly so it is only included
	// if there is important error information inside.
	full *api.Allocation
}

// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui
	client *api.Client
	state  *evalState

	// Guards state; update() takes this lock before reading or
	// swapping the current evalState.
	sync.Mutex
}

// newMonitor returns a new monitor. The returned monitor will
// write output information to the provided ui.
69 func newMonitor(ui cli.Ui, client *api.Client) *monitor { 70 mon := &monitor{ 71 ui: &cli.PrefixedUi{ 72 InfoPrefix: "==> ", 73 OutputPrefix: " ", 74 ErrorPrefix: "==> ", 75 Ui: ui, 76 }, 77 client: client, 78 state: newEvalState(), 79 } 80 return mon 81 } 82 83 // update is used to update our monitor with new state. It can be 84 // called whether the passed information is new or not, and will 85 // only dump update messages when state changes. 86 func (m *monitor) update(update *evalState) { 87 m.Lock() 88 defer m.Unlock() 89 90 existing := m.state 91 92 // Swap in the new state at the end 93 defer func() { 94 m.state = update 95 }() 96 97 // Check if the evaluation was triggered by a node 98 if existing.node == "" && update.node != "" { 99 m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q", 100 update.node)) 101 } 102 103 // Check if the evaluation was triggered by a job 104 if existing.job == "" && update.job != "" { 105 m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job)) 106 } 107 108 // Check the allocations 109 for allocID, alloc := range update.allocs { 110 if existing, ok := existing.allocs[allocID]; !ok { 111 switch { 112 case alloc.desired == structs.AllocDesiredStatusFailed: 113 // New allocs with desired state failed indicate 114 // scheduling failure. 
115 m.ui.Output(fmt.Sprintf("Scheduling error for group %q (%s)", 116 alloc.group, alloc.desiredDesc)) 117 118 // Log the client status, if any provided 119 if alloc.clientDesc != "" { 120 m.ui.Output("Client reported status: " + alloc.clientDesc) 121 } 122 123 // Generate a more descriptive error for why the allocation 124 // failed and dump it to the screen 125 if alloc.full != nil { 126 dumpAllocStatus(m.ui, alloc.full) 127 } 128 129 case alloc.index < update.index: 130 // New alloc with create index lower than the eval 131 // create index indicates modification 132 m.ui.Output(fmt.Sprintf( 133 "Allocation %q modified: node %q, group %q", 134 alloc.id, alloc.node, alloc.group)) 135 136 case alloc.desired == structs.AllocDesiredStatusRun: 137 // New allocation with desired status running 138 m.ui.Output(fmt.Sprintf( 139 "Allocation %q created: node %q, group %q", 140 alloc.id, alloc.node, alloc.group)) 141 } 142 } else { 143 switch { 144 case existing.client != alloc.client: 145 // Allocation status has changed 146 m.ui.Output(fmt.Sprintf( 147 "Allocation %q status changed: %q -> %q (%s)", 148 alloc.id, existing.client, alloc.client, alloc.clientDesc)) 149 } 150 } 151 } 152 153 // Check if the status changed. We skip any transitions to pending status. 154 if existing.status != "" && 155 update.status != structs.AllocClientStatusPending && 156 existing.status != update.status { 157 m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q", 158 existing.status, update.status)) 159 } 160 } 161 162 // monitor is used to start monitoring the given evaluation ID. It 163 // writes output directly to the monitor's ui, and returns the 164 // exit code for the command. 165 // 166 // The return code will be 0 on successful evaluation. If there are 167 // problems scheduling the job (impossible constraints, resources 168 // exhausted, etc), then the return code will be 2. 
For any other 169 // failures (API connectivity, internal errors, etc), the return code 170 // will be 1. 171 func (m *monitor) monitor(evalID string) int { 172 // Track if we encounter a scheduling failure. This can only be 173 // detected while querying allocations, so we use this bool to 174 // carry that status into the return code. 175 var schedFailure bool 176 177 // Add the initial pending state 178 m.update(newEvalState()) 179 180 m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", evalID)) 181 for { 182 // Query the evaluation 183 eval, _, err := m.client.Evaluations().Info(evalID, nil) 184 if err != nil { 185 m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err)) 186 return 1 187 } 188 189 // Create the new eval state. 190 state := newEvalState() 191 state.status = eval.Status 192 state.desc = eval.StatusDescription 193 state.node = eval.NodeID 194 state.job = eval.JobID 195 state.wait = eval.Wait 196 state.index = eval.CreateIndex 197 198 // Query the allocations associated with the evaluation 199 allocs, _, err := m.client.Evaluations().Allocations(evalID, nil) 200 if err != nil { 201 m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err)) 202 return 1 203 } 204 205 // Add the allocs to the state 206 for _, alloc := range allocs { 207 state.allocs[alloc.ID] = &allocState{ 208 id: alloc.ID, 209 group: alloc.TaskGroup, 210 node: alloc.NodeID, 211 desired: alloc.DesiredStatus, 212 desiredDesc: alloc.DesiredDescription, 213 client: alloc.ClientStatus, 214 clientDesc: alloc.ClientDescription, 215 index: alloc.CreateIndex, 216 } 217 218 // If we have a scheduling error, query the full allocation 219 // to get the details. 
220 if alloc.DesiredStatus == structs.AllocDesiredStatusFailed { 221 schedFailure = true 222 failed, _, err := m.client.Allocations().Info(alloc.ID, nil) 223 if err != nil { 224 m.ui.Error(fmt.Sprintf("Error querying allocation: %s", err)) 225 return 1 226 } 227 state.allocs[alloc.ID].full = failed 228 } 229 } 230 231 // Update the state 232 m.update(state) 233 234 switch eval.Status { 235 case structs.EvalStatusComplete, structs.EvalStatusFailed: 236 m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q", 237 eval.ID, eval.Status)) 238 default: 239 // Wait for the next update 240 time.Sleep(updateWait) 241 continue 242 } 243 244 // Monitor the next eval in the chain, if present 245 if eval.NextEval != "" { 246 m.ui.Info(fmt.Sprintf( 247 "Monitoring next evaluation %q in %s", 248 eval.NextEval, eval.Wait)) 249 250 // Skip some unnecessary polling 251 time.Sleep(eval.Wait) 252 253 // Reset the state and monitor the new eval 254 m.state = newEvalState() 255 return m.monitor(eval.NextEval) 256 } 257 break 258 } 259 260 // Treat scheduling failures specially using a dedicated exit code. 261 // This makes it easier to detect failures from the CLI. 262 if schedFailure { 263 return 2 264 } 265 266 return 0 267 } 268 269 // dumpAllocStatus is a helper to generate a more user-friendly error message 270 // for scheduling failures, displaying a high level status of why the job 271 // could not be scheduled out. 
272 func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation) { 273 // Print filter stats 274 ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)", 275 alloc.ID, alloc.ClientStatus, 276 alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated)) 277 278 // Print a helpful message if we have an eligibility problem 279 if alloc.Metrics.NodesEvaluated == 0 { 280 ui.Output(" * No nodes were eligible for evaluation") 281 } 282 283 // Print filter info 284 for class, num := range alloc.Metrics.ClassFiltered { 285 ui.Output(fmt.Sprintf(" * Class %q filtered %d nodes", class, num)) 286 } 287 for cs, num := range alloc.Metrics.ConstraintFiltered { 288 ui.Output(fmt.Sprintf(" * Constraint %q filtered %d nodes", cs, num)) 289 } 290 291 // Print exhaustion info 292 if ne := alloc.Metrics.NodesExhausted; ne > 0 { 293 ui.Output(fmt.Sprintf(" * Resources exhausted on %d nodes", ne)) 294 } 295 for class, num := range alloc.Metrics.ClassExhausted { 296 ui.Output(fmt.Sprintf(" * Class %q exhausted on %d nodes", class, num)) 297 } 298 for dim, num := range alloc.Metrics.DimensionExhausted { 299 ui.Output(fmt.Sprintf(" * Dimension %q exhausted on %d nodes", dim, num)) 300 } 301 302 // Print scores 303 for name, score := range alloc.Metrics.Scores { 304 ui.Output(fmt.Sprintf(" * Score %q = %f", name, score)) 305 } 306 }