github.com/ryanslade/nomad@v0.2.4-0.20160128061903-fc95782f2089/command/monitor.go

package command

import (
	"fmt"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)

const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server.
	updateWait = time.Second
)

// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status string
	desc   string
	node   string
	job    string
	allocs map[string]*allocState
	wait   time.Duration
	index  uint64
}

// newEvalState creates and initializes a new evalState
func newEvalState() *evalState {
	return &evalState{
		status: structs.EvalStatusPending,
		allocs: make(map[string]*allocState),
	}
}

// allocState is used to track the state of an allocation
type allocState struct {
	id          string
	group       string
	node        string
	desired     string
	desiredDesc string
	client      string
	clientDesc  string
	index       uint64

	// full is the allocation struct with full details. This
	// must be queried for explicitly so it is only included
	// if there is important error information inside.
	full *api.Allocation
}

// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui
	client *api.Client
	state  *evalState

	// length determines the number of characters for identifiers in the ui.
	length int

	sync.Mutex
}

// newMonitor returns a new monitor. The returned monitor will
// write output information to the provided ui. The length parameter determines
// the number of characters for identifiers in the ui.
func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor {
	mon := &monitor{
		ui: &cli.PrefixedUi{
			InfoPrefix:   "==> ",
			OutputPrefix: "    ",
			ErrorPrefix:  "==> ",
			Ui:           ui,
		},
		client: client,
		state:  newEvalState(),
		length: length,
	}
	return mon
}
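
// A minimal usage sketch for newMonitor (illustrative only, not taken from
// this package's call sites). It assumes a basic UI from mitchellh/cli and a
// client built from the api package's default config; the identifier length
// of 8 is an arbitrary choice:
//
//	ui := &cli.BasicUi{Writer: os.Stdout, ErrorWriter: os.Stderr}
//	client, err := api.NewClient(api.DefaultConfig())
//	if err != nil {
//		return 1
//	}
//	mon := newMonitor(ui, client, 8)
//	return mon.monitor(evalID, false)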

// update is used to update our monitor with new state. It can be
// called whether the passed information is new or not, and will
// only dump update messages when state changes.
func (m *monitor) update(update *evalState) {
	m.Lock()
	defer m.Unlock()

	existing := m.state

	// Swap in the new state at the end
	defer func() {
		m.state = update
	}()

	// Check if the evaluation was triggered by a node
	if existing.node == "" && update.node != "" {
		m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q",
			limit(update.node, m.length)))
	}

	// Check if the evaluation was triggered by a job
	if existing.job == "" && update.job != "" {
		m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job))
	}

	// Check the allocations
	for allocID, alloc := range update.allocs {
		if existing, ok := existing.allocs[allocID]; !ok {
			switch {
			case alloc.desired == structs.AllocDesiredStatusFailed:
				// New allocs with desired state failed indicate
				// scheduling failure.
				m.ui.Output(fmt.Sprintf("Scheduling error for group %q (%s)",
					alloc.group, alloc.desiredDesc))

				// Log the client status, if any was provided
				if alloc.clientDesc != "" {
					m.ui.Output("Client reported status: " + alloc.clientDesc)
				}

				// Generate a more descriptive error for why the allocation
				// failed and dump it to the screen
				if alloc.full != nil {
					dumpAllocStatus(m.ui, alloc.full, m.length)
				}

			case alloc.index < update.index:
				// A new alloc with a create index lower than the eval
				// create index indicates a modification
				m.ui.Output(fmt.Sprintf(
					"Allocation %q modified: node %q, group %q",
					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))

			case alloc.desired == structs.AllocDesiredStatusRun:
				// New allocation with desired status running
				m.ui.Output(fmt.Sprintf(
					"Allocation %q created: node %q, group %q",
					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))
			}
		} else {
			switch {
			case existing.client != alloc.client:
				// Allocation status has changed
				m.ui.Output(fmt.Sprintf(
					"Allocation %q status changed: %q -> %q (%s)",
					limit(alloc.id, m.length), existing.client, alloc.client, alloc.clientDesc))
			}
		}
	}

	// Check if the status changed. We skip any transitions to pending status.
	if existing.status != "" &&
		update.status != structs.EvalStatusPending &&
		existing.status != update.status {
		m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q",
			existing.status, update.status))
	}
}
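
// For example, feeding update a new state in which the evaluation has moved
// from "pending" to "complete" and one tracked allocation's client status has
// moved from "pending" to "running" would emit output along these lines
// (identifier and description invented for illustration):
//
//	Allocation "abc12345" status changed: "pending" -> "running" (tasks started)
//	Evaluation status changed: "pending" -> "complete"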

// monitor is used to start monitoring the given evaluation ID. It
// writes output directly to the monitor's ui, and returns the
// exit code for the command. If allowPrefix is false, monitor will only
// accept an exactly matching eval ID.
//
// The return code will be 0 on successful evaluation. If there are
// problems scheduling the job (impossible constraints, resources
// exhausted, etc), then the return code will be 2. For any other
// failures (API connectivity, internal errors, etc), the return code
// will be 1.
func (m *monitor) monitor(evalID string, allowPrefix bool) int {
	// Track if we encounter a scheduling failure. This can only be
	// detected while querying allocations, so we use this bool to
	// carry that status into the return code.
	var schedFailure bool

	// The user may have specified a prefix as the eval ID. We need to look up
	// the full ID from the server first. Since we do this in a loop we need a
	// variable to keep track of whether we've already written the header message.
	var headerWritten bool

	// Add the initial pending state
	m.update(newEvalState())

	for {
		// Query the evaluation
		eval, _, err := m.client.Evaluations().Info(evalID, nil)
		if err != nil {
			if !allowPrefix {
				m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID))
				return 1
			}
			if len(evalID) == 1 {
				m.ui.Error("Identifier must contain at least two characters.")
				return 1
			}
			if len(evalID)%2 == 1 {
				// Identifiers must be of even length, so we strip off the last
				// byte to provide a consistent user experience. For example, a
				// user-supplied prefix of "abcde" is queried as "abcd".
				evalID = evalID[:len(evalID)-1]
			}

			evals, _, err := m.client.Evaluations().PrefixList(evalID)
			if err != nil {
				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
				return 1
			}
			if len(evals) == 0 {
				m.ui.Error(fmt.Sprintf("No evaluation(s) with prefix or id %q found", evalID))
				return 1
			}
			if len(evals) > 1 {
				// Format the evaluations
				out := make([]string, len(evals)+1)
				out[0] = "ID|Priority|Type|Triggered By|Status"
				for i, eval := range evals {
					out[i+1] = fmt.Sprintf("%s|%d|%s|%s|%s",
						limit(eval.ID, m.length),
						eval.Priority,
						eval.Type,
						eval.TriggeredBy,
						eval.Status)
				}
				m.ui.Output(fmt.Sprintf("Prefix matched multiple evaluations\n\n%s", formatList(out)))
				return 0
			}
			// Prefix lookup matched a single evaluation
			eval, _, err = m.client.Evaluations().Info(evals[0].ID, nil)
			if err != nil {
				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
				return 1
			}
		}

		if !headerWritten {
			m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", limit(eval.ID, m.length)))
			headerWritten = true
		}

		// Create the new eval state.
		state := newEvalState()
		state.status = eval.Status
		state.desc = eval.StatusDescription
		state.node = eval.NodeID
		state.job = eval.JobID
		state.wait = eval.Wait
		state.index = eval.CreateIndex

		// Query the allocations associated with the evaluation
		allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil)
		if err != nil {
			m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err))
			return 1
		}

		// Add the allocs to the state
		for _, alloc := range allocs {
			state.allocs[alloc.ID] = &allocState{
				id:          alloc.ID,
				group:       alloc.TaskGroup,
				node:        alloc.NodeID,
				desired:     alloc.DesiredStatus,
				desiredDesc: alloc.DesiredDescription,
				client:      alloc.ClientStatus,
				clientDesc:  alloc.ClientDescription,
				index:       alloc.CreateIndex,
			}

			// If we have a scheduling error, query the full allocation
			// to get the details.
			if alloc.DesiredStatus == structs.AllocDesiredStatusFailed {
				schedFailure = true
				failed, _, err := m.client.Allocations().Info(alloc.ID, nil)
				if err != nil {
					m.ui.Error(fmt.Sprintf("Error querying allocation: %s", err))
					return 1
				}
				state.allocs[alloc.ID].full = failed
			}
		}

		// Update the state
		m.update(state)

		switch eval.Status {
		case structs.EvalStatusComplete, structs.EvalStatusFailed:
			m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
				limit(eval.ID, m.length), eval.Status))
		default:
			// Wait for the next update
			time.Sleep(updateWait)
			continue
		}

		// Monitor the next eval in the chain, if present
		if eval.NextEval != "" {
			m.ui.Info(fmt.Sprintf(
				"Monitoring next evaluation %q in %s",
				eval.NextEval, eval.Wait))

			// Skip some unnecessary polling
			time.Sleep(eval.Wait)

			// Reset the state and monitor the new eval. Preserve any
			// scheduling failure seen so far so the exit code reflects it.
			m.state = newEvalState()
			if code := m.monitor(eval.NextEval, allowPrefix); code != 0 {
				return code
			}
			break
		}
		break
	}

	// Treat scheduling failures specially using a dedicated exit code.
	// This makes it easier to detect failures from the CLI.
	if schedFailure {
		return 2
	}

	return 0
}
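
// A caller would typically pass the return value straight through as the
// command's exit code. A sketch (c.Ui, client, and evalID are assumed to be
// provided by the surrounding command, not defined in this file):
//
//	mon := newMonitor(c.Ui, client, 8)
//	return mon.monitor(evalID, false) // 0: success, 1: error, 2: scheduling failure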

// dumpAllocStatus is a helper to generate a more user-friendly error message
// for scheduling failures, displaying a high-level view of why the
// allocation could not be placed.
func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation, length int) {
	// Print filter stats
	ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)",
		limit(alloc.ID, length), alloc.ClientStatus,
		alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated))

	// Print a helpful message if we have an eligibility problem
	if alloc.Metrics.NodesEvaluated == 0 {
		ui.Output("  * No nodes were eligible for evaluation")
	}

	// Print a helpful message if the user has asked for a DC that has no
	// available nodes.
	for dc, available := range alloc.Metrics.NodesAvailable {
		if available == 0 {
			ui.Output(fmt.Sprintf("  * No nodes are available in datacenter %q", dc))
		}
	}

	// Print filter info
	for class, num := range alloc.Metrics.ClassFiltered {
		ui.Output(fmt.Sprintf("  * Class %q filtered %d nodes", class, num))
	}
	for cs, num := range alloc.Metrics.ConstraintFiltered {
		ui.Output(fmt.Sprintf("  * Constraint %q filtered %d nodes", cs, num))
	}

	// Print exhaustion info
	if ne := alloc.Metrics.NodesExhausted; ne > 0 {
		ui.Output(fmt.Sprintf("  * Resources exhausted on %d nodes", ne))
	}
	for class, num := range alloc.Metrics.ClassExhausted {
		ui.Output(fmt.Sprintf("  * Class %q exhausted on %d nodes", class, num))
	}
	for dim, num := range alloc.Metrics.DimensionExhausted {
		ui.Output(fmt.Sprintf("  * Dimension %q exhausted on %d nodes", dim, num))
	}

	// Print scores
	for name, score := range alloc.Metrics.Scores {
		ui.Output(fmt.Sprintf("  * Score %q = %f", name, score))
	}
}
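
// As a rough illustration, a placement that failed because one datacenter had
// no nodes and a constraint filtered the rest might dump output like the
// following (identifier, constraint, and counts invented):
//
//	Allocation "abc12345" status "failed" (3/3 nodes filtered)
//	  * No nodes are available in datacenter "dc2"
//	  * Constraint "$attr.kernel.name = linux" filtered 3 nodes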