github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/command/monitor.go

     1  package command
     2  
     3  import (
     4  	"fmt"
     5  	"strings"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/hashicorp/nomad/api"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  	"github.com/mitchellh/cli"
    12  )
    13  
    14  const (
    15  	// updateWait is the amount of time to wait between status
    16  	// updates. Because the monitor is poll-based, we use this
    17  	// delay to avoid overwhelming the API server.
    18  	updateWait = time.Second
    19  )
    20  
    21  // evalState is used to store the current "state of the world"
    22  // in the context of monitoring an evaluation.
    23  type evalState struct {
    24  	status     string
    25  	desc       string
    26  	node       string
    27  	deployment string
    28  	job        string
    29  	allocs     map[string]*allocState
    30  	wait       time.Duration
    31  	index      uint64
    32  }
    33  
     34  // newEvalState creates and initializes a new evalState
    35  func newEvalState() *evalState {
    36  	return &evalState{
    37  		status: structs.EvalStatusPending,
    38  		allocs: make(map[string]*allocState),
    39  	}
    40  }
    41  
    42  // allocState is used to track the state of an allocation
    43  type allocState struct {
    44  	id          string
    45  	group       string
    46  	node        string
    47  	desired     string
    48  	desiredDesc string
    49  	client      string
    50  	clientDesc  string
    51  	index       uint64
    52  
    53  	// full is the allocation struct with full details. It must
    54  	// be queried for explicitly, so it is only populated when
    55  	// there is important error information to report.
    56  	full *api.Allocation
    57  }
    58  
    59  // monitor wraps an evaluation monitor and holds metadata and
    60  // state information.
    61  type monitor struct {
    62  	ui     cli.Ui
    63  	client *api.Client
    64  	state  *evalState
    65  
    66  	// length determines the number of characters for identifiers in the ui.
    67  	length int
    68  
    69  	sync.Mutex
    70  }
    71  
    72  // newMonitor returns a new monitor. The returned monitor will
    73  // write output information to the provided ui. The length parameter determines
    74  // the number of characters for identifiers in the ui.
    75  func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor {
    76  	mon := &monitor{
    77  		ui: &cli.PrefixedUi{
    78  			InfoPrefix:   "==> ",
    79  			OutputPrefix: "    ",
    80  			ErrorPrefix:  "==> ",
    81  			Ui:           ui,
    82  		},
    83  		client: client,
    84  		state:  newEvalState(),
    85  		length: length,
    86  	}
    87  	return mon
    88  }
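
// Example (illustrative sketch, not part of the upstream file): how a CLI
// command holding an api.Client and a cli.Ui might drive the monitor after
// submitting a job. The function name and the 8-character identifier length
// are assumptions made for this sketch, not values defined by this package.
func exampleRunMonitor(ui cli.Ui, client *api.Client, evalID string) int {
	// Truncate identifiers to 8 characters in the output, roughly what the
	// nomad CLI shows when verbose output is not requested.
	const idLength = 8

	// Build a monitor that writes to the provided ui, then poll the
	// evaluation until it reaches a terminal state. allowPrefix is false
	// because a full evaluation ID is assumed to be available here.
	mon := newMonitor(ui, client, idLength)
	return mon.monitor(evalID, false)
}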
    89  
    90  // update is used to update our monitor with new state. It can be
    91  // called whether the passed information is new or not, and will
    92  // only dump update messages when state changes.
    93  func (m *monitor) update(update *evalState) {
    94  	m.Lock()
    95  	defer m.Unlock()
    96  
    97  	existing := m.state
    98  
    99  	// Swap in the new state at the end
   100  	defer func() {
   101  		m.state = update
   102  	}()
   103  
   104  	// Check if the evaluation was triggered by a node
   105  	if existing.node == "" && update.node != "" {
   106  		m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q",
   107  			limit(update.node, m.length)))
   108  	}
   109  
   110  	// Check if the evaluation was triggered by a job
   111  	if existing.job == "" && update.job != "" {
   112  		m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job))
   113  	}
   114  
   115  	// Check if the evaluation was triggered by a deployment
   116  	if existing.deployment == "" && update.deployment != "" {
   117  		m.ui.Output(fmt.Sprintf("Evaluation within deployment: %q", limit(update.deployment, m.length)))
   118  	}
   119  
   120  	// Check the allocations
   121  	for allocID, alloc := range update.allocs {
   122  		if existing, ok := existing.allocs[allocID]; !ok {
   123  			switch {
   124  			case alloc.index < update.index:
   125  				// An alloc with a create index lower than the eval's
   126  				// create index existed before this eval, i.e. it was modified
   127  				m.ui.Output(fmt.Sprintf(
   128  					"Allocation %q modified: node %q, group %q",
   129  					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))
   130  
   131  			case alloc.desired == structs.AllocDesiredStatusRun:
   132  				// New allocation with desired status running
   133  				m.ui.Output(fmt.Sprintf(
   134  					"Allocation %q created: node %q, group %q",
   135  					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))
   136  			}
   137  		} else {
   138  			switch {
   139  			case existing.client != alloc.client:
   140  				description := ""
   141  				if alloc.clientDesc != "" {
   142  					description = fmt.Sprintf(" (%s)", alloc.clientDesc)
   143  				}
   144  				// Allocation status has changed
   145  				m.ui.Output(fmt.Sprintf(
   146  					"Allocation %q status changed: %q -> %q%s",
   147  					limit(alloc.id, m.length), existing.client, alloc.client, description))
   148  			}
   149  		}
   150  	}
   151  
   152  	// Check if the status changed. We skip any transitions to pending status.
   153  	if existing.status != "" &&
   154  		update.status != structs.AllocClientStatusPending &&
   155  		existing.status != update.status {
   156  		m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q",
   157  			existing.status, update.status))
   158  	}
   159  }
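
// Example (illustrative sketch, not part of the upstream file): update only
// emits messages on state transitions, so feeding the same state twice
// prints the status change once and then stays silent. The function name and
// identifier length are assumptions made for this sketch.
func exampleUpdateDedup(ui cli.Ui, client *api.Client) {
	mon := newMonitor(ui, client, 8)

	// The monitor starts in the pending state (see newEvalState), so the
	// first update to "complete" is reported as a transition.
	done := newEvalState()
	done.status = structs.EvalStatusComplete

	mon.update(done) // prints: Evaluation status changed: "pending" -> "complete"
	mon.update(done) // prints nothing; the state did not change
}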
   160  
   161  // monitor is used to start monitoring the given evaluation ID. It
   162  // writes output directly to the monitor's ui, and returns the
   163  // exit code for the command. If allowPrefix is false, monitor will only
   164  // accept an exact evalID match.
   165  //
   166  // The return code will be 0 on successful evaluation. If there are
   167  // problems scheduling the job (impossible constraints, resources
   168  // exhausted, etc), then the return code will be 2. For any other
   169  // failures (API connectivity, internal errors, etc), the return code
   170  // will be 1.
   171  func (m *monitor) monitor(evalID string, allowPrefix bool) int {
   172  	// Track if we encounter a scheduling failure. This is only known
   173  	// once the evaluation reaches a terminal state, so we use this
   174  	// bool to carry that status into the return code.
   175  	var schedFailure bool
   176  
   177  	// The user may have specified a prefix as the eval id. We need to look up
   178  	// the full id from the database first. Since we do this in a loop, we need a
   179  	// variable to keep track of whether we've already written the header message.
   180  	var headerWritten bool
   181  
   182  	// Add the initial pending state
   183  	m.update(newEvalState())
   184  
   185  	for {
   186  		// Query the evaluation
   187  		eval, _, err := m.client.Evaluations().Info(evalID, nil)
   188  		if err != nil {
   189  			if !allowPrefix {
   190  				m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID))
   191  				return 1
   192  			}
   193  			if len(evalID) == 1 {
   194  				m.ui.Error("Identifier must contain at least two characters.")
   195  				return 1
   196  			}
   197  			if len(evalID)%2 == 1 {
   198  				// Identifiers must be of even length, so we strip off the last byte
   199  				// to provide a consistent user experience.
   200  				evalID = evalID[:len(evalID)-1]
   201  			}
   202  
   203  			evals, _, err := m.client.Evaluations().PrefixList(evalID)
   204  			if err != nil {
   205  				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
   206  				return 1
   207  			}
   208  			if len(evals) == 0 {
   209  				m.ui.Error(fmt.Sprintf("No evaluation(s) with prefix or id %q found", evalID))
   210  				return 1
   211  			}
   212  			if len(evals) > 1 {
   213  				// Format the evaluations
   214  				out := make([]string, len(evals)+1)
   215  				out[0] = "ID|Priority|Type|Triggered By|Status"
   216  				for i, eval := range evals {
   217  					out[i+1] = fmt.Sprintf("%s|%d|%s|%s|%s",
   218  						limit(eval.ID, m.length),
   219  						eval.Priority,
   220  						eval.Type,
   221  						eval.TriggeredBy,
   222  						eval.Status)
   223  				}
   224  				m.ui.Output(fmt.Sprintf("Prefix matched multiple evaluations\n\n%s", formatList(out)))
   225  				return 0
   226  			}
   227  			// Prefix lookup matched a single evaluation
   228  			if eval, _, err = m.client.Evaluations().Info(evals[0].ID, nil); err != nil {
   229  				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
   230  				return 1
   231  			}
   232  		}
   233  
   234  		if !headerWritten {
   235  			m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", limit(eval.ID, m.length)))
   236  			headerWritten = true
   237  		}
   238  
   239  		// Create the new eval state.
   240  		state := newEvalState()
   241  		state.status = eval.Status
   242  		state.desc = eval.StatusDescription
   243  		state.node = eval.NodeID
   244  		state.job = eval.JobID
   245  		state.deployment = eval.DeploymentID
   246  		state.wait = eval.Wait
   247  		state.index = eval.CreateIndex
   248  
   249  		// Query the allocations associated with the evaluation
   250  		allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil)
   251  		if err != nil {
   252  			m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err))
   253  			return 1
   254  		}
   255  
   256  		// Add the allocs to the state
   257  		for _, alloc := range allocs {
   258  			state.allocs[alloc.ID] = &allocState{
   259  				id:          alloc.ID,
   260  				group:       alloc.TaskGroup,
   261  				node:        alloc.NodeID,
   262  				desired:     alloc.DesiredStatus,
   263  				desiredDesc: alloc.DesiredDescription,
   264  				client:      alloc.ClientStatus,
   265  				clientDesc:  alloc.ClientDescription,
   266  				index:       alloc.CreateIndex,
   267  			}
   268  		}
   269  
   270  		// Update the state
   271  		m.update(state)
   272  
   273  		switch eval.Status {
   274  		case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled:
   275  			if len(eval.FailedTGAllocs) == 0 {
   276  				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
   277  					limit(eval.ID, m.length), eval.Status))
   278  			} else {
   279  				// There were failures making the allocations
   280  				schedFailure = true
   281  				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:",
   282  					limit(eval.ID, m.length), eval.Status))
   283  
   284  				// Print the failures per task group
   285  				for tg, metrics := range eval.FailedTGAllocs {
   286  					noun := "allocation"
   287  					if metrics.CoalescedFailures > 0 {
   288  						noun += "s"
   289  					}
   290  					m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun))
   291  					metrics := formatAllocMetrics(metrics, false, "  ")
   292  					for _, line := range strings.Split(metrics, "\n") {
   293  						m.ui.Output(line)
   294  					}
   295  				}
   296  
   297  				if eval.BlockedEval != "" {
   298  					m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder",
   299  						limit(eval.BlockedEval, m.length)))
   300  				}
   301  			}
   302  		default:
   303  			// Wait for the next update
   304  			time.Sleep(updateWait)
   305  			continue
   306  		}
   307  
   308  		// Monitor the next eval in the chain, if present
   309  		if eval.NextEval != "" {
   310  			if eval.Wait.Nanoseconds() != 0 {
   311  				m.ui.Info(fmt.Sprintf(
   312  					"Monitoring next evaluation %q in %s",
   313  					limit(eval.NextEval, m.length), eval.Wait))
   314  
   315  				// Skip some unnecessary polling
   316  				time.Sleep(eval.Wait)
   317  			}
   318  
   319  			// Reset the state and monitor the new eval
   320  			m.state = newEvalState()
   321  			return m.monitor(eval.NextEval, allowPrefix)
   322  		}
   323  		break
   324  	}
   325  
   326  	// Treat scheduling failures specially using a dedicated exit code.
   327  	// This makes it easier to detect failures from the CLI.
   328  	if schedFailure {
   329  		return 2
   330  	}
   331  
   332  	return 0
   333  }
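
// Example (illustrative sketch, not part of the upstream file): one way a
// caller might act on monitor's documented return codes (0 success,
// 2 scheduling failure, 1 any other error). The function name and messages
// are assumptions made for this sketch.
func exampleHandleMonitorExit(ui cli.Ui, code int) int {
	switch code {
	case 0:
		ui.Output("Scheduling completed successfully")
	case 2:
		ui.Error("Scheduling failed: some allocations could not be placed")
	default:
		ui.Error("Monitoring failed due to an API or internal error")
	}
	return code
}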
   334  
   335  // dumpAllocStatus is a helper to generate a more user-friendly error message
   336  // for scheduling failures, displaying a high-level view of why the job
   337  // could not be scheduled.
   338  func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation, length int) {
   339  	// Print filter stats
   340  	ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)",
   341  		limit(alloc.ID, length), alloc.ClientStatus,
   342  		alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated))
   343  	ui.Output(formatAllocMetrics(alloc.Metrics, true, "  "))
   344  }
   345  
   346  func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string {
   347  	// Print a helpful message if we have an eligibility problem
   348  	var out string
   349  	if metrics.NodesEvaluated == 0 {
   350  		out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix)
   351  	}
   352  
   353  	// Print a helpful message if the user has asked for a DC that has no
   354  	// available nodes.
   355  	for dc, available := range metrics.NodesAvailable {
   356  		if available == 0 {
   357  			out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc)
   358  		}
   359  	}
   360  
   361  	// Print filter info
   362  	for class, num := range metrics.ClassFiltered {
   363  		out += fmt.Sprintf("%s* Class %q filtered %d nodes\n", prefix, class, num)
   364  	}
   365  	for cs, num := range metrics.ConstraintFiltered {
   366  		out += fmt.Sprintf("%s* Constraint %q filtered %d nodes\n", prefix, cs, num)
   367  	}
   368  
   369  	// Print exhaustion info
   370  	if ne := metrics.NodesExhausted; ne > 0 {
   371  		out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne)
   372  	}
   373  	for class, num := range metrics.ClassExhausted {
   374  		out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num)
   375  	}
   376  	for dim, num := range metrics.DimensionExhausted {
   377  		out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num)
   378  	}
   379  
   380  	// Print scores
   381  	if scores {
   382  		for name, score := range metrics.Scores {
   383  			out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score)
   384  		}
   385  	}
   386  
   387  	out = strings.TrimSuffix(out, "\n")
   388  	return out
   389  }
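
// Example (illustrative sketch, not part of the upstream file): the kind of
// output formatAllocMetrics produces for a hand-built metrics value. The
// function name, datacenter, constraint, and counts are made up for
// illustration.
func exampleFormatAllocMetrics(ui cli.Ui) {
	metrics := &api.AllocationMetric{
		NodesEvaluated:     3,
		NodesFiltered:      3,
		NodesAvailable:     map[string]int{"dc2": 0},
		ConstraintFiltered: map[string]int{"${attr.kernel.name} = linux": 3},
	}

	// With scores disabled and a two-space prefix this prints lines such as:
	//   * No nodes are available in datacenter "dc2"
	//   * Constraint "${attr.kernel.name} = linux" filtered 3 nodes
	for _, line := range strings.Split(formatAllocMetrics(metrics, false, "  "), "\n") {
		ui.Output(line)
	}
}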