github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/command/monitor.go (about)

     1  package command
     2  
import (
	"bytes"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)
    13  
const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server with requests.
	updateWait = time.Second
)
    20  
// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status string // current evaluation status
	desc   string // human-readable status description
	node   string // ID of the node that triggered the evaluation, if any
	job    string // ID of the job that triggered the evaluation, if any
	allocs map[string]*allocState // allocations keyed by allocation ID
	wait   time.Duration // eval wait duration before scheduling
	index  uint64 // create index of the evaluation
}
    32  
    33  // newEvalState creates and initializes a new monitorState
    34  func newEvalState() *evalState {
    35  	return &evalState{
    36  		status: structs.EvalStatusPending,
    37  		allocs: make(map[string]*allocState),
    38  	}
    39  }
    40  
// allocState is used to track the state of an allocation
type allocState struct {
	id          string // allocation ID
	group       string // task group the allocation belongs to
	node        string // ID of the node the allocation is placed on
	desired     string // desired status of the allocation
	desiredDesc string // description accompanying the desired status
	client      string // client-reported status of the allocation
	clientDesc  string // description accompanying the client status
	index       uint64 // create index of the allocation

	// full is the allocation struct with full details. This
	// must be queried for explicitly so it is only included
	// if there is important error information inside.
	full *api.Allocation
}
    57  
// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui      // destination for all status output
	client *api.Client // API client used to poll evaluations/allocations
	state  *evalState  // last observed state; guarded by the embedded Mutex

	// length determines the number of characters for identifiers in the ui.
	length int

	sync.Mutex
}
    70  
    71  // newMonitor returns a new monitor. The returned monitor will
    72  // write output information to the provided ui. The length parameter determines
    73  // the number of characters for identifiers in the ui.
    74  func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor {
    75  	mon := &monitor{
    76  		ui: &cli.PrefixedUi{
    77  			InfoPrefix:   "==> ",
    78  			OutputPrefix: "    ",
    79  			ErrorPrefix:  "==> ",
    80  			Ui:           ui,
    81  		},
    82  		client: client,
    83  		state:  newEvalState(),
    84  		length: length,
    85  	}
    86  	return mon
    87  }
    88  
    89  // update is used to update our monitor with new state. It can be
    90  // called whether the passed information is new or not, and will
    91  // only dump update messages when state changes.
    92  func (m *monitor) update(update *evalState) {
    93  	m.Lock()
    94  	defer m.Unlock()
    95  
    96  	existing := m.state
    97  
    98  	// Swap in the new state at the end
    99  	defer func() {
   100  		m.state = update
   101  	}()
   102  
   103  	// Check if the evaluation was triggered by a node
   104  	if existing.node == "" && update.node != "" {
   105  		m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q",
   106  			limit(update.node, m.length)))
   107  	}
   108  
   109  	// Check if the evaluation was triggered by a job
   110  	if existing.job == "" && update.job != "" {
   111  		m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job))
   112  	}
   113  
   114  	// Check the allocations
   115  	for allocID, alloc := range update.allocs {
   116  		if existing, ok := existing.allocs[allocID]; !ok {
   117  			switch {
   118  			case alloc.index < update.index:
   119  				// New alloc with create index lower than the eval
   120  				// create index indicates modification
   121  				m.ui.Output(fmt.Sprintf(
   122  					"Allocation %q modified: node %q, group %q",
   123  					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))
   124  
   125  			case alloc.desired == structs.AllocDesiredStatusRun:
   126  				// New allocation with desired status running
   127  				m.ui.Output(fmt.Sprintf(
   128  					"Allocation %q created: node %q, group %q",
   129  					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))
   130  			}
   131  		} else {
   132  			switch {
   133  			case existing.client != alloc.client:
   134  				description := ""
   135  				if alloc.clientDesc != "" {
   136  					description = fmt.Sprintf(" (%s)", alloc.clientDesc)
   137  				}
   138  				// Allocation status has changed
   139  				m.ui.Output(fmt.Sprintf(
   140  					"Allocation %q status changed: %q -> %q%s",
   141  					limit(alloc.id, m.length), existing.client, alloc.client, description))
   142  			}
   143  		}
   144  	}
   145  
   146  	// Check if the status changed. We skip any transitions to pending status.
   147  	if existing.status != "" &&
   148  		update.status != structs.AllocClientStatusPending &&
   149  		existing.status != update.status {
   150  		m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q",
   151  			existing.status, update.status))
   152  	}
   153  }
   154  
   155  // monitor is used to start monitoring the given evaluation ID. It
   156  // writes output directly to the monitor's ui, and returns the
   157  // exit code for the command. If allowPrefix is false, monitor will only accept
   158  // exact matching evalIDs.
   159  //
   160  // The return code will be 0 on successful evaluation. If there are
   161  // problems scheduling the job (impossible constraints, resources
   162  // exhausted, etc), then the return code will be 2. For any other
   163  // failures (API connectivity, internal errors, etc), the return code
   164  // will be 1.
   165  func (m *monitor) monitor(evalID string, allowPrefix bool) int {
   166  	// Track if we encounter a scheduling failure. This can only be
   167  	// detected while querying allocations, so we use this bool to
   168  	// carry that status into the return code.
   169  	var schedFailure bool
   170  
   171  	// The user may have specified a prefix as eval id. We need to lookup the
   172  	// full id from the database first. Since we do this in a loop we need a
   173  	// variable to keep track if we've already written the header message.
   174  	var headerWritten bool
   175  
   176  	// Add the initial pending state
   177  	m.update(newEvalState())
   178  
   179  	for {
   180  		// Query the evaluation
   181  		eval, _, err := m.client.Evaluations().Info(evalID, nil)
   182  		if err != nil {
   183  			if !allowPrefix {
   184  				m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID))
   185  				return 1
   186  			}
   187  			if len(evalID) == 1 {
   188  				m.ui.Error(fmt.Sprintf("Identifier must contain at least two characters."))
   189  				return 1
   190  			}
   191  			if len(evalID)%2 == 1 {
   192  				// Identifiers must be of even length, so we strip off the last byte
   193  				// to provide a consistent user experience.
   194  				evalID = evalID[:len(evalID)-1]
   195  			}
   196  
   197  			evals, _, err := m.client.Evaluations().PrefixList(evalID)
   198  			if err != nil {
   199  				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
   200  				return 1
   201  			}
   202  			if len(evals) == 0 {
   203  				m.ui.Error(fmt.Sprintf("No evaluation(s) with prefix or id %q found", evalID))
   204  				return 1
   205  			}
   206  			if len(evals) > 1 {
   207  				// Format the evaluations
   208  				out := make([]string, len(evals)+1)
   209  				out[0] = "ID|Priority|Type|Triggered By|Status"
   210  				for i, eval := range evals {
   211  					out[i+1] = fmt.Sprintf("%s|%d|%s|%s|%s",
   212  						limit(eval.ID, m.length),
   213  						eval.Priority,
   214  						eval.Type,
   215  						eval.TriggeredBy,
   216  						eval.Status)
   217  				}
   218  				m.ui.Output(fmt.Sprintf("Prefix matched multiple evaluations\n\n%s", formatList(out)))
   219  				return 0
   220  			}
   221  			// Prefix lookup matched a single evaluation
   222  			eval, _, err = m.client.Evaluations().Info(evals[0].ID, nil)
   223  			if err != nil {
   224  				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
   225  			}
   226  		}
   227  
   228  		if !headerWritten {
   229  			m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", limit(eval.ID, m.length)))
   230  			headerWritten = true
   231  		}
   232  
   233  		// Create the new eval state.
   234  		state := newEvalState()
   235  		state.status = eval.Status
   236  		state.desc = eval.StatusDescription
   237  		state.node = eval.NodeID
   238  		state.job = eval.JobID
   239  		state.wait = eval.Wait
   240  		state.index = eval.CreateIndex
   241  
   242  		// Query the allocations associated with the evaluation
   243  		allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil)
   244  		if err != nil {
   245  			m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err))
   246  			return 1
   247  		}
   248  
   249  		// Add the allocs to the state
   250  		for _, alloc := range allocs {
   251  			state.allocs[alloc.ID] = &allocState{
   252  				id:          alloc.ID,
   253  				group:       alloc.TaskGroup,
   254  				node:        alloc.NodeID,
   255  				desired:     alloc.DesiredStatus,
   256  				desiredDesc: alloc.DesiredDescription,
   257  				client:      alloc.ClientStatus,
   258  				clientDesc:  alloc.ClientDescription,
   259  				index:       alloc.CreateIndex,
   260  			}
   261  		}
   262  
   263  		// Update the state
   264  		m.update(state)
   265  
   266  		switch eval.Status {
   267  		case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled:
   268  			if len(eval.FailedTGAllocs) == 0 {
   269  				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
   270  					limit(eval.ID, m.length), eval.Status))
   271  			} else {
   272  				// There were failures making the allocations
   273  				schedFailure = true
   274  				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:",
   275  					limit(eval.ID, m.length), eval.Status))
   276  
   277  				// Print the failures per task group
   278  				for tg, metrics := range eval.FailedTGAllocs {
   279  					noun := "allocation"
   280  					if metrics.CoalescedFailures > 0 {
   281  						noun += "s"
   282  					}
   283  					m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun))
   284  					metrics := formatAllocMetrics(metrics, false, "  ")
   285  					for _, line := range strings.Split(metrics, "\n") {
   286  						m.ui.Output(line)
   287  					}
   288  				}
   289  
   290  				if eval.BlockedEval != "" {
   291  					m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder",
   292  						limit(eval.BlockedEval, m.length)))
   293  				}
   294  			}
   295  		default:
   296  			// Wait for the next update
   297  			time.Sleep(updateWait)
   298  			continue
   299  		}
   300  
   301  		// Monitor the next eval in the chain, if present
   302  		if eval.NextEval != "" {
   303  			if eval.Wait.Nanoseconds() != 0 {
   304  				m.ui.Info(fmt.Sprintf(
   305  					"Monitoring next evaluation %q in %s",
   306  					limit(eval.NextEval, m.length), eval.Wait))
   307  
   308  				// Skip some unnecessary polling
   309  				time.Sleep(eval.Wait)
   310  			}
   311  
   312  			// Reset the state and monitor the new eval
   313  			m.state = newEvalState()
   314  			return m.monitor(eval.NextEval, allowPrefix)
   315  		}
   316  		break
   317  	}
   318  
   319  	// Treat scheduling failures specially using a dedicated exit code.
   320  	// This makes it easier to detect failures from the CLI.
   321  	if schedFailure {
   322  		return 2
   323  	}
   324  
   325  	return 0
   326  }
   327  
   328  // dumpAllocStatus is a helper to generate a more user-friendly error message
   329  // for scheduling failures, displaying a high level status of why the job
   330  // could not be scheduled out.
   331  func dumpAllocStatus(ui cli.Ui, alloc *api.Allocation, length int) {
   332  	// Print filter stats
   333  	ui.Output(fmt.Sprintf("Allocation %q status %q (%d/%d nodes filtered)",
   334  		limit(alloc.ID, length), alloc.ClientStatus,
   335  		alloc.Metrics.NodesFiltered, alloc.Metrics.NodesEvaluated))
   336  	ui.Output(formatAllocMetrics(alloc.Metrics, true, "  "))
   337  }
   338  
   339  func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string {
   340  	// Print a helpful message if we have an eligibility problem
   341  	var out string
   342  	if metrics.NodesEvaluated == 0 {
   343  		out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix)
   344  	}
   345  
   346  	// Print a helpful message if the user has asked for a DC that has no
   347  	// available nodes.
   348  	for dc, available := range metrics.NodesAvailable {
   349  		if available == 0 {
   350  			out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc)
   351  		}
   352  	}
   353  
   354  	// Print filter info
   355  	for class, num := range metrics.ClassFiltered {
   356  		out += fmt.Sprintf("%s* Class %q filtered %d nodes\n", prefix, class, num)
   357  	}
   358  	for cs, num := range metrics.ConstraintFiltered {
   359  		out += fmt.Sprintf("%s* Constraint %q filtered %d nodes\n", prefix, cs, num)
   360  	}
   361  
   362  	// Print exhaustion info
   363  	if ne := metrics.NodesExhausted; ne > 0 {
   364  		out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne)
   365  	}
   366  	for class, num := range metrics.ClassExhausted {
   367  		out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num)
   368  	}
   369  	for dim, num := range metrics.DimensionExhausted {
   370  		out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num)
   371  	}
   372  
   373  	// Print scores
   374  	if scores {
   375  		for name, score := range metrics.Scores {
   376  			out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score)
   377  		}
   378  	}
   379  
   380  	out = strings.TrimSuffix(out, "\n")
   381  	return out
   382  }