github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/command/monitor.go

package command

import (
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)

const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server.
	updateWait = time.Second
)

// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status     string
	desc       string
	node       string
	deployment string
	job        string
	allocs     map[string]*allocState
	wait       time.Duration
	index      uint64
}

// newEvalState creates and initializes a new evalState
func newEvalState() *evalState {
	return &evalState{
		status: structs.EvalStatusPending,
		allocs: make(map[string]*allocState),
	}
}

// allocState is used to track the state of an allocation
type allocState struct {
	id          string
	group       string
	node        string
	desired     string
	desiredDesc string
	client      string
	clientDesc  string
	index       uint64
}

// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui
	client *api.Client
	state  *evalState

	// length determines the number of characters for identifiers in the ui.
	length int

	sync.Mutex
}

// newMonitor returns a new monitor. The returned monitor will
// write output information to the provided ui. The length parameter determines
// the number of characters for identifiers in the ui.
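//
// A minimal usage sketch (hypothetical caller: ui, client, and evalID are
// assumed to exist in the calling command, and 8 is an assumed
// short-identifier length):
//
//	mon := newMonitor(ui, client, 8)
//	return mon.monitor(evalID, false)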
func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor {
	mon := &monitor{
		ui: &cli.PrefixedUi{
			InfoPrefix:   "==> ",
			OutputPrefix: "    ",
			ErrorPrefix:  "==> ",
			Ui:           ui,
		},
		client: client,
		state:  newEvalState(),
		length: length,
	}
	return mon
}

// update is used to update our monitor with new state. It can be
// called whether the passed information is new or not, and will
// only dump update messages when state changes.
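//
// For example (illustrative values, derived from the format strings below),
// an allocation whose client status moves from "pending" to "running"
// produces:
//
//	Allocation "abc12345" status changed: "pending" -> "running"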
func (m *monitor) update(update *evalState) {
	m.Lock()
	defer m.Unlock()

	existing := m.state

	// Swap in the new state at the end
	defer func() {
		m.state = update
	}()

	// Check if the evaluation was triggered by a node
	if existing.node == "" && update.node != "" {
		m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q",
			limit(update.node, m.length)))
	}

	// Check if the evaluation was triggered by a job
	if existing.job == "" && update.job != "" {
		m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job))
	}

	// Check if the evaluation was triggered by a deployment
	if existing.deployment == "" && update.deployment != "" {
		m.ui.Output(fmt.Sprintf("Evaluation within deployment: %q", limit(update.deployment, m.length)))
	}

	// Check the allocations
	for allocID, alloc := range update.allocs {
		if existingAlloc, ok := existing.allocs[allocID]; !ok {
			switch {
			case alloc.index < update.index:
				// New alloc with create index lower than the eval
				// create index indicates modification
				m.ui.Output(fmt.Sprintf(
					"Allocation %q modified: node %q, group %q",
					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))

			case alloc.desired == structs.AllocDesiredStatusRun:
				// New allocation with desired status running
				m.ui.Output(fmt.Sprintf(
					"Allocation %q created: node %q, group %q",
					limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group))
			}
		} else {
			switch {
			case existingAlloc.client != alloc.client:
				description := ""
				if alloc.clientDesc != "" {
					description = fmt.Sprintf(" (%s)", alloc.clientDesc)
				}
				// Allocation status has changed
				m.ui.Output(fmt.Sprintf(
					"Allocation %q status changed: %q -> %q%s",
					limit(alloc.id, m.length), existingAlloc.client, alloc.client, description))
			}
		}
	}

	// Check if the status changed. We skip any transitions to pending status.
	if existing.status != "" &&
		update.status != structs.EvalStatusPending &&
		existing.status != update.status {
		m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q",
			existing.status, update.status))
	}
}

// monitor is used to start monitoring the given evaluation ID. It
// writes output directly to the monitor's ui, and returns the
// exit code for the command. If allowPrefix is false, monitor will only accept
// exact matching evalIDs.
//
// The return code will be 0 on successful evaluation. If there are
// problems scheduling the job (impossible constraints, resources
// exhausted, etc), then the return code will be 2. For any other
// failures (API connectivity, internal errors, etc), the return code
// will be 1.
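//
// A caller can branch on the code to distinguish scheduling failures from
// other errors (illustrative sketch; mon and evalID are assumed):
//
//	switch code := mon.monitor(evalID, false); code {
//	case 0: // evaluation completed and all allocations placed
//	case 2: // scheduling failure (impossible constraints, exhausted resources)
//	default: // API connectivity or internal error
//	}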
func (m *monitor) monitor(evalID string, allowPrefix bool) int {
	// Track if we encounter a scheduling failure. This can only be
	// detected while querying allocations, so we use this bool to
	// carry that status into the return code.
	var schedFailure bool

	// The user may have specified a prefix as eval id. We need to look up
	// the full id from the database first. Since we do this in a loop we
	// need a variable to keep track of whether we've already written the
	// header message.
	var headerWritten bool

	// Add the initial pending state
	m.update(newEvalState())

	for {
		// Query the evaluation
		eval, _, err := m.client.Evaluations().Info(evalID, nil)
		if err != nil {
			if !allowPrefix {
				m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID))
				return 1
			}
			if len(evalID) == 1 {
				m.ui.Error("Identifier must contain at least two characters.")
				return 1
			}

			evalID = sanatizeUUIDPrefix(evalID)
			evals, _, err := m.client.Evaluations().PrefixList(evalID)
			if err != nil {
				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
				return 1
			}
			if len(evals) == 0 {
				m.ui.Error(fmt.Sprintf("No evaluation(s) with prefix or id %q found", evalID))
				return 1
			}
			if len(evals) > 1 {
				// Format the evaluations
				out := make([]string, len(evals)+1)
				out[0] = "ID|Priority|Type|Triggered By|Status"
				for i, eval := range evals {
					out[i+1] = fmt.Sprintf("%s|%d|%s|%s|%s",
						limit(eval.ID, m.length),
						eval.Priority,
						eval.Type,
						eval.TriggeredBy,
						eval.Status)
				}
				m.ui.Output(fmt.Sprintf("Prefix matched multiple evaluations\n\n%s", formatList(out)))
				return 0
			}
			// Prefix lookup matched a single evaluation
			eval, _, err = m.client.Evaluations().Info(evals[0].ID, nil)
			if err != nil {
				m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err))
				// Bail out here; continuing would dereference a nil eval below.
				return 1
			}
		}

		if !headerWritten {
			m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", limit(eval.ID, m.length)))
			headerWritten = true
		}

		// Create the new eval state.
		state := newEvalState()
		state.status = eval.Status
		state.desc = eval.StatusDescription
		state.node = eval.NodeID
		state.job = eval.JobID
		state.deployment = eval.DeploymentID
		state.wait = eval.Wait
		state.index = eval.CreateIndex

		// Query the allocations associated with the evaluation
		allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil)
		if err != nil {
			m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err))
			return 1
		}

		// Add the allocs to the state
		for _, alloc := range allocs {
			state.allocs[alloc.ID] = &allocState{
				id:          alloc.ID,
				group:       alloc.TaskGroup,
				node:        alloc.NodeID,
				desired:     alloc.DesiredStatus,
				desiredDesc: alloc.DesiredDescription,
				client:      alloc.ClientStatus,
				clientDesc:  alloc.ClientDescription,
				index:       alloc.CreateIndex,
			}
		}

		// Update the state
		m.update(state)

		switch eval.Status {
		case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled:
			if len(eval.FailedTGAllocs) == 0 {
				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q",
					limit(eval.ID, m.length), eval.Status))
			} else {
				// There were failures making the allocations
				schedFailure = true
				m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:",
					limit(eval.ID, m.length), eval.Status))

				// Print the failures per task group
				for tg, metrics := range eval.FailedTGAllocs {
					noun := "allocation"
					if metrics.CoalescedFailures > 0 {
						noun += "s"
					}
					m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun))
					formatted := formatAllocMetrics(metrics, false, "  ")
					for _, line := range strings.Split(formatted, "\n") {
						m.ui.Output(line)
					}
				}

				if eval.BlockedEval != "" {
					m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder",
						limit(eval.BlockedEval, m.length)))
				}
			}
		default:
			// Wait for the next update
			time.Sleep(updateWait)
			continue
		}

		// Monitor the next eval in the chain, if present
		if eval.NextEval != "" {
			if eval.Wait.Nanoseconds() != 0 {
				m.ui.Info(fmt.Sprintf(
					"Monitoring next evaluation %q in %s",
					limit(eval.NextEval, m.length), eval.Wait))

				// Skip some unnecessary polling
				time.Sleep(eval.Wait)
			}

			// Reset the state and monitor the new eval
			m.state = newEvalState()
			return m.monitor(eval.NextEval, allowPrefix)
		}
		break
	}

	// Treat scheduling failures specially using a dedicated exit code.
	// This makes it easier to detect failures from the CLI.
	if schedFailure {
		return 2
	}

	return 0
}

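// formatAllocMetrics renders placement metrics as a human-readable, bulleted
// summary, one finding per line, each indented by prefix. Illustrative output
// with prefix "  " (values hypothetical, derived from the format strings
// below):
//
//	  * Constraint "arch = amd64" filtered 3 nodes
//	  * Resources exhausted on 1 nodes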
func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string {
	// Print a helpful message if we have an eligibility problem
	var out string
	if metrics.NodesEvaluated == 0 {
		out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix)
	}

	// Print a helpful message if the user has asked for a DC that has no
	// available nodes.
	for dc, available := range metrics.NodesAvailable {
		if available == 0 {
			out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc)
		}
	}

	// Print filter info
	for class, num := range metrics.ClassFiltered {
		out += fmt.Sprintf("%s* Class %q filtered %d nodes\n", prefix, class, num)
	}
	for cs, num := range metrics.ConstraintFiltered {
		out += fmt.Sprintf("%s* Constraint %q filtered %d nodes\n", prefix, cs, num)
	}

	// Print exhaustion info
	if ne := metrics.NodesExhausted; ne > 0 {
		out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne)
	}
	for class, num := range metrics.ClassExhausted {
		out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num)
	}
	for dim, num := range metrics.DimensionExhausted {
		out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num)
	}

	// Print quota info
	for _, dim := range metrics.QuotaExhausted {
		out += fmt.Sprintf("%s* Quota limit hit %q\n", prefix, dim)
	}

	// Print scores
	if scores {
		for name, score := range metrics.Scores {
			out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score)
		}
	}

	out = strings.TrimSuffix(out, "\n")
	return out
}