package command

import (
	"fmt"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/mitchellh/cli"
)

const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server.
	updateWait = time.Second
)

// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status     string                 // current evaluation status (starts as pending)
	desc       string                 // human-readable status description
	node       string                 // ID of the node that triggered the evaluation, if any
	deployment string                 // ID of the deployment the evaluation is part of, if any
	job        string                 // ID of the job that triggered the evaluation, if any
	allocs     map[string]*allocState // allocation states keyed by allocation ID
	wait       time.Duration          // wait period attached to the evaluation
	index      uint64                 // create index of the evaluation
}

// newEvalState creates and initializes a new evalState with a
// pending status and an empty allocation map.
func newEvalState() *evalState {
	return &evalState{
		status: structs.EvalStatusPending,
		allocs: make(map[string]*allocState),
	}
}

// allocState is used to track the state of an allocation
type allocState struct {
	id          string // allocation ID
	group       string // task group the allocation belongs to
	node        string // ID of the node the allocation was placed on
	desired     string // desired status (e.g. run)
	desiredDesc string // human-readable desired-status description
	client      string // client-reported status
	clientDesc  string // human-readable client-status description
	index       uint64 // create index of the allocation
}

// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui
	client *api.Client
	state  *evalState

	// length determines the number of characters for identifiers in the ui.
	length int

	// Mutex guards state, which is read and swapped by update().
	sync.Mutex
}

// newMonitor returns a new monitor. The returned monitor will
// write output information to the provided ui. The length parameter determines
// the number of characters for identifiers in the ui.
71 func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor { 72 if colorUi, ok := ui.(*cli.ColoredUi); ok { 73 // Disable Info color for monitored output 74 ui = &cli.ColoredUi{ 75 ErrorColor: colorUi.ErrorColor, 76 WarnColor: colorUi.WarnColor, 77 InfoColor: cli.UiColorNone, 78 Ui: colorUi.Ui, 79 } 80 } 81 mon := &monitor{ 82 ui: &cli.PrefixedUi{ 83 InfoPrefix: "==> ", 84 OutputPrefix: " ", 85 ErrorPrefix: "==> ", 86 Ui: ui, 87 }, 88 client: client, 89 state: newEvalState(), 90 length: length, 91 } 92 return mon 93 } 94 95 // update is used to update our monitor with new state. It can be 96 // called whether the passed information is new or not, and will 97 // only dump update messages when state changes. 98 func (m *monitor) update(update *evalState) { 99 m.Lock() 100 defer m.Unlock() 101 102 existing := m.state 103 104 // Swap in the new state at the end 105 defer func() { 106 m.state = update 107 }() 108 109 // Check if the evaluation was triggered by a node 110 if existing.node == "" && update.node != "" { 111 m.ui.Output(fmt.Sprintf("Evaluation triggered by node %q", 112 limit(update.node, m.length))) 113 } 114 115 // Check if the evaluation was triggered by a job 116 if existing.job == "" && update.job != "" { 117 m.ui.Output(fmt.Sprintf("Evaluation triggered by job %q", update.job)) 118 } 119 120 // Check if the evaluation was triggered by a deployment 121 if existing.deployment == "" && update.deployment != "" { 122 m.ui.Output(fmt.Sprintf("Evaluation within deployment: %q", limit(update.deployment, m.length))) 123 } 124 125 // Check the allocations 126 for allocID, alloc := range update.allocs { 127 if existing, ok := existing.allocs[allocID]; !ok { 128 switch { 129 case alloc.index < update.index: 130 // New alloc with create index lower than the eval 131 // create index indicates modification 132 m.ui.Output(fmt.Sprintf( 133 "Allocation %q modified: node %q, group %q", 134 limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group)) 
135 136 case alloc.desired == structs.AllocDesiredStatusRun: 137 // New allocation with desired status running 138 m.ui.Output(fmt.Sprintf( 139 "Allocation %q created: node %q, group %q", 140 limit(alloc.id, m.length), limit(alloc.node, m.length), alloc.group)) 141 } 142 } else { 143 switch { 144 case existing.client != alloc.client: 145 description := "" 146 if alloc.clientDesc != "" { 147 description = fmt.Sprintf(" (%s)", alloc.clientDesc) 148 } 149 // Allocation status has changed 150 m.ui.Output(fmt.Sprintf( 151 "Allocation %q status changed: %q -> %q%s", 152 limit(alloc.id, m.length), existing.client, alloc.client, description)) 153 } 154 } 155 } 156 157 // Check if the status changed. We skip any transitions to pending status. 158 if existing.status != "" && 159 update.status != structs.AllocClientStatusPending && 160 existing.status != update.status { 161 m.ui.Output(fmt.Sprintf("Evaluation status changed: %q -> %q", 162 existing.status, update.status)) 163 } 164 } 165 166 // monitor is used to start monitoring the given evaluation ID. It 167 // writes output directly to the monitor's ui, and returns the 168 // exit code for the command. If allowPrefix is false, monitor will only accept 169 // exact matching evalIDs. 170 // 171 // The return code will be 0 on successful evaluation. If there are 172 // problems scheduling the job (impossible constraints, resources 173 // exhausted, etc), then the return code will be 2. For any other 174 // failures (API connectivity, internal errors, etc), the return code 175 // will be 1. 176 func (m *monitor) monitor(evalID string, allowPrefix bool) int { 177 // Track if we encounter a scheduling failure. This can only be 178 // detected while querying allocations, so we use this bool to 179 // carry that status into the return code. 180 var schedFailure bool 181 182 // The user may have specified a prefix as eval id. We need to lookup the 183 // full id from the database first. 
Since we do this in a loop we need a 184 // variable to keep track if we've already written the header message. 185 var headerWritten bool 186 187 // Add the initial pending state 188 m.update(newEvalState()) 189 190 for { 191 // Query the evaluation 192 eval, _, err := m.client.Evaluations().Info(evalID, nil) 193 if err != nil { 194 if !allowPrefix { 195 m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID)) 196 return 1 197 } 198 if len(evalID) == 1 { 199 m.ui.Error(fmt.Sprintf("Identifier must contain at least two characters.")) 200 return 1 201 } 202 203 evalID = sanitizeUUIDPrefix(evalID) 204 evals, _, err := m.client.Evaluations().PrefixList(evalID) 205 if err != nil { 206 m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err)) 207 return 1 208 } 209 if len(evals) == 0 { 210 m.ui.Error(fmt.Sprintf("No evaluation(s) with prefix or id %q found", evalID)) 211 return 1 212 } 213 if len(evals) > 1 { 214 // Format the evaluations 215 out := make([]string, len(evals)+1) 216 out[0] = "ID|Priority|Type|Triggered By|Status" 217 for i, eval := range evals { 218 out[i+1] = fmt.Sprintf("%s|%d|%s|%s|%s", 219 limit(eval.ID, m.length), 220 eval.Priority, 221 eval.Type, 222 eval.TriggeredBy, 223 eval.Status) 224 } 225 m.ui.Output(fmt.Sprintf("Prefix matched multiple evaluations\n\n%s", formatList(out))) 226 return 0 227 } 228 // Prefix lookup matched a single evaluation 229 eval, _, err = m.client.Evaluations().Info(evals[0].ID, nil) 230 if err != nil { 231 m.ui.Error(fmt.Sprintf("Error reading evaluation: %s", err)) 232 } 233 } 234 235 if !headerWritten { 236 m.ui.Info(fmt.Sprintf("Monitoring evaluation %q", limit(eval.ID, m.length))) 237 headerWritten = true 238 } 239 240 // Create the new eval state. 
241 state := newEvalState() 242 state.status = eval.Status 243 state.desc = eval.StatusDescription 244 state.node = eval.NodeID 245 state.job = eval.JobID 246 state.deployment = eval.DeploymentID 247 state.wait = eval.Wait 248 state.index = eval.CreateIndex 249 250 // Query the allocations associated with the evaluation 251 allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil) 252 if err != nil { 253 m.ui.Error(fmt.Sprintf("Error reading allocations: %s", err)) 254 return 1 255 } 256 257 // Add the allocs to the state 258 for _, alloc := range allocs { 259 state.allocs[alloc.ID] = &allocState{ 260 id: alloc.ID, 261 group: alloc.TaskGroup, 262 node: alloc.NodeID, 263 desired: alloc.DesiredStatus, 264 desiredDesc: alloc.DesiredDescription, 265 client: alloc.ClientStatus, 266 clientDesc: alloc.ClientDescription, 267 index: alloc.CreateIndex, 268 } 269 } 270 271 // Update the state 272 m.update(state) 273 274 switch eval.Status { 275 case structs.EvalStatusComplete, structs.EvalStatusFailed, structs.EvalStatusCancelled: 276 if len(eval.FailedTGAllocs) == 0 { 277 m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q", 278 limit(eval.ID, m.length), eval.Status)) 279 } else { 280 // There were failures making the allocations 281 schedFailure = true 282 m.ui.Info(fmt.Sprintf("Evaluation %q finished with status %q but failed to place all allocations:", 283 limit(eval.ID, m.length), eval.Status)) 284 285 // Print the failures per task group 286 for tg, metrics := range eval.FailedTGAllocs { 287 noun := "allocation" 288 if metrics.CoalescedFailures > 0 { 289 noun += "s" 290 } 291 m.ui.Output(fmt.Sprintf("Task Group %q (failed to place %d %s):", tg, metrics.CoalescedFailures+1, noun)) 292 metrics := formatAllocMetrics(metrics, false, " ") 293 for _, line := range strings.Split(metrics, "\n") { 294 m.ui.Output(line) 295 } 296 } 297 298 if eval.BlockedEval != "" { 299 m.ui.Output(fmt.Sprintf("Evaluation %q waiting for additional capacity to place remainder", 
300 limit(eval.BlockedEval, m.length))) 301 } 302 } 303 default: 304 // Wait for the next update 305 time.Sleep(updateWait) 306 continue 307 } 308 309 // Monitor the next eval in the chain, if present 310 if eval.NextEval != "" { 311 if eval.Wait.Nanoseconds() != 0 { 312 m.ui.Info(fmt.Sprintf( 313 "Monitoring next evaluation %q in %s", 314 limit(eval.NextEval, m.length), eval.Wait)) 315 316 // Skip some unnecessary polling 317 time.Sleep(eval.Wait) 318 } 319 320 // Reset the state and monitor the new eval 321 m.state = newEvalState() 322 return m.monitor(eval.NextEval, allowPrefix) 323 } 324 break 325 } 326 327 // Treat scheduling failures specially using a dedicated exit code. 328 // This makes it easier to detect failures from the CLI. 329 if schedFailure { 330 return 2 331 } 332 333 return 0 334 } 335 336 func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string { 337 // Print a helpful message if we have an eligibility problem 338 var out string 339 if metrics.NodesEvaluated == 0 { 340 out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix) 341 } 342 343 // Print a helpful message if the user has asked for a DC that has no 344 // available nodes. 
345 for dc, available := range metrics.NodesAvailable { 346 if available == 0 { 347 out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc) 348 } 349 } 350 351 // Print filter info 352 for class, num := range metrics.ClassFiltered { 353 out += fmt.Sprintf("%s* Class %q: %d nodes excluded by filter\n", prefix, class, num) 354 } 355 for cs, num := range metrics.ConstraintFiltered { 356 out += fmt.Sprintf("%s* Constraint %q: %d nodes excluded by filter\n", prefix, cs, num) 357 } 358 359 // Print exhaustion info 360 if ne := metrics.NodesExhausted; ne > 0 { 361 out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne) 362 } 363 for class, num := range metrics.ClassExhausted { 364 out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num) 365 } 366 for dim, num := range metrics.DimensionExhausted { 367 out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num) 368 } 369 370 // Print quota info 371 for _, dim := range metrics.QuotaExhausted { 372 out += fmt.Sprintf("%s* Quota limit hit %q\n", prefix, dim) 373 } 374 375 // Print scores 376 if scores { 377 if len(metrics.ScoreMetaData) > 0 { 378 scoreOutput := make([]string, len(metrics.ScoreMetaData)+1) 379 var scorerNames []string 380 for i, scoreMeta := range metrics.ScoreMetaData { 381 // Add header as first row 382 if i == 0 { 383 scoreOutput[0] = "Node|" 384 385 // sort scores alphabetically 386 scores := make([]string, 0, len(scoreMeta.Scores)) 387 for score := range scoreMeta.Scores { 388 scores = append(scores, score) 389 } 390 sort.Strings(scores) 391 392 // build score header output 393 for _, scorerName := range scores { 394 scoreOutput[0] += fmt.Sprintf("%v|", scorerName) 395 scorerNames = append(scorerNames, scorerName) 396 } 397 scoreOutput[0] += "final score" 398 } 399 scoreOutput[i+1] = fmt.Sprintf("%v|", scoreMeta.NodeID) 400 for _, scorerName := range scorerNames { 401 scoreVal := scoreMeta.Scores[scorerName] 402 
scoreOutput[i+1] += fmt.Sprintf("%.3g|", scoreVal) 403 } 404 scoreOutput[i+1] += fmt.Sprintf("%.3g", scoreMeta.NormScore) 405 } 406 out += formatList(scoreOutput) 407 } else { 408 // Backwards compatibility for old allocs 409 for name, score := range metrics.Scores { 410 out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score) 411 } 412 } 413 } 414 415 out = strings.TrimSuffix(out, "\n") 416 return out 417 }