github.com/hernad/nomad@v1.6.112/command/monitor.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package command
     5  
     6  import (
     7  	"fmt"
     8  	"sort"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/hernad/nomad/api"
    14  	"github.com/mitchellh/cli"
    15  )
    16  
const (
	// updateWait is the amount of time to wait between status
	// updates. Because the monitor is poll-based, we use this
	// delay to avoid overwhelming the API server with Info and
	// Allocations queries.
	updateWait = time.Second
)
    23  
// evalState is used to store the current "state of the world"
// in the context of monitoring an evaluation.
type evalState struct {
	status     string                 // evaluation status (api.Evaluation.Status)
	desc       string                 // status description (api.Evaluation.StatusDescription)
	node       string                 // triggering node ID, if any (api.Evaluation.NodeID)
	deployment string                 // associated deployment ID (api.Evaluation.DeploymentID)
	job        string                 // triggering job ID, if any (api.Evaluation.JobID)
	allocs     map[string]*allocState // allocations keyed by allocation ID
	wait       time.Duration          // eval wait duration (api.Evaluation.Wait)
	index      uint64                 // eval create index (api.Evaluation.CreateIndex)
}
    36  
    37  // newEvalState creates and initializes a new monitorState
    38  func newEvalState() *evalState {
    39  	return &evalState{
    40  		status: api.EvalStatusPending,
    41  		allocs: make(map[string]*allocState),
    42  	}
    43  }
    44  
// allocState is used to track the state of an allocation
type allocState struct {
	id          string // allocation ID (api.Allocation.ID)
	group       string // task group name (api.Allocation.TaskGroup)
	node        string // node the alloc is placed on (api.Allocation.NodeID)
	desired     string // desired status (api.Allocation.DesiredStatus)
	desiredDesc string // desired status description (api.Allocation.DesiredDescription)
	client      string // client status (api.Allocation.ClientStatus)
	clientDesc  string // client status description (api.Allocation.ClientDescription)
	index       uint64 // alloc create index (api.Allocation.CreateIndex)
}
    56  
// monitor wraps an evaluation monitor and holds metadata and
// state information.
type monitor struct {
	ui     cli.Ui      // destination for all status output
	client *api.Client // API client used to poll evals and allocations
	state  *evalState  // last observed state; guarded by the embedded Mutex

	// length determines the number of characters for identifiers in the ui.
	length int

	// Guards state during update(); methods take the lock via m.Lock().
	sync.Mutex
}
    69  
    70  // newMonitor returns a new monitor. The returned monitor will
    71  // write output information to the provided ui. The length parameter determines
    72  // the number of characters for identifiers in the ui.
    73  func newMonitor(ui cli.Ui, client *api.Client, length int) *monitor {
    74  	if colorUi, ok := ui.(*cli.ColoredUi); ok {
    75  		// Disable Info color for monitored output
    76  		ui = &cli.ColoredUi{
    77  			ErrorColor: colorUi.ErrorColor,
    78  			WarnColor:  colorUi.WarnColor,
    79  			InfoColor:  cli.UiColorNone,
    80  			Ui:         colorUi.Ui,
    81  		}
    82  	}
    83  	mon := &monitor{
    84  		ui: &cli.PrefixedUi{
    85  			InfoPrefix:   "==> ",
    86  			OutputPrefix: "    ",
    87  			ErrorPrefix:  "==> ",
    88  			Ui:           ui,
    89  		},
    90  		client: client,
    91  		state:  newEvalState(),
    92  		length: length,
    93  	}
    94  	return mon
    95  }
    96  
    97  // update is used to update our monitor with new state. It can be
    98  // called whether the passed information is new or not, and will
    99  // only dump update messages when state changes.
   100  func (m *monitor) update(update *evalState) {
   101  	m.Lock()
   102  	defer m.Unlock()
   103  
   104  	existing := m.state
   105  
   106  	// Swap in the new state at the end
   107  	defer func() {
   108  		m.state = update
   109  	}()
   110  
   111  	// Check if the evaluation was triggered by a node
   112  	if existing.node == "" && update.node != "" {
   113  		m.ui.Output(fmt.Sprintf("%s: Evaluation triggered by node %q",
   114  			formatTime(time.Now()), limit(update.node, m.length)))
   115  	}
   116  
   117  	// Check if the evaluation was triggered by a job
   118  	if existing.job == "" && update.job != "" {
   119  		m.ui.Output(fmt.Sprintf("%s: Evaluation triggered by job %q",
   120  			formatTime(time.Now()), update.job))
   121  	}
   122  
   123  	// Check if the evaluation was triggered by a deployment
   124  	if existing.deployment == "" && update.deployment != "" {
   125  		m.ui.Output(fmt.Sprintf("%s: Evaluation within deployment: %q",
   126  			formatTime(time.Now()), limit(update.deployment, m.length)))
   127  	}
   128  
   129  	// Check the allocations
   130  	for allocID, alloc := range update.allocs {
   131  		if existing, ok := existing.allocs[allocID]; !ok {
   132  			switch {
   133  			case alloc.index < update.index:
   134  				// New alloc with create index lower than the eval
   135  				// create index indicates modification
   136  				m.ui.Output(fmt.Sprintf(
   137  					"%s: Allocation %q modified: node %q, group %q",
   138  					formatTime(time.Now()), limit(alloc.id, m.length),
   139  					limit(alloc.node, m.length), alloc.group))
   140  
   141  			case alloc.desired == api.AllocDesiredStatusRun:
   142  				// New allocation with desired status running
   143  				m.ui.Output(fmt.Sprintf(
   144  					"%s: Allocation %q created: node %q, group %q",
   145  					formatTime(time.Now()), limit(alloc.id, m.length),
   146  					limit(alloc.node, m.length), alloc.group))
   147  			}
   148  		} else {
   149  			switch {
   150  			case existing.client != alloc.client:
   151  				description := ""
   152  				if alloc.clientDesc != "" {
   153  					description = fmt.Sprintf(" (%s)", alloc.clientDesc)
   154  				}
   155  				// Allocation status has changed
   156  				m.ui.Output(fmt.Sprintf(
   157  					"%s: Allocation %q status changed: %q -> %q%s",
   158  					formatTime(time.Now()), limit(alloc.id, m.length),
   159  					existing.client, alloc.client, description))
   160  			}
   161  		}
   162  	}
   163  
   164  	// Check if the status changed. We skip any transitions to pending status.
   165  	if existing.status != "" &&
   166  		update.status != api.AllocClientStatusPending &&
   167  		existing.status != update.status {
   168  		m.ui.Output(fmt.Sprintf("%s: Evaluation status changed: %q -> %q",
   169  			formatTime(time.Now()), existing.status, update.status))
   170  	}
   171  }
   172  
// monitor is used to start monitoring the given evaluation ID. It
// writes output directly to the monitor's ui, and returns the
// exit code for the command.
//
// The return code will be 0 on successful evaluation. If there are
// problems scheduling the job (impossible constraints, resources
// exhausted, etc), then the return code will be 2. For any other
// failures (API connectivity, internal errors, etc), the return code
// will be 1.
func (m *monitor) monitor(evalID string) int {
	// Track if we encounter a scheduling failure. This can only be
	// detected while querying allocations, so we use this bool to
	// carry that status into the return code.
	var schedFailure bool

	// Add the initial pending state
	m.update(newEvalState())

	m.ui.Info(fmt.Sprintf("%s: Monitoring evaluation %q",
		formatTime(time.Now()), limit(evalID, m.length)))

	// Poll loop: re-query the eval and its allocations every updateWait
	// until the eval reaches a terminal status.
	for {
		// Query the evaluation
		eval, _, err := m.client.Evaluations().Info(evalID, nil)
		if err != nil {
			// NOTE(review): any API error (connectivity, auth, ...) is
			// reported as "not found" here, which can be misleading.
			m.ui.Error(fmt.Sprintf("No evaluation with id %q found", evalID))
			return 1
		}

		// Create the new eval state.
		state := newEvalState()
		state.status = eval.Status
		state.desc = eval.StatusDescription
		state.node = eval.NodeID
		state.job = eval.JobID
		state.deployment = eval.DeploymentID
		state.wait = eval.Wait
		state.index = eval.CreateIndex

		// Query the allocations associated with the evaluation
		allocs, _, err := m.client.Evaluations().Allocations(eval.ID, nil)
		if err != nil {
			m.ui.Error(fmt.Sprintf("%s: Error reading allocations: %s", formatTime(time.Now()), err))
			return 1
		}

		// Add the allocs to the state
		for _, alloc := range allocs {
			state.allocs[alloc.ID] = &allocState{
				id:          alloc.ID,
				group:       alloc.TaskGroup,
				node:        alloc.NodeID,
				desired:     alloc.DesiredStatus,
				desiredDesc: alloc.DesiredDescription,
				client:      alloc.ClientStatus,
				clientDesc:  alloc.ClientDescription,
				index:       alloc.CreateIndex,
			}
		}

		// Update the state
		m.update(state)

		switch eval.Status {
		case api.EvalStatusComplete, api.EvalStatusFailed, api.EvalStatusCancelled:
			if len(eval.FailedTGAllocs) == 0 {
				m.ui.Info(fmt.Sprintf("%s: Evaluation %q finished with status %q",
					formatTime(time.Now()), limit(eval.ID, m.length), eval.Status))
			} else {
				// There were failures making the allocations
				schedFailure = true
				m.ui.Info(fmt.Sprintf("%s: Evaluation %q finished with status %q but failed to place all allocations:",
					formatTime(time.Now()), limit(eval.ID, m.length), eval.Status))

				// Print the failures per task group
				for tg, metrics := range eval.FailedTGAllocs {
					noun := "allocation"
					if metrics.CoalescedFailures > 0 {
						noun += "s"
					}
					// CoalescedFailures counts the *additional* failures
					// folded into this metric, hence the +1.
					m.ui.Output(fmt.Sprintf("%s: Task Group %q (failed to place %d %s):",
						formatTime(time.Now()), tg, metrics.CoalescedFailures+1, noun))
					metrics := formatAllocMetrics(metrics, false, "  ")
					for _, line := range strings.Split(metrics, "\n") {
						m.ui.Output(line)
					}
				}

				if eval.BlockedEval != "" {
					m.ui.Output(fmt.Sprintf("%s: Evaluation %q waiting for additional capacity to place remainder",
						formatTime(time.Now()), limit(eval.BlockedEval, m.length)))
				}
			}
		default:
			// Wait for the next update
			time.Sleep(updateWait)
			continue
		}

		// Monitor the next eval in the chain, if present
		if eval.NextEval != "" {
			if eval.Wait.Nanoseconds() != 0 {
				m.ui.Info(fmt.Sprintf(
					"%s: Monitoring next evaluation %q in %s",
					formatTime(time.Now()), limit(eval.NextEval, m.length), eval.Wait))

				// Skip some unnecessary polling
				time.Sleep(eval.Wait)
			}

			// Reset the state and monitor the new eval
			// NOTE(review): this recursive call discards schedFailure from
			// the current eval — a placement failure here followed by a
			// fully successful next eval exits 0, not 2. Confirm intended.
			m.state = newEvalState()
			return m.monitor(eval.NextEval)
		}
		break
	}

	// Monitor the deployment if it exists
	dID := m.state.deployment
	if dID != "" {
		m.ui.Info(fmt.Sprintf("%s: Monitoring deployment %q", formatTime(time.Now()), limit(dID, m.length)))

		// Full-length identifiers imply the user asked for verbose output.
		var verbose bool
		if m.length == fullId {
			verbose = true
		} else {
			verbose = false
		}

		// Delegate deployment monitoring to the deployment status command.
		meta := new(Meta)
		meta.Ui = m.ui
		cmd := &DeploymentStatusCommand{Meta: *meta}
		status, err := cmd.monitor(m.client, dID, 0, m.state.wait, verbose)
		if err != nil || status != api.DeploymentStatusSuccessful {
			return 1
		}
	}

	// Treat scheduling failures specially using a dedicated exit code.
	// This makes it easier to detect failures from the CLI.
	if schedFailure {
		return 2
	}

	return 0
}
   319  
   320  func formatAllocMetrics(metrics *api.AllocationMetric, scores bool, prefix string) string {
   321  	// Print a helpful message if we have an eligibility problem
   322  	var out string
   323  	if metrics.NodesEvaluated == 0 {
   324  		out += fmt.Sprintf("%s* No nodes were eligible for evaluation\n", prefix)
   325  	}
   326  
   327  	// Print a helpful message if the user has asked for a DC that has no
   328  	// available nodes.
   329  	for dc, available := range metrics.NodesAvailable {
   330  		if available == 0 {
   331  			out += fmt.Sprintf("%s* No nodes are available in datacenter %q\n", prefix, dc)
   332  		}
   333  	}
   334  
   335  	// Print filter info
   336  	for class, num := range metrics.ClassFiltered {
   337  		out += fmt.Sprintf("%s* Class %q: %d nodes excluded by filter\n", prefix, class, num)
   338  	}
   339  	for cs, num := range metrics.ConstraintFiltered {
   340  		out += fmt.Sprintf("%s* Constraint %q: %d nodes excluded by filter\n", prefix, cs, num)
   341  	}
   342  
   343  	// Print exhaustion info
   344  	if ne := metrics.NodesExhausted; ne > 0 {
   345  		out += fmt.Sprintf("%s* Resources exhausted on %d nodes\n", prefix, ne)
   346  	}
   347  	for class, num := range metrics.ClassExhausted {
   348  		out += fmt.Sprintf("%s* Class %q exhausted on %d nodes\n", prefix, class, num)
   349  	}
   350  	for dim, num := range metrics.DimensionExhausted {
   351  		out += fmt.Sprintf("%s* Dimension %q exhausted on %d nodes\n", prefix, dim, num)
   352  	}
   353  
   354  	// Print quota info
   355  	for _, dim := range metrics.QuotaExhausted {
   356  		out += fmt.Sprintf("%s* Quota limit hit %q\n", prefix, dim)
   357  	}
   358  
   359  	// Print scores
   360  	if scores {
   361  		if len(metrics.ScoreMetaData) > 0 {
   362  			scoreOutput := make([]string, len(metrics.ScoreMetaData)+1)
   363  
   364  			// Find all possible scores and build header row.
   365  			allScores := make(map[string]struct{})
   366  			for _, scoreMeta := range metrics.ScoreMetaData {
   367  				for score := range scoreMeta.Scores {
   368  					allScores[score] = struct{}{}
   369  				}
   370  			}
   371  			// Sort scores alphabetically.
   372  			scores := make([]string, 0, len(allScores))
   373  			for score := range allScores {
   374  				scores = append(scores, score)
   375  			}
   376  			sort.Strings(scores)
   377  			scoreOutput[0] = fmt.Sprintf("Node|%s|final score", strings.Join(scores, "|"))
   378  
   379  			// Build row for each score.
   380  			for i, scoreMeta := range metrics.ScoreMetaData {
   381  				scoreOutput[i+1] = fmt.Sprintf("%v|", scoreMeta.NodeID)
   382  				for _, scorerName := range scores {
   383  					scoreVal := scoreMeta.Scores[scorerName]
   384  					scoreOutput[i+1] += fmt.Sprintf("%.3g|", scoreVal)
   385  				}
   386  				scoreOutput[i+1] += fmt.Sprintf("%.3g", scoreMeta.NormScore)
   387  			}
   388  
   389  			out += formatList(scoreOutput)
   390  		} else {
   391  			// Backwards compatibility for old allocs
   392  			for name, score := range metrics.Scores {
   393  				out += fmt.Sprintf("%s* Score %q = %f\n", prefix, name, score)
   394  			}
   395  		}
   396  	}
   397  
   398  	out = strings.TrimSuffix(out, "\n")
   399  	return out
   400  }