github.com/jrxfive/nomad@v0.6.1-0.20170802162750-1fef470e89bf/command/check.go (about)

     1  package command
     2  
     3  import (
     4  	"fmt"
     5  	"strconv"
     6  	"strings"
     7  	"time"
     8  )
     9  
    10  const (
    11  	HealthCritical = 2
    12  	HealthWarn     = 1
    13  	HealthPass     = 0
    14  	HealthUnknown  = 3
    15  )
    16  
// AgentCheckCommand implements the "nomad check" command, which reports the
// health of the agent through its exit code. It embeds Meta; Run relies on
// Meta for flag-set construction, API client creation and Ui output.
type AgentCheckCommand struct {
	Meta
}
    20  
    21  func (c *AgentCheckCommand) Help() string {
    22  	helpText := `
    23  Usage: nomad check [options]
    24  
    25    Display state of the Nomad agent. The exit code of the command is Nagios
    26    compatible and could be used with alerting systems.
    27  
    28  General Options:
    29  
    30    ` + generalOptionsUsage() + `
    31  
    32  Agent Check Options:
    33  
    34    -min-peers
    35       Minimum number of peers that a server is expected to know.
    36  
    37    -min-servers
    38       Minumum number of servers that a client is expected to know.
    39  `
    40  
    41  	return strings.TrimSpace(helpText)
    42  }
    43  
    44  func (c *AgentCheckCommand) Synopsis() string {
    45  	return "Displays health of the local Nomad agent"
    46  }
    47  
    48  func (c *AgentCheckCommand) Run(args []string) int {
    49  	var minPeers, minServers int
    50  
    51  	flags := c.Meta.FlagSet("check", FlagSetClient)
    52  	flags.Usage = func() { c.Ui.Output(c.Help()) }
    53  	flags.IntVar(&minPeers, "min-peers", 0, "")
    54  	flags.IntVar(&minServers, "min-servers", 1, "")
    55  
    56  	if err := flags.Parse(args); err != nil {
    57  		return 1
    58  	}
    59  
    60  	client, err := c.Meta.Client()
    61  	if err != nil {
    62  		c.Ui.Error(fmt.Sprintf("error initializing client: %s", err))
    63  		return HealthCritical
    64  	}
    65  
    66  	info, err := client.Agent().Self()
    67  	if err != nil {
    68  		c.Ui.Output(fmt.Sprintf("unable to query agent info: %v", err))
    69  		return HealthCritical
    70  	}
    71  	if _, ok := info.Stats["nomad"]; ok {
    72  		return c.checkServerHealth(info.Stats, minPeers)
    73  	}
    74  
    75  	if clientStats, ok := info.Stats["client"]; ok {
    76  		return c.checkClientHealth(clientStats, minServers)
    77  	}
    78  	return HealthWarn
    79  }
    80  
    81  // checkServerHealth returns the health of a server.
    82  // TODO Add more rules for determining server health
    83  func (c *AgentCheckCommand) checkServerHealth(info map[string]map[string]string, minPeers int) int {
    84  	raft := info["raft"]
    85  	knownPeers, err := strconv.Atoi(raft["num_peers"])
    86  	if err != nil {
    87  		c.Ui.Output(fmt.Sprintf("unable to get known peers: %v", err))
    88  		return HealthCritical
    89  	}
    90  
    91  	if knownPeers < minPeers {
    92  		c.Ui.Output(fmt.Sprintf("known peers: %v, is less than expected number of peers: %v", knownPeers, minPeers))
    93  		return HealthCritical
    94  	}
    95  	return HealthPass
    96  }
    97  
    98  // checkClientHealth returns the health of a client
    99  func (c *AgentCheckCommand) checkClientHealth(clientStats map[string]string, minServers int) int {
   100  	knownServers, err := strconv.Atoi(clientStats["known_servers"])
   101  	if err != nil {
   102  		c.Ui.Output(fmt.Sprintf("unable to get known servers: %v", err))
   103  		return HealthCritical
   104  	}
   105  
   106  	heartbeatTTL, err := time.ParseDuration(clientStats["heartbeat_ttl"])
   107  	if err != nil {
   108  		c.Ui.Output(fmt.Sprintf("unable to parse heartbeat TTL: %v", err))
   109  		return HealthCritical
   110  	}
   111  
   112  	lastHeartbeat, err := time.ParseDuration(clientStats["last_heartbeat"])
   113  	if err != nil {
   114  		c.Ui.Output(fmt.Sprintf("unable to parse last heartbeat: %v", err))
   115  		return HealthCritical
   116  	}
   117  
   118  	if lastHeartbeat > heartbeatTTL {
   119  		c.Ui.Output(fmt.Sprintf("last heartbeat was %q time ago, expected heartbeat ttl: %q", lastHeartbeat, heartbeatTTL))
   120  		return HealthCritical
   121  	}
   122  
   123  	if knownServers < minServers {
   124  		c.Ui.Output(fmt.Sprintf("known servers: %v, is less than expected number of servers: %v", knownServers, minServers))
   125  		return HealthCritical
   126  	}
   127  
   128  	return HealthPass
   129  }