github.com/mattyr/nomad@v0.3.3-0.20160919021406-3485a065154a/command/check.go (about)

     1  package command
     2  
     3  import (
     4  	"fmt"
     5  	"reflect"
     6  	"strconv"
     7  	"strings"
     8  	"time"
     9  )
    10  
    11  const (
    12  	HealthCritical = 2
    13  	HealthWarn     = 1
    14  	HealthPass     = 0
    15  	HealthUnknown  = 3
    16  )
    17  
// AgentCheckCommand implements the "check" command, which reports the state
// of the Nomad agent through a Nagios compatible exit code.
type AgentCheckCommand struct {
	// Meta is embedded; the command uses its FlagSet, Client and Ui members.
	Meta
}
    21  
    22  func (c *AgentCheckCommand) Help() string {
    23  	helpText := `
    24  Usage: nomad check
    25  
    26    Display state of the Nomad agent. The exit code of the command is Nagios
    27    compatible and could be used with alerting systems.
    28  
    29  General Options:
    30  
    31    ` + generalOptionsUsage() + `
    32  
    33  Agent Check Options:
    34  
    35    -min-peers
    36       Minimum number of peers that a server is expected to know.
    37  
    38    -min-servers
    39       Minumum number of servers that a client is expected to know.
    40  `
    41  
    42  	return strings.TrimSpace(helpText)
    43  }
    44  
    45  func (c *AgentCheckCommand) Synopsis() string {
    46  	return "Displays health of the local Nomad agent"
    47  }
    48  
    49  func (c *AgentCheckCommand) Run(args []string) int {
    50  	var minPeers, minServers int
    51  
    52  	flags := c.Meta.FlagSet("check", FlagSetClient)
    53  	flags.Usage = func() { c.Ui.Output(c.Help()) }
    54  	flags.IntVar(&minPeers, "min-peers", 0, "")
    55  	flags.IntVar(&minServers, "min-servers", 1, "")
    56  
    57  	if err := flags.Parse(args); err != nil {
    58  		return 1
    59  	}
    60  
    61  	client, err := c.Meta.Client()
    62  	if err != nil {
    63  		c.Ui.Error(fmt.Sprintf("error initializing client: %s", err))
    64  		return HealthCritical
    65  	}
    66  
    67  	info, err := client.Agent().Self()
    68  	if err != nil {
    69  		c.Ui.Output(fmt.Sprintf("unable to query agent info: %v", err))
    70  		return HealthCritical
    71  	}
    72  	if stats, ok := info["stats"]; !ok && (reflect.TypeOf(stats).Kind() == reflect.Map) {
    73  		c.Ui.Error("error getting stats from the agent api")
    74  		return 1
    75  	}
    76  	if _, ok := info["stats"]["nomad"]; ok {
    77  		return c.checkServerHealth(info["stats"], minPeers)
    78  	}
    79  
    80  	if _, ok := info["stats"]["client"]; ok {
    81  		return c.checkClientHealth(info["stats"], minServers)
    82  	}
    83  	return HealthWarn
    84  }
    85  
    86  // checkServerHealth returns the health of a server.
    87  // TODO Add more rules for determining server health
    88  func (c *AgentCheckCommand) checkServerHealth(info map[string]interface{}, minPeers int) int {
    89  	raft := info["raft"].(map[string]interface{})
    90  	knownPeers, err := strconv.Atoi(raft["num_peers"].(string))
    91  	if err != nil {
    92  		c.Ui.Output(fmt.Sprintf("unable to get known peers: %v", err))
    93  		return HealthCritical
    94  	}
    95  
    96  	if knownPeers < minPeers {
    97  		c.Ui.Output(fmt.Sprintf("known peers: %v, is less than expected number of peers: %v", knownPeers, minPeers))
    98  		return HealthCritical
    99  	}
   100  	return HealthPass
   101  }
   102  
   103  // checkClientHealth returns the health of a client
   104  func (c *AgentCheckCommand) checkClientHealth(info map[string]interface{}, minServers int) int {
   105  	clientStats := info["client"].(map[string]interface{})
   106  	knownServers, err := strconv.Atoi(clientStats["known_servers"].(string))
   107  	if err != nil {
   108  		c.Ui.Output(fmt.Sprintf("unable to get known servers: %v", err))
   109  		return HealthCritical
   110  	}
   111  
   112  	heartbeatTTL, err := time.ParseDuration(clientStats["heartbeat_ttl"].(string))
   113  	if err != nil {
   114  		c.Ui.Output(fmt.Sprintf("unable to parse heartbeat TTL: %v", err))
   115  		return HealthCritical
   116  	}
   117  
   118  	lastHeartbeat, err := time.ParseDuration(clientStats["last_heartbeat"].(string))
   119  	if err != nil {
   120  		c.Ui.Output(fmt.Sprintf("unable to parse last heartbeat: %v", err))
   121  		return HealthCritical
   122  	}
   123  
   124  	if lastHeartbeat > heartbeatTTL {
   125  		c.Ui.Output(fmt.Sprintf("last heartbeat was %q time ago, expected heartbeat ttl: %q", lastHeartbeat, heartbeatTTL))
   126  		return HealthCritical
   127  	}
   128  
   129  	if knownServers < minServers {
   130  		c.Ui.Output(fmt.Sprintf("known servers: %v, is less than expected number of servers: %v", knownServers, minServers))
   131  		return HealthCritical
   132  	}
   133  
   134  	return HealthPass
   135  }