github.com/diptanu/nomad@v0.5.7-0.20170516172507-d72e86cbe3d9/command/check.go (about) 1 package command 2 3 import ( 4 "fmt" 5 "strconv" 6 "strings" 7 "time" 8 ) 9 10 const ( 11 HealthCritical = 2 12 HealthWarn = 1 13 HealthPass = 0 14 HealthUnknown = 3 15 ) 16 17 type AgentCheckCommand struct { 18 Meta 19 } 20 21 func (c *AgentCheckCommand) Help() string { 22 helpText := ` 23 Usage: nomad check [options] 24 25 Display state of the Nomad agent. The exit code of the command is Nagios 26 compatible and could be used with alerting systems. 27 28 General Options: 29 30 ` + generalOptionsUsage() + ` 31 32 Agent Check Options: 33 34 -min-peers 35 Minimum number of peers that a server is expected to know. 36 37 -min-servers 38 Minumum number of servers that a client is expected to know. 39 ` 40 41 return strings.TrimSpace(helpText) 42 } 43 44 func (c *AgentCheckCommand) Synopsis() string { 45 return "Displays health of the local Nomad agent" 46 } 47 48 func (c *AgentCheckCommand) Run(args []string) int { 49 var minPeers, minServers int 50 51 flags := c.Meta.FlagSet("check", FlagSetClient) 52 flags.Usage = func() { c.Ui.Output(c.Help()) } 53 flags.IntVar(&minPeers, "min-peers", 0, "") 54 flags.IntVar(&minServers, "min-servers", 1, "") 55 56 if err := flags.Parse(args); err != nil { 57 return 1 58 } 59 60 client, err := c.Meta.Client() 61 if err != nil { 62 c.Ui.Error(fmt.Sprintf("error initializing client: %s", err)) 63 return HealthCritical 64 } 65 66 info, err := client.Agent().Self() 67 if err != nil { 68 c.Ui.Output(fmt.Sprintf("unable to query agent info: %v", err)) 69 return HealthCritical 70 } 71 if _, ok := info.Stats["nomad"]; ok { 72 return c.checkServerHealth(info.Stats, minPeers) 73 } 74 75 if clientStats, ok := info.Stats["client"]; ok { 76 return c.checkClientHealth(clientStats, minServers) 77 } 78 return HealthWarn 79 } 80 81 // checkServerHealth returns the health of a server. 82 // TODO Add more rules for determining server health 83 func (c *AgentCheckCommand) checkServerHealth(info map[string]map[string]string, minPeers int) int { 84 raft := info["raft"] 85 knownPeers, err := strconv.Atoi(raft["num_peers"]) 86 if err != nil { 87 c.Ui.Output(fmt.Sprintf("unable to get known peers: %v", err)) 88 return HealthCritical 89 } 90 91 if knownPeers < minPeers { 92 c.Ui.Output(fmt.Sprintf("known peers: %v, is less than expected number of peers: %v", knownPeers, minPeers)) 93 return HealthCritical 94 } 95 return HealthPass 96 } 97 98 // checkClientHealth returns the health of a client 99 func (c *AgentCheckCommand) checkClientHealth(clientStats map[string]string, minServers int) int { 100 knownServers, err := strconv.Atoi(clientStats["known_servers"]) 101 if err != nil { 102 c.Ui.Output(fmt.Sprintf("unable to get known servers: %v", err)) 103 return HealthCritical 104 } 105 106 heartbeatTTL, err := time.ParseDuration(clientStats["heartbeat_ttl"]) 107 if err != nil { 108 c.Ui.Output(fmt.Sprintf("unable to parse heartbeat TTL: %v", err)) 109 return HealthCritical 110 } 111 112 lastHeartbeat, err := time.ParseDuration(clientStats["last_heartbeat"]) 113 if err != nil { 114 c.Ui.Output(fmt.Sprintf("unable to parse last heartbeat: %v", err)) 115 return HealthCritical 116 } 117 118 if lastHeartbeat > heartbeatTTL { 119 c.Ui.Output(fmt.Sprintf("last heartbeat was %q time ago, expected heartbeat ttl: %q", lastHeartbeat, heartbeatTTL)) 120 return HealthCritical 121 } 122 123 if knownServers < minServers { 124 c.Ui.Output(fmt.Sprintf("known servers: %v, is less than expected number of servers: %v", knownServers, minServers)) 125 return HealthCritical 126 } 127 128 return HealthPass 129 }