github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/command/check.go (about) 1 package command 2 3 import ( 4 "fmt" 5 "reflect" 6 "strconv" 7 "strings" 8 "time" 9 ) 10 11 const ( 12 HealthCritical = 2 13 HealthWarn = 1 14 HealthPass = 0 15 HealthUnknown = 3 16 ) 17 18 type AgentCheckCommand struct { 19 Meta 20 } 21 22 func (c *AgentCheckCommand) Help() string { 23 helpText := ` 24 Usage: nomad check 25 26 Display state of the Nomad agent. The exit code of the command is Nagios 27 compatible and could be used with alerting systems. 28 29 General Options: 30 31 ` + generalOptionsUsage() + ` 32 33 Agent Check Options: 34 35 -min-peers 36 Minimum number of peers that a server is expected to know. 37 38 -min-servers 39 Minumum number of servers that a client is expected to know. 40 ` 41 42 return strings.TrimSpace(helpText) 43 } 44 45 func (c *AgentCheckCommand) Synopsis() string { 46 return "Displays health of the local Nomad agent" 47 } 48 49 func (c *AgentCheckCommand) Run(args []string) int { 50 var minPeers, minServers int 51 52 flags := c.Meta.FlagSet("check", FlagSetClient) 53 flags.Usage = func() { c.Ui.Output(c.Help()) } 54 flags.IntVar(&minPeers, "min-peers", 0, "") 55 flags.IntVar(&minServers, "min-servers", 1, "") 56 57 if err := flags.Parse(args); err != nil { 58 return 1 59 } 60 61 client, err := c.Meta.Client() 62 if err != nil { 63 c.Ui.Error(fmt.Sprintf("error initializing client: %s", err)) 64 return HealthCritical 65 } 66 67 info, err := client.Agent().Self() 68 if err != nil { 69 c.Ui.Output(fmt.Sprintf("unable to query agent info: %v", err)) 70 return HealthCritical 71 } 72 if stats, ok := info["stats"]; !ok && (reflect.TypeOf(stats).Kind() == reflect.Map) { 73 c.Ui.Error("error getting stats from the agent api") 74 return 1 75 } 76 if _, ok := info["stats"]["nomad"]; ok { 77 return c.checkServerHealth(info["stats"], minPeers) 78 } 79 80 if _, ok := info["stats"]["client"]; ok { 81 return c.checkClientHealth(info["stats"], minServers) 82 } 83 return HealthWarn 84 } 85 86 // checkServerHealth returns the health of a server. 87 // TODO Add more rules for determining server health 88 func (c *AgentCheckCommand) checkServerHealth(info map[string]interface{}, minPeers int) int { 89 raft := info["raft"].(map[string]interface{}) 90 knownPeers, err := strconv.Atoi(raft["num_peers"].(string)) 91 if err != nil { 92 c.Ui.Output(fmt.Sprintf("unable to get known peers: %v", err)) 93 return HealthCritical 94 } 95 96 if knownPeers < minPeers { 97 c.Ui.Output(fmt.Sprintf("known peers: %v, is less than expected number of peers: %v", knownPeers, minPeers)) 98 return HealthCritical 99 } 100 return HealthPass 101 } 102 103 // checkClientHealth returns the health of a client 104 func (c *AgentCheckCommand) checkClientHealth(info map[string]interface{}, minServers int) int { 105 clientStats := info["client"].(map[string]interface{}) 106 knownServers, err := strconv.Atoi(clientStats["known_servers"].(string)) 107 if err != nil { 108 c.Ui.Output(fmt.Sprintf("unable to get known servers: %v", err)) 109 return HealthCritical 110 } 111 112 heartbeatTTL, err := time.ParseDuration(clientStats["heartbeat_ttl"].(string)) 113 if err != nil { 114 c.Ui.Output(fmt.Sprintf("unable to parse heartbeat TTL: %v", err)) 115 return HealthCritical 116 } 117 118 lastHeartbeat, err := time.ParseDuration(clientStats["last_heartbeat"].(string)) 119 if err != nil { 120 c.Ui.Output(fmt.Sprintf("unable to parse last heartbeat: %v", err)) 121 return HealthCritical 122 } 123 124 if lastHeartbeat > heartbeatTTL { 125 c.Ui.Output(fmt.Sprintf("last heartbeat was %q time ago, expected heartbeat ttl: %q", lastHeartbeat, heartbeatTTL)) 126 return HealthCritical 127 } 128 129 if knownServers < minServers { 130 c.Ui.Output(fmt.Sprintf("known servers: %v, is less than expected number of servers: %v", knownServers, minServers)) 131 return HealthCritical 132 } 133 134 return HealthPass 135 }