github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/aagent/watchers/nagioswatcher/nagios.go (about) 1 // Copyright (c) 2020-2022, R.I. Pienaar and the Choria Project contributors 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 5 package nagioswatcher 6 7 import ( 8 "bytes" 9 "context" 10 "fmt" 11 "html/template" 12 "math/rand" 13 "os" 14 "os/exec" 15 "strings" 16 "sync" 17 "time" 18 19 "github.com/choria-io/go-choria/aagent/model" 20 "github.com/google/shlex" 21 "github.com/tidwall/gjson" 22 23 "github.com/choria-io/go-choria/aagent/util" 24 "github.com/choria-io/go-choria/aagent/watchers/event" 25 "github.com/choria-io/go-choria/aagent/watchers/watcher" 26 iu "github.com/choria-io/go-choria/internal/util" 27 ) 28 29 type State int 30 31 const ( 32 OK State = iota 33 WARNING 34 CRITICAL 35 UNKNOWN 36 SKIPPED 37 NOTCHECKED 38 39 wtype = "nagios" 40 version = "v1" 41 ) 42 43 var stateNames = map[State]string{ 44 OK: "OK", 45 WARNING: "WARNING", 46 CRITICAL: "CRITICAL", 47 UNKNOWN: "UNKNOWN", 48 49 // these are internal states that doesnt cause prom updates 50 // or matching state transitions, they are there to force transitions 51 // to unknown on the first time and to avoid immediate double checks 52 // when transitioning between states 53 SKIPPED: "SKIPPED", 54 NOTCHECKED: "NOTCHECKED", 55 } 56 57 var intStates = map[int]State{ 58 int(OK): OK, 59 int(WARNING): WARNING, 60 int(CRITICAL): CRITICAL, 61 int(UNKNOWN): UNKNOWN, 62 int(SKIPPED): SKIPPED, 63 int(NOTCHECKED): NOTCHECKED, 64 } 65 66 // StateName returns friendly name for a state 67 func StateName(s int) string { 68 state, ok := intStates[s] 69 if !ok { 70 return stateNames[UNKNOWN] 71 } 72 73 return stateNames[state] 74 } 75 76 type properties struct { 77 Annotations map[string]string 78 Plugin string 79 Gossfile string 80 Builtin string 81 Timeout time.Duration 82 LastMessage time.Duration `mapstructure:"last_message"` 83 CertExpiry time.Duration `mapstructure:"pubcert_expire"` 84 TokenExpiry time.Duration `mapstructure:"token_expire"` 85 } 86 87 type Execution struct { 88 Executed time.Time `json:"execute"` 89 Status int `json:"status"` 90 PerfData []util.PerfData `json:"perfdata,omitempty"` 91 } 92 93 type Watcher struct { 94 *watcher.Watcher 95 96 properties *properties 97 name string 98 machine model.Machine 99 interval time.Duration 100 previousRunTime time.Duration 101 previousOutput string 102 previousPerfData []util.PerfData 103 previousCheck time.Time 104 previousPlugin string 105 previous State 106 force bool 107 history []*Execution 108 machineName string 109 textFileDir string 110 111 watching bool 112 mu *sync.Mutex 113 } 114 115 func New(machine model.Machine, name string, states []string, failEvent string, successEvent string, interval string, ai time.Duration, properties map[string]any) (any, error) { 116 var err error 117 118 nw := &Watcher{ 119 machineName: machine.Name(), 120 textFileDir: machine.TextFileDirectory(), 121 name: name, 122 machine: machine, 123 previous: NOTCHECKED, 124 history: []*Execution{}, 125 mu: &sync.Mutex{}, 126 } 127 128 nw.Watcher, err = watcher.NewWatcher(name, wtype, ai, states, machine, failEvent, successEvent) 129 if err != nil { 130 return nil, err 131 } 132 133 err = nw.setProperties(properties) 134 if err != nil { 135 return nil, fmt.Errorf("could not set properties: %s", err) 136 } 137 138 if interval != "" { 139 nw.interval, err = iu.ParseDuration(interval) 140 if err != nil { 141 return nil, fmt.Errorf("invalid interval: %s", err) 142 } 143 144 if nw.interval < 500*time.Millisecond { 145 return nil, fmt.Errorf("interval %v is too small", nw.interval) 146 } 147 } 148 149 updatePromState(nw.machineName, UNKNOWN, machine.TextFileDirectory(), nw) 150 151 return nw, err 152 } 153 154 // Delete stops the watcher and remove it from the prom state after the check was removed from disk 155 func (w *Watcher) Delete() { 156 w.mu.Lock() 157 defer w.mu.Unlock() 158 159 // suppress next check and set state to unknown 160 w.previousCheck = time.Now() 161 deletePromState(w.machineName, w.textFileDir, w) 162 } 163 164 func (w *Watcher) CurrentState() any { 165 w.mu.Lock() 166 defer w.mu.Unlock() 167 168 s := &StateNotification{ 169 Event: event.New(w.name, wtype, version, w.machine), 170 Plugin: w.previousPlugin, 171 Status: stateNames[w.previous], 172 StatusCode: int(w.previous), 173 Output: w.previousOutput, 174 PerfData: w.previousPerfData, 175 RunTime: w.previousRunTime.Seconds(), 176 History: w.history, 177 Annotations: w.properties.Annotations, 178 CheckTime: w.previousCheck.Unix(), 179 } 180 181 if !w.previousCheck.IsZero() { 182 s.CheckTime = w.previousCheck.Unix() 183 } 184 185 return s 186 } 187 188 func (w *Watcher) validate() error { 189 if w.properties.Builtin != "" && w.properties.Plugin != "" { 190 return fmt.Errorf("cannot set plugin and builtin") 191 } 192 193 if w.properties.Builtin == "" && w.properties.Plugin == "" { 194 return fmt.Errorf("plugin or builtin is required") 195 } 196 197 if w.properties.Builtin == "goss" && w.properties.Gossfile == "" { 198 return fmt.Errorf("gossfile property is required for the goss builtin check") 199 } 200 201 if w.properties.Builtin == "choria_status" && w.properties.LastMessage == 0 { 202 return fmt.Errorf("last_message property is required for the choria_status builtin check") 203 } 204 205 if w.properties.Timeout == 0 { 206 w.properties.Timeout = time.Second 207 } 208 209 return nil 210 } 211 212 func (w *Watcher) setProperties(props map[string]any) error { 213 if w.properties == nil { 214 w.properties = &properties{ 215 Annotations: make(map[string]string), 216 Timeout: time.Second, 217 } 218 } 219 220 err := util.ParseMapStructure(props, &w.properties) 221 if err != nil { 222 return err 223 } 224 225 return w.validate() 226 } 227 228 func (w *Watcher) NotifyStateChance() { 229 var s State 230 switch w.machine.State() { 231 case "OK": 232 s = OK 233 case "WARNING": 234 s = WARNING 235 case "CRITICAL": 236 s = CRITICAL 237 case "UNKNOWN": 238 s = UNKNOWN 239 case "FORCE_CHECK": 240 w.Infof("Forcing a check of %s", w.machineName) 241 w.force = true 242 w.StateChangeC() <- struct{}{} 243 return 244 } 245 246 w.mu.Lock() 247 w.previous = s 248 w.mu.Unlock() 249 250 err := updatePromState(w.machineName, s, w.textFileDir, w) 251 if err != nil { 252 w.Errorf("Could not update prometheus: %s", err) 253 } 254 } 255 256 func (w *Watcher) Run(ctx context.Context, wg *sync.WaitGroup) { 257 defer wg.Done() 258 259 if w.textFileDir != "" { 260 w.Infof("nagios watcher starting, updating prometheus in %s", w.textFileDir) 261 } else { 262 w.Infof("nagios watcher starting, prometheus integration disabled") 263 } 264 265 if w.interval != 0 { 266 wg.Add(1) 267 go w.intervalWatcher(ctx, wg) 268 } 269 270 for { 271 select { 272 case <-w.StateChangeC(): 273 w.performWatch(ctx) 274 275 case <-ctx.Done(): 276 w.Infof("Stopping on context interrupt") 277 return 278 } 279 } 280 } 281 282 func (w *Watcher) intervalWatcher(ctx context.Context, wg *sync.WaitGroup) { 283 defer wg.Done() 284 285 splay := time.Duration(rand.Intn(int(w.interval.Seconds()))) * time.Second 286 w.Infof("Splaying first check by %v", splay) 287 288 select { 289 case <-time.NewTimer(splay).C: 290 case <-ctx.Done(): 291 return 292 } 293 294 tick := time.NewTicker(w.interval) 295 296 for { 297 select { 298 case <-tick.C: 299 w.performWatch(ctx) 300 301 case <-ctx.Done(): 302 tick.Stop() 303 return 304 } 305 } 306 } 307 308 func (w *Watcher) performWatch(ctx context.Context) { 309 if w.isWatching() { 310 return 311 } 312 313 start := time.Now().UTC() 314 state, err := w.watch(ctx) 315 err = w.handleCheck(start, state, false, err) 316 if err != nil { 317 w.Errorf("could not handle watcher event: %s", err) 318 } 319 } 320 321 func (w *Watcher) handleCheck(start time.Time, s State, external bool, err error) error { 322 if s == SKIPPED || s == NOTCHECKED { 323 return nil 324 } 325 326 w.Debugf("handling check for %s %s %v", w.properties.Plugin, stateNames[s], err) 327 328 w.mu.Lock() 329 w.previous = s 330 331 if len(w.history) >= 15 { 332 w.history = w.history[1:] 333 } 334 w.history = append(w.history, &Execution{Executed: start, Status: int(s), PerfData: w.previousPerfData}) 335 336 w.mu.Unlock() 337 338 // dont notify if we are externally transitioning because probably notifications were already sent 339 if !external { 340 w.NotifyWatcherState(w.CurrentState()) 341 } 342 343 w.Debugf("Notifying prometheus") 344 345 err = updatePromState(w.machineName, s, w.textFileDir, w) 346 if err != nil { 347 w.Errorf("Could not update prometheus: %s", err) 348 } 349 350 if external { 351 return nil 352 } 353 354 return w.Transition(stateNames[s]) 355 } 356 357 func (w *Watcher) processOverrides(c string) (string, error) { 358 res, err := template.New(w.name).Funcs(w.funcMap()).Parse(c) 359 if err != nil { 360 return c, err 361 } 362 363 wr := new(bytes.Buffer) 364 err = res.Execute(wr, struct{}{}) 365 if err != nil { 366 return c, err 367 } 368 369 return wr.String(), nil 370 } 371 372 func (w *Watcher) funcMap() template.FuncMap { 373 return template.FuncMap{ 374 "o": func(path string, dflt any) string { 375 overrides, err := w.machine.OverrideData() 376 if err != nil { 377 return fmt.Sprintf("%v", dflt) 378 } 379 380 if len(overrides) == 0 { 381 return fmt.Sprintf("%v", dflt) 382 } 383 384 r := gjson.GetBytes(overrides, w.machineName+"."+path) 385 if !r.Exists() { 386 return fmt.Sprintf("%v", dflt) 387 } 388 389 return r.String() 390 }, 391 } 392 } 393 394 func (w *Watcher) watchUsingPlugin(ctx context.Context) (state State, output string, err error) { 395 timeoutCtx, cancel := context.WithTimeout(ctx, w.properties.Timeout) 396 defer cancel() 397 398 plugin, err := w.processOverrides(w.properties.Plugin) 399 if err != nil { 400 w.Errorf("could not process overrides for plugin command: %s", err) 401 return UNKNOWN, "", err 402 } 403 404 w.Infof("Running %s", w.properties.Plugin) 405 406 splitcmd, err := shlex.Split(plugin) 407 if err != nil { 408 w.Errorf("Exec watcher %s failed: %s", plugin, err) 409 return UNKNOWN, "", err 410 } 411 412 w.previousPlugin = plugin 413 414 cmd := exec.CommandContext(timeoutCtx, splitcmd[0], splitcmd[1:]...) 415 cmd.Env = append(cmd.Env, fmt.Sprintf("MACHINE_WATCHER_NAME=%s", w.name)) 416 cmd.Env = append(cmd.Env, fmt.Sprintf("MACHINE_NAME=%s", w.machineName)) 417 cmd.Env = append(cmd.Env, fmt.Sprintf("PATH=%s%s%s", os.Getenv("PATH"), string(os.PathListSeparator), w.machine.Directory())) 418 cmd.Dir = w.machine.Directory() 419 420 var pstate *os.ProcessState 421 422 outb, err := cmd.CombinedOutput() 423 if err != nil { 424 eerr, ok := err.(*exec.ExitError) 425 if ok { 426 pstate = eerr.ProcessState 427 } else { 428 w.Errorf("Exec watcher %s failed: %s", w.properties.Plugin, err) 429 w.previousOutput = err.Error() 430 return UNKNOWN, "", err 431 } 432 } else { 433 pstate = cmd.ProcessState 434 } 435 436 output = string(outb) 437 438 w.Debugf("Output from %s: %s", w.properties.Plugin, output) 439 440 s, ok := intStates[pstate.ExitCode()] 441 if ok { 442 return s, output, nil 443 } 444 445 return UNKNOWN, output, nil 446 } 447 448 func (w *Watcher) watchUsingBuiltin(_ context.Context) (state State, output string, err error) { 449 w.previousPlugin = w.properties.Builtin 450 451 switch { 452 case w.properties.Builtin == "heartbeat": 453 return w.builtinHeartbeat() 454 case strings.HasPrefix(w.properties.Builtin, "goss"): 455 return w.watchUsingGoss() 456 case w.properties.Builtin == "choria_status": 457 return w.watchUsingChoria() 458 default: 459 return UNKNOWN, "", fmt.Errorf("unsupported builtin %q", w.properties.Builtin) 460 } 461 } 462 463 func (w *Watcher) startWatching() { 464 w.mu.Lock() 465 w.watching = true 466 w.mu.Unlock() 467 } 468 469 func (w *Watcher) isWatching() bool { 470 w.mu.Lock() 471 defer w.mu.Unlock() 472 473 return w.watching 474 } 475 476 func (w *Watcher) stopWatching() { 477 w.mu.Lock() 478 w.watching = false 479 w.mu.Unlock() 480 } 481 482 func (w *Watcher) watch(ctx context.Context) (state State, err error) { 483 if !w.ShouldWatch() { 484 return SKIPPED, nil 485 } 486 487 w.startWatching() 488 defer w.stopWatching() 489 490 start := time.Now() 491 w.previousCheck = start 492 defer func() { 493 w.mu.Lock() 494 w.previousRunTime = time.Since(start) 495 w.mu.Unlock() 496 }() 497 498 var output string 499 500 switch { 501 case w.properties.Plugin != "": 502 state, output, err = w.watchUsingPlugin(ctx) 503 case w.properties.Builtin != "": 504 state, output, err = w.watchUsingBuiltin(ctx) 505 default: 506 state = UNKNOWN 507 err = fmt.Errorf("command or builtin required") 508 } 509 510 w.previousOutput = strings.TrimSpace(output) 511 w.previousPerfData = util.ParsePerfData(output) 512 513 return state, err 514 } 515 516 func (w *Watcher) ShouldWatch() bool { 517 if w.force { 518 w.force = false 519 return true 520 } 521 522 since := time.Since(w.previousCheck) 523 if !w.previousCheck.IsZero() && since < w.interval-time.Second { 524 w.Debugf("Skipping check due to previous check being %v sooner than interval %v", since, w.interval) 525 return false 526 } 527 528 return w.Watcher.ShouldWatch() 529 }