github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/aagent/watchers/nagioswatcher/nagios.go (about)

     1  // Copyright (c) 2020-2022, R.I. Pienaar and the Choria Project contributors
     2  //
     3  // SPDX-License-Identifier: Apache-2.0
     4  
     5  package nagioswatcher
     6  
import (
	"bytes"
	"context"
	"fmt"
	"html/template"
	"math/rand"
	"os"
	"os/exec"
	"strings"
	"sync"
	texttemplate "text/template"
	"time"

	"github.com/choria-io/go-choria/aagent/model"
	"github.com/google/shlex"
	"github.com/tidwall/gjson"

	"github.com/choria-io/go-choria/aagent/util"
	"github.com/choria-io/go-choria/aagent/watchers/event"
	"github.com/choria-io/go-choria/aagent/watchers/watcher"
	iu "github.com/choria-io/go-choria/internal/util"
)
    28  
// State is the outcome of a check. Plugin exit codes are mapped onto these
// values through intStates.
type State int

const (
	// OK, WARNING, CRITICAL and UNKNOWN take the values 0 to 3 and are the
	// states a check can report.
	OK State = iota
	WARNING
	CRITICAL
	UNKNOWN
	// SKIPPED is returned by watch when a check should not run right now.
	SKIPPED
	// NOTCHECKED is the state a new Watcher starts in.
	NOTCHECKED

	// wtype is the watcher type name passed to watcher.NewWatcher and event.New.
	wtype = "nagios"
	// version is the notification version passed to event.New.
	version = "v1"
)
    42  
// stateNames maps every State to the name used in notifications and
// machine transitions.
var stateNames = map[State]string{
	OK:       "OK",
	WARNING:  "WARNING",
	CRITICAL: "CRITICAL",
	UNKNOWN:  "UNKNOWN",

	// These are internal states that do not cause prom updates or matching
	// state transitions. They exist to force a transition to unknown the
	// first time and to avoid immediate double checks when transitioning
	// between states.
	SKIPPED:    "SKIPPED",
	NOTCHECKED: "NOTCHECKED",
}
    56  
// intStates maps an integer, such as a plugin exit code, to its State.
// Integers that are not present have no corresponding State; callers in this
// file treat them as UNKNOWN.
var intStates = map[int]State{
	int(OK):         OK,
	int(WARNING):    WARNING,
	int(CRITICAL):   CRITICAL,
	int(UNKNOWN):    UNKNOWN,
	int(SKIPPED):    SKIPPED,
	int(NOTCHECKED): NOTCHECKED,
}
    65  
    66  // StateName returns friendly name for a state
    67  func StateName(s int) string {
    68  	state, ok := intStates[s]
    69  	if !ok {
    70  		return stateNames[UNKNOWN]
    71  	}
    72  
    73  	return stateNames[state]
    74  }
    75  
// properties holds the watcher configuration decoded from the properties map
// by setProperties.
type properties struct {
	// Annotations are copied verbatim into state notifications.
	Annotations map[string]string
	// Plugin is the command line of an external check; mutually exclusive with Builtin.
	Plugin string
	// Gossfile is required when Builtin is "goss".
	Gossfile string
	// Builtin names an internal check; mutually exclusive with Plugin.
	Builtin string
	// Timeout bounds a plugin execution; it defaults to one second.
	Timeout time.Duration
	// LastMessage is required when Builtin is "choria_status".
	LastMessage time.Duration `mapstructure:"last_message"`
	// CertExpiry and TokenExpiry are not used in this part of the file;
	// presumably they are thresholds for the choria_status builtin — TODO confirm.
	CertExpiry  time.Duration `mapstructure:"pubcert_expire"`
	TokenExpiry time.Duration `mapstructure:"token_expire"`
}
    86  
// Execution is one entry in the check history kept by the Watcher.
type Execution struct {
	// Executed is the time the check was started; note the JSON key is "execute".
	Executed time.Time `json:"execute"`
	// Status is the numeric State the check resulted in.
	Status int `json:"status"`
	// PerfData is the performance data recorded for the check, if any.
	PerfData []util.PerfData `json:"perfdata,omitempty"`
}
    92  
// Watcher runs a Nagios style check, either an external plugin or a builtin,
// and records the outcome of the most recent check along with a short history.
type Watcher struct {
	*watcher.Watcher

	properties *properties
	name       string
	machine    model.Machine
	// interval is the time between scheduled checks; zero disables the interval loop.
	interval time.Duration
	// The previous* fields describe the most recent check and are what
	// CurrentState publishes.
	previousRunTime  time.Duration
	previousOutput   string
	previousPerfData []util.PerfData
	previousCheck    time.Time
	previousPlugin   string
	previous         State
	// force makes the next ShouldWatch call return true once.
	force bool
	// history holds the most recent executions, capped by handleCheck.
	history     []*Execution
	machineName string
	// textFileDir is where prometheus state is written; empty disables the integration.
	textFileDir string

	// watching is true while a check is in progress.
	watching bool
	mu       *sync.Mutex
}
   114  
   115  func New(machine model.Machine, name string, states []string, failEvent string, successEvent string, interval string, ai time.Duration, properties map[string]any) (any, error) {
   116  	var err error
   117  
   118  	nw := &Watcher{
   119  		machineName: machine.Name(),
   120  		textFileDir: machine.TextFileDirectory(),
   121  		name:        name,
   122  		machine:     machine,
   123  		previous:    NOTCHECKED,
   124  		history:     []*Execution{},
   125  		mu:          &sync.Mutex{},
   126  	}
   127  
   128  	nw.Watcher, err = watcher.NewWatcher(name, wtype, ai, states, machine, failEvent, successEvent)
   129  	if err != nil {
   130  		return nil, err
   131  	}
   132  
   133  	err = nw.setProperties(properties)
   134  	if err != nil {
   135  		return nil, fmt.Errorf("could not set properties: %s", err)
   136  	}
   137  
   138  	if interval != "" {
   139  		nw.interval, err = iu.ParseDuration(interval)
   140  		if err != nil {
   141  			return nil, fmt.Errorf("invalid interval: %s", err)
   142  		}
   143  
   144  		if nw.interval < 500*time.Millisecond {
   145  			return nil, fmt.Errorf("interval %v is too small", nw.interval)
   146  		}
   147  	}
   148  
   149  	updatePromState(nw.machineName, UNKNOWN, machine.TextFileDirectory(), nw)
   150  
   151  	return nw, err
   152  }
   153  
// Delete removes the watcher from the prom state and suppresses the next
// check. It is intended to be called after the check was removed from disk.
func (w *Watcher) Delete() {
	w.mu.Lock()
	defer w.mu.Unlock()

	// a fresh previousCheck makes ShouldWatch skip the next check
	w.previousCheck = time.Now()
	deletePromState(w.machineName, w.textFileDir, w)
}
   163  
   164  func (w *Watcher) CurrentState() any {
   165  	w.mu.Lock()
   166  	defer w.mu.Unlock()
   167  
   168  	s := &StateNotification{
   169  		Event:       event.New(w.name, wtype, version, w.machine),
   170  		Plugin:      w.previousPlugin,
   171  		Status:      stateNames[w.previous],
   172  		StatusCode:  int(w.previous),
   173  		Output:      w.previousOutput,
   174  		PerfData:    w.previousPerfData,
   175  		RunTime:     w.previousRunTime.Seconds(),
   176  		History:     w.history,
   177  		Annotations: w.properties.Annotations,
   178  		CheckTime:   w.previousCheck.Unix(),
   179  	}
   180  
   181  	if !w.previousCheck.IsZero() {
   182  		s.CheckTime = w.previousCheck.Unix()
   183  	}
   184  
   185  	return s
   186  }
   187  
   188  func (w *Watcher) validate() error {
   189  	if w.properties.Builtin != "" && w.properties.Plugin != "" {
   190  		return fmt.Errorf("cannot set plugin and builtin")
   191  	}
   192  
   193  	if w.properties.Builtin == "" && w.properties.Plugin == "" {
   194  		return fmt.Errorf("plugin or builtin is required")
   195  	}
   196  
   197  	if w.properties.Builtin == "goss" && w.properties.Gossfile == "" {
   198  		return fmt.Errorf("gossfile property is required for the goss builtin check")
   199  	}
   200  
   201  	if w.properties.Builtin == "choria_status" && w.properties.LastMessage == 0 {
   202  		return fmt.Errorf("last_message property is required for the choria_status builtin check")
   203  	}
   204  
   205  	if w.properties.Timeout == 0 {
   206  		w.properties.Timeout = time.Second
   207  	}
   208  
   209  	return nil
   210  }
   211  
   212  func (w *Watcher) setProperties(props map[string]any) error {
   213  	if w.properties == nil {
   214  		w.properties = &properties{
   215  			Annotations: make(map[string]string),
   216  			Timeout:     time.Second,
   217  		}
   218  	}
   219  
   220  	err := util.ParseMapStructure(props, &w.properties)
   221  	if err != nil {
   222  		return err
   223  	}
   224  
   225  	return w.validate()
   226  }
   227  
   228  func (w *Watcher) NotifyStateChance() {
   229  	var s State
   230  	switch w.machine.State() {
   231  	case "OK":
   232  		s = OK
   233  	case "WARNING":
   234  		s = WARNING
   235  	case "CRITICAL":
   236  		s = CRITICAL
   237  	case "UNKNOWN":
   238  		s = UNKNOWN
   239  	case "FORCE_CHECK":
   240  		w.Infof("Forcing a check of %s", w.machineName)
   241  		w.force = true
   242  		w.StateChangeC() <- struct{}{}
   243  		return
   244  	}
   245  
   246  	w.mu.Lock()
   247  	w.previous = s
   248  	w.mu.Unlock()
   249  
   250  	err := updatePromState(w.machineName, s, w.textFileDir, w)
   251  	if err != nil {
   252  		w.Errorf("Could not update prometheus: %s", err)
   253  	}
   254  }
   255  
   256  func (w *Watcher) Run(ctx context.Context, wg *sync.WaitGroup) {
   257  	defer wg.Done()
   258  
   259  	if w.textFileDir != "" {
   260  		w.Infof("nagios watcher starting, updating prometheus in %s", w.textFileDir)
   261  	} else {
   262  		w.Infof("nagios watcher starting, prometheus integration disabled")
   263  	}
   264  
   265  	if w.interval != 0 {
   266  		wg.Add(1)
   267  		go w.intervalWatcher(ctx, wg)
   268  	}
   269  
   270  	for {
   271  		select {
   272  		case <-w.StateChangeC():
   273  			w.performWatch(ctx)
   274  
   275  		case <-ctx.Done():
   276  			w.Infof("Stopping on context interrupt")
   277  			return
   278  		}
   279  	}
   280  }
   281  
   282  func (w *Watcher) intervalWatcher(ctx context.Context, wg *sync.WaitGroup) {
   283  	defer wg.Done()
   284  
   285  	splay := time.Duration(rand.Intn(int(w.interval.Seconds()))) * time.Second
   286  	w.Infof("Splaying first check by %v", splay)
   287  
   288  	select {
   289  	case <-time.NewTimer(splay).C:
   290  	case <-ctx.Done():
   291  		return
   292  	}
   293  
   294  	tick := time.NewTicker(w.interval)
   295  
   296  	for {
   297  		select {
   298  		case <-tick.C:
   299  			w.performWatch(ctx)
   300  
   301  		case <-ctx.Done():
   302  			tick.Stop()
   303  			return
   304  		}
   305  	}
   306  }
   307  
   308  func (w *Watcher) performWatch(ctx context.Context) {
   309  	if w.isWatching() {
   310  		return
   311  	}
   312  
   313  	start := time.Now().UTC()
   314  	state, err := w.watch(ctx)
   315  	err = w.handleCheck(start, state, false, err)
   316  	if err != nil {
   317  		w.Errorf("could not handle watcher event: %s", err)
   318  	}
   319  }
   320  
   321  func (w *Watcher) handleCheck(start time.Time, s State, external bool, err error) error {
   322  	if s == SKIPPED || s == NOTCHECKED {
   323  		return nil
   324  	}
   325  
   326  	w.Debugf("handling check for %s %s %v", w.properties.Plugin, stateNames[s], err)
   327  
   328  	w.mu.Lock()
   329  	w.previous = s
   330  
   331  	if len(w.history) >= 15 {
   332  		w.history = w.history[1:]
   333  	}
   334  	w.history = append(w.history, &Execution{Executed: start, Status: int(s), PerfData: w.previousPerfData})
   335  
   336  	w.mu.Unlock()
   337  
   338  	// dont notify if we are externally transitioning because probably notifications were already sent
   339  	if !external {
   340  		w.NotifyWatcherState(w.CurrentState())
   341  	}
   342  
   343  	w.Debugf("Notifying prometheus")
   344  
   345  	err = updatePromState(w.machineName, s, w.textFileDir, w)
   346  	if err != nil {
   347  		w.Errorf("Could not update prometheus: %s", err)
   348  	}
   349  
   350  	if external {
   351  		return nil
   352  	}
   353  
   354  	return w.Transition(stateNames[s])
   355  }
   356  
   357  func (w *Watcher) processOverrides(c string) (string, error) {
   358  	res, err := template.New(w.name).Funcs(w.funcMap()).Parse(c)
   359  	if err != nil {
   360  		return c, err
   361  	}
   362  
   363  	wr := new(bytes.Buffer)
   364  	err = res.Execute(wr, struct{}{})
   365  	if err != nil {
   366  		return c, err
   367  	}
   368  
   369  	return wr.String(), nil
   370  }
   371  
   372  func (w *Watcher) funcMap() template.FuncMap {
   373  	return template.FuncMap{
   374  		"o": func(path string, dflt any) string {
   375  			overrides, err := w.machine.OverrideData()
   376  			if err != nil {
   377  				return fmt.Sprintf("%v", dflt)
   378  			}
   379  
   380  			if len(overrides) == 0 {
   381  				return fmt.Sprintf("%v", dflt)
   382  			}
   383  
   384  			r := gjson.GetBytes(overrides, w.machineName+"."+path)
   385  			if !r.Exists() {
   386  				return fmt.Sprintf("%v", dflt)
   387  			}
   388  
   389  			return r.String()
   390  		},
   391  	}
   392  }
   393  
   394  func (w *Watcher) watchUsingPlugin(ctx context.Context) (state State, output string, err error) {
   395  	timeoutCtx, cancel := context.WithTimeout(ctx, w.properties.Timeout)
   396  	defer cancel()
   397  
   398  	plugin, err := w.processOverrides(w.properties.Plugin)
   399  	if err != nil {
   400  		w.Errorf("could not process overrides for plugin command: %s", err)
   401  		return UNKNOWN, "", err
   402  	}
   403  
   404  	w.Infof("Running %s", w.properties.Plugin)
   405  
   406  	splitcmd, err := shlex.Split(plugin)
   407  	if err != nil {
   408  		w.Errorf("Exec watcher %s failed: %s", plugin, err)
   409  		return UNKNOWN, "", err
   410  	}
   411  
   412  	w.previousPlugin = plugin
   413  
   414  	cmd := exec.CommandContext(timeoutCtx, splitcmd[0], splitcmd[1:]...)
   415  	cmd.Env = append(cmd.Env, fmt.Sprintf("MACHINE_WATCHER_NAME=%s", w.name))
   416  	cmd.Env = append(cmd.Env, fmt.Sprintf("MACHINE_NAME=%s", w.machineName))
   417  	cmd.Env = append(cmd.Env, fmt.Sprintf("PATH=%s%s%s", os.Getenv("PATH"), string(os.PathListSeparator), w.machine.Directory()))
   418  	cmd.Dir = w.machine.Directory()
   419  
   420  	var pstate *os.ProcessState
   421  
   422  	outb, err := cmd.CombinedOutput()
   423  	if err != nil {
   424  		eerr, ok := err.(*exec.ExitError)
   425  		if ok {
   426  			pstate = eerr.ProcessState
   427  		} else {
   428  			w.Errorf("Exec watcher %s failed: %s", w.properties.Plugin, err)
   429  			w.previousOutput = err.Error()
   430  			return UNKNOWN, "", err
   431  		}
   432  	} else {
   433  		pstate = cmd.ProcessState
   434  	}
   435  
   436  	output = string(outb)
   437  
   438  	w.Debugf("Output from %s: %s", w.properties.Plugin, output)
   439  
   440  	s, ok := intStates[pstate.ExitCode()]
   441  	if ok {
   442  		return s, output, nil
   443  	}
   444  
   445  	return UNKNOWN, output, nil
   446  }
   447  
   448  func (w *Watcher) watchUsingBuiltin(_ context.Context) (state State, output string, err error) {
   449  	w.previousPlugin = w.properties.Builtin
   450  
   451  	switch {
   452  	case w.properties.Builtin == "heartbeat":
   453  		return w.builtinHeartbeat()
   454  	case strings.HasPrefix(w.properties.Builtin, "goss"):
   455  		return w.watchUsingGoss()
   456  	case w.properties.Builtin == "choria_status":
   457  		return w.watchUsingChoria()
   458  	default:
   459  		return UNKNOWN, "", fmt.Errorf("unsupported builtin %q", w.properties.Builtin)
   460  	}
   461  }
   462  
   463  func (w *Watcher) startWatching() {
   464  	w.mu.Lock()
   465  	w.watching = true
   466  	w.mu.Unlock()
   467  }
   468  
   469  func (w *Watcher) isWatching() bool {
   470  	w.mu.Lock()
   471  	defer w.mu.Unlock()
   472  
   473  	return w.watching
   474  }
   475  
   476  func (w *Watcher) stopWatching() {
   477  	w.mu.Lock()
   478  	w.watching = false
   479  	w.mu.Unlock()
   480  }
   481  
   482  func (w *Watcher) watch(ctx context.Context) (state State, err error) {
   483  	if !w.ShouldWatch() {
   484  		return SKIPPED, nil
   485  	}
   486  
   487  	w.startWatching()
   488  	defer w.stopWatching()
   489  
   490  	start := time.Now()
   491  	w.previousCheck = start
   492  	defer func() {
   493  		w.mu.Lock()
   494  		w.previousRunTime = time.Since(start)
   495  		w.mu.Unlock()
   496  	}()
   497  
   498  	var output string
   499  
   500  	switch {
   501  	case w.properties.Plugin != "":
   502  		state, output, err = w.watchUsingPlugin(ctx)
   503  	case w.properties.Builtin != "":
   504  		state, output, err = w.watchUsingBuiltin(ctx)
   505  	default:
   506  		state = UNKNOWN
   507  		err = fmt.Errorf("command or builtin required")
   508  	}
   509  
   510  	w.previousOutput = strings.TrimSpace(output)
   511  	w.previousPerfData = util.ParsePerfData(output)
   512  
   513  	return state, err
   514  }
   515  
   516  func (w *Watcher) ShouldWatch() bool {
   517  	if w.force {
   518  		w.force = false
   519  		return true
   520  	}
   521  
   522  	since := time.Since(w.previousCheck)
   523  	if !w.previousCheck.IsZero() && since < w.interval-time.Second {
   524  		w.Debugf("Skipping check due to previous check being %v sooner than interval %v", since, w.interval)
   525  		return false
   526  	}
   527  
   528  	return w.Watcher.ShouldWatch()
   529  }