github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/aagent/watchers/metricwatcher/metric.go (about)

     1  // Copyright (c) 2020-2024, R.I. Pienaar and the Choria Project contributors
     2  //
     3  // SPDX-License-Identifier: Apache-2.0
     4  
     5  package metricwatcher
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"encoding/json"
    11  	"fmt"
    12  	"math/rand"
    13  	"net"
    14  	"os"
    15  	"os/exec"
    16  	"strconv"
    17  	"strings"
    18  	"sync"
    19  	"time"
    20  
    21  	"github.com/choria-io/go-choria/aagent/model"
    22  	"github.com/google/shlex"
    23  
    24  	"github.com/choria-io/go-choria/aagent/util"
    25  	"github.com/choria-io/go-choria/aagent/watchers/event"
    26  	"github.com/choria-io/go-choria/aagent/watchers/watcher"
    27  )
    28  
    29  const (
    30  	wtype   = "metric"
    31  	version = "v1"
    32  )
    33  
    34  type Metric struct {
    35  	Labels  map[string]string  `json:"labels"`
    36  	Metrics map[string]float64 `json:"metrics"`
    37  	Time    int64              `json:"time"`
    38  	name    string
    39  	machine string
    40  	seen    int
    41  }
    42  
    43  type properties struct {
    44  	Command        string
    45  	Interval       time.Duration
    46  	Labels         map[string]string
    47  	SkipPrometheus bool   `mapstructure:"skip_prometheus"`
    48  	StoreAsData    bool   `mapstructure:"store"`
    49  	GraphiteHost   string `mapstructure:"graphite_host"`
    50  	GraphitePort   string `mapstructure:"graphite_port"`
    51  	GraphitePrefix string `mapstructure:"graphite_prefix"`
    52  }
    53  
    54  type Watcher struct {
    55  	*watcher.Watcher
    56  
    57  	name            string
    58  	machine         model.Machine
    59  	previousRunTime time.Duration
    60  	previousResult  *Metric
    61  	properties      *properties
    62  
    63  	watching bool
    64  	mu       *sync.Mutex
    65  }
    66  
    67  func New(machine model.Machine, name string, states []string, failEvent string, successEvent string, interval string, ai time.Duration, rawprops map[string]any) (any, error) {
    68  	var err error
    69  
    70  	mw := &Watcher{
    71  		name:    name,
    72  		machine: machine,
    73  		mu:      &sync.Mutex{},
    74  	}
    75  
    76  	mw.Watcher, err = watcher.NewWatcher(name, wtype, ai, states, machine, failEvent, successEvent)
    77  	if err != nil {
    78  		return nil, err
    79  	}
    80  
    81  	err = mw.setProperties(rawprops)
    82  	if err != nil {
    83  		return nil, fmt.Errorf("could not set properties: %s", err)
    84  	}
    85  
    86  	if mw.properties.GraphitePrefix == "" {
    87  		mw.properties.GraphitePrefix = fmt.Sprintf("choria.%s", strings.ReplaceAll(name, " ", "-"))
    88  	}
    89  
    90  	if !mw.properties.SkipPrometheus {
    91  		savePromState(machine.TextFileDirectory(), mw)
    92  	}
    93  
    94  	return mw, nil
    95  }
    96  
    97  func (w *Watcher) Delete() {
    98  	if !w.properties.SkipPrometheus {
    99  		err := deletePromState(w.machine.TextFileDirectory(), w, w.machine.Name(), w.name)
   100  		if err != nil {
   101  			w.Errorf("could not delete from prometheus: %s", err)
   102  		}
   103  	}
   104  }
   105  
   106  func (w *Watcher) Run(ctx context.Context, wg *sync.WaitGroup) {
   107  	defer wg.Done()
   108  
   109  	w.Infof("metric watcher for %s starting", w.properties.Command)
   110  
   111  	splay := time.Duration(rand.Intn(int(w.properties.Interval.Seconds()))) * time.Second
   112  	w.Infof("Splaying first check by %v", splay)
   113  
   114  	select {
   115  	case <-time.NewTimer(splay).C:
   116  		w.performWatch(ctx)
   117  	case <-ctx.Done():
   118  		return
   119  	}
   120  
   121  	tick := time.NewTicker(w.properties.Interval)
   122  
   123  	for {
   124  		select {
   125  		case <-tick.C:
   126  			w.performWatch(ctx)
   127  
   128  		case <-w.StateChangeC():
   129  			w.performWatch(ctx)
   130  
   131  		case <-ctx.Done():
   132  			w.Infof("Stopping on context interrupt")
   133  			tick.Stop()
   134  			return
   135  		}
   136  	}
   137  }
   138  
   139  func (w *Watcher) startWatching() {
   140  	w.mu.Lock()
   141  	w.watching = true
   142  	w.mu.Unlock()
   143  }
   144  
   145  func (w *Watcher) isWatching() bool {
   146  	w.mu.Lock()
   147  	defer w.mu.Unlock()
   148  
   149  	return w.watching
   150  }
   151  
   152  func (w *Watcher) stopWatching() {
   153  	w.mu.Lock()
   154  	w.watching = false
   155  	w.mu.Unlock()
   156  }
   157  
   158  func (w *Watcher) watch(ctx context.Context) (state []byte, err error) {
   159  	if !w.ShouldWatch() {
   160  		return nil, nil
   161  	}
   162  
   163  	w.startWatching()
   164  	defer w.stopWatching()
   165  
   166  	start := time.Now()
   167  	defer func() {
   168  		w.mu.Lock()
   169  		w.previousRunTime = time.Since(start)
   170  		w.mu.Unlock()
   171  	}()
   172  
   173  	w.Infof("Running %s", w.properties.Command)
   174  
   175  	timeoutCtx, cancel := context.WithTimeout(ctx, time.Second)
   176  	defer cancel()
   177  
   178  	splitcmd, err := shlex.Split(w.properties.Command)
   179  	if err != nil {
   180  		w.Errorf("Metric watcher %s failed: %s", w.properties.Command, err)
   181  		return nil, err
   182  	}
   183  
   184  	cmd := exec.CommandContext(timeoutCtx, splitcmd[0], splitcmd[1:]...)
   185  	cmd.Env = append(cmd.Env, fmt.Sprintf("MACHINE_WATCHER_NAME=%s", w.name))
   186  	cmd.Env = append(cmd.Env, fmt.Sprintf("MACHINE_NAME=%s", w.machine.Name()))
   187  	cmd.Env = append(cmd.Env, fmt.Sprintf("PATH=%s%s%s", os.Getenv("PATH"), string(os.PathListSeparator), w.machine.Directory()))
   188  	cmd.Dir = w.machine.Directory()
   189  
   190  	output, err := cmd.CombinedOutput()
   191  	if err != nil {
   192  		w.Errorf("Metric watcher %s failed: %s", w.properties.Command, err)
   193  		return nil, err
   194  	}
   195  
   196  	w.Debugf("Output from %s: %s", w.properties.Command, output)
   197  
   198  	return output, nil
   199  }
   200  
   201  func (w *Watcher) performWatch(ctx context.Context) {
   202  	if w.isWatching() {
   203  		return
   204  	}
   205  
   206  	metric, err := w.watch(ctx)
   207  	err = w.handleCheck(ctx, metric, err)
   208  	if err != nil {
   209  		w.Errorf("could not handle watcher event: %s", err)
   210  	}
   211  }
   212  
   213  func (w *Watcher) parseJSONCheck(output []byte) (*Metric, error) {
   214  	metric := &Metric{
   215  		Labels:  map[string]string{"format": "choria"},
   216  		Metrics: map[string]float64{},
   217  	}
   218  
   219  	err := json.Unmarshal(output, metric)
   220  	if err != nil {
   221  		return metric, err
   222  	}
   223  
   224  	for k, v := range w.properties.Labels {
   225  		metric.Labels[k] = v
   226  	}
   227  
   228  	return metric, nil
   229  }
   230  
   231  func (w *Watcher) parseNagiosCheck(output []byte) (*Metric, error) {
   232  	metric := &Metric{
   233  		Labels:  map[string]string{"format": "nagios"},
   234  		Metrics: map[string]float64{},
   235  	}
   236  
   237  	perf := util.ParsePerfData(string(output))
   238  	if perf == nil {
   239  		return metric, nil
   240  	}
   241  
   242  	for _, p := range perf {
   243  		metric.Metrics[p.Label] = p.Value
   244  	}
   245  
   246  	return metric, nil
   247  }
   248  
   249  func (w *Watcher) handleCheck(ctx context.Context, output []byte, err error) error {
   250  	var metric *Metric
   251  
   252  	if err == nil {
   253  		if bytes.HasPrefix(bytes.TrimSpace(output), []byte("{")) {
   254  			metric, err = w.parseJSONCheck(output)
   255  			if err != nil {
   256  				w.Errorf("Failed to parse metric output: %v", err)
   257  			}
   258  		} else {
   259  			metric, err = w.parseNagiosCheck(output)
   260  			if err != nil {
   261  				w.Errorf("Failed to parse perf data output: %v", err)
   262  			}
   263  		}
   264  	}
   265  
   266  	if err != nil {
   267  		w.NotifyWatcherState(w.CurrentState())
   268  		return w.FailureTransition()
   269  	}
   270  
   271  	metric.Time = time.Now().Unix()
   272  
   273  	for k, v := range w.properties.Labels {
   274  		metric.Labels[k] = v
   275  	}
   276  
   277  	if !w.properties.SkipPrometheus {
   278  		err = updatePromState(w.machine.TextFileDirectory(), w, w.machine.Name(), w.name, metric)
   279  		if err != nil {
   280  			w.Errorf("Could not update prometheus: %s", err)
   281  		}
   282  	}
   283  
   284  	err = w.publishToGraphite(ctx, metric)
   285  	if err != nil {
   286  		return err
   287  	}
   288  
   289  	err = w.storeMetricAsData(metric)
   290  	if err != nil {
   291  		return err
   292  	}
   293  
   294  	w.mu.Lock()
   295  	w.previousResult = metric
   296  	w.mu.Unlock()
   297  
   298  	w.NotifyWatcherState(w.CurrentState())
   299  
   300  	return nil
   301  }
   302  
   303  func (w *Watcher) storeMetricAsData(metric *Metric) error {
   304  	if !w.properties.StoreAsData {
   305  		return nil
   306  	}
   307  
   308  	w.Debugf("Storing metrics to machine data")
   309  
   310  	return w.machine.DataPut("metric", map[string]any{w.name: metric})
   311  }
   312  
   313  func (w *Watcher) publishToGraphite(ctx context.Context, metric *Metric) error {
   314  	if w.properties.GraphiteHost == "" {
   315  		w.Debugf("Skipping graphite publish without a host defined")
   316  		return nil
   317  	}
   318  
   319  	if w.properties.GraphitePort == "" {
   320  		w.Debugf("Skipping graphite publish without a port defined")
   321  		return nil
   322  	}
   323  
   324  	if len(metric.Metrics) == 0 {
   325  		w.Debugf("Skipping graphite publish without any metrics")
   326  		return nil
   327  	}
   328  
   329  	connCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
   330  	defer cancel()
   331  
   332  	host, err := w.ProcessTemplate(w.properties.GraphiteHost)
   333  	if err != nil {
   334  		return err
   335  	}
   336  	portString, err := w.ProcessTemplate(w.properties.GraphitePort)
   337  	if err != nil {
   338  		return err
   339  	}
   340  	port, err := strconv.Atoi(portString)
   341  	if err != nil {
   342  		return err
   343  	}
   344  
   345  	hostPort := fmt.Sprintf("%s:%d", host, port)
   346  
   347  	w.Debugf("Sending %d metrics to graphite %s", len(metric.Metrics), hostPort)
   348  	var d net.Dialer
   349  	conn, err := d.DialContext(connCtx, "tcp", hostPort)
   350  	if err != nil {
   351  		return err
   352  	}
   353  	defer conn.Close()
   354  
   355  	// copy it so we can add stuff to it without impacting other parts
   356  	// TODO: use maps.Copy() later
   357  	m := make(map[string]float64)
   358  	for k, v := range metric.Metrics {
   359  		m[k] = v
   360  	}
   361  	m["runtime"] = w.previousRunTime.Seconds()
   362  
   363  	for k, v := range m {
   364  		prefix, err := w.ProcessTemplate(w.properties.GraphitePrefix)
   365  		if err != nil {
   366  			return err
   367  		}
   368  
   369  		name := fmt.Sprintf("%s.%s", prefix, k)
   370  		_, err = conn.Write([]byte(fmt.Sprintf("%s %f %d\n", name, v, metric.Time)))
   371  		if err != nil {
   372  			return err
   373  		}
   374  	}
   375  
   376  	return nil
   377  }
   378  
   379  func (w *Watcher) CurrentState() any {
   380  	w.mu.Lock()
   381  	defer w.mu.Unlock()
   382  
   383  	var res Metric
   384  	if w.previousResult == nil {
   385  		res = Metric{
   386  			Labels:  make(map[string]string),
   387  			Metrics: make(map[string]float64),
   388  		}
   389  	} else {
   390  		res = *w.previousResult
   391  	}
   392  
   393  	res.Metrics["choria_runtime_seconds"] = w.previousRunTime.Seconds()
   394  
   395  	s := &StateNotification{
   396  		Event:   event.New(w.name, wtype, version, w.machine),
   397  		Metrics: res,
   398  	}
   399  
   400  	return s
   401  }
   402  
   403  func (w *Watcher) validate() error {
   404  	if w.properties.Command == "" {
   405  		return fmt.Errorf("command is required")
   406  	}
   407  
   408  	if w.properties.Interval < time.Second {
   409  		w.properties.Interval = time.Second
   410  	}
   411  
   412  	return nil
   413  }
   414  
   415  func (w *Watcher) setProperties(props map[string]any) error {
   416  	if w.properties == nil {
   417  		w.properties = &properties{
   418  			Labels: make(map[string]string),
   419  		}
   420  	}
   421  
   422  	err := util.ParseMapStructure(props, w.properties)
   423  	if err != nil {
   424  		return err
   425  	}
   426  
   427  	return w.validate()
   428  }