github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/lifecycle/tally/tally.go (about)

     1  // Copyright (c) 2020-2021, R.I. Pienaar and the Choria Project contributors
     2  //
     3  // SPDX-License-Identifier: Apache-2.0
     4  
     5  package tally
     6  
     7  import (
     8  	"context"
     9  	"encoding/json"
    10  	"fmt"
    11  	"github.com/choria-io/go-choria/aagent/watchers/execwatcher"
    12  	"github.com/choria-io/go-choria/backoff"
    13  	election "github.com/choria-io/go-choria/providers/election/streams"
    14  	"github.com/nats-io/nats.go"
    15  	"os"
    16  	"strconv"
    17  	"strings"
    18  	"sync"
    19  	"sync/atomic"
    20  	"time"
    21  
    22  	"github.com/choria-io/go-choria/aagent/machine"
    23  	"github.com/choria-io/go-choria/inter"
    24  	"github.com/choria-io/go-choria/internal/util"
    25  	"github.com/choria-io/go-choria/lifecycle"
    26  	cloudevents "github.com/cloudevents/sdk-go/v2"
    27  	"github.com/prometheus/client_golang/prometheus"
    28  )
    29  
    30  // Connector is a connection to the middleware
    31  type Connector interface {
    32  	QueueSubscribe(ctx context.Context, name string, subject string, group string, output chan inter.ConnectorMessage) error
    33  	Nats() *nats.Conn
    34  }
    35  
    36  // Recorder listens for alive events and records the versions and expose the results to Prometheus
    37  type Recorder struct {
    38  	sync.Mutex
    39  
    40  	options  *options
    41  	active   int32
    42  	observed map[string]*observations
    43  
    44  	// lifecycle
    45  	okEvents       *prometheus.CounterVec
    46  	badEvents      *prometheus.CounterVec
    47  	versionsTally  *prometheus.GaugeVec
    48  	processTime    *prometheus.SummaryVec
    49  	eventTypes     *prometheus.CounterVec
    50  	nodesExpired   *prometheus.CounterVec
    51  	governorEvents *prometheus.CounterVec
    52  
    53  	// transitions
    54  	transitionEvent *prometheus.CounterVec
    55  
    56  	// exec watch events
    57  	execWatchSuccess *prometheus.CounterVec
    58  	execWatchFail    *prometheus.CounterVec
    59  	execWatchRuntime *prometheus.SummaryVec
    60  }
    61  
    62  type observations struct {
    63  	component string
    64  	hosts     map[string]*observation
    65  }
    66  
    67  type observation struct {
    68  	ts        time.Time
    69  	component string
    70  	version   string
    71  }
    72  
    73  // New creates a new Recorder
    74  func New(opts ...Option) (recorder *Recorder, err error) {
    75  	recorder = &Recorder{
    76  		options:  &options{},
    77  		observed: make(map[string]*observations),
    78  	}
    79  
    80  	for _, opt := range opts {
    81  		opt(recorder.options)
    82  	}
    83  
    84  	err = recorder.options.Validate()
    85  	if err != nil {
    86  		return nil, fmt.Errorf("invalid options supplied: %s", err)
    87  	}
    88  
    89  	if recorder.options.Election == "" {
    90  		recorder.active = 1
    91  	}
    92  
    93  	recorder.createStats()
    94  
    95  	return recorder, nil
    96  }
    97  
    98  func (r *Recorder) wonCb() {
    99  	atomic.StoreInt32(&r.active, 1)
   100  	r.options.Log.Infof("Became leader")
   101  }
   102  
   103  func (r *Recorder) lostCb() {
   104  	atomic.StoreInt32(&r.active, 0)
   105  	r.options.Log.Infof("Lost leadership")
   106  }
   107  
   108  func (r *Recorder) activeLabel() string {
   109  	return strconv.Itoa(int(atomic.LoadInt32(&r.active)))
   110  }
   111  
   112  func (r *Recorder) processAlive(e lifecycle.Event) error {
   113  	alive := e.(*lifecycle.AliveEvent)
   114  
   115  	r.Lock()
   116  	defer r.Unlock()
   117  
   118  	hname := alive.Identity()
   119  
   120  	cobs, ok := r.observed[alive.Component()]
   121  	if !ok {
   122  		cobs = &observations{
   123  			component: alive.Component(),
   124  			hosts:     make(map[string]*observation),
   125  		}
   126  		r.observed[alive.Component()] = cobs
   127  	}
   128  
   129  	obs, ok := cobs.hosts[hname]
   130  	if !ok {
   131  		cobs.hosts[hname] = &observation{
   132  			version:   alive.Version,
   133  			component: alive.Component(),
   134  			ts:        time.Now(),
   135  		}
   136  
   137  		r.versionsTally.WithLabelValues(alive.Component(), alive.Version, r.activeLabel()).Inc()
   138  
   139  		return nil
   140  	}
   141  
   142  	cobs.hosts[hname].ts = time.Now()
   143  
   144  	if obs.version != alive.Version {
   145  		r.versionsTally.WithLabelValues(alive.Component(), obs.version, r.activeLabel()).Dec()
   146  		obs.version = alive.Version
   147  		r.versionsTally.WithLabelValues(alive.Component(), obs.version, r.activeLabel()).Inc()
   148  	}
   149  
   150  	return nil
   151  }
   152  
   153  func (r *Recorder) processStartup(e lifecycle.Event) error {
   154  	startup := e.(*lifecycle.StartupEvent)
   155  
   156  	r.Lock()
   157  	defer r.Unlock()
   158  
   159  	hname := startup.Identity()
   160  	cobs, ok := r.observed[startup.Component()]
   161  	if !ok {
   162  		cobs = &observations{
   163  			component: startup.Component(),
   164  			hosts:     make(map[string]*observation),
   165  		}
   166  		r.observed[startup.Component()] = cobs
   167  	}
   168  
   169  	obs, ok := cobs.hosts[hname]
   170  	if ok {
   171  		r.versionsTally.WithLabelValues(startup.Component(), obs.version, r.activeLabel()).Dec()
   172  	}
   173  
   174  	cobs.hosts[hname] = &observation{
   175  		ts:        time.Now(),
   176  		version:   startup.Version,
   177  		component: startup.Component(),
   178  	}
   179  
   180  	r.versionsTally.WithLabelValues(startup.Component(), startup.Version, r.activeLabel()).Inc()
   181  
   182  	return nil
   183  }
   184  
   185  func (r *Recorder) processShutdown(e lifecycle.Event) error {
   186  	shutdown := e.(*lifecycle.ShutdownEvent)
   187  
   188  	r.Lock()
   189  	defer r.Unlock()
   190  
   191  	cobs, ok := r.observed[shutdown.Component()]
   192  	if !ok {
   193  		return nil
   194  	}
   195  
   196  	hname := shutdown.Identity()
   197  	obs, ok := cobs.hosts[hname]
   198  	if !ok {
   199  		return nil
   200  	}
   201  
   202  	delete(cobs.hosts, hname)
   203  	r.versionsTally.WithLabelValues(shutdown.Component(), obs.version, r.activeLabel()).Dec()
   204  
   205  	return nil
   206  }
   207  
   208  func (r *Recorder) processGovernor(e lifecycle.Event) error {
   209  	governor := e.(*lifecycle.GovernorEvent)
   210  
   211  	r.governorEvents.WithLabelValues(governor.Component(), governor.Governor, string(governor.EventType), r.activeLabel()).Inc()
   212  
   213  	return nil
   214  }
   215  
   216  func (r *Recorder) process(e lifecycle.Event) (err error) {
   217  	r.options.Log.Debugf("Processing %s event from %s %s", e.TypeString(), e.Component(), e.Identity())
   218  
   219  	timer := r.processTime.WithLabelValues(e.Component(), r.activeLabel())
   220  	obs := prometheus.NewTimer(timer)
   221  	defer obs.ObserveDuration()
   222  
   223  	r.eventTypes.WithLabelValues(e.Component(), e.TypeString(), r.activeLabel()).Inc()
   224  
   225  	switch e.Type() {
   226  	case lifecycle.Alive:
   227  		err = r.processAlive(e)
   228  
   229  	case lifecycle.Startup:
   230  		err = r.processStartup(e)
   231  
   232  	case lifecycle.Shutdown:
   233  		err = r.processShutdown(e)
   234  
   235  	case lifecycle.Governor:
   236  		err = r.processGovernor(e)
   237  	}
   238  
   239  	if err == nil {
   240  		r.okEvents.WithLabelValues(e.Component(), r.activeLabel()).Inc()
   241  	} else {
   242  		r.badEvents.WithLabelValues(e.Component(), r.activeLabel()).Inc()
   243  	}
   244  
   245  	return err
   246  }
   247  
   248  func (r *Recorder) maintenance() {
   249  	r.Lock()
   250  	defer r.Unlock()
   251  
   252  	if len(r.observed) == 0 {
   253  		return
   254  	}
   255  
   256  	oldest := time.Now().Add(-80 * time.Minute)
   257  
   258  	for component := range r.observed {
   259  		older := map[string]*observation{}
   260  
   261  		for host, obs := range r.observed[component].hosts {
   262  			if obs.ts.Before(oldest) {
   263  				r.versionsTally.WithLabelValues(obs.component, obs.version, r.activeLabel()).Dec()
   264  				older[host] = obs
   265  			}
   266  		}
   267  
   268  		for host, obs := range older {
   269  			r.options.Log.Debugf("Removing node %s, last seen %v", host, obs.ts)
   270  
   271  			delete(r.observed[component].hosts, host)
   272  			r.nodesExpired.WithLabelValues(obs.component, r.activeLabel()).Inc()
   273  		}
   274  
   275  		if len(older) > 0 {
   276  			r.options.Log.Infof("Removed %d '%s' hosts that have not been seen in over an hour", len(older), component)
   277  		}
   278  	}
   279  }
   280  
   281  func (r *Recorder) processStateTransition(m inter.ConnectorMessage) (err error) {
   282  	ce := cloudevents.NewEvent("1.0")
   283  	event := &machine.TransitionNotification{}
   284  
   285  	err = json.Unmarshal(m.Data(), &ce)
   286  	if err != nil {
   287  		return fmt.Errorf("could not parse cloudevent: %s", err)
   288  	}
   289  
   290  	err = ce.DataAs(event)
   291  	if err != nil {
   292  		return fmt.Errorf("could not parse transition event: %s", err)
   293  	}
   294  
   295  	if event.Protocol != "io.choria.machine.v1.transition" {
   296  		return fmt.Errorf("unknown notification protocol %s", event.Protocol)
   297  	}
   298  
   299  	r.transitionEvent.WithLabelValues(event.Machine, event.Version, event.Transition, event.FromState, event.ToState, r.activeLabel()).Inc()
   300  
   301  	return nil
   302  }
   303  
   304  func (r *Recorder) componentFromSubject(s string) string {
   305  	parts := strings.Split(s, ".")
   306  	if len(parts) == 0 {
   307  		return "unknown"
   308  	}
   309  
   310  	return parts[len(parts)-1]
   311  }
   312  
   313  // Run starts listening for events and record statistics about it in prometheus
   314  func (r *Recorder) Run(ctx context.Context) (err error) {
   315  	lifeEvents := make(chan inter.ConnectorMessage, 100)
   316  	machineTransitions := make(chan inter.ConnectorMessage, 100)
   317  	execWatcherStates := make(chan inter.ConnectorMessage, 100)
   318  
   319  	maintSched := time.NewTicker(time.Minute)
   320  	subid := util.UniqueID()
   321  
   322  	if r.options.Component == "" {
   323  		r.options.Log.Warn("Component was not specified, disabling lifecycle tallies")
   324  	} else {
   325  		err = r.options.Connector.QueueSubscribe(ctx, fmt.Sprintf("tally_%s_%s", r.options.Component, subid), fmt.Sprintf("choria.lifecycle.event.*.%s", r.options.Component), "", lifeEvents)
   326  		if err != nil {
   327  			return fmt.Errorf("could not subscribe to lifecycle events: %s", err)
   328  		}
   329  	}
   330  
   331  	err = r.options.Connector.QueueSubscribe(ctx, fmt.Sprintf("tally_transitions_%s", subid), "choria.machine.transition", "", machineTransitions)
   332  	if err != nil {
   333  		return fmt.Errorf("could not subscribe to machine transition events: %s", err)
   334  	}
   335  
   336  	err = r.options.Connector.QueueSubscribe(ctx, "tally_exec_watcher_states", "choria.machine.watcher.exec.state", "", execWatcherStates)
   337  	if err != nil {
   338  		return err
   339  	}
   340  
   341  	if r.options.Election != "" {
   342  		r.options.Log.Warnf("Starting leader election in campaign %s", r.options.Election)
   343  
   344  		name, err := os.Hostname()
   345  		if err != nil {
   346  			return err
   347  		}
   348  
   349  		js, err := r.options.Connector.Nats().JetStream()
   350  		if err != nil {
   351  			return err
   352  		}
   353  
   354  		kv, err := js.KeyValue("CHORIA_LEADER_ELECTION")
   355  		if err != nil {
   356  			return fmt.Errorf("cannot access KV Bucket CHORIA_LEADER_ELECTION: %v", err)
   357  		}
   358  
   359  		e, err := election.NewElection(name, r.options.Election, kv, election.WithBackoff(backoff.FiveSec), election.OnWon(r.wonCb), election.OnLost(r.lostCb))
   360  		if err != nil {
   361  			return err
   362  		}
   363  
   364  		go e.Start(ctx)
   365  	}
   366  
   367  	for {
   368  		select {
   369  		case e := <-lifeEvents:
   370  			event, err := lifecycle.NewFromJSON(e.Data())
   371  			if err != nil {
   372  				r.options.Log.Errorf("could not process event: %s", err)
   373  				r.badEvents.WithLabelValues(r.componentFromSubject(e.Subject()), r.activeLabel()).Inc()
   374  				continue
   375  			}
   376  
   377  			err = r.process(event)
   378  			if err != nil {
   379  				r.options.Log.Errorf("could not process event from %s: %s", event.Identity(), err)
   380  			}
   381  
   382  		case t := <-machineTransitions:
   383  			err = r.processStateTransition(t)
   384  			if err != nil {
   385  				r.options.Log.Errorf("could not process transition event: %s", err)
   386  				r.badEvents.WithLabelValues("transition", r.activeLabel()).Inc()
   387  			}
   388  
   389  		case t := <-execWatcherStates:
   390  			err = r.processExecWatcherState(t)
   391  			if err != nil {
   392  				r.options.Log.Errorf("could not process exec watcher event: %s", err)
   393  				r.badEvents.WithLabelValues("exec_watcher", r.activeLabel()).Inc()
   394  			}
   395  
   396  		case <-maintSched.C:
   397  			r.maintenance()
   398  
   399  		case <-ctx.Done():
   400  			return nil
   401  		}
   402  	}
   403  }
   404  
   405  func (r *Recorder) processExecWatcherState(m inter.ConnectorMessage) error {
   406  	ce := cloudevents.NewEvent("1.0")
   407  	event := &execwatcher.StateNotification{}
   408  
   409  	err := json.Unmarshal(m.Data(), &ce)
   410  	if err != nil {
   411  		return fmt.Errorf("could not parse cloudevent: %w", err)
   412  	}
   413  
   414  	err = ce.DataAs(event)
   415  	if err != nil {
   416  		return fmt.Errorf("could not parse state notification: %w", err)
   417  	}
   418  
   419  	switch event.PreviousOutcome {
   420  	case "success":
   421  		r.execWatchSuccess.WithLabelValues(event.Machine, event.Version, event.Name, r.activeLabel()).Inc()
   422  	case "error":
   423  		r.execWatchFail.WithLabelValues(event.Machine, event.Version, event.Name, r.activeLabel()).Inc()
   424  	default:
   425  		return nil
   426  	}
   427  
   428  	r.execWatchRuntime.WithLabelValues(event.Machine, event.Version, event.Name, r.activeLabel()).Observe(time.Duration(event.PreviousRunTime).Seconds())
   429  
   430  	return nil
   431  }