github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/lifecycle/tally/tally.go (about) 1 // Copyright (c) 2020-2021, R.I. Pienaar and the Choria Project contributors 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 5 package tally 6 7 import ( 8 "context" 9 "encoding/json" 10 "fmt" 11 "github.com/choria-io/go-choria/aagent/watchers/execwatcher" 12 "github.com/choria-io/go-choria/backoff" 13 election "github.com/choria-io/go-choria/providers/election/streams" 14 "github.com/nats-io/nats.go" 15 "os" 16 "strconv" 17 "strings" 18 "sync" 19 "sync/atomic" 20 "time" 21 22 "github.com/choria-io/go-choria/aagent/machine" 23 "github.com/choria-io/go-choria/inter" 24 "github.com/choria-io/go-choria/internal/util" 25 "github.com/choria-io/go-choria/lifecycle" 26 cloudevents "github.com/cloudevents/sdk-go/v2" 27 "github.com/prometheus/client_golang/prometheus" 28 ) 29 30 // Connector is a connection to the middleware 31 type Connector interface { 32 QueueSubscribe(ctx context.Context, name string, subject string, group string, output chan inter.ConnectorMessage) error 33 Nats() *nats.Conn 34 } 35 36 // Recorder listens for alive events and records the versions and expose the results to Prometheus 37 type Recorder struct { 38 sync.Mutex 39 40 options *options 41 active int32 42 observed map[string]*observations 43 44 // lifecycle 45 okEvents *prometheus.CounterVec 46 badEvents *prometheus.CounterVec 47 versionsTally *prometheus.GaugeVec 48 processTime *prometheus.SummaryVec 49 eventTypes *prometheus.CounterVec 50 nodesExpired *prometheus.CounterVec 51 governorEvents *prometheus.CounterVec 52 53 // transitions 54 transitionEvent *prometheus.CounterVec 55 56 // exec watch events 57 execWatchSuccess *prometheus.CounterVec 58 execWatchFail *prometheus.CounterVec 59 execWatchRuntime *prometheus.SummaryVec 60 } 61 62 type observations struct { 63 component string 64 hosts map[string]*observation 65 } 66 67 type observation struct { 68 ts time.Time 69 component string 70 version string 71 } 72 73 // New creates a new Recorder 74 func New(opts ...Option) (recorder *Recorder, err error) { 75 recorder = &Recorder{ 76 options: &options{}, 77 observed: make(map[string]*observations), 78 } 79 80 for _, opt := range opts { 81 opt(recorder.options) 82 } 83 84 err = recorder.options.Validate() 85 if err != nil { 86 return nil, fmt.Errorf("invalid options supplied: %s", err) 87 } 88 89 if recorder.options.Election == "" { 90 recorder.active = 1 91 } 92 93 recorder.createStats() 94 95 return recorder, nil 96 } 97 98 func (r *Recorder) wonCb() { 99 atomic.StoreInt32(&r.active, 1) 100 r.options.Log.Infof("Became leader") 101 } 102 103 func (r *Recorder) lostCb() { 104 atomic.StoreInt32(&r.active, 0) 105 r.options.Log.Infof("Lost leadership") 106 } 107 108 func (r *Recorder) activeLabel() string { 109 return strconv.Itoa(int(atomic.LoadInt32(&r.active))) 110 } 111 112 func (r *Recorder) processAlive(e lifecycle.Event) error { 113 alive := e.(*lifecycle.AliveEvent) 114 115 r.Lock() 116 defer r.Unlock() 117 118 hname := alive.Identity() 119 120 cobs, ok := r.observed[alive.Component()] 121 if !ok { 122 cobs = &observations{ 123 component: alive.Component(), 124 hosts: make(map[string]*observation), 125 } 126 r.observed[alive.Component()] = cobs 127 } 128 129 obs, ok := cobs.hosts[hname] 130 if !ok { 131 cobs.hosts[hname] = &observation{ 132 version: alive.Version, 133 component: alive.Component(), 134 ts: time.Now(), 135 } 136 137 r.versionsTally.WithLabelValues(alive.Component(), alive.Version, r.activeLabel()).Inc() 138 139 return nil 140 } 141 142 cobs.hosts[hname].ts = time.Now() 143 144 if obs.version != alive.Version { 145 r.versionsTally.WithLabelValues(alive.Component(), obs.version, r.activeLabel()).Dec() 146 obs.version = alive.Version 147 r.versionsTally.WithLabelValues(alive.Component(), obs.version, r.activeLabel()).Inc() 148 } 149 150 return nil 151 } 152 153 func (r *Recorder) processStartup(e lifecycle.Event) error { 154 startup := e.(*lifecycle.StartupEvent) 155 156 r.Lock() 157 defer r.Unlock() 158 159 hname := startup.Identity() 160 cobs, ok := r.observed[startup.Component()] 161 if !ok { 162 cobs = &observations{ 163 component: startup.Component(), 164 hosts: make(map[string]*observation), 165 } 166 r.observed[startup.Component()] = cobs 167 } 168 169 obs, ok := cobs.hosts[hname] 170 if ok { 171 r.versionsTally.WithLabelValues(startup.Component(), obs.version, r.activeLabel()).Dec() 172 } 173 174 cobs.hosts[hname] = &observation{ 175 ts: time.Now(), 176 version: startup.Version, 177 component: startup.Component(), 178 } 179 180 r.versionsTally.WithLabelValues(startup.Component(), startup.Version, r.activeLabel()).Inc() 181 182 return nil 183 } 184 185 func (r *Recorder) processShutdown(e lifecycle.Event) error { 186 shutdown := e.(*lifecycle.ShutdownEvent) 187 188 r.Lock() 189 defer r.Unlock() 190 191 cobs, ok := r.observed[shutdown.Component()] 192 if !ok { 193 return nil 194 } 195 196 hname := shutdown.Identity() 197 obs, ok := cobs.hosts[hname] 198 if !ok { 199 return nil 200 } 201 202 delete(cobs.hosts, hname) 203 r.versionsTally.WithLabelValues(shutdown.Component(), obs.version, r.activeLabel()).Dec() 204 205 return nil 206 } 207 208 func (r *Recorder) processGovernor(e lifecycle.Event) error { 209 governor := e.(*lifecycle.GovernorEvent) 210 211 r.governorEvents.WithLabelValues(governor.Component(), governor.Governor, string(governor.EventType), r.activeLabel()).Inc() 212 213 return nil 214 } 215 216 func (r *Recorder) process(e lifecycle.Event) (err error) { 217 r.options.Log.Debugf("Processing %s event from %s %s", e.TypeString(), e.Component(), e.Identity()) 218 219 timer := r.processTime.WithLabelValues(e.Component(), r.activeLabel()) 220 obs := prometheus.NewTimer(timer) 221 defer obs.ObserveDuration() 222 223 r.eventTypes.WithLabelValues(e.Component(), e.TypeString(), r.activeLabel()).Inc() 224 225 switch e.Type() { 226 case lifecycle.Alive: 227 err = r.processAlive(e) 228 229 case lifecycle.Startup: 230 err = r.processStartup(e) 231 232 case lifecycle.Shutdown: 233 err = r.processShutdown(e) 234 235 case lifecycle.Governor: 236 err = r.processGovernor(e) 237 } 238 239 if err == nil { 240 r.okEvents.WithLabelValues(e.Component(), r.activeLabel()).Inc() 241 } else { 242 r.badEvents.WithLabelValues(e.Component(), r.activeLabel()).Inc() 243 } 244 245 return err 246 } 247 248 func (r *Recorder) maintenance() { 249 r.Lock() 250 defer r.Unlock() 251 252 if len(r.observed) == 0 { 253 return 254 } 255 256 oldest := time.Now().Add(-80 * time.Minute) 257 258 for component := range r.observed { 259 older := map[string]*observation{} 260 261 for host, obs := range r.observed[component].hosts { 262 if obs.ts.Before(oldest) { 263 r.versionsTally.WithLabelValues(obs.component, obs.version, r.activeLabel()).Dec() 264 older[host] = obs 265 } 266 } 267 268 for host, obs := range older { 269 r.options.Log.Debugf("Removing node %s, last seen %v", host, obs.ts) 270 271 delete(r.observed[component].hosts, host) 272 r.nodesExpired.WithLabelValues(obs.component, r.activeLabel()).Inc() 273 } 274 275 if len(older) > 0 { 276 r.options.Log.Infof("Removed %d '%s' hosts that have not been seen in over an hour", len(older), component) 277 } 278 } 279 } 280 281 func (r *Recorder) processStateTransition(m inter.ConnectorMessage) (err error) { 282 ce := cloudevents.NewEvent("1.0") 283 event := &machine.TransitionNotification{} 284 285 err = json.Unmarshal(m.Data(), &ce) 286 if err != nil { 287 return fmt.Errorf("could not parse cloudevent: %s", err) 288 } 289 290 err = ce.DataAs(event) 291 if err != nil { 292 return fmt.Errorf("could not parse transition event: %s", err) 293 } 294 295 if event.Protocol != "io.choria.machine.v1.transition" { 296 return fmt.Errorf("unknown notification protocol %s", event.Protocol) 297 } 298 299 r.transitionEvent.WithLabelValues(event.Machine, event.Version, event.Transition, event.FromState, event.ToState, r.activeLabel()).Inc() 300 301 return nil 302 } 303 304 func (r *Recorder) componentFromSubject(s string) string { 305 parts := strings.Split(s, ".") 306 if len(parts) == 0 { 307 return "unknown" 308 } 309 310 return parts[len(parts)-1] 311 } 312 313 // Run starts listening for events and record statistics about it in prometheus 314 func (r *Recorder) Run(ctx context.Context) (err error) { 315 lifeEvents := make(chan inter.ConnectorMessage, 100) 316 machineTransitions := make(chan inter.ConnectorMessage, 100) 317 execWatcherStates := make(chan inter.ConnectorMessage, 100) 318 319 maintSched := time.NewTicker(time.Minute) 320 subid := util.UniqueID() 321 322 if r.options.Component == "" { 323 r.options.Log.Warn("Component was not specified, disabling lifecycle tallies") 324 } else { 325 err = r.options.Connector.QueueSubscribe(ctx, fmt.Sprintf("tally_%s_%s", r.options.Component, subid), fmt.Sprintf("choria.lifecycle.event.*.%s", r.options.Component), "", lifeEvents) 326 if err != nil { 327 return fmt.Errorf("could not subscribe to lifecycle events: %s", err) 328 } 329 } 330 331 err = r.options.Connector.QueueSubscribe(ctx, fmt.Sprintf("tally_transitions_%s", subid), "choria.machine.transition", "", machineTransitions) 332 if err != nil { 333 return fmt.Errorf("could not subscribe to machine transition events: %s", err) 334 } 335 336 err = r.options.Connector.QueueSubscribe(ctx, "tally_exec_watcher_states", "choria.machine.watcher.exec.state", "", execWatcherStates) 337 if err != nil { 338 return err 339 } 340 341 if r.options.Election != "" { 342 r.options.Log.Warnf("Starting leader election in campaign %s", r.options.Election) 343 344 name, err := os.Hostname() 345 if err != nil { 346 return err 347 } 348 349 js, err := r.options.Connector.Nats().JetStream() 350 if err != nil { 351 return err 352 } 353 354 kv, err := js.KeyValue("CHORIA_LEADER_ELECTION") 355 if err != nil { 356 return fmt.Errorf("cannot access KV Bucket CHORIA_LEADER_ELECTION: %v", err) 357 } 358 359 e, err := election.NewElection(name, r.options.Election, kv, election.WithBackoff(backoff.FiveSec), election.OnWon(r.wonCb), election.OnLost(r.lostCb)) 360 if err != nil { 361 return err 362 } 363 364 go e.Start(ctx) 365 } 366 367 for { 368 select { 369 case e := <-lifeEvents: 370 event, err := lifecycle.NewFromJSON(e.Data()) 371 if err != nil { 372 r.options.Log.Errorf("could not process event: %s", err) 373 r.badEvents.WithLabelValues(r.componentFromSubject(e.Subject()), r.activeLabel()).Inc() 374 continue 375 } 376 377 err = r.process(event) 378 if err != nil { 379 r.options.Log.Errorf("could not process event from %s: %s", event.Identity(), err) 380 } 381 382 case t := <-machineTransitions: 383 err = r.processStateTransition(t) 384 if err != nil { 385 r.options.Log.Errorf("could not process transition event: %s", err) 386 r.badEvents.WithLabelValues("transition", r.activeLabel()).Inc() 387 } 388 389 case t := <-execWatcherStates: 390 err = r.processExecWatcherState(t) 391 if err != nil { 392 r.options.Log.Errorf("could not process exec watcher event: %s", err) 393 r.badEvents.WithLabelValues("exec_watcher", r.activeLabel()).Inc() 394 } 395 396 case <-maintSched.C: 397 r.maintenance() 398 399 case <-ctx.Done(): 400 return nil 401 } 402 } 403 } 404 405 func (r *Recorder) processExecWatcherState(m inter.ConnectorMessage) error { 406 ce := cloudevents.NewEvent("1.0") 407 event := &execwatcher.StateNotification{} 408 409 err := json.Unmarshal(m.Data(), &ce) 410 if err != nil { 411 return fmt.Errorf("could not parse cloudevent: %w", err) 412 } 413 414 err = ce.DataAs(event) 415 if err != nil { 416 return fmt.Errorf("could not parse state notification: %w", err) 417 } 418 419 switch event.PreviousOutcome { 420 case "success": 421 r.execWatchSuccess.WithLabelValues(event.Machine, event.Version, event.Name, r.activeLabel()).Inc() 422 case "error": 423 r.execWatchFail.WithLabelValues(event.Machine, event.Version, event.Name, r.activeLabel()).Inc() 424 default: 425 return nil 426 } 427 428 r.execWatchRuntime.WithLabelValues(event.Machine, event.Version, event.Name, r.activeLabel()).Observe(time.Duration(event.PreviousRunTime).Seconds()) 429 430 return nil 431 }