github.com/choria-io/go-choria@v0.28.1-0.20240416190746-b3bf9c7d5a45/aagent/watchers/metricwatcher/metric.go (about) 1 // Copyright (c) 2020-2024, R.I. Pienaar and the Choria Project contributors 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 5 package metricwatcher 6 7 import ( 8 "bytes" 9 "context" 10 "encoding/json" 11 "fmt" 12 "math/rand" 13 "net" 14 "os" 15 "os/exec" 16 "strconv" 17 "strings" 18 "sync" 19 "time" 20 21 "github.com/choria-io/go-choria/aagent/model" 22 "github.com/google/shlex" 23 24 "github.com/choria-io/go-choria/aagent/util" 25 "github.com/choria-io/go-choria/aagent/watchers/event" 26 "github.com/choria-io/go-choria/aagent/watchers/watcher" 27 ) 28 29 const ( 30 wtype = "metric" 31 version = "v1" 32 ) 33 34 type Metric struct { 35 Labels map[string]string `json:"labels"` 36 Metrics map[string]float64 `json:"metrics"` 37 Time int64 `json:"time"` 38 name string 39 machine string 40 seen int 41 } 42 43 type properties struct { 44 Command string 45 Interval time.Duration 46 Labels map[string]string 47 SkipPrometheus bool `mapstructure:"skip_prometheus"` 48 StoreAsData bool `mapstructure:"store"` 49 GraphiteHost string `mapstructure:"graphite_host"` 50 GraphitePort string `mapstructure:"graphite_port"` 51 GraphitePrefix string `mapstructure:"graphite_prefix"` 52 } 53 54 type Watcher struct { 55 *watcher.Watcher 56 57 name string 58 machine model.Machine 59 previousRunTime time.Duration 60 previousResult *Metric 61 properties *properties 62 63 watching bool 64 mu *sync.Mutex 65 } 66 67 func New(machine model.Machine, name string, states []string, failEvent string, successEvent string, interval string, ai time.Duration, rawprops map[string]any) (any, error) { 68 var err error 69 70 mw := &Watcher{ 71 name: name, 72 machine: machine, 73 mu: &sync.Mutex{}, 74 } 75 76 mw.Watcher, err = watcher.NewWatcher(name, wtype, ai, states, machine, failEvent, successEvent) 77 if err != nil { 78 return nil, err 79 } 80 81 err = mw.setProperties(rawprops) 82 if err != nil { 83 return nil, fmt.Errorf("could not set properties: %s", err) 84 } 85 86 if mw.properties.GraphitePrefix == "" { 87 mw.properties.GraphitePrefix = fmt.Sprintf("choria.%s", strings.ReplaceAll(name, " ", "-")) 88 } 89 90 if !mw.properties.SkipPrometheus { 91 savePromState(machine.TextFileDirectory(), mw) 92 } 93 94 return mw, nil 95 } 96 97 func (w *Watcher) Delete() { 98 if !w.properties.SkipPrometheus { 99 err := deletePromState(w.machine.TextFileDirectory(), w, w.machine.Name(), w.name) 100 if err != nil { 101 w.Errorf("could not delete from prometheus: %s", err) 102 } 103 } 104 } 105 106 func (w *Watcher) Run(ctx context.Context, wg *sync.WaitGroup) { 107 defer wg.Done() 108 109 w.Infof("metric watcher for %s starting", w.properties.Command) 110 111 splay := time.Duration(rand.Intn(int(w.properties.Interval.Seconds()))) * time.Second 112 w.Infof("Splaying first check by %v", splay) 113 114 select { 115 case <-time.NewTimer(splay).C: 116 w.performWatch(ctx) 117 case <-ctx.Done(): 118 return 119 } 120 121 tick := time.NewTicker(w.properties.Interval) 122 123 for { 124 select { 125 case <-tick.C: 126 w.performWatch(ctx) 127 128 case <-w.StateChangeC(): 129 w.performWatch(ctx) 130 131 case <-ctx.Done(): 132 w.Infof("Stopping on context interrupt") 133 tick.Stop() 134 return 135 } 136 } 137 } 138 139 func (w *Watcher) startWatching() { 140 w.mu.Lock() 141 w.watching = true 142 w.mu.Unlock() 143 } 144 145 func (w *Watcher) isWatching() bool { 146 w.mu.Lock() 147 defer w.mu.Unlock() 148 149 return w.watching 150 } 151 152 func (w *Watcher) stopWatching() { 153 w.mu.Lock() 154 w.watching = false 155 w.mu.Unlock() 156 } 157 158 func (w *Watcher) watch(ctx context.Context) (state []byte, err error) { 159 if !w.ShouldWatch() { 160 return nil, nil 161 } 162 163 w.startWatching() 164 defer w.stopWatching() 165 166 start := time.Now() 167 defer func() { 168 w.mu.Lock() 169 w.previousRunTime = time.Since(start) 170 w.mu.Unlock() 171 }() 172 173 w.Infof("Running %s", w.properties.Command) 174 175 timeoutCtx, cancel := context.WithTimeout(ctx, time.Second) 176 defer cancel() 177 178 splitcmd, err := shlex.Split(w.properties.Command) 179 if err != nil { 180 w.Errorf("Metric watcher %s failed: %s", w.properties.Command, err) 181 return nil, err 182 } 183 184 cmd := exec.CommandContext(timeoutCtx, splitcmd[0], splitcmd[1:]...) 185 cmd.Env = append(cmd.Env, fmt.Sprintf("MACHINE_WATCHER_NAME=%s", w.name)) 186 cmd.Env = append(cmd.Env, fmt.Sprintf("MACHINE_NAME=%s", w.machine.Name())) 187 cmd.Env = append(cmd.Env, fmt.Sprintf("PATH=%s%s%s", os.Getenv("PATH"), string(os.PathListSeparator), w.machine.Directory())) 188 cmd.Dir = w.machine.Directory() 189 190 output, err := cmd.CombinedOutput() 191 if err != nil { 192 w.Errorf("Metric watcher %s failed: %s", w.properties.Command, err) 193 return nil, err 194 } 195 196 w.Debugf("Output from %s: %s", w.properties.Command, output) 197 198 return output, nil 199 } 200 201 func (w *Watcher) performWatch(ctx context.Context) { 202 if w.isWatching() { 203 return 204 } 205 206 metric, err := w.watch(ctx) 207 err = w.handleCheck(ctx, metric, err) 208 if err != nil { 209 w.Errorf("could not handle watcher event: %s", err) 210 } 211 } 212 213 func (w *Watcher) parseJSONCheck(output []byte) (*Metric, error) { 214 metric := &Metric{ 215 Labels: map[string]string{"format": "choria"}, 216 Metrics: map[string]float64{}, 217 } 218 219 err := json.Unmarshal(output, metric) 220 if err != nil { 221 return metric, err 222 } 223 224 for k, v := range w.properties.Labels { 225 metric.Labels[k] = v 226 } 227 228 return metric, nil 229 } 230 231 func (w *Watcher) parseNagiosCheck(output []byte) (*Metric, error) { 232 metric := &Metric{ 233 Labels: map[string]string{"format": "nagios"}, 234 Metrics: map[string]float64{}, 235 } 236 237 perf := util.ParsePerfData(string(output)) 238 if perf == nil { 239 return metric, nil 240 } 241 242 for _, p := range perf { 243 metric.Metrics[p.Label] = p.Value 244 } 245 246 return metric, nil 247 } 248 249 func (w *Watcher) handleCheck(ctx context.Context, output []byte, err error) error { 250 var metric *Metric 251 252 if err == nil { 253 if bytes.HasPrefix(bytes.TrimSpace(output), []byte("{")) { 254 metric, err = w.parseJSONCheck(output) 255 if err != nil { 256 w.Errorf("Failed to parse metric output: %v", err) 257 } 258 } else { 259 metric, err = w.parseNagiosCheck(output) 260 if err != nil { 261 w.Errorf("Failed to parse perf data output: %v", err) 262 } 263 } 264 } 265 266 if err != nil { 267 w.NotifyWatcherState(w.CurrentState()) 268 return w.FailureTransition() 269 } 270 271 metric.Time = time.Now().Unix() 272 273 for k, v := range w.properties.Labels { 274 metric.Labels[k] = v 275 } 276 277 if !w.properties.SkipPrometheus { 278 err = updatePromState(w.machine.TextFileDirectory(), w, w.machine.Name(), w.name, metric) 279 if err != nil { 280 w.Errorf("Could not update prometheus: %s", err) 281 } 282 } 283 284 err = w.publishToGraphite(ctx, metric) 285 if err != nil { 286 return err 287 } 288 289 err = w.storeMetricAsData(metric) 290 if err != nil { 291 return err 292 } 293 294 w.mu.Lock() 295 w.previousResult = metric 296 w.mu.Unlock() 297 298 w.NotifyWatcherState(w.CurrentState()) 299 300 return nil 301 } 302 303 func (w *Watcher) storeMetricAsData(metric *Metric) error { 304 if !w.properties.StoreAsData { 305 return nil 306 } 307 308 w.Debugf("Storing metrics to machine data") 309 310 return w.machine.DataPut("metric", map[string]any{w.name: metric}) 311 } 312 313 func (w *Watcher) publishToGraphite(ctx context.Context, metric *Metric) error { 314 if w.properties.GraphiteHost == "" { 315 w.Debugf("Skipping graphite publish without a host defined") 316 return nil 317 } 318 319 if w.properties.GraphitePort == "" { 320 w.Debugf("Skipping graphite publish without a port defined") 321 return nil 322 } 323 324 if len(metric.Metrics) == 0 { 325 w.Debugf("Skipping graphite publish without any metrics") 326 return nil 327 } 328 329 connCtx, cancel := context.WithTimeout(ctx, 2*time.Second) 330 defer cancel() 331 332 host, err := w.ProcessTemplate(w.properties.GraphiteHost) 333 if err != nil { 334 return err 335 } 336 portString, err := w.ProcessTemplate(w.properties.GraphitePort) 337 if err != nil { 338 return err 339 } 340 port, err := strconv.Atoi(portString) 341 if err != nil { 342 return err 343 } 344 345 hostPort := fmt.Sprintf("%s:%d", host, port) 346 347 w.Debugf("Sending %d metrics to graphite %s", len(metric.Metrics), hostPort) 348 var d net.Dialer 349 conn, err := d.DialContext(connCtx, "tcp", hostPort) 350 if err != nil { 351 return err 352 } 353 defer conn.Close() 354 355 // copy it so we can add stuff to it without impacting other parts 356 // TODO: use maps.Copy() later 357 m := make(map[string]float64) 358 for k, v := range metric.Metrics { 359 m[k] = v 360 } 361 m["runtime"] = w.previousRunTime.Seconds() 362 363 for k, v := range m { 364 prefix, err := w.ProcessTemplate(w.properties.GraphitePrefix) 365 if err != nil { 366 return err 367 } 368 369 name := fmt.Sprintf("%s.%s", prefix, k) 370 _, err = conn.Write([]byte(fmt.Sprintf("%s %f %d\n", name, v, metric.Time))) 371 if err != nil { 372 return err 373 } 374 } 375 376 return nil 377 } 378 379 func (w *Watcher) CurrentState() any { 380 w.mu.Lock() 381 defer w.mu.Unlock() 382 383 var res Metric 384 if w.previousResult == nil { 385 res = Metric{ 386 Labels: make(map[string]string), 387 Metrics: make(map[string]float64), 388 } 389 } else { 390 res = *w.previousResult 391 } 392 393 res.Metrics["choria_runtime_seconds"] = w.previousRunTime.Seconds() 394 395 s := &StateNotification{ 396 Event: event.New(w.name, wtype, version, w.machine), 397 Metrics: res, 398 } 399 400 return s 401 } 402 403 func (w *Watcher) validate() error { 404 if w.properties.Command == "" { 405 return fmt.Errorf("command is required") 406 } 407 408 if w.properties.Interval < time.Second { 409 w.properties.Interval = time.Second 410 } 411 412 return nil 413 } 414 415 func (w *Watcher) setProperties(props map[string]any) error { 416 if w.properties == nil { 417 w.properties = &properties{ 418 Labels: make(map[string]string), 419 } 420 } 421 422 err := util.ParseMapStructure(props, w.properties) 423 if err != nil { 424 return err 425 } 426 427 return w.validate() 428 }