github.com/wfusion/gofusion@v1.1.14/common/infra/metrics/prometheus/prometheus.go (about) 1 //go:build go1.9 2 // +build go1.9 3 4 package prometheus 5 6 import ( 7 "context" 8 "fmt" 9 "log" 10 "strings" 11 "sync" 12 "time" 13 14 "github.com/Rican7/retry" 15 "github.com/Rican7/retry/strategy" 16 "github.com/pkg/errors" 17 "github.com/prometheus/client_golang/prometheus" 18 "github.com/prometheus/client_golang/prometheus/push" 19 20 "github.com/wfusion/gofusion/common/constant" 21 "github.com/wfusion/gofusion/common/infra/metrics" 22 "github.com/wfusion/gofusion/common/utils" 23 "github.com/wfusion/gofusion/common/utils/clone" 24 ) 25 26 var ( 27 // DefaultPrometheusOpts is the default set of options used when creating a 28 // PrometheusSink. 29 DefaultPrometheusOpts = PrometheusOpts{ 30 Expiration: 60 * time.Second, 31 Name: "default_prometheus_sink", 32 } 33 34 // 54 buckets, too many 35 defaultBuckets = []float64{ 36 // .0005, .0009, .00095, .00099, 37 // .001, .0025, .0050, .0075, .0090, .0095, .0099, 38 .01, .025, .050, .075, .090, .095, .099, 39 .1, .25, .50, .75, .90, .95, .99, 40 1, 2.5, 5, 7.5, 9, 9.5, 9.9, 41 10, 25, 50, 75, 90, 95, 99, 42 100, 250, 500, 750, 900, 950, 990, 43 1000, 2500, 5000, 7500, 9000, 9500, 9900, 44 10000, 45 } 46 objectives = map[float64]float64{ 47 0.50: 0.05, // p50 48 0.75: 0.01, // p75 49 0.90: 0.01, // p90 50 0.95: 0.001, // p95 51 0.99: 0.001, // p99 52 0.999: 0.0001, // p999 53 } 54 ) 55 56 func Bucket(buckets []float64) utils.OptionFunc[metrics.Option] { 57 return func(o *metrics.Option) { 58 o.Buckets = buckets 59 } 60 } 61 62 // PrometheusOpts is used to configure the Prometheus Sink 63 type PrometheusOpts struct { 64 // Expiration is the duration a metric is valid for, after which it will be 65 // untracked. If the value is zero, a metric is never expired. 66 Expiration time.Duration 67 Registerer prometheus.Registerer 68 69 // Gauges, Summaries, Histograms, and Counters allow us to pre-declare metrics by giving 70 // their Name, Help, and ConstLabels to the PrometheusSink when it is created. 71 // Metrics declared in this way will be initialized at zero and will not be 72 // deleted or altered when their expiry is reached. 73 // 74 // Ex: PrometheusOpts{ 75 // Expiration: 10 * time.Second, 76 // Gauges: []GaugeDefinition{ 77 // { 78 // Name: []string{ "application", "component", "measurement"}, 79 // Help: "application_component_measurement provides an example of how to declare static metrics", 80 // ConstLabels: []metrics.Label{ { Name: "my_label", Value: "does_not_change" }, }, 81 // }, 82 // }, 83 // } 84 GaugeDefinitions []GaugeDefinition 85 SummaryDefinitions []SummaryDefinition 86 CounterDefinitions []CounterDefinition 87 HistogramDefinitions []HistogramDefinition 88 Name string 89 } 90 91 type PrometheusSink struct { 92 // If these will ever be copied, they should be converted to *sync.Map values and initialized appropriately 93 gauges sync.Map 94 summaries sync.Map 95 histograms sync.Map 96 counters sync.Map 97 expiration time.Duration 98 help map[string]string 99 name string 100 logger metrics.Logger 101 102 Registry prometheus.Registerer 103 } 104 105 // GaugeDefinition can be provided to PrometheusOpts to declare a constant gauge that is not deleted on expiry. 106 type GaugeDefinition struct { 107 Name []string 108 ConstLabels []metrics.Label 109 Help string 110 } 111 112 type gauge struct { 113 prometheus.Gauge 114 updatedAt time.Time 115 // canDelete is set if the metric is created during runtime so we know it's ephemeral and can delete it on expiry. 116 canDelete bool 117 } 118 119 // SummaryDefinition can be provided to PrometheusOpts to declare a constant summary that is not deleted on expiry. 120 type SummaryDefinition struct { 121 Name []string 122 ConstLabels []metrics.Label 123 Help string 124 } 125 126 type summary struct { 127 prometheus.Summary 128 updatedAt time.Time 129 canDelete bool 130 } 131 132 // HistogramDefinition can be provided to PrometheusOpts to declare a constant summary that is not deleted on expiry. 133 type HistogramDefinition struct { 134 Name []string 135 ConstLabels []metrics.Label 136 Help string 137 } 138 139 type histogram struct { 140 prometheus.Histogram 141 updatedAt time.Time 142 canDelete bool 143 } 144 145 // CounterDefinition can be provided to PrometheusOpts to declare a constant counter that is not deleted on expiry. 146 type CounterDefinition struct { 147 Name []string 148 ConstLabels []metrics.Label 149 Help string 150 } 151 152 type counter struct { 153 prometheus.Counter 154 updatedAt time.Time 155 canDelete bool 156 } 157 158 // NewPrometheusSink creates a new PrometheusSink using the default options. 159 func NewPrometheusSink() (*PrometheusSink, error) { 160 return NewPrometheusSinkFrom(DefaultPrometheusOpts) 161 } 162 163 // NewPrometheusSinkFrom creates a new PrometheusSink using the passed options. 164 func NewPrometheusSinkFrom(opts PrometheusOpts) (*PrometheusSink, error) { 165 name := opts.Name 166 if name == "" { 167 name = "default_prometheus_sink" 168 } 169 sink := &PrometheusSink{ 170 gauges: sync.Map{}, 171 summaries: sync.Map{}, 172 histograms: sync.Map{}, 173 counters: sync.Map{}, 174 expiration: opts.Expiration, 175 help: make(map[string]string), 176 name: name, 177 } 178 179 initGauges(&sink.gauges, opts.GaugeDefinitions, sink.help) 180 initSummaries(&sink.summaries, opts.SummaryDefinitions, sink.help) 181 initHistograms(&sink.histograms, opts.HistogramDefinitions, sink.help) 182 initCounters(&sink.counters, opts.CounterDefinitions, sink.help) 183 184 reg := opts.Registerer 185 if reg == nil { 186 reg = prometheus.DefaultRegisterer 187 } 188 sink.Registry = reg 189 return sink, reg.Register(sink) 190 } 191 192 // Describe sends a Collector.Describe value from the descriptor created around PrometheusSink.Name 193 // Note that we cannot describe all the metrics (gauges, counters, summaries) in the sink as 194 // metrics can be added at any point during the lifecycle of the sink, which does not respect 195 // the idempotency aspect of the Collector.Describe() interface 196 func (p *PrometheusSink) Describe(c chan<- *prometheus.Desc) { 197 // dummy value to be able to register and unregister "empty" sinks 198 // Note this is not actually retained in the PrometheusSink so this has no side effects 199 // on the caller's sink. So it shouldn't show up to any of its consumers. 200 prometheus.NewGauge(prometheus.GaugeOpts{Name: p.name, Help: p.name}).Describe(c) 201 } 202 203 // Collect meets the collection interface and allows us to enforce our expiration 204 // logic to clean up ephemeral metrics if their value haven't been set for a 205 // duration exceeding our allowed expiration time. 206 func (p *PrometheusSink) Collect(c chan<- prometheus.Metric) { 207 p.collectAtTime(c, time.Now()) 208 } 209 210 // collectAtTime allows internal testing of the expiry based logic here without 211 // mocking clocks or making tests timing sensitive. 212 func (p *PrometheusSink) collectAtTime(c chan<- prometheus.Metric, t time.Time) { 213 expire := p.expiration != 0 214 p.gauges.Range(func(k, v interface{}) bool { 215 if v == nil { 216 return true 217 } 218 g := v.(*gauge) 219 lastUpdate := g.updatedAt 220 if expire && lastUpdate.Add(p.expiration).Before(t) { 221 if g.canDelete { 222 p.gauges.Delete(k) 223 return true 224 } 225 } 226 g.Collect(c) 227 return true 228 }) 229 p.summaries.Range(func(k, v interface{}) bool { 230 if v == nil { 231 return true 232 } 233 s := v.(*summary) 234 lastUpdate := s.updatedAt 235 if expire && lastUpdate.Add(p.expiration).Before(t) { 236 if s.canDelete { 237 p.summaries.Delete(k) 238 return true 239 } 240 } 241 s.Collect(c) 242 return true 243 }) 244 p.histograms.Range(func(k, v interface{}) bool { 245 if v == nil { 246 return true 247 } 248 h := v.(*histogram) 249 lastUpdate := h.updatedAt 250 if expire && lastUpdate.Add(p.expiration).Before(t) { 251 if h.canDelete { 252 p.histograms.Delete(k) 253 return true 254 } 255 } 256 h.Collect(c) 257 return true 258 }) 259 p.counters.Range(func(k, v interface{}) bool { 260 if v == nil { 261 return true 262 } 263 count := v.(*counter) 264 lastUpdate := count.updatedAt 265 if expire && lastUpdate.Add(p.expiration).Before(t) { 266 if count.canDelete { 267 p.counters.Delete(k) 268 return true 269 } 270 } 271 count.Collect(c) 272 return true 273 }) 274 } 275 276 func initGauges(m *sync.Map, gauges []GaugeDefinition, help map[string]string) { 277 for _, g := range gauges { 278 key, hash := flattenKey(g.Name, g.ConstLabels) 279 help[fmt.Sprintf("gauge.%s", key)] = g.Help 280 pG := prometheus.NewGauge(prometheus.GaugeOpts{ 281 Name: key, 282 Help: g.Help, 283 ConstLabels: prometheusLabels(g.ConstLabels), 284 }) 285 m.Store(hash, &gauge{Gauge: pG}) 286 } 287 } 288 289 func initSummaries(m *sync.Map, summaries []SummaryDefinition, help map[string]string) { 290 for _, s := range summaries { 291 key, hash := flattenKey(s.Name, s.ConstLabels) 292 help[fmt.Sprintf("summary.%s", key)] = s.Help 293 pS := prometheus.NewSummary(prometheus.SummaryOpts{ 294 Name: key, 295 Help: s.Help, 296 MaxAge: 10 * time.Second, 297 ConstLabels: prometheusLabels(s.ConstLabels), 298 Objectives: clone.Clone(objectives), 299 }) 300 m.Store(hash, &summary{Summary: pS}) 301 } 302 } 303 304 func initHistograms(m *sync.Map, summaries []HistogramDefinition, help map[string]string) { 305 for _, s := range summaries { 306 key, hash := flattenKey(s.Name, s.ConstLabels) 307 help[fmt.Sprintf("summary.%s", key)] = s.Help 308 pS := prometheus.NewHistogram(prometheus.HistogramOpts{ 309 Name: key, 310 Help: s.Help, 311 ConstLabels: prometheusLabels(s.ConstLabels), 312 Buckets: clone.Clone(defaultBuckets), 313 }) 314 m.Store(hash, &summary{Summary: pS}) 315 } 316 } 317 318 func initCounters(m *sync.Map, counters []CounterDefinition, help map[string]string) { 319 for _, c := range counters { 320 key, hash := flattenKey(c.Name, c.ConstLabels) 321 help[fmt.Sprintf("counter.%s", key)] = c.Help 322 pC := prometheus.NewCounter(prometheus.CounterOpts{ 323 Name: key, 324 Help: c.Help, 325 ConstLabels: prometheusLabels(c.ConstLabels), 326 }) 327 m.Store(hash, &counter{Counter: pC}) 328 } 329 } 330 331 var forbiddenCharsReplacer = strings.NewReplacer(" ", "_", ".", "_", "=", "_", "-", "_", "/", "_") 332 333 func flattenKey(parts []string, labels []metrics.Label) (string, string) { 334 key := strings.Join(parts, "_") 335 key = forbiddenCharsReplacer.Replace(key) 336 337 hash := key 338 for _, label := range labels { 339 hash += ";" + label.Name + "=" + label.Value 340 } 341 342 return key, hash 343 } 344 345 func prometheusLabels(labels []metrics.Label) prometheus.Labels { 346 l := make(prometheus.Labels) 347 for _, label := range labels { 348 l[label.Name] = label.Value 349 } 350 return l 351 } 352 353 func (p *PrometheusSink) SetGauge(parts []string, val float32, opts ...utils.OptionExtender) { 354 p.SetPrecisionGauge(parts, float64(val)) 355 } 356 357 func (p *PrometheusSink) SetGaugeWithLabels(parts []string, val float32, labels []metrics.Label, 358 opts ...utils.OptionExtender) { 359 p.SetPrecisionGaugeWithLabels(parts, float64(val), labels) 360 } 361 362 func (p *PrometheusSink) SetPrecisionGauge(parts []string, val float64, opts ...utils.OptionExtender) { 363 p.SetPrecisionGaugeWithLabels(parts, val, nil) 364 } 365 366 func (p *PrometheusSink) SetPrecisionGaugeWithLabels(parts []string, val float64, labels []metrics.Label, 367 opts ...utils.OptionExtender) { 368 key, hash := flattenKey(parts, labels) 369 pg, ok := p.gauges.Load(hash) 370 371 // The sync.Map underlying gauges stores pointers to our structs. If we need to make updates, 372 // rather than modifying the underlying value directly, which would be racy, we make a local 373 // copy by dereferencing the pointer we get back, making the appropriate changes, and then 374 // storing a pointer to our local copy. The underlying Prometheus types are threadsafe, 375 // so there's no issues there. It's possible for racy updates to occur to the updatedAt 376 // value, but since we're always setting it to time.Now(), it doesn't really matter. 377 if ok { 378 localGauge := *pg.(*gauge) 379 localGauge.Set(val) 380 localGauge.updatedAt = time.Now() 381 p.gauges.Store(hash, &localGauge) 382 383 // The gauge does not exist, create the gauge and allow it to be deleted 384 } else { 385 help := key 386 existingHelp, ok := p.help[fmt.Sprintf("gauge.%s", key)] 387 if ok { 388 help = existingHelp 389 } 390 g := prometheus.NewGauge(prometheus.GaugeOpts{ 391 Name: key, 392 Help: help, 393 ConstLabels: prometheusLabels(labels), 394 }) 395 g.Set(val) 396 pg = &gauge{ 397 Gauge: g, 398 updatedAt: time.Now(), 399 canDelete: true, 400 } 401 p.gauges.Store(hash, pg) 402 } 403 } 404 405 func (p *PrometheusSink) AddSample(parts []string, val float32, opts ...utils.OptionExtender) { 406 p.AddSampleWithLabels(parts, val, nil) 407 } 408 409 func (p *PrometheusSink) AddSampleWithLabels(parts []string, val float32, labels []metrics.Label, 410 opts ...utils.OptionExtender) { 411 opt := utils.ApplyOptions[metrics.Option](opts...) 412 if opt.Precision { 413 p.AddPrecisionSampleWithLabels(parts, float64(val), labels, opts...) 414 return 415 } 416 417 buckets := defaultBuckets 418 if len(opt.Buckets) > 0 { 419 buckets = clone.Clone(opt.Buckets) 420 } 421 422 key, hash := flattenKey(parts, labels) 423 ps, ok := p.histograms.Load(hash) 424 425 // Does the summary already exist for this sample type? 426 if ok { 427 localHistogram := *ps.(*histogram) 428 localHistogram.Observe(float64(val)) 429 localHistogram.updatedAt = time.Now() 430 p.histograms.Store(hash, &localHistogram) 431 432 // The summary does not exist, create the Summary and allow it to be deleted 433 } else { 434 help := key 435 existingHelp, ok := p.help[fmt.Sprintf("histogram.%s", key)] 436 if ok { 437 help = existingHelp 438 } 439 h := prometheus.NewHistogram(prometheus.HistogramOpts{ 440 Name: key, 441 Help: help, 442 ConstLabels: prometheusLabels(labels), 443 Buckets: clone.Clone(buckets), 444 }) 445 h.Observe(float64(val)) 446 ps = &histogram{ 447 Histogram: h, 448 updatedAt: time.Now(), 449 canDelete: true, 450 } 451 p.histograms.Store(hash, ps) 452 } 453 } 454 455 func (p *PrometheusSink) AddPrecisionSample(parts []string, val float64, opts ...utils.OptionExtender) { 456 p.AddPrecisionSampleWithLabels(parts, val, nil, opts...) 457 } 458 459 func (p *PrometheusSink) AddPrecisionSampleWithLabels(parts []string, val float64, labels []metrics.Label, 460 opts ...utils.OptionExtender) { 461 key, hash := flattenKey(parts, labels) 462 ps, ok := p.summaries.Load(hash) 463 464 // Does the summary already exist for this sample type? 465 if ok { 466 localSummary := *ps.(*summary) 467 localSummary.Observe(val) 468 localSummary.updatedAt = time.Now() 469 p.summaries.Store(hash, &localSummary) 470 471 // The summary does not exist, create the Summary and allow it to be deleted 472 } else { 473 help := key 474 existingHelp, ok := p.help[fmt.Sprintf("summary.%s", key)] 475 if ok { 476 help = existingHelp 477 } 478 s := prometheus.NewSummary(prometheus.SummaryOpts{ 479 Name: key, 480 Help: help, 481 MaxAge: 10 * time.Second, 482 ConstLabels: prometheusLabels(labels), 483 Objectives: clone.Clone(objectives), 484 }) 485 s.Observe(val) 486 ps = &summary{ 487 Summary: s, 488 updatedAt: time.Now(), 489 canDelete: true, 490 } 491 p.summaries.Store(hash, ps) 492 } 493 } 494 495 // EmitKey is not implemented. Prometheus doesn’t offer a type for which an 496 // arbitrary number of values is retained, as Prometheus works with a pull 497 // model, rather than a push model. 498 func (p *PrometheusSink) EmitKey(key []string, val float32, opts ...utils.OptionExtender) { 499 panic(errors.New("prometheus sink not implement EmitKey")) 500 } 501 502 func (p *PrometheusSink) IncrCounter(parts []string, val float32, opts ...utils.OptionExtender) { 503 p.IncrCounterWithLabels(parts, val, nil) 504 } 505 506 func (p *PrometheusSink) IncrCounterWithLabels(parts []string, val float32, labels []metrics.Label, 507 opts ...utils.OptionExtender) { 508 key, hash := flattenKey(parts, labels) 509 pc, ok := p.counters.Load(hash) 510 511 // Prometheus Counter.Add() panics if val < 0. We don't want this to 512 // cause applications to crash, so log an error instead. 513 if val < 0 { 514 if p.logger != nil { 515 p.logger.Warn("[Common] metrics prometheus attempting to "+ 516 "increment prometheus counter %v with value negative value %v", key, val) 517 } else { 518 log.Printf("[Common] metrics prometheus attempting to "+ 519 "increment prometheus counter %v with value negative value %v", key, val) 520 } 521 return 522 } 523 524 // Does the counter exist? 525 if ok { 526 localCounter := *pc.(*counter) 527 localCounter.Add(float64(val)) 528 localCounter.updatedAt = time.Now() 529 p.counters.Store(hash, &localCounter) 530 531 // The counter does not exist yet, create it and allow it to be deleted 532 } else { 533 help := key 534 existingHelp, ok := p.help[fmt.Sprintf("counter.%s", key)] 535 if ok { 536 help = existingHelp 537 } 538 c := prometheus.NewCounter(prometheus.CounterOpts{ 539 Name: key, 540 Help: help, 541 ConstLabels: prometheusLabels(labels), 542 }) 543 c.Add(float64(val)) 544 pc = &counter{ 545 Counter: c, 546 updatedAt: time.Now(), 547 canDelete: true, 548 } 549 p.counters.Store(hash, pc) 550 } 551 } 552 553 // PrometheusPushSink wraps a normal prometheus sink and provides an address and facilities to export it to an address 554 // on an interval. 555 type PrometheusPushSink struct { 556 *PrometheusSink 557 pusher *push.Pusher 558 address string 559 pushInterval time.Duration 560 stopChan chan struct{} 561 } 562 563 // NewPrometheusPushSink creates a PrometheusPushSink by taking an address, interval, and destination name. 564 func NewPrometheusPushSink(ctx context.Context, address string, pushInterval time.Duration, 565 name string, logger metrics.Logger) (*PrometheusPushSink, error) { 566 promSink := &PrometheusSink{ 567 gauges: sync.Map{}, 568 summaries: sync.Map{}, 569 histograms: sync.Map{}, 570 counters: sync.Map{}, 571 expiration: 60 * time.Second, 572 name: "default_prometheus_sink", 573 logger: logger, 574 } 575 pusher := push.New(address, name).Collector(promSink) 576 577 sink := &PrometheusPushSink{ 578 PrometheusSink: promSink, 579 pusher: pusher, 580 address: address, 581 pushInterval: pushInterval, 582 stopChan: make(chan struct{}), 583 } 584 585 sink.flushMetrics(ctx) 586 return sink, nil 587 } 588 589 func (s *PrometheusPushSink) flushMetrics(ctx context.Context) { 590 go func() { 591 err := retry.Retry(func(attempt uint) (err error) { 592 ticker := time.NewTicker(s.pushInterval) 593 defer ticker.Stop() 594 595 _, err = utils.Catch(func() { 596 for { 597 select { 598 case ti := <-ticker.C: 599 if err := s.pusher.PushContext(ctx); err != nil { 600 if s.logger != nil { 601 s.logger.Warn(ctx, "[Common] metrics prometheus pushing to prometheus err: %s", err) 602 } else { 603 log.Printf("[Common] metrics prometheus pushing to prometheus err: %s", err) 604 } 605 } else if s.logger != nil { 606 s.logger.Debug(ctx, "[Common] metrics prometheus push to prometheus success at %s", 607 ti.Format(constant.StdTimeMSLayout)) 608 } 609 610 case <-s.stopChan: 611 if s.logger != nil { 612 s.logger.Warn(ctx, "[Common] metrics prometheus push cycle exited") 613 } else { 614 log.Printf("[Common] metrics prometheus push cycle exited") 615 } 616 return 617 } 618 } 619 }) 620 return 621 }, strategy.Limit(86400)) // 24 * 60 * 60 * s.pushInterval 622 623 if err != nil { 624 if s.logger != nil { 625 s.logger.Warn(ctx, "[Common] metrics prometheus exit unexpectedly: %s", err) 626 } else { 627 log.Printf("[Common] metrics prometheus exit unexpectedly: %s", err) 628 } 629 } 630 }() 631 } 632 633 // Shutdown tears down the PrometheusPushSink, and blocks while flushing metrics to the backend. 634 func (s *PrometheusPushSink) Shutdown() { 635 if _, ok := utils.IsChannelClosed(s.stopChan); ok { 636 return 637 } 638 639 close(s.stopChan) 640 // Closing the channel only stops the running goroutine that pushes metrics. 641 // To minimize the chance of data loss pusher.Push is called one last time. 642 _ = s.pusher.Push() 643 }