code.vegaprotocol.io/vega@v0.79.0/core/metrics/prometheus.go (about) 1 // Copyright (C) 2023 Gobalsky Labs Limited 2 // 3 // This program is free software: you can redistribute it and/or modify 4 // it under the terms of the GNU Affero General Public License as 5 // published by the Free Software Foundation, either version 3 of the 6 // License, or (at your option) any later version. 7 // 8 // This program is distributed in the hope that it will be useful, 9 // but WITHOUT ANY WARRANTY; without even the implied warranty of 10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 // GNU Affero General Public License for more details. 12 // 13 // You should have received a copy of the GNU Affero General Public License 14 // along with this program. If not, see <http://www.gnu.org/licenses/>. 15 16 package metrics 17 18 import ( 19 "fmt" 20 "log" 21 "net/http" 22 "strings" 23 "time" 24 25 "code.vegaprotocol.io/vega/protos" 26 27 "github.com/pkg/errors" 28 "github.com/prometheus/client_golang/prometheus" 29 "github.com/prometheus/client_golang/prometheus/promhttp" 30 ) 31 32 const ( 33 // Gauge ... 34 Gauge instrument = iota 35 // Counter ... 36 Counter 37 // Histogram ... 38 Histogram 39 // Summary ... 40 Summary 41 ) 42 43 var ( 44 // ErrInstrumentNotSupported signals the specified instrument is not yet supported. 45 ErrInstrumentNotSupported = errors.New("instrument type unsupported") 46 // ErrInstrumentTypeMismatch signal the type of the instrument is not expected. 47 ErrInstrumentTypeMismatch = errors.New("instrument is not of the expected type") 48 ) 49 50 var ( 51 unconfirmedTxGauge prometheus.Gauge 52 engineTime *prometheus.CounterVec 53 orderCounter *prometheus.CounterVec 54 dataSourceEthVerifierOnGoingCallCounter *prometheus.CounterVec 55 ethereumRPCCallCounter *prometheus.CounterVec 56 ethCallCounter *prometheus.CounterVec 57 evtForwardCounter *prometheus.CounterVec 58 orderGauge *prometheus.GaugeVec 59 dataSourceEthVerifierOnGoingCallGauge *prometheus.GaugeVec 60 // Call counters for each request type per API. 61 apiRequestCallCounter *prometheus.CounterVec 62 // Total time counters for each request type per API. 63 apiRequestTimeCounter *prometheus.CounterVec 64 // Total time spent snapshoting. 65 snapshotTimeGauge *prometheus.GaugeVec 66 // Size of the snapshot per namespace. 67 snapshotSizeGauge *prometheus.GaugeVec 68 // Height of the last snapshot. 69 snapshotBlockHeightCounter prometheus.Gauge 70 // Core HTTP bindings that we will check against when updating HTTP metrics. 71 httpBindings *protos.Bindings 72 ) 73 74 // abstract prometheus types. 75 type instrument int 76 77 // combine all possible prometheus options + way to differentiate between regular or vector type. 78 type instrumentOpts struct { 79 opts prometheus.Opts 80 buckets []float64 81 objectives map[float64]float64 82 maxAge time.Duration 83 ageBuckets, bufCap uint32 84 vectors []string 85 } 86 87 type mi struct { 88 gaugeV *prometheus.GaugeVec 89 gauge prometheus.Gauge 90 counterV *prometheus.CounterVec 91 counter prometheus.Counter 92 histogramV *prometheus.HistogramVec 93 histogram prometheus.Histogram 94 summaryV *prometheus.SummaryVec 95 summary prometheus.Summary 96 } 97 98 // MetricInstrument - template interface for mi type return value - only mock if needed, and only mock the funcs you use. 99 type MetricInstrument interface { 100 Gauge() (prometheus.Gauge, error) 101 GaugeVec() (*prometheus.GaugeVec, error) 102 Counter() (prometheus.Counter, error) 103 CounterVec() (*prometheus.CounterVec, error) 104 Histogram() (prometheus.Histogram, error) 105 HistogramVec() (*prometheus.HistogramVec, error) 106 Summary() (prometheus.Summary, error) 107 SummaryVec() (*prometheus.SummaryVec, error) 108 } 109 110 // InstrumentOption - vararg for instrument options setting. 111 type InstrumentOption func(o *instrumentOpts) 112 113 // Vectors - configuration used to create a vector of a given interface, slice of label names. 114 func Vectors(labels ...string) InstrumentOption { 115 return func(o *instrumentOpts) { 116 o.vectors = labels 117 } 118 } 119 120 // Help - set the help field on instrument. 121 func Help(help string) InstrumentOption { 122 return func(o *instrumentOpts) { 123 o.opts.Help = help 124 } 125 } 126 127 // Namespace - set namespace. 128 func Namespace(ns string) InstrumentOption { 129 return func(o *instrumentOpts) { 130 o.opts.Namespace = ns 131 } 132 } 133 134 // Subsystem - set subsystem... obviously. 135 func Subsystem(s string) InstrumentOption { 136 return func(o *instrumentOpts) { 137 o.opts.Subsystem = s 138 } 139 } 140 141 // Labels set labels for instrument (similar to vector, but with given values). 142 func Labels(labels map[string]string) InstrumentOption { 143 return func(o *instrumentOpts) { 144 o.opts.ConstLabels = labels 145 } 146 } 147 148 // Buckets - specific to histogram type. 149 func Buckets(b []float64) InstrumentOption { 150 return func(o *instrumentOpts) { 151 o.buckets = b 152 } 153 } 154 155 // Objectives - specific to summary type. 156 func Objectives(obj map[float64]float64) InstrumentOption { 157 return func(o *instrumentOpts) { 158 o.objectives = obj 159 } 160 } 161 162 // MaxAge - specific to summary type. 163 func MaxAge(m time.Duration) InstrumentOption { 164 return func(o *instrumentOpts) { 165 o.maxAge = m 166 } 167 } 168 169 // AgeBuckets - specific to summary type. 170 func AgeBuckets(ab uint32) InstrumentOption { 171 return func(o *instrumentOpts) { 172 o.ageBuckets = ab 173 } 174 } 175 176 // BufCap - specific to summary type. 177 func BufCap(bc uint32) InstrumentOption { 178 return func(o *instrumentOpts) { 179 o.bufCap = bc 180 } 181 } 182 183 // addInstrument configures and registers new metrics instrument. 184 // This will, over time, be moved to use custom Registries, etc... 185 func addInstrument(t instrument, name string, opts ...InstrumentOption) (*mi, error) { 186 var col prometheus.Collector 187 ret := mi{} 188 opt := instrumentOpts{ 189 opts: prometheus.Opts{ 190 Name: name, 191 }, 192 } 193 // apply options 194 for _, o := range opts { 195 o(&opt) 196 } 197 switch t { 198 case Gauge: 199 o := opt.gauge() 200 if len(opt.vectors) == 0 { 201 ret.gauge = prometheus.NewGauge(o) 202 col = ret.gauge 203 } else { 204 ret.gaugeV = prometheus.NewGaugeVec(o, opt.vectors) 205 col = ret.gaugeV 206 } 207 case Counter: 208 o := opt.counter() 209 if len(opt.vectors) == 0 { 210 ret.counter = prometheus.NewCounter(o) 211 col = ret.counter 212 } else { 213 ret.counterV = prometheus.NewCounterVec(o, opt.vectors) 214 col = ret.counterV 215 } 216 case Histogram: 217 o := opt.histogram() 218 if len(opt.vectors) == 0 { 219 ret.histogram = prometheus.NewHistogram(o) 220 col = ret.histogram 221 } else { 222 ret.histogramV = prometheus.NewHistogramVec(o, opt.vectors) 223 col = ret.histogramV 224 } 225 case Summary: 226 o := opt.summary() 227 if len(opt.vectors) == 0 { 228 ret.summary = prometheus.NewSummary(o) 229 col = ret.summary 230 } else { 231 ret.summaryV = prometheus.NewSummaryVec(o, opt.vectors) 232 col = ret.summaryV 233 } 234 default: 235 return nil, ErrInstrumentNotSupported 236 } 237 if err := prometheus.Register(col); err != nil { 238 return nil, err 239 } 240 return &ret, nil 241 } 242 243 // Start enable metrics (given config). 244 func Start(conf Config) { 245 if !conf.Enabled { 246 return 247 } 248 if err := setupMetrics(); err != nil { 249 panic(fmt.Sprintf("could not set up metrics: %v", err)) 250 } 251 http.Handle(conf.Path, promhttp.Handler()) 252 go func() { 253 log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", conf.Port), nil)) 254 }() 255 } 256 257 func (i instrumentOpts) gauge() prometheus.GaugeOpts { 258 return prometheus.GaugeOpts(i.opts) 259 } 260 261 func (i instrumentOpts) counter() prometheus.CounterOpts { 262 return prometheus.CounterOpts(i.opts) 263 } 264 265 func (i instrumentOpts) summary() prometheus.SummaryOpts { 266 return prometheus.SummaryOpts{ 267 Name: i.opts.Name, 268 Namespace: i.opts.Namespace, 269 Subsystem: i.opts.Subsystem, 270 ConstLabels: i.opts.ConstLabels, 271 Help: i.opts.Help, 272 Objectives: i.objectives, 273 MaxAge: i.maxAge, 274 AgeBuckets: i.ageBuckets, 275 BufCap: i.bufCap, 276 } 277 } 278 279 func (i instrumentOpts) histogram() prometheus.HistogramOpts { 280 return prometheus.HistogramOpts{ 281 Name: i.opts.Name, 282 Namespace: i.opts.Namespace, 283 Subsystem: i.opts.Subsystem, 284 ConstLabels: i.opts.ConstLabels, 285 Help: i.opts.Help, 286 Buckets: i.buckets, 287 } 288 } 289 290 // Gauge returns a prometheus Gauge instrument. 291 func (m mi) Gauge() (prometheus.Gauge, error) { 292 if m.gauge == nil { 293 return nil, ErrInstrumentTypeMismatch 294 } 295 return m.gauge, nil 296 } 297 298 // GaugeVec returns a prometheus GaugeVec instrument. 299 func (m mi) GaugeVec() (*prometheus.GaugeVec, error) { 300 if m.gaugeV == nil { 301 return nil, ErrInstrumentTypeMismatch 302 } 303 return m.gaugeV, nil 304 } 305 306 // Counter returns a prometheus Counter instrument. 307 func (m mi) Counter() (prometheus.Counter, error) { 308 if m.counter == nil { 309 return nil, ErrInstrumentTypeMismatch 310 } 311 return m.counter, nil 312 } 313 314 // CounterVec returns a prometheus CounterVec instrument. 315 func (m mi) CounterVec() (*prometheus.CounterVec, error) { 316 if m.counterV == nil { 317 return nil, ErrInstrumentTypeMismatch 318 } 319 return m.counterV, nil 320 } 321 322 func (m mi) Histogram() (prometheus.Histogram, error) { 323 if m.histogram == nil { 324 return nil, ErrInstrumentTypeMismatch 325 } 326 return m.histogram, nil 327 } 328 329 func (m mi) HistogramVec() (*prometheus.HistogramVec, error) { 330 if m.histogramV == nil { 331 return nil, ErrInstrumentTypeMismatch 332 } 333 return m.histogramV, nil 334 } 335 336 func (m mi) Summary() (prometheus.Summary, error) { 337 if m.summary == nil { 338 return nil, ErrInstrumentTypeMismatch 339 } 340 return m.summary, nil 341 } 342 343 func (m mi) SummaryVec() (*prometheus.SummaryVec, error) { 344 if m.summaryV == nil { 345 return nil, ErrInstrumentTypeMismatch 346 } 347 return m.summaryV, nil 348 } 349 350 func setupMetrics() error { 351 // instrument with time histogram for blocks 352 h, err := addInstrument( 353 Counter, 354 "engine_seconds_total", 355 Namespace("vega"), 356 Vectors("market", "engine", "fn"), 357 ) 358 if err != nil { 359 return err 360 } 361 est, err := h.CounterVec() 362 if err != nil { 363 return err 364 } 365 engineTime = est 366 367 h, err = addInstrument( 368 Counter, 369 "orders_total", 370 Namespace("vega"), 371 Vectors("market", "valid"), 372 Help("Number of orders processed"), 373 ) 374 if err != nil { 375 return err 376 } 377 ot, err := h.CounterVec() 378 if err != nil { 379 return err 380 } 381 orderCounter = ot 382 383 h, err = addInstrument( 384 Counter, 385 "data_source_ethverifier_calls_total", 386 Namespace("vega"), 387 Vectors("spec"), 388 Help("Number of orders processed"), 389 ) 390 if err != nil { 391 return err 392 } 393 dataC, err := h.CounterVec() 394 if err != nil { 395 return err 396 } 397 dataSourceEthVerifierOnGoingCallCounter = dataC 398 399 h, err = addInstrument( 400 Counter, 401 "ethereum_rpc_calls_total", 402 Namespace("vega"), 403 Vectors("endpoint"), 404 Help("Number of calls made to the ethereum RPC"), 405 ) 406 if err != nil { 407 return err 408 } 409 ethRPCC, err := h.CounterVec() 410 if err != nil { 411 return err 412 } 413 ethereumRPCCallCounter = ethRPCC 414 415 h, err = addInstrument( 416 Counter, 417 "eth_calls_total", 418 Namespace("vega"), 419 Vectors("func", "asset", "respcode"), 420 Help("Number of call made to the ethereum node"), 421 ) 422 if err != nil { 423 return err 424 } 425 ethCalls, err := h.CounterVec() 426 if err != nil { 427 return err 428 } 429 ethCallCounter = ethCalls 430 431 h, err = addInstrument( 432 Counter, 433 "evt_forward_total", 434 Namespace("vega"), 435 Vectors("func", "res"), 436 Help("Number of call made forward/ack event from ethereum"), 437 ) 438 if err != nil { 439 return err 440 } 441 evtFwd, err := h.CounterVec() 442 if err != nil { 443 return err 444 } 445 evtForwardCounter = evtFwd 446 447 // now add the orders gauge 448 h, err = addInstrument( 449 Gauge, 450 "orders", 451 Namespace("vega"), 452 Vectors("market"), 453 Help("Number of orders currently being processed"), 454 ) 455 if err != nil { 456 return err 457 } 458 g, err := h.GaugeVec() 459 if err != nil { 460 return err 461 } 462 orderGauge = g 463 464 // now add the orders gauge 465 h, err = addInstrument( 466 Gauge, 467 "data_source_ethverifier_calls_ongoing", 468 Namespace("vega"), 469 Vectors("spec"), 470 Help("Number of event being verified"), 471 ) 472 if err != nil { 473 return err 474 } 475 dataD, err := h.GaugeVec() 476 if err != nil { 477 return err 478 } 479 dataSourceEthVerifierOnGoingCallGauge = dataD 480 481 // example usage of this simple gauge: 482 // e.orderGauge.WithLabelValues(mkt.Name).Add(float64(len(orders))) 483 // e.orderGauge.WithLabelValues(mkt.Name).Sub(float64(len(completedOrders))) 484 485 h, err = addInstrument( 486 Gauge, 487 "unconfirmedtx", 488 Namespace("vega"), 489 Help("Number of transactions waiting to be processed"), 490 ) 491 if err != nil { 492 return err 493 } 494 utxg, err := h.Gauge() 495 if err != nil { 496 return err 497 } 498 unconfirmedTxGauge = utxg 499 500 // 501 // API usage metrics start here 502 // 503 504 httpBindings, err = protos.CoreBindings() 505 if err != nil { 506 return err 507 } 508 // Number of calls to each request type 509 h, err = addInstrument( 510 Counter, 511 "request_count_total", 512 Namespace("vega"), 513 Vectors("apiType", "requestType"), 514 Help("Count of API requests"), 515 ) 516 if err != nil { 517 return err 518 } 519 rc, err := h.CounterVec() 520 if err != nil { 521 return err 522 } 523 apiRequestCallCounter = rc 524 525 // Total time for calls to each request type for each api type 526 h, err = addInstrument( 527 Counter, 528 "request_time_total", 529 Namespace("vega"), 530 Vectors("apiType", "requestType"), 531 Help("Total time spent in each API request"), 532 ) 533 if err != nil { 534 return err 535 } 536 rpac, err := h.CounterVec() 537 if err != nil { 538 return err 539 } 540 apiRequestTimeCounter = rpac 541 542 // snapshots times 543 h, err = addInstrument( 544 Gauge, 545 "snapshot_time_seconds", 546 Namespace("vega"), 547 Vectors("engine"), 548 Help("Total time spent snapshotting state"), 549 ) 550 if err != nil { 551 return err 552 } 553 snap, err := h.GaugeVec() 554 if err != nil { 555 return err 556 } 557 snapshotTimeGauge = snap 558 559 // snapshots sizes 560 h, err = addInstrument( 561 Gauge, 562 "snapshot_size_bytes", 563 Namespace("vega"), 564 Vectors("engine"), 565 Help("Total size of the snapshotting state"), 566 ) 567 if err != nil { 568 return err 569 } 570 snapSize, err := h.GaugeVec() 571 if err != nil { 572 return err 573 } 574 snapshotSizeGauge = snapSize 575 576 // snapshots block heights 577 h, err = addInstrument( 578 Gauge, 579 "snapshot_block_height", 580 Namespace("vega"), 581 Help("Block height of the last snapshot"), 582 ) 583 if err != nil { 584 return err 585 } 586 snapBlockHeight, err := h.Gauge() 587 if err != nil { 588 return err 589 } 590 snapshotBlockHeightCounter = snapBlockHeight 591 592 return nil 593 } 594 595 // OrderCounterInc increments the order counter. 596 func OrderCounterInc(labelValues ...string) { 597 if orderCounter == nil { 598 return 599 } 600 orderCounter.WithLabelValues(labelValues...).Inc() 601 } 602 603 // DataSourceEthVerifierCallCounterInc increments the order counter. 604 func DataSourceEthVerifierCallCounterInc(labelValues ...string) { 605 if dataSourceEthVerifierOnGoingCallCounter == nil { 606 return 607 } 608 dataSourceEthVerifierOnGoingCallCounter.WithLabelValues(labelValues...).Inc() 609 } 610 611 // EthereumRPCCallCounterInc increments the order counter. 612 func EthereumRPCCallCounterInc(labelValues ...string) { 613 if ethereumRPCCallCounter == nil { 614 return 615 } 616 ethereumRPCCallCounter.WithLabelValues("all").Inc() 617 ethereumRPCCallCounter.WithLabelValues(labelValues...).Inc() 618 } 619 620 // EthCallInc increments the eth call counter. 621 func EthCallInc(labelValues ...string) { 622 if ethCallCounter == nil { 623 return 624 } 625 ethCallCounter.WithLabelValues(labelValues...).Inc() 626 } 627 628 // EvtForwardInc increments the evt forward counter. 629 func EvtForwardInc(labelValues ...string) { 630 if evtForwardCounter == nil { 631 return 632 } 633 evtForwardCounter.WithLabelValues(labelValues...).Inc() 634 } 635 636 // OrderGaugeAdd increment the order gauge. 637 func OrderGaugeAdd(n int, labelValues ...string) { 638 if orderGauge == nil { 639 return 640 } 641 orderGauge.WithLabelValues(labelValues...).Add(float64(n)) 642 } 643 644 // DataSourceEthVerifierCallGaugeAdd increments the eth verified calls. 645 func DataSourceEthVerifierCallGaugeAdd(n int, labelValues ...string) { 646 if dataSourceEthVerifierOnGoingCallGauge == nil { 647 return 648 } 649 dataSourceEthVerifierOnGoingCallGauge.WithLabelValues(labelValues...).Add(float64(n)) 650 } 651 652 func DataSourceEthVerifierCallGaugeReset(labelValues ...string) { 653 if dataSourceEthVerifierOnGoingCallGauge == nil { 654 return 655 } 656 dataSourceEthVerifierOnGoingCallGauge.WithLabelValues(labelValues...).Set(0) 657 } 658 659 // UnconfirmedTxGaugeSet update the number of unconfirmed transactions. 660 func UnconfirmedTxGaugeSet(n int) { 661 if unconfirmedTxGauge == nil { 662 return 663 } 664 unconfirmedTxGauge.Set(float64(n)) 665 } 666 667 // APIRequestAndTimeREST updates the metrics for REST API calls. 668 func APIRequestAndTimeREST(method, request string, time float64) { 669 if apiRequestCallCounter == nil || apiRequestTimeCounter == nil || httpBindings == nil { 670 return 671 } 672 673 const ( 674 invalid = "invalid route" 675 prefix = "/" 676 ) 677 678 if !httpBindings.HasRoute(method, request) { 679 apiRequestCallCounter.WithLabelValues("REST", invalid).Inc() 680 apiRequestTimeCounter.WithLabelValues("REST", invalid).Add(time) 681 return 682 } 683 684 uri := request 685 686 // Remove the first slash if it has one 687 if strings.Index(uri, prefix) == 0 { 688 uri = uri[len(prefix):] 689 } 690 // Trim the URI down to something useful 691 if strings.Count(uri, "/") >= 1 { 692 uri = uri[:strings.Index(uri, "/")] 693 } 694 695 apiRequestCallCounter.WithLabelValues("REST", uri).Inc() 696 apiRequestTimeCounter.WithLabelValues("REST", uri).Add(time) 697 } 698 699 // APIRequestAndTimeGRPC updates the metrics for GRPC API calls. 700 func APIRequestAndTimeGRPC(request string, startTime time.Time) { 701 if apiRequestCallCounter == nil || apiRequestTimeCounter == nil { 702 return 703 } 704 apiRequestCallCounter.WithLabelValues("GRPC", request).Inc() 705 duration := time.Since(startTime).Seconds() 706 apiRequestTimeCounter.WithLabelValues("GRPC", request).Add(duration) 707 } 708 709 // APIRequestAndTimeGraphQL updates the metrics for GraphQL API calls. 710 func APIRequestAndTimeGraphQL(request string, time float64) { 711 if apiRequestCallCounter == nil || apiRequestTimeCounter == nil { 712 return 713 } 714 apiRequestCallCounter.WithLabelValues("GraphQL", request).Inc() 715 apiRequestTimeCounter.WithLabelValues("GraphQL", request).Add(time) 716 } 717 718 // StartAPIRequestAndTimeGRPC updates the metrics for GRPC API calls. 719 func StartAPIRequestAndTimeGRPC(request string) func() { 720 startTime := time.Now() 721 return func() { 722 if apiRequestCallCounter == nil || apiRequestTimeCounter == nil { 723 return 724 } 725 apiRequestCallCounter.WithLabelValues("GRPC", request).Inc() 726 duration := time.Since(startTime).Seconds() 727 apiRequestTimeCounter.WithLabelValues("GRPC", request).Add(duration) 728 } 729 } 730 731 func RegisterSnapshotNamespaces( 732 namespace string, 733 timeTaken time.Duration, 734 size int, 735 ) { 736 if snapshotTimeGauge == nil || snapshotSizeGauge == nil { 737 return 738 } 739 snapshotTimeGauge.WithLabelValues(namespace).Set(timeTaken.Seconds()) 740 snapshotSizeGauge.WithLabelValues(namespace).Set(float64(size)) 741 } 742 743 func RegisterSnapshotBlockHeight( 744 blockHeight uint64, 745 ) { 746 if snapshotBlockHeightCounter == nil { 747 return 748 } 749 snapshotBlockHeightCounter.Set(float64(blockHeight)) 750 } 751 752 func StartSnapshot(namespace string) func() { 753 startTime := time.Now() 754 return func() { 755 if snapshotTimeGauge == nil { 756 return 757 } 758 duration := time.Since(startTime).Seconds() 759 snapshotTimeGauge.WithLabelValues(namespace).Set(duration) 760 } 761 }