// Origin: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/metrics.go

// Copyright 2016 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rangefeed"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"go.etcd.io/etcd/raft/raftpb"
)

// Each entry in this block is a metric.Metadata literal that gives a metric
// its exported Name, its Help text, the Measurement label and the Unit.
var (
	// Replica metrics.
	metaReplicaCount = metric.Metadata{
		Name:        "replicas",
		Help:        "Number of replicas",
		Measurement: "Replicas",
		Unit:        metric.Unit_COUNT,
	}
	metaReservedReplicaCount = metric.Metadata{
		Name:        "replicas.reserved",
		Help:        "Number of replicas reserved for snapshots",
		Measurement: "Replicas",
		Unit:        metric.Unit_COUNT,
	}
	metaRaftLeaderCount = metric.Metadata{
		Name:        "replicas.leaders",
		Help:        "Number of raft leaders",
		Measurement: "Raft Leaders",
		Unit:        metric.Unit_COUNT,
	}
	metaRaftLeaderNotLeaseHolderCount = metric.Metadata{
		Name:        "replicas.leaders_not_leaseholders",
		Help:        "Number of replicas that are Raft leaders whose range lease is held by another store",
		Measurement: "Replicas",
		Unit:        metric.Unit_COUNT,
	}
	metaLeaseHolderCount = metric.Metadata{
		Name:        "replicas.leaseholders",
		Help:        "Number of lease holders",
		Measurement: "Replicas",
		Unit:        metric.Unit_COUNT,
	}
	metaQuiescentCount = metric.Metadata{
		Name:        "replicas.quiescent",
		Help:        "Number of quiesced replicas",
		Measurement: "Replicas",
		Unit:        metric.Unit_COUNT,
	}

	// Range metrics.
	metaRangeCount = metric.Metadata{
		Name:        "ranges",
		Help:        "Number of ranges",
		Measurement: "Ranges",
		Unit:        metric.Unit_COUNT,
	}
	metaUnavailableRangeCount = metric.Metadata{
		Name:        "ranges.unavailable",
		Help:        "Number of ranges with fewer live replicas than needed for quorum",
		Measurement: "Ranges",
		Unit:        metric.Unit_COUNT,
	}
	metaUnderReplicatedRangeCount = metric.Metadata{
		Name:        "ranges.underreplicated",
		Help:        "Number of ranges with fewer live replicas than the replication target",
		Measurement: "Ranges",
		Unit:        metric.Unit_COUNT,
	}
	metaOverReplicatedRangeCount = metric.Metadata{
		Name:        "ranges.overreplicated",
		Help:        "Number of ranges with more live replicas than the replication target",
		Measurement: "Ranges",
		Unit:        metric.Unit_COUNT,
	}

	// Lease request metrics.
	// The request/transfer entries come in success/error pairs; the
	// expiration entry (and the epoch entry that follows this group) count
	// leaseholders by lease type.
	metaLeaseRequestSuccessCount = metric.Metadata{
		Name:        "leases.success",
		Help:        "Number of successful lease requests",
		Measurement: "Lease Requests",
		Unit:        metric.Unit_COUNT,
	}
	metaLeaseRequestErrorCount = metric.Metadata{
		Name:        "leases.error",
		Help:        "Number of failed lease requests",
		Measurement: "Lease Requests",
		Unit:        metric.Unit_COUNT,
	}
	metaLeaseTransferSuccessCount = metric.Metadata{
		Name:        "leases.transfers.success",
		Help:        "Number of successful lease transfers",
		Measurement: "Lease Transfers",
		Unit:        metric.Unit_COUNT,
	}
	metaLeaseTransferErrorCount = metric.Metadata{
		Name:        "leases.transfers.error",
		Help:        "Number of failed lease transfers",
		Measurement: "Lease Transfers",
		Unit:        metric.Unit_COUNT,
	}
	metaLeaseExpirationCount = metric.Metadata{
		Name:        "leases.expiration",
		Help:        "Number of replica leaseholders using expiration-based leases",
		Measurement: "Replicas",
		Unit:        metric.Unit_COUNT,
	}
metaLeaseEpochCount = metric.Metadata{ 123 Name: "leases.epoch", 124 Help: "Number of replica leaseholders using epoch-based leases", 125 Measurement: "Replicas", 126 Unit: metric.Unit_COUNT, 127 } 128 129 // Storage metrics. 130 metaLiveBytes = metric.Metadata{ 131 Name: "livebytes", 132 Help: "Number of bytes of live data (keys plus values)", 133 Measurement: "Storage", 134 Unit: metric.Unit_BYTES, 135 } 136 metaKeyBytes = metric.Metadata{ 137 Name: "keybytes", 138 Help: "Number of bytes taken up by keys", 139 Measurement: "Storage", 140 Unit: metric.Unit_BYTES, 141 } 142 metaValBytes = metric.Metadata{ 143 Name: "valbytes", 144 Help: "Number of bytes taken up by values", 145 Measurement: "Storage", 146 Unit: metric.Unit_BYTES, 147 } 148 metaTotalBytes = metric.Metadata{ 149 Name: "totalbytes", 150 Help: "Total number of bytes taken up by keys and values including non-live data", 151 Measurement: "Storage", 152 Unit: metric.Unit_BYTES, 153 } 154 metaIntentBytes = metric.Metadata{ 155 Name: "intentbytes", 156 Help: "Number of bytes in intent KV pairs", 157 Measurement: "Storage", 158 Unit: metric.Unit_BYTES, 159 } 160 metaLiveCount = metric.Metadata{ 161 Name: "livecount", 162 Help: "Count of live keys", 163 Measurement: "Keys", 164 Unit: metric.Unit_COUNT, 165 } 166 metaKeyCount = metric.Metadata{ 167 Name: "keycount", 168 Help: "Count of all keys", 169 Measurement: "Keys", 170 Unit: metric.Unit_COUNT, 171 } 172 metaValCount = metric.Metadata{ 173 Name: "valcount", 174 Help: "Count of all values", 175 Measurement: "MVCC Values", 176 Unit: metric.Unit_COUNT, 177 } 178 metaIntentCount = metric.Metadata{ 179 Name: "intentcount", 180 Help: "Count of intent keys", 181 Measurement: "Keys", 182 Unit: metric.Unit_COUNT, 183 } 184 metaIntentAge = metric.Metadata{ 185 Name: "intentage", 186 Help: "Cumulative age of intents", 187 Measurement: "Age", 188 Unit: metric.Unit_SECONDS, 189 } 190 metaGcBytesAge = metric.Metadata{ 191 Name: "gcbytesage", 192 Help: "Cumulative age 
of non-live data", 193 Measurement: "Age", 194 Unit: metric.Unit_SECONDS, 195 } 196 metaLastUpdateNanos = metric.Metadata{ 197 Name: "lastupdatenanos", 198 Help: "Timestamp at which bytes/keys/intents metrics were last updated", 199 Measurement: "Last Update", 200 Unit: metric.Unit_TIMESTAMP_NS, 201 } 202 203 // Contention and intent resolution metrics. 204 metaResolveCommit = metric.Metadata{ 205 Name: "intents.resolve-attempts", 206 Help: "Count of (point or range) intent commit evaluation attempts", 207 Measurement: "Operations", 208 Unit: metric.Unit_COUNT, 209 } 210 metaResolveAbort = metric.Metadata{ 211 Name: "intents.abort-attempts", 212 Help: "Count of (point or range) non-poisoning intent abort evaluation attempts", 213 Measurement: "Operations", 214 Unit: metric.Unit_COUNT, 215 } 216 metaResolvePoison = metric.Metadata{ 217 Name: "intents.poison-attempts", 218 Help: "Count of (point or range) poisoning intent abort evaluation attempts", 219 Measurement: "Operations", 220 Unit: metric.Unit_COUNT, 221 } 222 223 // Disk usage diagram (CR=Cockroach): 224 // --------------------------------- 225 // Entire hard drive: | non-CR data | CR data | empty | 226 // --------------------------------- 227 // Metrics: 228 // "capacity": |===============================| 229 // "used": |=========| 230 // "available": |=======| 231 // "usable" (computed in UI): |=================| 232 metaCapacity = metric.Metadata{ 233 Name: "capacity", 234 Help: "Total storage capacity", 235 Measurement: "Storage", 236 Unit: metric.Unit_BYTES, 237 } 238 metaAvailable = metric.Metadata{ 239 Name: "capacity.available", 240 Help: "Available storage capacity", 241 Measurement: "Storage", 242 Unit: metric.Unit_BYTES, 243 } 244 metaUsed = metric.Metadata{ 245 Name: "capacity.used", 246 Help: "Used storage capacity", 247 Measurement: "Storage", 248 Unit: metric.Unit_BYTES, 249 } 250 251 metaReserved = metric.Metadata{ 252 Name: "capacity.reserved", 253 Help: "Capacity reserved for snapshots", 
254 Measurement: "Storage", 255 Unit: metric.Unit_BYTES, 256 } 257 metaSysBytes = metric.Metadata{ 258 Name: "sysbytes", 259 Help: "Number of bytes in system KV pairs", 260 Measurement: "Storage", 261 Unit: metric.Unit_BYTES, 262 } 263 metaSysCount = metric.Metadata{ 264 Name: "syscount", 265 Help: "Count of system KV pairs", 266 Measurement: "Keys", 267 Unit: metric.Unit_COUNT, 268 } 269 270 // Metrics used by the rebalancing logic that aren't already captured elsewhere. 271 metaAverageQueriesPerSecond = metric.Metadata{ 272 Name: "rebalancing.queriespersecond", 273 Help: "Number of kv-level requests received per second by the store, averaged over a large time period as used in rebalancing decisions", 274 Measurement: "Keys/Sec", 275 Unit: metric.Unit_COUNT, 276 } 277 metaAverageWritesPerSecond = metric.Metadata{ 278 Name: "rebalancing.writespersecond", 279 Help: "Number of keys written (i.e. applied by raft) per second to the store, averaged over a large time period as used in rebalancing decisions", 280 Measurement: "Keys/Sec", 281 Unit: metric.Unit_COUNT, 282 } 283 284 // Metric for tracking follower reads. 285 metaFollowerReadsCount = metric.Metadata{ 286 Name: "follower_reads.success_count", 287 Help: "Number of reads successfully processed by any replica", 288 Measurement: "Read Ops", 289 Unit: metric.Unit_COUNT, 290 } 291 292 // RocksDB metrics. 
// Metadata for the "rocksdb."-prefixed block cache, bloom filter, memtable,
// flush and compaction metrics. Counters of events use Unit_COUNT; sizes and
// byte volumes use Unit_BYTES.
metaRdbBlockCacheHits = metric.Metadata{
	Name:        "rocksdb.block.cache.hits",
	Help:        "Count of block cache hits",
	Measurement: "Cache Ops",
	Unit:        metric.Unit_COUNT,
}
metaRdbBlockCacheMisses = metric.Metadata{
	Name:        "rocksdb.block.cache.misses",
	Help:        "Count of block cache misses",
	Measurement: "Cache Ops",
	Unit:        metric.Unit_COUNT,
}
metaRdbBlockCacheUsage = metric.Metadata{
	Name:        "rocksdb.block.cache.usage",
	Help:        "Bytes used by the block cache",
	Measurement: "Memory",
	Unit:        metric.Unit_BYTES,
}
metaRdbBlockCachePinnedUsage = metric.Metadata{
	Name:        "rocksdb.block.cache.pinned-usage",
	Help:        "Bytes pinned by the block cache",
	Measurement: "Memory",
	Unit:        metric.Unit_BYTES,
}
metaRdbBloomFilterPrefixChecked = metric.Metadata{
	Name:        "rocksdb.bloom.filter.prefix.checked",
	Help:        "Number of times the bloom filter was checked",
	Measurement: "Bloom Filter Ops",
	Unit:        metric.Unit_COUNT,
}
metaRdbBloomFilterPrefixUseful = metric.Metadata{
	Name:        "rocksdb.bloom.filter.prefix.useful",
	Help:        "Number of times the bloom filter helped avoid iterator creation",
	Measurement: "Bloom Filter Ops",
	Unit:        metric.Unit_COUNT,
}
metaRdbMemtableTotalSize = metric.Metadata{
	Name:        "rocksdb.memtable.total-size",
	Help:        "Current size of memtable in bytes",
	Measurement: "Memory",
	Unit:        metric.Unit_BYTES,
}
metaRdbFlushes = metric.Metadata{
	Name:        "rocksdb.flushes",
	Help:        "Number of table flushes",
	Measurement: "Flushes",
	Unit:        metric.Unit_COUNT,
}
metaRdbFlushedBytes = metric.Metadata{
	Name:        "rocksdb.flushed-bytes",
	Help:        "Bytes written during flush",
	Measurement: "Bytes Written",
	Unit:        metric.Unit_BYTES,
}
metaRdbCompactions = metric.Metadata{
	Name:        "rocksdb.compactions",
	Help:        "Number of table compactions",
	Measurement: "Compactions",
	Unit:        metric.Unit_COUNT,
}
// Metadata for the remaining "rocksdb."-prefixed metrics: ingestion and
// compaction byte volumes, table-reader memory, read amplification, SSTable
// count and the pending-compaction estimate.
metaRdbIngestedBytes = metric.Metadata{
	Name:        "rocksdb.ingested-bytes",
	Help:        "Bytes ingested",
	Measurement: "Bytes Ingested",
	Unit:        metric.Unit_BYTES,
}
metaRdbCompactedBytesRead = metric.Metadata{
	Name:        "rocksdb.compacted-bytes-read",
	Help:        "Bytes read during compaction",
	Measurement: "Bytes Read",
	Unit:        metric.Unit_BYTES,
}
metaRdbCompactedBytesWritten = metric.Metadata{
	Name:        "rocksdb.compacted-bytes-written",
	Help:        "Bytes written during compaction",
	Measurement: "Bytes Written",
	Unit:        metric.Unit_BYTES,
}
metaRdbTableReadersMemEstimate = metric.Metadata{
	Name:        "rocksdb.table-readers-mem-estimate",
	Help:        "Memory used by index and filter blocks",
	Measurement: "Memory",
	Unit:        metric.Unit_BYTES,
}
metaRdbReadAmplification = metric.Metadata{
	Name:        "rocksdb.read-amplification",
	Help:        "Number of disk reads per query",
	Measurement: "Disk Reads per Query",
	Unit:        metric.Unit_COUNT,
}
metaRdbNumSSTables = metric.Metadata{
	Name:        "rocksdb.num-sstables",
	Help:        "Number of rocksdb SSTables",
	Measurement: "SSTables",
	Unit:        metric.Unit_COUNT,
}
metaRdbPendingCompaction = metric.Metadata{
	Name:        "rocksdb.estimated-pending-compaction",
	Help:        "Estimated pending compaction bytes",
	Measurement: "Storage",
	Unit:        metric.Unit_BYTES,
}

// Range event metrics.
// Metadata for the "range."-prefixed event metrics (splits, merges, adds,
// removes, snapshots, raft leader transfers). All use Unit_COUNT.
metaRangeSplits = metric.Metadata{
	Name:        "range.splits",
	Help:        "Number of range splits",
	Measurement: "Range Ops",
	Unit:        metric.Unit_COUNT,
}
metaRangeMerges = metric.Metadata{
	Name:        "range.merges",
	Help:        "Number of range merges",
	Measurement: "Range Ops",
	Unit:        metric.Unit_COUNT,
}
metaRangeAdds = metric.Metadata{
	Name:        "range.adds",
	Help:        "Number of range additions",
	Measurement: "Range Ops",
	Unit:        metric.Unit_COUNT,
}
metaRangeRemoves = metric.Metadata{
	Name:        "range.removes",
	Help:        "Number of range removals",
	Measurement: "Range Ops",
	Unit:        metric.Unit_COUNT,
}
metaRangeSnapshotsGenerated = metric.Metadata{
	Name:        "range.snapshots.generated",
	Help:        "Number of generated snapshots",
	Measurement: "Snapshots",
	Unit:        metric.Unit_COUNT,
}
// Applied snapshots are reported under two names: "normal-applied" here and
// "learner-applied" in the next entry.
metaRangeSnapshotsNormalApplied = metric.Metadata{
	Name:        "range.snapshots.normal-applied",
	Help:        "Number of applied snapshots",
	Measurement: "Snapshots",
	Unit:        metric.Unit_COUNT,
}
metaRangeSnapshotsLearnerApplied = metric.Metadata{
	Name:        "range.snapshots.learner-applied",
	Help:        "Number of applied learner snapshots",
	Measurement: "Snapshots",
	Unit:        metric.Unit_COUNT,
}
metaRangeRaftLeaderTransfers = metric.Metadata{
	Name:        "range.raftleadertransfers",
	Help:        "Number of raft leader transfers",
	Measurement: "Leader Transfers",
	Unit:        metric.Unit_COUNT,
}

// Raft processing metrics.
// Metadata for the Raft processing metrics: tick and command counts
// (Unit_COUNT) plus processing-time and latency entries (Unit_NANOSECONDS).
metaRaftTicks = metric.Metadata{
	Name:        "raft.ticks",
	Help:        "Number of Raft ticks queued",
	Measurement: "Ticks",
	Unit:        metric.Unit_COUNT,
}
metaRaftWorkingDurationNanos = metric.Metadata{
	Name:        "raft.process.workingnanos",
	Help:        "Nanoseconds spent in store.processRaft() working",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}
metaRaftTickingDurationNanos = metric.Metadata{
	Name:        "raft.process.tickingnanos",
	Help:        "Nanoseconds spent in store.processRaft() processing replica.Tick()",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}
metaRaftCommandsApplied = metric.Metadata{
	Name:        "raft.commandsapplied",
	Help:        "Count of Raft commands applied",
	Measurement: "Commands",
	Unit:        metric.Unit_COUNT,
}
// The four entries below are described by their Help text as latency
// histograms.
metaRaftLogCommitLatency = metric.Metadata{
	Name:        "raft.process.logcommit.latency",
	Help:        "Latency histogram for committing Raft log entries",
	Measurement: "Latency",
	Unit:        metric.Unit_NANOSECONDS,
}
metaRaftCommandCommitLatency = metric.Metadata{
	Name:        "raft.process.commandcommit.latency",
	Help:        "Latency histogram for committing Raft commands",
	Measurement: "Latency",
	Unit:        metric.Unit_NANOSECONDS,
}
metaRaftHandleReadyLatency = metric.Metadata{
	Name:        "raft.process.handleready.latency",
	Help:        "Latency histogram for handling a Raft ready",
	Measurement: "Latency",
	Unit:        metric.Unit_NANOSECONDS,
}
metaRaftApplyCommittedLatency = metric.Metadata{
	Name:        "raft.process.applycommitted.latency",
	Help:        "Latency histogram for applying all committed Raft commands in a Raft ready",
	Measurement: "Latency",
	Unit:        metric.Unit_NANOSECONDS,
}

// Raft message metrics.
// Metadata for the Raft message metrics: one "raft.rcvd.<type>" entry per
// received message type, followed by dropped/pending message counts. All
// use the "Messages" measurement and Unit_COUNT.
metaRaftRcvdProp = metric.Metadata{
	Name:        "raft.rcvd.prop",
	Help:        "Number of MsgProp messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdApp = metric.Metadata{
	Name:        "raft.rcvd.app",
	Help:        "Number of MsgApp messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdAppResp = metric.Metadata{
	Name:        "raft.rcvd.appresp",
	Help:        "Number of MsgAppResp messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdVote = metric.Metadata{
	Name:        "raft.rcvd.vote",
	Help:        "Number of MsgVote messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdVoteResp = metric.Metadata{
	Name:        "raft.rcvd.voteresp",
	Help:        "Number of MsgVoteResp messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdPreVote = metric.Metadata{
	Name:        "raft.rcvd.prevote",
	Help:        "Number of MsgPreVote messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdPreVoteResp = metric.Metadata{
	Name:        "raft.rcvd.prevoteresp",
	Help:        "Number of MsgPreVoteResp messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdSnap = metric.Metadata{
	Name:        "raft.rcvd.snap",
	Help:        "Number of MsgSnap messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdHeartbeat = metric.Metadata{
	Name:        "raft.rcvd.heartbeat",
	Help:        "Number of (coalesced, if enabled) MsgHeartbeat messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdHeartbeatResp = metric.Metadata{
	Name:        "raft.rcvd.heartbeatresp",
	Help:        "Number of (coalesced, if enabled) MsgHeartbeatResp messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdTransferLeader = metric.Metadata{
	Name:        "raft.rcvd.transferleader",
	Help:        "Number of MsgTransferLeader messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdTimeoutNow = metric.Metadata{
	Name:        "raft.rcvd.timeoutnow",
	Help:        "Number of MsgTimeoutNow messages received by this store",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftRcvdDropped = metric.Metadata{
	Name:        "raft.rcvd.dropped",
	Help:        "Number of dropped incoming Raft messages",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftEnqueuedPending = metric.Metadata{
	Name:        "raft.enqueued.pending",
	Help:        "Number of pending outgoing messages in the Raft Transport queue",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}
metaRaftCoalescedHeartbeatsPending = metric.Metadata{
	Name:        "raft.heartbeats.pending",
	Help:        "Number of pending heartbeats and responses waiting to be coalesced",
	Measurement: "Messages",
	Unit:        metric.Unit_COUNT,
}

// Raft log metrics.
metaRaftLogFollowerBehindCount = metric.Metadata{
	Name:        "raftlog.behind",
	Help:        "Number of Raft log entries followers on other stores are behind",
	Measurement: "Log Entries",
	Unit:        metric.Unit_COUNT,
}
metaRaftLogTruncated = metric.Metadata{
	Name:        "raftlog.truncated",
	Help:        "Number of Raft log entries truncated",
	Measurement: "Log Entries",
	Unit:        metric.Unit_COUNT,
}

// Replica queue metrics.
// Metadata for the GC queue and merge queue. Each queue gets the same set of
// entries: process.success, process.failure, pending (all Unit_COUNT) and
// processingnanos (Unit_NANOSECONDS); the merge queue additionally has a
// purgatory count.
metaGCQueueSuccesses = metric.Metadata{
	Name:        "queue.gc.process.success",
	Help:        "Number of replicas successfully processed by the GC queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaGCQueueFailures = metric.Metadata{
	Name:        "queue.gc.process.failure",
	Help:        "Number of replicas which failed processing in the GC queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaGCQueuePending = metric.Metadata{
	Name:        "queue.gc.pending",
	Help:        "Number of pending replicas in the GC queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaGCQueueProcessingNanos = metric.Metadata{
	Name:        "queue.gc.processingnanos",
	Help:        "Nanoseconds spent processing replicas in the GC queue",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}
metaMergeQueueSuccesses = metric.Metadata{
	Name:        "queue.merge.process.success",
	Help:        "Number of replicas successfully processed by the merge queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaMergeQueueFailures = metric.Metadata{
	Name:        "queue.merge.process.failure",
	Help:        "Number of replicas which failed processing in the merge queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaMergeQueuePending = metric.Metadata{
	Name:        "queue.merge.pending",
	Help:        "Number of pending replicas in the merge queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaMergeQueueProcessingNanos = metric.Metadata{
	Name:        "queue.merge.processingnanos",
	Help:        "Nanoseconds spent processing replicas in the merge queue",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}
metaMergeQueuePurgatory = metric.Metadata{
	Name:        "queue.merge.purgatory",
	Help:        "Number of replicas in the merge queue's purgatory, waiting to become mergeable",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
// Metadata for the Raft log, Raft snapshot, consistency, replica GC,
// replicate, split and time series maintenance queues. Each queue gets
// process.success, process.failure and pending entries (Unit_COUNT) plus a
// processingnanos entry (Unit_NANOSECONDS); the replicate and split queues
// additionally have a purgatory count.
metaRaftLogQueueSuccesses = metric.Metadata{
	Name:        "queue.raftlog.process.success",
	Help:        "Number of replicas successfully processed by the Raft log queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaRaftLogQueueFailures = metric.Metadata{
	Name:        "queue.raftlog.process.failure",
	Help:        "Number of replicas which failed processing in the Raft log queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaRaftLogQueuePending = metric.Metadata{
	Name:        "queue.raftlog.pending",
	Help:        "Number of pending replicas in the Raft log queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaRaftLogQueueProcessingNanos = metric.Metadata{
	Name:        "queue.raftlog.processingnanos",
	Help:        "Nanoseconds spent processing replicas in the Raft log queue",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}
// NOTE(review): the four raftsnapshot entries below describe their queue as
// the "Raft repair queue" in the Help text while the metric names say
// "raftsnapshot" -- confirm the intended wording before touching either.
metaRaftSnapshotQueueSuccesses = metric.Metadata{
	Name:        "queue.raftsnapshot.process.success",
	Help:        "Number of replicas successfully processed by the Raft repair queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaRaftSnapshotQueueFailures = metric.Metadata{
	Name:        "queue.raftsnapshot.process.failure",
	Help:        "Number of replicas which failed processing in the Raft repair queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaRaftSnapshotQueuePending = metric.Metadata{
	Name:        "queue.raftsnapshot.pending",
	Help:        "Number of pending replicas in the Raft repair queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaRaftSnapshotQueueProcessingNanos = metric.Metadata{
	Name:        "queue.raftsnapshot.processingnanos",
	Help:        "Nanoseconds spent processing replicas in the Raft repair queue",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}
metaConsistencyQueueSuccesses = metric.Metadata{
	Name:        "queue.consistency.process.success",
	Help:        "Number of replicas successfully processed by the consistency checker queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaConsistencyQueueFailures = metric.Metadata{
	Name:        "queue.consistency.process.failure",
	Help:        "Number of replicas which failed processing in the consistency checker queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaConsistencyQueuePending = metric.Metadata{
	Name:        "queue.consistency.pending",
	Help:        "Number of pending replicas in the consistency checker queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaConsistencyQueueProcessingNanos = metric.Metadata{
	Name:        "queue.consistency.processingnanos",
	Help:        "Nanoseconds spent processing replicas in the consistency checker queue",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}
metaReplicaGCQueueSuccesses = metric.Metadata{
	Name:        "queue.replicagc.process.success",
	Help:        "Number of replicas successfully processed by the replica GC queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaReplicaGCQueueFailures = metric.Metadata{
	Name:        "queue.replicagc.process.failure",
	Help:        "Number of replicas which failed processing in the replica GC queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaReplicaGCQueuePending = metric.Metadata{
	Name:        "queue.replicagc.pending",
	Help:        "Number of pending replicas in the replica GC queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaReplicaGCQueueProcessingNanos = metric.Metadata{
	Name:        "queue.replicagc.processingnanos",
	Help:        "Nanoseconds spent processing replicas in the replica GC queue",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}
metaReplicateQueueSuccesses = metric.Metadata{
	Name:        "queue.replicate.process.success",
	Help:        "Number of replicas successfully processed by the replicate queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaReplicateQueueFailures = metric.Metadata{
	Name:        "queue.replicate.process.failure",
	Help:        "Number of replicas which failed processing in the replicate queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaReplicateQueuePending = metric.Metadata{
	Name:        "queue.replicate.pending",
	Help:        "Number of pending replicas in the replicate queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaReplicateQueueProcessingNanos = metric.Metadata{
	Name:        "queue.replicate.processingnanos",
	Help:        "Nanoseconds spent processing replicas in the replicate queue",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}
metaReplicateQueuePurgatory = metric.Metadata{
	Name:        "queue.replicate.purgatory",
	Help:        "Number of replicas in the replicate queue's purgatory, awaiting allocation options",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaSplitQueueSuccesses = metric.Metadata{
	Name:        "queue.split.process.success",
	Help:        "Number of replicas successfully processed by the split queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaSplitQueueFailures = metric.Metadata{
	Name:        "queue.split.process.failure",
	Help:        "Number of replicas which failed processing in the split queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaSplitQueuePending = metric.Metadata{
	Name:        "queue.split.pending",
	Help:        "Number of pending replicas in the split queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaSplitQueueProcessingNanos = metric.Metadata{
	Name:        "queue.split.processingnanos",
	Help:        "Nanoseconds spent processing replicas in the split queue",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}
metaSplitQueuePurgatory = metric.Metadata{
	Name:        "queue.split.purgatory",
	Help:        "Number of replicas in the split queue's purgatory, waiting to become splittable",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaTimeSeriesMaintenanceQueueSuccesses = metric.Metadata{
	Name:        "queue.tsmaintenance.process.success",
	Help:        "Number of replicas successfully processed by the time series maintenance queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaTimeSeriesMaintenanceQueueFailures = metric.Metadata{
	Name:        "queue.tsmaintenance.process.failure",
	Help:        "Number of replicas which failed processing in the time series maintenance queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaTimeSeriesMaintenanceQueuePending = metric.Metadata{
	Name:        "queue.tsmaintenance.pending",
	Help:        "Number of pending replicas in the time series maintenance queue",
	Measurement: "Replicas",
	Unit:        metric.Unit_COUNT,
}
metaTimeSeriesMaintenanceQueueProcessingNanos = metric.Metadata{
	Name:        "queue.tsmaintenance.processingnanos",
	Help:        "Nanoseconds spent processing replicas in the time series maintenance queue",
	Measurement: "Processing Time",
	Unit:        metric.Unit_NANOSECONDS,
}

// GCInfo cumulative totals.
// Metadata for the "queue.gc.info."-prefixed totals, followed by the
// slow-request metrics. Every entry here uses Unit_COUNT.
metaGCNumKeysAffected = metric.Metadata{
	Name:        "queue.gc.info.numkeysaffected",
	Help:        "Number of keys with GC'able data",
	Measurement: "Keys",
	Unit:        metric.Unit_COUNT,
}
metaGCIntentsConsidered = metric.Metadata{
	Name:        "queue.gc.info.intentsconsidered",
	Help:        "Number of 'old' intents",
	Measurement: "Intents",
	Unit:        metric.Unit_COUNT,
}
metaGCIntentTxns = metric.Metadata{
	Name:        "queue.gc.info.intenttxns",
	Help:        "Number of associated distinct transactions",
	Measurement: "Txns",
	Unit:        metric.Unit_COUNT,
}
metaGCTransactionSpanScanned = metric.Metadata{
	Name:        "queue.gc.info.transactionspanscanned",
	Help:        "Number of entries in transaction spans scanned from the engine",
	Measurement: "Txn Entries",
	Unit:        metric.Unit_COUNT,
}
// The next four entries split GC'able transaction-span entries by the
// transaction state named in their Help text: aborted, committed, staging,
// pending.
metaGCTransactionSpanGCAborted = metric.Metadata{
	Name:        "queue.gc.info.transactionspangcaborted",
	Help:        "Number of GC'able entries corresponding to aborted txns",
	Measurement: "Txn Entries",
	Unit:        metric.Unit_COUNT,
}
metaGCTransactionSpanGCCommitted = metric.Metadata{
	Name:        "queue.gc.info.transactionspangccommitted",
	Help:        "Number of GC'able entries corresponding to committed txns",
	Measurement: "Txn Entries",
	Unit:        metric.Unit_COUNT,
}
metaGCTransactionSpanGCStaging = metric.Metadata{
	Name:        "queue.gc.info.transactionspangcstaging",
	Help:        "Number of GC'able entries corresponding to staging txns",
	Measurement: "Txn Entries",
	Unit:        metric.Unit_COUNT,
}
metaGCTransactionSpanGCPending = metric.Metadata{
	Name:        "queue.gc.info.transactionspangcpending",
	Help:        "Number of GC'able entries corresponding to pending txns",
	Measurement: "Txn Entries",
	Unit:        metric.Unit_COUNT,
}
metaGCAbortSpanScanned = metric.Metadata{
	Name:        "queue.gc.info.abortspanscanned",
	Help:        "Number of transactions present in the AbortSpan scanned from the engine",
	Measurement: "Txn Entries",
	Unit:        metric.Unit_COUNT,
}
metaGCAbortSpanConsidered = metric.Metadata{
	Name:        "queue.gc.info.abortspanconsidered",
	Help:        "Number of AbortSpan entries old enough to be considered for removal",
	Measurement: "Txn Entries",
	Unit:        metric.Unit_COUNT,
}
metaGCAbortSpanGCNum = metric.Metadata{
	Name:        "queue.gc.info.abortspangcnum",
	Help:        "Number of AbortSpan entries fit for removal",
	Measurement: "Txn Entries",
	Unit:        metric.Unit_COUNT,
}
metaGCPushTxn = metric.Metadata{
	Name:        "queue.gc.info.pushtxn",
	Help:        "Number of attempted pushes",
	Measurement: "Pushes",
	Unit:        metric.Unit_COUNT,
}
metaGCResolveTotal = metric.Metadata{
	Name:        "queue.gc.info.resolvetotal",
	Help:        "Number of attempted intent resolutions",
	Measurement: "Intent Resolutions",
	Unit:        metric.Unit_COUNT,
}
metaGCResolveSuccess = metric.Metadata{
	Name:        "queue.gc.info.resolvesuccess",
	Help:        "Number of successful intent resolutions",
	Measurement: "Intent Resolutions",
	Unit:        metric.Unit_COUNT,
}

// Slow request metrics.
// NOTE(review): metaLatchRequests names the "requests.slow.latch" metric but,
// unlike metaSlowLeaseRequests and metaSlowRaftRequests, its identifier has
// no "Slow" prefix. Renaming would touch users outside this block.
metaLatchRequests = metric.Metadata{
	Name:        "requests.slow.latch",
	Help:        "Number of requests that have been stuck for a long time acquiring latches",
	Measurement: "Requests",
	Unit:        metric.Unit_COUNT,
}
metaSlowLeaseRequests = metric.Metadata{
	Name:        "requests.slow.lease",
	Help:        "Number of requests that have been stuck for a long time acquiring a lease",
	Measurement: "Requests",
	Unit:        metric.Unit_COUNT,
}
metaSlowRaftRequests = metric.Metadata{
	Name:        "requests.slow.raft",
	Help:        "Number of requests that have been stuck for a long time in raft",
	Measurement: "Requests",
	Unit:        metric.Unit_COUNT,
}

// Backpressure metrics.
945 metaBackpressuredOnSplitRequests = metric.Metadata{ 946 Name: "requests.backpressure.split", 947 Help: "Number of backpressured writes waiting on a Range split", 948 Measurement: "Writes", 949 Unit: metric.Unit_COUNT, 950 } 951 952 // AddSSTable metrics. 953 metaAddSSTableProposals = metric.Metadata{ 954 Name: "addsstable.proposals", 955 Help: "Number of SSTable ingestions proposed (i.e. sent to Raft by lease holders)", 956 Measurement: "Ingestions", 957 Unit: metric.Unit_COUNT, 958 } 959 metaAddSSTableApplications = metric.Metadata{ 960 Name: "addsstable.applications", 961 Help: "Number of SSTable ingestions applied (i.e. applied by Replicas)", 962 Measurement: "Ingestions", 963 Unit: metric.Unit_COUNT, 964 } 965 metaAddSSTableApplicationCopies = metric.Metadata{ 966 Name: "addsstable.copies", 967 Help: "number of SSTable ingestions that required copying files during application", 968 Measurement: "Ingestions", 969 Unit: metric.Unit_COUNT, 970 } 971 metaAddSSTableEvalTotalDelay = metric.Metadata{ 972 Name: "addsstable.delay.total", 973 Help: "Amount by which evaluation of AddSSTable requests was delayed", 974 Measurement: "Nanoseconds", 975 Unit: metric.Unit_NANOSECONDS, 976 } 977 metaAddSSTableEvalEngineDelay = metric.Metadata{ 978 Name: "addsstable.delay.enginebackpressure", 979 Help: "Amount by which evaluation of AddSSTable requests was delayed by storage-engine backpressure", 980 Measurement: "Nanoseconds", 981 Unit: metric.Unit_NANOSECONDS, 982 } 983 984 // Encryption-at-rest metrics. 985 // TODO(mberhault): metrics for key age, per-key file/bytes counts. 986 metaEncryptionAlgorithm = metric.Metadata{ 987 Name: "rocksdb.encryption.algorithm", 988 Help: "algorithm in use for encryption-at-rest, see ccl/storageccl/engineccl/enginepbccl/key_registry.proto", 989 Measurement: "Encryption At Rest", 990 Unit: metric.Unit_CONST, 991 } 992 993 // Closed timestamp metrics. 
994 metaClosedTimestampMaxBehindNanos = metric.Metadata{ 995 Name: "kv.closed_timestamp.max_behind_nanos", 996 Help: "Largest latency between realtime and replica max closed timestamp", 997 Measurement: "Nanoseconds", 998 Unit: metric.Unit_NANOSECONDS, 999 } 1000 ) 1001 1002 // StoreMetrics is the set of metrics for a given store. 1003 type StoreMetrics struct { 1004 registry *metric.Registry 1005 1006 // Replica metrics. 1007 ReplicaCount *metric.Gauge // Does not include uninitialized or reserved replicas. 1008 ReservedReplicaCount *metric.Gauge 1009 RaftLeaderCount *metric.Gauge 1010 RaftLeaderNotLeaseHolderCount *metric.Gauge 1011 LeaseHolderCount *metric.Gauge 1012 QuiescentCount *metric.Gauge 1013 1014 // Range metrics. 1015 RangeCount *metric.Gauge 1016 UnavailableRangeCount *metric.Gauge 1017 UnderReplicatedRangeCount *metric.Gauge 1018 OverReplicatedRangeCount *metric.Gauge 1019 1020 // Lease request metrics for successful and failed lease requests. These 1021 // count proposals (i.e. it does not matter how many replicas apply the 1022 // lease). 1023 LeaseRequestSuccessCount *metric.Counter 1024 LeaseRequestErrorCount *metric.Counter 1025 LeaseTransferSuccessCount *metric.Counter 1026 LeaseTransferErrorCount *metric.Counter 1027 LeaseExpirationCount *metric.Gauge 1028 LeaseEpochCount *metric.Gauge 1029 1030 // Storage metrics. 
1031 LiveBytes *metric.Gauge 1032 KeyBytes *metric.Gauge 1033 ValBytes *metric.Gauge 1034 TotalBytes *metric.Gauge 1035 IntentBytes *metric.Gauge 1036 LiveCount *metric.Gauge 1037 KeyCount *metric.Gauge 1038 ValCount *metric.Gauge 1039 IntentCount *metric.Gauge 1040 IntentAge *metric.Gauge 1041 GcBytesAge *metric.Gauge 1042 LastUpdateNanos *metric.Gauge 1043 ResolveCommitCount *metric.Counter 1044 ResolveAbortCount *metric.Counter 1045 ResolvePoisonCount *metric.Counter 1046 Capacity *metric.Gauge 1047 Available *metric.Gauge 1048 Used *metric.Gauge 1049 Reserved *metric.Gauge 1050 SysBytes *metric.Gauge 1051 SysCount *metric.Gauge 1052 1053 // Rebalancing metrics. 1054 AverageQueriesPerSecond *metric.GaugeFloat64 1055 AverageWritesPerSecond *metric.GaugeFloat64 1056 1057 // Follower read metrics. 1058 FollowerReadsCount *metric.Counter 1059 1060 // RocksDB metrics. 1061 RdbBlockCacheHits *metric.Gauge 1062 RdbBlockCacheMisses *metric.Gauge 1063 RdbBlockCacheUsage *metric.Gauge 1064 RdbBlockCachePinnedUsage *metric.Gauge 1065 RdbBloomFilterPrefixChecked *metric.Gauge 1066 RdbBloomFilterPrefixUseful *metric.Gauge 1067 RdbMemtableTotalSize *metric.Gauge 1068 RdbFlushes *metric.Gauge 1069 RdbFlushedBytes *metric.Gauge 1070 RdbCompactions *metric.Gauge 1071 RdbIngestedBytes *metric.Gauge 1072 RdbCompactedBytesRead *metric.Gauge 1073 RdbCompactedBytesWritten *metric.Gauge 1074 RdbTableReadersMemEstimate *metric.Gauge 1075 RdbReadAmplification *metric.Gauge 1076 RdbNumSSTables *metric.Gauge 1077 RdbPendingCompaction *metric.Gauge 1078 1079 // TODO(mrtracy): This should be removed as part of #4465. This is only 1080 // maintained to keep the current structure of NodeStatus; it would be 1081 // better to convert the Gauges above into counters which are adjusted 1082 // accordingly. 1083 1084 // Range event metrics. 
1085 RangeSplits *metric.Counter 1086 RangeMerges *metric.Counter 1087 RangeAdds *metric.Counter 1088 RangeRemoves *metric.Counter 1089 RangeSnapshotsGenerated *metric.Counter 1090 RangeSnapshotsNormalApplied *metric.Counter 1091 RangeSnapshotsLearnerApplied *metric.Counter 1092 RangeRaftLeaderTransfers *metric.Counter 1093 1094 // Raft processing metrics. 1095 RaftTicks *metric.Counter 1096 RaftWorkingDurationNanos *metric.Counter 1097 RaftTickingDurationNanos *metric.Counter 1098 RaftCommandsApplied *metric.Counter 1099 RaftLogCommitLatency *metric.Histogram 1100 RaftCommandCommitLatency *metric.Histogram 1101 RaftHandleReadyLatency *metric.Histogram 1102 RaftApplyCommittedLatency *metric.Histogram 1103 1104 // Raft message metrics. 1105 RaftRcvdMsgProp *metric.Counter 1106 RaftRcvdMsgApp *metric.Counter 1107 RaftRcvdMsgAppResp *metric.Counter 1108 RaftRcvdMsgVote *metric.Counter 1109 RaftRcvdMsgVoteResp *metric.Counter 1110 RaftRcvdMsgPreVote *metric.Counter 1111 RaftRcvdMsgPreVoteResp *metric.Counter 1112 RaftRcvdMsgSnap *metric.Counter 1113 RaftRcvdMsgHeartbeat *metric.Counter 1114 RaftRcvdMsgHeartbeatResp *metric.Counter 1115 RaftRcvdMsgTransferLeader *metric.Counter 1116 RaftRcvdMsgTimeoutNow *metric.Counter 1117 RaftRcvdMsgDropped *metric.Counter 1118 1119 // Raft log metrics. 1120 RaftLogFollowerBehindCount *metric.Gauge 1121 RaftLogTruncated *metric.Counter 1122 1123 // An array for conveniently finding the appropriate metric. The individual 1124 // metric references must exist as AddMetricStruct adds them by reflection 1125 // on this struct and does not process array types. 1126 // TODO(arjun): eliminate this duplication. 1127 raftRcvdMessages [maxRaftMsgType + 1]*metric.Counter 1128 1129 RaftEnqueuedPending *metric.Gauge 1130 RaftCoalescedHeartbeatsPending *metric.Gauge 1131 1132 // Replica queue metrics. 
1133 GCQueueSuccesses *metric.Counter 1134 GCQueueFailures *metric.Counter 1135 GCQueuePending *metric.Gauge 1136 GCQueueProcessingNanos *metric.Counter 1137 MergeQueueSuccesses *metric.Counter 1138 MergeQueueFailures *metric.Counter 1139 MergeQueuePending *metric.Gauge 1140 MergeQueueProcessingNanos *metric.Counter 1141 MergeQueuePurgatory *metric.Gauge 1142 RaftLogQueueSuccesses *metric.Counter 1143 RaftLogQueueFailures *metric.Counter 1144 RaftLogQueuePending *metric.Gauge 1145 RaftLogQueueProcessingNanos *metric.Counter 1146 RaftSnapshotQueueSuccesses *metric.Counter 1147 RaftSnapshotQueueFailures *metric.Counter 1148 RaftSnapshotQueuePending *metric.Gauge 1149 RaftSnapshotQueueProcessingNanos *metric.Counter 1150 ConsistencyQueueSuccesses *metric.Counter 1151 ConsistencyQueueFailures *metric.Counter 1152 ConsistencyQueuePending *metric.Gauge 1153 ConsistencyQueueProcessingNanos *metric.Counter 1154 ReplicaGCQueueSuccesses *metric.Counter 1155 ReplicaGCQueueFailures *metric.Counter 1156 ReplicaGCQueuePending *metric.Gauge 1157 ReplicaGCQueueProcessingNanos *metric.Counter 1158 ReplicateQueueSuccesses *metric.Counter 1159 ReplicateQueueFailures *metric.Counter 1160 ReplicateQueuePending *metric.Gauge 1161 ReplicateQueueProcessingNanos *metric.Counter 1162 ReplicateQueuePurgatory *metric.Gauge 1163 SplitQueueSuccesses *metric.Counter 1164 SplitQueueFailures *metric.Counter 1165 SplitQueuePending *metric.Gauge 1166 SplitQueueProcessingNanos *metric.Counter 1167 SplitQueuePurgatory *metric.Gauge 1168 TimeSeriesMaintenanceQueueSuccesses *metric.Counter 1169 TimeSeriesMaintenanceQueueFailures *metric.Counter 1170 TimeSeriesMaintenanceQueuePending *metric.Gauge 1171 TimeSeriesMaintenanceQueueProcessingNanos *metric.Counter 1172 1173 // GCInfo cumulative totals. 
1174 GCNumKeysAffected *metric.Counter 1175 GCIntentsConsidered *metric.Counter 1176 GCIntentTxns *metric.Counter 1177 GCTransactionSpanScanned *metric.Counter 1178 GCTransactionSpanGCAborted *metric.Counter 1179 GCTransactionSpanGCCommitted *metric.Counter 1180 GCTransactionSpanGCStaging *metric.Counter 1181 GCTransactionSpanGCPending *metric.Counter 1182 GCAbortSpanScanned *metric.Counter 1183 GCAbortSpanConsidered *metric.Counter 1184 GCAbortSpanGCNum *metric.Counter 1185 GCPushTxn *metric.Counter 1186 GCResolveTotal *metric.Counter 1187 GCResolveSuccess *metric.Counter 1188 1189 // Slow request counts. 1190 SlowLatchRequests *metric.Gauge 1191 SlowLeaseRequests *metric.Gauge 1192 SlowRaftRequests *metric.Gauge 1193 1194 // Backpressure counts. 1195 BackpressuredOnSplitRequests *metric.Gauge 1196 1197 // AddSSTable stats: how many AddSSTable commands were proposed and how many 1198 // were applied? How many applications required writing a copy? 1199 AddSSTableProposals *metric.Counter 1200 AddSSTableApplications *metric.Counter 1201 AddSSTableApplicationCopies *metric.Counter 1202 AddSSTableProposalTotalDelay *metric.Counter 1203 AddSSTableProposalEngineDelay *metric.Counter 1204 1205 // Encryption-at-rest stats. 1206 // EncryptionAlgorithm is an enum representing the cipher in use, so we use a gauge. 1207 EncryptionAlgorithm *metric.Gauge 1208 1209 // RangeFeed counts. 1210 RangeFeedMetrics *rangefeed.Metrics 1211 1212 // Closed timestamp metrics. 1213 ClosedTimestampMaxBehindNanos *metric.Gauge 1214 } 1215 1216 func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { 1217 storeRegistry := metric.NewRegistry() 1218 sm := &StoreMetrics{ 1219 registry: storeRegistry, 1220 1221 // Replica metrics. 
1222 ReplicaCount: metric.NewGauge(metaReplicaCount), 1223 ReservedReplicaCount: metric.NewGauge(metaReservedReplicaCount), 1224 RaftLeaderCount: metric.NewGauge(metaRaftLeaderCount), 1225 RaftLeaderNotLeaseHolderCount: metric.NewGauge(metaRaftLeaderNotLeaseHolderCount), 1226 LeaseHolderCount: metric.NewGauge(metaLeaseHolderCount), 1227 QuiescentCount: metric.NewGauge(metaQuiescentCount), 1228 1229 // Range metrics. 1230 RangeCount: metric.NewGauge(metaRangeCount), 1231 UnavailableRangeCount: metric.NewGauge(metaUnavailableRangeCount), 1232 UnderReplicatedRangeCount: metric.NewGauge(metaUnderReplicatedRangeCount), 1233 OverReplicatedRangeCount: metric.NewGauge(metaOverReplicatedRangeCount), 1234 1235 // Lease request metrics. 1236 LeaseRequestSuccessCount: metric.NewCounter(metaLeaseRequestSuccessCount), 1237 LeaseRequestErrorCount: metric.NewCounter(metaLeaseRequestErrorCount), 1238 LeaseTransferSuccessCount: metric.NewCounter(metaLeaseTransferSuccessCount), 1239 LeaseTransferErrorCount: metric.NewCounter(metaLeaseTransferErrorCount), 1240 LeaseExpirationCount: metric.NewGauge(metaLeaseExpirationCount), 1241 LeaseEpochCount: metric.NewGauge(metaLeaseEpochCount), 1242 1243 // Storage metrics. 
1244 LiveBytes: metric.NewGauge(metaLiveBytes), 1245 KeyBytes: metric.NewGauge(metaKeyBytes), 1246 ValBytes: metric.NewGauge(metaValBytes), 1247 TotalBytes: metric.NewGauge(metaTotalBytes), 1248 IntentBytes: metric.NewGauge(metaIntentBytes), 1249 LiveCount: metric.NewGauge(metaLiveCount), 1250 KeyCount: metric.NewGauge(metaKeyCount), 1251 ValCount: metric.NewGauge(metaValCount), 1252 IntentCount: metric.NewGauge(metaIntentCount), 1253 IntentAge: metric.NewGauge(metaIntentAge), 1254 GcBytesAge: metric.NewGauge(metaGcBytesAge), 1255 LastUpdateNanos: metric.NewGauge(metaLastUpdateNanos), 1256 1257 ResolveCommitCount: metric.NewCounter(metaResolveCommit), 1258 ResolveAbortCount: metric.NewCounter(metaResolveAbort), 1259 ResolvePoisonCount: metric.NewCounter(metaResolvePoison), 1260 1261 Capacity: metric.NewGauge(metaCapacity), 1262 Available: metric.NewGauge(metaAvailable), 1263 Used: metric.NewGauge(metaUsed), 1264 Reserved: metric.NewGauge(metaReserved), 1265 SysBytes: metric.NewGauge(metaSysBytes), 1266 SysCount: metric.NewGauge(metaSysCount), 1267 1268 // Rebalancing metrics. 1269 AverageQueriesPerSecond: metric.NewGaugeFloat64(metaAverageQueriesPerSecond), 1270 AverageWritesPerSecond: metric.NewGaugeFloat64(metaAverageWritesPerSecond), 1271 1272 // Follower reads metrics. 1273 FollowerReadsCount: metric.NewCounter(metaFollowerReadsCount), 1274 1275 // RocksDB metrics. 
1276 RdbBlockCacheHits: metric.NewGauge(metaRdbBlockCacheHits), 1277 RdbBlockCacheMisses: metric.NewGauge(metaRdbBlockCacheMisses), 1278 RdbBlockCacheUsage: metric.NewGauge(metaRdbBlockCacheUsage), 1279 RdbBlockCachePinnedUsage: metric.NewGauge(metaRdbBlockCachePinnedUsage), 1280 RdbBloomFilterPrefixChecked: metric.NewGauge(metaRdbBloomFilterPrefixChecked), 1281 RdbBloomFilterPrefixUseful: metric.NewGauge(metaRdbBloomFilterPrefixUseful), 1282 RdbMemtableTotalSize: metric.NewGauge(metaRdbMemtableTotalSize), 1283 RdbFlushes: metric.NewGauge(metaRdbFlushes), 1284 RdbFlushedBytes: metric.NewGauge(metaRdbFlushedBytes), 1285 RdbCompactions: metric.NewGauge(metaRdbCompactions), 1286 RdbIngestedBytes: metric.NewGauge(metaRdbIngestedBytes), 1287 RdbCompactedBytesRead: metric.NewGauge(metaRdbCompactedBytesRead), 1288 RdbCompactedBytesWritten: metric.NewGauge(metaRdbCompactedBytesWritten), 1289 RdbTableReadersMemEstimate: metric.NewGauge(metaRdbTableReadersMemEstimate), 1290 RdbReadAmplification: metric.NewGauge(metaRdbReadAmplification), 1291 RdbNumSSTables: metric.NewGauge(metaRdbNumSSTables), 1292 RdbPendingCompaction: metric.NewGauge(metaRdbPendingCompaction), 1293 1294 // Range event metrics. 1295 RangeSplits: metric.NewCounter(metaRangeSplits), 1296 RangeMerges: metric.NewCounter(metaRangeMerges), 1297 RangeAdds: metric.NewCounter(metaRangeAdds), 1298 RangeRemoves: metric.NewCounter(metaRangeRemoves), 1299 RangeSnapshotsGenerated: metric.NewCounter(metaRangeSnapshotsGenerated), 1300 RangeSnapshotsNormalApplied: metric.NewCounter(metaRangeSnapshotsNormalApplied), 1301 RangeSnapshotsLearnerApplied: metric.NewCounter(metaRangeSnapshotsLearnerApplied), 1302 RangeRaftLeaderTransfers: metric.NewCounter(metaRangeRaftLeaderTransfers), 1303 1304 // Raft processing metrics. 
1305 RaftTicks: metric.NewCounter(metaRaftTicks), 1306 RaftWorkingDurationNanos: metric.NewCounter(metaRaftWorkingDurationNanos), 1307 RaftTickingDurationNanos: metric.NewCounter(metaRaftTickingDurationNanos), 1308 RaftCommandsApplied: metric.NewCounter(metaRaftCommandsApplied), 1309 RaftLogCommitLatency: metric.NewLatency(metaRaftLogCommitLatency, histogramWindow), 1310 RaftCommandCommitLatency: metric.NewLatency(metaRaftCommandCommitLatency, histogramWindow), 1311 RaftHandleReadyLatency: metric.NewLatency(metaRaftHandleReadyLatency, histogramWindow), 1312 RaftApplyCommittedLatency: metric.NewLatency(metaRaftApplyCommittedLatency, histogramWindow), 1313 1314 // Raft message metrics. 1315 RaftRcvdMsgProp: metric.NewCounter(metaRaftRcvdProp), 1316 RaftRcvdMsgApp: metric.NewCounter(metaRaftRcvdApp), 1317 RaftRcvdMsgAppResp: metric.NewCounter(metaRaftRcvdAppResp), 1318 RaftRcvdMsgVote: metric.NewCounter(metaRaftRcvdVote), 1319 RaftRcvdMsgVoteResp: metric.NewCounter(metaRaftRcvdVoteResp), 1320 RaftRcvdMsgPreVote: metric.NewCounter(metaRaftRcvdPreVote), 1321 RaftRcvdMsgPreVoteResp: metric.NewCounter(metaRaftRcvdPreVoteResp), 1322 RaftRcvdMsgSnap: metric.NewCounter(metaRaftRcvdSnap), 1323 RaftRcvdMsgHeartbeat: metric.NewCounter(metaRaftRcvdHeartbeat), 1324 RaftRcvdMsgHeartbeatResp: metric.NewCounter(metaRaftRcvdHeartbeatResp), 1325 RaftRcvdMsgTransferLeader: metric.NewCounter(metaRaftRcvdTransferLeader), 1326 RaftRcvdMsgTimeoutNow: metric.NewCounter(metaRaftRcvdTimeoutNow), 1327 RaftRcvdMsgDropped: metric.NewCounter(metaRaftRcvdDropped), 1328 1329 RaftEnqueuedPending: metric.NewGauge(metaRaftEnqueuedPending), 1330 1331 // This Gauge measures the number of heartbeats queued up just before 1332 // the queue is cleared, to avoid flapping wildly. 1333 RaftCoalescedHeartbeatsPending: metric.NewGauge(metaRaftCoalescedHeartbeatsPending), 1334 1335 // Raft log metrics. 
1336 RaftLogFollowerBehindCount: metric.NewGauge(metaRaftLogFollowerBehindCount), 1337 RaftLogTruncated: metric.NewCounter(metaRaftLogTruncated), 1338 1339 // Replica queue metrics. 1340 GCQueueSuccesses: metric.NewCounter(metaGCQueueSuccesses), 1341 GCQueueFailures: metric.NewCounter(metaGCQueueFailures), 1342 GCQueuePending: metric.NewGauge(metaGCQueuePending), 1343 GCQueueProcessingNanos: metric.NewCounter(metaGCQueueProcessingNanos), 1344 MergeQueueSuccesses: metric.NewCounter(metaMergeQueueSuccesses), 1345 MergeQueueFailures: metric.NewCounter(metaMergeQueueFailures), 1346 MergeQueuePending: metric.NewGauge(metaMergeQueuePending), 1347 MergeQueueProcessingNanos: metric.NewCounter(metaMergeQueueProcessingNanos), 1348 MergeQueuePurgatory: metric.NewGauge(metaMergeQueuePurgatory), 1349 RaftLogQueueSuccesses: metric.NewCounter(metaRaftLogQueueSuccesses), 1350 RaftLogQueueFailures: metric.NewCounter(metaRaftLogQueueFailures), 1351 RaftLogQueuePending: metric.NewGauge(metaRaftLogQueuePending), 1352 RaftLogQueueProcessingNanos: metric.NewCounter(metaRaftLogQueueProcessingNanos), 1353 RaftSnapshotQueueSuccesses: metric.NewCounter(metaRaftSnapshotQueueSuccesses), 1354 RaftSnapshotQueueFailures: metric.NewCounter(metaRaftSnapshotQueueFailures), 1355 RaftSnapshotQueuePending: metric.NewGauge(metaRaftSnapshotQueuePending), 1356 RaftSnapshotQueueProcessingNanos: metric.NewCounter(metaRaftSnapshotQueueProcessingNanos), 1357 ConsistencyQueueSuccesses: metric.NewCounter(metaConsistencyQueueSuccesses), 1358 ConsistencyQueueFailures: metric.NewCounter(metaConsistencyQueueFailures), 1359 ConsistencyQueuePending: metric.NewGauge(metaConsistencyQueuePending), 1360 ConsistencyQueueProcessingNanos: metric.NewCounter(metaConsistencyQueueProcessingNanos), 1361 ReplicaGCQueueSuccesses: metric.NewCounter(metaReplicaGCQueueSuccesses), 1362 ReplicaGCQueueFailures: metric.NewCounter(metaReplicaGCQueueFailures), 1363 ReplicaGCQueuePending: metric.NewGauge(metaReplicaGCQueuePending), 1364 
ReplicaGCQueueProcessingNanos: metric.NewCounter(metaReplicaGCQueueProcessingNanos), 1365 ReplicateQueueSuccesses: metric.NewCounter(metaReplicateQueueSuccesses), 1366 ReplicateQueueFailures: metric.NewCounter(metaReplicateQueueFailures), 1367 ReplicateQueuePending: metric.NewGauge(metaReplicateQueuePending), 1368 ReplicateQueueProcessingNanos: metric.NewCounter(metaReplicateQueueProcessingNanos), 1369 ReplicateQueuePurgatory: metric.NewGauge(metaReplicateQueuePurgatory), 1370 SplitQueueSuccesses: metric.NewCounter(metaSplitQueueSuccesses), 1371 SplitQueueFailures: metric.NewCounter(metaSplitQueueFailures), 1372 SplitQueuePending: metric.NewGauge(metaSplitQueuePending), 1373 SplitQueueProcessingNanos: metric.NewCounter(metaSplitQueueProcessingNanos), 1374 SplitQueuePurgatory: metric.NewGauge(metaSplitQueuePurgatory), 1375 TimeSeriesMaintenanceQueueSuccesses: metric.NewCounter(metaTimeSeriesMaintenanceQueueSuccesses), 1376 TimeSeriesMaintenanceQueueFailures: metric.NewCounter(metaTimeSeriesMaintenanceQueueFailures), 1377 TimeSeriesMaintenanceQueuePending: metric.NewGauge(metaTimeSeriesMaintenanceQueuePending), 1378 TimeSeriesMaintenanceQueueProcessingNanos: metric.NewCounter(metaTimeSeriesMaintenanceQueueProcessingNanos), 1379 1380 // GCInfo cumulative totals. 
1381 GCNumKeysAffected: metric.NewCounter(metaGCNumKeysAffected), 1382 GCIntentsConsidered: metric.NewCounter(metaGCIntentsConsidered), 1383 GCIntentTxns: metric.NewCounter(metaGCIntentTxns), 1384 GCTransactionSpanScanned: metric.NewCounter(metaGCTransactionSpanScanned), 1385 GCTransactionSpanGCAborted: metric.NewCounter(metaGCTransactionSpanGCAborted), 1386 GCTransactionSpanGCCommitted: metric.NewCounter(metaGCTransactionSpanGCCommitted), 1387 GCTransactionSpanGCStaging: metric.NewCounter(metaGCTransactionSpanGCStaging), 1388 GCTransactionSpanGCPending: metric.NewCounter(metaGCTransactionSpanGCPending), 1389 GCAbortSpanScanned: metric.NewCounter(metaGCAbortSpanScanned), 1390 GCAbortSpanConsidered: metric.NewCounter(metaGCAbortSpanConsidered), 1391 GCAbortSpanGCNum: metric.NewCounter(metaGCAbortSpanGCNum), 1392 GCPushTxn: metric.NewCounter(metaGCPushTxn), 1393 GCResolveTotal: metric.NewCounter(metaGCResolveTotal), 1394 GCResolveSuccess: metric.NewCounter(metaGCResolveSuccess), 1395 1396 // Wedge request counters. 1397 SlowLatchRequests: metric.NewGauge(metaLatchRequests), 1398 SlowLeaseRequests: metric.NewGauge(metaSlowLeaseRequests), 1399 SlowRaftRequests: metric.NewGauge(metaSlowRaftRequests), 1400 1401 // Backpressure counters. 1402 BackpressuredOnSplitRequests: metric.NewGauge(metaBackpressuredOnSplitRequests), 1403 1404 // AddSSTable proposal + applications counters. 1405 AddSSTableProposals: metric.NewCounter(metaAddSSTableProposals), 1406 AddSSTableApplications: metric.NewCounter(metaAddSSTableApplications), 1407 AddSSTableApplicationCopies: metric.NewCounter(metaAddSSTableApplicationCopies), 1408 AddSSTableProposalTotalDelay: metric.NewCounter(metaAddSSTableEvalTotalDelay), 1409 AddSSTableProposalEngineDelay: metric.NewCounter(metaAddSSTableEvalEngineDelay), 1410 1411 // Encryption-at-rest. 1412 EncryptionAlgorithm: metric.NewGauge(metaEncryptionAlgorithm), 1413 1414 // RangeFeed counters. 
1415 RangeFeedMetrics: rangefeed.NewMetrics(), 1416 1417 // Closed timestamp metrics. 1418 ClosedTimestampMaxBehindNanos: metric.NewGauge(metaClosedTimestampMaxBehindNanos), 1419 } 1420 1421 sm.raftRcvdMessages[raftpb.MsgProp] = sm.RaftRcvdMsgProp 1422 sm.raftRcvdMessages[raftpb.MsgApp] = sm.RaftRcvdMsgApp 1423 sm.raftRcvdMessages[raftpb.MsgAppResp] = sm.RaftRcvdMsgAppResp 1424 sm.raftRcvdMessages[raftpb.MsgVote] = sm.RaftRcvdMsgVote 1425 sm.raftRcvdMessages[raftpb.MsgVoteResp] = sm.RaftRcvdMsgVoteResp 1426 sm.raftRcvdMessages[raftpb.MsgPreVote] = sm.RaftRcvdMsgPreVote 1427 sm.raftRcvdMessages[raftpb.MsgPreVoteResp] = sm.RaftRcvdMsgPreVoteResp 1428 sm.raftRcvdMessages[raftpb.MsgSnap] = sm.RaftRcvdMsgSnap 1429 sm.raftRcvdMessages[raftpb.MsgHeartbeat] = sm.RaftRcvdMsgHeartbeat 1430 sm.raftRcvdMessages[raftpb.MsgHeartbeatResp] = sm.RaftRcvdMsgHeartbeatResp 1431 sm.raftRcvdMessages[raftpb.MsgTransferLeader] = sm.RaftRcvdMsgTransferLeader 1432 sm.raftRcvdMessages[raftpb.MsgTimeoutNow] = sm.RaftRcvdMsgTimeoutNow 1433 1434 storeRegistry.AddMetricStruct(sm) 1435 1436 return sm 1437 } 1438 1439 // incMVCCGauges increments each individual metric from an MVCCStats delta. The 1440 // method uses a series of atomic operations without any external locking, so a 1441 // single snapshot of these gauges in the registry might mix the values of two 1442 // subsequent updates. 
1443 func (sm *StoreMetrics) incMVCCGauges(delta enginepb.MVCCStats) { 1444 sm.LiveBytes.Inc(delta.LiveBytes) 1445 sm.KeyBytes.Inc(delta.KeyBytes) 1446 sm.ValBytes.Inc(delta.ValBytes) 1447 sm.TotalBytes.Inc(delta.Total()) 1448 sm.IntentBytes.Inc(delta.IntentBytes) 1449 sm.LiveCount.Inc(delta.LiveCount) 1450 sm.KeyCount.Inc(delta.KeyCount) 1451 sm.ValCount.Inc(delta.ValCount) 1452 sm.IntentCount.Inc(delta.IntentCount) 1453 sm.IntentAge.Inc(delta.IntentAge) 1454 sm.GcBytesAge.Inc(delta.GCBytesAge) 1455 sm.LastUpdateNanos.Inc(delta.LastUpdateNanos) 1456 sm.SysBytes.Inc(delta.SysBytes) 1457 sm.SysCount.Inc(delta.SysCount) 1458 } 1459 1460 func (sm *StoreMetrics) addMVCCStats(delta enginepb.MVCCStats) { 1461 sm.incMVCCGauges(delta) 1462 } 1463 1464 func (sm *StoreMetrics) subtractMVCCStats(delta enginepb.MVCCStats) { 1465 var neg enginepb.MVCCStats 1466 neg.Subtract(delta) 1467 sm.incMVCCGauges(neg) 1468 } 1469 1470 func (sm *StoreMetrics) updateRocksDBStats(stats storage.Stats) { 1471 // We do not grab a lock here, because it's not possible to get a point-in- 1472 // time snapshot of RocksDB stats. Retrieving RocksDB stats doesn't grab any 1473 // locks, and there's no way to retrieve multiple stats in a single operation. 
1474 sm.RdbBlockCacheHits.Update(stats.BlockCacheHits) 1475 sm.RdbBlockCacheMisses.Update(stats.BlockCacheMisses) 1476 sm.RdbBlockCacheUsage.Update(stats.BlockCacheUsage) 1477 sm.RdbBlockCachePinnedUsage.Update(stats.BlockCachePinnedUsage) 1478 sm.RdbBloomFilterPrefixUseful.Update(stats.BloomFilterPrefixUseful) 1479 sm.RdbBloomFilterPrefixChecked.Update(stats.BloomFilterPrefixChecked) 1480 sm.RdbMemtableTotalSize.Update(stats.MemtableTotalSize) 1481 sm.RdbFlushes.Update(stats.Flushes) 1482 sm.RdbFlushedBytes.Update(stats.FlushedBytes) 1483 sm.RdbCompactions.Update(stats.Compactions) 1484 sm.RdbIngestedBytes.Update(stats.IngestedBytes) 1485 sm.RdbCompactedBytesRead.Update(stats.CompactedBytesRead) 1486 sm.RdbCompactedBytesWritten.Update(stats.CompactedBytesWritten) 1487 sm.RdbTableReadersMemEstimate.Update(stats.TableReadersMemEstimate) 1488 } 1489 1490 func (sm *StoreMetrics) updateEnvStats(stats storage.EnvStats) { 1491 sm.EncryptionAlgorithm.Update(int64(stats.EncryptionType)) 1492 } 1493 1494 func (sm *StoreMetrics) handleMetricsResult(ctx context.Context, metric result.Metrics) { 1495 sm.LeaseRequestSuccessCount.Inc(int64(metric.LeaseRequestSuccess)) 1496 metric.LeaseRequestSuccess = 0 1497 sm.LeaseRequestErrorCount.Inc(int64(metric.LeaseRequestError)) 1498 metric.LeaseRequestError = 0 1499 sm.LeaseTransferSuccessCount.Inc(int64(metric.LeaseTransferSuccess)) 1500 metric.LeaseTransferSuccess = 0 1501 sm.LeaseTransferErrorCount.Inc(int64(metric.LeaseTransferError)) 1502 metric.LeaseTransferError = 0 1503 1504 sm.ResolveCommitCount.Inc(int64(metric.ResolveCommit)) 1505 metric.ResolveCommit = 0 1506 sm.ResolveAbortCount.Inc(int64(metric.ResolveAbort)) 1507 metric.ResolveAbort = 0 1508 sm.ResolvePoisonCount.Inc(int64(metric.ResolvePoison)) 1509 metric.ResolvePoison = 0 1510 1511 if metric != (result.Metrics{}) { 1512 log.Fatalf(ctx, "unhandled fields in metrics result: %+v", metric) 1513 } 1514 }