github.com/KinWaiYuen/client-go/v2@v2.5.4/metrics/metrics.go (about)

     1  // Copyright 2021 TiKV Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // NOTE: The code in this file is based on code from the
    16  // TiDB project, licensed under the Apache License v 2.0
    17  //
    18  // https://github.com/pingcap/tidb/tree/cc5e161ac06827589c4966674597c137cc9e809c/store/tikv/metrics/metrics.go
    19  //
    20  
    21  // Copyright 2021 PingCAP, Inc.
    22  //
    23  // Licensed under the Apache License, Version 2.0 (the "License");
    24  // you may not use this file except in compliance with the License.
    25  // You may obtain a copy of the License at
    26  //
    27  //     http://www.apache.org/licenses/LICENSE-2.0
    28  //
    29  // Unless required by applicable law or agreed to in writing, software
    30  // distributed under the License is distributed on an "AS IS" BASIS,
    31  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    32  // See the License for the specific language governing permissions and
    33  // limitations under the License.
    34  
    35  package metrics
    36  
    37  import (
    38  	"github.com/prometheus/client_golang/prometheus"
    39  	dto "github.com/prometheus/client_model/go"
    40  )
    41  
// Client metrics.
//
// These package-level collectors are (re)assigned by initMetrics, which runs
// once from init with the default naming and again on every InitMetrics call.
// They are nil until initMetrics has run. RegisterMetrics registers whatever
// values the variables hold at the time it is called.
var (
	TiKVTxnCmdHistogram                      *prometheus.HistogramVec
	TiKVBackoffHistogram                     *prometheus.HistogramVec
	TiKVSendReqHistogram                     *prometheus.HistogramVec
	TiKVCoprocessorHistogram                 *prometheus.HistogramVec
	TiKVLockResolverCounter                  *prometheus.CounterVec
	TiKVRegionErrorCounter                   *prometheus.CounterVec
	TiKVTxnWriteKVCountHistogram             prometheus.Histogram
	TiKVTxnWriteSizeHistogram                prometheus.Histogram
	TiKVRawkvCmdHistogram                    *prometheus.HistogramVec
	TiKVRawkvSizeHistogram                   *prometheus.HistogramVec
	TiKVTxnRegionsNumHistogram               *prometheus.HistogramVec
	TiKVLoadSafepointCounter                 *prometheus.CounterVec
	TiKVSecondaryLockCleanupFailureCounter   *prometheus.CounterVec
	TiKVRegionCacheCounter                   *prometheus.CounterVec
	TiKVLocalLatchWaitTimeHistogram          prometheus.Histogram
	TiKVStatusDuration                       *prometheus.HistogramVec
	TiKVStatusCounter                        *prometheus.CounterVec
	TiKVBatchWaitDuration                    prometheus.Histogram
	TiKVBatchSendLatency                     prometheus.Histogram
	TiKVBatchWaitOverLoad                    prometheus.Counter
	TiKVBatchPendingRequests                 *prometheus.HistogramVec
	TiKVBatchRequests                        *prometheus.HistogramVec
	TiKVBatchClientUnavailable               prometheus.Histogram
	TiKVBatchClientWaitEstablish             prometheus.Histogram
	TiKVBatchClientRecycle                   prometheus.Histogram
	TiKVBatchRecvLatency                     *prometheus.HistogramVec
	TiKVRangeTaskStats                       *prometheus.GaugeVec
	TiKVRangeTaskPushDuration                *prometheus.HistogramVec
	TiKVTokenWaitDuration                    prometheus.Histogram
	TiKVTxnHeartBeatHistogram                *prometheus.HistogramVec
	TiKVPessimisticLockKeysDuration          prometheus.Histogram
	TiKVTTLLifeTimeReachCounter              prometheus.Counter
	TiKVNoAvailableConnectionCounter         prometheus.Counter
	TiKVTwoPCTxnCounter                      *prometheus.CounterVec
	TiKVAsyncCommitTxnCounter                *prometheus.CounterVec
	TiKVOnePCTxnCounter                      *prometheus.CounterVec
	TiKVStoreLimitErrorCounter               *prometheus.CounterVec
	TiKVGRPCConnTransientFailureCounter      *prometheus.CounterVec
	TiKVPanicCounter                         *prometheus.CounterVec
	TiKVForwardRequestCounter                *prometheus.CounterVec
	TiKVTSFutureWaitDuration                 prometheus.Histogram
	TiKVSafeTSUpdateCounter                  *prometheus.CounterVec
	TiKVMinSafeTSGapSeconds                  *prometheus.GaugeVec
	TiKVReplicaSelectorFailureCounter        *prometheus.CounterVec
	TiKVRequestRetryTimesHistogram           prometheus.Histogram
	TiKVTxnCommitBackoffSeconds              prometheus.Histogram
	TiKVTxnCommitBackoffCount                prometheus.Histogram
	TiKVSmallReadDuration                    prometheus.Histogram
	TiKVUnsafeDestroyRangeFailuresCounterVec *prometheus.CounterVec
)
    94  
// Label constants.
//
// LblType, LblResult, LblStore, LblAddress, LblFromStore, LblToStore and
// LblStaleRead are passed to the metric vector constructors in initMetrics as
// label names. The remaining constants are not referenced in this file;
// NOTE(review): they are presumably label values supplied by callers (e.g. to
// WithLabelValues) — confirm against their usages. LabelBatchRecvLoop and
// LabelBatchSendLoop use a "Label" prefix rather than "Lbl"; they are exported,
// so the names are kept as they are.
const (
	LblType            = "type"
	LblResult          = "result"
	LblStore           = "store"
	LblCommit          = "commit"
	LblAbort           = "abort"
	LblRollback        = "rollback"
	LblBatchGet        = "batch_get"
	LblGet             = "get"
	LblLockKeys        = "lock_keys"
	LabelBatchRecvLoop = "batch-recv-loop"
	LabelBatchSendLoop = "batch-send-loop"
	LblAddress         = "address"
	LblFromStore       = "from_store"
	LblToStore         = "to_store"
	LblStaleRead       = "stale_read"
)
   113  
   114  func initMetrics(namespace, subsystem string) {
   115  	TiKVTxnCmdHistogram = prometheus.NewHistogramVec(
   116  		prometheus.HistogramOpts{
   117  			Namespace: namespace,
   118  			Subsystem: subsystem,
   119  			Name:      "txn_cmd_duration_seconds",
   120  			Help:      "Bucketed histogram of processing time of txn cmds.",
   121  			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 29), // 0.5ms ~ 1.5days
   122  		}, []string{LblType})
   123  
   124  	TiKVBackoffHistogram = prometheus.NewHistogramVec(
   125  		prometheus.HistogramOpts{
   126  			Namespace: namespace,
   127  			Subsystem: subsystem,
   128  			Name:      "backoff_seconds",
   129  			Help:      "total backoff seconds of a single backoffer.",
   130  			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 29), // 0.5ms ~ 1.5days
   131  		}, []string{LblType})
   132  
   133  	TiKVSendReqHistogram = prometheus.NewHistogramVec(
   134  		prometheus.HistogramOpts{
   135  			Namespace: namespace,
   136  			Subsystem: subsystem,
   137  			Name:      "request_seconds",
   138  			Help:      "Bucketed histogram of sending request duration.",
   139  			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 29), // 0.5ms ~ 1.5days
   140  		}, []string{LblType, LblStore, LblStaleRead})
   141  
   142  	TiKVCoprocessorHistogram = prometheus.NewHistogramVec(
   143  		prometheus.HistogramOpts{
   144  			Namespace: namespace,
   145  			Subsystem: subsystem,
   146  			Name:      "cop_duration_seconds",
   147  			Help:      "Run duration of a single coprocessor task, includes backoff time.",
   148  			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 29), // 0.5ms ~ 1.5days
   149  		}, []string{LblStore, LblStaleRead})
   150  
   151  	TiKVLockResolverCounter = prometheus.NewCounterVec(
   152  		prometheus.CounterOpts{
   153  			Namespace: namespace,
   154  			Subsystem: subsystem,
   155  			Name:      "lock_resolver_actions_total",
   156  			Help:      "Counter of lock resolver actions.",
   157  		}, []string{LblType})
   158  
   159  	TiKVRegionErrorCounter = prometheus.NewCounterVec(
   160  		prometheus.CounterOpts{
   161  			Namespace: namespace,
   162  			Subsystem: subsystem,
   163  			Name:      "region_err_total",
   164  			Help:      "Counter of region errors.",
   165  		}, []string{LblType})
   166  
   167  	TiKVTxnWriteKVCountHistogram = prometheus.NewHistogram(
   168  		prometheus.HistogramOpts{
   169  			Namespace: namespace,
   170  			Subsystem: subsystem,
   171  			Name:      "txn_write_kv_num",
   172  			Help:      "Count of kv pairs to write in a transaction.",
   173  			Buckets:   prometheus.ExponentialBuckets(1, 4, 17), // 1 ~ 4G
   174  		})
   175  
   176  	TiKVTxnWriteSizeHistogram = prometheus.NewHistogram(
   177  		prometheus.HistogramOpts{
   178  			Namespace: namespace,
   179  			Subsystem: subsystem,
   180  			Name:      "txn_write_size_bytes",
   181  			Help:      "Size of kv pairs to write in a transaction.",
   182  			Buckets:   prometheus.ExponentialBuckets(16, 4, 17), // 16Bytes ~ 64GB
   183  		})
   184  
   185  	TiKVRawkvCmdHistogram = prometheus.NewHistogramVec(
   186  		prometheus.HistogramOpts{
   187  			Namespace: namespace,
   188  			Subsystem: subsystem,
   189  			Name:      "rawkv_cmd_seconds",
   190  			Help:      "Bucketed histogram of processing time of rawkv cmds.",
   191  			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 29), // 0.5ms ~ 1.5days
   192  		}, []string{LblType})
   193  
   194  	TiKVRawkvSizeHistogram = prometheus.NewHistogramVec(
   195  		prometheus.HistogramOpts{
   196  			Namespace: namespace,
   197  			Subsystem: subsystem,
   198  			Name:      "rawkv_kv_size_bytes",
   199  			Help:      "Size of key/value to put, in bytes.",
   200  			Buckets:   prometheus.ExponentialBuckets(1, 2, 30), // 1Byte ~ 512MB
   201  		}, []string{LblType})
   202  
   203  	TiKVTxnRegionsNumHistogram = prometheus.NewHistogramVec(
   204  		prometheus.HistogramOpts{
   205  			Namespace: namespace,
   206  			Subsystem: subsystem,
   207  			Name:      "txn_regions_num",
   208  			Help:      "Number of regions in a transaction.",
   209  			Buckets:   prometheus.ExponentialBuckets(1, 2, 25), // 1 ~ 16M
   210  		}, []string{LblType})
   211  
   212  	TiKVLoadSafepointCounter = prometheus.NewCounterVec(
   213  		prometheus.CounterOpts{
   214  			Namespace: namespace,
   215  			Subsystem: subsystem,
   216  			Name:      "load_safepoint_total",
   217  			Help:      "Counter of load safepoint.",
   218  		}, []string{LblType})
   219  
   220  	TiKVSecondaryLockCleanupFailureCounter = prometheus.NewCounterVec(
   221  		prometheus.CounterOpts{
   222  			Namespace: namespace,
   223  			Subsystem: subsystem,
   224  			Name:      "lock_cleanup_task_total",
   225  			Help:      "failure statistic of secondary lock cleanup task.",
   226  		}, []string{LblType})
   227  
   228  	TiKVRegionCacheCounter = prometheus.NewCounterVec(
   229  		prometheus.CounterOpts{
   230  			Namespace: namespace,
   231  			Subsystem: subsystem,
   232  			Name:      "region_cache_operations_total",
   233  			Help:      "Counter of region cache.",
   234  		}, []string{LblType, LblResult})
   235  
   236  	TiKVLocalLatchWaitTimeHistogram = prometheus.NewHistogram(
   237  		prometheus.HistogramOpts{
   238  			Namespace: namespace,
   239  			Subsystem: subsystem,
   240  			Name:      "local_latch_wait_seconds",
   241  			Help:      "Wait time of a get local latch.",
   242  			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 20), // 0.5ms ~ 262s
   243  		})
   244  
   245  	TiKVStatusDuration = prometheus.NewHistogramVec(
   246  		prometheus.HistogramOpts{
   247  			Namespace: namespace,
   248  			Subsystem: subsystem,
   249  			Name:      "kv_status_api_duration",
   250  			Help:      "duration for kv status api.",
   251  			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 20), // 0.5ms ~ 262s
   252  		}, []string{"store"})
   253  
   254  	TiKVStatusCounter = prometheus.NewCounterVec(
   255  		prometheus.CounterOpts{
   256  			Namespace: namespace,
   257  			Subsystem: subsystem,
   258  			Name:      "kv_status_api_count",
   259  			Help:      "Counter of access kv status api.",
   260  		}, []string{LblResult})
   261  
   262  	TiKVBatchWaitDuration = prometheus.NewHistogram(
   263  		prometheus.HistogramOpts{
   264  			Namespace: namespace,
   265  			Subsystem: subsystem,
   266  			Name:      "batch_wait_duration",
   267  			Buckets:   prometheus.ExponentialBuckets(1, 2, 34), // 1ns ~ 8s
   268  			Help:      "batch wait duration",
   269  		})
   270  
   271  	TiKVBatchSendLatency = prometheus.NewHistogram(
   272  		prometheus.HistogramOpts{
   273  			Namespace: namespace,
   274  			Subsystem: subsystem,
   275  			Name:      "batch_send_latency",
   276  			Buckets:   prometheus.ExponentialBuckets(1, 2, 34), // 1ns ~ 8s
   277  			Help:      "batch send latency",
   278  		})
   279  
   280  	TiKVBatchRecvLatency = prometheus.NewHistogramVec(
   281  		prometheus.HistogramOpts{
   282  			Namespace: namespace,
   283  			Subsystem: subsystem,
   284  			Name:      "batch_recv_latency",
   285  			Buckets:   prometheus.ExponentialBuckets(1000, 2, 34), // 1us ~ 8000s
   286  			Help:      "batch recv latency",
   287  		}, []string{LblResult})
   288  
   289  	TiKVBatchWaitOverLoad = prometheus.NewCounter(
   290  		prometheus.CounterOpts{
   291  			Namespace: namespace,
   292  			Subsystem: subsystem,
   293  			Name:      "batch_wait_overload",
   294  			Help:      "event of tikv transport layer overload",
   295  		})
   296  
   297  	TiKVBatchPendingRequests = prometheus.NewHistogramVec(
   298  		prometheus.HistogramOpts{
   299  			Namespace: namespace,
   300  			Subsystem: subsystem,
   301  			Name:      "batch_pending_requests",
   302  			Buckets:   prometheus.ExponentialBuckets(1, 2, 8),
   303  			Help:      "number of requests pending in the batch channel",
   304  		}, []string{"store"})
   305  
   306  	TiKVBatchRequests = prometheus.NewHistogramVec(
   307  		prometheus.HistogramOpts{
   308  			Namespace: namespace,
   309  			Subsystem: subsystem,
   310  			Name:      "batch_requests",
   311  			Buckets:   prometheus.ExponentialBuckets(1, 2, 8),
   312  			Help:      "number of requests in one batch",
   313  		}, []string{"store"})
   314  
   315  	TiKVBatchClientUnavailable = prometheus.NewHistogram(
   316  		prometheus.HistogramOpts{
   317  			Namespace: namespace,
   318  			Subsystem: subsystem,
   319  			Name:      "batch_client_unavailable_seconds",
   320  			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 28), // 1ms ~ 1.5days
   321  			Help:      "batch client unavailable",
   322  		})
   323  
   324  	TiKVBatchClientWaitEstablish = prometheus.NewHistogram(
   325  		prometheus.HistogramOpts{
   326  			Namespace: namespace,
   327  			Subsystem: subsystem,
   328  			Name:      "batch_client_wait_connection_establish",
   329  			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 28), // 1ms ~ 1.5days
   330  			Help:      "batch client wait new connection establish",
   331  		})
   332  
   333  	TiKVBatchClientRecycle = prometheus.NewHistogram(
   334  		prometheus.HistogramOpts{
   335  			Namespace: namespace,
   336  			Subsystem: subsystem,
   337  			Name:      "batch_client_reset",
   338  			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 28), // 1ms ~ 1.5days
   339  			Help:      "batch client recycle connection and reconnect duration",
   340  		})
   341  
   342  	TiKVRangeTaskStats = prometheus.NewGaugeVec(
   343  		prometheus.GaugeOpts{
   344  			Namespace: namespace,
   345  			Subsystem: subsystem,
   346  			Name:      "range_task_stats",
   347  			Help:      "stat of range tasks",
   348  		}, []string{LblType, LblResult})
   349  
   350  	TiKVRangeTaskPushDuration = prometheus.NewHistogramVec(
   351  		prometheus.HistogramOpts{
   352  			Namespace: namespace,
   353  			Subsystem: subsystem,
   354  			Name:      "range_task_push_duration",
   355  			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 20), // 1ms ~ 524s
   356  			Help:      "duration to push sub tasks to range task workers",
   357  		}, []string{LblType})
   358  
   359  	TiKVTokenWaitDuration = prometheus.NewHistogram(
   360  		prometheus.HistogramOpts{
   361  			Namespace: namespace,
   362  			Subsystem: subsystem,
   363  			Name:      "batch_executor_token_wait_duration",
   364  			Buckets:   prometheus.ExponentialBuckets(1, 2, 34), // 1ns ~ 8s
   365  			Help:      "tidb txn token wait duration to process batches",
   366  		})
   367  
   368  	TiKVTxnHeartBeatHistogram = prometheus.NewHistogramVec(
   369  		prometheus.HistogramOpts{
   370  			Namespace: namespace,
   371  			Subsystem: subsystem,
   372  			Name:      "txn_heart_beat",
   373  			Help:      "Bucketed histogram of the txn_heartbeat request duration.",
   374  			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 20), // 1ms ~ 524s
   375  		}, []string{LblType})
   376  
   377  	TiKVPessimisticLockKeysDuration = prometheus.NewHistogram(
   378  		prometheus.HistogramOpts{
   379  			Namespace: namespace,
   380  			Subsystem: subsystem,
   381  			Name:      "pessimistic_lock_keys_duration",
   382  			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 24), // 1ms ~ 8389s
   383  			Help:      "tidb txn pessimistic lock keys duration",
   384  		})
   385  
   386  	TiKVTTLLifeTimeReachCounter = prometheus.NewCounter(
   387  		prometheus.CounterOpts{
   388  			Namespace: namespace,
   389  			Subsystem: subsystem,
   390  			Name:      "ttl_lifetime_reach_total",
   391  			Help:      "Counter of ttlManager live too long.",
   392  		})
   393  
   394  	TiKVNoAvailableConnectionCounter = prometheus.NewCounter(
   395  		prometheus.CounterOpts{
   396  			Namespace: namespace,
   397  			Subsystem: subsystem,
   398  			Name:      "batch_client_no_available_connection_total",
   399  			Help:      "Counter of no available batch client.",
   400  		})
   401  
   402  	TiKVTwoPCTxnCounter = prometheus.NewCounterVec(
   403  		prometheus.CounterOpts{
   404  			Namespace: namespace,
   405  			Subsystem: subsystem,
   406  			Name:      "commit_txn_counter",
   407  			Help:      "Counter of 2PC transactions.",
   408  		}, []string{LblType})
   409  
   410  	TiKVAsyncCommitTxnCounter = prometheus.NewCounterVec(
   411  		prometheus.CounterOpts{
   412  			Namespace: namespace,
   413  			Subsystem: subsystem,
   414  			Name:      "async_commit_txn_counter",
   415  			Help:      "Counter of async commit transactions.",
   416  		}, []string{LblType})
   417  
   418  	TiKVOnePCTxnCounter = prometheus.NewCounterVec(
   419  		prometheus.CounterOpts{
   420  			Namespace: namespace,
   421  			Subsystem: subsystem,
   422  			Name:      "one_pc_txn_counter",
   423  			Help:      "Counter of 1PC transactions.",
   424  		}, []string{LblType})
   425  
   426  	TiKVStoreLimitErrorCounter = prometheus.NewCounterVec(
   427  		prometheus.CounterOpts{
   428  			Namespace: namespace,
   429  			Subsystem: subsystem,
   430  			Name:      "get_store_limit_token_error",
   431  			Help:      "store token is up to the limit, probably because one of the stores is the hotspot or unavailable",
   432  		}, []string{LblAddress, LblStore})
   433  
   434  	TiKVGRPCConnTransientFailureCounter = prometheus.NewCounterVec(
   435  		prometheus.CounterOpts{
   436  			Namespace: namespace,
   437  			Subsystem: subsystem,
   438  			Name:      "connection_transient_failure_count",
   439  			Help:      "Counter of gRPC connection transient failure",
   440  		}, []string{LblAddress, LblStore})
   441  
   442  	TiKVPanicCounter = prometheus.NewCounterVec(
   443  		prometheus.CounterOpts{
   444  			Namespace: namespace,
   445  			Subsystem: subsystem,
   446  			Name:      "panic_total",
   447  			Help:      "Counter of panic.",
   448  		}, []string{LblType})
   449  
   450  	TiKVForwardRequestCounter = prometheus.NewCounterVec(
   451  		prometheus.CounterOpts{
   452  			Namespace: namespace,
   453  			Subsystem: subsystem,
   454  			Name:      "forward_request_counter",
   455  			Help:      "Counter of tikv request being forwarded through another node",
   456  		}, []string{LblFromStore, LblToStore, LblType, LblResult})
   457  
   458  	TiKVTSFutureWaitDuration = prometheus.NewHistogram(
   459  		prometheus.HistogramOpts{
   460  			Namespace: namespace,
   461  			Subsystem: subsystem,
   462  			Name:      "ts_future_wait_seconds",
   463  			Help:      "Bucketed histogram of seconds cost for waiting timestamp future.",
   464  			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 30), // 5us ~ 2560s
   465  		})
   466  
   467  	TiKVSafeTSUpdateCounter = prometheus.NewCounterVec(
   468  		prometheus.CounterOpts{
   469  			Namespace: namespace,
   470  			Subsystem: subsystem,
   471  			Name:      "safets_update_counter",
   472  			Help:      "Counter of tikv safe_ts being updated.",
   473  		}, []string{LblResult, LblStore})
   474  
   475  	TiKVMinSafeTSGapSeconds = prometheus.NewGaugeVec(
   476  		prometheus.GaugeOpts{
   477  			Namespace: namespace,
   478  			Subsystem: subsystem,
   479  			Name:      "min_safets_gap_seconds",
   480  			Help:      "The minimal (non-zero) SafeTS gap for each store.",
   481  		}, []string{LblStore})
   482  
   483  	TiKVReplicaSelectorFailureCounter = prometheus.NewCounterVec(
   484  		prometheus.CounterOpts{
   485  			Namespace: namespace,
   486  			Subsystem: subsystem,
   487  			Name:      "replica_selector_failure_counter",
   488  			Help:      "Counter of the reason why the replica selector cannot yield a potential leader.",
   489  		}, []string{LblType})
   490  
   491  	TiKVRequestRetryTimesHistogram = prometheus.NewHistogram(
   492  		prometheus.HistogramOpts{
   493  			Namespace: namespace,
   494  			Subsystem: subsystem,
   495  			Name:      "request_retry_times",
   496  			Help:      "Bucketed histogram of how many times a region request retries.",
   497  			Buckets:   []float64{1, 2, 3, 4, 8, 16, 32, 64, 128, 256},
   498  		})
   499  	TiKVTxnCommitBackoffSeconds = prometheus.NewHistogram(
   500  		prometheus.HistogramOpts{
   501  			Namespace: namespace,
   502  			Subsystem: subsystem,
   503  			Name:      "txn_commit_backoff_seconds",
   504  			Help:      "Bucketed histogram of the total backoff duration in committing a transaction.",
   505  			Buckets:   prometheus.ExponentialBuckets(0.001, 2, 22), // 1ms ~ 2097s
   506  		})
   507  	TiKVTxnCommitBackoffCount = prometheus.NewHistogram(
   508  		prometheus.HistogramOpts{
   509  			Namespace: namespace,
   510  			Subsystem: subsystem,
   511  			Name:      "txn_commit_backoff_count",
   512  			Help:      "Bucketed histogram of the backoff count in committing a transaction.",
   513  			Buckets:   prometheus.ExponentialBuckets(1, 2, 12), // 1 ~ 2048
   514  		})
   515  
   516  	// TiKVSmallReadDuration uses to collect small request read duration.
   517  	TiKVSmallReadDuration = prometheus.NewHistogram(
   518  		prometheus.HistogramOpts{
   519  			Namespace: namespace,
   520  			Subsystem: "sli", // Always use "sli" to make it compatible with TiDB.
   521  			Name:      "tikv_small_read_duration",
   522  			Help:      "Read time of TiKV small read.",
   523  			Buckets:   prometheus.ExponentialBuckets(0.0005, 2, 28), // 0.5ms ~ 74h
   524  		})
   525  
   526  	TiKVUnsafeDestroyRangeFailuresCounterVec = prometheus.NewCounterVec(
   527  		prometheus.CounterOpts{
   528  			Namespace: namespace,
   529  			Subsystem: subsystem,
   530  			Name:      "gc_unsafe_destroy_range_failures",
   531  			Help:      "Counter of unsafe destroyrange failures",
   532  		}, []string{LblType})
   533  
   534  	initShortcuts()
   535  }
   536  
   537  func init() {
   538  	initMetrics("tikv", "client_go")
   539  }
   540  
// InitMetrics initializes metrics variables with given namespace and subsystem name.
//
// It replaces every package-level collector with a freshly created one, so it
// should be called before RegisterMetrics: collectors that were registered
// earlier stay in the registry, while the package variables would point at
// new, unregistered collectors.
func InitMetrics(namespace, subsystem string) {
	initMetrics(namespace, subsystem)
}
   545  
   546  // RegisterMetrics registers all metrics variables.
   547  // Note: to change default namespace and subsystem name, call `InitMetrics` before registering.
   548  func RegisterMetrics() {
   549  	prometheus.MustRegister(TiKVTxnCmdHistogram)
   550  	prometheus.MustRegister(TiKVBackoffHistogram)
   551  	prometheus.MustRegister(TiKVSendReqHistogram)
   552  	prometheus.MustRegister(TiKVCoprocessorHistogram)
   553  	prometheus.MustRegister(TiKVLockResolverCounter)
   554  	prometheus.MustRegister(TiKVRegionErrorCounter)
   555  	prometheus.MustRegister(TiKVTxnWriteKVCountHistogram)
   556  	prometheus.MustRegister(TiKVTxnWriteSizeHistogram)
   557  	prometheus.MustRegister(TiKVRawkvCmdHistogram)
   558  	prometheus.MustRegister(TiKVRawkvSizeHistogram)
   559  	prometheus.MustRegister(TiKVTxnRegionsNumHistogram)
   560  	prometheus.MustRegister(TiKVLoadSafepointCounter)
   561  	prometheus.MustRegister(TiKVSecondaryLockCleanupFailureCounter)
   562  	prometheus.MustRegister(TiKVRegionCacheCounter)
   563  	prometheus.MustRegister(TiKVLocalLatchWaitTimeHistogram)
   564  	prometheus.MustRegister(TiKVStatusDuration)
   565  	prometheus.MustRegister(TiKVStatusCounter)
   566  	prometheus.MustRegister(TiKVBatchWaitDuration)
   567  	prometheus.MustRegister(TiKVBatchSendLatency)
   568  	prometheus.MustRegister(TiKVBatchRecvLatency)
   569  	prometheus.MustRegister(TiKVBatchWaitOverLoad)
   570  	prometheus.MustRegister(TiKVBatchPendingRequests)
   571  	prometheus.MustRegister(TiKVBatchRequests)
   572  	prometheus.MustRegister(TiKVBatchClientUnavailable)
   573  	prometheus.MustRegister(TiKVBatchClientWaitEstablish)
   574  	prometheus.MustRegister(TiKVBatchClientRecycle)
   575  	prometheus.MustRegister(TiKVRangeTaskStats)
   576  	prometheus.MustRegister(TiKVRangeTaskPushDuration)
   577  	prometheus.MustRegister(TiKVTokenWaitDuration)
   578  	prometheus.MustRegister(TiKVTxnHeartBeatHistogram)
   579  	prometheus.MustRegister(TiKVPessimisticLockKeysDuration)
   580  	prometheus.MustRegister(TiKVTTLLifeTimeReachCounter)
   581  	prometheus.MustRegister(TiKVNoAvailableConnectionCounter)
   582  	prometheus.MustRegister(TiKVTwoPCTxnCounter)
   583  	prometheus.MustRegister(TiKVAsyncCommitTxnCounter)
   584  	prometheus.MustRegister(TiKVOnePCTxnCounter)
   585  	prometheus.MustRegister(TiKVStoreLimitErrorCounter)
   586  	prometheus.MustRegister(TiKVGRPCConnTransientFailureCounter)
   587  	prometheus.MustRegister(TiKVPanicCounter)
   588  	prometheus.MustRegister(TiKVForwardRequestCounter)
   589  	prometheus.MustRegister(TiKVTSFutureWaitDuration)
   590  	prometheus.MustRegister(TiKVSafeTSUpdateCounter)
   591  	prometheus.MustRegister(TiKVMinSafeTSGapSeconds)
   592  	prometheus.MustRegister(TiKVReplicaSelectorFailureCounter)
   593  	prometheus.MustRegister(TiKVRequestRetryTimesHistogram)
   594  	prometheus.MustRegister(TiKVTxnCommitBackoffSeconds)
   595  	prometheus.MustRegister(TiKVTxnCommitBackoffCount)
   596  	prometheus.MustRegister(TiKVSmallReadDuration)
   597  }
   598  
   599  // readCounter reads the value of a prometheus.Counter.
   600  // Returns -1 when failing to read the value.
   601  func readCounter(m prometheus.Counter) int64 {
   602  	// Actually, it's not recommended to read the value of prometheus metric types directly:
   603  	// https://github.com/prometheus/client_golang/issues/486#issuecomment-433345239
   604  	pb := &dto.Metric{}
   605  	// It's impossible to return an error though.
   606  	if err := m.Write(pb); err != nil {
   607  		return -1
   608  	}
   609  	return int64(pb.GetCounter().GetValue())
   610  }
   611  
   612  // TxnCommitCounter is the counter of transactions committed with
   613  // different protocols, i.e. 2PC, async-commit, 1PC.
   614  type TxnCommitCounter struct {
   615  	TwoPC       int64 `json:"twoPC"`
   616  	AsyncCommit int64 `json:"asyncCommit"`
   617  	OnePC       int64 `json:"onePC"`
   618  }
   619  
   620  // Sub returns the difference of two counters.
   621  func (c TxnCommitCounter) Sub(rhs TxnCommitCounter) TxnCommitCounter {
   622  	new := TxnCommitCounter{}
   623  	new.TwoPC = c.TwoPC - rhs.TwoPC
   624  	new.AsyncCommit = c.AsyncCommit - rhs.AsyncCommit
   625  	new.OnePC = c.OnePC - rhs.OnePC
   626  	return new
   627  }
   628  
   629  // GetTxnCommitCounter gets the TxnCommitCounter.
   630  func GetTxnCommitCounter() TxnCommitCounter {
   631  	return TxnCommitCounter{
   632  		TwoPC:       readCounter(TwoPCTxnCounterOk),
   633  		AsyncCommit: readCounter(AsyncCommitTxnCounterOk),
   634  		OnePC:       readCounter(OnePCTxnCounterOk),
   635  	}
   636  }
   637  
   638  const smallTxnAffectRow = 20
   639  
   640  // ObserveReadSLI observes the read SLI metric.
   641  func ObserveReadSLI(readKeys uint64, readTime float64) {
   642  	if readKeys <= smallTxnAffectRow && readKeys != 0 && readTime != 0 {
   643  		TiKVSmallReadDuration.Observe(readTime)
   644  	}
   645  }