github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/txn_metrics.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvcoord
    12  
    13  import (
    14  	"time"
    15  
    16  	"github.com/cockroachdb/cockroach/pkg/server/telemetry"
    17  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    18  )
    19  
    20  // TxnMetrics holds all metrics relating to KV transactions.
    21  type TxnMetrics struct {
    22  	Aborts          *metric.Counter
    23  	Commits         *metric.Counter
    24  	Commits1PC      *metric.Counter // Commits which finished in a single phase
    25  	ParallelCommits *metric.Counter // Commits which entered the STAGING state
    26  
    27  	RefreshSuccess                *metric.Counter
    28  	RefreshFail                   *metric.Counter
    29  	RefreshFailWithCondensedSpans *metric.Counter
    30  	RefreshMemoryLimitExceeded    *metric.Counter
    31  
    32  	Durations *metric.Histogram
    33  
    34  	// Restarts is the number of times we had to restart the transaction.
    35  	Restarts *metric.Histogram
    36  
    37  	// Counts of restart types.
    38  	RestartsWriteTooOld           telemetry.CounterWithMetric
    39  	RestartsWriteTooOldMulti      telemetry.CounterWithMetric
    40  	RestartsSerializable          telemetry.CounterWithMetric
    41  	RestartsAsyncWriteFailure     telemetry.CounterWithMetric
    42  	RestartsReadWithinUncertainty telemetry.CounterWithMetric
    43  	RestartsTxnAborted            telemetry.CounterWithMetric
    44  	RestartsTxnPush               telemetry.CounterWithMetric
    45  	RestartsUnknown               telemetry.CounterWithMetric
    46  }
    47  
    48  var (
    49  	metaAbortsRates = metric.Metadata{
    50  		Name:        "txn.aborts",
    51  		Help:        "Number of aborted KV transactions",
    52  		Measurement: "KV Transactions",
    53  		Unit:        metric.Unit_COUNT,
    54  	}
    55  	metaCommitsRates = metric.Metadata{
    56  		Name:        "txn.commits",
    57  		Help:        "Number of committed KV transactions (including 1PC)",
    58  		Measurement: "KV Transactions",
    59  		Unit:        metric.Unit_COUNT,
    60  	}
    61  	metaCommits1PCRates = metric.Metadata{
    62  		Name:        "txn.commits1PC",
    63  		Help:        "Number of KV transaction on-phase commit attempts",
    64  		Measurement: "KV Transactions",
    65  		Unit:        metric.Unit_COUNT,
    66  	}
    67  	metaParallelCommitsRates = metric.Metadata{
    68  		Name:        "txn.parallelcommits",
    69  		Help:        "Number of KV transaction parallel commit attempts",
    70  		Measurement: "KV Transactions",
    71  		Unit:        metric.Unit_COUNT,
    72  	}
    73  	metaRefreshSuccess = metric.Metadata{
    74  		Name:        "txn.refresh.success",
    75  		Help:        "Number of successful refreshes",
    76  		Measurement: "Refreshes",
    77  		Unit:        metric.Unit_COUNT,
    78  	}
    79  	metaRefreshFail = metric.Metadata{
    80  		Name:        "txn.refresh.fail",
    81  		Help:        "Number of failed refreshes",
    82  		Measurement: "Refreshes",
    83  		Unit:        metric.Unit_COUNT,
    84  	}
    85  	metaRefreshFailWithCondensedSpans = metric.Metadata{
    86  		Name: "txn.refresh.fail_with_condensed_spans",
    87  		Help: "Number of failed refreshes for transactions whose read " +
    88  			"tracking lost fidelity because of condensing. Such a failure " +
    89  			"could be a false conflict. Failures counted here are also counted " +
    90  			"in txn.refresh.fail, and the respective transactions are also counted in " +
    91  			"txn.refresh.memory_limit_exceeded.",
    92  		Measurement: "Refreshes",
    93  		Unit:        metric.Unit_COUNT,
    94  	}
    95  	metaRefreshMemoryLimitExceeded = metric.Metadata{
    96  		Name: "txn.refresh.memory_limit_exceeded",
    97  		Help: "Number of transaction which exceed the refresh span bytes limit, causing " +
    98  			"their read spans to be condensed",
    99  		Measurement: "Transactions",
   100  		Unit:        metric.Unit_COUNT,
   101  	}
   102  	metaDurationsHistograms = metric.Metadata{
   103  		Name:        "txn.durations",
   104  		Help:        "KV transaction durations",
   105  		Measurement: "KV Txn Duration",
   106  		Unit:        metric.Unit_NANOSECONDS,
   107  	}
   108  	metaRestartsHistogram = metric.Metadata{
   109  		Name:        "txn.restarts",
   110  		Help:        "Number of restarted KV transactions",
   111  		Measurement: "KV Transactions",
   112  		Unit:        metric.Unit_COUNT,
   113  	}
   114  	// There are two ways we can get "write too old" restarts. In both cases, a
   115  	// WriteTooOldError is generated in the MVCC layer. This is intercepted on
   116  	// the way out by the Store, which performs a single retry at a pushed
   117  	// timestamp. If the retry succeeds, the immediate operation succeeds but
   118  	// the WriteTooOld flag is set on the Transaction, which causes EndTxn to
   119  	// return a/ TransactionRetryError with RETRY_WRITE_TOO_OLD. These are
   120  	// captured as txn.restarts.writetooold.
   121  	//
   122  	// If the Store's retried operation generates a second WriteTooOldError
   123  	// (indicating a conflict with a third transaction with a higher timestamp
   124  	// than the one that caused the first WriteTooOldError), the store doesn't
   125  	// retry again, and the WriteTooOldError will be returned up the stack to be
   126  	// retried at this level. These are captured as
   127  	// txn.restarts.writetoooldmulti. This path is inefficient, and if it turns
   128  	// out to be common we may want to do something about it.
   129  	metaRestartsWriteTooOld = metric.Metadata{
   130  		Name:        "txn.restarts.writetooold",
   131  		Help:        "Number of restarts due to a concurrent writer committing first",
   132  		Measurement: "Restarted Transactions",
   133  		Unit:        metric.Unit_COUNT,
   134  	}
   135  	metaRestartsWriteTooOldMulti = metric.Metadata{
   136  		Name:        "txn.restarts.writetoooldmulti",
   137  		Help:        "Number of restarts due to multiple concurrent writers committing first",
   138  		Measurement: "Restarted Transactions",
   139  		Unit:        metric.Unit_COUNT,
   140  	}
   141  	metaRestartsSerializable = metric.Metadata{
   142  		Name:        "txn.restarts.serializable",
   143  		Help:        "Number of restarts due to a forwarded commit timestamp and isolation=SERIALIZABLE",
   144  		Measurement: "Restarted Transactions",
   145  		Unit:        metric.Unit_COUNT,
   146  	}
   147  	metaRestartsPossibleReplay = metric.Metadata{
   148  		Name:        "txn.restarts.possiblereplay",
   149  		Help:        "Number of restarts due to possible replays of command batches at the storage layer",
   150  		Measurement: "Restarted Transactions",
   151  		Unit:        metric.Unit_COUNT,
   152  	}
   153  	metaRestartsAsyncWriteFailure = metric.Metadata{
   154  		Name:        "txn.restarts.asyncwritefailure",
   155  		Help:        "Number of restarts due to async consensus writes that failed to leave intents",
   156  		Measurement: "Restarted Transactions",
   157  		Unit:        metric.Unit_COUNT,
   158  	}
   159  	metaRestartsReadWithinUncertainty = metric.Metadata{
   160  		Name:        "txn.restarts.readwithinuncertainty",
   161  		Help:        "Number of restarts due to reading a new value within the uncertainty interval",
   162  		Measurement: "Restarted Transactions",
   163  		Unit:        metric.Unit_COUNT,
   164  	}
   165  	metaRestartsTxnAborted = metric.Metadata{
   166  		Name:        "txn.restarts.txnaborted",
   167  		Help:        "Number of restarts due to an abort by a concurrent transaction (usually due to deadlock)",
   168  		Measurement: "Restarted Transactions",
   169  		Unit:        metric.Unit_COUNT,
   170  	}
   171  	// TransactionPushErrors at this level are unusual. They are
   172  	// normally handled at the Store level with the txnwait and
   173  	// contention queues. However, they can reach this level and be
   174  	// retried in tests that disable the store-level retries, and
   175  	// there may be edge cases that allow them to reach this point in
   176  	// production.
   177  	metaRestartsTxnPush = metric.Metadata{
   178  		Name:        "txn.restarts.txnpush",
   179  		Help:        "Number of restarts due to a transaction push failure",
   180  		Measurement: "Restarted Transactions",
   181  		Unit:        metric.Unit_COUNT,
   182  	}
   183  	metaRestartsUnknown = metric.Metadata{
   184  		Name:        "txn.restarts.unknown",
   185  		Help:        "Number of restarts due to a unknown reasons",
   186  		Measurement: "Restarted Transactions",
   187  		Unit:        metric.Unit_COUNT,
   188  	}
   189  )
   190  
   191  // MakeTxnMetrics returns a TxnMetrics struct that contains metrics whose
   192  // windowed portions retain data for approximately histogramWindow.
   193  func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics {
   194  	return TxnMetrics{
   195  		Aborts:                        metric.NewCounter(metaAbortsRates),
   196  		Commits:                       metric.NewCounter(metaCommitsRates),
   197  		Commits1PC:                    metric.NewCounter(metaCommits1PCRates),
   198  		ParallelCommits:               metric.NewCounter(metaParallelCommitsRates),
   199  		RefreshFail:                   metric.NewCounter(metaRefreshFail),
   200  		RefreshFailWithCondensedSpans: metric.NewCounter(metaRefreshFailWithCondensedSpans),
   201  		RefreshSuccess:                metric.NewCounter(metaRefreshSuccess),
   202  		RefreshMemoryLimitExceeded:    metric.NewCounter(metaRefreshMemoryLimitExceeded),
   203  		Durations:                     metric.NewLatency(metaDurationsHistograms, histogramWindow),
   204  		Restarts:                      metric.NewHistogram(metaRestartsHistogram, histogramWindow, 100, 3),
   205  		RestartsWriteTooOld:           telemetry.NewCounterWithMetric(metaRestartsWriteTooOld),
   206  		RestartsWriteTooOldMulti:      telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti),
   207  		RestartsSerializable:          telemetry.NewCounterWithMetric(metaRestartsSerializable),
   208  		RestartsAsyncWriteFailure:     telemetry.NewCounterWithMetric(metaRestartsAsyncWriteFailure),
   209  		RestartsReadWithinUncertainty: telemetry.NewCounterWithMetric(metaRestartsReadWithinUncertainty),
   210  		RestartsTxnAborted:            telemetry.NewCounterWithMetric(metaRestartsTxnAborted),
   211  		RestartsTxnPush:               telemetry.NewCounterWithMetric(metaRestartsTxnPush),
   212  		RestartsUnknown:               telemetry.NewCounterWithMetric(metaRestartsUnknown),
   213  	}
   214  }