github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvclient/kvcoord/txn_metrics.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvcoord 12 13 import ( 14 "time" 15 16 "github.com/cockroachdb/cockroach/pkg/server/telemetry" 17 "github.com/cockroachdb/cockroach/pkg/util/metric" 18 ) 19 20 // TxnMetrics holds all metrics relating to KV transactions. 21 type TxnMetrics struct { 22 Aborts *metric.Counter 23 Commits *metric.Counter 24 Commits1PC *metric.Counter // Commits which finished in a single phase 25 ParallelCommits *metric.Counter // Commits which entered the STAGING state 26 27 RefreshSuccess *metric.Counter 28 RefreshFail *metric.Counter 29 RefreshFailWithCondensedSpans *metric.Counter 30 RefreshMemoryLimitExceeded *metric.Counter 31 32 Durations *metric.Histogram 33 34 // Restarts is the number of times we had to restart the transaction. 35 Restarts *metric.Histogram 36 37 // Counts of restart types. 38 RestartsWriteTooOld telemetry.CounterWithMetric 39 RestartsWriteTooOldMulti telemetry.CounterWithMetric 40 RestartsSerializable telemetry.CounterWithMetric 41 RestartsAsyncWriteFailure telemetry.CounterWithMetric 42 RestartsReadWithinUncertainty telemetry.CounterWithMetric 43 RestartsTxnAborted telemetry.CounterWithMetric 44 RestartsTxnPush telemetry.CounterWithMetric 45 RestartsUnknown telemetry.CounterWithMetric 46 } 47 48 var ( 49 metaAbortsRates = metric.Metadata{ 50 Name: "txn.aborts", 51 Help: "Number of aborted KV transactions", 52 Measurement: "KV Transactions", 53 Unit: metric.Unit_COUNT, 54 } 55 metaCommitsRates = metric.Metadata{ 56 Name: "txn.commits", 57 Help: "Number of committed KV transactions (including 1PC)", 58 Measurement: "KV Transactions", 59 Unit: metric.Unit_COUNT, 60 } 61 metaCommits1PCRates = metric.Metadata{ 62 Name: "txn.commits1PC", 63 Help: "Number of KV transaction on-phase commit attempts", 64 Measurement: "KV Transactions", 65 Unit: metric.Unit_COUNT, 66 } 67 metaParallelCommitsRates = metric.Metadata{ 68 Name: "txn.parallelcommits", 69 Help: "Number of KV transaction parallel commit attempts", 70 Measurement: "KV Transactions", 71 Unit: metric.Unit_COUNT, 72 } 73 metaRefreshSuccess = metric.Metadata{ 74 Name: "txn.refresh.success", 75 Help: "Number of successful refreshes", 76 Measurement: "Refreshes", 77 Unit: metric.Unit_COUNT, 78 } 79 metaRefreshFail = metric.Metadata{ 80 Name: "txn.refresh.fail", 81 Help: "Number of failed refreshes", 82 Measurement: "Refreshes", 83 Unit: metric.Unit_COUNT, 84 } 85 metaRefreshFailWithCondensedSpans = metric.Metadata{ 86 Name: "txn.refresh.fail_with_condensed_spans", 87 Help: "Number of failed refreshes for transactions whose read " + 88 "tracking lost fidelity because of condensing. Such a failure " + 89 "could be a false conflict. Failures counted here are also counted " + 90 "in txn.refresh.fail, and the respective transactions are also counted in " + 91 "txn.refresh.memory_limit_exceeded.", 92 Measurement: "Refreshes", 93 Unit: metric.Unit_COUNT, 94 } 95 metaRefreshMemoryLimitExceeded = metric.Metadata{ 96 Name: "txn.refresh.memory_limit_exceeded", 97 Help: "Number of transaction which exceed the refresh span bytes limit, causing " + 98 "their read spans to be condensed", 99 Measurement: "Transactions", 100 Unit: metric.Unit_COUNT, 101 } 102 metaDurationsHistograms = metric.Metadata{ 103 Name: "txn.durations", 104 Help: "KV transaction durations", 105 Measurement: "KV Txn Duration", 106 Unit: metric.Unit_NANOSECONDS, 107 } 108 metaRestartsHistogram = metric.Metadata{ 109 Name: "txn.restarts", 110 Help: "Number of restarted KV transactions", 111 Measurement: "KV Transactions", 112 Unit: metric.Unit_COUNT, 113 } 114 // There are two ways we can get "write too old" restarts. In both cases, a 115 // WriteTooOldError is generated in the MVCC layer. This is intercepted on 116 // the way out by the Store, which performs a single retry at a pushed 117 // timestamp. If the retry succeeds, the immediate operation succeeds but 118 // the WriteTooOld flag is set on the Transaction, which causes EndTxn to 119 // return a/ TransactionRetryError with RETRY_WRITE_TOO_OLD. These are 120 // captured as txn.restarts.writetooold. 121 // 122 // If the Store's retried operation generates a second WriteTooOldError 123 // (indicating a conflict with a third transaction with a higher timestamp 124 // than the one that caused the first WriteTooOldError), the store doesn't 125 // retry again, and the WriteTooOldError will be returned up the stack to be 126 // retried at this level. These are captured as 127 // txn.restarts.writetoooldmulti. This path is inefficient, and if it turns 128 // out to be common we may want to do something about it. 129 metaRestartsWriteTooOld = metric.Metadata{ 130 Name: "txn.restarts.writetooold", 131 Help: "Number of restarts due to a concurrent writer committing first", 132 Measurement: "Restarted Transactions", 133 Unit: metric.Unit_COUNT, 134 } 135 metaRestartsWriteTooOldMulti = metric.Metadata{ 136 Name: "txn.restarts.writetoooldmulti", 137 Help: "Number of restarts due to multiple concurrent writers committing first", 138 Measurement: "Restarted Transactions", 139 Unit: metric.Unit_COUNT, 140 } 141 metaRestartsSerializable = metric.Metadata{ 142 Name: "txn.restarts.serializable", 143 Help: "Number of restarts due to a forwarded commit timestamp and isolation=SERIALIZABLE", 144 Measurement: "Restarted Transactions", 145 Unit: metric.Unit_COUNT, 146 } 147 metaRestartsPossibleReplay = metric.Metadata{ 148 Name: "txn.restarts.possiblereplay", 149 Help: "Number of restarts due to possible replays of command batches at the storage layer", 150 Measurement: "Restarted Transactions", 151 Unit: metric.Unit_COUNT, 152 } 153 metaRestartsAsyncWriteFailure = metric.Metadata{ 154 Name: "txn.restarts.asyncwritefailure", 155 Help: "Number of restarts due to async consensus writes that failed to leave intents", 156 Measurement: "Restarted Transactions", 157 Unit: metric.Unit_COUNT, 158 } 159 metaRestartsReadWithinUncertainty = metric.Metadata{ 160 Name: "txn.restarts.readwithinuncertainty", 161 Help: "Number of restarts due to reading a new value within the uncertainty interval", 162 Measurement: "Restarted Transactions", 163 Unit: metric.Unit_COUNT, 164 } 165 metaRestartsTxnAborted = metric.Metadata{ 166 Name: "txn.restarts.txnaborted", 167 Help: "Number of restarts due to an abort by a concurrent transaction (usually due to deadlock)", 168 Measurement: "Restarted Transactions", 169 Unit: metric.Unit_COUNT, 170 } 171 // TransactionPushErrors at this level are unusual. They are 172 // normally handled at the Store level with the txnwait and 173 // contention queues. However, they can reach this level and be 174 // retried in tests that disable the store-level retries, and 175 // there may be edge cases that allow them to reach this point in 176 // production. 177 metaRestartsTxnPush = metric.Metadata{ 178 Name: "txn.restarts.txnpush", 179 Help: "Number of restarts due to a transaction push failure", 180 Measurement: "Restarted Transactions", 181 Unit: metric.Unit_COUNT, 182 } 183 metaRestartsUnknown = metric.Metadata{ 184 Name: "txn.restarts.unknown", 185 Help: "Number of restarts due to a unknown reasons", 186 Measurement: "Restarted Transactions", 187 Unit: metric.Unit_COUNT, 188 } 189 ) 190 191 // MakeTxnMetrics returns a TxnMetrics struct that contains metrics whose 192 // windowed portions retain data for approximately histogramWindow. 193 func MakeTxnMetrics(histogramWindow time.Duration) TxnMetrics { 194 return TxnMetrics{ 195 Aborts: metric.NewCounter(metaAbortsRates), 196 Commits: metric.NewCounter(metaCommitsRates), 197 Commits1PC: metric.NewCounter(metaCommits1PCRates), 198 ParallelCommits: metric.NewCounter(metaParallelCommitsRates), 199 RefreshFail: metric.NewCounter(metaRefreshFail), 200 RefreshFailWithCondensedSpans: metric.NewCounter(metaRefreshFailWithCondensedSpans), 201 RefreshSuccess: metric.NewCounter(metaRefreshSuccess), 202 RefreshMemoryLimitExceeded: metric.NewCounter(metaRefreshMemoryLimitExceeded), 203 Durations: metric.NewLatency(metaDurationsHistograms, histogramWindow), 204 Restarts: metric.NewHistogram(metaRestartsHistogram, histogramWindow, 100, 3), 205 RestartsWriteTooOld: telemetry.NewCounterWithMetric(metaRestartsWriteTooOld), 206 RestartsWriteTooOldMulti: telemetry.NewCounterWithMetric(metaRestartsWriteTooOldMulti), 207 RestartsSerializable: telemetry.NewCounterWithMetric(metaRestartsSerializable), 208 RestartsAsyncWriteFailure: telemetry.NewCounterWithMetric(metaRestartsAsyncWriteFailure), 209 RestartsReadWithinUncertainty: telemetry.NewCounterWithMetric(metaRestartsReadWithinUncertainty), 210 RestartsTxnAborted: telemetry.NewCounterWithMetric(metaRestartsTxnAborted), 211 RestartsTxnPush: telemetry.NewCounterWithMetric(metaRestartsTxnPush), 212 RestartsUnknown: telemetry.NewCounterWithMetric(metaRestartsUnknown), 213 } 214 }