github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/syncer/metrics/metrics.go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package metrics

import (
	"github.com/pingcap/tiflow/engine/pkg/promutil"
	"github.com/prometheus/client_golang/prometheus"
)

// for BinlogEventCost metric stage field.
const (
	BinlogEventCostStageDDLExec = "ddl-exec"
	BinlogEventCostStageDMLExec = "dml-exec"

	BinlogEventCostStageGenWriteRows  = "gen-write-rows"
	BinlogEventCostStageGenUpdateRows = "gen-update-rows"
	BinlogEventCostStageGenDeleteRows = "gen-delete-rows"
	BinlogEventCostStageGenQuery      = "gen-query"
)

// Metrics groups syncer's metric variables.
type Metrics struct {
	BinlogReadDurationHistogram      prometheus.Observer
	BinlogEventSizeHistogram         prometheus.Observer
	ConflictDetectDurationHistogram  prometheus.Observer
	IdealQPS                         prometheus.Gauge
	BinlogMasterPosGauge             prometheus.Gauge
	BinlogSyncerPosGauge             prometheus.Gauge
	BinlogMasterFileGauge            prometheus.Gauge
	BinlogSyncerFileGauge            prometheus.Gauge
	BinlogEventRowHistogram          prometheus.Observer
	TxnHistogram                     prometheus.Observer
	QueryHistogram                   prometheus.Observer
	ExitWithResumableErrorCounter    prometheus.Counter
	ExitWithNonResumableErrorCounter prometheus.Counter
	ReplicationLagGauge              prometheus.Gauge
	ReplicationLagHistogram          prometheus.Observer
	RemainingTimeGauge               prometheus.Gauge
	ShardLockResolving               prometheus.Gauge
	FinishedTransactionTotal         prometheus.Counter
	FlushCheckPointsTimeInterval     prometheus.Observer
}

// Proxies provides the ability to clean Metrics values when syncer is closed.
// private members have a corresponding cached variable in Metrics.
type Proxies struct {
	Metrics                         *Metrics
	binlogReadDurationHistogram     *prometheus.HistogramVec
	binlogEventSizeHistogram        *prometheus.HistogramVec
	BinlogEventCost                 *prometheus.HistogramVec
	conflictDetectDurationHistogram *prometheus.HistogramVec
	AddJobDurationHistogram         *prometheus.HistogramVec
	// dispatch/add multiple jobs for one binlog event.
	// NOTE: only observe for DML now.
	DispatchBinlogDurationHistogram *prometheus.HistogramVec
	SkipBinlogDurationHistogram     *prometheus.HistogramVec
	AddedJobsTotal                  *prometheus.CounterVec
	FinishedJobsTotal               *prometheus.CounterVec
	idealQPS                        *prometheus.GaugeVec
	QueueSizeGauge                  *prometheus.GaugeVec
	binlogPosGauge                  *prometheus.GaugeVec
	binlogFileGauge                 *prometheus.GaugeVec
	binlogEventRowHistogram         *prometheus.HistogramVec
	txnHistogram                    *prometheus.HistogramVec
	queryHistogram                  *prometheus.HistogramVec
	StmtHistogram                   *prometheus.HistogramVec
	syncerExitWithErrorCounter      *prometheus.CounterVec
	replicationLagGauge             *prometheus.GaugeVec
	replicationLagHistogram         *prometheus.HistogramVec
	remainingTimeGauge              *prometheus.GaugeVec
	UnsyncedTableGauge              *prometheus.GaugeVec
	shardLockResolving              *prometheus.GaugeVec
	finishedTransactionTotal        *prometheus.CounterVec
	ReplicationTransactionBatch     *prometheus.HistogramVec
	flushCheckPointsTimeInterval    *prometheus.HistogramVec
}

var DefaultMetricsProxies *Proxies

func init() {
	DefaultMetricsProxies = &Proxies{}
	DefaultMetricsProxies.Init(&promutil.PromFactory{})
}

// Init creates Metrics proxy variables from Factory.
func (m *Proxies) Init(f promutil.Factory) {
	m.binlogReadDurationHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "read_binlog_duration",
			Help:      "bucketed histogram of read time (s) for single binlog event from the relay log or master.",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		}, []string{"task", "source_id"})
	m.binlogEventSizeHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "binlog_event_size",
			Help:      "size of a binlog event",
			Buckets:   prometheus.ExponentialBuckets(16, 2, 20),
		}, []string{"task", "worker", "source_id"})
	m.BinlogEventCost = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "binlog_transform_cost",
			Help:      "cost of binlog event transform",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		}, []string{"stage", "task", "worker", "source_id"})
	m.conflictDetectDurationHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "conflict_detect_duration",
			Help:      "bucketed histogram of conflict detect time (s) for single DML statement",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		}, []string{"task", "source_id"})
	m.AddJobDurationHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "add_job_duration",
			Help:      "bucketed histogram of add a job to the queue time (s)",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		}, []string{"type", "task", "queueNo", "source_id"})
	m.DispatchBinlogDurationHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "dispatch_binlog_duration",
			Help:      "bucketed histogram of dispatch a binlog event time (s)",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		}, []string{"type", "task", "source_id"})
	m.SkipBinlogDurationHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "skip_binlog_duration",
			Help:      "bucketed histogram of skip a binlog event time (s)",
			Buckets:   prometheus.ExponentialBuckets(0.0000005, 2, 25), // this should be very fast.
		}, []string{"type", "task", "source_id"})
	m.AddedJobsTotal = f.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "added_jobs_total",
			Help:      "total number of added jobs",
		}, []string{"type", "task", "queueNo", "source_id", "worker", "target_schema", "target_table"})
	m.FinishedJobsTotal = f.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "finished_jobs_total",
			Help:      "total number of finished jobs",
		}, []string{"type", "task", "queueNo", "source_id", "worker", "target_schema", "target_table"})
	m.idealQPS = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "ideal_qps",
			Help:      "the highest QPS that can be achieved ideally",
		}, []string{"task", "worker", "source_id"})
	m.QueueSizeGauge = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "queue_size",
			Help:      "remain size of the DML queue",
		}, []string{"task", "queue_id", "source_id"})
	m.binlogPosGauge = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "binlog_pos",
			Help:      "current binlog pos",
		}, []string{"node", "task", "source_id"})
	m.binlogFileGauge = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "binlog_file",
			Help:      "current binlog file index",
		}, []string{"node", "task", "source_id"})
	m.binlogEventRowHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "binlog_event_row",
			Help:      "number of rows in a binlog event",
			Buckets:   prometheus.LinearBuckets(0, 100, 101), // linear from 0 to 10000, i think this is enough
		}, []string{"worker", "task", "source_id"})
	m.txnHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "txn_duration_time",
			Help:      "Bucketed histogram of processing time (s) of a txn.",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		}, []string{"task", "worker", "source_id"})
	m.queryHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "query_duration_time",
			Help:      "Bucketed histogram of query time (s).",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		}, []string{"task", "worker", "source_id"})
	m.StmtHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "stmt_duration_time",
			Help:      "Bucketed histogram of every statement query time (s).",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		}, []string{"type", "task"})
	m.syncerExitWithErrorCounter = f.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "exit_with_error_count",
			Help:      "counter for syncer exits with error",
		}, []string{"task", "source_id", "resumable_err"})
	m.replicationLagGauge = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "replication_lag_gauge",
			Help:      "replication lag gauge in second between mysql and syncer",
		}, []string{"task", "source_id", "worker"})
	m.replicationLagHistogram = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "replication_lag",
			Help:      "replication lag histogram in second between mysql and syncer",
			Buckets:   prometheus.ExponentialBuckets(0.5, 2, 12), // exponential from 0.5s to 1024s
		}, []string{"task", "source_id", "worker"})
	m.remainingTimeGauge = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "remaining_time",
			Help:      "the remaining time in second to catch up master",
		}, []string{"task", "source_id", "worker"})
	m.UnsyncedTableGauge = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "unsynced_table_number",
			Help:      "number of unsynced tables in the subtask",
		}, []string{"task", "table", "source_id"})
	m.shardLockResolving = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "shard_lock_resolving",
			Help:      "waiting shard DDL lock to be resolved",
		}, []string{"task", "source_id"})
	m.finishedTransactionTotal = f.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "finished_transaction_total",
			Help:      "total number of finished transaction",
		}, []string{"task", "worker", "source_id"})
	m.ReplicationTransactionBatch = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "replication_transaction_batch",
			Help:      "number of sql's contained in a transaction that executed to downstream",
			Buckets:   prometheus.LinearBuckets(1, 50, 21), // linear from 1 to 1001
		}, []string{"worker", "task", "source_id", "queueNo", "type"})
	m.flushCheckPointsTimeInterval = f.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "syncer",
			Name:      "flush_checkpoints_time_interval",
			Help:      "checkpoint flushed time interval in seconds",
			Buckets:   prometheus.LinearBuckets(1, 50, 21), // linear from 1 to 1001, i think this is enough
		}, []string{"worker", "task", "source_id"})
}

// CacheForOneTask returns a new Proxies with m.Metrics filled. It is used
// to avoid calling WithLabelValues in hot path.
func (m *Proxies) CacheForOneTask(taskName, workerName, sourceID string) *Proxies {
	ret := *m
	ret.Metrics = &Metrics{}
	ret.Metrics.BinlogReadDurationHistogram = m.binlogReadDurationHistogram.WithLabelValues(taskName, sourceID)
	ret.Metrics.BinlogEventSizeHistogram = m.binlogEventSizeHistogram.WithLabelValues(taskName, workerName, sourceID)
	ret.Metrics.ConflictDetectDurationHistogram = m.conflictDetectDurationHistogram.WithLabelValues(taskName, sourceID)
	ret.Metrics.IdealQPS = m.idealQPS.WithLabelValues(taskName, workerName, sourceID)
	ret.Metrics.BinlogMasterPosGauge = m.binlogPosGauge.WithLabelValues("master", taskName, sourceID)
	ret.Metrics.BinlogSyncerPosGauge = m.binlogPosGauge.WithLabelValues("syncer", taskName, sourceID)
	ret.Metrics.BinlogMasterFileGauge = m.binlogFileGauge.WithLabelValues("master", taskName, sourceID)
	ret.Metrics.BinlogSyncerFileGauge = m.binlogFileGauge.WithLabelValues("syncer", taskName, sourceID)
	ret.Metrics.BinlogEventRowHistogram = m.binlogEventRowHistogram.WithLabelValues(workerName, taskName, sourceID)
	ret.Metrics.TxnHistogram = m.txnHistogram.WithLabelValues(taskName, workerName, sourceID)
	ret.Metrics.QueryHistogram = m.queryHistogram.WithLabelValues(taskName, workerName, sourceID)
	ret.Metrics.ExitWithResumableErrorCounter = m.syncerExitWithErrorCounter.WithLabelValues(taskName, sourceID, "true")
	ret.Metrics.ExitWithNonResumableErrorCounter = m.syncerExitWithErrorCounter.WithLabelValues(taskName, sourceID, "false")
	ret.Metrics.ReplicationLagGauge = m.replicationLagGauge.WithLabelValues(taskName, sourceID, workerName)
	ret.Metrics.ReplicationLagHistogram = m.replicationLagHistogram.WithLabelValues(taskName, sourceID, workerName)
	ret.Metrics.RemainingTimeGauge = m.remainingTimeGauge.WithLabelValues(taskName, sourceID, workerName)
	ret.Metrics.ShardLockResolving = m.shardLockResolving.WithLabelValues(taskName, sourceID)
	ret.Metrics.FinishedTransactionTotal = m.finishedTransactionTotal.WithLabelValues(taskName, workerName, sourceID)
	ret.Metrics.FlushCheckPointsTimeInterval = m.flushCheckPointsTimeInterval.WithLabelValues(workerName, taskName, sourceID)
	return &ret
}
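// A minimal usage sketch (not part of the original file): a syncer would
// typically call CacheForOneTask once when a subtask starts and then observe
// through the pre-bound fields in Metrics, so WithLabelValues is never called
// per binlog event. The task/worker/source names below are hypothetical.
//
//	proxies := DefaultMetricsProxies.CacheForOneTask("task-1", "worker-1", "source-1")
//	start := time.Now()
//	// ... read one binlog event ...
//	proxies.Metrics.BinlogReadDurationHistogram.Observe(time.Since(start).Seconds())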
// RegisterMetrics registers Proxies.
func (m *Proxies) RegisterMetrics(registry *prometheus.Registry) {
	registry.MustRegister(m.binlogReadDurationHistogram)
	registry.MustRegister(m.binlogEventSizeHistogram)
	registry.MustRegister(m.BinlogEventCost)
	registry.MustRegister(m.binlogEventRowHistogram)
	registry.MustRegister(m.conflictDetectDurationHistogram)
	registry.MustRegister(m.AddJobDurationHistogram)
	registry.MustRegister(m.DispatchBinlogDurationHistogram)
	registry.MustRegister(m.SkipBinlogDurationHistogram)
	registry.MustRegister(m.AddedJobsTotal)
	registry.MustRegister(m.FinishedJobsTotal)
	registry.MustRegister(m.QueueSizeGauge)
	registry.MustRegister(m.binlogPosGauge)
	registry.MustRegister(m.binlogFileGauge)
	registry.MustRegister(m.txnHistogram)
	registry.MustRegister(m.StmtHistogram)
	registry.MustRegister(m.queryHistogram)
	registry.MustRegister(m.syncerExitWithErrorCounter)
	registry.MustRegister(m.replicationLagGauge)
	registry.MustRegister(m.replicationLagHistogram)
	registry.MustRegister(m.remainingTimeGauge)
	registry.MustRegister(m.UnsyncedTableGauge)
	registry.MustRegister(m.shardLockResolving)
	registry.MustRegister(m.idealQPS)
	registry.MustRegister(m.finishedTransactionTotal)
	registry.MustRegister(m.ReplicationTransactionBatch)
	registry.MustRegister(m.flushCheckPointsTimeInterval)
}
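// Usage sketch (an assumption, not taken from this file): the caller owns the
// registry, so exposing the metrics is a matter of registering the proxies and
// serving that registry, e.g. with promhttp from
// github.com/prometheus/client_golang/prometheus/promhttp:
//
//	registry := prometheus.NewRegistry()
//	DefaultMetricsProxies.RegisterMetrics(registry)
//	http.Handle("/metrics", promhttp.HandlerFor(registry, promhttp.HandlerOpts{}))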
// RemoveLabelValuesWithTaskInMetrics cleans all Metrics related to the task.
func (m *Proxies) RemoveLabelValuesWithTaskInMetrics(task string) {
	m.binlogReadDurationHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.binlogEventSizeHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.BinlogEventCost.DeletePartialMatch(prometheus.Labels{"task": task})
	m.binlogEventRowHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.conflictDetectDurationHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.AddJobDurationHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.DispatchBinlogDurationHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.SkipBinlogDurationHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.AddedJobsTotal.DeletePartialMatch(prometheus.Labels{"task": task})
	m.FinishedJobsTotal.DeletePartialMatch(prometheus.Labels{"task": task})
	m.QueueSizeGauge.DeletePartialMatch(prometheus.Labels{"task": task})
	m.binlogPosGauge.DeletePartialMatch(prometheus.Labels{"task": task})
	m.binlogFileGauge.DeletePartialMatch(prometheus.Labels{"task": task})
	m.txnHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.StmtHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.queryHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.syncerExitWithErrorCounter.DeletePartialMatch(prometheus.Labels{"task": task})
	m.replicationLagGauge.DeletePartialMatch(prometheus.Labels{"task": task})
	m.replicationLagHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
	m.remainingTimeGauge.DeletePartialMatch(prometheus.Labels{"task": task})
	m.UnsyncedTableGauge.DeletePartialMatch(prometheus.Labels{"task": task})
	m.shardLockResolving.DeletePartialMatch(prometheus.Labels{"task": task})
	m.idealQPS.DeletePartialMatch(prometheus.Labels{"task": task})
	m.finishedTransactionTotal.DeletePartialMatch(prometheus.Labels{"task": task})
	m.ReplicationTransactionBatch.DeletePartialMatch(prometheus.Labels{"task": task})
	m.flushCheckPointsTimeInterval.DeletePartialMatch(prometheus.Labels{"task": task})
}
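// Cleanup sketch (illustrative only, task name hypothetical): when a subtask
// stops, all of its series can be dropped from every vector in one call,
// keyed by the "task" label:
//
//	DefaultMetricsProxies.RemoveLabelValuesWithTaskInMetrics("task-1")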