github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/syncer/metrics/metrics.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package metrics
    15  
    16  import (
    17  	"github.com/pingcap/tiflow/engine/pkg/promutil"
    18  	"github.com/prometheus/client_golang/prometheus"
    19  )
    20  
    21  // for BinlogEventCost metric stage field.
    22  const (
    23  	BinlogEventCostStageDDLExec = "ddl-exec"
    24  	BinlogEventCostStageDMLExec = "dml-exec"
    25  
    26  	BinlogEventCostStageGenWriteRows  = "gen-write-rows"
    27  	BinlogEventCostStageGenUpdateRows = "gen-update-rows"
    28  	BinlogEventCostStageGenDeleteRows = "gen-delete-rows"
    29  	BinlogEventCostStageGenQuery      = "gen-query"
    30  )
    31  
// Metrics groups syncer's metric variables.
//
// Every field is one child of a metric vector held by Proxies, obtained via
// WithLabelValues in Proxies.CacheForOneTask for a fixed task / worker /
// source, so it can be observed or set directly without a label lookup.
type Metrics struct {
	// read time (s) for a single binlog event from the relay log or master.
	BinlogReadDurationHistogram      prometheus.Observer
	// size of a binlog event.
	BinlogEventSizeHistogram         prometheus.Observer
	// conflict detect time (s) for a single DML statement.
	ConflictDetectDurationHistogram  prometheus.Observer
	// the highest QPS that can be achieved ideally.
	IdealQPS                         prometheus.Gauge
	// current binlog pos, bound with node="master".
	BinlogMasterPosGauge             prometheus.Gauge
	// current binlog pos, bound with node="syncer".
	BinlogSyncerPosGauge             prometheus.Gauge
	// current binlog file index, bound with node="master".
	BinlogMasterFileGauge            prometheus.Gauge
	// current binlog file index, bound with node="syncer".
	BinlogSyncerFileGauge            prometheus.Gauge
	// number of rows in a binlog event.
	BinlogEventRowHistogram          prometheus.Observer
	// processing time (s) of a txn.
	TxnHistogram                     prometheus.Observer
	// query time (s).
	QueryHistogram                   prometheus.Observer
	// syncer exits with error, bound with resumable_err="true".
	ExitWithResumableErrorCounter    prometheus.Counter
	// syncer exits with error, bound with resumable_err="false".
	ExitWithNonResumableErrorCounter prometheus.Counter
	// replication lag in seconds between mysql and syncer (gauge).
	ReplicationLagGauge              prometheus.Gauge
	// replication lag in seconds between mysql and syncer (histogram).
	ReplicationLagHistogram          prometheus.Observer
	// the remaining time in seconds to catch up master.
	RemainingTimeGauge               prometheus.Gauge
	// waiting shard DDL lock to be resolved.
	ShardLockResolving               prometheus.Gauge
	// total number of finished transactions.
	FinishedTransactionTotal         prometheus.Counter
	// checkpoint flushed time interval in seconds.
	FlushCheckPointsTimeInterval     prometheus.Observer
}
    54  
// Proxies provides the ability to clean Metrics values when syncer is closed.
// private members have a corresponding cached variable in Metrics.
//
// The vector fields are created by Init. Unexported vectors are reached
// through the pre-bound children stored in Metrics (see CacheForOneTask);
// exported vectors are used directly by callers, who supply the label values
// themselves. The label names listed below are the ones passed to the factory
// in Init, in that order.
type Proxies struct {
	// Metrics holds the cached children; it is filled by CacheForOneTask.
	Metrics                         *Metrics
	binlogReadDurationHistogram     *prometheus.HistogramVec
	binlogEventSizeHistogram        *prometheus.HistogramVec
	// labels: stage, task, worker, source_id.
	BinlogEventCost                 *prometheus.HistogramVec
	conflictDetectDurationHistogram *prometheus.HistogramVec
	// labels: type, task, queueNo, source_id.
	AddJobDurationHistogram         *prometheus.HistogramVec
	// dispatch/add multiple jobs for one binlog event.
	// NOTE: only observe for DML now.
	// labels: type, task, source_id.
	DispatchBinlogDurationHistogram *prometheus.HistogramVec
	// labels: type, task, source_id.
	SkipBinlogDurationHistogram     *prometheus.HistogramVec
	// labels: type, task, queueNo, source_id, worker, target_schema, target_table.
	AddedJobsTotal                  *prometheus.CounterVec
	// labels: type, task, queueNo, source_id, worker, target_schema, target_table.
	FinishedJobsTotal               *prometheus.CounterVec
	idealQPS                        *prometheus.GaugeVec
	// labels: task, queue_id, source_id.
	QueueSizeGauge                  *prometheus.GaugeVec
	binlogPosGauge                  *prometheus.GaugeVec
	binlogFileGauge                 *prometheus.GaugeVec
	binlogEventRowHistogram         *prometheus.HistogramVec
	txnHistogram                    *prometheus.HistogramVec
	queryHistogram                  *prometheus.HistogramVec
	// labels: type, task.
	StmtHistogram                   *prometheus.HistogramVec
	syncerExitWithErrorCounter      *prometheus.CounterVec
	replicationLagGauge             *prometheus.GaugeVec
	replicationLagHistogram         *prometheus.HistogramVec
	remainingTimeGauge              *prometheus.GaugeVec
	// labels: task, table, source_id.
	UnsyncedTableGauge              *prometheus.GaugeVec
	shardLockResolving              *prometheus.GaugeVec
	finishedTransactionTotal        *prometheus.CounterVec
	// labels: worker, task, source_id, queueNo, type.
	ReplicationTransactionBatch     *prometheus.HistogramVec
	flushCheckPointsTimeInterval    *prometheus.HistogramVec
}
    88  
    89  var DefaultMetricsProxies *Proxies
    90  
    91  func init() {
    92  	DefaultMetricsProxies = &Proxies{}
    93  	DefaultMetricsProxies.Init(&promutil.PromFactory{})
    94  }
    95  
    96  // Init creates Metrics proxy variables from Factory.
    97  func (m *Proxies) Init(f promutil.Factory) {
    98  	m.binlogReadDurationHistogram = f.NewHistogramVec(
    99  		prometheus.HistogramOpts{
   100  			Namespace: "dm",
   101  			Subsystem: "syncer",
   102  			Name:      "read_binlog_duration",
   103  			Help:      "bucketed histogram of read time (s) for single binlog event from the relay log or master.",
   104  			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
   105  		}, []string{"task", "source_id"})
   106  	m.binlogEventSizeHistogram = f.NewHistogramVec(
   107  		prometheus.HistogramOpts{
   108  			Namespace: "dm",
   109  			Subsystem: "syncer",
   110  			Name:      "binlog_event_size",
   111  			Help:      "size of a binlog event",
   112  			Buckets:   prometheus.ExponentialBuckets(16, 2, 20),
   113  		}, []string{"task", "worker", "source_id"})
   114  	m.BinlogEventCost = f.NewHistogramVec(
   115  		prometheus.HistogramOpts{
   116  			Namespace: "dm",
   117  			Subsystem: "syncer",
   118  			Name:      "binlog_transform_cost",
   119  			Help:      "cost of binlog event transform",
   120  			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
   121  		}, []string{"stage", "task", "worker", "source_id"})
   122  	m.conflictDetectDurationHistogram = f.NewHistogramVec(
   123  		prometheus.HistogramOpts{
   124  			Namespace: "dm",
   125  			Subsystem: "syncer",
   126  			Name:      "conflict_detect_duration",
   127  			Help:      "bucketed histogram of conflict detect time (s) for single DML statement",
   128  			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
   129  		}, []string{"task", "source_id"})
   130  	m.AddJobDurationHistogram = f.NewHistogramVec(
   131  		prometheus.HistogramOpts{
   132  			Namespace: "dm",
   133  			Subsystem: "syncer",
   134  			Name:      "add_job_duration",
   135  			Help:      "bucketed histogram of add a job to the queue time (s)",
   136  			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
   137  		}, []string{"type", "task", "queueNo", "source_id"})
   138  	m.DispatchBinlogDurationHistogram = f.NewHistogramVec(
   139  		prometheus.HistogramOpts{
   140  			Namespace: "dm",
   141  			Subsystem: "syncer",
   142  			Name:      "dispatch_binlog_duration",
   143  			Help:      "bucketed histogram of dispatch a binlog event time (s)",
   144  			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
   145  		}, []string{"type", "task", "source_id"})
   146  	m.SkipBinlogDurationHistogram = f.NewHistogramVec(
   147  		prometheus.HistogramOpts{
   148  			Namespace: "dm",
   149  			Subsystem: "syncer",
   150  			Name:      "skip_binlog_duration",
   151  			Help:      "bucketed histogram of skip a binlog event time (s)",
   152  			Buckets:   prometheus.ExponentialBuckets(0.0000005, 2, 25), // this should be very fast.
   153  		}, []string{"type", "task", "source_id"})
   154  	m.AddedJobsTotal = f.NewCounterVec(
   155  		prometheus.CounterOpts{
   156  			Namespace: "dm",
   157  			Subsystem: "syncer",
   158  			Name:      "added_jobs_total",
   159  			Help:      "total number of added jobs",
   160  		}, []string{"type", "task", "queueNo", "source_id", "worker", "target_schema", "target_table"})
   161  	m.FinishedJobsTotal = f.NewCounterVec(
   162  		prometheus.CounterOpts{
   163  			Namespace: "dm",
   164  			Subsystem: "syncer",
   165  			Name:      "finished_jobs_total",
   166  			Help:      "total number of finished jobs",
   167  		}, []string{"type", "task", "queueNo", "source_id", "worker", "target_schema", "target_table"})
   168  	m.idealQPS = f.NewGaugeVec(
   169  		prometheus.GaugeOpts{
   170  			Namespace: "dm",
   171  			Subsystem: "syncer",
   172  			Name:      "ideal_qps",
   173  			Help:      "the highest QPS that can be achieved ideally",
   174  		}, []string{"task", "worker", "source_id"})
   175  	m.QueueSizeGauge = f.NewGaugeVec(
   176  		prometheus.GaugeOpts{
   177  			Namespace: "dm",
   178  			Subsystem: "syncer",
   179  			Name:      "queue_size",
   180  			Help:      "remain size of the DML queue",
   181  		}, []string{"task", "queue_id", "source_id"})
   182  	m.binlogPosGauge = f.NewGaugeVec(
   183  		prometheus.GaugeOpts{
   184  			Namespace: "dm",
   185  			Subsystem: "syncer",
   186  			Name:      "binlog_pos",
   187  			Help:      "current binlog pos",
   188  		}, []string{"node", "task", "source_id"})
   189  	m.binlogFileGauge = f.NewGaugeVec(
   190  		prometheus.GaugeOpts{
   191  			Namespace: "dm",
   192  			Subsystem: "syncer",
   193  			Name:      "binlog_file",
   194  			Help:      "current binlog file index",
   195  		}, []string{"node", "task", "source_id"})
   196  	m.binlogEventRowHistogram = f.NewHistogramVec(
   197  		prometheus.HistogramOpts{
   198  			Namespace: "dm",
   199  			Subsystem: "syncer",
   200  			Name:      "binlog_event_row",
   201  			Help:      "number of rows in a binlog event",
   202  			Buckets:   prometheus.LinearBuckets(0, 100, 101), // linear from 0 to 10000, i think this is enough
   203  		}, []string{"worker", "task", "source_id"})
   204  	m.txnHistogram = f.NewHistogramVec(
   205  		prometheus.HistogramOpts{
   206  			Namespace: "dm",
   207  			Subsystem: "syncer",
   208  			Name:      "txn_duration_time",
   209  			Help:      "Bucketed histogram of processing time (s) of a txn.",
   210  			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
   211  		}, []string{"task", "worker", "source_id"})
   212  	m.queryHistogram = f.NewHistogramVec(
   213  		prometheus.HistogramOpts{
   214  			Namespace: "dm",
   215  			Subsystem: "syncer",
   216  			Name:      "query_duration_time",
   217  			Help:      "Bucketed histogram of query time (s).",
   218  			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
   219  		}, []string{"task", "worker", "source_id"})
   220  	m.StmtHistogram = f.NewHistogramVec(
   221  		prometheus.HistogramOpts{
   222  			Namespace: "dm",
   223  			Subsystem: "syncer",
   224  			Name:      "stmt_duration_time",
   225  			Help:      "Bucketed histogram of every statement query time (s).",
   226  			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
   227  		}, []string{"type", "task"})
   228  	m.syncerExitWithErrorCounter = f.NewCounterVec(
   229  		prometheus.CounterOpts{
   230  			Namespace: "dm",
   231  			Subsystem: "syncer",
   232  			Name:      "exit_with_error_count",
   233  			Help:      "counter for syncer exits with error",
   234  		}, []string{"task", "source_id", "resumable_err"})
   235  	m.replicationLagGauge = f.NewGaugeVec(
   236  		prometheus.GaugeOpts{
   237  			Namespace: "dm",
   238  			Subsystem: "syncer",
   239  			Name:      "replication_lag_gauge",
   240  			Help:      "replication lag gauge in second between mysql and syncer",
   241  		}, []string{"task", "source_id", "worker"})
   242  	m.replicationLagHistogram = f.NewHistogramVec(
   243  		prometheus.HistogramOpts{
   244  			Namespace: "dm",
   245  			Subsystem: "syncer",
   246  			Name:      "replication_lag",
   247  			Help:      "replication lag histogram in second between mysql and syncer",
   248  			Buckets:   prometheus.ExponentialBuckets(0.5, 2, 12), // exponential from 0.5s to 1024s
   249  		}, []string{"task", "source_id", "worker"})
   250  	m.remainingTimeGauge = f.NewGaugeVec(
   251  		prometheus.GaugeOpts{
   252  			Namespace: "dm",
   253  			Subsystem: "syncer",
   254  			Name:      "remaining_time",
   255  			Help:      "the remaining time in second to catch up master",
   256  		}, []string{"task", "source_id", "worker"})
   257  	m.UnsyncedTableGauge = f.NewGaugeVec(
   258  		prometheus.GaugeOpts{
   259  			Namespace: "dm",
   260  			Subsystem: "syncer",
   261  			Name:      "unsynced_table_number",
   262  			Help:      "number of unsynced tables in the subtask",
   263  		}, []string{"task", "table", "source_id"})
   264  	m.shardLockResolving = f.NewGaugeVec(
   265  		prometheus.GaugeOpts{
   266  			Namespace: "dm",
   267  			Subsystem: "syncer",
   268  			Name:      "shard_lock_resolving",
   269  			Help:      "waiting shard DDL lock to be resolved",
   270  		}, []string{"task", "source_id"})
   271  	m.finishedTransactionTotal = f.NewCounterVec(
   272  		prometheus.CounterOpts{
   273  			Namespace: "dm",
   274  			Subsystem: "syncer",
   275  			Name:      "finished_transaction_total",
   276  			Help:      "total number of finished transaction",
   277  		}, []string{"task", "worker", "source_id"})
   278  	m.ReplicationTransactionBatch = f.NewHistogramVec(
   279  		prometheus.HistogramOpts{
   280  			Namespace: "dm",
   281  			Subsystem: "syncer",
   282  			Name:      "replication_transaction_batch",
   283  			Help:      "number of sql's contained in a transaction that executed to downstream",
   284  			Buckets:   prometheus.LinearBuckets(1, 50, 21), // linear from 1 to 1001
   285  		}, []string{"worker", "task", "source_id", "queueNo", "type"})
   286  	m.flushCheckPointsTimeInterval = f.NewHistogramVec(
   287  		prometheus.HistogramOpts{
   288  			Namespace: "dm",
   289  			Subsystem: "syncer",
   290  			Name:      "flush_checkpoints_time_interval",
   291  			Help:      "checkpoint flushed time interval in seconds",
   292  			Buckets:   prometheus.LinearBuckets(1, 50, 21), // linear from 1 to 1001, i think this is enough
   293  		}, []string{"worker", "task", "source_id"})
   294  }
   295  
   296  // CacheForOneTask returns a new Proxies with m.Metrics filled. It is used
   297  // to avoid calling WithLabelValues in hot path.
   298  func (m *Proxies) CacheForOneTask(taskName, workerName, sourceID string) *Proxies {
   299  	ret := *m
   300  	ret.Metrics = &Metrics{}
   301  	ret.Metrics.BinlogReadDurationHistogram = m.binlogReadDurationHistogram.WithLabelValues(taskName, sourceID)
   302  	ret.Metrics.BinlogEventSizeHistogram = m.binlogEventSizeHistogram.WithLabelValues(taskName, workerName, sourceID)
   303  	ret.Metrics.ConflictDetectDurationHistogram = m.conflictDetectDurationHistogram.WithLabelValues(taskName, sourceID)
   304  	ret.Metrics.IdealQPS = m.idealQPS.WithLabelValues(taskName, workerName, sourceID)
   305  	ret.Metrics.BinlogMasterPosGauge = m.binlogPosGauge.WithLabelValues("master", taskName, sourceID)
   306  	ret.Metrics.BinlogSyncerPosGauge = m.binlogPosGauge.WithLabelValues("syncer", taskName, sourceID)
   307  	ret.Metrics.BinlogMasterFileGauge = m.binlogFileGauge.WithLabelValues("master", taskName, sourceID)
   308  	ret.Metrics.BinlogSyncerFileGauge = m.binlogFileGauge.WithLabelValues("syncer", taskName, sourceID)
   309  	ret.Metrics.BinlogEventRowHistogram = m.binlogEventRowHistogram.WithLabelValues(workerName, taskName, sourceID)
   310  	ret.Metrics.TxnHistogram = m.txnHistogram.WithLabelValues(taskName, workerName, sourceID)
   311  	ret.Metrics.QueryHistogram = m.queryHistogram.WithLabelValues(taskName, workerName, sourceID)
   312  	ret.Metrics.ExitWithResumableErrorCounter = m.syncerExitWithErrorCounter.WithLabelValues(taskName, sourceID, "true")
   313  	ret.Metrics.ExitWithNonResumableErrorCounter = m.syncerExitWithErrorCounter.WithLabelValues(taskName, sourceID, "false")
   314  	ret.Metrics.ReplicationLagGauge = m.replicationLagGauge.WithLabelValues(taskName, sourceID, workerName)
   315  	ret.Metrics.ReplicationLagHistogram = m.replicationLagHistogram.WithLabelValues(taskName, sourceID, workerName)
   316  	ret.Metrics.RemainingTimeGauge = m.remainingTimeGauge.WithLabelValues(taskName, sourceID, workerName)
   317  	ret.Metrics.ShardLockResolving = m.shardLockResolving.WithLabelValues(taskName, sourceID)
   318  	ret.Metrics.FinishedTransactionTotal = m.finishedTransactionTotal.WithLabelValues(taskName, workerName, sourceID)
   319  	ret.Metrics.FlushCheckPointsTimeInterval = m.flushCheckPointsTimeInterval.WithLabelValues(workerName, taskName, sourceID)
   320  	return &ret
   321  }
   322  
   323  // RegisterMetrics registers Proxies.
   324  func (m *Proxies) RegisterMetrics(registry *prometheus.Registry) {
   325  	registry.MustRegister(m.binlogReadDurationHistogram)
   326  	registry.MustRegister(m.binlogEventSizeHistogram)
   327  	registry.MustRegister(m.BinlogEventCost)
   328  	registry.MustRegister(m.binlogEventRowHistogram)
   329  	registry.MustRegister(m.conflictDetectDurationHistogram)
   330  	registry.MustRegister(m.AddJobDurationHistogram)
   331  	registry.MustRegister(m.DispatchBinlogDurationHistogram)
   332  	registry.MustRegister(m.SkipBinlogDurationHistogram)
   333  	registry.MustRegister(m.AddedJobsTotal)
   334  	registry.MustRegister(m.FinishedJobsTotal)
   335  	registry.MustRegister(m.QueueSizeGauge)
   336  	registry.MustRegister(m.binlogPosGauge)
   337  	registry.MustRegister(m.binlogFileGauge)
   338  	registry.MustRegister(m.txnHistogram)
   339  	registry.MustRegister(m.StmtHistogram)
   340  	registry.MustRegister(m.queryHistogram)
   341  	registry.MustRegister(m.syncerExitWithErrorCounter)
   342  	registry.MustRegister(m.replicationLagGauge)
   343  	registry.MustRegister(m.replicationLagHistogram)
   344  	registry.MustRegister(m.remainingTimeGauge)
   345  	registry.MustRegister(m.UnsyncedTableGauge)
   346  	registry.MustRegister(m.shardLockResolving)
   347  	registry.MustRegister(m.idealQPS)
   348  	registry.MustRegister(m.finishedTransactionTotal)
   349  	registry.MustRegister(m.ReplicationTransactionBatch)
   350  	registry.MustRegister(m.flushCheckPointsTimeInterval)
   351  }
   352  
   353  // RemoveLabelValuesWithTaskInMetrics cleans all Metrics related to the task.
   354  func (m *Proxies) RemoveLabelValuesWithTaskInMetrics(task string) {
   355  	m.binlogReadDurationHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   356  	m.binlogEventSizeHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   357  	m.BinlogEventCost.DeletePartialMatch(prometheus.Labels{"task": task})
   358  	m.binlogEventRowHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   359  	m.conflictDetectDurationHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   360  	m.AddJobDurationHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   361  	m.DispatchBinlogDurationHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   362  	m.SkipBinlogDurationHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   363  	m.AddedJobsTotal.DeletePartialMatch(prometheus.Labels{"task": task})
   364  	m.FinishedJobsTotal.DeletePartialMatch(prometheus.Labels{"task": task})
   365  	m.QueueSizeGauge.DeletePartialMatch(prometheus.Labels{"task": task})
   366  	m.binlogPosGauge.DeletePartialMatch(prometheus.Labels{"task": task})
   367  	m.binlogFileGauge.DeletePartialMatch(prometheus.Labels{"task": task})
   368  	m.txnHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   369  	m.StmtHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   370  	m.queryHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   371  	m.syncerExitWithErrorCounter.DeletePartialMatch(prometheus.Labels{"task": task})
   372  	m.replicationLagGauge.DeletePartialMatch(prometheus.Labels{"task": task})
   373  	m.replicationLagHistogram.DeletePartialMatch(prometheus.Labels{"task": task})
   374  	m.remainingTimeGauge.DeletePartialMatch(prometheus.Labels{"task": task})
   375  	m.UnsyncedTableGauge.DeletePartialMatch(prometheus.Labels{"task": task})
   376  	m.shardLockResolving.DeletePartialMatch(prometheus.Labels{"task": task})
   377  	m.idealQPS.DeletePartialMatch(prometheus.Labels{"task": task})
   378  	m.finishedTransactionTotal.DeletePartialMatch(prometheus.Labels{"task": task})
   379  	m.ReplicationTransactionBatch.DeletePartialMatch(prometheus.Labels{"task": task})
   380  	m.flushCheckPointsTimeInterval.DeletePartialMatch(prometheus.Labels{"task": task})
   381  }