github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/master/metrics/metrics.go (about)

     1  // Copyright 2020 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package metrics
    15  
    16  import (
    17  	"context"
    18  	"time"
    19  
    20  	cpu "github.com/pingcap/tidb-tools/pkg/utils"
    21  	"github.com/pingcap/tiflow/engine/pkg/promutil"
    22  	"github.com/prometheus/client_golang/prometheus"
    23  )
    24  
    25  // used for ddlPendingCounter, no "Resolved" lock because they will be
    26  // remove quickly and not pending anymore.
    27  const (
    28  	DDLPendingNone     = "None"
    29  	DDLPendingUnSynced = "Un-synced"
    30  	DDLPendingSynced   = "Synced"
    31  )
    32  
    33  // used to show error type when handle DDLs.
    34  const (
    35  	InfoErrSyncLock    = "InfoPut - SyncLockError"
    36  	InfoErrHandleLock  = "InfoPut - HandleLockError"
    37  	OpErrRemoveLock    = "OperationPut - RemoveLockError"
    38  	OpErrLockUnSynced  = "OperationPut - LockUnSyncedError"
    39  	OpErrPutNonOwnerOp = "OperationPut - PutNonOwnerOpError"
    40  )
    41  
    42  // used to represent worker event error type.
    43  const (
    44  	WorkerEventHandle = "handle"
    45  	WorkerEventWatch  = "watch"
    46  )
    47  
    48  var (
    49  	f           = &promutil.PromFactory{}
    50  	workerState = f.NewGaugeVec(
    51  		prometheus.GaugeOpts{
    52  			Namespace: "dm",
    53  			Subsystem: "master",
    54  			Name:      "worker_state",
    55  			Help:      "state of worker, -1 - unrecognized, 0 - offline, 1 - free, 2 - bound",
    56  		}, []string{"worker"})
    57  
    58  	cpuUsageGauge = prometheus.NewGauge(
    59  		prometheus.GaugeOpts{
    60  			Namespace: "dm",
    61  			Subsystem: "master",
    62  			Name:      "cpu_usage",
    63  			Help:      "the cpu usage of master",
    64  		})
    65  
    66  	ddlPendingCounter = f.NewGaugeVec(
    67  		prometheus.GaugeOpts{
    68  			Namespace: "dm",
    69  			Subsystem: "master",
    70  			Name:      "ddl_state_number",
    71  			Help:      "number of pending DDL in different states, Un-synced (waiting all upstream), Synced (all upstream finished, waiting all downstream)",
    72  		}, []string{"task", "type"})
    73  
    74  	ddlErrCounter = f.NewCounterVec(
    75  		prometheus.CounterOpts{
    76  			Namespace: "dm",
    77  			Subsystem: "master",
    78  			Name:      "shard_ddl_error",
    79  			Help:      "number of shard DDL lock/operation error",
    80  		}, []string{"task", "type"})
    81  
    82  	workerEventErrCounter = f.NewCounterVec(
    83  		prometheus.CounterOpts{
    84  			Namespace: "dm",
    85  			Subsystem: "master",
    86  			Name:      "worker_event_error",
    87  			Help:      "number of error related to worker event, during handling or watching",
    88  		}, []string{"type"})
    89  
    90  	startLeaderCounter = prometheus.NewCounter(
    91  		prometheus.CounterOpts{
    92  			Namespace: "dm",
    93  			Subsystem: "master",
    94  			Name:      "start_leader_counter",
    95  			Help:      "number of this dm-master try to start leader components",
    96  		})
    97  )
    98  
    99  func collectMetrics() {
   100  	cpuUsage := cpu.GetCPUPercentage()
   101  	cpuUsageGauge.Set(cpuUsage)
   102  }
   103  
   104  // RunBackgroundJob do periodic job.
   105  func RunBackgroundJob(ctx context.Context) {
   106  	ticker := time.NewTicker(time.Second * 10)
   107  	defer ticker.Stop()
   108  
   109  	for {
   110  		select {
   111  		case <-ticker.C:
   112  			collectMetrics()
   113  
   114  		case <-ctx.Done():
   115  			return
   116  		}
   117  	}
   118  }
   119  
   120  // RegistryMetrics registries metrics for worker.
   121  func RegistryMetrics() {
   122  	registry := prometheus.DefaultRegisterer
   123  
   124  	registry.MustRegister(workerState)
   125  	registry.MustRegister(cpuUsageGauge)
   126  	registry.MustRegister(ddlPendingCounter)
   127  	registry.MustRegister(ddlErrCounter)
   128  	registry.MustRegister(workerEventErrCounter)
   129  	registry.MustRegister(startLeaderCounter)
   130  }
   131  
   132  // ReportWorkerStage is a setter for workerState.
   133  func ReportWorkerStage(name string, state float64) {
   134  	workerState.WithLabelValues(name).Set(state)
   135  }
   136  
   137  // RemoveWorkerState cleans state of deleted worker.
   138  func RemoveWorkerState(name string) {
   139  	workerState.DeletePartialMatch(prometheus.Labels{"worker": name})
   140  }
   141  
   142  // ReportDDLPending inc/dec by 1 to ddlPendingCounter.
   143  func ReportDDLPending(task, oldStatus, newStatus string) {
   144  	if oldStatus != DDLPendingNone {
   145  		ddlPendingCounter.WithLabelValues(task, oldStatus).Dec()
   146  	}
   147  	if newStatus != DDLPendingNone {
   148  		ddlPendingCounter.WithLabelValues(task, newStatus).Inc()
   149  	}
   150  }
   151  
   152  // RemoveDDLPending removes all counter of this task.
   153  func RemoveDDLPending(task string) {
   154  	ddlPendingCounter.DeletePartialMatch(prometheus.Labels{"task": task})
   155  }
   156  
   157  // ReportDDLError is a setter for ddlErrCounter.
   158  func ReportDDLError(task, errType string) {
   159  	ddlErrCounter.WithLabelValues(task, errType).Inc()
   160  }
   161  
   162  // ReportWorkerEventErr is a setter for workerEventErrCounter.
   163  func ReportWorkerEventErr(errType string) {
   164  	workerEventErrCounter.WithLabelValues(errType).Inc()
   165  }
   166  
   167  // ReportStartLeader increases startLeaderCounter by one.
   168  func ReportStartLeader() {
   169  	startLeaderCounter.Inc()
   170  }
   171  
   172  // OnRetireLeader cleans some metrics when retires.
   173  func OnRetireLeader() {
   174  	workerState.Reset()
   175  	ddlErrCounter.Reset()
   176  	ddlPendingCounter.Reset()
   177  	workerEventErrCounter.Reset()
   178  }