// Source: github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/master/metrics/metrics.go

// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package metrics

import (
	"context"
	"time"

	cpu "github.com/pingcap/tidb-tools/pkg/utils"
	"github.com/pingcap/tiflow/engine/pkg/promutil"
	"github.com/prometheus/client_golang/prometheus"
)

// Status values used for ddlPendingCounter. There is no "Resolved" status
// because resolved locks will be removed quickly and are not pending anymore.
// DDLPendingNone is never recorded as a label value: ReportDDLPending skips
// both the decrement and the increment for it.
const (
	DDLPendingNone     = "None"
	DDLPendingUnSynced = "Un-synced"
	DDLPendingSynced   = "Synced"
)

// Error types shown when handling DDLs fails.
// NOTE(review): presumably passed as errType to ReportDDLError — confirm
// against callers.
const (
	InfoErrSyncLock    = "InfoPut - SyncLockError"
	InfoErrHandleLock  = "InfoPut - HandleLockError"
	OpErrRemoveLock    = "OperationPut - RemoveLockError"
	OpErrLockUnSynced  = "OperationPut - LockUnSyncedError"
	OpErrPutNonOwnerOp = "OperationPut - PutNonOwnerOpError"
)

// used to represent the worker event error type.
43 const ( 44 WorkerEventHandle = "handle" 45 WorkerEventWatch = "watch" 46 ) 47 48 var ( 49 f = &promutil.PromFactory{} 50 workerState = f.NewGaugeVec( 51 prometheus.GaugeOpts{ 52 Namespace: "dm", 53 Subsystem: "master", 54 Name: "worker_state", 55 Help: "state of worker, -1 - unrecognized, 0 - offline, 1 - free, 2 - bound", 56 }, []string{"worker"}) 57 58 cpuUsageGauge = prometheus.NewGauge( 59 prometheus.GaugeOpts{ 60 Namespace: "dm", 61 Subsystem: "master", 62 Name: "cpu_usage", 63 Help: "the cpu usage of master", 64 }) 65 66 ddlPendingCounter = f.NewGaugeVec( 67 prometheus.GaugeOpts{ 68 Namespace: "dm", 69 Subsystem: "master", 70 Name: "ddl_state_number", 71 Help: "number of pending DDL in different states, Un-synced (waiting all upstream), Synced (all upstream finished, waiting all downstream)", 72 }, []string{"task", "type"}) 73 74 ddlErrCounter = f.NewCounterVec( 75 prometheus.CounterOpts{ 76 Namespace: "dm", 77 Subsystem: "master", 78 Name: "shard_ddl_error", 79 Help: "number of shard DDL lock/operation error", 80 }, []string{"task", "type"}) 81 82 workerEventErrCounter = f.NewCounterVec( 83 prometheus.CounterOpts{ 84 Namespace: "dm", 85 Subsystem: "master", 86 Name: "worker_event_error", 87 Help: "number of error related to worker event, during handling or watching", 88 }, []string{"type"}) 89 90 startLeaderCounter = prometheus.NewCounter( 91 prometheus.CounterOpts{ 92 Namespace: "dm", 93 Subsystem: "master", 94 Name: "start_leader_counter", 95 Help: "number of this dm-master try to start leader components", 96 }) 97 ) 98 99 func collectMetrics() { 100 cpuUsage := cpu.GetCPUPercentage() 101 cpuUsageGauge.Set(cpuUsage) 102 } 103 104 // RunBackgroundJob do periodic job. 
105 func RunBackgroundJob(ctx context.Context) { 106 ticker := time.NewTicker(time.Second * 10) 107 defer ticker.Stop() 108 109 for { 110 select { 111 case <-ticker.C: 112 collectMetrics() 113 114 case <-ctx.Done(): 115 return 116 } 117 } 118 } 119 120 // RegistryMetrics registries metrics for worker. 121 func RegistryMetrics() { 122 registry := prometheus.DefaultRegisterer 123 124 registry.MustRegister(workerState) 125 registry.MustRegister(cpuUsageGauge) 126 registry.MustRegister(ddlPendingCounter) 127 registry.MustRegister(ddlErrCounter) 128 registry.MustRegister(workerEventErrCounter) 129 registry.MustRegister(startLeaderCounter) 130 } 131 132 // ReportWorkerStage is a setter for workerState. 133 func ReportWorkerStage(name string, state float64) { 134 workerState.WithLabelValues(name).Set(state) 135 } 136 137 // RemoveWorkerState cleans state of deleted worker. 138 func RemoveWorkerState(name string) { 139 workerState.DeletePartialMatch(prometheus.Labels{"worker": name}) 140 } 141 142 // ReportDDLPending inc/dec by 1 to ddlPendingCounter. 143 func ReportDDLPending(task, oldStatus, newStatus string) { 144 if oldStatus != DDLPendingNone { 145 ddlPendingCounter.WithLabelValues(task, oldStatus).Dec() 146 } 147 if newStatus != DDLPendingNone { 148 ddlPendingCounter.WithLabelValues(task, newStatus).Inc() 149 } 150 } 151 152 // RemoveDDLPending removes all counter of this task. 153 func RemoveDDLPending(task string) { 154 ddlPendingCounter.DeletePartialMatch(prometheus.Labels{"task": task}) 155 } 156 157 // ReportDDLError is a setter for ddlErrCounter. 158 func ReportDDLError(task, errType string) { 159 ddlErrCounter.WithLabelValues(task, errType).Inc() 160 } 161 162 // ReportWorkerEventErr is a setter for workerEventErrCounter. 163 func ReportWorkerEventErr(errType string) { 164 workerEventErrCounter.WithLabelValues(errType).Inc() 165 } 166 167 // ReportStartLeader increases startLeaderCounter by one. 
168 func ReportStartLeader() { 169 startLeaderCounter.Inc() 170 } 171 172 // OnRetireLeader cleans some metrics when retires. 173 func OnRetireLeader() { 174 workerState.Reset() 175 ddlErrCounter.Reset() 176 ddlPendingCounter.Reset() 177 workerEventErrCounter.Reset() 178 }