github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/relay/metrics.go (about) 1 // Copyright 2019 PingCAP, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package relay 15 16 import ( 17 "context" 18 "time" 19 20 "github.com/pingcap/failpoint" 21 "github.com/pingcap/tiflow/dm/pkg/log" 22 "github.com/pingcap/tiflow/dm/pkg/terror" 23 "github.com/pingcap/tiflow/dm/pkg/utils" 24 "github.com/pingcap/tiflow/engine/pkg/promutil" 25 "github.com/prometheus/client_golang/prometheus" 26 ) 27 28 var ( 29 f = &promutil.PromFactory{} 30 relayLogPosGauge = f.NewGaugeVec( 31 prometheus.GaugeOpts{ 32 Namespace: "dm", 33 Subsystem: "relay", 34 Name: "binlog_pos", 35 Help: "current binlog pos in current binlog file", 36 }, []string{"node"}) 37 38 relayLogFileGauge = f.NewGaugeVec( 39 prometheus.GaugeOpts{ 40 Namespace: "dm", 41 Subsystem: "relay", 42 Name: "binlog_file", 43 Help: "current binlog file index", 44 }, []string{"node"}) 45 46 // split sub directory info from relayLogPosGauge / relayLogFileGauge 47 // to make compare relayLogFileGauge for master / relay more easier. 48 relaySubDirIndex = f.NewGaugeVec( 49 prometheus.GaugeOpts{ 50 Namespace: "dm", 51 Subsystem: "relay", 52 Name: "sub_dir_index", 53 Help: "current relay sub directory index", 54 }, []string{"node", "uuid"}) 55 56 // should alert if available space < 10G. 57 relayLogSpaceGauge = f.NewGaugeVec( 58 prometheus.GaugeOpts{ 59 Namespace: "dm", 60 Subsystem: "relay", 61 Name: "space", 62 Help: "the space of storage for relay component", 63 }, []string{"type"}) // type can be 'capacity' and 'available'. 64 65 // should alert. 66 relayLogDataCorruptionCounter = prometheus.NewCounter( 67 prometheus.CounterOpts{ 68 Namespace: "dm", 69 Subsystem: "relay", 70 Name: "data_corruption", 71 Help: "counter of relay log data corruption", 72 }) 73 74 relayLogWriteSizeHistogram = prometheus.NewHistogram( 75 prometheus.HistogramOpts{ 76 Namespace: "dm", 77 Subsystem: "relay", 78 Name: "write_size", 79 Help: "write relay log size", 80 Buckets: prometheus.ExponentialBuckets(16, 2, 20), 81 }) 82 83 relayLogWriteDurationHistogram = prometheus.NewHistogram( 84 prometheus.HistogramOpts{ 85 Namespace: "dm", 86 Subsystem: "relay", 87 Name: "write_duration", 88 Help: "bucketed histogram of write time (s) of single relay log event", 89 Buckets: prometheus.ExponentialBuckets(0.000005, 2, 25), 90 }) 91 92 // should alert. 93 relayLogWriteErrorCounter = prometheus.NewCounter( 94 prometheus.CounterOpts{ 95 Namespace: "dm", 96 Subsystem: "relay", 97 Name: "write_error_count", 98 Help: "write relay log error count", 99 }) 100 101 // should alert. 102 binlogReadErrorCounter = prometheus.NewCounter( 103 prometheus.CounterOpts{ 104 Namespace: "dm", 105 Subsystem: "relay", 106 Name: "read_error_count", 107 Help: "read binlog from master error count", 108 }) 109 110 binlogReadDurationHistogram = prometheus.NewHistogram( 111 prometheus.HistogramOpts{ 112 Namespace: "dm", 113 Subsystem: "relay", 114 Name: "read_binlog_duration", 115 Help: "bucketed histogram of read time (s) of single binlog event from the master.", 116 Buckets: prometheus.ExponentialBuckets(0.000005, 2, 25), 117 }) 118 119 binlogTransformDurationHistogram = prometheus.NewHistogram( 120 prometheus.HistogramOpts{ 121 Namespace: "dm", 122 Subsystem: "relay", 123 Name: "read_transform_duration", 124 Help: "bucketed histogram of transform time (s) of single binlog event.", 125 Buckets: prometheus.ExponentialBuckets(0.000005, 2, 25), 126 }) 127 128 // should alert. 129 relayExitWithErrorCounter = prometheus.NewCounterVec( 130 prometheus.CounterOpts{ 131 Namespace: "dm", 132 Subsystem: "relay", 133 Name: "exit_with_error_count", 134 Help: "counter of relay unit exits with error", 135 }, []string{"resumable_err"}) 136 ) 137 138 // RegisterMetrics register metrics. 139 func RegisterMetrics(registry *prometheus.Registry) { 140 registry.MustRegister(relayLogPosGauge) 141 registry.MustRegister(relayLogFileGauge) 142 registry.MustRegister(relaySubDirIndex) 143 registry.MustRegister(relayLogSpaceGauge) 144 registry.MustRegister(relayLogDataCorruptionCounter) 145 registry.MustRegister(relayLogWriteSizeHistogram) 146 registry.MustRegister(relayLogWriteDurationHistogram) 147 registry.MustRegister(relayLogWriteErrorCounter) 148 registry.MustRegister(binlogReadErrorCounter) 149 registry.MustRegister(binlogReadDurationHistogram) 150 registry.MustRegister(binlogTransformDurationHistogram) 151 registry.MustRegister(relayExitWithErrorCounter) 152 } 153 154 func reportRelayLogSpaceInBackground(ctx context.Context, dirpath string) error { 155 if len(dirpath) == 0 { 156 return terror.ErrRelayLogDirpathEmpty.Generate() 157 } 158 159 go func() { 160 var ticker *time.Ticker 161 ticker = time.NewTicker(time.Second * 10) 162 failpoint.Inject("ReportRelayLogSpaceInBackground", func(val failpoint.Value) { 163 t := val.(int) 164 ticker = time.NewTicker(time.Duration(t) * time.Second) 165 }) 166 defer ticker.Stop() 167 for { 168 select { 169 case <-ctx.Done(): 170 return 171 case <-ticker.C: 172 size, err := utils.GetStorageSize(dirpath) 173 if err != nil { 174 log.L().Error("fail to update relay log storage size", log.ShortError(err)) 175 } else { 176 relayLogSpaceGauge.WithLabelValues("capacity").Set(float64(size.Capacity)) 177 relayLogSpaceGauge.WithLabelValues("available").Set(float64(size.Available)) 178 } 179 } 180 } 181 }() 182 183 return nil 184 }