github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/relay/metrics.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package relay
    15  
    16  import (
    17  	"context"
    18  	"time"
    19  
    20  	"github.com/pingcap/failpoint"
    21  	"github.com/pingcap/tiflow/dm/pkg/log"
    22  	"github.com/pingcap/tiflow/dm/pkg/terror"
    23  	"github.com/pingcap/tiflow/dm/pkg/utils"
    24  	"github.com/pingcap/tiflow/engine/pkg/promutil"
    25  	"github.com/prometheus/client_golang/prometheus"
    26  )
    27  
// Prometheus collectors exported by the relay unit. All of them share the
// "dm" namespace and the "relay" subsystem, and are attached to a registry
// by RegisterMetrics.
var (
	// f is the factory used to build the gauge vectors declared below.
	f                = &promutil.PromFactory{}
	// relayLogPosGauge reports the current position inside the current binlog
	// file, labelled by "node".
	relayLogPosGauge = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "binlog_pos",
			Help:      "current binlog pos in current binlog file",
		}, []string{"node"})

	// relayLogFileGauge reports the index of the current binlog file,
	// labelled by "node".
	relayLogFileGauge = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "binlog_file",
			Help:      "current binlog file index",
		}, []string{"node"})

	// relaySubDirIndex carries the relay sub directory index separately from
	// relayLogPosGauge / relayLogFileGauge, so that relayLogFileGauge values
	// for master and relay are easier to compare.
	relaySubDirIndex = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "sub_dir_index",
			Help:      "current relay sub directory index",
		}, []string{"node", "uuid"})

	// relayLogSpaceGauge reports the storage space of the relay log directory.
	// should alert if available space < 10G.
	relayLogSpaceGauge = f.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "space",
			Help:      "the space of storage for relay component",
		}, []string{"type"}) // type can be 'capacity' and 'available'.

	// relayLogDataCorruptionCounter counts relay log data corruptions.
	// should alert.
	relayLogDataCorruptionCounter = prometheus.NewCounter(
		prometheus.CounterOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "data_corruption",
			Help:      "counter of relay log data corruption",
		})

	// relayLogWriteSizeHistogram observes the size of relay log writes
	// (presumably in bytes; confirm against the observing call site).
	// Buckets: 20 upper bounds doubling from 16 up to 16 * 2^19.
	relayLogWriteSizeHistogram = prometheus.NewHistogram(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "write_size",
			Help:      "write relay log size",
			Buckets:   prometheus.ExponentialBuckets(16, 2, 20),
		})

	// relayLogWriteDurationHistogram observes the time, in seconds, spent
	// writing a single relay log event.
	// Buckets: 25 upper bounds doubling from 5µs (largest is roughly 84s).
	relayLogWriteDurationHistogram = prometheus.NewHistogram(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "write_duration",
			Help:      "bucketed histogram of write time (s) of single relay log event",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		})

	// relayLogWriteErrorCounter counts errors hit while writing relay logs.
	// should alert.
	relayLogWriteErrorCounter = prometheus.NewCounter(
		prometheus.CounterOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "write_error_count",
			Help:      "write relay log error count",
		})

	// binlogReadErrorCounter counts errors hit while reading binlog from the
	// master.
	// should alert.
	binlogReadErrorCounter = prometheus.NewCounter(
		prometheus.CounterOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "read_error_count",
			Help:      "read binlog from master error count",
		})

	// binlogReadDurationHistogram observes the time, in seconds, spent reading
	// a single binlog event from the master.
	// Buckets: 25 upper bounds doubling from 5µs (largest is roughly 84s).
	binlogReadDurationHistogram = prometheus.NewHistogram(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "read_binlog_duration",
			Help:      "bucketed histogram of read time (s) of single binlog event from the master.",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		})

	// binlogTransformDurationHistogram observes the time, in seconds, spent
	// transforming a single binlog event.
	// Buckets: 25 upper bounds doubling from 5µs (largest is roughly 84s).
	binlogTransformDurationHistogram = prometheus.NewHistogram(
		prometheus.HistogramOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "read_transform_duration",
			Help:      "bucketed histogram of transform time (s) of single binlog event.",
			Buckets:   prometheus.ExponentialBuckets(0.000005, 2, 25),
		})

	// relayExitWithErrorCounter counts exits of the relay unit caused by an
	// error, labelled by "resumable_err".
	// should alert.
	relayExitWithErrorCounter = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "dm",
			Subsystem: "relay",
			Name:      "exit_with_error_count",
			Help:      "counter of relay unit exits with error",
		}, []string{"resumable_err"})
)
   137  
   138  // RegisterMetrics register metrics.
   139  func RegisterMetrics(registry *prometheus.Registry) {
   140  	registry.MustRegister(relayLogPosGauge)
   141  	registry.MustRegister(relayLogFileGauge)
   142  	registry.MustRegister(relaySubDirIndex)
   143  	registry.MustRegister(relayLogSpaceGauge)
   144  	registry.MustRegister(relayLogDataCorruptionCounter)
   145  	registry.MustRegister(relayLogWriteSizeHistogram)
   146  	registry.MustRegister(relayLogWriteDurationHistogram)
   147  	registry.MustRegister(relayLogWriteErrorCounter)
   148  	registry.MustRegister(binlogReadErrorCounter)
   149  	registry.MustRegister(binlogReadDurationHistogram)
   150  	registry.MustRegister(binlogTransformDurationHistogram)
   151  	registry.MustRegister(relayExitWithErrorCounter)
   152  }
   153  
   154  func reportRelayLogSpaceInBackground(ctx context.Context, dirpath string) error {
   155  	if len(dirpath) == 0 {
   156  		return terror.ErrRelayLogDirpathEmpty.Generate()
   157  	}
   158  
   159  	go func() {
   160  		var ticker *time.Ticker
   161  		ticker = time.NewTicker(time.Second * 10)
   162  		failpoint.Inject("ReportRelayLogSpaceInBackground", func(val failpoint.Value) {
   163  			t := val.(int)
   164  			ticker = time.NewTicker(time.Duration(t) * time.Second)
   165  		})
   166  		defer ticker.Stop()
   167  		for {
   168  			select {
   169  			case <-ctx.Done():
   170  				return
   171  			case <-ticker.C:
   172  				size, err := utils.GetStorageSize(dirpath)
   173  				if err != nil {
   174  					log.L().Error("fail to update relay log storage size", log.ShortError(err))
   175  				} else {
   176  					relayLogSpaceGauge.WithLabelValues("capacity").Set(float64(size.Capacity))
   177  					relayLogSpaceGauge.WithLabelValues("available").Set(float64(size.Available))
   178  				}
   179  			}
   180  		}
   181  	}()
   182  
   183  	return nil
   184  }