github.com/pingcap/tiflow@v0.0.0-20240520035814-5bf52d54e205/dm/worker/metrics.go (about)

     1  // Copyright 2019 PingCAP, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package worker
    15  
    16  import (
    17  	"context"
    18  	"net"
    19  	"net/http"
    20  	"net/http/pprof"
    21  	"time"
    22  
    23  	cpu "github.com/pingcap/tidb-tools/pkg/utils"
    24  	"github.com/pingcap/tiflow/dm/common"
    25  	"github.com/pingcap/tiflow/dm/dumpling"
    26  	"github.com/pingcap/tiflow/dm/loader"
    27  	"github.com/pingcap/tiflow/dm/pkg/log"
    28  	"github.com/pingcap/tiflow/dm/relay"
    29  	"github.com/pingcap/tiflow/dm/syncer/metrics"
    30  	"github.com/pingcap/tiflow/engine/pkg/promutil"
    31  	"github.com/pingcap/tiflow/pkg/version"
    32  	"github.com/prometheus/client_golang/prometheus"
    33  	"github.com/prometheus/client_golang/prometheus/collectors"
    34  	"github.com/prometheus/client_golang/prometheus/promhttp"
    35  )
    36  
    37  const (
    38  	opErrTypeBeforeOp    = "BeforeAnyOp"
    39  	opErrTypeSourceBound = "SourceBound"
    40  	opErrTypeRelaySource = "RelaySource"
    41  )
    42  
    43  var (
    44  	f         = &promutil.PromFactory{}
    45  	taskState = f.NewGaugeVec(
    46  		prometheus.GaugeOpts{
    47  			Namespace: "dm",
    48  			Subsystem: "worker",
    49  			Name:      "task_state",
    50  			Help:      "state of task, 0 - invalidStage, 1 - New, 2 - Running, 3 - Paused, 4 - Stopped, 5 - Finished",
    51  		}, []string{"task", "source_id", "worker"})
    52  
    53  	// opErrCounter cleans on worker close, which is the same time dm-worker exits, so no explicit clean.
    54  	opErrCounter = f.NewCounterVec(
    55  		prometheus.CounterOpts{
    56  			Namespace: "dm",
    57  			Subsystem: "worker",
    58  			Name:      "operate_error",
    59  			Help:      "number of different operate error",
    60  		}, []string{"worker", "type"})
    61  
    62  	cpuUsageGauge = prometheus.NewGauge(
    63  		prometheus.GaugeOpts{
    64  			Namespace: "dm",
    65  			Subsystem: "worker",
    66  			Name:      "cpu_usage",
    67  			Help:      "the cpu usage of worker",
    68  		})
    69  )
    70  
    71  type statusHandler struct{}
    72  
    73  func (h *statusHandler) ServeHTTP(w http.ResponseWriter, req *http.Request) {
    74  	w.Header().Set("Content-Type", "text/plain")
    75  	text := version.GetRawInfo()
    76  	_, err := w.Write([]byte(text))
    77  	if err != nil && !common.IsErrNetClosing(err) {
    78  		log.L().Error("fail to write status response", log.ShortError(err))
    79  	}
    80  }
    81  
    82  // Note: handle error inside the function with returning it.
    83  func (s *Server) collectMetrics() {
    84  	// CPU usage metric
    85  	cpuUsage := cpu.GetCPUPercentage()
    86  	cpuUsageGauge.Set(cpuUsage)
    87  }
    88  
    89  func (s *Server) runBackgroundJob(ctx context.Context) {
    90  	ticker := time.NewTicker(time.Second * 10)
    91  	defer ticker.Stop()
    92  
    93  	for {
    94  		select {
    95  		case <-ticker.C:
    96  			s.collectMetrics()
    97  
    98  		case <-ctx.Done():
    99  			return
   100  		}
   101  	}
   102  }
   103  
   104  // RegistryMetrics registries metrics for worker.
   105  func RegistryMetrics() {
   106  	registry := prometheus.NewRegistry()
   107  	registry.MustRegister(prometheus.NewProcessCollector(prometheus.ProcessCollectorOpts{}))
   108  	registry.MustRegister(prometheus.NewGoCollector(
   109  		collectors.WithGoCollections(collectors.GoRuntimeMemStatsCollection | collectors.GoRuntimeMetricsCollection)))
   110  
   111  	registry.MustRegister(cpuUsageGauge)
   112  
   113  	registry.MustRegister(taskState)
   114  	registry.MustRegister(opErrCounter)
   115  
   116  	relay.RegisterMetrics(registry)
   117  	dumpling.RegisterMetrics(registry)
   118  	loader.RegisterMetrics(registry)
   119  	metrics.RegisterValidatorMetrics(registry)
   120  	metrics.DefaultMetricsProxies.RegisterMetrics(registry)
   121  	prometheus.DefaultGatherer = registry
   122  }
   123  
   124  // InitStatus initializes the HTTP status server.
   125  func InitStatus(lis net.Listener) {
   126  	mux := http.NewServeMux()
   127  	mux.Handle("/status", &statusHandler{})
   128  	mux.Handle("/metrics", promhttp.Handler())
   129  
   130  	mux.HandleFunc("/debug/pprof/", pprof.Index)
   131  	mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
   132  	mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
   133  	mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
   134  	mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
   135  
   136  	httpS := &http.Server{
   137  		Handler: mux,
   138  	}
   139  	err := httpS.Serve(lis)
   140  	if err != nil && !common.IsErrNetClosing(err) && err != http.ErrServerClosed {
   141  		log.L().Error("status server returned", log.ShortError(err))
   142  	}
   143  }