go.etcd.io/etcd@v3.3.27+incompatible/etcdserver/metrics.go (about)

     1  // Copyright 2015 The etcd Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package etcdserver
    16  
    17  import (
    18  	goruntime "runtime"
    19  	"time"
    20  
    21  	"github.com/coreos/etcd/pkg/runtime"
    22  	"github.com/coreos/etcd/version"
    23  	"github.com/prometheus/client_golang/prometheus"
    24  )
    25  
// Prometheus collectors describing the state of the etcd server and the
// file descriptor usage of its process. All of them are passed to
// prometheus.MustRegister in init.
var (
	// hasLeader is 1 while a leader exists and 0 otherwise.
	hasLeader = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "has_leader",
		Help:      "Whether or not a leader exists. 1 is existence, 0 is not.",
	})
	// isLeader is 1 while this member is the leader and 0 otherwise.
	isLeader = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "is_leader",
		Help:      "Whether or not this member is a leader. 1 if is, 0 otherwise.",
	})
	// leaderChanges counts the leader changes seen.
	leaderChanges = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "leader_changes_seen_total",
		Help:      "The number of leader changes seen.",
	})
	// heartbeatSendFailures counts leader heartbeats that could not be sent.
	heartbeatSendFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "heartbeat_send_failures_total",
		Help:      "The total number of leader heartbeat send failures (likely overloaded from slow disk).",
	})
	// slowApplies counts apply requests that were considered slow.
	slowApplies = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "slow_apply_total",
		Help:      "The total number of slow apply requests (likely overloaded from slow disk).",
	})
	// applySnapshotInProgress is 1 while an incoming snapshot is being
	// applied and 0 otherwise.
	// NOTE(review): this is a Gauge whose name ends in "_total", a suffix
	// normally used for counters; the name is kept as is because renaming
	// would change the exported metric.
	applySnapshotInProgress = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "snapshot_apply_in_progress_total",
		Help:      "1 if the server is applying the incoming snapshot. 0 if none.",
	})
	// proposalsCommitted reports the total number of committed consensus
	// proposals.
	// NOTE(review): declared as a Gauge despite the "_total" suffix.
	proposalsCommitted = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_committed_total",
		Help:      "The total number of consensus proposals committed.",
	})
	// proposalsApplied reports the total number of applied consensus
	// proposals.
	// NOTE(review): declared as a Gauge despite the "_total" suffix.
	proposalsApplied = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_applied_total",
		Help:      "The total number of consensus proposals applied.",
	})
	// proposalsPending reports how many proposals are currently waiting to
	// be committed.
	proposalsPending = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_pending",
		Help:      "The current number of pending proposals to commit.",
	})
	// proposalsFailed counts the failed proposals seen.
	proposalsFailed = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "proposals_failed_total",
		Help:      "The total number of failed proposals seen.",
	})
	// leaseExpired counts expired leases. Unlike the other server metrics
	// it lives in the "etcd_debugging" namespace.
	leaseExpired = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd_debugging",
		Subsystem: "server",
		Name:      "lease_expired_total",
		Help:      "The total number of expired leases.",
	})
	// slowReadIndex counts read index requests that were pending while out
	// of sync with the leader, or that timed out.
	slowReadIndex = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "slow_read_indexes_total",
		Help:      "The total number of pending read indexes not in sync with leader's or timed out read index requests.",
	})
	// readIndexFailed counts the failed read index requests seen.
	readIndexFailed = prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "read_indexes_failed_total",
		Help:      "The total number of failed read indexes seen.",
	})
	// quotaBackendBytes reports the current backend storage quota in bytes.
	quotaBackendBytes = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "quota_backend_bytes",
		Help:      "Current backend storage quota size in bytes.",
	})
	// currentVersion is an info-style gauge: init sets it to 1 with the
	// running etcd version in the "server_version" label.
	currentVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "version",
		Help:      "Which version is running. 1 for 'server_version' label with current version.",
	},
		[]string{"server_version"})
	// currentGoVersion is an info-style gauge: init sets it to 1 with the
	// Go runtime version in the "server_go_version" label.
	currentGoVersion = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "go_version",
		Help:      "Which Go version server is running with. 1 for 'server_go_version' label with current version.",
	},
		[]string{"server_go_version"})
	// serverID is an info-style gauge carrying the member ID in the
	// "server_id" label. Its value is not set in this file.
	serverID = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "etcd",
		Subsystem: "server",
		Name:      "id",
		Help:      "Server or member ID in hexadecimal format. 1 for 'server_id' label with current ID.",
	},
		[]string{"server_id"})

	// fdUsed reports the number of file descriptors in use; it is updated
	// by monitorFileDescriptor.
	fdUsed = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "os",
		Subsystem: "fd",
		Name:      "used",
		Help:      "The number of used file descriptors.",
	})
	// fdLimit reports the file descriptor limit; it is updated by
	// monitorFileDescriptor.
	fdLimit = prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: "os",
		Subsystem: "fd",
		Name:      "limit",
		Help:      "The file descriptor limit.",
	})
)
   146  
   147  func init() {
   148  	prometheus.MustRegister(hasLeader)
   149  	prometheus.MustRegister(isLeader)
   150  	prometheus.MustRegister(leaderChanges)
   151  	prometheus.MustRegister(heartbeatSendFailures)
   152  	prometheus.MustRegister(slowApplies)
   153  	prometheus.MustRegister(applySnapshotInProgress)
   154  	prometheus.MustRegister(proposalsCommitted)
   155  	prometheus.MustRegister(proposalsApplied)
   156  	prometheus.MustRegister(proposalsPending)
   157  	prometheus.MustRegister(proposalsFailed)
   158  	prometheus.MustRegister(leaseExpired)
   159  	prometheus.MustRegister(slowReadIndex)
   160  	prometheus.MustRegister(readIndexFailed)
   161  	prometheus.MustRegister(quotaBackendBytes)
   162  	prometheus.MustRegister(currentVersion)
   163  	prometheus.MustRegister(currentGoVersion)
   164  	prometheus.MustRegister(serverID)
   165  	prometheus.MustRegister(fdUsed)
   166  	prometheus.MustRegister(fdLimit)
   167  
   168  	currentVersion.With(prometheus.Labels{
   169  		"server_version": version.Version,
   170  	}).Set(1)
   171  	currentGoVersion.With(prometheus.Labels{
   172  		"server_go_version": goruntime.Version(),
   173  	}).Set(1)
   174  }
   175  
   176  func monitorFileDescriptor(done <-chan struct{}) {
   177  	// This ticker will check File Descriptor Requirements ,and count all fds in used.
   178  	// And recorded some logs when in used >= limit/5*4. Just recorded message.
   179  	// If fds was more than 10K,It's low performance due to FDUsage() works.
   180  	// So need to increase it.
   181  	// See https://github.com/etcd-io/etcd/issues/11969 for more detail.
   182  	ticker := time.NewTicker(10 * time.Minute)
   183  	defer ticker.Stop()
   184  	for {
   185  		used, err := runtime.FDUsage()
   186  		if err != nil {
   187  			plog.Errorf("cannot monitor file descriptor usage (%v)", err)
   188  			return
   189  		}
   190  		fdUsed.Set(float64(used))
   191  		limit, err := runtime.FDLimit()
   192  		if err != nil {
   193  			plog.Errorf("cannot monitor file descriptor usage (%v)", err)
   194  			return
   195  		}
   196  		fdLimit.Set(float64(limit))
   197  		if used >= limit/5*4 {
   198  			plog.Warningf("80%% of the file descriptor limit is used [used = %d, limit = %d]", used, limit)
   199  		}
   200  		select {
   201  		case <-ticker.C:
   202  		case <-done:
   203  			return
   204  		}
   205  	}
   206  }