istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pilot/pkg/xds/monitoring.go (about)

     1  // Copyright Istio Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //	http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  package xds
    15  
    16  import (
    17  	"sync"
    18  	"time"
    19  
    20  	"google.golang.org/grpc/codes"
    21  	"google.golang.org/grpc/status"
    22  
    23  	"istio.io/istio/pilot/pkg/model"
    24  	v3 "istio.io/istio/pilot/pkg/xds/v3"
    25  	"istio.io/istio/pkg/monitoring"
    26  )
    27  
    28  var (
    29  	typeTag    = monitoring.CreateLabel("type")
    30  	versionTag = monitoring.CreateLabel("version")
    31  
    32  	monServices = monitoring.NewGauge(
    33  		"pilot_services",
    34  		"Total services known to pilot.",
    35  	)
    36  
    37  	// TODO: Update all the resource stats in separate routine
    38  	// virtual services, destination rules, gateways, etc.
    39  	xdsClients = monitoring.NewGauge(
    40  		"pilot_xds",
    41  		"Number of endpoints connected to this pilot using XDS.",
    42  	)
    43  	xdsClientTrackerMutex = &sync.Mutex{}
    44  	xdsClientTracker      = make(map[string]float64)
    45  
    46  	// Covers xds_builderr and xds_senderr for xds in {lds, rds, cds, eds}.
    47  	pushes = monitoring.NewSum(
    48  		"pilot_xds_pushes",
    49  		"Pilot build and send errors for lds, rds, cds and eds.",
    50  	)
    51  
    52  	cdsSendErrPushes = pushes.With(typeTag.Value("cds_senderr"))
    53  	edsSendErrPushes = pushes.With(typeTag.Value("eds_senderr"))
    54  	ldsSendErrPushes = pushes.With(typeTag.Value("lds_senderr"))
    55  	rdsSendErrPushes = pushes.With(typeTag.Value("rds_senderr"))
    56  
    57  	debounceTime = monitoring.NewDistribution(
    58  		"pilot_debounce_time",
    59  		"Delay in seconds between the first config enters debouncing and the merged push request is pushed into the push queue.",
    60  		[]float64{.01, .1, 1, 3, 5, 10, 20, 30},
    61  	)
    62  
    63  	pushContextInitTime = monitoring.NewDistribution(
    64  		"pilot_pushcontext_init_seconds",
    65  		"Total time in seconds Pilot takes to init pushContext.",
    66  		[]float64{.01, .1, 0.5, 1, 3, 5},
    67  	)
    68  
    69  	pushTime = monitoring.NewDistribution(
    70  		"pilot_xds_push_time",
    71  		"Total time in seconds Pilot takes to push lds, rds, cds and eds.",
    72  		[]float64{.01, .1, 1, 3, 5, 10, 20, 30},
    73  	)
    74  
    75  	proxiesQueueTime = monitoring.NewDistribution(
    76  		"pilot_proxy_queue_time",
    77  		"Time in seconds, a proxy is in the push queue before being dequeued.",
    78  		[]float64{.1, .5, 1, 3, 5, 10, 20, 30},
    79  	)
    80  
    81  	pushTriggers = monitoring.NewSum(
    82  		"pilot_push_triggers",
    83  		"Total number of times a push was triggered, labeled by reason for the push.",
    84  	)
    85  
    86  	proxiesConvergeDelay = monitoring.NewDistribution(
    87  		"pilot_proxy_convergence_time",
    88  		"Delay in seconds between config change and a proxy receiving all required configuration.",
    89  		[]float64{.1, .5, 1, 3, 5, 10, 20, 30},
    90  	)
    91  
    92  	pushContextErrors = monitoring.NewSum(
    93  		"pilot_xds_push_context_errors",
    94  		"Number of errors (timeouts) initiating push context.",
    95  	)
    96  
    97  	inboundUpdates = monitoring.NewSum(
    98  		"pilot_inbound_updates",
    99  		"Total number of updates received by pilot.",
   100  	)
   101  
   102  	pilotSDSCertificateErrors = monitoring.NewSum(
   103  		"pilot_sds_certificate_errors_total",
   104  		"Total number of failures to fetch SDS key and certificate.",
   105  	)
   106  
   107  	inboundConfigUpdates  = inboundUpdates.With(typeTag.Value("config"))
   108  	inboundEDSUpdates     = inboundUpdates.With(typeTag.Value("eds"))
   109  	inboundServiceUpdates = inboundUpdates.With(typeTag.Value("svc"))
   110  	inboundServiceDeletes = inboundUpdates.With(typeTag.Value("svcdelete"))
   111  
   112  	configSizeBytes = monitoring.NewDistribution(
   113  		"pilot_xds_config_size_bytes",
   114  		"Distribution of configuration sizes pushed to clients",
   115  		// Important boundaries: 10K, 1M, 4M, 10M, 40M
   116  		// 4M default limit for gRPC, 10M config will start to strain system,
   117  		// 40M is likely upper-bound on config sizes supported.
   118  		[]float64{1, 10000, 1000000, 4000000, 10000000, 40000000},
   119  		monitoring.WithUnit(monitoring.Bytes),
   120  	)
   121  )
   122  
   123  func recordXDSClients(version string, delta float64) {
   124  	xdsClientTrackerMutex.Lock()
   125  	defer xdsClientTrackerMutex.Unlock()
   126  	xdsClientTracker[version] += delta
   127  	xdsClients.With(versionTag.Value(version)).Record(xdsClientTracker[version])
   128  }
   129  
   130  // triggerMetric is a precomputed monitoring.Metric for each trigger type. This saves on a lot of allocations
   131  var triggerMetric = map[model.TriggerReason]monitoring.Metric{
   132  	model.EndpointUpdate:  pushTriggers.With(typeTag.Value(string(model.EndpointUpdate))),
   133  	model.ConfigUpdate:    pushTriggers.With(typeTag.Value(string(model.ConfigUpdate))),
   134  	model.ServiceUpdate:   pushTriggers.With(typeTag.Value(string(model.ServiceUpdate))),
   135  	model.ProxyUpdate:     pushTriggers.With(typeTag.Value(string(model.ProxyUpdate))),
   136  	model.GlobalUpdate:    pushTriggers.With(typeTag.Value(string(model.GlobalUpdate))),
   137  	model.UnknownTrigger:  pushTriggers.With(typeTag.Value(string(model.UnknownTrigger))),
   138  	model.DebugTrigger:    pushTriggers.With(typeTag.Value(string(model.DebugTrigger))),
   139  	model.SecretTrigger:   pushTriggers.With(typeTag.Value(string(model.SecretTrigger))),
   140  	model.NetworksTrigger: pushTriggers.With(typeTag.Value(string(model.NetworksTrigger))),
   141  	model.ProxyRequest:    pushTriggers.With(typeTag.Value(string(model.ProxyRequest))),
   142  	model.NamespaceUpdate: pushTriggers.With(typeTag.Value(string(model.NamespaceUpdate))),
   143  	model.ClusterUpdate:   pushTriggers.With(typeTag.Value(string(model.ClusterUpdate))),
   144  }
   145  
   146  func recordPushTriggers(reasons model.ReasonStats) {
   147  	for r, cnt := range reasons {
   148  		t, f := triggerMetric[r]
   149  		if f {
   150  			t.RecordInt(int64(cnt))
   151  		} else {
   152  			pushTriggers.With(typeTag.Value(string(r))).Increment()
   153  		}
   154  	}
   155  }
   156  
   157  func isUnexpectedError(err error) bool {
   158  	s, ok := status.FromError(err)
   159  	// Unavailable or canceled code will be sent when a connection is closing down. This is very normal,
   160  	// due to the XDS connection being dropped every 30 minutes, or a pod shutting down.
   161  	isError := s.Code() != codes.Unavailable && s.Code() != codes.Canceled
   162  	return !ok || isError
   163  }
   164  
   165  // recordSendError records a metric indicating that a push failed. It returns true if this was an unexpected
   166  // error
   167  func recordSendError(xdsType string, err error) bool {
   168  	if isUnexpectedError(err) {
   169  		// TODO use a single metric with a type tag
   170  		switch xdsType {
   171  		case v3.ListenerType:
   172  			ldsSendErrPushes.Increment()
   173  		case v3.ClusterType:
   174  			cdsSendErrPushes.Increment()
   175  		case v3.EndpointType:
   176  			edsSendErrPushes.Increment()
   177  		case v3.RouteType:
   178  			rdsSendErrPushes.Increment()
   179  		}
   180  		return true
   181  	}
   182  	return false
   183  }
   184  
   185  func recordPushTime(xdsType string, duration time.Duration) {
   186  	pushTime.With(typeTag.Value(v3.GetMetricType(xdsType))).Record(duration.Seconds())
   187  	pushes.With(typeTag.Value(v3.GetMetricType(xdsType))).Increment()
   188  }