istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pilot/pkg/xds/monitoring.go (about) 1 // Copyright Istio Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 package xds 15 16 import ( 17 "sync" 18 "time" 19 20 "google.golang.org/grpc/codes" 21 "google.golang.org/grpc/status" 22 23 "istio.io/istio/pilot/pkg/model" 24 v3 "istio.io/istio/pilot/pkg/xds/v3" 25 "istio.io/istio/pkg/monitoring" 26 ) 27 28 var ( 29 typeTag = monitoring.CreateLabel("type") 30 versionTag = monitoring.CreateLabel("version") 31 32 monServices = monitoring.NewGauge( 33 "pilot_services", 34 "Total services known to pilot.", 35 ) 36 37 // TODO: Update all the resource stats in separate routine 38 // virtual services, destination rules, gateways, etc. 39 xdsClients = monitoring.NewGauge( 40 "pilot_xds", 41 "Number of endpoints connected to this pilot using XDS.", 42 ) 43 xdsClientTrackerMutex = &sync.Mutex{} 44 xdsClientTracker = make(map[string]float64) 45 46 // Covers xds_builderr and xds_senderr for xds in {lds, rds, cds, eds}. 47 pushes = monitoring.NewSum( 48 "pilot_xds_pushes", 49 "Pilot build and send errors for lds, rds, cds and eds.", 50 ) 51 52 cdsSendErrPushes = pushes.With(typeTag.Value("cds_senderr")) 53 edsSendErrPushes = pushes.With(typeTag.Value("eds_senderr")) 54 ldsSendErrPushes = pushes.With(typeTag.Value("lds_senderr")) 55 rdsSendErrPushes = pushes.With(typeTag.Value("rds_senderr")) 56 57 debounceTime = monitoring.NewDistribution( 58 "pilot_debounce_time", 59 "Delay in seconds between the first config enters debouncing and the merged push request is pushed into the push queue.", 60 []float64{.01, .1, 1, 3, 5, 10, 20, 30}, 61 ) 62 63 pushContextInitTime = monitoring.NewDistribution( 64 "pilot_pushcontext_init_seconds", 65 "Total time in seconds Pilot takes to init pushContext.", 66 []float64{.01, .1, 0.5, 1, 3, 5}, 67 ) 68 69 pushTime = monitoring.NewDistribution( 70 "pilot_xds_push_time", 71 "Total time in seconds Pilot takes to push lds, rds, cds and eds.", 72 []float64{.01, .1, 1, 3, 5, 10, 20, 30}, 73 ) 74 75 proxiesQueueTime = monitoring.NewDistribution( 76 "pilot_proxy_queue_time", 77 "Time in seconds, a proxy is in the push queue before being dequeued.", 78 []float64{.1, .5, 1, 3, 5, 10, 20, 30}, 79 ) 80 81 pushTriggers = monitoring.NewSum( 82 "pilot_push_triggers", 83 "Total number of times a push was triggered, labeled by reason for the push.", 84 ) 85 86 proxiesConvergeDelay = monitoring.NewDistribution( 87 "pilot_proxy_convergence_time", 88 "Delay in seconds between config change and a proxy receiving all required configuration.", 89 []float64{.1, .5, 1, 3, 5, 10, 20, 30}, 90 ) 91 92 pushContextErrors = monitoring.NewSum( 93 "pilot_xds_push_context_errors", 94 "Number of errors (timeouts) initiating push context.", 95 ) 96 97 inboundUpdates = monitoring.NewSum( 98 "pilot_inbound_updates", 99 "Total number of updates received by pilot.", 100 ) 101 102 pilotSDSCertificateErrors = monitoring.NewSum( 103 "pilot_sds_certificate_errors_total", 104 "Total number of failures to fetch SDS key and certificate.", 105 ) 106 107 inboundConfigUpdates = inboundUpdates.With(typeTag.Value("config")) 108 inboundEDSUpdates = inboundUpdates.With(typeTag.Value("eds")) 109 inboundServiceUpdates = inboundUpdates.With(typeTag.Value("svc")) 110 inboundServiceDeletes = inboundUpdates.With(typeTag.Value("svcdelete")) 111 112 configSizeBytes = monitoring.NewDistribution( 113 "pilot_xds_config_size_bytes", 114 "Distribution of configuration sizes pushed to clients", 115 // Important boundaries: 10K, 1M, 4M, 10M, 40M 116 // 4M default limit for gRPC, 10M config will start to strain system, 117 // 40M is likely upper-bound on config sizes supported. 118 []float64{1, 10000, 1000000, 4000000, 10000000, 40000000}, 119 monitoring.WithUnit(monitoring.Bytes), 120 ) 121 ) 122 123 func recordXDSClients(version string, delta float64) { 124 xdsClientTrackerMutex.Lock() 125 defer xdsClientTrackerMutex.Unlock() 126 xdsClientTracker[version] += delta 127 xdsClients.With(versionTag.Value(version)).Record(xdsClientTracker[version]) 128 } 129 130 // triggerMetric is a precomputed monitoring.Metric for each trigger type. This saves on a lot of allocations 131 var triggerMetric = map[model.TriggerReason]monitoring.Metric{ 132 model.EndpointUpdate: pushTriggers.With(typeTag.Value(string(model.EndpointUpdate))), 133 model.ConfigUpdate: pushTriggers.With(typeTag.Value(string(model.ConfigUpdate))), 134 model.ServiceUpdate: pushTriggers.With(typeTag.Value(string(model.ServiceUpdate))), 135 model.ProxyUpdate: pushTriggers.With(typeTag.Value(string(model.ProxyUpdate))), 136 model.GlobalUpdate: pushTriggers.With(typeTag.Value(string(model.GlobalUpdate))), 137 model.UnknownTrigger: pushTriggers.With(typeTag.Value(string(model.UnknownTrigger))), 138 model.DebugTrigger: pushTriggers.With(typeTag.Value(string(model.DebugTrigger))), 139 model.SecretTrigger: pushTriggers.With(typeTag.Value(string(model.SecretTrigger))), 140 model.NetworksTrigger: pushTriggers.With(typeTag.Value(string(model.NetworksTrigger))), 141 model.ProxyRequest: pushTriggers.With(typeTag.Value(string(model.ProxyRequest))), 142 model.NamespaceUpdate: pushTriggers.With(typeTag.Value(string(model.NamespaceUpdate))), 143 model.ClusterUpdate: pushTriggers.With(typeTag.Value(string(model.ClusterUpdate))), 144 } 145 146 func recordPushTriggers(reasons model.ReasonStats) { 147 for r, cnt := range reasons { 148 t, f := triggerMetric[r] 149 if f { 150 t.RecordInt(int64(cnt)) 151 } else { 152 pushTriggers.With(typeTag.Value(string(r))).Increment() 153 } 154 } 155 } 156 157 func isUnexpectedError(err error) bool { 158 s, ok := status.FromError(err) 159 // Unavailable or canceled code will be sent when a connection is closing down. This is very normal, 160 // due to the XDS connection being dropped every 30 minutes, or a pod shutting down. 161 isError := s.Code() != codes.Unavailable && s.Code() != codes.Canceled 162 return !ok || isError 163 } 164 165 // recordSendError records a metric indicating that a push failed. It returns true if this was an unexpected 166 // error 167 func recordSendError(xdsType string, err error) bool { 168 if isUnexpectedError(err) { 169 // TODO use a single metric with a type tag 170 switch xdsType { 171 case v3.ListenerType: 172 ldsSendErrPushes.Increment() 173 case v3.ClusterType: 174 cdsSendErrPushes.Increment() 175 case v3.EndpointType: 176 edsSendErrPushes.Increment() 177 case v3.RouteType: 178 rdsSendErrPushes.Increment() 179 } 180 return true 181 } 182 return false 183 } 184 185 func recordPushTime(xdsType string, duration time.Duration) { 186 pushTime.With(typeTag.Value(v3.GetMetricType(xdsType))).Record(duration.Seconds()) 187 pushes.With(typeTag.Value(v3.GetMetricType(xdsType))).Increment() 188 }