istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pilot/pkg/xds/discovery.go

// Copyright Istio Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package xds

import (
	"context"
	"fmt"
	"sort"
	"strconv"
	"sync"
	"time"

	discovery "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
	"github.com/google/uuid"
	"go.uber.org/atomic"
	"golang.org/x/time/rate"
	"google.golang.org/grpc"

	"istio.io/istio/pilot/pkg/autoregistration"
	"istio.io/istio/pilot/pkg/features"
	"istio.io/istio/pilot/pkg/model"
	"istio.io/istio/pilot/pkg/networking/core/envoyfilter"
	"istio.io/istio/pkg/cluster"
	"istio.io/istio/pkg/config/schema/kind"
	"istio.io/istio/pkg/maps"
	"istio.io/istio/pkg/security"
)

var periodicRefreshMetrics = 10 * time.Second

type DebounceOptions struct {
	// DebounceAfter is the delay added to events to wait
	// after a registry/config event for debouncing.
	// This will delay the push by at least this interval, plus
	// the time getting subsequent events. If no change is
	// detected the push will happen, otherwise we'll keep
	// delaying until things settle.
	DebounceAfter time.Duration

	// debounceMax is the maximum time to wait for events
	// while debouncing. Defaults to 10 seconds. If events keep
	// showing up with no break for this time, we'll trigger a push.
	debounceMax time.Duration

	// enableEDSDebounce indicates whether EDS pushes should be debounced.
	enableEDSDebounce bool
}
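
// Illustrative sketch (not from the upstream file): how the two debounce knobs
// interact, using assumed values. A single isolated event is pushed once
// DebounceAfter of quiet time has elapsed; a continuous stream of events is
// pushed no later than debounceMax after the first event in the burst.
//
//	opts := DebounceOptions{
//		DebounceAfter:     100 * time.Millisecond, // quiet period required before a push
//		debounceMax:       10 * time.Second,       // hard upper bound on debounce delay
//		enableEDSDebounce: true,                   // debounce EDS-only (non-full) pushes too
//	}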

// DiscoveryServer is Pilot's gRPC implementation for Envoy's xds APIs
type DiscoveryServer struct {
	// Env is the model environment.
	Env *model.Environment

	// Generators allow customizing the generated config, based on the client metadata.
	// Key is the generator type - will match the Generator metadata to set the per-connection
	// default generator, or the combination of Generator metadata and TypeUrl to select a
	// different generator for a type.
	// Normal istio clients use the default generator - will not be impacted by this.
	Generators map[string]model.XdsResourceGenerator

	// ProxyNeedsPush is a function that determines whether a push can be completely skipped. Individual generators
	// may also choose to not send any updates.
	ProxyNeedsPush func(proxy *model.Proxy, req *model.PushRequest) bool

	// concurrentPushLimit is a semaphore that limits the number of concurrent XDS pushes.
	concurrentPushLimit chan struct{}
	// RequestRateLimit limits the number of new XDS requests allowed. This helps prevent a thundering herd of incoming requests.
	RequestRateLimit *rate.Limiter

	// InboundUpdates describes the number of configuration updates the discovery server has received.
	InboundUpdates *atomic.Int64
	// CommittedUpdates describes the number of configuration updates the discovery server has
	// received, processed, and stored in the push context. If this number is less than InboundUpdates,
	// there are updates we have not yet processed.
	// Note: This does not mean that all proxies have received these configurations; it is strictly
	// the push context, which means that the next push to a proxy will receive this configuration.
	CommittedUpdates *atomic.Int64

	// pushChannel is the buffer used for debouncing.
	// After debouncing, the pushRequest will be sent to pushQueue.
	pushChannel chan *model.PushRequest

	// pushQueue is the buffer used after debouncing and before the real xds push.
	pushQueue *PushQueue

	// debugHandlers is the list of all the supported debug handlers.
	debugHandlers map[string]string

	// adsClients reflects active gRPC channels, for both ADS and EDS.
	adsClients      map[string]*Connection
	adsClientsMutex sync.RWMutex

	StatusReporter DistributionStatusCache

	// Authenticators for XDS requests. Should be the same as, or a subset of, the CA authenticators.
	Authenticators []security.Authenticator

	WorkloadEntryController *autoregistration.Controller

	// serverReady indicates caches have been synced up and the server is ready to process requests.
	serverReady atomic.Bool

	DebounceOptions DebounceOptions

	// Cache for XDS resources.
	Cache model.XdsCache

	// JwtKeyResolver holds a reference to the JWT key resolver instance.
	JwtKeyResolver *model.JwksResolver

	// ListRemoteClusters collects debug information about other clusters this istiod reads from.
	ListRemoteClusters func() []cluster.DebugInfo

	// ClusterAliases are alias names for clusters. When a proxy connects with a cluster ID
	// that has a configured alias mapping, we should use the mapped cluster ID for the proxy.
	ClusterAliases map[cluster.ID]cluster.ID

	// pushVersion stores the numeric push version. This should be accessed via NextVersion().
	pushVersion atomic.Uint64

	// DiscoveryStartTime is the time the binary started.
	DiscoveryStartTime time.Time
}

// NewDiscoveryServer creates DiscoveryServer that sources data from Pilot's internal mesh data structures
func NewDiscoveryServer(env *model.Environment, clusterAliases map[string]string) *DiscoveryServer {
	out := &DiscoveryServer{
		Env:                 env,
		Generators:          map[string]model.XdsResourceGenerator{},
		ProxyNeedsPush:      DefaultProxyNeedsPush,
		concurrentPushLimit: make(chan struct{}, features.PushThrottle),
		RequestRateLimit:    rate.NewLimiter(rate.Limit(features.RequestLimit), 1),
		InboundUpdates:      atomic.NewInt64(0),
		CommittedUpdates:    atomic.NewInt64(0),
		pushChannel:         make(chan *model.PushRequest, 10),
		pushQueue:           NewPushQueue(),
		debugHandlers:       map[string]string{},
		adsClients:          map[string]*Connection{},
		DebounceOptions: DebounceOptions{
			DebounceAfter:     features.DebounceAfter,
			debounceMax:       features.DebounceMax,
			enableEDSDebounce: features.EnableEDSDebounce,
		},
		Cache:              env.Cache,
		DiscoveryStartTime: processStartTime,
	}

	out.ClusterAliases = make(map[cluster.ID]cluster.ID)
	for alias := range clusterAliases {
		out.ClusterAliases[cluster.ID(alias)] = cluster.ID(clusterAliases[alias])
	}

	out.initJwksResolver()

	return out
}
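
// Illustrative sketch (not from the upstream file): constructing a server with a
// hypothetical alias map. A proxy that connects reporting cluster ID
// "remote-alias" is treated as belonging to cluster "remote". env is assumed to
// be an already initialized *model.Environment.
//
//	s := NewDiscoveryServer(env, map[string]string{
//		"remote-alias": "remote",
//	})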

// initJwksResolver initializes the JWT key resolver to be used.
func (s *DiscoveryServer) initJwksResolver() {
	if s.JwtKeyResolver != nil {
		s.closeJwksResolver()
	}
	s.JwtKeyResolver = model.NewJwksResolver(
		model.JwtPubKeyEvictionDuration, model.JwtPubKeyRefreshInterval,
		model.JwtPubKeyRefreshIntervalOnFailure, model.JwtPubKeyRetryInterval)

	// Flush cached discovery responses when detecting jwt public key change.
	s.JwtKeyResolver.PushFunc = func() {
		s.ConfigUpdate(&model.PushRequest{Full: true, Reason: model.NewReasonStats(model.UnknownTrigger)})
	}
}

// closeJwksResolver shuts down the JWT key resolver used.
func (s *DiscoveryServer) closeJwksResolver() {
	if s.JwtKeyResolver != nil {
		s.JwtKeyResolver.Close()
	}
}

// Register adds the ADS handler to the grpc server
func (s *DiscoveryServer) Register(rpcs *grpc.Server) {
	// Register v3 server
	discovery.RegisterAggregatedDiscoveryServiceServer(rpcs, s)
}

var processStartTime = time.Now()

// CachesSynced is called when caches have been synced so that the server can accept connections.
func (s *DiscoveryServer) CachesSynced() {
	log.Infof("All caches have been synced up in %v, marking server ready", time.Since(s.DiscoveryStartTime))
	s.serverReady.Store(true)
}

func (s *DiscoveryServer) IsServerReady() bool {
	return s.serverReady.Load()
}

func (s *DiscoveryServer) Start(stopCh <-chan struct{}) {
	go s.WorkloadEntryController.Run(stopCh)
	go s.handleUpdates(stopCh)
	go s.periodicRefreshMetrics(stopCh)
	go s.sendPushes(stopCh)
	go s.Cache.Run(stopCh)
}
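
// Illustrative sketch (not from the upstream file): typical lifecycle wiring by a
// caller that owns a *grpc.Server and a stop channel, once its config caches have
// synced. s and stop are assumed to exist in the caller.
//
//	grpcServer := grpc.NewServer()
//	s.Register(grpcServer)
//	s.CachesSynced() // mark the server ready to serve xDS
//	s.Start(stop)    // spawn the debounce, push, metrics, and cache goroutines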

// Push metrics are updated periodically (10s default)
func (s *DiscoveryServer) periodicRefreshMetrics(stopCh <-chan struct{}) {
	ticker := time.NewTicker(periodicRefreshMetrics)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			push := s.globalPushContext()
			model.LastPushMutex.Lock()
			if model.LastPushStatus != push {
				model.LastPushStatus = push
				push.UpdateMetrics()
				out, _ := model.LastPushStatus.StatusJSON()
				if string(out) != "{}" {
					log.Infof("Push Status: %s", string(out))
				}
			}
			model.LastPushMutex.Unlock()
		case <-stopCh:
			return
		}
	}
}

// dropCacheForRequest clears the cache in response to a push request
func (s *DiscoveryServer) dropCacheForRequest(req *model.PushRequest) {
	// If we don't know what updated, we cannot safely cache. Clear the whole cache.
	if len(req.ConfigsUpdated) == 0 {
		s.Cache.ClearAll()
	} else {
		// Otherwise, just clear the updated configs
		s.Cache.Clear(req.ConfigsUpdated)
	}
}

// Push is called to push changes on config updates using ADS.
func (s *DiscoveryServer) Push(req *model.PushRequest) {
	if !req.Full {
		req.Push = s.globalPushContext()
		s.dropCacheForRequest(req)
		s.AdsPushAll(req)
		return
	}
	// Reset the status during the push.
	oldPushContext := s.globalPushContext()
	if oldPushContext != nil {
		oldPushContext.OnConfigChange()
		// Record the Envoy filter metrics from the previous push.
		envoyfilter.RecordMetrics()
	}
	// PushContext is reset after a config change. Previous status is
	// saved.
	t0 := time.Now()
	versionLocal := s.NextVersion()
	push, err := s.initPushContext(req, oldPushContext, versionLocal)
	if err != nil {
		return
	}
	initContextTime := time.Since(t0)
	log.Debugf("InitContext %v for push took %s", versionLocal, initContextTime)
	pushContextInitTime.Record(initContextTime.Seconds())

	req.Push = push
	s.AdsPushAll(req)
}

func nonce(noncePrefix string) string {
	return noncePrefix + uuid.New().String()
}

// globalPushContext returns the global push context. This should be used with caution; generally the
// proxy-specific PushContext should be used to get the current state in the context of a single proxy.
// This should only be used for "global" lookups, such as initiating a new push to all proxies.
func (s *DiscoveryServer) globalPushContext() *model.PushContext {
	return s.Env.PushContext()
}

// ConfigUpdate implements the ConfigUpdater interface, used to request pushes.
func (s *DiscoveryServer) ConfigUpdate(req *model.PushRequest) {
	if features.EnableUnsafeAssertions {
		if model.HasConfigsOfKind(req.ConfigsUpdated, kind.Service) {
			panic("assertion failed kind.Service can not be set in ConfigKey")
		}
	}
	if model.HasConfigsOfKind(req.ConfigsUpdated, kind.Address) {
		// This is a bit like clearing the EDS cache on an EndpointShard update. Because Address
		// types are fetched dynamically, they are not part of the same protections, so we need to clear
		// the cache.
		s.Cache.ClearAll()
	}
	inboundConfigUpdates.Increment()
	s.InboundUpdates.Inc()
	s.pushChannel <- req
}
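
// Illustrative sketch (not from the upstream file): requesting a full push for a
// single changed config. The VirtualService name/namespace are made up, and sets
// is assumed to be istio.io/istio/pkg/util/sets (the set type used by
// PushRequest.ConfigsUpdated in this version).
//
//	s.ConfigUpdate(&model.PushRequest{
//		Full: true,
//		ConfigsUpdated: sets.New(model.ConfigKey{
//			Kind:      kind.VirtualService,
//			Name:      "reviews",
//			Namespace: "default",
//		}),
//		Reason: model.NewReasonStats(model.ConfigUpdate),
//	})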

// Debouncing and pushing happen in a separate goroutine; it uses locks, and we want to
// avoid complications, since ConfigUpdate may already hold other locks.
// handleUpdates processes events from pushChannel.
// It ensures that at minimum DebounceAfter has elapsed since the last event before processing it.
// It also ensures that at most debounceMax has elapsed between receiving an event and processing it.
func (s *DiscoveryServer) handleUpdates(stopCh <-chan struct{}) {
	debounce(s.pushChannel, stopCh, s.DebounceOptions, s.Push, s.CommittedUpdates)
}

// The debounce helper function is implemented to enable mocking
func debounce(ch chan *model.PushRequest, stopCh <-chan struct{}, opts DebounceOptions, pushFn func(req *model.PushRequest), updateSent *atomic.Int64) {
	var timeChan <-chan time.Time
	var startDebounce time.Time
	var lastConfigUpdateTime time.Time

	pushCounter := 0
	debouncedEvents := 0

	// Keeps track of the push requests. If updates are debounced they will be merged.
	var req *model.PushRequest

	free := true
	freeCh := make(chan struct{}, 1)

	push := func(req *model.PushRequest, debouncedEvents int, startDebounce time.Time) {
		pushFn(req)
		updateSent.Add(int64(debouncedEvents))
		debounceTime.Record(time.Since(startDebounce).Seconds())
		freeCh <- struct{}{}
	}

	pushWorker := func() {
		eventDelay := time.Since(startDebounce)
		quietTime := time.Since(lastConfigUpdateTime)
		// it has been too long or quiet enough
		if eventDelay >= opts.debounceMax || quietTime >= opts.DebounceAfter {
			if req != nil {
				pushCounter++
				if req.ConfigsUpdated == nil {
					log.Infof("Push debounce stable[%d] %d for reason %s: %v since last change, %v since last push, full=%v",
						pushCounter, debouncedEvents, reasonsUpdated(req),
						quietTime, eventDelay, req.Full)
				} else {
					log.Infof("Push debounce stable[%d] %d for config %s: %v since last change, %v since last push, full=%v",
						pushCounter, debouncedEvents, configsUpdated(req),
						quietTime, eventDelay, req.Full)
				}
				free = false
				go push(req, debouncedEvents, startDebounce)
				req = nil
				debouncedEvents = 0
			}
		} else {
			timeChan = time.After(opts.DebounceAfter - quietTime)
		}
	}

	for {
		select {
		case <-freeCh:
			free = true
			pushWorker()
		case r := <-ch:
			// If reason is not set, record it as an unknown reason
			if len(r.Reason) == 0 {
				r.Reason = model.NewReasonStats(model.UnknownTrigger)
			}
			if !opts.enableEDSDebounce && !r.Full {
				// trigger push now, just for EDS
				go func(req *model.PushRequest) {
					pushFn(req)
					updateSent.Inc()
				}(r)
				continue
			}

			lastConfigUpdateTime = time.Now()
			if debouncedEvents == 0 {
				timeChan = time.After(opts.DebounceAfter)
				startDebounce = lastConfigUpdateTime
			}
			debouncedEvents++

			req = req.Merge(r)
		case <-timeChan:
			if free {
				pushWorker()
			}
		case <-stopCh:
			return
		}
	}
}
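
// Illustrative sketch (not from the upstream file): while debouncing, successive
// requests are merged, so the request that is eventually pushed carries the union
// of the changed configs and the combined trigger-reason counts.
//
//	a := &model.PushRequest{Full: true, Reason: model.NewReasonStats(model.ServiceUpdate)}
//	b := &model.PushRequest{Full: false, Reason: model.NewReasonStats(model.EndpointUpdate)}
//	merged := a.Merge(b) // Full stays true; Reason now counts both triggers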

func configsUpdated(req *model.PushRequest) string {
	configs := ""
	for key := range req.ConfigsUpdated {
		configs += key.String()
		break
	}
	if len(req.ConfigsUpdated) > 1 {
		more := fmt.Sprintf(" and %d more configs", len(req.ConfigsUpdated)-1)
		configs += more
	}
	return configs
}

func reasonsUpdated(req *model.PushRequest) string {
	var (
		reason0, reason1            model.TriggerReason
		reason0Cnt, reason1Cnt, idx int
	)
	for r, cnt := range req.Reason {
		if idx == 0 {
			reason0, reason0Cnt = r, cnt
		} else if idx == 1 {
			reason1, reason1Cnt = r, cnt
		} else {
			break
		}
		idx++
	}

	switch len(req.Reason) {
	case 0:
		return "unknown"
	case 1:
		return fmt.Sprintf("%s:%d", reason0, reason0Cnt)
	case 2:
		return fmt.Sprintf("%s:%d and %s:%d", reason0, reason0Cnt, reason1, reason1Cnt)
	default:
		return fmt.Sprintf("%s:%d and %d(%d) more reasons", reason0, reason0Cnt, len(req.Reason)-1,
			req.Reason.Count()-reason0Cnt)
	}
}

func doSendPushes(stopCh <-chan struct{}, semaphore chan struct{}, queue *PushQueue) {
	for {
		select {
		case <-stopCh:
			return
		default:
			// We can send to it until it is full, then it will block until a push finishes and reads from it.
			// This limits the number of pushes that can happen concurrently.
			semaphore <- struct{}{}

			// Get the next proxy to push. This will block if there are no updates required.
			client, push, shuttingdown := queue.Dequeue()
			if shuttingdown {
				return
			}
			recordPushTriggers(push.Reason)
			// Signals that a push is done by reading from the semaphore, allowing another send on it.
			doneFunc := func() {
				queue.MarkDone(client)
				<-semaphore
			}

			proxiesQueueTime.Record(time.Since(push.Start).Seconds())
			var closed <-chan struct{}
			if client.deltaStream != nil {
				closed = client.deltaStream.Context().Done()
			} else {
				closed = client.StreamDone()
			}
			go func() {
				pushEv := &Event{
					pushRequest: push,
					done:        doneFunc,
				}

				select {
				case client.PushCh() <- pushEv:
					return
				case <-closed: // grpc stream was closed
					doneFunc()
					log.Infof("Client closed connection %v", client.ID())
				}
			}()
		}
	}
}
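
// Illustrative sketch (not from the upstream file): the concurrency pattern used
// above, reduced to its core. A buffered channel acts as a counting semaphore:
// sends block once PushThrottle pushes are in flight, and each completed push
// frees a slot by receiving from the channel.
//
//	sem := make(chan struct{}, features.PushThrottle)
//	sem <- struct{}{} // acquire a slot before dequeuing the next proxy
//	go func() {
//		defer func() { <-sem }() // release the slot when the push completes
//		// ... push to the proxy ...
//	}()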

// initPushContext creates a global push context and stores it on the environment. Note: while this
// method is technically thread safe (there are no data races), it should not be called in parallel;
// if it is, then we may start two push context creations (say A, and B), but then write them in
// reverse order, leaving us with a final version of A, which may be incomplete.
func (s *DiscoveryServer) initPushContext(req *model.PushRequest, oldPushContext *model.PushContext, version string) (*model.PushContext, error) {
	push := model.NewPushContext()
	push.PushVersion = version
	push.JwtKeyResolver = s.JwtKeyResolver
	if err := push.InitContext(s.Env, oldPushContext, req); err != nil {
		log.Errorf("XDS: failed to init push context: %v", err)
		// We can't push if we can't read the data - stick with previous version.
		pushContextErrors.Increment()
		return nil, err
	}

	s.dropCacheForRequest(req)
	s.Env.SetPushContext(push)

	return push, nil
}

func (s *DiscoveryServer) sendPushes(stopCh <-chan struct{}) {
	doSendPushes(stopCh, s.concurrentPushLimit, s.pushQueue)
}

// Shutdown shuts down DiscoveryServer components.
func (s *DiscoveryServer) Shutdown() {
	s.closeJwksResolver()
	s.pushQueue.ShutDown()
}

// Clients returns all currently connected clients. This method can be safely called concurrently,
// but care should be taken with the underlying objects (i.e. model.Proxy) to ensure proper locking.
// This method returns only fully initialized connections; for all connections, use AllClients.
func (s *DiscoveryServer) Clients() []*Connection {
	s.adsClientsMutex.RLock()
	defer s.adsClientsMutex.RUnlock()
	clients := make([]*Connection, 0, len(s.adsClients))
	for _, con := range s.adsClients {
		select {
		case <-con.InitializedCh():
		default:
			// Initialization not complete, skip
			continue
		}
		clients = append(clients, con)
	}
	return clients
}

// SortedClients returns all currently connected clients in an ordered manner.
// Sorting order priority is as follows: ClusterID, Namespace, ID.
func (s *DiscoveryServer) SortedClients() []*Connection {
	clients := s.Clients()
	sort.Slice(clients, func(i, j int) bool {
		if clients[i].proxy.GetClusterID().String() < clients[j].proxy.GetClusterID().String() {
			return true
		}
		if clients[i].proxy.GetNamespace() < clients[j].proxy.GetNamespace() {
			return true
		}
		return clients[i].proxy.GetID() < clients[j].proxy.GetID()
	})
	return clients
}

// AllClients returns all connected clients, per Clients, but additionally includes uninitialized connections.
// Warning: callers must take care not to rely on the con.proxy field being set.
func (s *DiscoveryServer) AllClients() []*Connection {
	s.adsClientsMutex.RLock()
	defer s.adsClientsMutex.RUnlock()
	return maps.Values(s.adsClients)
}

func (s *DiscoveryServer) WaitForRequestLimit(ctx context.Context) error {
	if s.RequestRateLimit.Limit() == 0 {
		// Allow opt out when rate limiting is set to 0qps
		return nil
	}
	// Give a bit of time for the queue to clear out, but if not, fail fast. In the best case the client
	// will connect to another instance, or retry with backoff.
	wait, cancel := context.WithTimeout(ctx, time.Second)
	defer cancel()
	return s.RequestRateLimit.Wait(wait)
}

func (s *DiscoveryServer) NextVersion() string {
	return time.Now().Format(time.RFC3339) + "/" + strconv.FormatUint(s.pushVersion.Inc(), 10)
}
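
// Illustrative sketch (not from the upstream file): how a stream handler might
// gate new connections on the request rate limit before doing any work. ctx is
// assumed to be the incoming stream's context, and status/codes are the standard
// gRPC packages (google.golang.org/grpc/status, google.golang.org/grpc/codes).
//
//	if err := s.WaitForRequestLimit(ctx); err != nil {
//		return status.Errorf(codes.ResourceExhausted, "request rate limit exceeded: %v", err)
//	}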