istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pkg/xds/server.go (about) 1 // Copyright Istio Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package xds 16 17 import ( 18 "strings" 19 "time" 20 21 core "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" 22 discovery "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3" 23 "google.golang.org/grpc/codes" 24 "google.golang.org/grpc/status" 25 "google.golang.org/protobuf/types/known/anypb" 26 27 "istio.io/istio/pilot/pkg/features" 28 istiogrpc "istio.io/istio/pilot/pkg/grpc" 29 "istio.io/istio/pkg/model" 30 "istio.io/istio/pkg/util/sets" 31 ) 32 33 // ResourceDelta records the difference in requested resources by an XDS client 34 type ResourceDelta struct { 35 // Subscribed indicates the client requested these additional resources 36 Subscribed sets.String 37 // Unsubscribed indicates the client no longer requires these resources 38 Unsubscribed sets.String 39 } 40 41 var emptyResourceDelta = ResourceDelta{} 42 43 func (rd ResourceDelta) IsEmpty() bool { 44 return len(rd.Subscribed) == 0 && len(rd.Unsubscribed) == 0 45 } 46 47 type Resources = []*discovery.Resource 48 49 func ResourcesToAny(r Resources) []*anypb.Any { 50 a := make([]*anypb.Any, 0, len(r)) 51 for _, rr := range r { 52 a = append(a, rr.Resource) 53 } 54 return a 55 } 56 57 // WatchedResource tracks an active DiscoveryRequest subscription. 58 type WatchedResource struct { 59 // TypeUrl is copied from the DiscoveryRequest.TypeUrl that initiated watching this resource. 60 // nolint 61 TypeUrl string 62 63 // ResourceNames tracks the list of resources that are actively watched. 64 // For LDS and CDS, all resources of the TypeUrl type are watched if it is empty. 65 // For endpoints the resource names will have list of clusters and for clusters it is empty. 66 // For Delta Xds, all resources of the TypeUrl that a client has subscribed to. 67 ResourceNames []string 68 69 // Wildcard indicates the subscription is a wildcard subscription. This only applies to types that 70 // allow both wildcard and non-wildcard subscriptions. 71 Wildcard bool 72 73 // NonceSent is the nonce sent in the last sent response. If it is equal with NonceAcked, the 74 // last message has been processed. If empty: we never sent a message of this type. 75 NonceSent string 76 77 // NonceAcked is the last acked message. 78 NonceAcked string 79 80 // AlwaysRespond, if true, will ensure that even when a request would otherwise be treated as an 81 // ACK, it will be responded to. This typically happens when a proxy reconnects to another instance of 82 // Istiod. In that case, Envoy expects us to respond to EDS/RDS/SDS requests to finish warming of 83 // clusters/listeners. 84 // Typically, this should be set to 'false' after response; keeping it true would likely result in an endless loop. 85 AlwaysRespond bool 86 87 // LastResources tracks the contents of the last push. 88 // This field is extremely expensive to maintain and is typically disabled 89 LastResources Resources 90 } 91 92 type Watcher interface { 93 DeleteWatchedResource(url string) 94 GetWatchedResource(url string) *WatchedResource 95 NewWatchedResource(url string, names []string) 96 UpdateWatchedResource(string, func(*WatchedResource) *WatchedResource) 97 // GetID identifies an xDS client. This is different from a connection ID. 98 GetID() string 99 } 100 101 // IsWildcardTypeURL checks whether a given type is a wildcard type 102 // https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol#how-the-client-specifies-what-resources-to-return 103 // If the list of resource names becomes empty, that means that the client is no 104 // longer interested in any resources of the specified type. For Listener and 105 // Cluster resource types, there is also a “wildcard” mode, which is triggered 106 // when the initial request on the stream for that resource type contains no 107 // resource names. 108 func IsWildcardTypeURL(typeURL string) bool { 109 switch typeURL { 110 case model.SecretType, model.EndpointType, model.RouteType, model.ExtensionConfigurationType: 111 // By XDS spec, these are not wildcard 112 return false 113 case model.ClusterType, model.ListenerType: 114 // By XDS spec, these are wildcard 115 return true 116 default: 117 // All of our internal types use wildcard semantics 118 return true 119 } 120 } 121 122 // DiscoveryStream is a server interface for XDS. 123 type DiscoveryStream = discovery.AggregatedDiscoveryService_StreamAggregatedResourcesServer 124 125 // Connection holds information about an xDS client connection. There may be more than one connection to the same client. 126 type Connection struct { 127 // peerAddr is the address of the client, from network layer. 128 peerAddr string 129 130 // Time of connection, for debugging 131 connectedAt time.Time 132 133 // conID is the connection conID, used as a key in the connection table. 134 // Currently based on the node name and a counter. 135 conID string 136 137 // Sending on this channel results in a push. 138 pushChannel chan any 139 140 // Both ADS and SDS streams implement this interface 141 stream DiscoveryStream 142 143 // initialized channel will be closed when proxy is initialized. Pushes, or anything accessing 144 // the proxy, should not be started until this channel is closed. 145 initialized chan struct{} 146 147 // stop can be used to end the connection manually via debug endpoints. Only to be used for testing. 148 stop chan struct{} 149 150 // reqChan is used to receive discovery requests for this connection. 151 reqChan chan *discovery.DiscoveryRequest 152 153 // errorChan is used to process error during discovery request processing. 154 errorChan chan error 155 } 156 157 func NewConnection(peerAddr string, stream DiscoveryStream) Connection { 158 return Connection{ 159 pushChannel: make(chan any), 160 initialized: make(chan struct{}), 161 stop: make(chan struct{}), 162 reqChan: make(chan *discovery.DiscoveryRequest, 1), 163 errorChan: make(chan error, 1), 164 peerAddr: peerAddr, 165 connectedAt: time.Now(), 166 stream: stream, 167 } 168 } 169 170 func (conn *Connection) InitializedCh() chan struct{} { 171 return conn.initialized 172 } 173 174 func (conn *Connection) PushCh() chan any { 175 return conn.pushChannel 176 } 177 178 func (conn *Connection) StopCh() chan struct{} { 179 return conn.stop 180 } 181 182 func (conn *Connection) ErrorCh() chan error { 183 return conn.errorChan 184 } 185 186 func (conn *Connection) StreamDone() <-chan struct{} { 187 return conn.stream.Context().Done() 188 } 189 190 func (conn *Connection) ID() string { 191 return conn.conID 192 } 193 194 func (conn *Connection) Peer() string { 195 return conn.peerAddr 196 } 197 198 func (conn *Connection) SetID(id string) { 199 conn.conID = id 200 } 201 202 func (conn *Connection) ConnectedAt() time.Time { 203 return conn.connectedAt 204 } 205 206 func (conn *Connection) Stop() { 207 close(conn.stop) 208 } 209 210 func (conn *Connection) MarkInitialized() { 211 close(conn.initialized) 212 } 213 214 // ConnectionContext is used by the RPC event loop to respond to requests and pushes. 215 type ConnectionContext interface { 216 XdsConnection() *Connection 217 Watcher() Watcher 218 // Initialize checks the first request. 219 Initialize(node *core.Node) error 220 // Close discards the connection. 221 Close() 222 // Process responds to a discovery request. 223 Process(req *discovery.DiscoveryRequest) error 224 // Push responds to a push event queue 225 Push(ev any) error 226 } 227 228 func Stream(ctx ConnectionContext) error { 229 con := ctx.XdsConnection() 230 // Do not call: defer close(con.pushChannel). The push channel will be garbage collected 231 // when the connection is no longer used. Closing the channel can cause subtle race conditions 232 // with push. According to the spec: "It's only necessary to close a channel when it is important 233 // to tell the receiving goroutines that all data have been sent." 234 235 // Block until either a request is received or a push is triggered. 236 // We need 2 go routines because 'read' blocks in Recv(). 237 go Receive(ctx) 238 239 // Wait for the proxy to be fully initialized before we start serving traffic. Because 240 // initialization doesn't have dependencies that will block, there is no need to add any timeout 241 // here. Prior to this explicit wait, we were implicitly waiting by receive() not sending to 242 // reqChannel and the connection not being enqueued for pushes to pushChannel until the 243 // initialization is complete. 244 <-con.initialized 245 246 for { 247 // Go select{} statements are not ordered; the same channel can be chosen many times. 248 // For requests, these are higher priority (client may be blocked on startup until these are done) 249 // and often very cheap to handle (simple ACK), so we check it first. 250 select { 251 case req, ok := <-con.reqChan: 252 if ok { 253 if err := ctx.Process(req); err != nil { 254 return err 255 } 256 } else { 257 // Remote side closed connection or error processing the request. 258 return <-con.errorChan 259 } 260 case <-con.stop: 261 return nil 262 default: 263 } 264 // If there wasn't already a request, poll for requests and pushes. Note: if we have a huge 265 // amount of incoming requests, we may still send some pushes, as we do not `continue` above; 266 // however, requests will be handled ~2x as much as pushes. This ensures a wave of requests 267 // cannot completely starve pushes. However, this scenario is unlikely. 268 select { 269 case req, ok := <-con.reqChan: 270 if ok { 271 if err := ctx.Process(req); err != nil { 272 return err 273 } 274 } else { 275 // Remote side closed connection or error processing the request. 276 return <-con.errorChan 277 } 278 case pushEv := <-con.pushChannel: 279 err := ctx.Push(pushEv) 280 if err != nil { 281 return err 282 } 283 case <-con.stop: 284 return nil 285 } 286 } 287 } 288 289 func Receive(ctx ConnectionContext) { 290 con := ctx.XdsConnection() 291 defer func() { 292 close(con.errorChan) 293 close(con.reqChan) 294 // Close the initialized channel, if its not already closed, to prevent blocking the stream. 295 select { 296 case <-con.initialized: 297 default: 298 close(con.initialized) 299 } 300 }() 301 302 firstRequest := true 303 for { 304 req, err := con.stream.Recv() 305 if err != nil { 306 if istiogrpc.IsExpectedGRPCError(err) { 307 log.Infof("ADS: %q %s terminated", con.peerAddr, con.conID) 308 return 309 } 310 con.errorChan <- err 311 log.Errorf("ADS: %q %s terminated with error: %v", con.peerAddr, con.conID, err) 312 TotalXDSInternalErrors.Increment() 313 return 314 } 315 // This should be only set for the first request. The node id may not be set - for example malicious clients. 316 if firstRequest { 317 // probe happens before envoy sends first xDS request 318 if req.TypeUrl == model.HealthInfoType { 319 log.Warnf("ADS: %q %s send health check probe before normal xDS request", con.peerAddr, con.conID) 320 continue 321 } 322 firstRequest = false 323 if req.Node == nil || req.Node.Id == "" { 324 con.errorChan <- status.New(codes.InvalidArgument, "missing node information").Err() 325 return 326 } 327 if err := ctx.Initialize(req.Node); err != nil { 328 con.errorChan <- err 329 return 330 } 331 defer ctx.Close() 332 log.Infof("ADS: new connection for node:%s", con.conID) 333 } 334 335 select { 336 case con.reqChan <- req: 337 case <-con.stream.Context().Done(): 338 log.Infof("ADS: %q %s terminated with stream closed", con.peerAddr, con.conID) 339 return 340 } 341 } 342 } 343 344 // ShouldRespond determines whether this request needs to be responded back. It applies the ack/nack rules as per xds protocol 345 // using WatchedResource for previous state and discovery request for the current state. 346 func ShouldRespond(w Watcher, id string, request *discovery.DiscoveryRequest) (bool, ResourceDelta) { 347 stype := model.GetShortType(request.TypeUrl) 348 349 // If there is an error in request that means previous response is erroneous. 350 // We do not have to respond in that case. In this case request's version info 351 // will be different from the version sent. But it is fragile to rely on that. 352 if request.ErrorDetail != nil { 353 errCode := codes.Code(request.ErrorDetail.Code) 354 log.Warnf("ADS:%s: ACK ERROR %s %s:%s", stype, id, errCode.String(), request.ErrorDetail.GetMessage()) 355 IncrementXDSRejects(request.TypeUrl, w.GetID(), errCode.String()) 356 return false, emptyResourceDelta 357 } 358 359 if shouldUnsubscribe(request) { 360 log.Debugf("ADS:%s: UNSUBSCRIBE %s %s %s", stype, id, request.VersionInfo, request.ResponseNonce) 361 w.DeleteWatchedResource(request.TypeUrl) 362 return false, emptyResourceDelta 363 } 364 365 previousInfo := w.GetWatchedResource(request.TypeUrl) 366 // This can happen in two cases: 367 // 1. When Envoy starts for the first time, it sends an initial Discovery request to Istiod. 368 // 2. When Envoy reconnects to a new Istiod that does not have information about this typeUrl 369 // i.e. non empty response nonce. 370 // We should always respond with the current resource names. 371 if request.ResponseNonce == "" || previousInfo == nil { 372 log.Debugf("ADS:%s: INIT/RECONNECT %s %s %s", stype, id, request.VersionInfo, request.ResponseNonce) 373 w.NewWatchedResource(request.TypeUrl, request.ResourceNames) 374 return true, emptyResourceDelta 375 } 376 377 // If there is mismatch in the nonce, that is a case of expired/stale nonce. 378 // A nonce becomes stale following a newer nonce being sent to Envoy. 379 // previousInfo.NonceSent can be empty if we previously had shouldRespond=true but didn't send any resources. 380 if request.ResponseNonce != previousInfo.NonceSent { 381 if features.EnableUnsafeAssertions && previousInfo.NonceSent == "" { 382 // Assert we do not end up in an invalid state 383 log.Fatalf("ADS:%s: REQ %s Expired nonce received %s, but we never sent any nonce", stype, 384 id, request.ResponseNonce) 385 } 386 log.Debugf("ADS:%s: REQ %s Expired nonce received %s, sent %s", stype, 387 id, request.ResponseNonce, previousInfo.NonceSent) 388 ExpiredNonce.With(typeTag.Value(model.GetMetricType(request.TypeUrl))).Increment() 389 return false, emptyResourceDelta 390 } 391 392 // If it comes here, that means nonce match. 393 var previousResources []string 394 var alwaysRespond bool 395 w.UpdateWatchedResource(request.TypeUrl, func(wr *WatchedResource) *WatchedResource { 396 previousResources = wr.ResourceNames 397 wr.NonceAcked = request.ResponseNonce 398 wr.ResourceNames = request.ResourceNames 399 alwaysRespond = wr.AlwaysRespond 400 wr.AlwaysRespond = false 401 return wr 402 }) 403 404 // Envoy can send two DiscoveryRequests with same version and nonce. 405 // when it detects a new resource. We should respond if they change. 406 prev := sets.New(previousResources...) 407 cur := sets.New(request.ResourceNames...) 408 removed := prev.Difference(cur) 409 added := cur.Difference(prev) 410 411 // We should always respond "alwaysRespond" marked requests to let Envoy finish warming 412 // even though Nonce match and it looks like an ACK. 413 if alwaysRespond { 414 log.Infof("ADS:%s: FORCE RESPONSE %s for warming.", stype, id) 415 return true, emptyResourceDelta 416 } 417 418 if len(removed) == 0 && len(added) == 0 { 419 log.Debugf("ADS:%s: ACK %s %s %s", stype, id, request.VersionInfo, request.ResponseNonce) 420 return false, emptyResourceDelta 421 } 422 log.Debugf("ADS:%s: RESOURCE CHANGE added %v removed %v %s %s %s", stype, 423 added, removed, id, request.VersionInfo, request.ResponseNonce) 424 425 // For non wildcard resource, if no new resources are subscribed, it means we do not need to push. 426 if !IsWildcardTypeURL(request.TypeUrl) && len(added) == 0 { 427 return false, emptyResourceDelta 428 } 429 430 return true, ResourceDelta{ 431 Subscribed: added, 432 // we do not need to set unsubscribed for StoW 433 } 434 } 435 436 // shouldUnsubscribe checks if we should unsubscribe. This is done when Envoy is 437 // no longer watching. For example, we remove all RDS references, we will 438 // unsubscribe from RDS. NOTE: This may happen as part of the initial request. If 439 // there are no routes needed, Envoy will send an empty request, which this 440 // properly handles by not adding it to the watched resource list. 441 func shouldUnsubscribe(request *discovery.DiscoveryRequest) bool { 442 return len(request.ResourceNames) == 0 && !IsWildcardTypeURL(request.TypeUrl) 443 } 444 445 func Send(ctx ConnectionContext, res *discovery.DiscoveryResponse) error { 446 conn := ctx.XdsConnection() 447 sendResponse := func() error { 448 start := time.Now() 449 defer func() { RecordSendTime(time.Since(start)) }() 450 return conn.stream.Send(res) 451 } 452 err := sendResponse() 453 if err == nil { 454 if res.Nonce != "" && !strings.HasPrefix(res.TypeUrl, model.DebugType) { 455 ctx.Watcher().UpdateWatchedResource(res.TypeUrl, func(wr *WatchedResource) *WatchedResource { 456 if wr == nil { 457 wr = &WatchedResource{TypeUrl: res.TypeUrl} 458 } 459 wr.NonceSent = res.Nonce 460 return wr 461 }) 462 } 463 } else if status.Convert(err).Code() == codes.DeadlineExceeded { 464 log.Infof("Timeout writing %s: %v", conn.conID, model.GetShortType(res.TypeUrl)) 465 ResponseWriteTimeouts.Increment() 466 } 467 return err 468 }