// SPDX-License-Identifier: Apache-2.0
// Copyright Authors of Cilium

package xds

import (
	"context"
	"errors"
	"fmt"
	"io"
	"reflect"
	"strconv"
	"strings"
	"sync/atomic"

	envoy_service_discovery "github.com/cilium/proxy/go/envoy/service/discovery/v3"
	"github.com/sirupsen/logrus"
	"google.golang.org/grpc/codes"
	"google.golang.org/protobuf/types/known/anypb"

	"github.com/cilium/cilium/pkg/endpointstate"
	"github.com/cilium/cilium/pkg/logging/logfields"
	"github.com/cilium/cilium/pkg/promise"
)

const (
	// AnyTypeURL is the default type URL to use for ADS resource sets.
	// When it is used as the default type URL of a stream, every request on
	// that stream must carry its own type URL (see ErrNoADSTypeURL).
	AnyTypeURL = ""
)

var (
	// ErrNoADSTypeURL is the error returned when receiving a request without
	// a type URL from an ADS stream.
	ErrNoADSTypeURL = errors.New("type URL is required for ADS")

	// ErrUnknownTypeURL is the error returned when receiving a request with
	// an unknown type URL.
	ErrUnknownTypeURL = errors.New("unknown type URL")

	// ErrInvalidVersionInfo is the error returned when receiving a request
	// with a non-empty version info that cannot be parsed as an unsigned
	// 64-bit decimal integer.
	ErrInvalidVersionInfo = errors.New("invalid version info")

	// ErrInvalidResponseNonce is the error returned when receiving a request
	// with a non-empty response nonce that cannot be parsed as an unsigned
	// 64-bit decimal integer.
	ErrInvalidResponseNonce = errors.New("invalid response nonce info")

	// ErrInvalidNodeFormat is the error returned when receiving a request
	// with a node that is not formatted correctly, i.e. whose node ID cannot
	// be converted to an IP address by EnvoyNodeIdToIP.
	ErrInvalidNodeFormat = errors.New("invalid node format")

	// ErrResourceWatch is the error returned whenever an internal error
	// occurs while waiting for new versions of resources.
	ErrResourceWatch = errors.New("resource watch failed")

	// grpcCanceled is the string prefix of any gRPC error related
	// to the stream being canceled. Ignore the description, as it
	// is derived from the client and may vary, while the code is
	// set by the gRPC library we link with.
	//
	// Ref. vendor/google.golang.org/grpc/status/status.go:
	// return fmt.Sprintf("rpc error: code = %s desc = %s", codes.Code(p.GetCode()), p.GetMessage())
	grpcCanceled = fmt.Sprintf("rpc error: code = %s", codes.Canceled.String())
)

// Server implements the handling of xDS streams.
type Server struct {
	// restorerPromise is set only if the xDS server should delay sending any
	// xDS resources until all endpoints have been restored. It is nil
	// otherwise.
	restorerPromise promise.Promise[endpointstate.Restorer]

	// watchers maps each supported type URL to its corresponding resource
	// watcher.
	watchers map[string]*ResourceWatcher

	// ackObservers maps each supported type URL to its corresponding observer
	// of ACKs received from Envoy nodes. Type URLs configured without an
	// observer have no entry.
	ackObservers map[string]ResourceVersionAckObserver

	// lastStreamID is the identifier of the last processed stream.
	// It is incremented atomically when starting the handling of a new stream.
	lastStreamID atomic.Uint64
}

// ResourceTypeConfiguration is the configuration of the XDS server for a
// resource type.
type ResourceTypeConfiguration struct {
	// Source contains the resources of this type.
	Source ObservableResourceSource

	// AckObserver is called back whenever a node acknowledges having applied a
	// version of the resources of this type. It may be nil, in which case
	// ACKs for this type are not reported.
	AckObserver ResourceVersionAckObserver
}

// NewServer creates an xDS gRPC stream handler using the given resource
// sources.
// resourceTypes maps each supported resource type URL to its corresponding
// resource source and ACK observer.
// If restorerPromise is non-nil, every configured ACK observer is marked as
// restore-pending (MarkRestorePending) when the server is created.
100 func NewServer(resourceTypes map[string]*ResourceTypeConfiguration, restorerPromise promise.Promise[endpointstate.Restorer]) *Server { 101 watchers := make(map[string]*ResourceWatcher, len(resourceTypes)) 102 ackObservers := make(map[string]ResourceVersionAckObserver, len(resourceTypes)) 103 for typeURL, resType := range resourceTypes { 104 w := NewResourceWatcher(typeURL, resType.Source) 105 resType.Source.AddResourceVersionObserver(w) 106 watchers[typeURL] = w 107 108 if resType.AckObserver != nil { 109 if restorerPromise != nil { 110 resType.AckObserver.MarkRestorePending() 111 } 112 ackObservers[typeURL] = resType.AckObserver 113 } 114 } 115 116 // TODO: Unregister the watchers when stopping the server. 117 118 return &Server{restorerPromise: restorerPromise, watchers: watchers, ackObservers: ackObservers} 119 } 120 121 func getXDSRequestFields(req *envoy_service_discovery.DiscoveryRequest) logrus.Fields { 122 return logrus.Fields{ 123 logfields.XDSAckedVersion: req.GetVersionInfo(), 124 logfields.XDSTypeURL: req.GetTypeUrl(), 125 logfields.XDSNonce: req.GetResponseNonce(), 126 } 127 } 128 129 // HandleRequestStream receives and processes the requests from an xDS stream. 
130 func (s *Server) HandleRequestStream(ctx context.Context, stream Stream, defaultTypeURL string) error { 131 // increment stream count 132 streamID := s.lastStreamID.Add(1) 133 134 reqStreamLog := log.WithField(logfields.XDSStreamID, streamID) 135 136 reqCh := make(chan *envoy_service_discovery.DiscoveryRequest) 137 138 stopRecv := make(chan struct{}) 139 defer close(stopRecv) 140 141 nodeId := "" 142 143 go func(streamLog *logrus.Entry) { 144 defer close(reqCh) 145 for { 146 req, err := stream.Recv() 147 if err != nil { 148 if errors.Is(err, io.EOF) { 149 streamLog.Debug("xDS stream closed") 150 } else if strings.HasPrefix(err.Error(), grpcCanceled) { 151 streamLog.WithError(err).Debug("xDS stream canceled") 152 } else { 153 streamLog.WithError(err).Error("error while receiving request from xDS stream") 154 } 155 return 156 } 157 if req == nil { 158 streamLog.Error("received nil request from xDS stream; stopping xDS stream handling") 159 return 160 } 161 if req.GetTypeUrl() == "" { 162 req.TypeUrl = defaultTypeURL 163 } 164 if nodeId == "" { 165 nodeId = req.GetNode().GetId() 166 streamLog = streamLog.WithField(logfields.XDSClientNode, nodeId) 167 } 168 streamLog.WithFields(getXDSRequestFields(req)).Debug("received request from xDS stream") 169 170 select { 171 case <-stopRecv: 172 streamLog.Debug("stopping xDS stream handling") 173 return 174 case reqCh <- req: 175 } 176 } 177 }(reqStreamLog) 178 179 return s.processRequestStream(ctx, reqStreamLog, stream, reqCh, defaultTypeURL) 180 } 181 182 // perTypeStreamState is the state maintained per resource type for each 183 // xDS stream. 184 type perTypeStreamState struct { 185 // typeURL identifies the resource type. 186 typeURL string 187 188 // pendingWatchCancel is a pending watch on this resource type. 189 // If nil, no watch is pending. 190 pendingWatchCancel context.CancelFunc 191 192 // version is the last version sent. 
This is needed so that we'll know 193 // if a new request is an ACK (VersionInfo matches current version), or a NACK 194 // (VersionInfo matches an earlier version). 195 version uint64 196 197 // resourceNames is the list of names of resources sent in the last 198 // response to a request for this resource type. 199 resourceNames []string 200 } 201 202 // processRequestStream processes the requests in an xDS stream from a channel. 203 func (s *Server) processRequestStream(ctx context.Context, streamLog *logrus.Entry, stream Stream, 204 reqCh <-chan *envoy_service_discovery.DiscoveryRequest, defaultTypeURL string, 205 ) error { 206 // The request state for every type URL. 207 typeStates := make([]perTypeStreamState, len(s.watchers)) 208 defer func() { 209 for _, state := range typeStates { 210 if state.pendingWatchCancel != nil { 211 state.pendingWatchCancel() 212 } 213 } 214 }() 215 216 // A map of a resource type's URL to the corresponding index in typeStates 217 // for the resource type. 218 typeIndexes := make(map[string]int, len(typeStates)) 219 220 // The set of channels to select from. Since the set of channels is 221 // dynamic, we use reflection for selection. 222 // The indexes in selectCases from 0 to len(typeStates)-1 match the indexes 223 // in typeStates. 224 selectCases := make([]reflect.SelectCase, len(typeStates)+2) 225 226 // The last select case index is always the request channel. 227 reqChIndex := len(selectCases) - 1 228 selectCases[reqChIndex] = reflect.SelectCase{ 229 Dir: reflect.SelectRecv, 230 Chan: reflect.ValueOf(reqCh), 231 } 232 233 // The next-to-last select case is the context's Done channel. 234 doneChIndex := reqChIndex - 1 235 selectCases[doneChIndex] = reflect.SelectCase{ 236 Dir: reflect.SelectRecv, 237 Chan: reflect.ValueOf(ctx.Done()), 238 } 239 240 // Initially there are no pending watches, so just select a dead channel 241 // that will never be selected. 
242 quietCh := make(chan *VersionedResources) 243 defer close(quietCh) 244 quietChValue := reflect.ValueOf(quietCh) 245 246 i := 0 247 for typeURL := range s.watchers { 248 typeStates[i] = perTypeStreamState{ 249 typeURL: typeURL, 250 } 251 252 selectCases[i] = reflect.SelectCase{ 253 Dir: reflect.SelectRecv, 254 Chan: quietChValue, 255 } 256 257 typeIndexes[typeURL] = i 258 259 i++ 260 } 261 262 streamLog.Info("starting xDS stream processing") 263 264 nodeIP := "" 265 266 if s.restorerPromise != nil { 267 restorer, err := s.restorerPromise.Await(ctx) 268 if err != nil { 269 return err 270 } 271 272 if restorer != nil { 273 streamLog.Debug("Waiting for endpoint restoration before serving resources...") 274 restorer.WaitForEndpointRestore(ctx) 275 for typeURL, ackObserver := range s.ackObservers { 276 streamLog.WithField(logfields.XDSTypeURL, typeURL). 277 Debug("Endpoints restored, starting serving.") 278 ackObserver.MarkRestoreCompleted() 279 } 280 } 281 } 282 283 for { 284 // Process either a new request from the xDS stream or a response 285 // from the resource watcher. 286 chosen, recv, recvOK := reflect.Select(selectCases) 287 288 switch chosen { 289 case doneChIndex: // Context got canceled, most likely by the client terminating. 290 streamLog.WithError(ctx.Err()).Debug("xDS stream context canceled") 291 return nil 292 293 case reqChIndex: // Request received from the stream. 
294 if !recvOK { 295 streamLog.Info("xDS stream closed") 296 return nil 297 } 298 299 req := recv.Interface().(*envoy_service_discovery.DiscoveryRequest) 300 301 // only require Node to exist in the first request 302 if nodeIP == "" { 303 id := req.GetNode().GetId() 304 streamLog = streamLog.WithField(logfields.XDSClientNode, id) 305 var err error 306 nodeIP, err = EnvoyNodeIdToIP(id) 307 if err != nil { 308 streamLog.WithError(err).Error("invalid Node in xDS request") 309 return ErrInvalidNodeFormat 310 } 311 } 312 313 requestLog := streamLog.WithFields(getXDSRequestFields(req)) 314 315 // Ensure that the version info is a string that was sent by this 316 // server or the empty string (the first request in a stream should 317 // always have an empty version info). 318 var versionInfo uint64 319 if req.GetVersionInfo() != "" { 320 var err error 321 versionInfo, err = strconv.ParseUint(req.VersionInfo, 10, 64) 322 if err != nil { 323 requestLog.Errorf("invalid version info in xDS request, not a uint64") 324 return ErrInvalidVersionInfo 325 } 326 } 327 var nonce uint64 328 if req.GetResponseNonce() != "" { 329 var err error 330 nonce, err = strconv.ParseUint(req.ResponseNonce, 10, 64) 331 if err != nil { 332 requestLog.Error("invalid response nonce info in xDS request, not a uint64") 333 return ErrInvalidResponseNonce 334 } 335 } 336 var detail string 337 status := req.GetErrorDetail() 338 if status != nil { 339 detail = status.Message 340 } 341 342 typeURL := req.GetTypeUrl() 343 if defaultTypeURL == AnyTypeURL && typeURL == "" { 344 requestLog.Error("no type URL given in ADS request") 345 return ErrNoADSTypeURL 346 } 347 348 index, exists := typeIndexes[typeURL] 349 if !exists { 350 requestLog.Error("unknown type URL in xDS request") 351 return ErrUnknownTypeURL 352 } 353 354 state := &typeStates[index] 355 watcher := s.watchers[typeURL] 356 357 if nonce == 0 && versionInfo > 0 { 358 requestLog.Debugf("xDS was restarted, setting nonce to %d", versionInfo) 359 nonce 
= versionInfo 360 } 361 362 // Response nonce is always the same as the response version. 363 // Request version indicates the last acked version. If the 364 // response nonce in the request is different (smaller) than 365 // the version, all versions upto that version are acked, but 366 // the versions from that to and including the nonce are nacked. 367 if versionInfo <= nonce { 368 ackObserver := s.ackObservers[typeURL] 369 if ackObserver != nil { 370 requestLog.Debug("notifying observers of ACKs") 371 ackObserver.HandleResourceVersionAck(versionInfo, nonce, nodeIP, state.resourceNames, typeURL, detail) 372 } else { 373 requestLog.Debug("ACK received but no observers are waiting for ACKs") 374 } 375 if versionInfo < nonce { 376 // versions after VersionInfo, upto and including ResponseNonce are NACKed 377 requestLog.WithField(logfields.XDSDetail, detail).Warningf("NACK received for versions after %s and up to %s; waiting for a version update before sending again", req.VersionInfo, req.ResponseNonce) 378 // Watcher will behave as if the sent version was acked. 379 // Otherwise we will just be sending the same failing 380 // version over and over filling logs. 381 versionInfo = state.version 382 } 383 384 if state.pendingWatchCancel != nil { 385 // A pending watch exists for this type URL. Cancel it to 386 // start a new watch. 387 requestLog.Debug("canceling pending watch") 388 state.pendingWatchCancel() 389 } 390 391 respCh := make(chan *VersionedResources, 1) 392 selectCases[index].Chan = reflect.ValueOf(respCh) 393 394 ctx, cancel := context.WithCancel(ctx) 395 state.pendingWatchCancel = cancel 396 397 requestLog.Debugf("starting watch on %d resources", len(req.GetResourceNames())) 398 go watcher.WatchResources(ctx, typeURL, versionInfo, nodeIP, req.GetResourceNames(), respCh) 399 } else { 400 requestLog.Debug("received invalid nonce in xDS request; ignoring request") 401 } 402 default: // Pending watch response. 
403 state := &typeStates[chosen] 404 state.pendingWatchCancel() 405 state.pendingWatchCancel = nil 406 407 if !recvOK { 408 streamLog.WithField(logfields.XDSTypeURL, state.typeURL). 409 Error("xDS resource watch failed; terminating") 410 return ErrResourceWatch 411 } 412 413 // Disabling reading from the channel after reading any from it, 414 // since the watcher will close it anyway. 415 selectCases[chosen].Chan = quietChValue 416 417 resp := recv.Interface().(*VersionedResources) 418 419 responseLog := streamLog.WithFields(logrus.Fields{ 420 logfields.XDSCachedVersion: resp.Version, 421 logfields.XDSCanary: resp.Canary, 422 logfields.XDSTypeURL: state.typeURL, 423 logfields.XDSNonce: resp.Version, 424 }) 425 426 resources := make([]*anypb.Any, len(resp.Resources)) 427 428 // Marshall the resources into protobuf's Any type. 429 for i, res := range resp.Resources { 430 any, err := anypb.New(res) 431 if err != nil { 432 responseLog.WithError(err).Errorf("error marshalling xDS response (%d resources)", len(resp.Resources)) 433 return err 434 } 435 resources[i] = any 436 } 437 438 responseLog.Debugf("sending xDS response with %d resources", len(resp.Resources)) 439 440 versionStr := strconv.FormatUint(resp.Version, 10) 441 out := &envoy_service_discovery.DiscoveryResponse{ 442 VersionInfo: versionStr, 443 Resources: resources, 444 Canary: resp.Canary, 445 TypeUrl: state.typeURL, 446 Nonce: versionStr, 447 } 448 err := stream.Send(out) 449 if err != nil { 450 return err 451 } 452 453 state.version = resp.Version 454 state.resourceNames = resp.ResourceNames 455 } 456 } 457 }