github.com/cilium/cilium@v1.16.2/pkg/envoy/xds/server.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package xds
     5  
     6  import (
     7  	"context"
     8  	"errors"
     9  	"fmt"
    10  	"io"
    11  	"reflect"
    12  	"strconv"
    13  	"strings"
    14  	"sync/atomic"
    15  
    16  	envoy_service_discovery "github.com/cilium/proxy/go/envoy/service/discovery/v3"
    17  	"github.com/sirupsen/logrus"
    18  	"google.golang.org/grpc/codes"
    19  	"google.golang.org/protobuf/types/known/anypb"
    20  
    21  	"github.com/cilium/cilium/pkg/endpointstate"
    22  	"github.com/cilium/cilium/pkg/logging/logfields"
    23  	"github.com/cilium/cilium/pkg/promise"
    24  )
    25  
const (
	// AnyTypeURL is the default type URL to use for ADS resource sets.
	//
	// It is the empty string. A stream handled with AnyTypeURL as its default
	// type URL has no fallback resource type, so every request received on
	// such a stream must carry its own non-empty type URL; otherwise the
	// stream is terminated with ErrNoADSTypeURL.
	AnyTypeURL = ""
)
    30  
var (
	// ErrNoADSTypeURL is the error returned when receiving a request without
	// a type URL from an ADS stream.
	ErrNoADSTypeURL = errors.New("type URL is required for ADS")

	// ErrUnknownTypeURL is the error returned when receiving a request with
	// an unknown type URL.
	ErrUnknownTypeURL = errors.New("unknown type URL")

	// ErrInvalidVersionInfo is the error returned when receiving a request
	// with a non-empty version info that cannot be parsed as an unsigned
	// 64-bit decimal integer.
	ErrInvalidVersionInfo = errors.New("invalid version info")

	// ErrInvalidResponseNonce is the error returned when receiving a request
	// with a non-empty response nonce that cannot be parsed as an unsigned
	// 64-bit decimal integer.
	ErrInvalidResponseNonce = errors.New("invalid response nonce info")

	// ErrInvalidNodeFormat is the error returned when receiving a request
	// with a node that is not formatted correctly.
	ErrInvalidNodeFormat = errors.New("invalid node format")

	// ErrResourceWatch is the error returned whenever an internal error
	// occurs while waiting for new versions of resources.
	ErrResourceWatch = errors.New("resource watch failed")

	// grpcCanceled is the string prefix of any gRPC error related
	// to the stream being canceled. Ignore the description, as it
	// is derived from the client and may vary, while the code is
	// set by the gRPC library we link with.
	//
	// Ref. vendor/google.golang.org/grpc/status/status.go:
	// return fmt.Sprintf("rpc error: code = %s desc = %s", codes.Code(p.GetCode()), p.GetMessage())
	grpcCanceled = fmt.Sprintf("rpc error: code = %s", codes.Canceled.String())
)
    65  
// Server implements the handling of xDS streams.
type Server struct {
	// restorerPromise is set only if the xDS server should hold back sending
	// any xDS resources until all endpoints have been restored. If nil,
	// streams start serving resources immediately.
	restorerPromise promise.Promise[endpointstate.Restorer]

	// watchers maps each supported type URL to its corresponding resource
	// watcher.
	watchers map[string]*ResourceWatcher

	// ackObservers maps each supported type URL to its corresponding observer
	// of ACKs received from Envoy nodes. Type URLs configured without an
	// observer have no entry in this map.
	ackObservers map[string]ResourceVersionAckObserver

	// lastStreamID is the identifier of the last processed stream.
	// It is incremented atomically when starting the handling of a new stream.
	lastStreamID atomic.Uint64
}
    84  
// ResourceTypeConfiguration is the configuration of the XDS server for a
// resource type.
type ResourceTypeConfiguration struct {
	// Source contains the resources of this type.
	Source ObservableResourceSource

	// AckObserver is called back whenever a node acknowledges having applied a
	// version of the resources of this type. It may be nil, in which case
	// ACKs for this resource type are not reported to anyone.
	AckObserver ResourceVersionAckObserver
}
    95  
    96  // NewServer creates an xDS gRPC stream handler using the given resource
    97  // sources.
    98  // types maps each supported resource type URL to its corresponding resource
    99  // source and ACK observer.
   100  func NewServer(resourceTypes map[string]*ResourceTypeConfiguration, restorerPromise promise.Promise[endpointstate.Restorer]) *Server {
   101  	watchers := make(map[string]*ResourceWatcher, len(resourceTypes))
   102  	ackObservers := make(map[string]ResourceVersionAckObserver, len(resourceTypes))
   103  	for typeURL, resType := range resourceTypes {
   104  		w := NewResourceWatcher(typeURL, resType.Source)
   105  		resType.Source.AddResourceVersionObserver(w)
   106  		watchers[typeURL] = w
   107  
   108  		if resType.AckObserver != nil {
   109  			if restorerPromise != nil {
   110  				resType.AckObserver.MarkRestorePending()
   111  			}
   112  			ackObservers[typeURL] = resType.AckObserver
   113  		}
   114  	}
   115  
   116  	// TODO: Unregister the watchers when stopping the server.
   117  
   118  	return &Server{restorerPromise: restorerPromise, watchers: watchers, ackObservers: ackObservers}
   119  }
   120  
   121  func getXDSRequestFields(req *envoy_service_discovery.DiscoveryRequest) logrus.Fields {
   122  	return logrus.Fields{
   123  		logfields.XDSAckedVersion: req.GetVersionInfo(),
   124  		logfields.XDSTypeURL:      req.GetTypeUrl(),
   125  		logfields.XDSNonce:        req.GetResponseNonce(),
   126  	}
   127  }
   128  
   129  // HandleRequestStream receives and processes the requests from an xDS stream.
   130  func (s *Server) HandleRequestStream(ctx context.Context, stream Stream, defaultTypeURL string) error {
   131  	// increment stream count
   132  	streamID := s.lastStreamID.Add(1)
   133  
   134  	reqStreamLog := log.WithField(logfields.XDSStreamID, streamID)
   135  
   136  	reqCh := make(chan *envoy_service_discovery.DiscoveryRequest)
   137  
   138  	stopRecv := make(chan struct{})
   139  	defer close(stopRecv)
   140  
   141  	nodeId := ""
   142  
   143  	go func(streamLog *logrus.Entry) {
   144  		defer close(reqCh)
   145  		for {
   146  			req, err := stream.Recv()
   147  			if err != nil {
   148  				if errors.Is(err, io.EOF) {
   149  					streamLog.Debug("xDS stream closed")
   150  				} else if strings.HasPrefix(err.Error(), grpcCanceled) {
   151  					streamLog.WithError(err).Debug("xDS stream canceled")
   152  				} else {
   153  					streamLog.WithError(err).Error("error while receiving request from xDS stream")
   154  				}
   155  				return
   156  			}
   157  			if req == nil {
   158  				streamLog.Error("received nil request from xDS stream; stopping xDS stream handling")
   159  				return
   160  			}
   161  			if req.GetTypeUrl() == "" {
   162  				req.TypeUrl = defaultTypeURL
   163  			}
   164  			if nodeId == "" {
   165  				nodeId = req.GetNode().GetId()
   166  				streamLog = streamLog.WithField(logfields.XDSClientNode, nodeId)
   167  			}
   168  			streamLog.WithFields(getXDSRequestFields(req)).Debug("received request from xDS stream")
   169  
   170  			select {
   171  			case <-stopRecv:
   172  				streamLog.Debug("stopping xDS stream handling")
   173  				return
   174  			case reqCh <- req:
   175  			}
   176  		}
   177  	}(reqStreamLog)
   178  
   179  	return s.processRequestStream(ctx, reqStreamLog, stream, reqCh, defaultTypeURL)
   180  }
   181  
// perTypeStreamState is the state maintained per resource type for each
// xDS stream.
type perTypeStreamState struct {
	// typeURL identifies the resource type.
	typeURL string

	// pendingWatchCancel cancels the pending watch on this resource type.
	// If nil, no watch is pending.
	pendingWatchCancel context.CancelFunc

	// version is the last version sent. This is needed so that we'll know
	// if a new request is an ACK (VersionInfo matches current version), or a NACK
	// (VersionInfo matches an earlier version).
	version uint64

	// resourceNames is the list of names of resources sent in the last
	// response to a request for this resource type.
	resourceNames []string
}
   201  
   202  // processRequestStream processes the requests in an xDS stream from a channel.
   203  func (s *Server) processRequestStream(ctx context.Context, streamLog *logrus.Entry, stream Stream,
   204  	reqCh <-chan *envoy_service_discovery.DiscoveryRequest, defaultTypeURL string,
   205  ) error {
   206  	// The request state for every type URL.
   207  	typeStates := make([]perTypeStreamState, len(s.watchers))
   208  	defer func() {
   209  		for _, state := range typeStates {
   210  			if state.pendingWatchCancel != nil {
   211  				state.pendingWatchCancel()
   212  			}
   213  		}
   214  	}()
   215  
   216  	// A map of a resource type's URL to the corresponding index in typeStates
   217  	// for the resource type.
   218  	typeIndexes := make(map[string]int, len(typeStates))
   219  
   220  	// The set of channels to select from. Since the set of channels is
   221  	// dynamic, we use reflection for selection.
   222  	// The indexes in selectCases from 0 to len(typeStates)-1 match the indexes
   223  	// in typeStates.
   224  	selectCases := make([]reflect.SelectCase, len(typeStates)+2)
   225  
   226  	// The last select case index is always the request channel.
   227  	reqChIndex := len(selectCases) - 1
   228  	selectCases[reqChIndex] = reflect.SelectCase{
   229  		Dir:  reflect.SelectRecv,
   230  		Chan: reflect.ValueOf(reqCh),
   231  	}
   232  
   233  	// The next-to-last select case is the context's Done channel.
   234  	doneChIndex := reqChIndex - 1
   235  	selectCases[doneChIndex] = reflect.SelectCase{
   236  		Dir:  reflect.SelectRecv,
   237  		Chan: reflect.ValueOf(ctx.Done()),
   238  	}
   239  
   240  	// Initially there are no pending watches, so just select a dead channel
   241  	// that will never be selected.
   242  	quietCh := make(chan *VersionedResources)
   243  	defer close(quietCh)
   244  	quietChValue := reflect.ValueOf(quietCh)
   245  
   246  	i := 0
   247  	for typeURL := range s.watchers {
   248  		typeStates[i] = perTypeStreamState{
   249  			typeURL: typeURL,
   250  		}
   251  
   252  		selectCases[i] = reflect.SelectCase{
   253  			Dir:  reflect.SelectRecv,
   254  			Chan: quietChValue,
   255  		}
   256  
   257  		typeIndexes[typeURL] = i
   258  
   259  		i++
   260  	}
   261  
   262  	streamLog.Info("starting xDS stream processing")
   263  
   264  	nodeIP := ""
   265  
   266  	if s.restorerPromise != nil {
   267  		restorer, err := s.restorerPromise.Await(ctx)
   268  		if err != nil {
   269  			return err
   270  		}
   271  
   272  		if restorer != nil {
   273  			streamLog.Debug("Waiting for endpoint restoration before serving resources...")
   274  			restorer.WaitForEndpointRestore(ctx)
   275  			for typeURL, ackObserver := range s.ackObservers {
   276  				streamLog.WithField(logfields.XDSTypeURL, typeURL).
   277  					Debug("Endpoints restored, starting serving.")
   278  				ackObserver.MarkRestoreCompleted()
   279  			}
   280  		}
   281  	}
   282  
   283  	for {
   284  		// Process either a new request from the xDS stream or a response
   285  		// from the resource watcher.
   286  		chosen, recv, recvOK := reflect.Select(selectCases)
   287  
   288  		switch chosen {
   289  		case doneChIndex: // Context got canceled, most likely by the client terminating.
   290  			streamLog.WithError(ctx.Err()).Debug("xDS stream context canceled")
   291  			return nil
   292  
   293  		case reqChIndex: // Request received from the stream.
   294  			if !recvOK {
   295  				streamLog.Info("xDS stream closed")
   296  				return nil
   297  			}
   298  
   299  			req := recv.Interface().(*envoy_service_discovery.DiscoveryRequest)
   300  
   301  			// only require Node to exist in the first request
   302  			if nodeIP == "" {
   303  				id := req.GetNode().GetId()
   304  				streamLog = streamLog.WithField(logfields.XDSClientNode, id)
   305  				var err error
   306  				nodeIP, err = EnvoyNodeIdToIP(id)
   307  				if err != nil {
   308  					streamLog.WithError(err).Error("invalid Node in xDS request")
   309  					return ErrInvalidNodeFormat
   310  				}
   311  			}
   312  
   313  			requestLog := streamLog.WithFields(getXDSRequestFields(req))
   314  
   315  			// Ensure that the version info is a string that was sent by this
   316  			// server or the empty string (the first request in a stream should
   317  			// always have an empty version info).
   318  			var versionInfo uint64
   319  			if req.GetVersionInfo() != "" {
   320  				var err error
   321  				versionInfo, err = strconv.ParseUint(req.VersionInfo, 10, 64)
   322  				if err != nil {
   323  					requestLog.Errorf("invalid version info in xDS request, not a uint64")
   324  					return ErrInvalidVersionInfo
   325  				}
   326  			}
   327  			var nonce uint64
   328  			if req.GetResponseNonce() != "" {
   329  				var err error
   330  				nonce, err = strconv.ParseUint(req.ResponseNonce, 10, 64)
   331  				if err != nil {
   332  					requestLog.Error("invalid response nonce info in xDS request, not a uint64")
   333  					return ErrInvalidResponseNonce
   334  				}
   335  			}
   336  			var detail string
   337  			status := req.GetErrorDetail()
   338  			if status != nil {
   339  				detail = status.Message
   340  			}
   341  
   342  			typeURL := req.GetTypeUrl()
   343  			if defaultTypeURL == AnyTypeURL && typeURL == "" {
   344  				requestLog.Error("no type URL given in ADS request")
   345  				return ErrNoADSTypeURL
   346  			}
   347  
   348  			index, exists := typeIndexes[typeURL]
   349  			if !exists {
   350  				requestLog.Error("unknown type URL in xDS request")
   351  				return ErrUnknownTypeURL
   352  			}
   353  
   354  			state := &typeStates[index]
   355  			watcher := s.watchers[typeURL]
   356  
   357  			if nonce == 0 && versionInfo > 0 {
   358  				requestLog.Debugf("xDS was restarted, setting nonce to %d", versionInfo)
   359  				nonce = versionInfo
   360  			}
   361  
   362  			// Response nonce is always the same as the response version.
   363  			// Request version indicates the last acked version. If the
   364  			// response nonce in the request is different (smaller) than
   365  			// the version, all versions upto that version are acked, but
   366  			// the versions from that to and including the nonce are nacked.
   367  			if versionInfo <= nonce {
   368  				ackObserver := s.ackObservers[typeURL]
   369  				if ackObserver != nil {
   370  					requestLog.Debug("notifying observers of ACKs")
   371  					ackObserver.HandleResourceVersionAck(versionInfo, nonce, nodeIP, state.resourceNames, typeURL, detail)
   372  				} else {
   373  					requestLog.Debug("ACK received but no observers are waiting for ACKs")
   374  				}
   375  				if versionInfo < nonce {
   376  					// versions after VersionInfo, upto and including ResponseNonce are NACKed
   377  					requestLog.WithField(logfields.XDSDetail, detail).Warningf("NACK received for versions after %s and up to %s; waiting for a version update before sending again", req.VersionInfo, req.ResponseNonce)
   378  					// Watcher will behave as if the sent version was acked.
   379  					// Otherwise we will just be sending the same failing
   380  					// version over and over filling logs.
   381  					versionInfo = state.version
   382  				}
   383  
   384  				if state.pendingWatchCancel != nil {
   385  					// A pending watch exists for this type URL. Cancel it to
   386  					// start a new watch.
   387  					requestLog.Debug("canceling pending watch")
   388  					state.pendingWatchCancel()
   389  				}
   390  
   391  				respCh := make(chan *VersionedResources, 1)
   392  				selectCases[index].Chan = reflect.ValueOf(respCh)
   393  
   394  				ctx, cancel := context.WithCancel(ctx)
   395  				state.pendingWatchCancel = cancel
   396  
   397  				requestLog.Debugf("starting watch on %d resources", len(req.GetResourceNames()))
   398  				go watcher.WatchResources(ctx, typeURL, versionInfo, nodeIP, req.GetResourceNames(), respCh)
   399  			} else {
   400  				requestLog.Debug("received invalid nonce in xDS request; ignoring request")
   401  			}
   402  		default: // Pending watch response.
   403  			state := &typeStates[chosen]
   404  			state.pendingWatchCancel()
   405  			state.pendingWatchCancel = nil
   406  
   407  			if !recvOK {
   408  				streamLog.WithField(logfields.XDSTypeURL, state.typeURL).
   409  					Error("xDS resource watch failed; terminating")
   410  				return ErrResourceWatch
   411  			}
   412  
   413  			// Disabling reading from the channel after reading any from it,
   414  			// since the watcher will close it anyway.
   415  			selectCases[chosen].Chan = quietChValue
   416  
   417  			resp := recv.Interface().(*VersionedResources)
   418  
   419  			responseLog := streamLog.WithFields(logrus.Fields{
   420  				logfields.XDSCachedVersion: resp.Version,
   421  				logfields.XDSCanary:        resp.Canary,
   422  				logfields.XDSTypeURL:       state.typeURL,
   423  				logfields.XDSNonce:         resp.Version,
   424  			})
   425  
   426  			resources := make([]*anypb.Any, len(resp.Resources))
   427  
   428  			// Marshall the resources into protobuf's Any type.
   429  			for i, res := range resp.Resources {
   430  				any, err := anypb.New(res)
   431  				if err != nil {
   432  					responseLog.WithError(err).Errorf("error marshalling xDS response (%d resources)", len(resp.Resources))
   433  					return err
   434  				}
   435  				resources[i] = any
   436  			}
   437  
   438  			responseLog.Debugf("sending xDS response with %d resources", len(resp.Resources))
   439  
   440  			versionStr := strconv.FormatUint(resp.Version, 10)
   441  			out := &envoy_service_discovery.DiscoveryResponse{
   442  				VersionInfo: versionStr,
   443  				Resources:   resources,
   444  				Canary:      resp.Canary,
   445  				TypeUrl:     state.typeURL,
   446  				Nonce:       versionStr,
   447  			}
   448  			err := stream.Send(out)
   449  			if err != nil {
   450  				return err
   451  			}
   452  
   453  			state.version = resp.Version
   454  			state.resourceNames = resp.ResourceNames
   455  		}
   456  	}
   457  }