istio.io/istio@v0.0.0-20240520182934-d79c90f27776/pkg/istio-agent/xds_proxy_delta.go (about)

     1  // Copyright Istio Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package istioagent
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strings"
    21  	"time"
    22  
    23  	discovery "github.com/envoyproxy/go-control-plane/envoy/service/discovery/v3"
    24  	"go.uber.org/atomic"
    25  	google_rpc "google.golang.org/genproto/googleapis/rpc/status"
    26  	"google.golang.org/grpc"
    27  	"google.golang.org/grpc/codes"
    28  	"google.golang.org/grpc/metadata"
    29  	anypb "google.golang.org/protobuf/types/known/anypb"
    30  
    31  	"istio.io/istio/pilot/pkg/features"
    32  	"istio.io/istio/pkg/channels"
    33  	"istio.io/istio/pkg/istio-agent/metrics"
    34  	"istio.io/istio/pkg/log"
    35  	"istio.io/istio/pkg/model"
    36  	"istio.io/istio/pkg/slices"
    37  	"istio.io/istio/pkg/wasm"
    38  )
    39  
    40  // sendDeltaRequest is a small wrapper around sending to con.requestsChan. This ensures that we do not
    41  // block forever on
    42  func (con *ProxyConnection) sendDeltaRequest(req *discovery.DeltaDiscoveryRequest) {
    43  	con.deltaRequestsChan.Put(req)
    44  }
    45  
    46  // DeltaAggregatedResources is an implementation of Delta XDS API used for proxying between Istiod and Envoy.
    47  // Every time envoy makes a fresh connection to the agent, we reestablish a new connection to the upstream xds
    48  // This ensures that a new connection between istiod and agent doesn't end up consuming pending messages from envoy
    49  // as the new connection may not go to the same istiod. Vice versa case also applies.
    50  func (p *XdsProxy) DeltaAggregatedResources(downstream DeltaDiscoveryStream) error {
    51  	proxyLog.Debugf("accepted delta xds connection from envoy, forwarding to upstream")
    52  
    53  	con := &ProxyConnection{
    54  		conID:             connectionNumber.Inc(),
    55  		upstreamError:     make(chan error), // can be produced by recv and send
    56  		downstreamError:   make(chan error), // can be produced by recv and send
    57  		deltaRequestsChan: channels.NewUnbounded[*discovery.DeltaDiscoveryRequest](),
    58  		// Allow a buffer of 1. This ensures we queue up at most 2 (one in process, 1 pending) responses before forwarding.
    59  		deltaResponsesChan: make(chan *discovery.DeltaDiscoveryResponse, 1),
    60  		stopChan:           make(chan struct{}),
    61  		downstreamDeltas:   downstream,
    62  	}
    63  	p.registerStream(con)
    64  	defer p.unregisterStream(con)
    65  
    66  	ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
    67  	defer cancel()
    68  
    69  	upstreamConn, err := p.buildUpstreamConn(ctx)
    70  	if err != nil {
    71  		proxyLog.Errorf("failed to connect to upstream %s: %v", p.istiodAddress, err)
    72  		metrics.IstiodConnectionFailures.Increment()
    73  		return err
    74  	}
    75  	defer upstreamConn.Close()
    76  
    77  	xds := discovery.NewAggregatedDiscoveryServiceClient(upstreamConn)
    78  	ctx = metadata.AppendToOutgoingContext(context.Background(), "ClusterID", p.clusterID)
    79  	for k, v := range p.xdsHeaders {
    80  		ctx = metadata.AppendToOutgoingContext(ctx, k, v)
    81  	}
    82  	// We must propagate upstream termination to Envoy. This ensures that we resume the full XDS sequence on new connection
    83  	return p.handleDeltaUpstream(ctx, con, xds)
    84  }
    85  
    86  func (p *XdsProxy) handleDeltaUpstream(ctx context.Context, con *ProxyConnection, xds discovery.AggregatedDiscoveryServiceClient) error {
    87  	log := proxyLog.WithLabels("id", con.conID)
    88  	deltaUpstream, err := xds.DeltaAggregatedResources(ctx,
    89  		grpc.MaxCallRecvMsgSize(defaultClientMaxReceiveMessageSize))
    90  	if err != nil {
    91  		// Envoy logs errors again, so no need to log beyond debug level
    92  		log.Debugf("failed to create delta upstream grpc client: %v", err)
    93  		// Increase metric when xds connection error, for example: forgot to restart ingressgateway or sidecar after changing root CA.
    94  		metrics.IstiodConnectionErrors.Increment()
    95  		return err
    96  	}
    97  	log.Infof("connected to delta upstream XDS server: %s", p.istiodAddress)
    98  	defer log.Debugf("disconnected from delta XDS server: %s", p.istiodAddress)
    99  
   100  	con.upstreamDeltas = deltaUpstream
   101  
   102  	// handle responses from istiod
   103  	go func() {
   104  		for {
   105  			resp, err := con.upstreamDeltas.Recv()
   106  			if err != nil {
   107  				upstreamErr(con, err)
   108  				return
   109  			}
   110  			select {
   111  			case con.deltaResponsesChan <- resp:
   112  			case <-con.stopChan:
   113  			}
   114  		}
   115  	}()
   116  
   117  	go p.handleUpstreamDeltaRequest(con)
   118  	go p.handleUpstreamDeltaResponse(con)
   119  
   120  	for {
   121  		select {
   122  		case err := <-con.upstreamError:
   123  			return err
   124  		case err := <-con.downstreamError:
   125  			// On downstream error, we will return. This propagates the error to downstream envoy which will trigger reconnect
   126  			return err
   127  		case <-con.stopChan:
   128  			log.Debugf("upstream stopped")
   129  			return nil
   130  		}
   131  	}
   132  }
   133  
   134  func (p *XdsProxy) handleUpstreamDeltaRequest(con *ProxyConnection) {
   135  	log := proxyLog.WithLabels("id", con.conID)
   136  	initialRequestsSent := atomic.NewBool(false)
   137  	go func() {
   138  		for {
   139  			// recv delta xds requests from envoy
   140  			req, err := con.downstreamDeltas.Recv()
   141  			if err != nil {
   142  				downstreamErr(con, err)
   143  				return
   144  			}
   145  
   146  			// forward to istiod
   147  			con.sendDeltaRequest(req)
   148  			if !initialRequestsSent.Load() && req.TypeUrl == model.ListenerType {
   149  				// fire off an initial NDS request
   150  				if _, f := p.handlers[model.NameTableType]; f {
   151  					con.sendDeltaRequest(&discovery.DeltaDiscoveryRequest{
   152  						TypeUrl: model.NameTableType,
   153  					})
   154  				}
   155  				// fire off an initial PCDS request
   156  				if _, f := p.handlers[model.ProxyConfigType]; f {
   157  					con.sendDeltaRequest(&discovery.DeltaDiscoveryRequest{
   158  						TypeUrl: model.ProxyConfigType,
   159  					})
   160  				}
   161  				// set flag before sending the initial request to prevent race.
   162  				initialRequestsSent.Store(true)
   163  				// Fire of a configured initial request, if there is one
   164  				p.connectedMutex.RLock()
   165  				initialRequest := p.initialDeltaHealthRequest
   166  				if initialRequest != nil {
   167  					con.sendDeltaRequest(initialRequest)
   168  				}
   169  				p.connectedMutex.RUnlock()
   170  			}
   171  		}
   172  	}()
   173  
   174  	defer func() {
   175  		_ = con.upstreamDeltas.CloseSend()
   176  	}()
   177  	for {
   178  		select {
   179  		case req := <-con.deltaRequestsChan.Get():
   180  			con.deltaRequestsChan.Load()
   181  			if req.TypeUrl == model.HealthInfoType && !initialRequestsSent.Load() {
   182  				// only send healthcheck probe after LDS request has been sent
   183  				continue
   184  			}
   185  			log.WithLabels(
   186  				"type", model.GetShortType(req.TypeUrl),
   187  				"sub", len(req.ResourceNamesSubscribe),
   188  				"unsub", len(req.ResourceNamesUnsubscribe),
   189  				"nonce", req.ResponseNonce,
   190  				"initial", len(req.InitialResourceVersions),
   191  			).Debugf("delta request")
   192  			metrics.XdsProxyRequests.Increment()
   193  			if req.TypeUrl == model.ExtensionConfigurationType {
   194  				p.ecdsLastNonce.Store(req.ResponseNonce)
   195  			}
   196  
   197  			if err := con.upstreamDeltas.Send(req); err != nil {
   198  				err = fmt.Errorf("send error for type url %s: %v", req.TypeUrl, err)
   199  				upstreamErr(con, err)
   200  				return
   201  			}
   202  		case <-con.stopChan:
   203  			return
   204  		}
   205  	}
   206  }
   207  
   208  func (p *XdsProxy) handleUpstreamDeltaResponse(con *ProxyConnection) {
   209  	forwardEnvoyCh := make(chan *discovery.DeltaDiscoveryResponse, 1)
   210  	for {
   211  		select {
   212  		case resp := <-con.deltaResponsesChan:
   213  			// TODO: separate upstream response handling from requests sending, which are both time costly
   214  			proxyLog.WithLabels(
   215  				"id", con.conID,
   216  				"type", model.GetShortType(resp.TypeUrl),
   217  				"nonce", resp.Nonce,
   218  				"resources", len(resp.Resources),
   219  				"removes", len(resp.RemovedResources),
   220  			).Debugf("upstream response")
   221  			metrics.XdsProxyResponses.Increment()
   222  			if h, f := p.handlers[resp.TypeUrl]; f {
   223  				if len(resp.Resources) == 0 {
   224  					// Empty response, nothing to do
   225  					// This assumes internal types are always singleton
   226  					break
   227  				}
   228  				err := h(resp.Resources[0].Resource)
   229  				var errorResp *google_rpc.Status
   230  				if err != nil {
   231  					errorResp = &google_rpc.Status{
   232  						Code:    int32(codes.Internal),
   233  						Message: err.Error(),
   234  					}
   235  				}
   236  				// Send ACK/NACK
   237  				con.sendDeltaRequest(&discovery.DeltaDiscoveryRequest{
   238  					TypeUrl:       resp.TypeUrl,
   239  					ResponseNonce: resp.Nonce,
   240  					ErrorDetail:   errorResp,
   241  				})
   242  				continue
   243  			}
   244  			switch resp.TypeUrl {
   245  			case model.ExtensionConfigurationType:
   246  				if features.WasmRemoteLoadConversion {
   247  					// If Wasm remote load conversion feature is enabled, rewrite and send.
   248  					go p.deltaRewriteAndForward(con, resp, func(resp *discovery.DeltaDiscoveryResponse) {
   249  						// Forward the response using the thread of `handleUpstreamResponse`
   250  						// to prevent concurrent access to forwardToEnvoy
   251  						select {
   252  						case forwardEnvoyCh <- resp:
   253  						case <-con.stopChan:
   254  						}
   255  					})
   256  				} else {
   257  					// Otherwise, forward ECDS resource update directly to Envoy.
   258  					forwardDeltaToEnvoy(con, resp)
   259  				}
   260  			default:
   261  				if strings.HasPrefix(resp.TypeUrl, model.DebugType) {
   262  					p.forwardDeltaToTap(resp)
   263  				} else {
   264  					forwardDeltaToEnvoy(con, resp)
   265  				}
   266  			}
   267  		case resp := <-forwardEnvoyCh:
   268  			forwardDeltaToEnvoy(con, resp)
   269  		case <-con.stopChan:
   270  			return
   271  		}
   272  	}
   273  }
   274  
   275  func (p *XdsProxy) deltaRewriteAndForward(con *ProxyConnection, resp *discovery.DeltaDiscoveryResponse, forward func(resp *discovery.DeltaDiscoveryResponse)) {
   276  	resources := make([]*anypb.Any, 0, len(resp.Resources))
   277  	for i := range resp.Resources {
   278  		resources = append(resources, resp.Resources[i].Resource)
   279  	}
   280  
   281  	if err := wasm.MaybeConvertWasmExtensionConfig(resources, p.wasmCache); err != nil {
   282  		proxyLog.Debugf("sending NACK for ECDS resources %+v, err: %+v", resp.Resources, err)
   283  		con.sendDeltaRequest(&discovery.DeltaDiscoveryRequest{
   284  			TypeUrl:       resp.TypeUrl,
   285  			ResponseNonce: resp.Nonce,
   286  			ErrorDetail: &google_rpc.Status{
   287  				Code:    int32(codes.Internal),
   288  				Message: err.Error(),
   289  			},
   290  		})
   291  		return
   292  	}
   293  
   294  	for i := range resources {
   295  		resp.Resources[i].Resource = resources[i]
   296  	}
   297  
   298  	proxyLog.WithLabels("resources", slices.Map(resp.Resources, (*discovery.Resource).GetName), "removes", resp.RemovedResources).Debugf("forward ECDS")
   299  	forward(resp)
   300  }
   301  
   302  func forwardDeltaToEnvoy(con *ProxyConnection, resp *discovery.DeltaDiscoveryResponse) {
   303  	if !model.IsEnvoyType(resp.TypeUrl) && resp.TypeUrl != model.WorkloadType {
   304  		proxyLog.Errorf("Skipping forwarding type url %s to Envoy as is not a valid Envoy type", resp.TypeUrl)
   305  		return
   306  	}
   307  	if con.isClosed() {
   308  		proxyLog.WithLabels("id", con.conID).Errorf("downstream dropped delta xds push to Envoy, connection already closed")
   309  		return
   310  	}
   311  	if err := sendDownstreamDelta(con.downstreamDeltas, resp); err != nil {
   312  		err = fmt.Errorf("send error for type url %s: %v", resp.TypeUrl, err)
   313  		downstreamErr(con, err)
   314  		return
   315  	}
   316  }
   317  
   318  func sendDownstreamDelta(deltaDownstream DeltaDiscoveryStream, res *discovery.DeltaDiscoveryResponse) error {
   319  	tStart := time.Now()
   320  	defer func() {
   321  		// This is a hint to help debug slow responses.
   322  		if time.Since(tStart) > 10*time.Second {
   323  			proxyLog.Warnf("sendDownstreamDelta took %v", time.Since(tStart))
   324  		}
   325  	}()
   326  	return deltaDownstream.Send(res)
   327  }
   328  
   329  func (p *XdsProxy) sendDeltaHealthRequest(req *discovery.DeltaDiscoveryRequest) {
   330  	p.connectedMutex.Lock()
   331  	// Immediately send if we are currently connected.
   332  	if p.connected != nil && p.connected.deltaRequestsChan != nil {
   333  		p.connected.deltaRequestsChan.Put(req)
   334  	}
   335  	// Otherwise place it as our initial request for new connections
   336  	p.initialDeltaHealthRequest = req
   337  	p.connectedMutex.Unlock()
   338  }
   339  
   340  func (p *XdsProxy) forwardDeltaToTap(resp *discovery.DeltaDiscoveryResponse) {
   341  	select {
   342  	// Convert back to a SotW response
   343  	case p.tapResponseChannel <- &discovery.DiscoveryResponse{
   344  		VersionInfo:  resp.SystemVersionInfo,
   345  		Resources:    slices.Map(resp.Resources, (*discovery.Resource).GetResource),
   346  		Canary:       false,
   347  		TypeUrl:      resp.TypeUrl,
   348  		Nonce:        resp.Nonce,
   349  		ControlPlane: resp.ControlPlane,
   350  	}:
   351  	default:
   352  		log.Infof("tap response %q arrived too late; discarding", resp.TypeUrl)
   353  	}
   354  }