google.golang.org/grpc@v1.74.2/xds/internal/clients/lrsclient/lrs_stream.go

google.golang.org/grpc@v1.74.2/xds/internal/clients/lrsclient/lrs_stream.go (about)

     1  /*
     2   *
     3   * Copyright 2025 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   */
    17  
    18  package lrsclient
    19  
    20  import (
    21  	"context"
    22  	"fmt"
    23  	"io"
    24  	"time"
    25  
    26  	"google.golang.org/grpc/grpclog"
    27  	"google.golang.org/grpc/internal/backoff"
    28  	igrpclog "google.golang.org/grpc/internal/grpclog"
    29  	"google.golang.org/grpc/internal/pretty"
    30  	"google.golang.org/grpc/xds/internal/clients"
    31  	"google.golang.org/protobuf/proto"
    32  	"google.golang.org/protobuf/types/known/durationpb"
    33  
    34  	v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
    35  	v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3"
    36  	v3lrspb "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3"
    37  )
    38  
// perRPCVerbosityLevel is the verbosity level at which any per-RPC level logs
// that print complete request or response messages should be gated. Other
// per-RPC level logs which print terse output should be at `INFO` and
// verbosity 2.
const perRPCVerbosityLevel = 9
    43  
// streamImpl provides all the functionality associated with an LRS (Load
// Reporting Service) stream on the client-side. It manages the lifecycle of
// the LRS stream, including starting, stopping, and retrying the stream. It
// also provides a LoadStore that can be used to report load, with a Stop
// function that should be called when the load reporting is no longer
// needed.
type streamImpl struct {
	// The following fields are initialized when a stream instance is created
	// and are read-only afterwards, and hence can be accessed without a mutex.
	transport clients.Transport       // Transport used to create the LRS stream.
	backoff   func(int) time.Duration // Backoff for retries, after stream failures.
	nodeProto *v3corepb.Node          // Identifies the gRPC application.
	doneCh    chan struct{}           // Closed when the runner goroutine exits.
	logger    *igrpclog.PrefixLogger

	cancelStream context.CancelFunc // Cancel the stream. If nil, the stream is not active.
	loadStore    *LoadStore         // LoadStore returned to user for pushing loads.

	// The two channels below coordinate one last load report: sendLoads
	// receives from finalSendRequest, attempts a final send, and then writes
	// the outcome (nil on success) to finalSendDone.
	finalSendRequest chan struct{} // To request for the final attempt to send loads.
	finalSendDone    chan error    // To signal completion of the final attempt of sending loads.
}
    65  
// streamOpts holds the options for creating a streamImpl via newStreamImpl.
type streamOpts struct {
	transport clients.Transport       // xDS transport to create the stream on.
	backoff   func(int) time.Duration // Backoff for retries, after stream failures.
	nodeProto *v3corepb.Node          // Node proto to identify the gRPC application.
	logPrefix string                  // Prefix to be used for log messages.
}
    73  
    74  // newStreamImpl creates a new StreamImpl with the provided options.
    75  //
    76  // The actual streaming RPC call is initiated when the first call to ReportLoad
    77  // is made, and is terminated when the last call to ReportLoad is canceled.
    78  func newStreamImpl(opts streamOpts) *streamImpl {
    79  	ctx, cancel := context.WithCancel(context.Background())
    80  
    81  	lrs := &streamImpl{
    82  		transport:        opts.transport,
    83  		backoff:          opts.backoff,
    84  		nodeProto:        opts.nodeProto,
    85  		cancelStream:     cancel,
    86  		doneCh:           make(chan struct{}),
    87  		finalSendRequest: make(chan struct{}, 1),
    88  		finalSendDone:    make(chan error, 1),
    89  	}
    90  
    91  	l := grpclog.Component("xds")
    92  	lrs.logger = igrpclog.NewPrefixLogger(l, opts.logPrefix+fmt.Sprintf("[lrs-stream %p] ", lrs))
    93  	lrs.loadStore = newLoadStore()
    94  	go lrs.runner(ctx)
    95  	return lrs
    96  }
    97  
    98  // runner is responsible for managing the lifetime of an LRS streaming call. It
    99  // creates the stream, sends the initial LoadStatsRequest, receives the first
   100  // LoadStatsResponse, and then starts a goroutine to periodically send
   101  // LoadStatsRequests. The runner will restart the stream if it encounters any
   102  // errors.
   103  func (lrs *streamImpl) runner(ctx context.Context) {
   104  	defer close(lrs.doneCh)
   105  
   106  	// This feature indicates that the client supports the
   107  	// LoadStatsResponse.send_all_clusters field in the LRS response.
   108  	node := proto.Clone(lrs.nodeProto).(*v3corepb.Node)
   109  	node.ClientFeatures = append(node.ClientFeatures, "envoy.lrs.supports_send_all_clusters")
   110  
   111  	runLoadReportStream := func() error {
   112  		// streamCtx is created and canceled in case we terminate the stream
   113  		// early for any reason, to avoid gRPC-Go leaking the RPC's monitoring
   114  		// goroutine.
   115  		streamCtx, cancel := context.WithCancel(ctx)
   116  		defer cancel()
   117  
   118  		stream, err := lrs.transport.NewStream(streamCtx, "/envoy.service.load_stats.v3.LoadReportingService/StreamLoadStats")
   119  		if err != nil {
   120  			lrs.logger.Warningf("Failed to create new LRS streaming RPC: %v", err)
   121  			return nil
   122  		}
   123  		if lrs.logger.V(2) {
   124  			lrs.logger.Infof("LRS stream created")
   125  		}
   126  
   127  		if err := lrs.sendFirstLoadStatsRequest(stream, node); err != nil {
   128  			lrs.logger.Warningf("Sending first LRS request failed: %v", err)
   129  			return nil
   130  		}
   131  
   132  		clusters, interval, err := lrs.recvFirstLoadStatsResponse(stream)
   133  		if err != nil {
   134  			lrs.logger.Warningf("Reading from LRS streaming RPC failed: %v", err)
   135  			return nil
   136  		}
   137  
   138  		// We reset backoff state when we successfully receive at least one
   139  		// message from the server.
   140  		lrs.sendLoads(streamCtx, stream, clusters, interval)
   141  		return backoff.ErrResetBackoff
   142  	}
   143  	backoff.RunF(ctx, runLoadReportStream, lrs.backoff)
   144  }
   145  
   146  // sendLoads is responsible for periodically sending load reports to the LRS
   147  // server at the specified interval for the specified clusters, until the passed
   148  // in context is canceled.
   149  func (lrs *streamImpl) sendLoads(ctx context.Context, stream clients.Stream, clusterNames []string, interval time.Duration) {
   150  	tick := time.NewTicker(interval)
   151  	defer tick.Stop()
   152  	for {
   153  		select {
   154  		case <-tick.C:
   155  		case <-ctx.Done():
   156  			return
   157  		case <-lrs.finalSendRequest:
   158  			var finalSendErr error
   159  			if lrs.logger.V(2) {
   160  				lrs.logger.Infof("Final send request received. Attempting final LRS report.")
   161  			}
   162  			if err := lrs.sendLoadStatsRequest(stream, lrs.loadStore.stats(clusterNames)); err != nil {
   163  				lrs.logger.Warningf("Failed to send final load report. Writing to LRS stream failed: %v", err)
   164  				finalSendErr = err
   165  			}
   166  			if lrs.logger.V(2) {
   167  				lrs.logger.Infof("Successfully sent final load report.")
   168  			}
   169  			lrs.finalSendDone <- finalSendErr
   170  			return
   171  		}
   172  
   173  		if err := lrs.sendLoadStatsRequest(stream, lrs.loadStore.stats(clusterNames)); err != nil {
   174  			lrs.logger.Warningf("Failed to send periodic load report. Writing to LRS stream failed: %v", err)
   175  			return
   176  		}
   177  	}
   178  }
   179  
   180  func (lrs *streamImpl) sendFirstLoadStatsRequest(stream clients.Stream, node *v3corepb.Node) error {
   181  	req := &v3lrspb.LoadStatsRequest{Node: node}
   182  	if lrs.logger.V(perRPCVerbosityLevel) {
   183  		lrs.logger.Infof("Sending initial LoadStatsRequest: %s", pretty.ToJSON(req))
   184  	}
   185  	msg, err := proto.Marshal(req)
   186  	if err != nil {
   187  		lrs.logger.Warningf("Failed to marshal LoadStatsRequest: %v", err)
   188  		return err
   189  	}
   190  	err = stream.Send(msg)
   191  	if err == io.EOF {
   192  		return getStreamError(stream)
   193  	}
   194  	return err
   195  }
   196  
   197  // recvFirstLoadStatsResponse receives the first LoadStatsResponse from the LRS
   198  // server.  Returns the following:
   199  //   - a list of cluster names requested by the server or an empty slice if the
   200  //     server requested for load from all clusters
   201  //   - the load reporting interval, and
   202  //   - any error encountered
   203  func (lrs *streamImpl) recvFirstLoadStatsResponse(stream clients.Stream) ([]string, time.Duration, error) {
   204  	r, err := stream.Recv()
   205  	if err != nil {
   206  		return nil, 0, fmt.Errorf("lrs: failed to receive first LoadStatsResponse: %v", err)
   207  	}
   208  	var resp v3lrspb.LoadStatsResponse
   209  	if err := proto.Unmarshal(r, &resp); err != nil {
   210  		if lrs.logger.V(2) {
   211  			lrs.logger.Infof("Failed to unmarshal response to LoadStatsResponse: %v", err)
   212  		}
   213  		return nil, time.Duration(0), fmt.Errorf("lrs: unexpected message type %T", r)
   214  	}
   215  	if lrs.logger.V(perRPCVerbosityLevel) {
   216  		lrs.logger.Infof("Received first LoadStatsResponse: %s", pretty.ToJSON(&resp))
   217  	}
   218  
   219  	internal := resp.GetLoadReportingInterval()
   220  	if internal.CheckValid() != nil {
   221  		return nil, 0, fmt.Errorf("lrs: invalid load_reporting_interval: %v", err)
   222  	}
   223  	loadReportingInterval := internal.AsDuration()
   224  
   225  	clusters := resp.Clusters
   226  	if resp.SendAllClusters {
   227  		// Return an empty slice to send stats for all clusters.
   228  		clusters = []string{}
   229  	}
   230  
   231  	return clusters, loadReportingInterval, nil
   232  }
   233  
   234  func (lrs *streamImpl) sendLoadStatsRequest(stream clients.Stream, loads []*loadData) error {
   235  	clusterStats := make([]*v3endpointpb.ClusterStats, 0, len(loads))
   236  	for _, sd := range loads {
   237  		droppedReqs := make([]*v3endpointpb.ClusterStats_DroppedRequests, 0, len(sd.drops))
   238  		for category, count := range sd.drops {
   239  			droppedReqs = append(droppedReqs, &v3endpointpb.ClusterStats_DroppedRequests{
   240  				Category:     category,
   241  				DroppedCount: count,
   242  			})
   243  		}
   244  		localityStats := make([]*v3endpointpb.UpstreamLocalityStats, 0, len(sd.localityStats))
   245  		for lid, localityData := range sd.localityStats {
   246  			loadMetricStats := make([]*v3endpointpb.EndpointLoadMetricStats, 0, len(localityData.loadStats))
   247  			for name, loadData := range localityData.loadStats {
   248  				loadMetricStats = append(loadMetricStats, &v3endpointpb.EndpointLoadMetricStats{
   249  					MetricName:                    name,
   250  					NumRequestsFinishedWithMetric: loadData.count,
   251  					TotalMetricValue:              loadData.sum,
   252  				})
   253  			}
   254  			localityStats = append(localityStats, &v3endpointpb.UpstreamLocalityStats{
   255  				Locality: &v3corepb.Locality{
   256  					Region:  lid.Region,
   257  					Zone:    lid.Zone,
   258  					SubZone: lid.SubZone,
   259  				},
   260  				TotalSuccessfulRequests: localityData.requestStats.succeeded,
   261  				TotalRequestsInProgress: localityData.requestStats.inProgress,
   262  				TotalErrorRequests:      localityData.requestStats.errored,
   263  				TotalIssuedRequests:     localityData.requestStats.issued,
   264  				LoadMetricStats:         loadMetricStats,
   265  				UpstreamEndpointStats:   nil, // TODO: populate for per endpoint loads.
   266  			})
   267  		}
   268  
   269  		clusterStats = append(clusterStats, &v3endpointpb.ClusterStats{
   270  			ClusterName:           sd.cluster,
   271  			ClusterServiceName:    sd.service,
   272  			UpstreamLocalityStats: localityStats,
   273  			TotalDroppedRequests:  sd.totalDrops,
   274  			DroppedRequests:       droppedReqs,
   275  			LoadReportInterval:    durationpb.New(sd.reportInterval),
   276  		})
   277  	}
   278  
   279  	req := &v3lrspb.LoadStatsRequest{ClusterStats: clusterStats}
   280  	if lrs.logger.V(perRPCVerbosityLevel) {
   281  		lrs.logger.Infof("Sending LRS loads: %s", pretty.ToJSON(req))
   282  	}
   283  	msg, err := proto.Marshal(req)
   284  	if err != nil {
   285  		if lrs.logger.V(2) {
   286  			lrs.logger.Infof("Failed to marshal LoadStatsRequest: %v", err)
   287  		}
   288  		return err
   289  	}
   290  	err = stream.Send(msg)
   291  	if err == io.EOF {
   292  		return getStreamError(stream)
   293  	}
   294  	return err
   295  }
   296  
   297  func getStreamError(stream clients.Stream) error {
   298  	for {
   299  		if _, err := stream.Recv(); err != nil {
   300  			return err
   301  		}
   302  	}
   303  }