google.golang.org/grpc@v1.72.2/xds/internal/xdsclient/transport/lrs/lrs_stream.go (about)

     1  /*
     2   *
     3   * Copyright 2024 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   */
    17  
    18  // Package lrs provides the implementation of an LRS (Load Reporting Service)
    19  // stream for the xDS client.
    20  package lrs
    21  
    22  import (
    23  	"context"
    24  	"fmt"
    25  	"io"
    26  	"sync"
    27  	"time"
    28  
    29  	"google.golang.org/grpc/grpclog"
    30  	"google.golang.org/grpc/internal/backoff"
    31  	igrpclog "google.golang.org/grpc/internal/grpclog"
    32  	"google.golang.org/grpc/internal/pretty"
    33  	"google.golang.org/grpc/xds/internal"
    34  	"google.golang.org/grpc/xds/internal/xdsclient/load"
    35  	"google.golang.org/grpc/xds/internal/xdsclient/transport"
    36  	"google.golang.org/protobuf/proto"
    37  	"google.golang.org/protobuf/types/known/durationpb"
    38  
    39  	v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
    40  	v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3"
    41  	v3lrspb "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3"
    42  )
    43  
// perRPCVerbosityLevel is the verbosity level used in this file to gate
// per-RPC logs which print complete request or response messages. Other
// per-RPC level logs which print terse output should be at `INFO` and
// verbosity 2.
const perRPCVerbosityLevel = 9
    48  
// StreamImpl provides all the functionality associated with an LRS (Load Reporting
// Service) stream on the client-side. It manages the lifecycle of the LRS stream,
// including starting, stopping, and retrying the stream. It also provides a
// load.Store that can be used to report load, and a cleanup function that should
// be called when the load reporting is no longer needed.
//
// The stream is reference counted: it is started by the first ReportLoad call
// and canceled either when the last cleanup function runs or when Stop is
// called.
type StreamImpl struct {
	// The following fields are initialized when a Stream instance is created
	// and are read-only afterwards, and hence can be accessed without a mutex.
	//
	// NOTE(review): doneCh is the exception. It is not set at construction
	// time; ReportLoad assigns a fresh channel, while holding mu, each time it
	// starts the runner goroutine, and the runner closes it on exit.
	transport transport.Transport     // Transport to use for LRS stream.
	backoff   func(int) time.Duration // Backoff for retries, after stream failures.
	nodeProto *v3corepb.Node          // Identifies the gRPC application.
	doneCh    chan struct{}           // To notify exit of LRS goroutine.
	logger    *igrpclog.PrefixLogger

	// Guards access to the below fields.
	mu           sync.Mutex
	cancelStream context.CancelFunc // Cancel the stream. If nil, the stream is not active.
	refCount     int                // Number of interested parties.
	lrsStore     *load.Store        // Store returned to user for pushing loads.
}
    69  
// StreamOpts holds the options for creating a StreamImpl with NewStreamImpl.
type StreamOpts struct {
	Transport transport.Transport     // xDS transport to create the stream on.
	Backoff   func(int) time.Duration // Backoff for retries, after stream failures.
	NodeProto *v3corepb.Node          // Node proto to identify the gRPC application.
	LogPrefix string                  // Prefix to be used for log messages; a per-stream "[lrs-stream %p] " tag is appended to it.
}
    77  
    78  // NewStreamImpl creates a new StreamImpl with the provided options.
    79  //
    80  // The actual streaming RPC call is initiated when the first call to ReportLoad
    81  // is made, and is terminated when the last call to ReportLoad is canceled.
    82  func NewStreamImpl(opts StreamOpts) *StreamImpl {
    83  	lrs := &StreamImpl{
    84  		transport: opts.Transport,
    85  		backoff:   opts.Backoff,
    86  		nodeProto: opts.NodeProto,
    87  		lrsStore:  load.NewStore(),
    88  	}
    89  
    90  	l := grpclog.Component("xds")
    91  	lrs.logger = igrpclog.NewPrefixLogger(l, opts.LogPrefix+fmt.Sprintf("[lrs-stream %p] ", lrs))
    92  	return lrs
    93  }
    94  
    95  // ReportLoad returns a load.Store that can be used to report load, and a
    96  // cleanup function that should be called when the load reporting is no longer
    97  // needed.
    98  //
    99  // The first call to ReportLoad sets the reference count to one, and starts the
   100  // LRS streaming call. Subsequent calls increment the reference count and return
   101  // the same load.Store.
   102  //
   103  // The cleanup function decrements the reference count and stops the LRS stream
   104  // when the last reference is removed.
   105  func (lrs *StreamImpl) ReportLoad() (*load.Store, func()) {
   106  	lrs.mu.Lock()
   107  	defer lrs.mu.Unlock()
   108  
   109  	cleanup := sync.OnceFunc(func() {
   110  		lrs.mu.Lock()
   111  		defer lrs.mu.Unlock()
   112  
   113  		if lrs.refCount == 0 {
   114  			lrs.logger.Errorf("Attempting to stop already stopped StreamImpl")
   115  			return
   116  		}
   117  		lrs.refCount--
   118  		if lrs.refCount != 0 {
   119  			return
   120  		}
   121  
   122  		if lrs.cancelStream == nil {
   123  			// It is possible that Stop() is called before the cleanup function
   124  			// is called, thereby setting cancelStream to nil. Hence we need a
   125  			// nil check here bofore invoking the cancel function.
   126  			return
   127  		}
   128  		lrs.cancelStream()
   129  		lrs.cancelStream = nil
   130  		lrs.logger.Infof("Stopping StreamImpl")
   131  	})
   132  
   133  	if lrs.refCount != 0 {
   134  		lrs.refCount++
   135  		return lrs.lrsStore, cleanup
   136  	}
   137  
   138  	lrs.refCount++
   139  	ctx, cancel := context.WithCancel(context.Background())
   140  	lrs.cancelStream = cancel
   141  	lrs.doneCh = make(chan struct{})
   142  	go lrs.runner(ctx)
   143  	return lrs.lrsStore, cleanup
   144  }
   145  
   146  // runner is responsible for managing the lifetime of an LRS streaming call. It
   147  // creates the stream, sends the initial LoadStatsRequest, receives the first
   148  // LoadStatsResponse, and then starts a goroutine to periodically send
   149  // LoadStatsRequests. The runner will restart the stream if it encounters any
   150  // errors.
   151  func (lrs *StreamImpl) runner(ctx context.Context) {
   152  	defer close(lrs.doneCh)
   153  
   154  	// This feature indicates that the client supports the
   155  	// LoadStatsResponse.send_all_clusters field in the LRS response.
   156  	node := proto.Clone(lrs.nodeProto).(*v3corepb.Node)
   157  	node.ClientFeatures = append(node.ClientFeatures, "envoy.lrs.supports_send_all_clusters")
   158  
   159  	runLoadReportStream := func() error {
   160  		// streamCtx is created and canceled in case we terminate the stream
   161  		// early for any reason, to avoid gRPC-Go leaking the RPC's monitoring
   162  		// goroutine.
   163  		streamCtx, cancel := context.WithCancel(ctx)
   164  		defer cancel()
   165  
   166  		stream, err := lrs.transport.CreateStreamingCall(streamCtx, "/envoy.service.load_stats.v3.LoadReportingService/StreamLoadStats")
   167  		if err != nil {
   168  			lrs.logger.Warningf("Failed to create new LRS streaming RPC: %v", err)
   169  			return nil
   170  		}
   171  		if lrs.logger.V(2) {
   172  			lrs.logger.Infof("LRS stream created")
   173  		}
   174  
   175  		if err := lrs.sendFirstLoadStatsRequest(stream, node); err != nil {
   176  			lrs.logger.Warningf("Sending first LRS request failed: %v", err)
   177  			return nil
   178  		}
   179  
   180  		clusters, interval, err := lrs.recvFirstLoadStatsResponse(stream)
   181  		if err != nil {
   182  			lrs.logger.Warningf("Reading from LRS streaming RPC failed: %v", err)
   183  			return nil
   184  		}
   185  
   186  		// We reset backoff state when we successfully receive at least one
   187  		// message from the server.
   188  		lrs.sendLoads(streamCtx, stream, clusters, interval)
   189  		return backoff.ErrResetBackoff
   190  	}
   191  	backoff.RunF(ctx, runLoadReportStream, lrs.backoff)
   192  }
   193  
   194  // sendLoads is responsible for periodically sending load reports to the LRS
   195  // server at the specified interval for the specified clusters, until the passed
   196  // in context is canceled.
   197  func (lrs *StreamImpl) sendLoads(ctx context.Context, stream transport.StreamingCall, clusterNames []string, interval time.Duration) {
   198  	tick := time.NewTicker(interval)
   199  	defer tick.Stop()
   200  	for {
   201  		select {
   202  		case <-tick.C:
   203  		case <-ctx.Done():
   204  			return
   205  		}
   206  		if err := lrs.sendLoadStatsRequest(stream, lrs.lrsStore.Stats(clusterNames)); err != nil {
   207  			lrs.logger.Warningf("Writing to LRS stream failed: %v", err)
   208  			return
   209  		}
   210  	}
   211  }
   212  
   213  func (lrs *StreamImpl) sendFirstLoadStatsRequest(stream transport.StreamingCall, node *v3corepb.Node) error {
   214  	req := &v3lrspb.LoadStatsRequest{Node: node}
   215  	if lrs.logger.V(perRPCVerbosityLevel) {
   216  		lrs.logger.Infof("Sending initial LoadStatsRequest: %s", pretty.ToJSON(req))
   217  	}
   218  	err := stream.Send(req)
   219  	if err == io.EOF {
   220  		return getStreamError(stream)
   221  	}
   222  	return err
   223  }
   224  
   225  // recvFirstLoadStatsResponse receives the first LoadStatsResponse from the LRS
   226  // server.  Returns the following:
   227  //   - a list of cluster names requested by the server or an empty slice if the
   228  //     server requested for load from all clusters
   229  //   - the load reporting interval, and
   230  //   - any error encountered
   231  func (lrs *StreamImpl) recvFirstLoadStatsResponse(stream transport.StreamingCall) ([]string, time.Duration, error) {
   232  	r, err := stream.Recv()
   233  	if err != nil {
   234  		return nil, 0, fmt.Errorf("lrs: failed to receive first LoadStatsResponse: %v", err)
   235  	}
   236  	resp, ok := r.(*v3lrspb.LoadStatsResponse)
   237  	if !ok {
   238  		return nil, time.Duration(0), fmt.Errorf("lrs: unexpected message type %T", r)
   239  	}
   240  	if lrs.logger.V(perRPCVerbosityLevel) {
   241  		lrs.logger.Infof("Received first LoadStatsResponse: %s", pretty.ToJSON(resp))
   242  	}
   243  
   244  	internal := resp.GetLoadReportingInterval()
   245  	if internal.CheckValid() != nil {
   246  		return nil, 0, fmt.Errorf("lrs: invalid load_reporting_interval: %v", err)
   247  	}
   248  	loadReportingInterval := internal.AsDuration()
   249  
   250  	clusters := resp.Clusters
   251  	if resp.SendAllClusters {
   252  		// Return an empty slice to send stats for all clusters.
   253  		clusters = []string{}
   254  	}
   255  
   256  	return clusters, loadReportingInterval, nil
   257  }
   258  
   259  func (lrs *StreamImpl) sendLoadStatsRequest(stream transport.StreamingCall, loads []*load.Data) error {
   260  	clusterStats := make([]*v3endpointpb.ClusterStats, 0, len(loads))
   261  	for _, sd := range loads {
   262  		droppedReqs := make([]*v3endpointpb.ClusterStats_DroppedRequests, 0, len(sd.Drops))
   263  		for category, count := range sd.Drops {
   264  			droppedReqs = append(droppedReqs, &v3endpointpb.ClusterStats_DroppedRequests{
   265  				Category:     category,
   266  				DroppedCount: count,
   267  			})
   268  		}
   269  		localityStats := make([]*v3endpointpb.UpstreamLocalityStats, 0, len(sd.LocalityStats))
   270  		for l, localityData := range sd.LocalityStats {
   271  			lid, err := internal.LocalityIDFromString(l)
   272  			if err != nil {
   273  				return err
   274  			}
   275  			loadMetricStats := make([]*v3endpointpb.EndpointLoadMetricStats, 0, len(localityData.LoadStats))
   276  			for name, loadData := range localityData.LoadStats {
   277  				loadMetricStats = append(loadMetricStats, &v3endpointpb.EndpointLoadMetricStats{
   278  					MetricName:                    name,
   279  					NumRequestsFinishedWithMetric: loadData.Count,
   280  					TotalMetricValue:              loadData.Sum,
   281  				})
   282  			}
   283  			localityStats = append(localityStats, &v3endpointpb.UpstreamLocalityStats{
   284  				Locality: &v3corepb.Locality{
   285  					Region:  lid.Region,
   286  					Zone:    lid.Zone,
   287  					SubZone: lid.SubZone,
   288  				},
   289  				TotalSuccessfulRequests: localityData.RequestStats.Succeeded,
   290  				TotalRequestsInProgress: localityData.RequestStats.InProgress,
   291  				TotalErrorRequests:      localityData.RequestStats.Errored,
   292  				TotalIssuedRequests:     localityData.RequestStats.Issued,
   293  				LoadMetricStats:         loadMetricStats,
   294  				UpstreamEndpointStats:   nil, // TODO: populate for per endpoint loads.
   295  			})
   296  		}
   297  
   298  		clusterStats = append(clusterStats, &v3endpointpb.ClusterStats{
   299  			ClusterName:           sd.Cluster,
   300  			ClusterServiceName:    sd.Service,
   301  			UpstreamLocalityStats: localityStats,
   302  			TotalDroppedRequests:  sd.TotalDrops,
   303  			DroppedRequests:       droppedReqs,
   304  			LoadReportInterval:    durationpb.New(sd.ReportInterval),
   305  		})
   306  	}
   307  
   308  	req := &v3lrspb.LoadStatsRequest{ClusterStats: clusterStats}
   309  	if lrs.logger.V(perRPCVerbosityLevel) {
   310  		lrs.logger.Infof("Sending LRS loads: %s", pretty.ToJSON(req))
   311  	}
   312  	err := stream.Send(req)
   313  	if err == io.EOF {
   314  		return getStreamError(stream)
   315  	}
   316  	return err
   317  }
   318  
   319  func getStreamError(stream transport.StreamingCall) error {
   320  	for {
   321  		if _, err := stream.Recv(); err != nil {
   322  			return err
   323  		}
   324  	}
   325  }
   326  
   327  // Stop blocks until the stream is closed and all spawned goroutines exit.
   328  func (lrs *StreamImpl) Stop() {
   329  	lrs.mu.Lock()
   330  	defer lrs.mu.Unlock()
   331  
   332  	if lrs.cancelStream == nil {
   333  		return
   334  	}
   335  	lrs.cancelStream()
   336  	lrs.cancelStream = nil
   337  	lrs.logger.Infof("Stopping LRS stream")
   338  	<-lrs.doneCh
   339  }