google.golang.org/grpc@v1.62.1/xds/internal/xdsclient/transport/loadreport.go

google.golang.org/grpc@v1.62.1/xds/internal/xdsclient/transport/loadreport.go (about)

     1  /*
     2   *
     3   * Copyright 2022 gRPC authors.
     4   *
     5   * Licensed under the Apache License, Version 2.0 (the "License");
     6   * you may not use this file except in compliance with the License.
     7   * You may obtain a copy of the License at
     8   *
     9   *     http://www.apache.org/licenses/LICENSE-2.0
    10   *
    11   * Unless required by applicable law or agreed to in writing, software
    12   * distributed under the License is distributed on an "AS IS" BASIS,
    13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    14   * See the License for the specific language governing permissions and
    15   * limitations under the License.
    16   */
    17  
    18  package transport
    19  
    20  import (
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"io"
    25  	"time"
    26  
    27  	"google.golang.org/grpc/internal/backoff"
    28  	"google.golang.org/grpc/internal/grpcsync"
    29  	"google.golang.org/grpc/internal/pretty"
    30  	"google.golang.org/grpc/xds/internal"
    31  	"google.golang.org/grpc/xds/internal/xdsclient/load"
    32  	"google.golang.org/protobuf/proto"
    33  	"google.golang.org/protobuf/types/known/durationpb"
    34  
    35  	v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3"
    36  	v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3"
    37  	v3lrsgrpc "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3"
    38  	v3lrspb "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3"
    39  )
    40  
// lrsStream is a shorthand alias for the generated client-side stream type of
// the LoadReportingService StreamLoadStats RPC. The helpers in this file accept
// it so that they can Send requests and Recv responses on an LRS stream.
type lrsStream = v3lrsgrpc.LoadReportingService_StreamLoadStatsClient
    42  
    43  // ReportLoad starts reporting loads to the management server the transport is
    44  // configured to use.
    45  //
    46  // It returns a Store for the user to report loads and a function to cancel the
    47  // load reporting.
    48  func (t *Transport) ReportLoad() (*load.Store, func()) {
    49  	t.lrsStartStream()
    50  	return t.lrsStore, grpcsync.OnceFunc(func() { t.lrsStopStream() })
    51  }
    52  
    53  // lrsStartStream starts an LRS stream to the server, if none exists.
    54  func (t *Transport) lrsStartStream() {
    55  	t.lrsMu.Lock()
    56  	defer t.lrsMu.Unlock()
    57  
    58  	t.lrsRefCount++
    59  	if t.lrsRefCount != 1 {
    60  		// Return early if the stream has already been started.
    61  		return
    62  	}
    63  
    64  	ctx, cancel := context.WithCancel(context.Background())
    65  	t.lrsCancelStream = cancel
    66  
    67  	// Create a new done channel everytime a new stream is created. This ensures
    68  	// that we don't close the same channel multiple times (from lrsRunner()
    69  	// goroutine) when multiple streams are created and closed.
    70  	t.lrsRunnerDoneCh = make(chan struct{})
    71  	go t.lrsRunner(ctx)
    72  }
    73  
    74  // lrsStopStream closes the LRS stream, if this is the last user of the stream.
    75  func (t *Transport) lrsStopStream() {
    76  	t.lrsMu.Lock()
    77  	defer t.lrsMu.Unlock()
    78  
    79  	t.lrsRefCount--
    80  	if t.lrsRefCount != 0 {
    81  		// Return early if the stream has other references.
    82  		return
    83  	}
    84  
    85  	t.lrsCancelStream()
    86  	t.logger.Infof("Stopping LRS stream")
    87  
    88  	// Wait for the runner goroutine to exit. The done channel will be
    89  	// recreated when a new stream is created.
    90  	<-t.lrsRunnerDoneCh
    91  }
    92  
    93  // lrsRunner starts an LRS stream to report load data to the management server.
    94  // It reports load at constant intervals (as configured by the management
    95  // server) until the context is cancelled.
    96  func (t *Transport) lrsRunner(ctx context.Context) {
    97  	defer close(t.lrsRunnerDoneCh)
    98  
    99  	// This feature indicates that the client supports the
   100  	// LoadStatsResponse.send_all_clusters field in the LRS response.
   101  	node := proto.Clone(t.nodeProto).(*v3corepb.Node)
   102  	node.ClientFeatures = append(node.ClientFeatures, "envoy.lrs.supports_send_all_clusters")
   103  
   104  	runLoadReportStream := func() error {
   105  		// streamCtx is created and canceled in case we terminate the stream
   106  		// early for any reason, to avoid gRPC-Go leaking the RPC's monitoring
   107  		// goroutine.
   108  		streamCtx, cancel := context.WithCancel(ctx)
   109  		defer cancel()
   110  		stream, err := v3lrsgrpc.NewLoadReportingServiceClient(t.cc).StreamLoadStats(streamCtx)
   111  		if err != nil {
   112  			t.logger.Warningf("Creating LRS stream to server %q failed: %v", t.serverURI, err)
   113  			return nil
   114  		}
   115  		t.logger.Infof("Created LRS stream to server %q", t.serverURI)
   116  
   117  		if err := t.sendFirstLoadStatsRequest(stream, node); err != nil {
   118  			t.logger.Warningf("Sending first LRS request failed: %v", err)
   119  			return nil
   120  		}
   121  
   122  		clusters, interval, err := t.recvFirstLoadStatsResponse(stream)
   123  		if err != nil {
   124  			t.logger.Warningf("Reading from LRS stream failed: %v", err)
   125  			return nil
   126  		}
   127  
   128  		// We reset backoff state when we successfully receive at least one
   129  		// message from the server.
   130  		t.sendLoads(streamCtx, stream, clusters, interval)
   131  		return backoff.ErrResetBackoff
   132  	}
   133  	backoff.RunF(ctx, runLoadReportStream, t.backoff)
   134  }
   135  
   136  func (t *Transport) sendLoads(ctx context.Context, stream lrsStream, clusterNames []string, interval time.Duration) {
   137  	tick := time.NewTicker(interval)
   138  	defer tick.Stop()
   139  	for {
   140  		select {
   141  		case <-tick.C:
   142  		case <-ctx.Done():
   143  			return
   144  		}
   145  		if err := t.sendLoadStatsRequest(stream, t.lrsStore.Stats(clusterNames)); err != nil {
   146  			t.logger.Warningf("Writing to LRS stream failed: %v", err)
   147  			return
   148  		}
   149  	}
   150  }
   151  
   152  func (t *Transport) sendFirstLoadStatsRequest(stream lrsStream, node *v3corepb.Node) error {
   153  	req := &v3lrspb.LoadStatsRequest{Node: node}
   154  	if t.logger.V(perRPCVerbosityLevel) {
   155  		t.logger.Infof("Sending initial LoadStatsRequest: %s", pretty.ToJSON(req))
   156  	}
   157  	err := stream.Send(req)
   158  	if err == io.EOF {
   159  		return getStreamError(stream)
   160  	}
   161  	return err
   162  }
   163  
   164  func (t *Transport) recvFirstLoadStatsResponse(stream lrsStream) ([]string, time.Duration, error) {
   165  	resp, err := stream.Recv()
   166  	if err != nil {
   167  		return nil, 0, fmt.Errorf("failed to receive first LoadStatsResponse: %v", err)
   168  	}
   169  	if t.logger.V(perRPCVerbosityLevel) {
   170  		t.logger.Infof("Received first LoadStatsResponse: %s", pretty.ToJSON(resp))
   171  	}
   172  
   173  	rInterval := resp.GetLoadReportingInterval()
   174  	if rInterval.CheckValid() != nil {
   175  		return nil, 0, fmt.Errorf("invalid load_reporting_interval: %v", err)
   176  	}
   177  	interval := rInterval.AsDuration()
   178  
   179  	if resp.ReportEndpointGranularity {
   180  		// TODO(easwars): Support per endpoint loads.
   181  		return nil, 0, errors.New("lrs: endpoint loads requested, but not supported by current implementation")
   182  	}
   183  
   184  	clusters := resp.Clusters
   185  	if resp.SendAllClusters {
   186  		// Return nil to send stats for all clusters.
   187  		clusters = nil
   188  	}
   189  
   190  	return clusters, interval, nil
   191  }
   192  
   193  func (t *Transport) sendLoadStatsRequest(stream lrsStream, loads []*load.Data) error {
   194  	clusterStats := make([]*v3endpointpb.ClusterStats, 0, len(loads))
   195  	for _, sd := range loads {
   196  		droppedReqs := make([]*v3endpointpb.ClusterStats_DroppedRequests, 0, len(sd.Drops))
   197  		for category, count := range sd.Drops {
   198  			droppedReqs = append(droppedReqs, &v3endpointpb.ClusterStats_DroppedRequests{
   199  				Category:     category,
   200  				DroppedCount: count,
   201  			})
   202  		}
   203  		localityStats := make([]*v3endpointpb.UpstreamLocalityStats, 0, len(sd.LocalityStats))
   204  		for l, localityData := range sd.LocalityStats {
   205  			lid, err := internal.LocalityIDFromString(l)
   206  			if err != nil {
   207  				return err
   208  			}
   209  			loadMetricStats := make([]*v3endpointpb.EndpointLoadMetricStats, 0, len(localityData.LoadStats))
   210  			for name, loadData := range localityData.LoadStats {
   211  				loadMetricStats = append(loadMetricStats, &v3endpointpb.EndpointLoadMetricStats{
   212  					MetricName:                    name,
   213  					NumRequestsFinishedWithMetric: loadData.Count,
   214  					TotalMetricValue:              loadData.Sum,
   215  				})
   216  			}
   217  			localityStats = append(localityStats, &v3endpointpb.UpstreamLocalityStats{
   218  				Locality: &v3corepb.Locality{
   219  					Region:  lid.Region,
   220  					Zone:    lid.Zone,
   221  					SubZone: lid.SubZone,
   222  				},
   223  				TotalSuccessfulRequests: localityData.RequestStats.Succeeded,
   224  				TotalRequestsInProgress: localityData.RequestStats.InProgress,
   225  				TotalErrorRequests:      localityData.RequestStats.Errored,
   226  				LoadMetricStats:         loadMetricStats,
   227  				UpstreamEndpointStats:   nil, // TODO: populate for per endpoint loads.
   228  			})
   229  		}
   230  
   231  		clusterStats = append(clusterStats, &v3endpointpb.ClusterStats{
   232  			ClusterName:           sd.Cluster,
   233  			ClusterServiceName:    sd.Service,
   234  			UpstreamLocalityStats: localityStats,
   235  			TotalDroppedRequests:  sd.TotalDrops,
   236  			DroppedRequests:       droppedReqs,
   237  			LoadReportInterval:    durationpb.New(sd.ReportInterval),
   238  		})
   239  	}
   240  
   241  	req := &v3lrspb.LoadStatsRequest{ClusterStats: clusterStats}
   242  	if t.logger.V(perRPCVerbosityLevel) {
   243  		t.logger.Infof("Sending LRS loads: %s", pretty.ToJSON(req))
   244  	}
   245  	err := stream.Send(req)
   246  	if err == io.EOF {
   247  		return getStreamError(stream)
   248  	}
   249  	return err
   250  }
   251  
   252  func getStreamError(stream lrsStream) error {
   253  	for {
   254  		if _, err := stream.Recv(); err != nil {
   255  			return err
   256  		}
   257  	}
   258  }