google.golang.org/grpc@v1.62.1/xds/internal/xdsclient/transport/loadreport.go (about) 1 /* 2 * 3 * Copyright 2022 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 package transport 19 20 import ( 21 "context" 22 "errors" 23 "fmt" 24 "io" 25 "time" 26 27 "google.golang.org/grpc/internal/backoff" 28 "google.golang.org/grpc/internal/grpcsync" 29 "google.golang.org/grpc/internal/pretty" 30 "google.golang.org/grpc/xds/internal" 31 "google.golang.org/grpc/xds/internal/xdsclient/load" 32 "google.golang.org/protobuf/proto" 33 "google.golang.org/protobuf/types/known/durationpb" 34 35 v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" 36 v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3" 37 v3lrsgrpc "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3" 38 v3lrspb "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3" 39 ) 40 41 type lrsStream = v3lrsgrpc.LoadReportingService_StreamLoadStatsClient 42 43 // ReportLoad starts reporting loads to the management server the transport is 44 // configured to use. 45 // 46 // It returns a Store for the user to report loads and a function to cancel the 47 // load reporting. 48 func (t *Transport) ReportLoad() (*load.Store, func()) { 49 t.lrsStartStream() 50 return t.lrsStore, grpcsync.OnceFunc(func() { t.lrsStopStream() }) 51 } 52 53 // lrsStartStream starts an LRS stream to the server, if none exists. 54 func (t *Transport) lrsStartStream() { 55 t.lrsMu.Lock() 56 defer t.lrsMu.Unlock() 57 58 t.lrsRefCount++ 59 if t.lrsRefCount != 1 { 60 // Return early if the stream has already been started. 61 return 62 } 63 64 ctx, cancel := context.WithCancel(context.Background()) 65 t.lrsCancelStream = cancel 66 67 // Create a new done channel everytime a new stream is created. This ensures 68 // that we don't close the same channel multiple times (from lrsRunner() 69 // goroutine) when multiple streams are created and closed. 70 t.lrsRunnerDoneCh = make(chan struct{}) 71 go t.lrsRunner(ctx) 72 } 73 74 // lrsStopStream closes the LRS stream, if this is the last user of the stream. 75 func (t *Transport) lrsStopStream() { 76 t.lrsMu.Lock() 77 defer t.lrsMu.Unlock() 78 79 t.lrsRefCount-- 80 if t.lrsRefCount != 0 { 81 // Return early if the stream has other references. 82 return 83 } 84 85 t.lrsCancelStream() 86 t.logger.Infof("Stopping LRS stream") 87 88 // Wait for the runner goroutine to exit. The done channel will be 89 // recreated when a new stream is created. 90 <-t.lrsRunnerDoneCh 91 } 92 93 // lrsRunner starts an LRS stream to report load data to the management server. 94 // It reports load at constant intervals (as configured by the management 95 // server) until the context is cancelled. 96 func (t *Transport) lrsRunner(ctx context.Context) { 97 defer close(t.lrsRunnerDoneCh) 98 99 // This feature indicates that the client supports the 100 // LoadStatsResponse.send_all_clusters field in the LRS response. 101 node := proto.Clone(t.nodeProto).(*v3corepb.Node) 102 node.ClientFeatures = append(node.ClientFeatures, "envoy.lrs.supports_send_all_clusters") 103 104 runLoadReportStream := func() error { 105 // streamCtx is created and canceled in case we terminate the stream 106 // early for any reason, to avoid gRPC-Go leaking the RPC's monitoring 107 // goroutine. 108 streamCtx, cancel := context.WithCancel(ctx) 109 defer cancel() 110 stream, err := v3lrsgrpc.NewLoadReportingServiceClient(t.cc).StreamLoadStats(streamCtx) 111 if err != nil { 112 t.logger.Warningf("Creating LRS stream to server %q failed: %v", t.serverURI, err) 113 return nil 114 } 115 t.logger.Infof("Created LRS stream to server %q", t.serverURI) 116 117 if err := t.sendFirstLoadStatsRequest(stream, node); err != nil { 118 t.logger.Warningf("Sending first LRS request failed: %v", err) 119 return nil 120 } 121 122 clusters, interval, err := t.recvFirstLoadStatsResponse(stream) 123 if err != nil { 124 t.logger.Warningf("Reading from LRS stream failed: %v", err) 125 return nil 126 } 127 128 // We reset backoff state when we successfully receive at least one 129 // message from the server. 130 t.sendLoads(streamCtx, stream, clusters, interval) 131 return backoff.ErrResetBackoff 132 } 133 backoff.RunF(ctx, runLoadReportStream, t.backoff) 134 } 135 136 func (t *Transport) sendLoads(ctx context.Context, stream lrsStream, clusterNames []string, interval time.Duration) { 137 tick := time.NewTicker(interval) 138 defer tick.Stop() 139 for { 140 select { 141 case <-tick.C: 142 case <-ctx.Done(): 143 return 144 } 145 if err := t.sendLoadStatsRequest(stream, t.lrsStore.Stats(clusterNames)); err != nil { 146 t.logger.Warningf("Writing to LRS stream failed: %v", err) 147 return 148 } 149 } 150 } 151 152 func (t *Transport) sendFirstLoadStatsRequest(stream lrsStream, node *v3corepb.Node) error { 153 req := &v3lrspb.LoadStatsRequest{Node: node} 154 if t.logger.V(perRPCVerbosityLevel) { 155 t.logger.Infof("Sending initial LoadStatsRequest: %s", pretty.ToJSON(req)) 156 } 157 err := stream.Send(req) 158 if err == io.EOF { 159 return getStreamError(stream) 160 } 161 return err 162 } 163 164 func (t *Transport) recvFirstLoadStatsResponse(stream lrsStream) ([]string, time.Duration, error) { 165 resp, err := stream.Recv() 166 if err != nil { 167 return nil, 0, fmt.Errorf("failed to receive first LoadStatsResponse: %v", err) 168 } 169 if t.logger.V(perRPCVerbosityLevel) { 170 t.logger.Infof("Received first LoadStatsResponse: %s", pretty.ToJSON(resp)) 171 } 172 173 rInterval := resp.GetLoadReportingInterval() 174 if rInterval.CheckValid() != nil { 175 return nil, 0, fmt.Errorf("invalid load_reporting_interval: %v", err) 176 } 177 interval := rInterval.AsDuration() 178 179 if resp.ReportEndpointGranularity { 180 // TODO(easwars): Support per endpoint loads. 181 return nil, 0, errors.New("lrs: endpoint loads requested, but not supported by current implementation") 182 } 183 184 clusters := resp.Clusters 185 if resp.SendAllClusters { 186 // Return nil to send stats for all clusters. 187 clusters = nil 188 } 189 190 return clusters, interval, nil 191 } 192 193 func (t *Transport) sendLoadStatsRequest(stream lrsStream, loads []*load.Data) error { 194 clusterStats := make([]*v3endpointpb.ClusterStats, 0, len(loads)) 195 for _, sd := range loads { 196 droppedReqs := make([]*v3endpointpb.ClusterStats_DroppedRequests, 0, len(sd.Drops)) 197 for category, count := range sd.Drops { 198 droppedReqs = append(droppedReqs, &v3endpointpb.ClusterStats_DroppedRequests{ 199 Category: category, 200 DroppedCount: count, 201 }) 202 } 203 localityStats := make([]*v3endpointpb.UpstreamLocalityStats, 0, len(sd.LocalityStats)) 204 for l, localityData := range sd.LocalityStats { 205 lid, err := internal.LocalityIDFromString(l) 206 if err != nil { 207 return err 208 } 209 loadMetricStats := make([]*v3endpointpb.EndpointLoadMetricStats, 0, len(localityData.LoadStats)) 210 for name, loadData := range localityData.LoadStats { 211 loadMetricStats = append(loadMetricStats, &v3endpointpb.EndpointLoadMetricStats{ 212 MetricName: name, 213 NumRequestsFinishedWithMetric: loadData.Count, 214 TotalMetricValue: loadData.Sum, 215 }) 216 } 217 localityStats = append(localityStats, &v3endpointpb.UpstreamLocalityStats{ 218 Locality: &v3corepb.Locality{ 219 Region: lid.Region, 220 Zone: lid.Zone, 221 SubZone: lid.SubZone, 222 }, 223 TotalSuccessfulRequests: localityData.RequestStats.Succeeded, 224 TotalRequestsInProgress: localityData.RequestStats.InProgress, 225 TotalErrorRequests: localityData.RequestStats.Errored, 226 LoadMetricStats: loadMetricStats, 227 UpstreamEndpointStats: nil, // TODO: populate for per endpoint loads. 228 }) 229 } 230 231 clusterStats = append(clusterStats, &v3endpointpb.ClusterStats{ 232 ClusterName: sd.Cluster, 233 ClusterServiceName: sd.Service, 234 UpstreamLocalityStats: localityStats, 235 TotalDroppedRequests: sd.TotalDrops, 236 DroppedRequests: droppedReqs, 237 LoadReportInterval: durationpb.New(sd.ReportInterval), 238 }) 239 } 240 241 req := &v3lrspb.LoadStatsRequest{ClusterStats: clusterStats} 242 if t.logger.V(perRPCVerbosityLevel) { 243 t.logger.Infof("Sending LRS loads: %s", pretty.ToJSON(req)) 244 } 245 err := stream.Send(req) 246 if err == io.EOF { 247 return getStreamError(stream) 248 } 249 return err 250 } 251 252 func getStreamError(stream lrsStream) error { 253 for { 254 if _, err := stream.Recv(); err != nil { 255 return err 256 } 257 } 258 }