google.golang.org/grpc@v1.72.2/xds/internal/xdsclient/transport/lrs/lrs_stream.go (about) 1 /* 2 * 3 * Copyright 2024 gRPC authors. 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 */ 17 18 // Package lrs provides the implementation of an LRS (Load Reporting Service) 19 // stream for the xDS client. 20 package lrs 21 22 import ( 23 "context" 24 "fmt" 25 "io" 26 "sync" 27 "time" 28 29 "google.golang.org/grpc/grpclog" 30 "google.golang.org/grpc/internal/backoff" 31 igrpclog "google.golang.org/grpc/internal/grpclog" 32 "google.golang.org/grpc/internal/pretty" 33 "google.golang.org/grpc/xds/internal" 34 "google.golang.org/grpc/xds/internal/xdsclient/load" 35 "google.golang.org/grpc/xds/internal/xdsclient/transport" 36 "google.golang.org/protobuf/proto" 37 "google.golang.org/protobuf/types/known/durationpb" 38 39 v3corepb "github.com/envoyproxy/go-control-plane/envoy/config/core/v3" 40 v3endpointpb "github.com/envoyproxy/go-control-plane/envoy/config/endpoint/v3" 41 v3lrspb "github.com/envoyproxy/go-control-plane/envoy/service/load_stats/v3" 42 ) 43 44 // Any per-RPC level logs which print complete request or response messages 45 // should be gated at this verbosity level. Other per-RPC level logs which print 46 // terse output should be at `INFO` and verbosity 2. 47 const perRPCVerbosityLevel = 9 48 49 // StreamImpl provides all the functionality associated with an LRS (Load Reporting 50 // Service) stream on the client-side. It manages the lifecycle of the LRS stream, 51 // including starting, stopping, and retrying the stream. It also provides a 52 // load.Store that can be used to report load, and a cleanup function that should 53 // be called when the load reporting is no longer needed. 54 type StreamImpl struct { 55 // The following fields are initialized when a Stream instance is created 56 // and are read-only afterwards, and hence can be accessed without a mutex. 57 transport transport.Transport // Transport to use for LRS stream. 58 backoff func(int) time.Duration // Backoff for retries, after stream failures. 59 nodeProto *v3corepb.Node // Identifies the gRPC application. 60 doneCh chan struct{} // To notify exit of LRS goroutine. 61 logger *igrpclog.PrefixLogger 62 63 // Guards access to the below fields. 64 mu sync.Mutex 65 cancelStream context.CancelFunc // Cancel the stream. If nil, the stream is not active. 66 refCount int // Number of interested parties. 67 lrsStore *load.Store // Store returned to user for pushing loads. 68 } 69 70 // StreamOpts holds the options for creating an lrsStream. 71 type StreamOpts struct { 72 Transport transport.Transport // xDS transport to create the stream on. 73 Backoff func(int) time.Duration // Backoff for retries, after stream failures. 74 NodeProto *v3corepb.Node // Node proto to identify the gRPC application. 75 LogPrefix string // Prefix to be used for log messages. 76 } 77 78 // NewStreamImpl creates a new StreamImpl with the provided options. 79 // 80 // The actual streaming RPC call is initiated when the first call to ReportLoad 81 // is made, and is terminated when the last call to ReportLoad is canceled. 82 func NewStreamImpl(opts StreamOpts) *StreamImpl { 83 lrs := &StreamImpl{ 84 transport: opts.Transport, 85 backoff: opts.Backoff, 86 nodeProto: opts.NodeProto, 87 lrsStore: load.NewStore(), 88 } 89 90 l := grpclog.Component("xds") 91 lrs.logger = igrpclog.NewPrefixLogger(l, opts.LogPrefix+fmt.Sprintf("[lrs-stream %p] ", lrs)) 92 return lrs 93 } 94 95 // ReportLoad returns a load.Store that can be used to report load, and a 96 // cleanup function that should be called when the load reporting is no longer 97 // needed. 98 // 99 // The first call to ReportLoad sets the reference count to one, and starts the 100 // LRS streaming call. Subsequent calls increment the reference count and return 101 // the same load.Store. 102 // 103 // The cleanup function decrements the reference count and stops the LRS stream 104 // when the last reference is removed. 105 func (lrs *StreamImpl) ReportLoad() (*load.Store, func()) { 106 lrs.mu.Lock() 107 defer lrs.mu.Unlock() 108 109 cleanup := sync.OnceFunc(func() { 110 lrs.mu.Lock() 111 defer lrs.mu.Unlock() 112 113 if lrs.refCount == 0 { 114 lrs.logger.Errorf("Attempting to stop already stopped StreamImpl") 115 return 116 } 117 lrs.refCount-- 118 if lrs.refCount != 0 { 119 return 120 } 121 122 if lrs.cancelStream == nil { 123 // It is possible that Stop() is called before the cleanup function 124 // is called, thereby setting cancelStream to nil. Hence we need a 125 // nil check here bofore invoking the cancel function. 126 return 127 } 128 lrs.cancelStream() 129 lrs.cancelStream = nil 130 lrs.logger.Infof("Stopping StreamImpl") 131 }) 132 133 if lrs.refCount != 0 { 134 lrs.refCount++ 135 return lrs.lrsStore, cleanup 136 } 137 138 lrs.refCount++ 139 ctx, cancel := context.WithCancel(context.Background()) 140 lrs.cancelStream = cancel 141 lrs.doneCh = make(chan struct{}) 142 go lrs.runner(ctx) 143 return lrs.lrsStore, cleanup 144 } 145 146 // runner is responsible for managing the lifetime of an LRS streaming call. It 147 // creates the stream, sends the initial LoadStatsRequest, receives the first 148 // LoadStatsResponse, and then starts a goroutine to periodically send 149 // LoadStatsRequests. The runner will restart the stream if it encounters any 150 // errors. 151 func (lrs *StreamImpl) runner(ctx context.Context) { 152 defer close(lrs.doneCh) 153 154 // This feature indicates that the client supports the 155 // LoadStatsResponse.send_all_clusters field in the LRS response. 156 node := proto.Clone(lrs.nodeProto).(*v3corepb.Node) 157 node.ClientFeatures = append(node.ClientFeatures, "envoy.lrs.supports_send_all_clusters") 158 159 runLoadReportStream := func() error { 160 // streamCtx is created and canceled in case we terminate the stream 161 // early for any reason, to avoid gRPC-Go leaking the RPC's monitoring 162 // goroutine. 163 streamCtx, cancel := context.WithCancel(ctx) 164 defer cancel() 165 166 stream, err := lrs.transport.CreateStreamingCall(streamCtx, "/envoy.service.load_stats.v3.LoadReportingService/StreamLoadStats") 167 if err != nil { 168 lrs.logger.Warningf("Failed to create new LRS streaming RPC: %v", err) 169 return nil 170 } 171 if lrs.logger.V(2) { 172 lrs.logger.Infof("LRS stream created") 173 } 174 175 if err := lrs.sendFirstLoadStatsRequest(stream, node); err != nil { 176 lrs.logger.Warningf("Sending first LRS request failed: %v", err) 177 return nil 178 } 179 180 clusters, interval, err := lrs.recvFirstLoadStatsResponse(stream) 181 if err != nil { 182 lrs.logger.Warningf("Reading from LRS streaming RPC failed: %v", err) 183 return nil 184 } 185 186 // We reset backoff state when we successfully receive at least one 187 // message from the server. 188 lrs.sendLoads(streamCtx, stream, clusters, interval) 189 return backoff.ErrResetBackoff 190 } 191 backoff.RunF(ctx, runLoadReportStream, lrs.backoff) 192 } 193 194 // sendLoads is responsible for periodically sending load reports to the LRS 195 // server at the specified interval for the specified clusters, until the passed 196 // in context is canceled. 197 func (lrs *StreamImpl) sendLoads(ctx context.Context, stream transport.StreamingCall, clusterNames []string, interval time.Duration) { 198 tick := time.NewTicker(interval) 199 defer tick.Stop() 200 for { 201 select { 202 case <-tick.C: 203 case <-ctx.Done(): 204 return 205 } 206 if err := lrs.sendLoadStatsRequest(stream, lrs.lrsStore.Stats(clusterNames)); err != nil { 207 lrs.logger.Warningf("Writing to LRS stream failed: %v", err) 208 return 209 } 210 } 211 } 212 213 func (lrs *StreamImpl) sendFirstLoadStatsRequest(stream transport.StreamingCall, node *v3corepb.Node) error { 214 req := &v3lrspb.LoadStatsRequest{Node: node} 215 if lrs.logger.V(perRPCVerbosityLevel) { 216 lrs.logger.Infof("Sending initial LoadStatsRequest: %s", pretty.ToJSON(req)) 217 } 218 err := stream.Send(req) 219 if err == io.EOF { 220 return getStreamError(stream) 221 } 222 return err 223 } 224 225 // recvFirstLoadStatsResponse receives the first LoadStatsResponse from the LRS 226 // server. Returns the following: 227 // - a list of cluster names requested by the server or an empty slice if the 228 // server requested for load from all clusters 229 // - the load reporting interval, and 230 // - any error encountered 231 func (lrs *StreamImpl) recvFirstLoadStatsResponse(stream transport.StreamingCall) ([]string, time.Duration, error) { 232 r, err := stream.Recv() 233 if err != nil { 234 return nil, 0, fmt.Errorf("lrs: failed to receive first LoadStatsResponse: %v", err) 235 } 236 resp, ok := r.(*v3lrspb.LoadStatsResponse) 237 if !ok { 238 return nil, time.Duration(0), fmt.Errorf("lrs: unexpected message type %T", r) 239 } 240 if lrs.logger.V(perRPCVerbosityLevel) { 241 lrs.logger.Infof("Received first LoadStatsResponse: %s", pretty.ToJSON(resp)) 242 } 243 244 internal := resp.GetLoadReportingInterval() 245 if internal.CheckValid() != nil { 246 return nil, 0, fmt.Errorf("lrs: invalid load_reporting_interval: %v", err) 247 } 248 loadReportingInterval := internal.AsDuration() 249 250 clusters := resp.Clusters 251 if resp.SendAllClusters { 252 // Return an empty slice to send stats for all clusters. 253 clusters = []string{} 254 } 255 256 return clusters, loadReportingInterval, nil 257 } 258 259 func (lrs *StreamImpl) sendLoadStatsRequest(stream transport.StreamingCall, loads []*load.Data) error { 260 clusterStats := make([]*v3endpointpb.ClusterStats, 0, len(loads)) 261 for _, sd := range loads { 262 droppedReqs := make([]*v3endpointpb.ClusterStats_DroppedRequests, 0, len(sd.Drops)) 263 for category, count := range sd.Drops { 264 droppedReqs = append(droppedReqs, &v3endpointpb.ClusterStats_DroppedRequests{ 265 Category: category, 266 DroppedCount: count, 267 }) 268 } 269 localityStats := make([]*v3endpointpb.UpstreamLocalityStats, 0, len(sd.LocalityStats)) 270 for l, localityData := range sd.LocalityStats { 271 lid, err := internal.LocalityIDFromString(l) 272 if err != nil { 273 return err 274 } 275 loadMetricStats := make([]*v3endpointpb.EndpointLoadMetricStats, 0, len(localityData.LoadStats)) 276 for name, loadData := range localityData.LoadStats { 277 loadMetricStats = append(loadMetricStats, &v3endpointpb.EndpointLoadMetricStats{ 278 MetricName: name, 279 NumRequestsFinishedWithMetric: loadData.Count, 280 TotalMetricValue: loadData.Sum, 281 }) 282 } 283 localityStats = append(localityStats, &v3endpointpb.UpstreamLocalityStats{ 284 Locality: &v3corepb.Locality{ 285 Region: lid.Region, 286 Zone: lid.Zone, 287 SubZone: lid.SubZone, 288 }, 289 TotalSuccessfulRequests: localityData.RequestStats.Succeeded, 290 TotalRequestsInProgress: localityData.RequestStats.InProgress, 291 TotalErrorRequests: localityData.RequestStats.Errored, 292 TotalIssuedRequests: localityData.RequestStats.Issued, 293 LoadMetricStats: loadMetricStats, 294 UpstreamEndpointStats: nil, // TODO: populate for per endpoint loads. 295 }) 296 } 297 298 clusterStats = append(clusterStats, &v3endpointpb.ClusterStats{ 299 ClusterName: sd.Cluster, 300 ClusterServiceName: sd.Service, 301 UpstreamLocalityStats: localityStats, 302 TotalDroppedRequests: sd.TotalDrops, 303 DroppedRequests: droppedReqs, 304 LoadReportInterval: durationpb.New(sd.ReportInterval), 305 }) 306 } 307 308 req := &v3lrspb.LoadStatsRequest{ClusterStats: clusterStats} 309 if lrs.logger.V(perRPCVerbosityLevel) { 310 lrs.logger.Infof("Sending LRS loads: %s", pretty.ToJSON(req)) 311 } 312 err := stream.Send(req) 313 if err == io.EOF { 314 return getStreamError(stream) 315 } 316 return err 317 } 318 319 func getStreamError(stream transport.StreamingCall) error { 320 for { 321 if _, err := stream.Recv(); err != nil { 322 return err 323 } 324 } 325 } 326 327 // Stop blocks until the stream is closed and all spawned goroutines exit. 328 func (lrs *StreamImpl) Stop() { 329 lrs.mu.Lock() 330 defer lrs.mu.Unlock() 331 332 if lrs.cancelStream == nil { 333 return 334 } 335 lrs.cancelStream() 336 lrs.cancelStream = nil 337 lrs.logger.Infof("Stopping LRS stream") 338 <-lrs.doneCh 339 }