google.golang.org/grpc@v1.62.1/orca/producer_test.go (about)

     1  /*
     2   * Copyright 2022 gRPC authors.
     3   *
     4   * Licensed under the Apache License, Version 2.0 (the "License");
     5   * you may not use this file except in compliance with the License.
     6   * You may obtain a copy of the License at
     7   *
     8   *     http://www.apache.org/licenses/LICENSE-2.0
     9   *
    10   * Unless required by applicable law or agreed to in writing, software
    11   * distributed under the License is distributed on an "AS IS" BASIS,
    12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13   * See the License for the specific language governing permissions and
    14   * limitations under the License.
    15   */
    16  
    17  package orca_test
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"testing"
    24  	"time"
    25  
    26  	"google.golang.org/grpc"
    27  	"google.golang.org/grpc/balancer"
    28  	"google.golang.org/grpc/balancer/roundrobin"
    29  	"google.golang.org/grpc/codes"
    30  	"google.golang.org/grpc/credentials/insecure"
    31  	"google.golang.org/grpc/internal/grpctest"
    32  	"google.golang.org/grpc/internal/testutils"
    33  	"google.golang.org/grpc/orca"
    34  	"google.golang.org/grpc/orca/internal"
    35  	"google.golang.org/grpc/resolver"
    36  	"google.golang.org/grpc/resolver/manual"
    37  	"google.golang.org/grpc/status"
    38  	"google.golang.org/protobuf/proto"
    39  
    40  	v3orcapb "github.com/cncf/xds/go/xds/data/orca/v3"
    41  	v3orcaservicegrpc "github.com/cncf/xds/go/xds/service/orca/v3"
    42  	v3orcaservicepb "github.com/cncf/xds/go/xds/service/orca/v3"
    43  )
    44  
// customLBB is a balancer builder that wraps the round robin LB policy. It
// hands that policy a ClientConn wrapper (ccWrapper) which registers an ORCA
// OOB listener on every SubConn it creates.
type customLBB struct{}
    48  
    49  func (customLBB) Build(cc balancer.ClientConn, opts balancer.BuildOptions) balancer.Balancer {
    50  	return balancer.Get(roundrobin.Name).Build(&ccWrapper{ClientConn: cc}, opts)
    51  }
    52  
    53  func (customLBB) Name() string { return "customLB" }
    54  
    55  func init() {
    56  	balancer.Register(customLBB{})
    57  }
    58  
// ccWrapper embeds a balancer.ClientConn and overrides NewSubConn so that an
// ORCA OOB listener is registered on each SubConn created through it.
type ccWrapper struct {
	balancer.ClientConn
}
    62  
    63  func (w *ccWrapper) NewSubConn(addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
    64  	if len(addrs) != 1 {
    65  		panic(fmt.Sprintf("got addrs=%v; want len(addrs) == 1", addrs))
    66  	}
    67  	sc, err := w.ClientConn.NewSubConn(addrs, opts)
    68  	if err != nil {
    69  		return sc, err
    70  	}
    71  	l := getListenerInfo(addrs[0])
    72  	l.listener.cleanup = orca.RegisterOOBListener(sc, l.listener, l.opts)
    73  	l.sc = sc
    74  	return sc, nil
    75  }
    76  
// listenerInfo is stored in an address's attributes to allow ORCA
// listeners to be registered on subconns created for that address.
type listenerInfo struct {
	// listener is the OOB listener registered on the SubConn.
	listener *testOOBListener
	// opts are the options passed along when registering listener.
	opts orca.OOBListenerOptions
	// sc is the SubConn created for the address; set by ccWrapper.NewSubConn.
	sc balancer.SubConn
}
    84  
// listenerInfoKey is the attribute key under which a *listenerInfo is stored
// in a resolver.Address.
type listenerInfoKey struct{}
    86  
    87  func setListenerInfo(addr resolver.Address, l *listenerInfo) resolver.Address {
    88  	addr.Attributes = addr.Attributes.WithValue(listenerInfoKey{}, l)
    89  	return addr
    90  }
    91  
    92  func getListenerInfo(addr resolver.Address) *listenerInfo {
    93  	return addr.Attributes.Value(listenerInfoKey{}).(*listenerInfo)
    94  }
    95  
// testOOBListener is a simple listener that pushes load reports to a channel.
type testOOBListener struct {
	// cleanup is invoked by Stop. It starts as a no-op and is replaced with
	// the function returned by orca.RegisterOOBListener once registered.
	cleanup func()
	// loadReportCh receives every report passed to OnLoadReport. It is
	// unbuffered, so OnLoadReport blocks until the test reads the report.
	loadReportCh chan *v3orcapb.OrcaLoadReport
}
   101  
   102  func newTestOOBListener() *testOOBListener {
   103  	return &testOOBListener{cleanup: func() {}, loadReportCh: make(chan *v3orcapb.OrcaLoadReport)}
   104  }
   105  
   106  func (t *testOOBListener) Stop() { t.cleanup() }
   107  
   108  func (t *testOOBListener) OnLoadReport(r *v3orcapb.OrcaLoadReport) {
   109  	t.loadReportCh <- r
   110  }
   111  
   112  // TestProducer is a basic, end-to-end style test of an LB policy with an
   113  // OOBListener communicating with a server with an ORCA service.
   114  func (s) TestProducer(t *testing.T) {
   115  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   116  	defer cancel()
   117  
   118  	// Use a fixed backoff for stream recreation.
   119  	oldBackoff := internal.DefaultBackoffFunc
   120  	internal.DefaultBackoffFunc = func(int) time.Duration { return 10 * time.Millisecond }
   121  	defer func() { internal.DefaultBackoffFunc = oldBackoff }()
   122  
   123  	// Initialize listener for our ORCA server.
   124  	lis, err := testutils.LocalTCPListener()
   125  	if err != nil {
   126  		t.Fatal(err)
   127  	}
   128  
   129  	// Register the OpenRCAService with a very short metrics reporting interval.
   130  	const shortReportingInterval = 50 * time.Millisecond
   131  	smr := orca.NewServerMetricsRecorder()
   132  	opts := orca.ServiceOptions{MinReportingInterval: shortReportingInterval, ServerMetricsProvider: smr}
   133  	internal.AllowAnyMinReportingInterval.(func(*orca.ServiceOptions))(&opts)
   134  	s := grpc.NewServer()
   135  	if err := orca.Register(s, opts); err != nil {
   136  		t.Fatalf("orca.Register failed: %v", err)
   137  	}
   138  	go s.Serve(lis)
   139  	defer s.Stop()
   140  
   141  	// Create our client with an OOB listener in the LB policy it selects.
   142  	r := manual.NewBuilderWithScheme("whatever")
   143  	oobLis := newTestOOBListener()
   144  
   145  	lisOpts := orca.OOBListenerOptions{ReportInterval: 50 * time.Millisecond}
   146  	li := &listenerInfo{listener: oobLis, opts: lisOpts}
   147  	addr := setListenerInfo(resolver.Address{Addr: lis.Addr().String()}, li)
   148  	r.InitialState(resolver.State{Addresses: []resolver.Address{addr}})
   149  	cc, err := grpc.Dial("whatever:///whatever", grpc.WithDefaultServiceConfig(`{"loadBalancingConfig": [{"customLB":{}}]}`), grpc.WithResolvers(r), grpc.WithTransportCredentials(insecure.NewCredentials()))
   150  	if err != nil {
   151  		t.Fatalf("grpc.Dial failed: %v", err)
   152  	}
   153  	defer cc.Close()
   154  
   155  	// Ensure the OOB listener is stopped before the client is closed to avoid
   156  	// a potential irrelevant error in the logs.
   157  	defer oobLis.Stop()
   158  
   159  	// Set a few metrics and wait for them on the client side.
   160  	smr.SetCPUUtilization(10)
   161  	smr.SetMemoryUtilization(0.1)
   162  	smr.SetNamedUtilization("bob", 0.555)
   163  	loadReportWant := &v3orcapb.OrcaLoadReport{
   164  		CpuUtilization: 10,
   165  		MemUtilization: 0.1,
   166  		Utilization:    map[string]float64{"bob": 0.555},
   167  	}
   168  
   169  testReport:
   170  	for {
   171  		select {
   172  		case r := <-oobLis.loadReportCh:
   173  			t.Log("Load report received: ", r)
   174  			if proto.Equal(r, loadReportWant) {
   175  				// Success!
   176  				break testReport
   177  			}
   178  		case <-ctx.Done():
   179  			t.Fatalf("timed out waiting for load report: %v", loadReportWant)
   180  		}
   181  	}
   182  
   183  	// Change and add metrics and wait for them on the client side.
   184  	smr.SetCPUUtilization(0.5)
   185  	smr.SetMemoryUtilization(0.2)
   186  	smr.SetNamedUtilization("mary", 0.321)
   187  	loadReportWant = &v3orcapb.OrcaLoadReport{
   188  		CpuUtilization: 0.5,
   189  		MemUtilization: 0.2,
   190  		Utilization:    map[string]float64{"bob": 0.555, "mary": 0.321},
   191  	}
   192  
   193  	for {
   194  		select {
   195  		case r := <-oobLis.loadReportCh:
   196  			t.Log("Load report received: ", r)
   197  			if proto.Equal(r, loadReportWant) {
   198  				// Success!
   199  				return
   200  			}
   201  		case <-ctx.Done():
   202  			t.Fatalf("timed out waiting for load report: %v", loadReportWant)
   203  		}
   204  	}
   205  }
   206  
// fakeORCAService is a simple implementation of an ORCA service that pushes
// requests it receives from clients to a channel and sends responses from a
// channel back.  This allows tests to verify the client is sending requests
// and processing responses properly.
type fakeORCAService struct {
	v3orcaservicegrpc.UnimplementedOpenRcaServiceServer

	// reqCh receives the request of every StreamCoreMetrics call.
	reqCh chan *v3orcaservicepb.OrcaLoadReportRequest
	// respCh supplies what the active stream does next: a load report is
	// sent to the client, an error terminates the stream with that error.
	respCh chan any // either *v3orcapb.OrcaLoadReport or error
}
   217  
   218  func newFakeORCAService() *fakeORCAService {
   219  	return &fakeORCAService{
   220  		reqCh:  make(chan *v3orcaservicepb.OrcaLoadReportRequest),
   221  		respCh: make(chan any),
   222  	}
   223  }
   224  
   225  func (f *fakeORCAService) close() {
   226  	close(f.respCh)
   227  }
   228  
   229  func (f *fakeORCAService) StreamCoreMetrics(req *v3orcaservicepb.OrcaLoadReportRequest, stream v3orcaservicegrpc.OpenRcaService_StreamCoreMetricsServer) error {
   230  	f.reqCh <- req
   231  	for {
   232  		var resp any
   233  		select {
   234  		case resp = <-f.respCh:
   235  		case <-stream.Context().Done():
   236  			return stream.Context().Err()
   237  		}
   238  
   239  		if err, ok := resp.(error); ok {
   240  			return err
   241  		}
   242  		if err := stream.Send(resp.(*v3orcapb.OrcaLoadReport)); err != nil {
   243  			// In the event that a stream error occurs, a new stream will have
   244  			// been created that was waiting for this response message.  Push
   245  			// it back onto the channel and return.
   246  			//
   247  			// This happens because we range over respCh.  If we changed to
   248  			// instead select on respCh + stream.Context(), the same situation
   249  			// could still occur due to a race between noticing the two events,
   250  			// so such a workaround would still be needed to prevent flakiness.
   251  			f.respCh <- resp
   252  			return err
   253  		}
   254  	}
   255  }
   256  
   257  // TestProducerBackoff verifies that the ORCA producer applies the proper
   258  // backoff after stream failures.
   259  func (s) TestProducerBackoff(t *testing.T) {
   260  	grpctest.TLogger.ExpectErrorN("injected error", 4)
   261  
   262  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   263  	defer cancel()
   264  
   265  	// Provide a convenient way to expect backoff calls and return a minimal
   266  	// value.
   267  	const backoffShouldNotBeCalled = 9999 // Use to assert backoff function is not called.
   268  	const backoffAllowAny = -1            // Use to ignore any backoff calls.
   269  	expectedBackoff := backoffAllowAny
   270  	oldBackoff := internal.DefaultBackoffFunc
   271  	internal.DefaultBackoffFunc = func(got int) time.Duration {
   272  		if expectedBackoff == backoffShouldNotBeCalled {
   273  			t.Errorf("Unexpected backoff call; parameter = %v", got)
   274  		} else if expectedBackoff != backoffAllowAny {
   275  			if got != expectedBackoff {
   276  				t.Errorf("Unexpected backoff received; got %v want %v", got, expectedBackoff)
   277  			}
   278  		}
   279  		return time.Millisecond
   280  	}
   281  	defer func() { internal.DefaultBackoffFunc = oldBackoff }()
   282  
   283  	// Initialize listener for our ORCA server.
   284  	lis, err := testutils.LocalTCPListener()
   285  	if err != nil {
   286  		t.Fatal(err)
   287  	}
   288  
   289  	// Register our fake ORCA service.
   290  	s := grpc.NewServer()
   291  	fake := newFakeORCAService()
   292  	defer fake.close()
   293  	v3orcaservicegrpc.RegisterOpenRcaServiceServer(s, fake)
   294  	go s.Serve(lis)
   295  	defer s.Stop()
   296  
   297  	// Define the report interval and a function to wait for it to be sent to
   298  	// the server.
   299  	const reportInterval = 123 * time.Second
   300  	awaitRequest := func(interval time.Duration) {
   301  		select {
   302  		case req := <-fake.reqCh:
   303  			if got := req.GetReportInterval().AsDuration(); got != interval {
   304  				t.Errorf("Unexpected report interval; got %v want %v", got, interval)
   305  			}
   306  		case <-ctx.Done():
   307  			t.Fatalf("Did not receive client request")
   308  		}
   309  	}
   310  
   311  	// Create our client with an OOB listener in the LB policy it selects.
   312  	r := manual.NewBuilderWithScheme("whatever")
   313  	oobLis := newTestOOBListener()
   314  
   315  	lisOpts := orca.OOBListenerOptions{ReportInterval: reportInterval}
   316  	li := &listenerInfo{listener: oobLis, opts: lisOpts}
   317  	r.InitialState(resolver.State{Addresses: []resolver.Address{setListenerInfo(resolver.Address{Addr: lis.Addr().String()}, li)}})
   318  	cc, err := grpc.Dial("whatever:///whatever", grpc.WithDefaultServiceConfig(`{"loadBalancingConfig": [{"customLB":{}}]}`), grpc.WithResolvers(r), grpc.WithTransportCredentials(insecure.NewCredentials()))
   319  	if err != nil {
   320  		t.Fatalf("grpc.Dial failed: %v", err)
   321  	}
   322  	defer cc.Close()
   323  
   324  	// Ensure the OOB listener is stopped before the client is closed to avoid
   325  	// a potential irrelevant error in the logs.
   326  	defer oobLis.Stop()
   327  
   328  	// Define a load report to send and expect the client to see.
   329  	loadReportWant := &v3orcapb.OrcaLoadReport{
   330  		CpuUtilization: 10,
   331  		MemUtilization: 0.1,
   332  		Utilization:    map[string]float64{"bob": 0.555},
   333  	}
   334  
   335  	// Unblock the fake.
   336  	awaitRequest(reportInterval)
   337  	fake.respCh <- loadReportWant
   338  	select {
   339  	case r := <-oobLis.loadReportCh:
   340  		t.Log("Load report received: ", r)
   341  		if proto.Equal(r, loadReportWant) {
   342  			// Success!
   343  			break
   344  		}
   345  	case <-ctx.Done():
   346  		t.Fatalf("timed out waiting for load report: %v", loadReportWant)
   347  	}
   348  
   349  	// The next request should be immediate, since there was a message
   350  	// received.
   351  	expectedBackoff = backoffShouldNotBeCalled
   352  	fake.respCh <- status.Errorf(codes.Internal, "injected error")
   353  	awaitRequest(reportInterval)
   354  
   355  	// The next requests will need to backoff.
   356  	expectedBackoff = 0
   357  	fake.respCh <- status.Errorf(codes.Internal, "injected error")
   358  	awaitRequest(reportInterval)
   359  	expectedBackoff = 1
   360  	fake.respCh <- status.Errorf(codes.Internal, "injected error")
   361  	awaitRequest(reportInterval)
   362  	expectedBackoff = 2
   363  	fake.respCh <- status.Errorf(codes.Internal, "injected error")
   364  	awaitRequest(reportInterval)
   365  	// The next request should be immediate, since there was a message
   366  	// received.
   367  	expectedBackoff = backoffShouldNotBeCalled
   368  
   369  	// Send another valid response and wait for it on the client.
   370  	fake.respCh <- loadReportWant
   371  	select {
   372  	case r := <-oobLis.loadReportCh:
   373  		t.Log("Load report received: ", r)
   374  		if proto.Equal(r, loadReportWant) {
   375  			// Success!
   376  			break
   377  		}
   378  	case <-ctx.Done():
   379  		t.Fatalf("timed out waiting for load report: %v", loadReportWant)
   380  	}
   381  }
   382  
   383  // TestProducerMultipleListeners tests that multiple listeners works as
   384  // expected in a producer: requesting the proper interval and delivering the
   385  // update to all listeners.
   386  func (s) TestProducerMultipleListeners(t *testing.T) {
   387  	ctx, cancel := context.WithTimeout(context.Background(), defaultTestTimeout)
   388  	defer cancel()
   389  
   390  	// Provide a convenient way to expect backoff calls and return a minimal
   391  	// value.
   392  	oldBackoff := internal.DefaultBackoffFunc
   393  	internal.DefaultBackoffFunc = func(got int) time.Duration {
   394  		return time.Millisecond
   395  	}
   396  	defer func() { internal.DefaultBackoffFunc = oldBackoff }()
   397  
   398  	// Initialize listener for our ORCA server.
   399  	lis, err := testutils.LocalTCPListener()
   400  	if err != nil {
   401  		t.Fatal(err)
   402  	}
   403  
   404  	// Register our fake ORCA service.
   405  	s := grpc.NewServer()
   406  	fake := newFakeORCAService()
   407  	defer fake.close()
   408  	v3orcaservicegrpc.RegisterOpenRcaServiceServer(s, fake)
   409  	go s.Serve(lis)
   410  	defer s.Stop()
   411  
   412  	// Define the report interval and a function to wait for it to be sent to
   413  	// the server.
   414  	const reportInterval1 = 123 * time.Second
   415  	const reportInterval2 = 234 * time.Second
   416  	const reportInterval3 = 56 * time.Second
   417  	awaitRequest := func(interval time.Duration) {
   418  		select {
   419  		case req := <-fake.reqCh:
   420  			if got := req.GetReportInterval().AsDuration(); got != interval {
   421  				t.Errorf("Unexpected report interval; got %v want %v", got, interval)
   422  			}
   423  		case <-ctx.Done():
   424  			t.Fatalf("Did not receive client request")
   425  		}
   426  	}
   427  
   428  	// Create our client with an OOB listener in the LB policy it selects.
   429  	r := manual.NewBuilderWithScheme("whatever")
   430  	oobLis1 := newTestOOBListener()
   431  	lisOpts1 := orca.OOBListenerOptions{ReportInterval: reportInterval1}
   432  	li := &listenerInfo{listener: oobLis1, opts: lisOpts1}
   433  	r.InitialState(resolver.State{Addresses: []resolver.Address{setListenerInfo(resolver.Address{Addr: lis.Addr().String()}, li)}})
   434  	cc, err := grpc.Dial("whatever:///whatever", grpc.WithDefaultServiceConfig(`{"loadBalancingConfig": [{"customLB":{}}]}`), grpc.WithResolvers(r), grpc.WithTransportCredentials(insecure.NewCredentials()))
   435  	if err != nil {
   436  		t.Fatalf("grpc.Dial failed: %v", err)
   437  	}
   438  	defer cc.Close()
   439  
   440  	// Ensure the OOB listener is stopped before the client is closed to avoid
   441  	// a potential irrelevant error in the logs.
   442  	defer oobLis1.Stop()
   443  
   444  	oobLis2 := newTestOOBListener()
   445  	lisOpts2 := orca.OOBListenerOptions{ReportInterval: reportInterval2}
   446  
   447  	oobLis3 := newTestOOBListener()
   448  	lisOpts3 := orca.OOBListenerOptions{ReportInterval: reportInterval3}
   449  
   450  	// Define a load report to send and expect the client to see.
   451  	loadReportWant := &v3orcapb.OrcaLoadReport{
   452  		CpuUtilization: 10,
   453  		MemUtilization: 0.1,
   454  		Utilization:    map[string]float64{"bob": 0.555},
   455  	}
   456  
   457  	// Receive reports and update counts for the three listeners.
   458  	var reportsMu sync.Mutex
   459  	var reportsReceived1, reportsReceived2, reportsReceived3 int
   460  	go func() {
   461  		for {
   462  			select {
   463  			case r := <-oobLis1.loadReportCh:
   464  				t.Log("Load report 1 received: ", r)
   465  				if !proto.Equal(r, loadReportWant) {
   466  					t.Errorf("Unexpected report received: %+v", r)
   467  				}
   468  				reportsMu.Lock()
   469  				reportsReceived1++
   470  				reportsMu.Unlock()
   471  			case r := <-oobLis2.loadReportCh:
   472  				t.Log("Load report 2 received: ", r)
   473  				if !proto.Equal(r, loadReportWant) {
   474  					t.Errorf("Unexpected report received: %+v", r)
   475  				}
   476  				reportsMu.Lock()
   477  				reportsReceived2++
   478  				reportsMu.Unlock()
   479  			case r := <-oobLis3.loadReportCh:
   480  				t.Log("Load report 3 received: ", r)
   481  				if !proto.Equal(r, loadReportWant) {
   482  					t.Errorf("Unexpected report received: %+v", r)
   483  				}
   484  				reportsMu.Lock()
   485  				reportsReceived3++
   486  				reportsMu.Unlock()
   487  			case <-ctx.Done():
   488  				// Test has ended; exit
   489  				return
   490  			}
   491  		}
   492  	}()
   493  
   494  	// checkReports is a helper function to check the report counts for the three listeners.
   495  	checkReports := func(r1, r2, r3 int) {
   496  		t.Helper()
   497  		for ctx.Err() == nil {
   498  			reportsMu.Lock()
   499  			if r1 == reportsReceived1 && r2 == reportsReceived2 && r3 == reportsReceived3 {
   500  				// Success!
   501  				reportsMu.Unlock()
   502  				return
   503  			}
   504  			if reportsReceived1 > r1 || reportsReceived2 > r2 || reportsReceived3 > r3 {
   505  				reportsMu.Unlock()
   506  				t.Fatalf("received excess reports. got %v %v %v; want %v %v %v", reportsReceived1, reportsReceived2, reportsReceived3, r1, r2, r3)
   507  				return
   508  			}
   509  			reportsMu.Unlock()
   510  			time.Sleep(10 * time.Millisecond)
   511  		}
   512  		t.Fatalf("timed out waiting for reports received. got %v %v %v; want %v %v %v", reportsReceived1, reportsReceived2, reportsReceived3, r1, r2, r3)
   513  	}
   514  
   515  	// Only 1 listener; expect reportInterval1 to be used and expect the report
   516  	// to be sent to the listener.
   517  	awaitRequest(reportInterval1)
   518  	fake.respCh <- loadReportWant
   519  	checkReports(1, 0, 0)
   520  
   521  	// Register listener 2 with a less frequent interval; no need to recreate
   522  	// stream.  Report should go to both listeners.
   523  	oobLis2.cleanup = orca.RegisterOOBListener(li.sc, oobLis2, lisOpts2)
   524  	fake.respCh <- loadReportWant
   525  	checkReports(2, 1, 0)
   526  
   527  	// Register listener 3 with a more frequent interval; stream is recreated
   528  	// with this interval.  The next report will go to all three listeners.
   529  	oobLis3.cleanup = orca.RegisterOOBListener(li.sc, oobLis3, lisOpts3)
   530  	awaitRequest(reportInterval3)
   531  	fake.respCh <- loadReportWant
   532  	checkReports(3, 2, 1)
   533  
   534  	// Another report without a change in listeners should go to all three listeners.
   535  	fake.respCh <- loadReportWant
   536  	checkReports(4, 3, 2)
   537  
   538  	// Stop listener 2.  This does not affect the interval as listener 3 is
   539  	// still the shortest.  The next update goes to listeners 1 and 3.
   540  	oobLis2.Stop()
   541  	fake.respCh <- loadReportWant
   542  	checkReports(5, 3, 3)
   543  
   544  	// Stop listener 3.  This makes the interval longer.  Reports should only
   545  	// go to listener 1 now.
   546  	oobLis3.Stop()
   547  	awaitRequest(reportInterval1)
   548  	fake.respCh <- loadReportWant
   549  	checkReports(6, 3, 3)
   550  	// Another report without a change in listeners should go to the first listener.
   551  	fake.respCh <- loadReportWant
   552  	checkReports(7, 3, 3)
   553  }