vitess.io/vitess@v0.16.2/go/vt/vttablet/grpctmclient/cached_client_flaky_test.go (about)

     1  /*
     2  Copyright 2021 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8  	http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package grpctmclient
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"io"
    23  	"math/rand"
    24  	"net"
    25  	"runtime"
    26  	"sync"
    27  	"testing"
    28  	"time"
    29  
    30  	"github.com/stretchr/testify/assert"
    31  	"github.com/stretchr/testify/require"
    32  	"golang.org/x/net/nettest"
    33  	"google.golang.org/grpc"
    34  
    35  	"vitess.io/vitess/go/sync2"
    36  	"vitess.io/vitess/go/vt/vttablet/grpctmserver"
    37  	"vitess.io/vitess/go/vt/vttablet/tabletmanager"
    38  	"vitess.io/vitess/go/vt/vttablet/tmrpctest"
    39  
    40  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    41  )
    42  
    43  func grpcTestServer(t testing.TB, tm tabletmanager.RPCTM) (*net.TCPAddr, func()) {
    44  	t.Helper()
    45  
    46  	lis, err := nettest.NewLocalListener("tcp")
    47  	if err != nil {
    48  		t.Fatalf("Cannot listen: %v", err)
    49  	}
    50  
    51  	s := grpc.NewServer()
    52  	grpctmserver.RegisterForTest(s, tm)
    53  	go s.Serve(lis)
    54  
    55  	var shutdownOnce sync.Once
    56  
    57  	return lis.Addr().(*net.TCPAddr), func() {
    58  		shutdownOnce.Do(func() {
    59  			s.Stop()
    60  			lis.Close()
    61  		})
    62  	}
    63  }
    64  
    65  func BenchmarkCachedConnClientSteadyState(b *testing.B) {
    66  	tmserv := tmrpctest.NewFakeRPCTM(b)
    67  	tablets := make([]*topodatapb.Tablet, 1000)
    68  	for i := 0; i < len(tablets); i++ {
    69  		addr, shutdown := grpcTestServer(b, tmserv)
    70  		defer shutdown()
    71  
    72  		tablets[i] = &topodatapb.Tablet{
    73  			Alias: &topodatapb.TabletAlias{
    74  				Cell: "test",
    75  				Uid:  uint32(addr.Port),
    76  			},
    77  			Hostname: addr.IP.String(),
    78  			PortMap: map[string]int32{
    79  				"grpc": int32(addr.Port),
    80  			},
    81  		}
    82  	}
    83  
    84  	client := NewCachedConnClient(100)
    85  	defer client.Close()
    86  
    87  	// fill the pool
    88  	for i := 0; i < 100; i++ {
    89  		err := client.Ping(context.Background(), tablets[i])
    90  		require.NoError(b, err)
    91  	}
    92  
    93  	procs := runtime.GOMAXPROCS(0) / 4
    94  	if procs == 0 {
    95  		procs = 2
    96  	}
    97  
    98  	pingsPerProc := len(tablets) / procs
    99  	if pingsPerProc == 0 {
   100  		pingsPerProc = 2
   101  	}
   102  
   103  	b.ResetTimer()
   104  
   105  	// Begin the benchmark
   106  	for i := 0; i < b.N; i++ {
   107  		ctx, cancel := context.WithCancel(context.Background())
   108  
   109  		var wg sync.WaitGroup
   110  		for j := 0; j < procs; j++ {
   111  			wg.Add(1)
   112  			go func() {
   113  				defer wg.Done()
   114  
   115  				for k := 0; k < pingsPerProc; k++ {
   116  					func() {
   117  						ctx, cancel := context.WithTimeout(ctx, time.Second*5)
   118  						defer cancel()
   119  
   120  						x := rand.Intn(len(tablets))
   121  						err := client.Ping(ctx, tablets[x])
   122  						assert.NoError(b, err)
   123  					}()
   124  				}
   125  			}()
   126  		}
   127  
   128  		wg.Wait()
   129  		cancel()
   130  	}
   131  }
   132  
   133  func BenchmarkCachedConnClientSteadyStateRedials(b *testing.B) {
   134  	tmserv := tmrpctest.NewFakeRPCTM(b)
   135  	tablets := make([]*topodatapb.Tablet, 1000)
   136  	for i := 0; i < len(tablets); i++ {
   137  		addr, shutdown := grpcTestServer(b, tmserv)
   138  		defer shutdown()
   139  
   140  		tablets[i] = &topodatapb.Tablet{
   141  			Alias: &topodatapb.TabletAlias{
   142  				Cell: "test",
   143  				Uid:  uint32(addr.Port),
   144  			},
   145  			Hostname: addr.IP.String(),
   146  			PortMap: map[string]int32{
   147  				"grpc": int32(addr.Port),
   148  			},
   149  		}
   150  	}
   151  
   152  	client := NewCachedConnClient(1000)
   153  	defer client.Close()
   154  
   155  	// fill the pool
   156  	for i := 0; i < 1000; i++ {
   157  		err := client.Ping(context.Background(), tablets[i])
   158  		require.NoError(b, err)
   159  	}
   160  
   161  	procs := runtime.GOMAXPROCS(0) / 4
   162  	if procs == 0 {
   163  		procs = 2
   164  	}
   165  
   166  	pingsPerProc := len(tablets) / procs
   167  	if pingsPerProc == 0 {
   168  		pingsPerProc = 2
   169  	}
   170  
   171  	b.ResetTimer()
   172  
   173  	// Begin the benchmark
   174  	for i := 0; i < b.N; i++ {
   175  		ctx, cancel := context.WithCancel(context.Background())
   176  
   177  		var wg sync.WaitGroup
   178  		for j := 0; j < procs; j++ {
   179  			wg.Add(1)
   180  			go func() {
   181  				defer wg.Done()
   182  
   183  				for k := 0; k < pingsPerProc; k++ {
   184  					func() {
   185  						ctx, cancel := context.WithTimeout(ctx, time.Second*5)
   186  						defer cancel()
   187  
   188  						x := rand.Intn(len(tablets))
   189  						err := client.Ping(ctx, tablets[x])
   190  						assert.NoError(b, err)
   191  					}()
   192  				}
   193  			}()
   194  		}
   195  
   196  		wg.Wait()
   197  		cancel()
   198  	}
   199  }
   200  
   201  func BenchmarkCachedConnClientSteadyStateEvictions(b *testing.B) {
   202  	tmserv := tmrpctest.NewFakeRPCTM(b)
   203  	tablets := make([]*topodatapb.Tablet, 1000)
   204  	for i := 0; i < len(tablets); i++ {
   205  		addr, shutdown := grpcTestServer(b, tmserv)
   206  		defer shutdown()
   207  
   208  		tablets[i] = &topodatapb.Tablet{
   209  			Alias: &topodatapb.TabletAlias{
   210  				Cell: "test",
   211  				Uid:  uint32(addr.Port),
   212  			},
   213  			Hostname: addr.IP.String(),
   214  			PortMap: map[string]int32{
   215  				"grpc": int32(addr.Port),
   216  			},
   217  		}
   218  	}
   219  
   220  	client := NewCachedConnClient(100)
   221  	defer client.Close()
   222  
   223  	// fill the pool
   224  	for i := 0; i < 100; i++ {
   225  		err := client.Ping(context.Background(), tablets[i])
   226  		require.NoError(b, err)
   227  	}
   228  
   229  	assert.Equal(b, len(client.dialer.(*cachedConnDialer).conns), 100)
   230  
   231  	procs := runtime.GOMAXPROCS(0) / 4
   232  	if procs == 0 {
   233  		procs = 2
   234  	}
   235  
   236  	start := 100
   237  	b.ResetTimer()
   238  
   239  	// Begin the benchmark
   240  	for i := 0; i < b.N; i++ {
   241  		ctx, cancel := context.WithCancel(context.Background())
   242  		ch := make(chan int, 100) // 100 dials per iteration
   243  
   244  		var wg sync.WaitGroup
   245  		for j := 0; j < procs; j++ {
   246  			wg.Add(1)
   247  			go func() {
   248  				defer wg.Done()
   249  
   250  				for idx := range ch {
   251  					func() {
   252  						ctx, cancel := context.WithTimeout(ctx, time.Second*5)
   253  						defer cancel()
   254  
   255  						err := client.Ping(ctx, tablets[idx])
   256  						assert.NoError(b, err)
   257  					}()
   258  				}
   259  			}()
   260  		}
   261  
   262  		for j := 0; j < cap(ch); j++ {
   263  			start = (start + j) % 1000 // go in increasing order, wrapping around
   264  			ch <- start
   265  		}
   266  
   267  		close(ch)
   268  		wg.Wait()
   269  		cancel()
   270  	}
   271  }
   272  
   273  func TestCachedConnClient(t *testing.T) {
   274  	t.Parallel()
   275  
   276  	testCtx, testCancel := context.WithCancel(context.Background())
   277  	wg := sync.WaitGroup{}
   278  	procs := 0
   279  
   280  	wg.Add(1)
   281  	go func() {
   282  		defer wg.Done()
   283  		procs = runtime.NumGoroutine()
   284  
   285  		for {
   286  			select {
   287  			case <-testCtx.Done():
   288  				return
   289  			case <-time.After(time.Millisecond * 100):
   290  				newProcs := runtime.NumGoroutine()
   291  				if newProcs > procs {
   292  					procs = newProcs
   293  				}
   294  			}
   295  		}
   296  	}()
   297  
   298  	numTablets := 100
   299  	numGoroutines := 8
   300  
   301  	tmserv := tmrpctest.NewFakeRPCTM(t)
   302  	tablets := make([]*topodatapb.Tablet, numTablets)
   303  	for i := 0; i < len(tablets); i++ {
   304  		addr, shutdown := grpcTestServer(t, tmserv)
   305  		defer shutdown()
   306  
   307  		tablets[i] = &topodatapb.Tablet{
   308  			Alias: &topodatapb.TabletAlias{
   309  				Cell: "test",
   310  				Uid:  uint32(addr.Port),
   311  			},
   312  			Hostname: addr.IP.String(),
   313  			PortMap: map[string]int32{
   314  				"grpc": int32(addr.Port),
   315  			},
   316  		}
   317  	}
   318  
   319  	poolSize := int(float64(numTablets) * 0.5)
   320  	client := NewCachedConnClient(poolSize)
   321  	defer client.Close()
   322  
   323  	dialAttempts := sync2.NewAtomicInt64(0)
   324  	dialErrors := sync2.NewAtomicInt64(0)
   325  
   326  	longestDials := make(chan time.Duration, numGoroutines)
   327  
   328  	for i := 0; i < numGoroutines; i++ {
   329  		wg.Add(1)
   330  		go func() {
   331  			defer wg.Done()
   332  
   333  			attempts := 0
   334  			jitter := time.Second * 0
   335  			longestDial := time.Duration(0)
   336  
   337  			for {
   338  				select {
   339  				case <-testCtx.Done():
   340  					dialAttempts.Add(int64(attempts))
   341  					longestDials <- longestDial
   342  					return
   343  				case <-time.After(jitter):
   344  					jitter = time.Millisecond * (time.Duration(rand.Intn(11) + 50))
   345  					attempts++
   346  
   347  					tablet := tablets[rand.Intn(len(tablets))]
   348  					start := time.Now()
   349  					_, closer, err := client.dialer.dial(context.Background(), tablet)
   350  					if err != nil {
   351  						dialErrors.Add(1)
   352  						continue
   353  					}
   354  
   355  					dialDuration := time.Since(start)
   356  					if dialDuration > longestDial {
   357  						longestDial = dialDuration
   358  					}
   359  
   360  					closer.Close()
   361  				}
   362  			}
   363  		}()
   364  	}
   365  
   366  	time.Sleep(time.Minute)
   367  	testCancel()
   368  	wg.Wait()
   369  	close(longestDials)
   370  
   371  	longestDial := time.Duration(0)
   372  	for dialDuration := range longestDials {
   373  		if dialDuration > longestDial {
   374  			longestDial = dialDuration
   375  		}
   376  	}
   377  
   378  	attempts, errors := dialAttempts.Get(), dialErrors.Get()
   379  	assert.Less(t, float64(errors)/float64(attempts), 0.001, fmt.Sprintf("fewer than 0.1%% of dial attempts should fail (attempts = %d, errors = %d, max running procs = %d)", attempts, errors, procs))
   380  	assert.Less(t, errors, int64(1), "at least one dial attempt failed (attempts = %d, errors = %d)", attempts, errors)
   381  	assert.Less(t, longestDial.Milliseconds(), int64(50))
   382  }
   383  
   384  func TestCachedConnClient_evictions(t *testing.T) {
   385  	tmserv := tmrpctest.NewFakeRPCTM(t)
   386  	tablets := make([]*topodatapb.Tablet, 5)
   387  	for i := 0; i < len(tablets); i++ {
   388  		addr, shutdown := grpcTestServer(t, tmserv)
   389  		defer shutdown()
   390  
   391  		tablets[i] = &topodatapb.Tablet{
   392  			Alias: &topodatapb.TabletAlias{
   393  				Cell: "test",
   394  				Uid:  uint32(addr.Port),
   395  			},
   396  			Hostname: addr.IP.String(),
   397  			PortMap: map[string]int32{
   398  				"grpc": int32(addr.Port),
   399  			},
   400  		}
   401  	}
   402  
   403  	testCtx, cancel := context.WithCancel(context.Background())
   404  	defer cancel()
   405  
   406  	connHoldContext, connHoldCancel := context.WithCancel(testCtx)
   407  
   408  	client := NewCachedConnClient(len(tablets) - 1)
   409  	for i := 0; i < len(tablets)-1; i++ {
   410  		_, closer, err := client.dialer.dial(context.Background(), tablets[i])
   411  		t.Logf("holding connection open to %d", tablets[i].Alias.Uid)
   412  		require.NoError(t, err)
   413  
   414  		ctx := testCtx
   415  		if i == 0 {
   416  			ctx = connHoldContext
   417  		}
   418  		go func(ctx context.Context, closer io.Closer) {
   419  			// Hold on to one connection until the test is done.
   420  			// In the case of tablets[0], hold on to the connection until we
   421  			// signal to close it.
   422  			<-ctx.Done()
   423  			closer.Close()
   424  		}(ctx, closer)
   425  	}
   426  
   427  	dialCtx, dialCancel := context.WithTimeout(testCtx, time.Millisecond*50)
   428  	defer dialCancel()
   429  
   430  	err := client.Ping(dialCtx, tablets[0]) // this should take the rlock_fast path
   431  	assert.NoError(t, err, "could not redial on inuse cached connection")
   432  
   433  	err = client.Ping(dialCtx, tablets[4]) // this will enter the poll loop until context timeout
   434  	assert.Error(t, err, "should have timed out waiting for an eviction, while all conns were held")
   435  
   436  	// free up a connection
   437  	connHoldCancel()
   438  
   439  	dialCtx, dialCancel = context.WithTimeout(testCtx, time.Millisecond*100)
   440  	defer dialCancel()
   441  
   442  	err = client.Ping(dialCtx, tablets[4]) // this will enter the poll loop and evict a connection
   443  	assert.NoError(t, err, "should have evicted a conn and succeeded to dial")
   444  }