vitess.io/vitess@v0.16.2/go/vt/vttablet/grpctmclient/cached_client_flaky_test.go (about) 1 /* 2 Copyright 2021 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package grpctmclient 18 19 import ( 20 "context" 21 "fmt" 22 "io" 23 "math/rand" 24 "net" 25 "runtime" 26 "sync" 27 "testing" 28 "time" 29 30 "github.com/stretchr/testify/assert" 31 "github.com/stretchr/testify/require" 32 "golang.org/x/net/nettest" 33 "google.golang.org/grpc" 34 35 "vitess.io/vitess/go/sync2" 36 "vitess.io/vitess/go/vt/vttablet/grpctmserver" 37 "vitess.io/vitess/go/vt/vttablet/tabletmanager" 38 "vitess.io/vitess/go/vt/vttablet/tmrpctest" 39 40 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 41 ) 42 43 func grpcTestServer(t testing.TB, tm tabletmanager.RPCTM) (*net.TCPAddr, func()) { 44 t.Helper() 45 46 lis, err := nettest.NewLocalListener("tcp") 47 if err != nil { 48 t.Fatalf("Cannot listen: %v", err) 49 } 50 51 s := grpc.NewServer() 52 grpctmserver.RegisterForTest(s, tm) 53 go s.Serve(lis) 54 55 var shutdownOnce sync.Once 56 57 return lis.Addr().(*net.TCPAddr), func() { 58 shutdownOnce.Do(func() { 59 s.Stop() 60 lis.Close() 61 }) 62 } 63 } 64 65 func BenchmarkCachedConnClientSteadyState(b *testing.B) { 66 tmserv := tmrpctest.NewFakeRPCTM(b) 67 tablets := make([]*topodatapb.Tablet, 1000) 68 for i := 0; i < len(tablets); i++ { 69 addr, shutdown := grpcTestServer(b, tmserv) 70 defer shutdown() 71 72 tablets[i] = &topodatapb.Tablet{ 73 Alias: &topodatapb.TabletAlias{ 74 Cell: "test", 75 Uid: uint32(addr.Port), 76 }, 77 Hostname: addr.IP.String(), 78 PortMap: map[string]int32{ 79 "grpc": int32(addr.Port), 80 }, 81 } 82 } 83 84 client := NewCachedConnClient(100) 85 defer client.Close() 86 87 // fill the pool 88 for i := 0; i < 100; i++ { 89 err := client.Ping(context.Background(), tablets[i]) 90 require.NoError(b, err) 91 } 92 93 procs := runtime.GOMAXPROCS(0) / 4 94 if procs == 0 { 95 procs = 2 96 } 97 98 pingsPerProc := len(tablets) / procs 99 if pingsPerProc == 0 { 100 pingsPerProc = 2 101 } 102 103 b.ResetTimer() 104 105 // Begin the benchmark 106 for i := 0; i < b.N; i++ { 107 ctx, cancel := context.WithCancel(context.Background()) 108 109 var wg sync.WaitGroup 110 for j := 0; j < procs; j++ { 111 wg.Add(1) 112 go func() { 113 defer wg.Done() 114 115 for k := 0; k < pingsPerProc; k++ { 116 func() { 117 ctx, cancel := context.WithTimeout(ctx, time.Second*5) 118 defer cancel() 119 120 x := rand.Intn(len(tablets)) 121 err := client.Ping(ctx, tablets[x]) 122 assert.NoError(b, err) 123 }() 124 } 125 }() 126 } 127 128 wg.Wait() 129 cancel() 130 } 131 } 132 133 func BenchmarkCachedConnClientSteadyStateRedials(b *testing.B) { 134 tmserv := tmrpctest.NewFakeRPCTM(b) 135 tablets := make([]*topodatapb.Tablet, 1000) 136 for i := 0; i < len(tablets); i++ { 137 addr, shutdown := grpcTestServer(b, tmserv) 138 defer shutdown() 139 140 tablets[i] = &topodatapb.Tablet{ 141 Alias: &topodatapb.TabletAlias{ 142 Cell: "test", 143 Uid: uint32(addr.Port), 144 }, 145 Hostname: addr.IP.String(), 146 PortMap: map[string]int32{ 147 "grpc": int32(addr.Port), 148 }, 149 } 150 } 151 152 client := NewCachedConnClient(1000) 153 defer client.Close() 154 155 // fill the pool 156 for i := 0; i < 1000; i++ { 157 err := client.Ping(context.Background(), tablets[i]) 158 require.NoError(b, err) 159 } 160 161 procs := runtime.GOMAXPROCS(0) / 4 162 if procs == 0 { 163 procs = 2 164 } 165 166 pingsPerProc := len(tablets) / procs 167 if pingsPerProc == 0 { 168 pingsPerProc = 2 169 } 170 171 b.ResetTimer() 172 173 // Begin the benchmark 174 for i := 0; i < b.N; i++ { 175 ctx, cancel := context.WithCancel(context.Background()) 176 177 var wg sync.WaitGroup 178 for j := 0; j < procs; j++ { 179 wg.Add(1) 180 go func() { 181 defer wg.Done() 182 183 for k := 0; k < pingsPerProc; k++ { 184 func() { 185 ctx, cancel := context.WithTimeout(ctx, time.Second*5) 186 defer cancel() 187 188 x := rand.Intn(len(tablets)) 189 err := client.Ping(ctx, tablets[x]) 190 assert.NoError(b, err) 191 }() 192 } 193 }() 194 } 195 196 wg.Wait() 197 cancel() 198 } 199 } 200 201 func BenchmarkCachedConnClientSteadyStateEvictions(b *testing.B) { 202 tmserv := tmrpctest.NewFakeRPCTM(b) 203 tablets := make([]*topodatapb.Tablet, 1000) 204 for i := 0; i < len(tablets); i++ { 205 addr, shutdown := grpcTestServer(b, tmserv) 206 defer shutdown() 207 208 tablets[i] = &topodatapb.Tablet{ 209 Alias: &topodatapb.TabletAlias{ 210 Cell: "test", 211 Uid: uint32(addr.Port), 212 }, 213 Hostname: addr.IP.String(), 214 PortMap: map[string]int32{ 215 "grpc": int32(addr.Port), 216 }, 217 } 218 } 219 220 client := NewCachedConnClient(100) 221 defer client.Close() 222 223 // fill the pool 224 for i := 0; i < 100; i++ { 225 err := client.Ping(context.Background(), tablets[i]) 226 require.NoError(b, err) 227 } 228 229 assert.Equal(b, len(client.dialer.(*cachedConnDialer).conns), 100) 230 231 procs := runtime.GOMAXPROCS(0) / 4 232 if procs == 0 { 233 procs = 2 234 } 235 236 start := 100 237 b.ResetTimer() 238 239 // Begin the benchmark 240 for i := 0; i < b.N; i++ { 241 ctx, cancel := context.WithCancel(context.Background()) 242 ch := make(chan int, 100) // 100 dials per iteration 243 244 var wg sync.WaitGroup 245 for j := 0; j < procs; j++ { 246 wg.Add(1) 247 go func() { 248 defer wg.Done() 249 250 for idx := range ch { 251 func() { 252 ctx, cancel := context.WithTimeout(ctx, time.Second*5) 253 defer cancel() 254 255 err := client.Ping(ctx, tablets[idx]) 256 assert.NoError(b, err) 257 }() 258 } 259 }() 260 } 261 262 for j := 0; j < cap(ch); j++ { 263 start = (start + j) % 1000 // go in increasing order, wrapping around 264 ch <- start 265 } 266 267 close(ch) 268 wg.Wait() 269 cancel() 270 } 271 } 272 273 func TestCachedConnClient(t *testing.T) { 274 t.Parallel() 275 276 testCtx, testCancel := context.WithCancel(context.Background()) 277 wg := sync.WaitGroup{} 278 procs := 0 279 280 wg.Add(1) 281 go func() { 282 defer wg.Done() 283 procs = runtime.NumGoroutine() 284 285 for { 286 select { 287 case <-testCtx.Done(): 288 return 289 case <-time.After(time.Millisecond * 100): 290 newProcs := runtime.NumGoroutine() 291 if newProcs > procs { 292 procs = newProcs 293 } 294 } 295 } 296 }() 297 298 numTablets := 100 299 numGoroutines := 8 300 301 tmserv := tmrpctest.NewFakeRPCTM(t) 302 tablets := make([]*topodatapb.Tablet, numTablets) 303 for i := 0; i < len(tablets); i++ { 304 addr, shutdown := grpcTestServer(t, tmserv) 305 defer shutdown() 306 307 tablets[i] = &topodatapb.Tablet{ 308 Alias: &topodatapb.TabletAlias{ 309 Cell: "test", 310 Uid: uint32(addr.Port), 311 }, 312 Hostname: addr.IP.String(), 313 PortMap: map[string]int32{ 314 "grpc": int32(addr.Port), 315 }, 316 } 317 } 318 319 poolSize := int(float64(numTablets) * 0.5) 320 client := NewCachedConnClient(poolSize) 321 defer client.Close() 322 323 dialAttempts := sync2.NewAtomicInt64(0) 324 dialErrors := sync2.NewAtomicInt64(0) 325 326 longestDials := make(chan time.Duration, numGoroutines) 327 328 for i := 0; i < numGoroutines; i++ { 329 wg.Add(1) 330 go func() { 331 defer wg.Done() 332 333 attempts := 0 334 jitter := time.Second * 0 335 longestDial := time.Duration(0) 336 337 for { 338 select { 339 case <-testCtx.Done(): 340 dialAttempts.Add(int64(attempts)) 341 longestDials <- longestDial 342 return 343 case <-time.After(jitter): 344 jitter = time.Millisecond * (time.Duration(rand.Intn(11) + 50)) 345 attempts++ 346 347 tablet := tablets[rand.Intn(len(tablets))] 348 start := time.Now() 349 _, closer, err := client.dialer.dial(context.Background(), tablet) 350 if err != nil { 351 dialErrors.Add(1) 352 continue 353 } 354 355 dialDuration := time.Since(start) 356 if dialDuration > longestDial { 357 longestDial = dialDuration 358 } 359 360 closer.Close() 361 } 362 } 363 }() 364 } 365 366 time.Sleep(time.Minute) 367 testCancel() 368 wg.Wait() 369 close(longestDials) 370 371 longestDial := time.Duration(0) 372 for dialDuration := range longestDials { 373 if dialDuration > longestDial { 374 longestDial = dialDuration 375 } 376 } 377 378 attempts, errors := dialAttempts.Get(), dialErrors.Get() 379 assert.Less(t, float64(errors)/float64(attempts), 0.001, fmt.Sprintf("fewer than 0.1%% of dial attempts should fail (attempts = %d, errors = %d, max running procs = %d)", attempts, errors, procs)) 380 assert.Less(t, errors, int64(1), "at least one dial attempt failed (attempts = %d, errors = %d)", attempts, errors) 381 assert.Less(t, longestDial.Milliseconds(), int64(50)) 382 } 383 384 func TestCachedConnClient_evictions(t *testing.T) { 385 tmserv := tmrpctest.NewFakeRPCTM(t) 386 tablets := make([]*topodatapb.Tablet, 5) 387 for i := 0; i < len(tablets); i++ { 388 addr, shutdown := grpcTestServer(t, tmserv) 389 defer shutdown() 390 391 tablets[i] = &topodatapb.Tablet{ 392 Alias: &topodatapb.TabletAlias{ 393 Cell: "test", 394 Uid: uint32(addr.Port), 395 }, 396 Hostname: addr.IP.String(), 397 PortMap: map[string]int32{ 398 "grpc": int32(addr.Port), 399 }, 400 } 401 } 402 403 testCtx, cancel := context.WithCancel(context.Background()) 404 defer cancel() 405 406 connHoldContext, connHoldCancel := context.WithCancel(testCtx) 407 408 client := NewCachedConnClient(len(tablets) - 1) 409 for i := 0; i < len(tablets)-1; i++ { 410 _, closer, err := client.dialer.dial(context.Background(), tablets[i]) 411 t.Logf("holding connection open to %d", tablets[i].Alias.Uid) 412 require.NoError(t, err) 413 414 ctx := testCtx 415 if i == 0 { 416 ctx = connHoldContext 417 } 418 go func(ctx context.Context, closer io.Closer) { 419 // Hold on to one connection until the test is done. 420 // In the case of tablets[0], hold on to the connection until we 421 // signal to close it. 422 <-ctx.Done() 423 closer.Close() 424 }(ctx, closer) 425 } 426 427 dialCtx, dialCancel := context.WithTimeout(testCtx, time.Millisecond*50) 428 defer dialCancel() 429 430 err := client.Ping(dialCtx, tablets[0]) // this should take the rlock_fast path 431 assert.NoError(t, err, "could not redial on inuse cached connection") 432 433 err = client.Ping(dialCtx, tablets[4]) // this will enter the poll loop until context timeout 434 assert.Error(t, err, "should have timed out waiting for an eviction, while all conns were held") 435 436 // free up a connection 437 connHoldCancel() 438 439 dialCtx, dialCancel = context.WithTimeout(testCtx, time.Millisecond*100) 440 defer dialCancel() 441 442 err = client.Ping(dialCtx, tablets[4]) // this will enter the poll loop and evict a connection 443 assert.NoError(t, err, "should have evicted a conn and succeeded to dial") 444 }