vitess.io/vitess@v0.16.2/go/vt/discovery/healthcheck_test.go (about) 1 /* 2 Copyright 2019 The Vitess Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package discovery 18 19 import ( 20 "bytes" 21 "context" 22 "fmt" 23 "html/template" 24 "io" 25 "strings" 26 "sync" 27 "testing" 28 "time" 29 30 "github.com/stretchr/testify/assert" 31 "github.com/stretchr/testify/require" 32 33 "vitess.io/vitess/go/test/utils" 34 "vitess.io/vitess/go/vt/grpcclient" 35 "vitess.io/vitess/go/vt/status" 36 "vitess.io/vitess/go/vt/topo" 37 "vitess.io/vitess/go/vt/topo/memorytopo" 38 "vitess.io/vitess/go/vt/topo/topoproto" 39 "vitess.io/vitess/go/vt/vttablet/queryservice" 40 "vitess.io/vitess/go/vt/vttablet/queryservice/fakes" 41 "vitess.io/vitess/go/vt/vttablet/tabletconn" 42 "vitess.io/vitess/go/vt/vttablet/tabletconntest" 43 44 querypb "vitess.io/vitess/go/vt/proto/query" 45 topodatapb "vitess.io/vitess/go/vt/proto/topodata" 46 ) 47 48 var ( 49 connMap map[string]*fakeConn 50 connMapMu sync.Mutex 51 ) 52 53 func testChecksum(t *testing.T, want, got int64) { 54 t.Helper() 55 if want != got { 56 t.Errorf("want checksum %v, got %v", want, got) 57 } 58 } 59 60 func init() { 61 tabletconn.RegisterDialer("fake_gateway", tabletDialer) 62 tabletconntest.SetProtocol("go.vt.discovery.healthcheck_test", "fake_gateway") 63 connMap = make(map[string]*fakeConn) 64 refreshInterval = time.Minute 65 } 66 67 func TestHealthCheck(t *testing.T) { 68 // reset error counters 69 hcErrorCounters.ResetAll() 70 ts := memorytopo.NewServer("cell") 71 hc := createTestHc(ts) 72 // close healthcheck 73 defer hc.Close() 74 tablet := createTestTablet(0, "cell", "a") 75 tablet.Type = topodatapb.TabletType_REPLICA 76 input := make(chan *querypb.StreamHealthResponse) 77 conn := createFakeConn(tablet, input) 78 79 // create a channel and subscribe to healthcheck 80 resultChan := hc.Subscribe() 81 testChecksum(t, 0, hc.stateChecksum()) 82 hc.AddTablet(tablet) 83 testChecksum(t, 1027934207, hc.stateChecksum()) 84 85 // Immediately after AddTablet() there will be the first notification. 86 want := &TabletHealth{ 87 Tablet: tablet, 88 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 89 Serving: false, 90 Stats: nil, 91 PrimaryTermStartTime: 0, 92 } 93 result := <-resultChan 94 mustMatch(t, want, result, "Wrong TabletHealth data") 95 96 shr := &querypb.StreamHealthResponse{ 97 TabletAlias: tablet.Alias, 98 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 99 Serving: true, 100 101 TabletExternallyReparentedTimestamp: 0, 102 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5}, 103 } 104 input <- shr 105 result = <-resultChan 106 want = &TabletHealth{ 107 Tablet: tablet, 108 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 109 Serving: true, 110 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5}, 111 PrimaryTermStartTime: 0, 112 } 113 // create a context with timeout and select on it and channel 114 mustMatch(t, want, result, "Wrong TabletHealth data") 115 116 tcsl := hc.CacheStatus() 117 tcslWant := TabletsCacheStatusList{{ 118 Cell: "cell", 119 Target: want.Target, 120 TabletsStats: TabletStatsList{{ 121 Tablet: tablet, 122 Target: want.Target, 123 Serving: true, 124 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5}, 125 PrimaryTermStartTime: 0, 126 }}, 127 }} 128 // we can't use assert.Equal here because of the special way we want to compare equality 129 assert.True(t, tcslWant.deepEqual(tcsl), "Incorrect cache status:\n Expected: %+v\n Actual: %+v", tcslWant[0], tcsl[0]) 130 testChecksum(t, 3487343103, hc.stateChecksum()) 131 132 // TabletType changed, should get both old and new event 133 shr = &querypb.StreamHealthResponse{ 134 TabletAlias: tablet.Alias, 135 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 136 Serving: true, 137 TabletExternallyReparentedTimestamp: 10, 138 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 139 } 140 want = &TabletHealth{ 141 Tablet: tablet, 142 Target: &querypb.Target{ 143 Keyspace: "k", 144 Shard: "s", 145 TabletType: topodatapb.TabletType_PRIMARY, 146 }, 147 Serving: true, 148 Conn: conn, 149 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 150 PrimaryTermStartTime: 10, 151 } 152 input <- shr 153 result = <-resultChan 154 155 mustMatch(t, want, result, "Wrong TabletHealth data") 156 testChecksum(t, 1560849771, hc.stateChecksum()) 157 158 err := checkErrorCounter("k", "s", topodatapb.TabletType_PRIMARY, 0) 159 require.NoError(t, err, "error checking error counter") 160 161 // Serving & RealtimeStats changed 162 shr = &querypb.StreamHealthResponse{ 163 TabletAlias: tablet.Alias, 164 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 165 Serving: false, 166 TabletExternallyReparentedTimestamp: 0, 167 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3}, 168 } 169 want = &TabletHealth{ 170 Tablet: tablet, 171 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 172 Serving: false, 173 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3}, 174 PrimaryTermStartTime: 0, 175 } 176 input <- shr 177 result = <-resultChan 178 mustMatch(t, want, result, "Wrong TabletHealth data") 179 testChecksum(t, 1027934207, hc.stateChecksum()) 180 181 // HealthError 182 shr = &querypb.StreamHealthResponse{ 183 TabletAlias: tablet.Alias, 184 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 185 Serving: true, 186 TabletExternallyReparentedTimestamp: 0, 187 RealtimeStats: &querypb.RealtimeStats{HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3}, 188 } 189 want = &TabletHealth{ 190 Tablet: tablet, 191 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 192 Serving: false, 193 Stats: &querypb.RealtimeStats{HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3}, 194 PrimaryTermStartTime: 0, 195 LastError: fmt.Errorf("vttablet error: some error"), 196 } 197 input <- shr 198 result = <-resultChan 199 // TODO: figure out how to compare objects that contain errors using utils.MustMatch 200 assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result) 201 testChecksum(t, 1027934207, hc.stateChecksum()) // unchanged 202 203 // remove tablet 204 hc.deleteTablet(tablet) 205 testChecksum(t, 0, hc.stateChecksum()) 206 } 207 208 func TestHealthCheckStreamError(t *testing.T) { 209 ts := memorytopo.NewServer("cell") 210 hc := createTestHc(ts) 211 defer hc.Close() 212 213 tablet := createTestTablet(0, "cell", "a") 214 input := make(chan *querypb.StreamHealthResponse) 215 resultChan := hc.Subscribe() 216 fc := createFakeConn(tablet, input) 217 fc.errCh = make(chan error) 218 hc.AddTablet(tablet) 219 220 // Immediately after AddTablet() there will be the first notification. 221 want := &TabletHealth{ 222 Tablet: tablet, 223 Target: &querypb.Target{Keyspace: "k", Shard: "s"}, 224 Serving: false, 225 PrimaryTermStartTime: 0, 226 } 227 result := <-resultChan 228 mustMatch(t, want, result, "Wrong TabletHealth data") 229 230 // one tablet after receiving a StreamHealthResponse 231 shr := &querypb.StreamHealthResponse{ 232 TabletAlias: tablet.Alias, 233 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 234 Serving: true, 235 TabletExternallyReparentedTimestamp: 0, 236 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 237 } 238 want = &TabletHealth{ 239 Tablet: tablet, 240 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 241 Serving: true, 242 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 243 PrimaryTermStartTime: 0, 244 } 245 input <- shr 246 result = <-resultChan 247 mustMatch(t, want, result, "Wrong TabletHealth data") 248 249 // Stream error 250 fc.errCh <- fmt.Errorf("some stream error") 251 want = &TabletHealth{ 252 Tablet: tablet, 253 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 254 Serving: false, 255 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 256 PrimaryTermStartTime: 0, 257 LastError: fmt.Errorf("some stream error"), 258 } 259 result = <-resultChan 260 // TODO: figure out how to compare objects that contain errors using utils.MustMatch 261 assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result) 262 // tablet should be removed from healthy list 263 a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 264 assert.Empty(t, a, "wrong result, expected empty list") 265 } 266 267 // TestHealthCheckErrorOnPrimary is the same as TestHealthCheckStreamError except for tablet type 268 func TestHealthCheckErrorOnPrimary(t *testing.T) { 269 ts := memorytopo.NewServer("cell") 270 hc := createTestHc(ts) 271 defer hc.Close() 272 273 tablet := createTestTablet(0, "cell", "a") 274 input := make(chan *querypb.StreamHealthResponse) 275 resultChan := hc.Subscribe() 276 fc := createFakeConn(tablet, input) 277 fc.errCh = make(chan error) 278 hc.AddTablet(tablet) 279 280 // Immediately after AddTablet() there will be the first notification. 281 want := &TabletHealth{ 282 Tablet: tablet, 283 Target: &querypb.Target{Keyspace: "k", Shard: "s"}, 284 Serving: false, 285 PrimaryTermStartTime: 0, 286 } 287 result := <-resultChan 288 mustMatch(t, want, result, "Wrong TabletHealth data") 289 290 // one tablet after receiving a StreamHealthResponse 291 shr := &querypb.StreamHealthResponse{ 292 TabletAlias: tablet.Alias, 293 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 294 Serving: true, 295 TabletExternallyReparentedTimestamp: 10, 296 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 297 } 298 want = &TabletHealth{ 299 Tablet: tablet, 300 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 301 Serving: true, 302 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 303 PrimaryTermStartTime: 10, 304 } 305 input <- shr 306 result = <-resultChan 307 mustMatch(t, want, result, "Wrong TabletHealth data") 308 309 // Stream error 310 fc.errCh <- fmt.Errorf("some stream error") 311 want = &TabletHealth{ 312 Tablet: tablet, 313 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 314 Serving: false, 315 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 316 PrimaryTermStartTime: 10, 317 LastError: fmt.Errorf("some stream error"), 318 } 319 result = <-resultChan 320 // TODO: figure out how to compare objects that contain errors using utils.MustMatch 321 assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual: %v", want, result) 322 // tablet should be removed from healthy list 323 a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) 324 assert.Empty(t, a, "wrong result, expected empty list") 325 } 326 327 func TestHealthCheckErrorOnPrimaryAfterExternalReparent(t *testing.T) { 328 ts := memorytopo.NewServer("cell") 329 hc := createTestHc(ts) 330 defer hc.Close() 331 332 resultChan := hc.Subscribe() 333 334 tablet1 := createTestTablet(0, "cell", "a") 335 input1 := make(chan *querypb.StreamHealthResponse) 336 fc1 := createFakeConn(tablet1, input1) 337 fc1.errCh = make(chan error) 338 hc.AddTablet(tablet1) 339 <-resultChan 340 341 tablet2 := createTestTablet(1, "cell", "b") 342 tablet2.Type = topodatapb.TabletType_REPLICA 343 input2 := make(chan *querypb.StreamHealthResponse) 344 createFakeConn(tablet2, input2) 345 hc.AddTablet(tablet2) 346 <-resultChan 347 348 shr2 := &querypb.StreamHealthResponse{ 349 TabletAlias: tablet2.Alias, 350 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 351 Serving: true, 352 TabletExternallyReparentedTimestamp: 0, 353 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, 354 } 355 input2 <- shr2 356 <-resultChan 357 shr1 := &querypb.StreamHealthResponse{ 358 TabletAlias: tablet1.Alias, 359 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 360 Serving: true, 361 TabletExternallyReparentedTimestamp: 10, 362 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2}, 363 } 364 input1 <- shr1 365 <-resultChan 366 // tablet 1 is the primary now 367 health := []*TabletHealth{{ 368 Tablet: tablet1, 369 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 370 Serving: true, 371 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2}, 372 PrimaryTermStartTime: 10, 373 }} 374 a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) 375 mustMatch(t, health, a, "unexpected result") 376 377 shr2 = &querypb.StreamHealthResponse{ 378 TabletAlias: tablet2.Alias, 379 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 380 Serving: true, 381 TabletExternallyReparentedTimestamp: 20, 382 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2}, 383 } 384 input2 <- shr2 385 <-resultChan 386 // reparent: tablet 2 is the primary now 387 health = []*TabletHealth{{ 388 Tablet: tablet2, 389 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 390 Serving: true, 391 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2}, 392 PrimaryTermStartTime: 20, 393 }} 394 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) 395 mustMatch(t, health, a, "unexpected result") 396 397 // Stream error from tablet 1 398 fc1.errCh <- fmt.Errorf("some stream error") 399 <-resultChan 400 // tablet 2 should still be the primary 401 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) 402 mustMatch(t, health, a, "unexpected result") 403 } 404 405 func TestHealthCheckVerifiesTabletAlias(t *testing.T) { 406 ts := memorytopo.NewServer("cell") 407 hc := createTestHc(ts) 408 defer hc.Close() 409 410 tablet := createTestTablet(0, "cell", "a") 411 input := make(chan *querypb.StreamHealthResponse, 1) 412 fc := createFakeConn(tablet, input) 413 resultChan := hc.Subscribe() 414 415 hc.AddTablet(tablet) 416 417 // Immediately after AddTablet() there will be the first notification. 418 want := &TabletHealth{ 419 Tablet: tablet, 420 Target: &querypb.Target{Keyspace: "k", Shard: "s"}, 421 Serving: false, 422 PrimaryTermStartTime: 0, 423 } 424 result := <-resultChan 425 mustMatch(t, want, result, "Wrong TabletHealth data") 426 427 input <- &querypb.StreamHealthResponse{ 428 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 429 TabletAlias: &topodatapb.TabletAlias{Uid: 20, Cell: "cellb"}, 430 Serving: true, 431 TabletExternallyReparentedTimestamp: 10, 432 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 433 } 434 435 ticker := time.NewTicker(1 * time.Second) 436 select { 437 case err := <-fc.cbErrCh: 438 assert.Contains(t, err.Error(), "health stats mismatch", "wrong error") 439 case <-resultChan: 440 require.Fail(t, "StreamHealth should have returned a health stats mismatch error") 441 case <-ticker.C: 442 require.Fail(t, "Timed out waiting for StreamHealth to return a health stats mismatch error") 443 } 444 } 445 446 // TestHealthCheckCloseWaitsForGoRoutines tests that Close() waits for all Go 447 // routines to finish and the listener won't be called anymore. 448 func TestHealthCheckCloseWaitsForGoRoutines(t *testing.T) { 449 ts := memorytopo.NewServer("cell") 450 hc := createTestHc(ts) 451 tablet := createTestTablet(0, "cell", "a") 452 input := make(chan *querypb.StreamHealthResponse, 1) 453 createFakeConn(tablet, input) 454 resultChan := hc.Subscribe() 455 456 hc.AddTablet(tablet) 457 458 // Immediately after AddTablet() there will be the first notification. 459 want := &TabletHealth{ 460 Tablet: tablet, 461 Target: &querypb.Target{Keyspace: "k", Shard: "s"}, 462 Serving: false, 463 PrimaryTermStartTime: 0, 464 } 465 result := <-resultChan 466 mustMatch(t, want, result, "Wrong TabletHealth data") 467 468 // one tablet after receiving a StreamHealthResponse 469 shr := &querypb.StreamHealthResponse{ 470 TabletAlias: tablet.Alias, 471 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 472 Serving: true, 473 TabletExternallyReparentedTimestamp: 0, 474 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 475 } 476 want = &TabletHealth{ 477 Tablet: tablet, 478 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 479 Serving: true, 480 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 481 482 PrimaryTermStartTime: 0, 483 } 484 input <- shr 485 result = <-resultChan 486 mustMatch(t, want, result, "Wrong TabletHealth data") 487 488 // Change input to distinguish between stats sent before and after Close(). 489 shr.TabletExternallyReparentedTimestamp = 11 490 // Close the healthcheck. Tablet connections are closed asynchronously and 491 // Close() will block until all Go routines (one per connection) are done. 492 assert.Nil(t, hc.Close(), "Close returned error") 493 // Try to send more updates. They should be ignored and nothing should change 494 input <- shr 495 496 select { 497 case result = <-resultChan: 498 assert.Nil(t, result, "healthCheck still running after Close(): received result: %v", result) 499 case <-time.After(1 * time.Millisecond): 500 // No response after timeout. Success. 501 } 502 503 hc.mu.Lock() 504 defer hc.mu.Unlock() 505 assert.Nil(t, hc.healthByAlias, "health data should be nil") 506 } 507 508 func TestHealthCheckTimeout(t *testing.T) { 509 // reset counters 510 hcErrorCounters.ResetAll() 511 ts := memorytopo.NewServer("cell") 512 hc := createTestHc(ts) 513 hc.healthCheckTimeout = 500 * time.Millisecond 514 defer hc.Close() 515 tablet := createTestTablet(0, "cell", "a") 516 input := make(chan *querypb.StreamHealthResponse) 517 fc := createFakeConn(tablet, input) 518 resultChan := hc.Subscribe() 519 hc.AddTablet(tablet) 520 // Immediately after AddTablet() there will be the first notification. 521 want := &TabletHealth{ 522 Tablet: tablet, 523 Target: &querypb.Target{Keyspace: "k", Shard: "s"}, 524 Serving: false, 525 PrimaryTermStartTime: 0, 526 } 527 result := <-resultChan 528 mustMatch(t, want, result, "Wrong TabletHealth data") 529 530 // one tablet after receiving a StreamHealthResponse 531 shr := &querypb.StreamHealthResponse{ 532 TabletAlias: tablet.Alias, 533 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 534 Serving: true, 535 TabletExternallyReparentedTimestamp: 0, 536 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 537 } 538 want = &TabletHealth{ 539 Tablet: tablet, 540 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 541 Serving: true, 542 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 543 PrimaryTermStartTime: 0, 544 } 545 input <- shr 546 result = <-resultChan 547 mustMatch(t, want, result, "Wrong TabletHealth data") 548 assert.Nil(t, checkErrorCounter("k", "s", topodatapb.TabletType_REPLICA, 0)) 549 550 // wait for timeout period 551 time.Sleep(hc.healthCheckTimeout + 100*time.Millisecond) 552 t.Logf(`Sleep(1.1 * timeout)`) 553 result = <-resultChan 554 assert.False(t, result.Serving, "tabletHealthCheck: %+v; want not serving", result) 555 assert.Nil(t, checkErrorCounter("k", "s", topodatapb.TabletType_REPLICA, 1)) 556 assert.True(t, fc.isCanceled(), "StreamHealth should be canceled after timeout, but is not") 557 558 // tablet should be removed from healthy list 559 a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 560 assert.Empty(t, a, "wrong result, expected empty list") 561 562 // repeat the wait. It will timeout one more time trying to get the connection. 563 fc.resetCanceledFlag() 564 time.Sleep(hc.healthCheckTimeout) 565 566 result = <-resultChan 567 assert.False(t, result.Serving, "tabletHealthCheck: %+v; want not serving", result) 568 assert.Nil(t, checkErrorCounter("k", "s", topodatapb.TabletType_REPLICA, 2)) 569 assert.True(t, fc.isCanceled(), "StreamHealth should be canceled again after timeout, but is not") 570 571 // send a healthcheck response, it should be serving again 572 fc.resetCanceledFlag() 573 input <- shr 574 575 // wait for the exponential backoff to wear off and health monitoring to resume. 576 result = <-resultChan 577 mustMatch(t, want, result, "Wrong TabletHealth data") 578 } 579 580 func TestWaitForAllServingTablets(t *testing.T) { 581 ts := memorytopo.NewServer("cell") 582 hc := createTestHc(ts) 583 defer hc.Close() 584 tablet := createTestTablet(0, "cell", "a") 585 tablet.Type = topodatapb.TabletType_REPLICA 586 targets := []*querypb.Target{ 587 { 588 Keyspace: tablet.Keyspace, 589 Shard: tablet.Shard, 590 TabletType: tablet.Type, 591 }, 592 } 593 input := make(chan *querypb.StreamHealthResponse) 594 createFakeConn(tablet, input) 595 596 // create a channel and subscribe to healthcheck 597 resultChan := hc.Subscribe() 598 hc.AddTablet(tablet) 599 // there will be a first result, get and discard it 600 <-resultChan 601 // empty 602 ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) 603 defer cancel() 604 605 err := hc.WaitForAllServingTablets(ctx, targets) 606 assert.NotNil(t, err, "error should not be nil") 607 608 shr := &querypb.StreamHealthResponse{ 609 TabletAlias: tablet.Alias, 610 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 611 Serving: true, 612 TabletExternallyReparentedTimestamp: 0, 613 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 614 } 615 616 input <- shr 617 <-resultChan 618 // // check it's there 619 620 targets = []*querypb.Target{ 621 622 { 623 Keyspace: tablet.Keyspace, 624 Shard: tablet.Shard, 625 TabletType: tablet.Type, 626 }, 627 } 628 629 err = hc.WaitForAllServingTablets(ctx, targets) 630 assert.Nil(t, err, "error should be nil. Targets are found") 631 632 targets = []*querypb.Target{ 633 634 { 635 Keyspace: tablet.Keyspace, 636 Shard: tablet.Shard, 637 TabletType: tablet.Type, 638 }, 639 { 640 Keyspace: "newkeyspace", 641 Shard: tablet.Shard, 642 TabletType: tablet.Type, 643 }, 644 } 645 646 err = hc.WaitForAllServingTablets(ctx, targets) 647 assert.NotNil(t, err, "error should not be nil (there are no tablets on this keyspace") 648 649 targets = []*querypb.Target{ 650 651 { 652 Keyspace: tablet.Keyspace, 653 Shard: tablet.Shard, 654 TabletType: tablet.Type, 655 }, 656 { 657 Keyspace: "newkeyspace", 658 Shard: tablet.Shard, 659 TabletType: tablet.Type, 660 }, 661 } 662 663 KeyspacesToWatch = []string{tablet.Keyspace} 664 665 err = hc.WaitForAllServingTablets(ctx, targets) 666 assert.Nil(t, err, "error should be nil. Keyspace with no tablets is filtered") 667 668 KeyspacesToWatch = []string{} 669 } 670 671 // TestRemoveTablet tests the behavior when a tablet goes away. 672 func TestRemoveTablet(t *testing.T) { 673 ts := memorytopo.NewServer("cell") 674 hc := createTestHc(ts) 675 defer hc.Close() 676 tablet := createTestTablet(0, "cell", "a") 677 tablet.Type = topodatapb.TabletType_REPLICA 678 input := make(chan *querypb.StreamHealthResponse) 679 createFakeConn(tablet, input) 680 681 // create a channel and subscribe to healthcheck 682 resultChan := hc.Subscribe() 683 hc.AddTablet(tablet) 684 // there will be a first result, get and discard it 685 <-resultChan 686 687 shrReplica := &querypb.StreamHealthResponse{ 688 TabletAlias: tablet.Alias, 689 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 690 Serving: true, 691 TabletExternallyReparentedTimestamp: 0, 692 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 693 } 694 want := []*TabletHealth{{ 695 Tablet: tablet, 696 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 697 Serving: true, 698 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 699 PrimaryTermStartTime: 0, 700 }} 701 input <- shrReplica 702 <-resultChan 703 // check it's there 704 a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 705 mustMatch(t, want, a, "unexpected result") 706 707 // delete the tablet 708 hc.RemoveTablet(tablet) 709 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 710 assert.Empty(t, a, "wrong result, expected empty list") 711 712 // Now confirm that when a tablet's type changes between when it's added to the 713 // cache and when it's removed, that the tablet is entirely removed from the 714 // cache since in the secondary maps it's keyed in part by tablet type. 715 // Note: we are using GetTabletStats here to check the healthData map (rather 716 // than the healthy map that we checked above) because that is the data 717 // structure that is used when printing the contents of the healthcheck cache 718 // in the /debug/status endpoint and in the SHOW VITESS_TABLETS; SQL command 719 // output. 720 721 // Add the tablet back. 722 hc.AddTablet(tablet) 723 // Receive and discard the initial result as we have not yet sent the first 724 // StreamHealthResponse with the dynamic serving and stats information. 725 <-resultChan 726 // Send the first StreamHealthResponse with the dynamic serving and stats 727 // information. 728 input <- shrReplica 729 <-resultChan 730 // Confirm it's there in the cache. 731 a = hc.GetTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 732 mustMatch(t, want, a, "unexpected result") 733 734 // Change the tablet type to RDONLY. 735 tablet.Type = topodatapb.TabletType_RDONLY 736 shrRdonly := &querypb.StreamHealthResponse{ 737 TabletAlias: tablet.Alias, 738 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY}, 739 Serving: true, 740 TabletExternallyReparentedTimestamp: 0, 741 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 2, CpuUsage: 0.4}, 742 } 743 744 // Now Replace it, which does a Remove and Add. The tablet should be removed 745 // from the cache and all its maps even though the tablet type had changed 746 // in-between the initial Add and Remove. 747 hc.ReplaceTablet(tablet, tablet) 748 // Receive and discard the initial result as we have not yet sent the first 749 // StreamHealthResponse with the dynamic serving and stats information. 750 <-resultChan 751 // Confirm that the old entry is gone. 752 a = hc.GetTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 753 assert.Empty(t, a, "wrong result, expected empty list") 754 // Send the first StreamHealthResponse with the dynamic serving and stats 755 // information. 756 input <- shrRdonly 757 <-resultChan 758 // Confirm that the new entry is there in the cache. 759 want = []*TabletHealth{{ 760 Tablet: tablet, 761 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY}, 762 Serving: true, 763 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 2, CpuUsage: 0.4}, 764 PrimaryTermStartTime: 0, 765 }} 766 a = hc.GetTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY}) 767 mustMatch(t, want, a, "unexpected result") 768 769 // Delete the tablet, confirm again that it's gone in both tablet type 770 // forms. 771 hc.RemoveTablet(tablet) 772 a = hc.GetTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 773 assert.Empty(t, a, "wrong result, expected empty list") 774 a = hc.GetTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY}) 775 assert.Empty(t, a, "wrong result, expected empty list") 776 } 777 778 // TestGetHealthyTablets tests the functionality of GetHealthyTabletStats. 779 func TestGetHealthyTablets(t *testing.T) { 780 ts := memorytopo.NewServer("cell") 781 hc := createTestHc(ts) 782 defer hc.Close() 783 tablet := createTestTablet(0, "cell", "a") 784 tablet.Type = topodatapb.TabletType_REPLICA 785 input := make(chan *querypb.StreamHealthResponse) 786 createFakeConn(tablet, input) 787 788 // create a channel and subscribe to healthcheck 789 resultChan := hc.Subscribe() 790 hc.AddTablet(tablet) 791 // there will be a first result, get and discard it 792 <-resultChan 793 // empty 794 a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) 795 assert.Empty(t, a, "wrong result, expected empty list") 796 797 shr := &querypb.StreamHealthResponse{ 798 TabletAlias: tablet.Alias, 799 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 800 Serving: true, 801 TabletExternallyReparentedTimestamp: 0, 802 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 803 } 804 want := []*TabletHealth{{ 805 Tablet: tablet, 806 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 807 Serving: true, 808 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2}, 809 PrimaryTermStartTime: 0, 810 }} 811 input <- shr 812 <-resultChan 813 // check it's there 814 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 815 mustMatch(t, want, a, "unexpected result") 816 817 // update health with a change that won't change health array 818 shr = &querypb.StreamHealthResponse{ 819 TabletAlias: tablet.Alias, 820 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 821 Serving: true, 822 TabletExternallyReparentedTimestamp: 0, 823 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 2, CpuUsage: 0.2}, 824 } 825 input <- shr 826 // wait for result before checking 827 <-resultChan 828 // check it's there 829 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 830 mustMatch(t, want, a, "unexpected result") 831 832 // update stats with a change that will change health array 833 shr = &querypb.StreamHealthResponse{ 834 TabletAlias: tablet.Alias, 835 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 836 Serving: true, 837 TabletExternallyReparentedTimestamp: 0, 838 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 35, CpuUsage: 0.2}, 839 } 840 want = []*TabletHealth{{ 841 Tablet: tablet, 842 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 843 Serving: true, 844 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 35, CpuUsage: 0.2}, 845 PrimaryTermStartTime: 0, 846 }} 847 input <- shr 848 // wait for result before checking 849 <-resultChan 850 // check it's there 851 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 852 mustMatch(t, want, a, "unexpected result") 853 854 // add a second tablet 855 tablet2 := createTestTablet(11, "cell", "host2") 856 tablet2.Type = topodatapb.TabletType_REPLICA 857 input2 := make(chan *querypb.StreamHealthResponse) 858 createFakeConn(tablet2, input2) 859 hc.AddTablet(tablet2) 860 // there will be a first result, get and discard it 861 <-resultChan 862 863 shr2 := &querypb.StreamHealthResponse{ 864 TabletAlias: tablet2.Alias, 865 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 866 Serving: true, 867 TabletExternallyReparentedTimestamp: 0, 868 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, 869 } 870 want2 := []*TabletHealth{{ 871 Tablet: tablet, 872 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 873 Serving: true, 874 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 35, CpuUsage: 0.2}, 875 PrimaryTermStartTime: 0, 876 }, { 877 Tablet: tablet2, 878 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 879 Serving: true, 880 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, 881 PrimaryTermStartTime: 0, 882 }} 883 input2 <- shr2 884 // wait for result 885 <-resultChan 886 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 887 assert.Equal(t, 2, len(a), "Wrong number of results") 888 if a[0].Tablet.Alias.Uid == 11 { 889 a[0], a[1] = a[1], a[0] 890 } 891 mustMatch(t, want2, a, "unexpected result") 892 893 shr2 = &querypb.StreamHealthResponse{ 894 TabletAlias: tablet2.Alias, 895 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 896 Serving: false, 897 TabletExternallyReparentedTimestamp: 0, 898 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, 899 } 900 input2 <- shr2 901 // wait for result 902 <-resultChan 903 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 904 assert.Equal(t, 1, len(a), "Wrong number of results") 905 906 // second tablet turns into a primary 907 shr2 = &querypb.StreamHealthResponse{ 908 TabletAlias: tablet2.Alias, 909 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 910 Serving: true, 911 912 TabletExternallyReparentedTimestamp: 10, 913 914 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2}, 915 } 916 input2 <- shr2 917 // wait for result 918 <-resultChan 919 // check we only have 1 healthy replica left 920 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 921 mustMatch(t, want, a, "unexpected result") 922 923 want2 = []*TabletHealth{{ 924 Tablet: tablet2, 925 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 926 Serving: true, 927 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2}, 928 PrimaryTermStartTime: 10, 929 }} 930 // check we have a primary now 931 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) 932 mustMatch(t, want2, a, "unexpected result") 933 934 // reparent: old replica goes into primary 935 shr = &querypb.StreamHealthResponse{ 936 TabletAlias: tablet.Alias, 937 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 938 Serving: true, 939 TabletExternallyReparentedTimestamp: 20, 940 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2}, 941 } 942 input <- shr 943 <-resultChan 944 want = []*TabletHealth{{ 945 Tablet: tablet, 946 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 947 Serving: true, 948 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2}, 949 PrimaryTermStartTime: 20, 950 }} 951 952 // check we lost all replicas, and primary is new one 953 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 954 assert.Empty(t, a, "Wrong number of results") 955 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) 956 mustMatch(t, want, a, "unexpected result") 957 958 // old primary sending an old ping should be ignored 959 input2 <- shr2 960 <-resultChan 961 a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) 962 mustMatch(t, want, a, "unexpected result") 963 } 964 965 func TestPrimaryInOtherCell(t *testing.T) { 966 ts := memorytopo.NewServer("cell1", "cell2") 967 hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, "cell1", "cell1, cell2") 968 defer hc.Close() 969 970 // add a tablet as primary in different cell 971 tablet := createTestTablet(1, "cell2", "host1") 972 tablet.Type = topodatapb.TabletType_PRIMARY 973 input := make(chan *querypb.StreamHealthResponse) 974 fc := createFakeConn(tablet, input) 975 // create a channel and subscribe to healthcheck 976 resultChan := hc.Subscribe() 977 hc.AddTablet(tablet) 978 // should get a result, but this will hang if multi-cell logic is broken 979 // so wait and timeout 980 ticker := time.NewTicker(1 * time.Second) 981 select { 982 case err := <-fc.cbErrCh: 983 require.Fail(t, "Unexpected error: %v", err) 984 case <-resultChan: 985 case <-ticker.C: 986 require.Fail(t, "Timed out waiting for HealthCheck update") 987 } 988 989 shr := &querypb.StreamHealthResponse{ 990 TabletAlias: tablet.Alias, 991 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 992 Serving: true, 993 TabletExternallyReparentedTimestamp: 20, 994 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2}, 995 } 996 want := &TabletHealth{ 997 Tablet: tablet, 998 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}, 999 Serving: true, 1000 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2}, 1001 PrimaryTermStartTime: 20, 1002 } 1003 1004 input <- shr 1005 ticker = time.NewTicker(1 * time.Second) 1006 select { 1007 case err := <-fc.cbErrCh: 1008 require.Fail(t, "Unexpected error: %v", err) 1009 case got := <-resultChan: 1010 // check that we DO receive health check update for PRIMARY in other cell 1011 mustMatch(t, want, got, "Wrong TabletHealth data") 1012 case <-ticker.C: 1013 require.Fail(t, "Timed out waiting for HealthCheck update") 1014 } 1015 1016 // check that PRIMARY tablet from other cell IS in healthy tablet list 1017 a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY}) 1018 require.Len(t, a, 1, "") 1019 mustMatch(t, want, a[0], "Expecting healthy primary") 1020 } 1021 1022 func TestReplicaInOtherCell(t *testing.T) { 1023 ts := memorytopo.NewServer("cell1", "cell2") 1024 hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, "cell1", "cell1, cell2") 1025 defer hc.Close() 1026 1027 // add a tablet as replica 1028 local := createTestTablet(1, "cell1", "host1") 1029 local.Type = topodatapb.TabletType_REPLICA 1030 input := make(chan *querypb.StreamHealthResponse) 1031 fc := createFakeConn(local, input) 1032 // create a channel and subscribe to healthcheck 1033 resultChan := hc.Subscribe() 1034 hc.AddTablet(local) 1035 1036 ticker := time.NewTicker(1 * time.Second) 1037 select { 1038 case err := <-fc.cbErrCh: 1039 require.Fail(t, "Unexpected error: %v", err) 1040 case <-resultChan: 1041 case <-ticker.C: 1042 require.Fail(t, "Timed out waiting for HealthCheck update") 1043 } 1044 1045 shr := &querypb.StreamHealthResponse{ 1046 TabletAlias: local.Alias, 1047 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 1048 Serving: true, 1049 TabletExternallyReparentedTimestamp: 0, 1050 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, 1051 } 1052 want := &TabletHealth{ 1053 Tablet: local, 1054 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 1055 Serving: true, 1056 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, 1057 PrimaryTermStartTime: 0, 1058 } 1059 1060 input <- shr 1061 ticker = time.NewTicker(1 * time.Second) 1062 select { 1063 case err := <-fc.cbErrCh: 1064 require.Fail(t, "Unexpected error: %v", err) 1065 case got := <-resultChan: 1066 // check that we DO receive health check update for REPLICA in other cell 1067 mustMatch(t, want, got, "Wrong TabletHealth data") 1068 case <-ticker.C: 1069 require.Fail(t, "Timed out waiting for HealthCheck update") 1070 } 1071 1072 // add a tablet as replica in different cell 1073 remote := createTestTablet(2, "cell2", "host2") 1074 remote.Type = topodatapb.TabletType_REPLICA 1075 input2 := make(chan *querypb.StreamHealthResponse) 1076 fc2 := createFakeConn(remote, input2) 1077 // create a channel and subscribe to healthcheck 1078 resultChan2 := hc.Subscribe() 1079 hc.AddTablet(remote) 1080 // should get a result, but this will hang if multi-cell logic is broken 1081 // so wait and timeout 1082 ticker = time.NewTicker(1 * time.Second) 1083 select { 1084 case err := <-fc2.cbErrCh: 1085 require.Fail(t, "Unexpected error: %v", err) 1086 case <-resultChan2: 1087 case <-ticker.C: 1088 require.Fail(t, "Timed out waiting for HealthCheck update") 1089 } 1090 1091 shr2 := &querypb.StreamHealthResponse{ 1092 TabletAlias: remote.Alias, 1093 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 1094 Serving: true, 1095 TabletExternallyReparentedTimestamp: 0, 1096 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, 1097 } 1098 want2 := &TabletHealth{ 1099 Tablet: remote, 1100 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 1101 Serving: true, 1102 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, 1103 PrimaryTermStartTime: 0, 1104 } 1105 1106 input2 <- shr2 1107 ticker = time.NewTicker(1 * time.Second) 1108 select { 1109 case err := <-fc.cbErrCh: 1110 require.Fail(t, "Unexpected error: %v", err) 1111 case got := <-resultChan2: 1112 // check that we DO receive health check update for REPLICA in other cell 1113 mustMatch(t, want2, got, "Wrong TabletHealth data") 1114 case <-ticker.C: 1115 require.Fail(t, "Timed out waiting for HealthCheck update") 1116 } 1117 1118 // check that only REPLICA tablet from cell1 is in healthy tablet list 1119 a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 1120 require.Len(t, a, 1, "") 1121 mustMatch(t, want, a[0], "Expecting healthy local replica") 1122 } 1123 1124 func TestCellAliases(t *testing.T) { 1125 ts := memorytopo.NewServer("cell1", "cell2") 1126 hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, "cell1", "cell1, cell2") 1127 defer hc.Close() 1128 1129 cellsAlias := &topodatapb.CellsAlias{ 1130 Cells: []string{"cell1", "cell2"}, 1131 } 1132 assert.Nil(t, ts.CreateCellsAlias(context.Background(), "region1", cellsAlias), "failed to create cell alias") 1133 defer deleteCellsAlias(t, ts, "region1") 1134 1135 // add a tablet as replica in diff cell, same region 1136 tablet := createTestTablet(1, "cell2", "host2") 1137 tablet.Type = topodatapb.TabletType_REPLICA 1138 input := make(chan *querypb.StreamHealthResponse) 1139 fc := createFakeConn(tablet, input) 1140 // create a channel and subscribe to healthcheck 1141 resultChan := hc.Subscribe() 1142 hc.AddTablet(tablet) 1143 // should get a result, but this will hang if cell alias logic is broken 1144 // so wait and timeout 1145 ticker := time.NewTicker(1 * time.Second) 1146 select { 1147 case err := <-fc.cbErrCh: 1148 require.Fail(t, "Unexpected error: %v", err) 1149 case <-resultChan: 1150 case <-ticker.C: 1151 require.Fail(t, "Timed out waiting for HealthCheck update") 1152 } 1153 1154 shr := &querypb.StreamHealthResponse{ 1155 TabletAlias: tablet.Alias, 1156 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 1157 Serving: true, 1158 TabletExternallyReparentedTimestamp: 0, 1159 RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, 1160 } 1161 want := []*TabletHealth{{ 1162 Tablet: tablet, 1163 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 1164 Serving: true, 1165 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2}, 1166 PrimaryTermStartTime: 0, 1167 }} 1168 1169 input <- shr 1170 ticker = time.NewTicker(1 * time.Second) 1171 select { 1172 case err := <-fc.cbErrCh: 1173 require.Fail(t, "Unexpected error: %v", err) 1174 case <-resultChan: 1175 case <-ticker.C: 1176 require.Fail(t, "Timed out waiting for HealthCheck update") 1177 } 1178 1179 // check it's there 1180 a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}) 1181 mustMatch(t, want, a, "Wrong TabletHealth data") 1182 } 1183 1184 func TestHealthCheckChecksGrpcPort(t *testing.T) { 1185 ts := memorytopo.NewServer("cell") 1186 hc := createTestHc(ts) 1187 defer hc.Close() 1188 1189 tablet := createTestTablet(0, "cell", "a") 1190 tablet.PortMap["grpc"] = 0 1191 resultChan := hc.Subscribe() 1192 1193 // AddTablet should not add the tablet because port is 0 1194 hc.AddTablet(tablet) 1195 1196 select { 1197 case result := <-resultChan: 1198 assert.Nil(t, result, "healthCheck received result: %v", result) 1199 case <-time.After(2 * time.Millisecond): 1200 // No response after timeout. Success. 1201 } 1202 } 1203 1204 func TestTemplate(t *testing.T) { 1205 TabletURLTemplateString = "http://{{.GetTabletHostPort}}" 1206 ParseTabletURLTemplateFromFlag() 1207 1208 tablet := topo.NewTablet(0, "cell", "a") 1209 ts := []*TabletHealth{ 1210 { 1211 Tablet: tablet, 1212 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 1213 Serving: false, 1214 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3}, 1215 PrimaryTermStartTime: 0, 1216 }, 1217 } 1218 tcs := &TabletsCacheStatus{ 1219 Cell: "cell", 1220 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 1221 TabletsStats: ts, 1222 } 1223 templ := template.New("").Funcs(status.StatusFuncs) 1224 templ, err := templ.Parse(HealthCheckTemplate) 1225 require.Nil(t, err, "error parsing template: %v", err) 1226 wr := &bytes.Buffer{} 1227 err = templ.Execute(wr, []*TabletsCacheStatus{tcs}) 1228 require.Nil(t, err, "error executing template: %v", err) 1229 } 1230 1231 func TestDebugURLFormatting(t *testing.T) { 1232 TabletURLTemplateString = "https://{{.GetHostNameLevel 0}}.bastion.{{.Tablet.Alias.Cell}}.corp" 1233 ParseTabletURLTemplateFromFlag() 1234 1235 tablet := topo.NewTablet(0, "cell", "host.dc.domain") 1236 ts := []*TabletHealth{ 1237 { 1238 Tablet: tablet, 1239 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 1240 Serving: false, 1241 Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3}, 1242 PrimaryTermStartTime: 0, 1243 }, 1244 } 1245 tcs := &TabletsCacheStatus{ 1246 Cell: "cell", 1247 Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA}, 1248 TabletsStats: ts, 1249 } 1250 templ := template.New("").Funcs(status.StatusFuncs) 1251 templ, err := templ.Parse(HealthCheckTemplate) 1252 require.Nil(t, err, "error parsing template") 1253 wr := &bytes.Buffer{} 1254 err = templ.Execute(wr, []*TabletsCacheStatus{tcs}) 1255 require.Nil(t, err, "error executing template") 1256 expectedURL := `"https://host.bastion.cell.corp"` 1257 require.Contains(t, wr.String(), expectedURL, "output missing formatted URL") 1258 } 1259 1260 func tabletDialer(tablet *topodatapb.Tablet, _ grpcclient.FailFast) (queryservice.QueryService, error) { 1261 connMapMu.Lock() 1262 defer connMapMu.Unlock() 1263 1264 key := TabletToMapKey(tablet) 1265 if qs, ok := connMap[key]; ok { 1266 return qs, nil 1267 } 1268 return nil, fmt.Errorf("tablet %v not found", key) 1269 } 1270 1271 func createTestHc(ts *topo.Server) *HealthCheckImpl { 1272 return NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, "cell", "") 1273 } 1274 1275 type fakeConn struct { 1276 queryservice.QueryService 1277 tablet *topodatapb.Tablet 1278 // If fixedResult is set, the channels are not used. 1279 fixedResult *querypb.StreamHealthResponse 1280 // hcChan should be an unbuffered channel which holds the tablet's next health response. 1281 hcChan chan *querypb.StreamHealthResponse 1282 // errCh is either an unbuffered channel which holds the stream error to return, or nil. 1283 errCh chan error 1284 // cbErrCh is a channel which receives errors returned from the supplied callback. 1285 cbErrCh chan error 1286 1287 mu sync.Mutex 1288 canceled bool 1289 } 1290 1291 func createFakeConn(tablet *topodatapb.Tablet, c chan *querypb.StreamHealthResponse) *fakeConn { 1292 connMapMu.Lock() 1293 defer connMapMu.Unlock() 1294 key := TabletToMapKey(tablet) 1295 conn := &fakeConn{ 1296 QueryService: fakes.ErrorQueryService, 1297 tablet: tablet, 1298 hcChan: c, 1299 cbErrCh: make(chan error, 1), 1300 } 1301 connMap[key] = conn 1302 return conn 1303 } 1304 1305 // StreamHealth implements queryservice.QueryService. 1306 func (fc *fakeConn) StreamHealth(ctx context.Context, callback func(shr *querypb.StreamHealthResponse) error) error { 1307 if fc.fixedResult != nil { 1308 return callback(fc.fixedResult) 1309 } 1310 for { 1311 select { 1312 case shr := <-fc.hcChan: 1313 if err := callback(shr); err != nil { 1314 if err == io.EOF { 1315 return nil 1316 } 1317 select { 1318 case fc.cbErrCh <- err: 1319 case <-ctx.Done(): 1320 } 1321 return err 1322 } 1323 case err := <-fc.errCh: 1324 return err 1325 case <-ctx.Done(): 1326 fc.mu.Lock() 1327 fc.canceled = true 1328 fc.mu.Unlock() 1329 return nil 1330 } 1331 } 1332 } 1333 1334 func (fc *fakeConn) isCanceled() bool { 1335 fc.mu.Lock() 1336 defer fc.mu.Unlock() 1337 return fc.canceled 1338 } 1339 1340 func (fc *fakeConn) resetCanceledFlag() { 1341 fc.mu.Lock() 1342 defer fc.mu.Unlock() 1343 fc.canceled = false 1344 } 1345 1346 func checkErrorCounter(keyspace, shard string, tabletType topodatapb.TabletType, want int64) error { 1347 statsKey := []string{keyspace, shard, topoproto.TabletTypeLString(tabletType)} 1348 name := strings.Join(statsKey, ".") 1349 got, ok := hcErrorCounters.Counts()[name] 1350 if !ok { 1351 return fmt.Errorf("hcErrorCounters not correctly initialized") 1352 } 1353 if got != want { 1354 return fmt.Errorf("wrong value for hcErrorCounters got = %v, want = %v", got, want) 1355 } 1356 return nil 1357 } 1358 1359 func createFixedHealthConn(tablet *topodatapb.Tablet, fixedResult *querypb.StreamHealthResponse) *fakeConn { 1360 key := TabletToMapKey(tablet) 1361 conn := &fakeConn{ 1362 QueryService: fakes.ErrorQueryService, 1363 tablet: tablet, 1364 fixedResult: fixedResult, 1365 } 1366 connMapMu.Lock() 1367 defer connMapMu.Unlock() 1368 connMap[key] = conn 1369 return conn 1370 } 1371 1372 func createTestTablet(uid uint32, cell, host string) *topodatapb.Tablet { 1373 tablet := topo.NewTablet(uid, cell, host) 1374 tablet.PortMap["vt"] = 1 1375 tablet.PortMap["grpc"] = 2 1376 tablet.Keyspace = "k" 1377 tablet.Shard = "s" 1378 return tablet 1379 } 1380 1381 var mustMatch = utils.MustMatchFn(".Conn" /* ignored fields*/) 1382 1383 func deleteCellsAlias(t *testing.T, ts *topo.Server, alias string) { 1384 if err := ts.DeleteCellsAlias(context.Background(), alias); err != nil { 1385 t.Logf("DeleteCellsAlias(%s) failed: %v", alias, err) 1386 } 1387 }