vitess.io/vitess@v0.16.2/go/vt/discovery/healthcheck_test.go (about)

     1  /*
     2  Copyright 2019 The Vitess Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package discovery
    18  
    19  import (
    20  	"bytes"
    21  	"context"
    22  	"fmt"
    23  	"html/template"
    24  	"io"
    25  	"strings"
    26  	"sync"
    27  	"testing"
    28  	"time"
    29  
    30  	"github.com/stretchr/testify/assert"
    31  	"github.com/stretchr/testify/require"
    32  
    33  	"vitess.io/vitess/go/test/utils"
    34  	"vitess.io/vitess/go/vt/grpcclient"
    35  	"vitess.io/vitess/go/vt/status"
    36  	"vitess.io/vitess/go/vt/topo"
    37  	"vitess.io/vitess/go/vt/topo/memorytopo"
    38  	"vitess.io/vitess/go/vt/topo/topoproto"
    39  	"vitess.io/vitess/go/vt/vttablet/queryservice"
    40  	"vitess.io/vitess/go/vt/vttablet/queryservice/fakes"
    41  	"vitess.io/vitess/go/vt/vttablet/tabletconn"
    42  	"vitess.io/vitess/go/vt/vttablet/tabletconntest"
    43  
    44  	querypb "vitess.io/vitess/go/vt/proto/query"
    45  	topodatapb "vitess.io/vitess/go/vt/proto/topodata"
    46  )
    47  
        // Shared state for the fake tablet connections used by the tests in this file.
    48  var (
        	// connMap is allocated in init(). NOTE(review): presumably it maps a tablet
        	// identifier to the fakeConn handed out by tabletDialer — confirm at the
        	// definitions of createFakeConn/tabletDialer.
    49  	connMap   map[string]*fakeConn
        	// NOTE(review): presumably guards connMap; no lock/unlock is visible in this part of the file.
    50  	connMapMu sync.Mutex
    51  )
    52  
    53  func testChecksum(t *testing.T, want, got int64) {
    54  	t.Helper()
    55  	if want != got {
    56  		t.Errorf("want checksum %v, got %v", want, got)
    57  	}
    58  }
    59  
        // init prepares the package-level test fixtures: it registers tabletDialer
        // under the "fake_gateway" protocol name, selects that protocol through
        // tabletconntest.SetProtocol, allocates connMap and sets refreshInterval to
        // one minute.
    60  func init() {
    61  	tabletconn.RegisterDialer("fake_gateway", tabletDialer)
    62  	tabletconntest.SetProtocol("go.vt.discovery.healthcheck_test", "fake_gateway")
    63  	connMap = make(map[string]*fakeConn)
        	// NOTE(review): refreshInterval is declared outside this file section;
        	// presumably a long interval keeps periodic refreshes from firing during tests — confirm.
    64  	refreshInterval = time.Minute
    65  }
    66  
        // TestHealthCheck feeds a single fake tablet a series of
        // StreamHealthResponse messages and, after each one, checks the TabletHealth
        // delivered to a subscriber together with the value of hc.stateChecksum().
        // It also checks hc.CacheStatus() once and verifies that the checksum goes
        // back to 0 after the tablet is deleted.
    67  func TestHealthCheck(t *testing.T) {
    68  	// reset error counters
    69  	hcErrorCounters.ResetAll()
    70  	ts := memorytopo.NewServer("cell")
    71  	hc := createTestHc(ts)
    72  	// close healthcheck
    73  	defer hc.Close()
    74  	tablet := createTestTablet(0, "cell", "a")
    75  	tablet.Type = topodatapb.TabletType_REPLICA
        	// input is unbuffered: every send below blocks until the fake connection receives it.
    76  	input := make(chan *querypb.StreamHealthResponse)
    77  	conn := createFakeConn(tablet, input)
    78  
    79  	// create a channel and subscribe to healthcheck
    80  	resultChan := hc.Subscribe()
        	// With no tablets added the checksum is expected to be 0.
    81  	testChecksum(t, 0, hc.stateChecksum())
    82  	hc.AddTablet(tablet)
    83  	testChecksum(t, 1027934207, hc.stateChecksum())
    84  
    85  	// Immediately after AddTablet() there will be the first notification.
    86  	want := &TabletHealth{
    87  		Tablet:               tablet,
    88  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
    89  		Serving:              false,
    90  		Stats:                nil,
    91  		PrimaryTermStartTime: 0,
    92  	}
    93  	result := <-resultChan
    94  	mustMatch(t, want, result, "Wrong TabletHealth data")
    95  
        	// First real health response: the tablet reports itself as a serving REPLICA.
    96  	shr := &querypb.StreamHealthResponse{
    97  		TabletAlias: tablet.Alias,
    98  		Target:      &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
    99  		Serving:     true,
   100  
   101  		TabletExternallyReparentedTimestamp: 0,
   102  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
   103  	}
   104  	input <- shr
   105  	result = <-resultChan
   106  	want = &TabletHealth{
   107  		Tablet:               tablet,
   108  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   109  		Serving:              true,
   110  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
   111  		PrimaryTermStartTime: 0,
   112  	}
   113  	// the notification must carry the serving state and stats of the response just sent
   114  	mustMatch(t, want, result, "Wrong TabletHealth data")
   115  
        	// The cache status should list exactly one entry for cell "cell" holding this tablet.
   116  	tcsl := hc.CacheStatus()
   117  	tcslWant := TabletsCacheStatusList{{
   118  		Cell:   "cell",
   119  		Target: want.Target,
   120  		TabletsStats: TabletStatsList{{
   121  			Tablet:               tablet,
   122  			Target:               want.Target,
   123  			Serving:              true,
   124  			Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
   125  			PrimaryTermStartTime: 0,
   126  		}},
   127  	}}
   128  	// we can't use assert.Equal here because of the special way we want to compare equality
   129  	assert.True(t, tcslWant.deepEqual(tcsl), "Incorrect cache status:\n Expected: %+v\n Actual:   %+v", tcslWant[0], tcsl[0])
   130  	testChecksum(t, 3487343103, hc.stateChecksum())
   131  
   132  	// TabletType changed from REPLICA to PRIMARY.
        	// NOTE(review): only a single notification is read from resultChan for this change.
   133  	shr = &querypb.StreamHealthResponse{
   134  		TabletAlias:                         tablet.Alias,
   135  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   136  		Serving:                             true,
   137  		TabletExternallyReparentedTimestamp: 10,
   138  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   139  	}
        	// NOTE(review): this is the only expectation in the test that sets Conn;
        	// mustMatch is defined elsewhere and presumably ignores that field — confirm.
   140  	want = &TabletHealth{
   141  		Tablet: tablet,
   142  		Target: &querypb.Target{
   143  			Keyspace:   "k",
   144  			Shard:      "s",
   145  			TabletType: topodatapb.TabletType_PRIMARY,
   146  		},
   147  		Serving:              true,
   148  		Conn:                 conn,
   149  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   150  		PrimaryTermStartTime: 10,
   151  	}
   152  	input <- shr
   153  	result = <-resultChan
   154  
   155  	mustMatch(t, want, result, "Wrong TabletHealth data")
   156  	testChecksum(t, 1560849771, hc.stateChecksum())
   157  
        	// checkErrorCounter is defined outside this section; it is called here with
        	// an expected count of 0 for the k/s/PRIMARY target.
   158  	err := checkErrorCounter("k", "s", topodatapb.TabletType_PRIMARY, 0)
   159  	require.NoError(t, err, "error checking error counter")
   160  
   161  	// Serving & RealtimeStats changed
   162  	shr = &querypb.StreamHealthResponse{
   163  		TabletAlias:                         tablet.Alias,
   164  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   165  		Serving:                             false,
   166  		TabletExternallyReparentedTimestamp: 0,
   167  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3},
   168  	}
   169  	want = &TabletHealth{
   170  		Tablet:               tablet,
   171  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   172  		Serving:              false,
   173  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3},
   174  		PrimaryTermStartTime: 0,
   175  	}
   176  	input <- shr
   177  	result = <-resultChan
   178  	mustMatch(t, want, result, "Wrong TabletHealth data")
        	// Same checksum value as right after AddTablet (non-serving REPLICA).
   179  	testChecksum(t, 1027934207, hc.stateChecksum())
   180  
   181  	// HealthError
        	// The response says Serving: true, but it carries a HealthError; the expected
        	// TabletHealth is non-serving and has LastError set.
   182  	shr = &querypb.StreamHealthResponse{
   183  		TabletAlias:                         tablet.Alias,
   184  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   185  		Serving:                             true,
   186  		TabletExternallyReparentedTimestamp: 0,
   187  		RealtimeStats:                       &querypb.RealtimeStats{HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3},
   188  	}
   189  	want = &TabletHealth{
   190  		Tablet:               tablet,
   191  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   192  		Serving:              false,
   193  		Stats:                &querypb.RealtimeStats{HealthError: "some error", ReplicationLagSeconds: 1, CpuUsage: 0.3},
   194  		PrimaryTermStartTime: 0,
   195  		LastError:            fmt.Errorf("vttablet error: some error"),
   196  	}
   197  	input <- shr
   198  	result = <-resultChan
   199  	// TODO: figure out how to compare objects that contain errors using utils.MustMatch
   200  	assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual:   %v", want, result)
   201  	testChecksum(t, 1027934207, hc.stateChecksum()) // unchanged
   202  
   203  	// remove tablet
   204  	hc.deleteTablet(tablet)
   205  	testChecksum(t, 0, hc.stateChecksum())
   206  }
   207  
        // TestHealthCheckStreamError checks that an error sent on the fake
        // connection's errCh results in a non-serving TabletHealth carrying that
        // error in LastError, and that GetHealthyTabletStats then returns nothing
        // for the tablet's REPLICA target.
   208  func TestHealthCheckStreamError(t *testing.T) {
   209  	ts := memorytopo.NewServer("cell")
   210  	hc := createTestHc(ts)
   211  	defer hc.Close()
   212  
   213  	tablet := createTestTablet(0, "cell", "a")
   214  	input := make(chan *querypb.StreamHealthResponse)
   215  	resultChan := hc.Subscribe()
   216  	fc := createFakeConn(tablet, input)
        	// errCh is unbuffered, so the send further down blocks until the fake connection picks it up.
   217  	fc.errCh = make(chan error)
   218  	hc.AddTablet(tablet)
   219  
   220  	// Immediately after AddTablet() there will be the first notification.
        	// tablet.Type is not set in this test, so the expected Target has no TabletType.
   221  	want := &TabletHealth{
   222  		Tablet:               tablet,
   223  		Target:               &querypb.Target{Keyspace: "k", Shard: "s"},
   224  		Serving:              false,
   225  		PrimaryTermStartTime: 0,
   226  	}
   227  	result := <-resultChan
   228  	mustMatch(t, want, result, "Wrong TabletHealth data")
   229  
   230  	// one tablet after receiving a StreamHealthResponse
   231  	shr := &querypb.StreamHealthResponse{
   232  		TabletAlias:                         tablet.Alias,
   233  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   234  		Serving:                             true,
   235  		TabletExternallyReparentedTimestamp: 0,
   236  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   237  	}
   238  	want = &TabletHealth{
   239  		Tablet:               tablet,
   240  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   241  		Serving:              true,
   242  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   243  		PrimaryTermStartTime: 0,
   244  	}
   245  	input <- shr
   246  	result = <-resultChan
   247  	mustMatch(t, want, result, "Wrong TabletHealth data")
   248  
   249  	// Stream error
        	// The expected health keeps the last Target and Stats but flips Serving to
        	// false and records the stream error.
   250  	fc.errCh <- fmt.Errorf("some stream error")
   251  	want = &TabletHealth{
   252  		Tablet:               tablet,
   253  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   254  		Serving:              false,
   255  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   256  		PrimaryTermStartTime: 0,
   257  		LastError:            fmt.Errorf("some stream error"),
   258  	}
   259  	result = <-resultChan
   260  	// TODO: figure out how to compare objects that contain errors using utils.MustMatch
   261  	assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual:   %v", want, result)
   262  	// tablet should be removed from healthy list
   263  	a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   264  	assert.Empty(t, a, "wrong result, expected empty list")
   265  }
   266  
   267  // TestHealthCheckErrorOnPrimary is the same as TestHealthCheckStreamError except for tablet type
        // (PRIMARY instead of REPLICA) and a non-zero TabletExternallyReparentedTimestamp,
        // which is expected to show up as PrimaryTermStartTime and to survive the stream error.
   268  func TestHealthCheckErrorOnPrimary(t *testing.T) {
   269  	ts := memorytopo.NewServer("cell")
   270  	hc := createTestHc(ts)
   271  	defer hc.Close()
   272  
   273  	tablet := createTestTablet(0, "cell", "a")
   274  	input := make(chan *querypb.StreamHealthResponse)
   275  	resultChan := hc.Subscribe()
   276  	fc := createFakeConn(tablet, input)
        	// errCh is unbuffered, so the send further down blocks until the fake connection picks it up.
   277  	fc.errCh = make(chan error)
   278  	hc.AddTablet(tablet)
   279  
   280  	// Immediately after AddTablet() there will be the first notification.
   281  	want := &TabletHealth{
   282  		Tablet:               tablet,
   283  		Target:               &querypb.Target{Keyspace: "k", Shard: "s"},
   284  		Serving:              false,
   285  		PrimaryTermStartTime: 0,
   286  	}
   287  	result := <-resultChan
   288  	mustMatch(t, want, result, "Wrong TabletHealth data")
   289  
   290  	// one tablet after receiving a StreamHealthResponse
   291  	shr := &querypb.StreamHealthResponse{
   292  		TabletAlias:                         tablet.Alias,
   293  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   294  		Serving:                             true,
   295  		TabletExternallyReparentedTimestamp: 10,
   296  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   297  	}
   298  	want = &TabletHealth{
   299  		Tablet:               tablet,
   300  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   301  		Serving:              true,
   302  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   303  		PrimaryTermStartTime: 10,
   304  	}
   305  	input <- shr
   306  	result = <-resultChan
   307  	mustMatch(t, want, result, "Wrong TabletHealth data")
   308  
   309  	// Stream error
        	// Expected: same Target, Stats and PrimaryTermStartTime as before, but
        	// Serving is false and LastError holds the stream error.
   310  	fc.errCh <- fmt.Errorf("some stream error")
   311  	want = &TabletHealth{
   312  		Tablet:               tablet,
   313  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   314  		Serving:              false,
   315  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   316  		PrimaryTermStartTime: 10,
   317  		LastError:            fmt.Errorf("some stream error"),
   318  	}
   319  	result = <-resultChan
   320  	// TODO: figure out how to compare objects that contain errors using utils.MustMatch
   321  	assert.True(t, want.DeepEqual(result), "Wrong TabletHealth data\n Expected: %v\n Actual:   %v", want, result)
   322  	// tablet should be removed from healthy list
   323  	a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
   324  	assert.Empty(t, a, "wrong result, expected empty list")
   325  }
   326  
        // TestHealthCheckErrorOnPrimaryAfterExternalReparent uses two fake tablets.
        // Tablet 1 first reports PRIMARY (timestamp 10); tablet 2 then reports PRIMARY
        // with a newer timestamp (20). The test checks that GetHealthyTabletStats for
        // the PRIMARY target returns only tablet 2 after that, and still returns only
        // tablet 2 after a stream error is injected on tablet 1's connection.
   327  func TestHealthCheckErrorOnPrimaryAfterExternalReparent(t *testing.T) {
   328  	ts := memorytopo.NewServer("cell")
   329  	hc := createTestHc(ts)
   330  	defer hc.Close()
   331  
   332  	resultChan := hc.Subscribe()
   333  
   334  	tablet1 := createTestTablet(0, "cell", "a")
   335  	input1 := make(chan *querypb.StreamHealthResponse)
   336  	fc1 := createFakeConn(tablet1, input1)
   337  	fc1.errCh = make(chan error)
   338  	hc.AddTablet(tablet1)
        	// Each bare receive from resultChan in this test drains one notification
        	// without inspecting it; state is verified through GetHealthyTabletStats instead.
   339  	<-resultChan
   340  
   341  	tablet2 := createTestTablet(1, "cell", "b")
   342  	tablet2.Type = topodatapb.TabletType_REPLICA
   343  	input2 := make(chan *querypb.StreamHealthResponse)
   344  	createFakeConn(tablet2, input2)
   345  	hc.AddTablet(tablet2)
   346  	<-resultChan
   347  
        	// Tablet 2 starts out as a serving REPLICA.
   348  	shr2 := &querypb.StreamHealthResponse{
   349  		TabletAlias:                         tablet2.Alias,
   350  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   351  		Serving:                             true,
   352  		TabletExternallyReparentedTimestamp: 0,
   353  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
   354  	}
   355  	input2 <- shr2
   356  	<-resultChan
        	// Tablet 1 reports PRIMARY with reparent timestamp 10.
   357  	shr1 := &querypb.StreamHealthResponse{
   358  		TabletAlias:                         tablet1.Alias,
   359  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   360  		Serving:                             true,
   361  		TabletExternallyReparentedTimestamp: 10,
   362  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2},
   363  	}
   364  	input1 <- shr1
   365  	<-resultChan
   366  	// tablet 1 is the primary now
   367  	health := []*TabletHealth{{
   368  		Tablet:               tablet1,
   369  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   370  		Serving:              true,
   371  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2},
   372  		PrimaryTermStartTime: 10,
   373  	}}
   374  	a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
   375  	mustMatch(t, health, a, "unexpected result")
   376  
        	// Tablet 2 now reports PRIMARY with a newer reparent timestamp (20 > 10).
   377  	shr2 = &querypb.StreamHealthResponse{
   378  		TabletAlias:                         tablet2.Alias,
   379  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   380  		Serving:                             true,
   381  		TabletExternallyReparentedTimestamp: 20,
   382  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2},
   383  	}
   384  	input2 <- shr2
   385  	<-resultChan
   386  	// reparent: tablet 2 is the primary now
   387  	health = []*TabletHealth{{
   388  		Tablet:               tablet2,
   389  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   390  		Serving:              true,
   391  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2},
   392  		PrimaryTermStartTime: 20,
   393  	}}
   394  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
   395  	mustMatch(t, health, a, "unexpected result")
   396  
   397  	// Stream error from tablet 1
   398  	fc1.errCh <- fmt.Errorf("some stream error")
   399  	<-resultChan
   400  	// tablet 2 should still be the primary
        	// (health is unchanged from the previous assertion on purpose).
   401  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
   402  	mustMatch(t, health, a, "unexpected result")
   403  }
   404  
   405  func TestHealthCheckVerifiesTabletAlias(t *testing.T) {
   406  	ts := memorytopo.NewServer("cell")
   407  	hc := createTestHc(ts)
   408  	defer hc.Close()
   409  
   410  	tablet := createTestTablet(0, "cell", "a")
   411  	input := make(chan *querypb.StreamHealthResponse, 1)
   412  	fc := createFakeConn(tablet, input)
   413  	resultChan := hc.Subscribe()
   414  
   415  	hc.AddTablet(tablet)
   416  
   417  	// Immediately after AddTablet() there will be the first notification.
   418  	want := &TabletHealth{
   419  		Tablet:               tablet,
   420  		Target:               &querypb.Target{Keyspace: "k", Shard: "s"},
   421  		Serving:              false,
   422  		PrimaryTermStartTime: 0,
   423  	}
   424  	result := <-resultChan
   425  	mustMatch(t, want, result, "Wrong TabletHealth data")
   426  
   427  	input <- &querypb.StreamHealthResponse{
   428  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   429  		TabletAlias:                         &topodatapb.TabletAlias{Uid: 20, Cell: "cellb"},
   430  		Serving:                             true,
   431  		TabletExternallyReparentedTimestamp: 10,
   432  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   433  	}
   434  
   435  	ticker := time.NewTicker(1 * time.Second)
   436  	select {
   437  	case err := <-fc.cbErrCh:
   438  		assert.Contains(t, err.Error(), "health stats mismatch", "wrong error")
   439  	case <-resultChan:
   440  		require.Fail(t, "StreamHealth should have returned a health stats mismatch error")
   441  	case <-ticker.C:
   442  		require.Fail(t, "Timed out waiting for StreamHealth to return a health stats mismatch error")
   443  	}
   444  }
   445  
   446  // TestHealthCheckCloseWaitsForGoRoutines tests that Close() waits for all Go
   447  // routines to finish and the listener won't be called anymore.
        // After Close() it sends one more response and expects no notification on
        // the subscriber channel, and it expects hc.healthByAlias to be nil.
   448  func TestHealthCheckCloseWaitsForGoRoutines(t *testing.T) {
   449  	ts := memorytopo.NewServer("cell")
   450  	hc := createTestHc(ts)
   451  	tablet := createTestTablet(0, "cell", "a")
        	// Buffered (capacity 1) so the send issued after Close() below cannot block the test.
   452  	input := make(chan *querypb.StreamHealthResponse, 1)
   453  	createFakeConn(tablet, input)
   454  	resultChan := hc.Subscribe()
   455  
   456  	hc.AddTablet(tablet)
   457  
   458  	// Immediately after AddTablet() there will be the first notification.
   459  	want := &TabletHealth{
   460  		Tablet:               tablet,
   461  		Target:               &querypb.Target{Keyspace: "k", Shard: "s"},
   462  		Serving:              false,
   463  		PrimaryTermStartTime: 0,
   464  	}
   465  	result := <-resultChan
   466  	mustMatch(t, want, result, "Wrong TabletHealth data")
   467  
   468  	// one tablet after receiving a StreamHealthResponse
   469  	shr := &querypb.StreamHealthResponse{
   470  		TabletAlias:                         tablet.Alias,
   471  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   472  		Serving:                             true,
   473  		TabletExternallyReparentedTimestamp: 0,
   474  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   475  	}
   476  	want = &TabletHealth{
   477  		Tablet:  tablet,
   478  		Target:  &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   479  		Serving: true,
   480  		Stats:   &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   481  
   482  		PrimaryTermStartTime: 0,
   483  	}
   484  	input <- shr
   485  	result = <-resultChan
   486  	mustMatch(t, want, result, "Wrong TabletHealth data")
   487  
   488  	// Change input to distinguish between stats sent before and after Close().
   489  	shr.TabletExternallyReparentedTimestamp = 11
   490  	// Close the healthcheck. Tablet connections are closed asynchronously and
   491  	// Close() will block until all Go routines (one per connection) are done.
   492  	assert.Nil(t, hc.Close(), "Close returned error")
   493  	// Try to send more updates. They should be ignored and nothing should change
   494  	input <- shr
   495  
        	// Anything received here must be nil; otherwise the healthcheck is still
        	// publishing. Only 1ms is allowed for a late notification to appear.
   496  	select {
   497  	case result = <-resultChan:
   498  		assert.Nil(t, result, "healthCheck still running after Close(): received result: %v", result)
   499  	case <-time.After(1 * time.Millisecond):
   500  		// No response after timeout. Success.
   501  	}
   502  
        	// healthByAlias is read under hc.mu.
   503  	hc.mu.Lock()
   504  	defer hc.mu.Unlock()
   505  	assert.Nil(t, hc.healthByAlias, "health data should be nil")
   506  }
   507  
        // TestHealthCheckTimeout sets a 500ms healthCheckTimeout, lets the fake
        // tablet go silent, and checks that a non-serving TabletHealth is published,
        // the error counter for k/s/REPLICA increases, the fake connection reports
        // being canceled, and the tablet leaves the healthy list. It repeats the wait
        // once and finally checks that a fresh response makes the tablet serving again.
   508  func TestHealthCheckTimeout(t *testing.T) {
   509  	// reset counters
   510  	hcErrorCounters.ResetAll()
   511  	ts := memorytopo.NewServer("cell")
   512  	hc := createTestHc(ts)
        	// Short timeout; the sleeps below are derived from this value.
   513  	hc.healthCheckTimeout = 500 * time.Millisecond
   514  	defer hc.Close()
   515  	tablet := createTestTablet(0, "cell", "a")
   516  	input := make(chan *querypb.StreamHealthResponse)
   517  	fc := createFakeConn(tablet, input)
   518  	resultChan := hc.Subscribe()
   519  	hc.AddTablet(tablet)
   520  	// Immediately after AddTablet() there will be the first notification.
   521  	want := &TabletHealth{
   522  		Tablet:               tablet,
   523  		Target:               &querypb.Target{Keyspace: "k", Shard: "s"},
   524  		Serving:              false,
   525  		PrimaryTermStartTime: 0,
   526  	}
   527  	result := <-resultChan
   528  	mustMatch(t, want, result, "Wrong TabletHealth data")
   529  
   530  	// one tablet after receiving a StreamHealthResponse
   531  	shr := &querypb.StreamHealthResponse{
   532  		TabletAlias:                         tablet.Alias,
   533  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   534  		Serving:                             true,
   535  		TabletExternallyReparentedTimestamp: 0,
   536  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   537  	}
   538  	want = &TabletHealth{
   539  		Tablet:               tablet,
   540  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   541  		Serving:              true,
   542  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   543  		PrimaryTermStartTime: 0,
   544  	}
   545  	input <- shr
   546  	result = <-resultChan
   547  	mustMatch(t, want, result, "Wrong TabletHealth data")
        	// No errors are expected to have been counted so far.
   548  	assert.Nil(t, checkErrorCounter("k", "s", topodatapb.TabletType_REPLICA, 0))
   549  
   550  	// wait for timeout period
        	// The sleep is healthCheckTimeout + 100ms (600ms with the value set above);
        	// the "1.1 * timeout" text in the log line below is only approximate.
   551  	time.Sleep(hc.healthCheckTimeout + 100*time.Millisecond)
   552  	t.Logf(`Sleep(1.1 * timeout)`)
   553  	result = <-resultChan
   554  	assert.False(t, result.Serving, "tabletHealthCheck: %+v; want not serving", result)
   555  	assert.Nil(t, checkErrorCounter("k", "s", topodatapb.TabletType_REPLICA, 1))
   556  	assert.True(t, fc.isCanceled(), "StreamHealth should be canceled after timeout, but is not")
   557  
   558  	// tablet should be removed from healthy list
   559  	a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   560  	assert.Empty(t, a, "wrong result, expected empty list")
   561  
   562  	// repeat the wait. It will timeout one more time trying to get the connection.
   563  	fc.resetCanceledFlag()
   564  	time.Sleep(hc.healthCheckTimeout)
   565  
   566  	result = <-resultChan
   567  	assert.False(t, result.Serving, "tabletHealthCheck: %+v; want not serving", result)
        	// The expected error count is now 2.
   568  	assert.Nil(t, checkErrorCounter("k", "s", topodatapb.TabletType_REPLICA, 2))
   569  	assert.True(t, fc.isCanceled(), "StreamHealth should be canceled again after timeout, but is not")
   570  
   571  	// send a healthcheck response, it should be serving again
   572  	fc.resetCanceledFlag()
        	// input is unbuffered, so this send blocks until the fake connection reads it.
   573  	input <- shr
   574  
   575  	// wait for the exponential backoff to wear off and health monitoring to resume.
   576  	result = <-resultChan
   577  	mustMatch(t, want, result, "Wrong TabletHealth data")
   578  }
   579  
        // TestWaitForAllServingTablets calls hc.WaitForAllServingTablets four times
        // with one shared context and checks that it:
        //   - fails before the tablet has reported any health,
        //   - succeeds once the tablet reports serving,
        //   - fails when a target in a keyspace without tablets is added,
        //   - succeeds for the same targets once KeyspacesToWatch is limited to the
        //     tablet's keyspace.
   580  func TestWaitForAllServingTablets(t *testing.T) {
   581  	ts := memorytopo.NewServer("cell")
   582  	hc := createTestHc(ts)
   583  	defer hc.Close()
   584  	tablet := createTestTablet(0, "cell", "a")
   585  	tablet.Type = topodatapb.TabletType_REPLICA
   586  	targets := []*querypb.Target{
   587  		{
   588  			Keyspace:   tablet.Keyspace,
   589  			Shard:      tablet.Shard,
   590  			TabletType: tablet.Type,
   591  		},
   592  	}
   593  	input := make(chan *querypb.StreamHealthResponse)
   594  	createFakeConn(tablet, input)
   595  
   596  	// create a channel and subscribe to healthcheck
   597  	resultChan := hc.Subscribe()
   598  	hc.AddTablet(tablet)
   599  	// there will be a first result, get and discard it
   600  	<-resultChan
   601  	// no StreamHealthResponse has been sent yet, so the first wait below is expected to fail
        	// NOTE: this context (1s timeout) is shared by every WaitForAllServingTablets call in the test.
   602  	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
   603  	defer cancel()
   604  
   605  	err := hc.WaitForAllServingTablets(ctx, targets)
   606  	assert.NotNil(t, err, "error should not be nil")
   607  
   608  	shr := &querypb.StreamHealthResponse{
   609  		TabletAlias:                         tablet.Alias,
   610  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   611  		Serving:                             true,
   612  		TabletExternallyReparentedTimestamp: 0,
   613  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   614  	}
   615  
   616  	input <- shr
   617  	<-resultChan
   618  	// check it's there
   619  
        	// NOTE(review): targets is rebuilt before every call even when its contents do
        	// not change; presumably WaitForAllServingTablets modifies the slice it is
        	// given — confirm before deduplicating these literals.
   620  	targets = []*querypb.Target{
   621  
   622  		{
   623  			Keyspace:   tablet.Keyspace,
   624  			Shard:      tablet.Shard,
   625  			TabletType: tablet.Type,
   626  		},
   627  	}
   628  
   629  	err = hc.WaitForAllServingTablets(ctx, targets)
   630  	assert.Nil(t, err, "error should be nil. Targets are found")
   631  
        	// Add a second target in "newkeyspace", for which no tablet was added.
   632  	targets = []*querypb.Target{
   633  
   634  		{
   635  			Keyspace:   tablet.Keyspace,
   636  			Shard:      tablet.Shard,
   637  			TabletType: tablet.Type,
   638  		},
   639  		{
   640  			Keyspace:   "newkeyspace",
   641  			Shard:      tablet.Shard,
   642  			TabletType: tablet.Type,
   643  		},
   644  	}
   645  
   646  	err = hc.WaitForAllServingTablets(ctx, targets)
   647  	assert.NotNil(t, err, "error should not be nil (there are no tablets on this keyspace")
   648  
   649  	targets = []*querypb.Target{
   650  
   651  		{
   652  			Keyspace:   tablet.Keyspace,
   653  			Shard:      tablet.Shard,
   654  			TabletType: tablet.Type,
   655  		},
   656  		{
   657  			Keyspace:   "newkeyspace",
   658  			Shard:      tablet.Shard,
   659  			TabletType: tablet.Type,
   660  		},
   661  	}
   662  
        	// KeyspacesToWatch is declared outside this file section. It is narrowed to
        	// the tablet's keyspace for the next call and set to an empty slice at the
        	// end of the test (not restored to whatever value it held before).
   663  	KeyspacesToWatch = []string{tablet.Keyspace}
   664  
   665  	err = hc.WaitForAllServingTablets(ctx, targets)
   666  	assert.Nil(t, err, "error should be nil. Keyspace with no tablets is filtered")
   667  
   668  	KeyspacesToWatch = []string{}
   669  }
   670  
   671  // TestRemoveTablet tests the behavior when a tablet goes away.
   672  func TestRemoveTablet(t *testing.T) {
   673  	ts := memorytopo.NewServer("cell")
   674  	hc := createTestHc(ts)
   675  	defer hc.Close()
   676  	tablet := createTestTablet(0, "cell", "a")
   677  	tablet.Type = topodatapb.TabletType_REPLICA
   678  	input := make(chan *querypb.StreamHealthResponse)
   679  	createFakeConn(tablet, input)
   680  
   681  	// create a channel and subscribe to healthcheck
   682  	resultChan := hc.Subscribe()
   683  	hc.AddTablet(tablet)
   684  	// there will be a first result, get and discard it
   685  	<-resultChan
   686  
   687  	shrReplica := &querypb.StreamHealthResponse{
   688  		TabletAlias:                         tablet.Alias,
   689  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   690  		Serving:                             true,
   691  		TabletExternallyReparentedTimestamp: 0,
   692  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   693  	}
   694  	want := []*TabletHealth{{
   695  		Tablet:               tablet,
   696  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   697  		Serving:              true,
   698  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   699  		PrimaryTermStartTime: 0,
   700  	}}
   701  	input <- shrReplica
   702  	<-resultChan
   703  	// check it's there
   704  	a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   705  	mustMatch(t, want, a, "unexpected result")
   706  
   707  	// delete the tablet
   708  	hc.RemoveTablet(tablet)
   709  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   710  	assert.Empty(t, a, "wrong result, expected empty list")
   711  
   712  	// Now confirm that when a tablet's type changes between when it's added to the
   713  	// cache and when it's removed, that the tablet is entirely removed from the
   714  	// cache since in the secondary maps it's keyed in part by tablet type.
   715  	// Note: we are using GetTabletStats here to check the healthData map (rather
   716  	// than the healthy map that we checked above) because that is the data
   717  	// structure that is used when printing the contents of the healthcheck cache
   718  	// in the /debug/status endpoint and in the SHOW VITESS_TABLETS; SQL command
   719  	// output.
   720  
   721  	// Add the tablet back.
   722  	hc.AddTablet(tablet)
   723  	// Receive and discard the initial result as we have not yet sent the first
   724  	// StreamHealthResponse with the dynamic serving and stats information.
   725  	<-resultChan
   726  	// Send the first StreamHealthResponse with the dynamic serving and stats
   727  	// information.
   728  	input <- shrReplica
   729  	<-resultChan
   730  	// Confirm it's there in the cache.
   731  	a = hc.GetTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   732  	mustMatch(t, want, a, "unexpected result")
   733  
   734  	// Change the tablet type to RDONLY.
   735  	tablet.Type = topodatapb.TabletType_RDONLY
   736  	shrRdonly := &querypb.StreamHealthResponse{
   737  		TabletAlias:                         tablet.Alias,
   738  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY},
   739  		Serving:                             true,
   740  		TabletExternallyReparentedTimestamp: 0,
   741  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 2, CpuUsage: 0.4},
   742  	}
   743  
   744  	// Now Replace it, which does a Remove and Add. The tablet should be removed
   745  	// from the cache and all its maps even though the tablet type had changed
   746  	// in-between the initial Add and Remove.
   747  	hc.ReplaceTablet(tablet, tablet)
   748  	// Receive and discard the initial result as we have not yet sent the first
   749  	// StreamHealthResponse with the dynamic serving and stats information.
   750  	<-resultChan
   751  	// Confirm that the old entry is gone.
   752  	a = hc.GetTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   753  	assert.Empty(t, a, "wrong result, expected empty list")
   754  	// Send the first StreamHealthResponse with the dynamic serving and stats
   755  	// information.
   756  	input <- shrRdonly
   757  	<-resultChan
   758  	// Confirm that the new entry is there in the cache.
   759  	want = []*TabletHealth{{
   760  		Tablet:               tablet,
   761  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY},
   762  		Serving:              true,
   763  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 2, CpuUsage: 0.4},
   764  		PrimaryTermStartTime: 0,
   765  	}}
   766  	a = hc.GetTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY})
   767  	mustMatch(t, want, a, "unexpected result")
   768  
   769  	// Delete the tablet, confirm again that it's gone in both tablet type
   770  	// forms.
   771  	hc.RemoveTablet(tablet)
   772  	a = hc.GetTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   773  	assert.Empty(t, a, "wrong result, expected empty list")
   774  	a = hc.GetTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_RDONLY})
   775  	assert.Empty(t, a, "wrong result, expected empty list")
   776  }
   777  
   778  // TestGetHealthyTablets tests the functionality of GetHealthyTabletStats.
   779  func TestGetHealthyTablets(t *testing.T) {
   780  	ts := memorytopo.NewServer("cell")
   781  	hc := createTestHc(ts)
   782  	defer hc.Close()
   783  	tablet := createTestTablet(0, "cell", "a")
   784  	tablet.Type = topodatapb.TabletType_REPLICA
   785  	input := make(chan *querypb.StreamHealthResponse)
   786  	createFakeConn(tablet, input)
   787  
   788  	// create a channel and subscribe to healthcheck
   789  	resultChan := hc.Subscribe()
   790  	hc.AddTablet(tablet)
   791  	// there will be a first result, get and discard it
   792  	<-resultChan
   793  	// empty
   794  	a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
   795  	assert.Empty(t, a, "wrong result, expected empty list")
   796  
   797  	shr := &querypb.StreamHealthResponse{
   798  		TabletAlias:                         tablet.Alias,
   799  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   800  		Serving:                             true,
   801  		TabletExternallyReparentedTimestamp: 0,
   802  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   803  	}
   804  	want := []*TabletHealth{{
   805  		Tablet:               tablet,
   806  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   807  		Serving:              true,
   808  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.2},
   809  		PrimaryTermStartTime: 0,
   810  	}}
   811  	input <- shr
   812  	<-resultChan
   813  	// check it's there
   814  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   815  	mustMatch(t, want, a, "unexpected result")
   816  
   817  	// update health with a change that won't change health array
   818  	shr = &querypb.StreamHealthResponse{
   819  		TabletAlias:                         tablet.Alias,
   820  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   821  		Serving:                             true,
   822  		TabletExternallyReparentedTimestamp: 0,
   823  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 2, CpuUsage: 0.2},
   824  	}
   825  	input <- shr
   826  	// wait for result before checking
   827  	<-resultChan
   828  	// check it's there
   829  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   830  	mustMatch(t, want, a, "unexpected result")
   831  
   832  	// update stats with a change that will change health array
   833  	shr = &querypb.StreamHealthResponse{
   834  		TabletAlias:                         tablet.Alias,
   835  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   836  		Serving:                             true,
   837  		TabletExternallyReparentedTimestamp: 0,
   838  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 35, CpuUsage: 0.2},
   839  	}
   840  	want = []*TabletHealth{{
   841  		Tablet:               tablet,
   842  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   843  		Serving:              true,
   844  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 35, CpuUsage: 0.2},
   845  		PrimaryTermStartTime: 0,
   846  	}}
   847  	input <- shr
   848  	// wait for result before checking
   849  	<-resultChan
   850  	// check it's there
   851  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   852  	mustMatch(t, want, a, "unexpected result")
   853  
   854  	// add a second tablet
   855  	tablet2 := createTestTablet(11, "cell", "host2")
   856  	tablet2.Type = topodatapb.TabletType_REPLICA
   857  	input2 := make(chan *querypb.StreamHealthResponse)
   858  	createFakeConn(tablet2, input2)
   859  	hc.AddTablet(tablet2)
   860  	// there will be a first result, get and discard it
   861  	<-resultChan
   862  
   863  	shr2 := &querypb.StreamHealthResponse{
   864  		TabletAlias:                         tablet2.Alias,
   865  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   866  		Serving:                             true,
   867  		TabletExternallyReparentedTimestamp: 0,
   868  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
   869  	}
   870  	want2 := []*TabletHealth{{
   871  		Tablet:               tablet,
   872  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   873  		Serving:              true,
   874  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 35, CpuUsage: 0.2},
   875  		PrimaryTermStartTime: 0,
   876  	}, {
   877  		Tablet:               tablet2,
   878  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   879  		Serving:              true,
   880  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
   881  		PrimaryTermStartTime: 0,
   882  	}}
   883  	input2 <- shr2
   884  	// wait for result
   885  	<-resultChan
   886  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   887  	assert.Equal(t, 2, len(a), "Wrong number of results")
   888  	if a[0].Tablet.Alias.Uid == 11 {
   889  		a[0], a[1] = a[1], a[0]
   890  	}
   891  	mustMatch(t, want2, a, "unexpected result")
   892  
   893  	shr2 = &querypb.StreamHealthResponse{
   894  		TabletAlias:                         tablet2.Alias,
   895  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
   896  		Serving:                             false,
   897  		TabletExternallyReparentedTimestamp: 0,
   898  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
   899  	}
   900  	input2 <- shr2
   901  	// wait for result
   902  	<-resultChan
   903  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   904  	assert.Equal(t, 1, len(a), "Wrong number of results")
   905  
   906  	// second tablet turns into a primary
   907  	shr2 = &querypb.StreamHealthResponse{
   908  		TabletAlias: tablet2.Alias,
   909  		Target:      &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   910  		Serving:     true,
   911  
   912  		TabletExternallyReparentedTimestamp: 10,
   913  
   914  		RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2},
   915  	}
   916  	input2 <- shr2
   917  	// wait for result
   918  	<-resultChan
   919  	// check we only have 1 healthy replica left
   920  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   921  	mustMatch(t, want, a, "unexpected result")
   922  
   923  	want2 = []*TabletHealth{{
   924  		Tablet:               tablet2,
   925  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   926  		Serving:              true,
   927  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2},
   928  		PrimaryTermStartTime: 10,
   929  	}}
   930  	// check we have a primary now
   931  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
   932  	mustMatch(t, want2, a, "unexpected result")
   933  
   934  	// reparent: old replica goes into primary
   935  	shr = &querypb.StreamHealthResponse{
   936  		TabletAlias:                         tablet.Alias,
   937  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   938  		Serving:                             true,
   939  		TabletExternallyReparentedTimestamp: 20,
   940  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2},
   941  	}
   942  	input <- shr
   943  	<-resultChan
   944  	want = []*TabletHealth{{
   945  		Tablet:               tablet,
   946  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   947  		Serving:              true,
   948  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2},
   949  		PrimaryTermStartTime: 20,
   950  	}}
   951  
   952  	// check we lost all replicas, and primary is new one
   953  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
   954  	assert.Empty(t, a, "Wrong number of results")
   955  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
   956  	mustMatch(t, want, a, "unexpected result")
   957  
   958  	// old primary sending an old ping should be ignored
   959  	input2 <- shr2
   960  	<-resultChan
   961  	a = hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
   962  	mustMatch(t, want, a, "unexpected result")
   963  }
   964  
   965  func TestPrimaryInOtherCell(t *testing.T) {
   966  	ts := memorytopo.NewServer("cell1", "cell2")
   967  	hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, "cell1", "cell1, cell2")
   968  	defer hc.Close()
   969  
   970  	// add a tablet as primary in different cell
   971  	tablet := createTestTablet(1, "cell2", "host1")
   972  	tablet.Type = topodatapb.TabletType_PRIMARY
   973  	input := make(chan *querypb.StreamHealthResponse)
   974  	fc := createFakeConn(tablet, input)
   975  	// create a channel and subscribe to healthcheck
   976  	resultChan := hc.Subscribe()
   977  	hc.AddTablet(tablet)
   978  	// should get a result, but this will hang if multi-cell logic is broken
   979  	// so wait and timeout
   980  	ticker := time.NewTicker(1 * time.Second)
   981  	select {
   982  	case err := <-fc.cbErrCh:
   983  		require.Fail(t, "Unexpected error: %v", err)
   984  	case <-resultChan:
   985  	case <-ticker.C:
   986  		require.Fail(t, "Timed out waiting for HealthCheck update")
   987  	}
   988  
   989  	shr := &querypb.StreamHealthResponse{
   990  		TabletAlias:                         tablet.Alias,
   991  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   992  		Serving:                             true,
   993  		TabletExternallyReparentedTimestamp: 20,
   994  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2},
   995  	}
   996  	want := &TabletHealth{
   997  		Tablet:               tablet,
   998  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
   999  		Serving:              true,
  1000  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.2},
  1001  		PrimaryTermStartTime: 20,
  1002  	}
  1003  
  1004  	input <- shr
  1005  	ticker = time.NewTicker(1 * time.Second)
  1006  	select {
  1007  	case err := <-fc.cbErrCh:
  1008  		require.Fail(t, "Unexpected error: %v", err)
  1009  	case got := <-resultChan:
  1010  		// check that we DO receive health check update for PRIMARY in other cell
  1011  		mustMatch(t, want, got, "Wrong TabletHealth data")
  1012  	case <-ticker.C:
  1013  		require.Fail(t, "Timed out waiting for HealthCheck update")
  1014  	}
  1015  
  1016  	// check that PRIMARY tablet from other cell IS in healthy tablet list
  1017  	a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
  1018  	require.Len(t, a, 1, "")
  1019  	mustMatch(t, want, a[0], "Expecting healthy primary")
  1020  }
  1021  
  1022  func TestReplicaInOtherCell(t *testing.T) {
  1023  	ts := memorytopo.NewServer("cell1", "cell2")
  1024  	hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, "cell1", "cell1, cell2")
  1025  	defer hc.Close()
  1026  
  1027  	// add a tablet as replica
  1028  	local := createTestTablet(1, "cell1", "host1")
  1029  	local.Type = topodatapb.TabletType_REPLICA
  1030  	input := make(chan *querypb.StreamHealthResponse)
  1031  	fc := createFakeConn(local, input)
  1032  	// create a channel and subscribe to healthcheck
  1033  	resultChan := hc.Subscribe()
  1034  	hc.AddTablet(local)
  1035  
  1036  	ticker := time.NewTicker(1 * time.Second)
  1037  	select {
  1038  	case err := <-fc.cbErrCh:
  1039  		require.Fail(t, "Unexpected error: %v", err)
  1040  	case <-resultChan:
  1041  	case <-ticker.C:
  1042  		require.Fail(t, "Timed out waiting for HealthCheck update")
  1043  	}
  1044  
  1045  	shr := &querypb.StreamHealthResponse{
  1046  		TabletAlias:                         local.Alias,
  1047  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
  1048  		Serving:                             true,
  1049  		TabletExternallyReparentedTimestamp: 0,
  1050  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
  1051  	}
  1052  	want := &TabletHealth{
  1053  		Tablet:               local,
  1054  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
  1055  		Serving:              true,
  1056  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
  1057  		PrimaryTermStartTime: 0,
  1058  	}
  1059  
  1060  	input <- shr
  1061  	ticker = time.NewTicker(1 * time.Second)
  1062  	select {
  1063  	case err := <-fc.cbErrCh:
  1064  		require.Fail(t, "Unexpected error: %v", err)
  1065  	case got := <-resultChan:
  1066  		// check that we DO receive health check update for REPLICA in other cell
  1067  		mustMatch(t, want, got, "Wrong TabletHealth data")
  1068  	case <-ticker.C:
  1069  		require.Fail(t, "Timed out waiting for HealthCheck update")
  1070  	}
  1071  
  1072  	// add a tablet as replica in different cell
  1073  	remote := createTestTablet(2, "cell2", "host2")
  1074  	remote.Type = topodatapb.TabletType_REPLICA
  1075  	input2 := make(chan *querypb.StreamHealthResponse)
  1076  	fc2 := createFakeConn(remote, input2)
  1077  	// create a channel and subscribe to healthcheck
  1078  	resultChan2 := hc.Subscribe()
  1079  	hc.AddTablet(remote)
  1080  	// should get a result, but this will hang if multi-cell logic is broken
  1081  	// so wait and timeout
  1082  	ticker = time.NewTicker(1 * time.Second)
  1083  	select {
  1084  	case err := <-fc2.cbErrCh:
  1085  		require.Fail(t, "Unexpected error: %v", err)
  1086  	case <-resultChan2:
  1087  	case <-ticker.C:
  1088  		require.Fail(t, "Timed out waiting for HealthCheck update")
  1089  	}
  1090  
  1091  	shr2 := &querypb.StreamHealthResponse{
  1092  		TabletAlias:                         remote.Alias,
  1093  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
  1094  		Serving:                             true,
  1095  		TabletExternallyReparentedTimestamp: 0,
  1096  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
  1097  	}
  1098  	want2 := &TabletHealth{
  1099  		Tablet:               remote,
  1100  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
  1101  		Serving:              true,
  1102  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
  1103  		PrimaryTermStartTime: 0,
  1104  	}
  1105  
  1106  	input2 <- shr2
  1107  	ticker = time.NewTicker(1 * time.Second)
  1108  	select {
  1109  	case err := <-fc.cbErrCh:
  1110  		require.Fail(t, "Unexpected error: %v", err)
  1111  	case got := <-resultChan2:
  1112  		// check that we DO receive health check update for REPLICA in other cell
  1113  		mustMatch(t, want2, got, "Wrong TabletHealth data")
  1114  	case <-ticker.C:
  1115  		require.Fail(t, "Timed out waiting for HealthCheck update")
  1116  	}
  1117  
  1118  	// check that only REPLICA tablet from cell1 is in healthy tablet list
  1119  	a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
  1120  	require.Len(t, a, 1, "")
  1121  	mustMatch(t, want, a[0], "Expecting healthy local replica")
  1122  }
  1123  
  1124  func TestCellAliases(t *testing.T) {
  1125  	ts := memorytopo.NewServer("cell1", "cell2")
  1126  	hc := NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, "cell1", "cell1, cell2")
  1127  	defer hc.Close()
  1128  
  1129  	cellsAlias := &topodatapb.CellsAlias{
  1130  		Cells: []string{"cell1", "cell2"},
  1131  	}
  1132  	assert.Nil(t, ts.CreateCellsAlias(context.Background(), "region1", cellsAlias), "failed to create cell alias")
  1133  	defer deleteCellsAlias(t, ts, "region1")
  1134  
  1135  	// add a tablet as replica in diff cell, same region
  1136  	tablet := createTestTablet(1, "cell2", "host2")
  1137  	tablet.Type = topodatapb.TabletType_REPLICA
  1138  	input := make(chan *querypb.StreamHealthResponse)
  1139  	fc := createFakeConn(tablet, input)
  1140  	// create a channel and subscribe to healthcheck
  1141  	resultChan := hc.Subscribe()
  1142  	hc.AddTablet(tablet)
  1143  	// should get a result, but this will hang if cell alias logic is broken
  1144  	// so wait and timeout
  1145  	ticker := time.NewTicker(1 * time.Second)
  1146  	select {
  1147  	case err := <-fc.cbErrCh:
  1148  		require.Fail(t, "Unexpected error: %v", err)
  1149  	case <-resultChan:
  1150  	case <-ticker.C:
  1151  		require.Fail(t, "Timed out waiting for HealthCheck update")
  1152  	}
  1153  
  1154  	shr := &querypb.StreamHealthResponse{
  1155  		TabletAlias:                         tablet.Alias,
  1156  		Target:                              &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
  1157  		Serving:                             true,
  1158  		TabletExternallyReparentedTimestamp: 0,
  1159  		RealtimeStats:                       &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
  1160  	}
  1161  	want := []*TabletHealth{{
  1162  		Tablet:               tablet,
  1163  		Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
  1164  		Serving:              true,
  1165  		Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 10, CpuUsage: 0.2},
  1166  		PrimaryTermStartTime: 0,
  1167  	}}
  1168  
  1169  	input <- shr
  1170  	ticker = time.NewTicker(1 * time.Second)
  1171  	select {
  1172  	case err := <-fc.cbErrCh:
  1173  		require.Fail(t, "Unexpected error: %v", err)
  1174  	case <-resultChan:
  1175  	case <-ticker.C:
  1176  		require.Fail(t, "Timed out waiting for HealthCheck update")
  1177  	}
  1178  
  1179  	// check it's there
  1180  	a := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA})
  1181  	mustMatch(t, want, a, "Wrong TabletHealth data")
  1182  }
  1183  
  1184  func TestHealthCheckChecksGrpcPort(t *testing.T) {
  1185  	ts := memorytopo.NewServer("cell")
  1186  	hc := createTestHc(ts)
  1187  	defer hc.Close()
  1188  
  1189  	tablet := createTestTablet(0, "cell", "a")
  1190  	tablet.PortMap["grpc"] = 0
  1191  	resultChan := hc.Subscribe()
  1192  
  1193  	// AddTablet should not add the tablet because port is 0
  1194  	hc.AddTablet(tablet)
  1195  
  1196  	select {
  1197  	case result := <-resultChan:
  1198  		assert.Nil(t, result, "healthCheck received result: %v", result)
  1199  	case <-time.After(2 * time.Millisecond):
  1200  		// No response after timeout. Success.
  1201  	}
  1202  }
  1203  
  1204  func TestTemplate(t *testing.T) {
  1205  	TabletURLTemplateString = "http://{{.GetTabletHostPort}}"
  1206  	ParseTabletURLTemplateFromFlag()
  1207  
  1208  	tablet := topo.NewTablet(0, "cell", "a")
  1209  	ts := []*TabletHealth{
  1210  		{
  1211  			Tablet:               tablet,
  1212  			Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
  1213  			Serving:              false,
  1214  			Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3},
  1215  			PrimaryTermStartTime: 0,
  1216  		},
  1217  	}
  1218  	tcs := &TabletsCacheStatus{
  1219  		Cell:         "cell",
  1220  		Target:       &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
  1221  		TabletsStats: ts,
  1222  	}
  1223  	templ := template.New("").Funcs(status.StatusFuncs)
  1224  	templ, err := templ.Parse(HealthCheckTemplate)
  1225  	require.Nil(t, err, "error parsing template: %v", err)
  1226  	wr := &bytes.Buffer{}
  1227  	err = templ.Execute(wr, []*TabletsCacheStatus{tcs})
  1228  	require.Nil(t, err, "error executing template: %v", err)
  1229  }
  1230  
  1231  func TestDebugURLFormatting(t *testing.T) {
  1232  	TabletURLTemplateString = "https://{{.GetHostNameLevel 0}}.bastion.{{.Tablet.Alias.Cell}}.corp"
  1233  	ParseTabletURLTemplateFromFlag()
  1234  
  1235  	tablet := topo.NewTablet(0, "cell", "host.dc.domain")
  1236  	ts := []*TabletHealth{
  1237  		{
  1238  			Tablet:               tablet,
  1239  			Target:               &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
  1240  			Serving:              false,
  1241  			Stats:                &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.3},
  1242  			PrimaryTermStartTime: 0,
  1243  		},
  1244  	}
  1245  	tcs := &TabletsCacheStatus{
  1246  		Cell:         "cell",
  1247  		Target:       &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
  1248  		TabletsStats: ts,
  1249  	}
  1250  	templ := template.New("").Funcs(status.StatusFuncs)
  1251  	templ, err := templ.Parse(HealthCheckTemplate)
  1252  	require.Nil(t, err, "error parsing template")
  1253  	wr := &bytes.Buffer{}
  1254  	err = templ.Execute(wr, []*TabletsCacheStatus{tcs})
  1255  	require.Nil(t, err, "error executing template")
  1256  	expectedURL := `"https://host.bastion.cell.corp"`
  1257  	require.Contains(t, wr.String(), expectedURL, "output missing formatted URL")
  1258  }
  1259  
  1260  func tabletDialer(tablet *topodatapb.Tablet, _ grpcclient.FailFast) (queryservice.QueryService, error) {
  1261  	connMapMu.Lock()
  1262  	defer connMapMu.Unlock()
  1263  
  1264  	key := TabletToMapKey(tablet)
  1265  	if qs, ok := connMap[key]; ok {
  1266  		return qs, nil
  1267  	}
  1268  	return nil, fmt.Errorf("tablet %v not found", key)
  1269  }
  1270  
  1271  func createTestHc(ts *topo.Server) *HealthCheckImpl {
  1272  	return NewHealthCheck(context.Background(), 1*time.Millisecond, time.Hour, ts, "cell", "")
  1273  }
  1274  
  1275  type fakeConn struct {
  1276  	queryservice.QueryService
  1277  	tablet *topodatapb.Tablet
  1278  	// If fixedResult is set, the channels are not used.
  1279  	fixedResult *querypb.StreamHealthResponse
  1280  	// hcChan should be an unbuffered channel which holds the tablet's next health response.
  1281  	hcChan chan *querypb.StreamHealthResponse
  1282  	// errCh is either an unbuffered channel which holds the stream error to return, or nil.
  1283  	errCh chan error
  1284  	// cbErrCh is a channel which receives errors returned from the supplied callback.
  1285  	cbErrCh chan error
  1286  
  1287  	mu       sync.Mutex
  1288  	canceled bool
  1289  }
  1290  
  1291  func createFakeConn(tablet *topodatapb.Tablet, c chan *querypb.StreamHealthResponse) *fakeConn {
  1292  	connMapMu.Lock()
  1293  	defer connMapMu.Unlock()
  1294  	key := TabletToMapKey(tablet)
  1295  	conn := &fakeConn{
  1296  		QueryService: fakes.ErrorQueryService,
  1297  		tablet:       tablet,
  1298  		hcChan:       c,
  1299  		cbErrCh:      make(chan error, 1),
  1300  	}
  1301  	connMap[key] = conn
  1302  	return conn
  1303  }
  1304  
  1305  // StreamHealth implements queryservice.QueryService.
  1306  func (fc *fakeConn) StreamHealth(ctx context.Context, callback func(shr *querypb.StreamHealthResponse) error) error {
  1307  	if fc.fixedResult != nil {
  1308  		return callback(fc.fixedResult)
  1309  	}
  1310  	for {
  1311  		select {
  1312  		case shr := <-fc.hcChan:
  1313  			if err := callback(shr); err != nil {
  1314  				if err == io.EOF {
  1315  					return nil
  1316  				}
  1317  				select {
  1318  				case fc.cbErrCh <- err:
  1319  				case <-ctx.Done():
  1320  				}
  1321  				return err
  1322  			}
  1323  		case err := <-fc.errCh:
  1324  			return err
  1325  		case <-ctx.Done():
  1326  			fc.mu.Lock()
  1327  			fc.canceled = true
  1328  			fc.mu.Unlock()
  1329  			return nil
  1330  		}
  1331  	}
  1332  }
  1333  
  1334  func (fc *fakeConn) isCanceled() bool {
  1335  	fc.mu.Lock()
  1336  	defer fc.mu.Unlock()
  1337  	return fc.canceled
  1338  }
  1339  
  1340  func (fc *fakeConn) resetCanceledFlag() {
  1341  	fc.mu.Lock()
  1342  	defer fc.mu.Unlock()
  1343  	fc.canceled = false
  1344  }
  1345  
  1346  func checkErrorCounter(keyspace, shard string, tabletType topodatapb.TabletType, want int64) error {
  1347  	statsKey := []string{keyspace, shard, topoproto.TabletTypeLString(tabletType)}
  1348  	name := strings.Join(statsKey, ".")
  1349  	got, ok := hcErrorCounters.Counts()[name]
  1350  	if !ok {
  1351  		return fmt.Errorf("hcErrorCounters not correctly initialized")
  1352  	}
  1353  	if got != want {
  1354  		return fmt.Errorf("wrong value for hcErrorCounters got = %v, want = %v", got, want)
  1355  	}
  1356  	return nil
  1357  }
  1358  
  1359  func createFixedHealthConn(tablet *topodatapb.Tablet, fixedResult *querypb.StreamHealthResponse) *fakeConn {
  1360  	key := TabletToMapKey(tablet)
  1361  	conn := &fakeConn{
  1362  		QueryService: fakes.ErrorQueryService,
  1363  		tablet:       tablet,
  1364  		fixedResult:  fixedResult,
  1365  	}
  1366  	connMapMu.Lock()
  1367  	defer connMapMu.Unlock()
  1368  	connMap[key] = conn
  1369  	return conn
  1370  }
  1371  
  1372  func createTestTablet(uid uint32, cell, host string) *topodatapb.Tablet {
  1373  	tablet := topo.NewTablet(uid, cell, host)
  1374  	tablet.PortMap["vt"] = 1
  1375  	tablet.PortMap["grpc"] = 2
  1376  	tablet.Keyspace = "k"
  1377  	tablet.Shard = "s"
  1378  	return tablet
  1379  }
  1380  
  1381  var mustMatch = utils.MustMatchFn(".Conn" /* ignored fields*/)
  1382  
  1383  func deleteCellsAlias(t *testing.T, ts *topo.Server, alias string) {
  1384  	if err := ts.DeleteCellsAlias(context.Background(), alias); err != nil {
  1385  		t.Logf("DeleteCellsAlias(%s) failed: %v", alias, err)
  1386  	}
  1387  }