github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/gossip_test.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver_test

import (
	"context"
	"fmt"
	"reflect"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
	"github.com/cockroachdb/cockroach/pkg/util"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/stretchr/testify/require"
)

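// TestGossipFirstRange verifies that the first range descriptor is
// re-gossiped when it changes: after each replica addition, after a lease
// transfer (where Redundant callbacks let us observe the unchanged
// descriptor being gossiped again), and after a replica removal.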
func TestGossipFirstRange(t *testing.T) {
	defer leaktest.AfterTest(t)()

	tc := testcluster.StartTestCluster(t, 3,
		base.TestClusterArgs{
			ReplicationMode: base.ReplicationManual,
		})
	defer tc.Stopper().Stop(context.Background())

	errors := make(chan error, 1)
	descs := make(chan *roachpb.RangeDescriptor)
	unregister := tc.Servers[0].Gossip().RegisterCallback(gossip.KeyFirstRangeDescriptor,
		func(_ string, content roachpb.Value) {
			var desc roachpb.RangeDescriptor
			if err := content.GetProto(&desc); err != nil {
				select {
				case errors <- err:
				default:
				}
			} else {
				select {
				case descs <- &desc:
				case <-time.After(45 * time.Second):
					t.Logf("had to drop descriptor %+v", desc)
				}
			}
		},
		// Redundant callbacks are required by this test: the Redundant option
		// makes the callback fire even when an unchanged descriptor is
		// gossiped again, e.g. after a lease transfer.
		gossip.Redundant,
	)
	// Unregister the callback before attempting to stop the stopper to prevent
	// deadlock. This is still flaky in theory since a callback can fire between
	// the last read from the channels and this unregister, but testing has
	// shown this solution to be sufficiently robust for now.
	defer unregister()

	// Wait for the specified descriptor to be gossiped for the first range. We
	// loop because the timing of replica addition and lease transfer can cause
	// extra gossiping of the first range.
	waitForGossip := func(desc roachpb.RangeDescriptor) {
		for {
			select {
			case err := <-errors:
				t.Fatal(err)
			case gossiped := <-descs:
				if reflect.DeepEqual(&desc, gossiped) {
					return
				}
				log.Infof(context.Background(), "expected\n%+v\nbut found\n%+v", desc, gossiped)
			}
		}
	}

	// Expect an initial callback of the first range descriptor.
	select {
	case err := <-errors:
		t.Fatal(err)
	case <-descs:
	}

	// Add two replicas. The first range descriptor should be gossiped after each
	// addition.
	var desc roachpb.RangeDescriptor
	firstRangeKey := keys.MinKey
	for i := 1; i <= 2; i++ {
		var err error
		if desc, err = tc.AddReplicas(firstRangeKey, tc.Target(i)); err != nil {
			t.Fatal(err)
		}
		waitForGossip(desc)
	}

	// Transfer the lease to a new node. This should cause the first range to be
	// gossiped again.
	if err := tc.TransferRangeLease(desc, tc.Target(1)); err != nil {
		t.Fatal(err)
	}
	waitForGossip(desc)

	// Remove a non-lease holder replica.
	desc, err := tc.RemoveReplicas(firstRangeKey, tc.Target(0))
	if err != nil {
		t.Fatal(err)
	}
	waitForGossip(desc)

	// TODO(peter): Re-enable or remove when we've resolved the discussion
	// about removing the lease-holder replica. See #7872.

	// // Remove the lease holder replica.
	// leaseHolder, err := tc.FindRangeLeaseHolder(desc, nil)
	// desc, err = tc.RemoveReplicas(firstRangeKey, leaseHolder)
	// if err != nil {
	// 	t.Fatal(err)
	// }
	// select {
	// case err := <-errors:
	// 	t.Fatal(err)
	// case gossiped := <-descs:
	// 	if !reflect.DeepEqual(desc, gossiped) {
	// 		t.Fatalf("expected\n%+v\nbut found\n%+v", desc, gossiped)
	// 	}
	// }
}
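
// waitForFirstRangeGossipOnce is a minimal sketch, not used by the tests in
// this file, distilling the callback pattern TestGossipFirstRange exercises:
// register for the first-range descriptor key, return the next gossiped
// descriptor, and unregister. The helper name is hypothetical; the gossip and
// roachpb calls are the same ones used above.
func waitForFirstRangeGossipOnce(g *gossip.Gossip) (*roachpb.RangeDescriptor, error) {
	descC := make(chan roachpb.RangeDescriptor, 1)
	errC := make(chan error, 1)
	unregister := g.RegisterCallback(gossip.KeyFirstRangeDescriptor,
		func(_ string, content roachpb.Value) {
			var desc roachpb.RangeDescriptor
			if err := content.GetProto(&desc); err != nil {
				select {
				case errC <- err:
				default:
				}
				return
			}
			select {
			case descC <- desc: // Buffered; drop if a descriptor is already pending.
			default:
			}
		})
	defer unregister()
	select {
	case desc := <-descC:
		return &desc, nil
	case err := <-errC:
		return nil, err
	case <-time.After(45 * time.Second):
		return nil, fmt.Errorf("timed out waiting for first-range gossip")
	}
}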

// TestGossipHandlesReplacedNode tests that we can shut down a node and
// replace it with a new node at the same address (simulating a node getting
// restarted after losing its data) without the cluster breaking.
func TestGossipHandlesReplacedNode(t *testing.T) {
	defer leaktest.AfterTest(t)()
	if testing.Short() {
		// As of Nov 2018 it takes 3.6s.
		t.Skip("short")
	}
	ctx := context.Background()

	// Shorten the raft tick interval and election timeout to make range leases
	// much shorter than normal. This keeps us from having to wait long for the
	// replaced node's leases to expire, while remaining long enough to avoid
	// flakes.
	serverArgs := base.TestServerArgs{
		Addr:     util.IsolatedTestAddr.String(),
		Insecure: true, // because our certs are only valid for 127.0.0.1
		RetryOptions: retry.Options{
			InitialBackoff: 10 * time.Millisecond,
			MaxBackoff:     50 * time.Millisecond,
		},
	}
	serverArgs.RaftTickInterval = 50 * time.Millisecond
	serverArgs.RaftElectionTimeoutTicks = 10
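	// With a 50ms tick and 10 election-timeout ticks, raft elections fire
	// after roughly 50ms * 10 = 500ms rather than the multi-second default,
	// which is what keeps the replaced node's leases short.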

	tc := testcluster.StartTestCluster(t, 3,
		base.TestClusterArgs{
			ServerArgs: serverArgs,
		})
	defer tc.Stopper().Stop(context.Background())

	// Take down the first node and replace it with a new one.
	oldNodeIdx := 0
	newServerArgs := serverArgs
	newServerArgs.Addr = tc.Servers[oldNodeIdx].ServingRPCAddr()
	newServerArgs.SQLAddr = tc.Servers[oldNodeIdx].ServingSQLAddr()
	newServerArgs.PartOfCluster = true
	newServerArgs.JoinAddr = tc.Servers[1].ServingRPCAddr()
	log.Infof(ctx, "stopping server %d", oldNodeIdx)
	tc.StopServer(oldNodeIdx)
	tc.AddServer(t, newServerArgs)

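	// Wait until every store, including the replacement node's, is visible in
	// gossip as observed from a surviving node.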
	tc.WaitForStores(t, tc.Server(1).GossipI().(*gossip.Gossip))

	// Ensure that all servers still running are responsive. If the two remaining
	// original nodes don't refresh their connection to the address of the first
	// node, they can get stuck here.
	for i, server := range tc.Servers {
		if i == oldNodeIdx {
			continue
		}
		kvClient := server.DB()
		if err := kvClient.Put(ctx, fmt.Sprintf("%d", i), i); err != nil {
			t.Errorf("failed Put to node %d: %+v", i, err)
		}
	}
}

// TestGossipAfterAbortOfSystemConfigTransactionAfterFailureDueToIntents tests
// that failures to gossip the system config due to intents are rectified once
// the blocking intents are aborted.
func TestGossipAfterAbortOfSystemConfigTransactionAfterFailureDueToIntents(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()

	tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{})
	defer tc.Stopper().Stop(ctx)
	require.NoError(t, tc.WaitForFullReplication())

	db := tc.Server(0).DB()

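	// txA lays down an intent on the system config span and is left open; txB
	// writes behind it and commits. txB's system-config gossip trigger cannot
	// fire until txA's intent is resolved.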
	txA := db.NewTxn(ctx, "a")
	txB := db.NewTxn(ctx, "b")

	require.NoError(t, txA.SetSystemConfigTrigger())
	require.NoError(t, txA.Put(
		ctx, keys.SystemSQLCodec.DescMetadataKey(1000), sqlbase.WrapDescriptor(&sqlbase.DatabaseDescriptor{})))

	require.NoError(t, txB.SetSystemConfigTrigger())
	require.NoError(t, txB.Put(
		ctx, keys.SystemSQLCodec.DescMetadataKey(2000), sqlbase.WrapDescriptor(&sqlbase.DatabaseDescriptor{})))

	const someTime = 10 * time.Millisecond
	// clearNotifications drains any pending notifications from ch so that the
	// selects below observe only new updates.
	clearNotifications := func(ch <-chan struct{}) {
		for {
			select {
			case <-ch:
			case <-time.After(someTime):
				return
			}
		}
	}
	systemConfChangeCh := tc.Server(0).GossipI().(*gossip.Gossip).RegisterSystemConfigChannel()
	clearNotifications(systemConfChangeCh)
	require.NoError(t, txB.Commit(ctx))
	select {
	case <-systemConfChangeCh:
		// This case is rare but does happen: the system config is gossiped in
		// a variety of situations (e.g. node liveness updates or lease
		// transfers), so we just let the test finish here. The important
		// thing is that the test usually reaches the next phase.
		t.Log("got unexpected update. This can happen for a variety of " +
			"reasons like lease transfers. The test is exiting without testing anything")
		return
	case <-time.After(someTime):
		// No update arrived, which is the expected (happy) case.
	}
	// Roll back the transaction whose intent blocked the earlier gossip
	// update, and make sure we now get a gossip notification.
	const aLongTime = 20 * someTime
	require.NoError(t, txA.Rollback(ctx))
	select {
	case <-systemConfChangeCh:
		// Got an update.
	case <-time.After(aLongTime):
		t.Fatal("expected update")
	}
}
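
// waitForSystemConfigUpdate is a minimal sketch, not used by the test above,
// of the notification pattern it exercises: block until the gossiped system
// config changes or a deadline expires. The helper name and signature are
// hypothetical; it relies only on the channel returned by
// RegisterSystemConfigChannel, as used above.
func waitForSystemConfigUpdate(ch <-chan struct{}, timeout time.Duration) error {
	select {
	case <-ch:
		return nil // The system config was re-gossiped.
	case <-time.After(timeout):
		return fmt.Errorf("no system config update within %s", timeout)
	}
}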