go.etcd.io/etcd@v3.3.27+incompatible/clientv3/integration/network_partition_test.go

// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build !cluster_proxy

package integration

import (
	"context"
	"errors"
	"testing"
	"time"

	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
	pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
	"github.com/coreos/etcd/integration"
	"github.com/coreos/etcd/pkg/testutil"
	"google.golang.org/grpc"
)

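// errExpected is returned by the test operations below to signal that the
// observed error is an expected failure caused by the injected partition.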
var errExpected = errors.New("expected error")

// TestBalancerUnderNetworkPartitionPut tests that when one member becomes
// isolated, the first Put request fails, and the following retry succeeds
// after the client balancer switches to another endpoint.
func TestBalancerUnderNetworkPartitionPut(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Put(ctx, "a", "b")
		if isClientTimeout(err) || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout {
			return errExpected
		}
		return err
	}, time.Second)
}

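// TestBalancerUnderNetworkPartitionDelete is the Delete counterpart of the Put
// test above: the first Delete is expected to time out against the isolated
// member, and a retry succeeds once the balancer moves to a healthy endpoint.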
func TestBalancerUnderNetworkPartitionDelete(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Delete(ctx, "a")
		if isClientTimeout(err) || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout {
			return errExpected
		}
		return err
	}, time.Second)
}

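// TestBalancerUnderNetworkPartitionTxn exercises the same scenario with a
// transaction; the write Txn must reach the leader, so the first attempt is
// expected to time out until the balancer switches endpoints.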
func TestBalancerUnderNetworkPartitionTxn(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Txn(ctx).
			If(clientv3.Compare(clientv3.Version("foo"), "=", 0)).
			Then(clientv3.OpPut("foo", "bar")).
			Else(clientv3.OpPut("foo", "baz")).Commit()
		if isClientTimeout(err) || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout {
			return errExpected
		}
		return err
	}, time.Second)
}

// TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout tests that
// when one member becomes isolated, the first linearizable Get request still
// succeeds, because the timeout is long enough to cover the endpoint switch.
func TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "a")
		if err == rpctypes.ErrTimeout {
			return errExpected
		}
		return err
	}, 7*time.Second)
}

// TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout tests that
// when one member becomes isolated, the first linearizable Get request fails,
// and the following retry succeeds after the balancer switches to another endpoint.
func TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "a")
		if isClientTimeout(err) || isServerCtxTimeout(err) {
			return errExpected
		}
		return err
	}, time.Second)
}

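// TestBalancerUnderNetworkPartitionSerializableGet ensures that a serializable
// Get is served locally by the pinned (isolated) member, so it succeeds even
// while that member is partitioned from the rest of the cluster.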
func TestBalancerUnderNetworkPartitionSerializableGet(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "a", clientv3.WithSerializable())
		return err
	}, time.Second)
}

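// testBalancerUnderNetworkPartition starts a 3-member cluster, pins the client
// to the first member, isolates that member from the other two, and then
// retries op (with the given per-attempt timeout) until it succeeds; op maps
// the errors it considers acceptable during the partition to errExpected.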
func testBalancerUnderNetworkPartition(t *testing.T, op func(*clientv3.Client, context.Context) error, timeout time.Duration) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)

	eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}

	// expect the client to pin eps[0]
	ccfg := clientv3.Config{
		Endpoints:   []string{eps[0]},
		DialTimeout: 3 * time.Second,
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	}
	cli, err := clientv3.New(ccfg)
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()

	// wait for eps[0] to be pinned
	mustWaitPinReady(t, cli)

	// add the other endpoints for the later endpoint switch
	cli.SetEndpoints(eps...)
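	// brief pause so the client can pick up the newly added endpoints
	// before the partition is injected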
	time.Sleep(time.Second * 2)
	clus.Members[0].InjectPartition(t, clus.Members[1:]...)

	for i := 0; i < 5; i++ {
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		err = op(cli, ctx)
		cancel()
		if err == nil {
			break
		}
		if err != errExpected {
			t.Errorf("#%d: expected '%v', got '%v'", i, errExpected, err)
		}
		// give enough time for endpoint switch
		// TODO: remove random sleep by syncing directly with balancer
		if i == 0 {
			time.Sleep(5 * time.Second)
		}
	}
	if err != nil {
		t.Errorf("balancer did not switch in time (%v)", err)
	}
}

// TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection ensures the
// balancer switches endpoints when the leader fails and a linearizable Get
// request returns "etcdserver: request timed out".
func TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection(t *testing.T) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)
	eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}

	lead := clus.WaitLeader(t)

	timeout := 3 * clus.Members[(lead+1)%2].ServerConfig.ReqTimeout()

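	// pin an endpoint other than the leader's; for a 3-member cluster,
	// (lead+1)%2 always differs from lead, so this picks a follower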
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{eps[(lead+1)%2]},
		DialTimeout: 2 * time.Second,
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()

	// add the leader's endpoint to the list, so that when the originally
	// pinned endpoint fails the client can switch to another available one
	cli.SetEndpoints(eps[lead], eps[(lead+1)%2])

	// isolate leader
	clus.Members[lead].InjectPartition(t, clus.Members[(lead+1)%3], clus.Members[(lead+2)%3])

	// expect the balancer to round-robin to the leader within two attempts
	for i := 0; i < 2; i++ {
		ctx, cancel := context.WithTimeout(context.TODO(), timeout)
		_, err = cli.Get(ctx, "a")
		cancel()
		if err == nil {
			break
		}
	}
	if err != nil {
		t.Fatal(err)
	}
}

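// TestBalancerUnderNetworkPartitionWatchLeader and
// TestBalancerUnderNetworkPartitionWatchFollower run the watch test below
// with the leader and a follower isolated, respectively.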
func TestBalancerUnderNetworkPartitionWatchLeader(t *testing.T) {
	testBalancerUnderNetworkPartitionWatch(t, true)
}

func TestBalancerUnderNetworkPartitionWatchFollower(t *testing.T) {
	testBalancerUnderNetworkPartitionWatch(t, false)
}

// testBalancerUnderNetworkPartitionWatch ensures that a watch stream to a
// partitioned node is closed when the watch context requires a leader.
func testBalancerUnderNetworkPartitionWatch(t *testing.T, isolateLeader bool) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)

	eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}

	target := clus.WaitLeader(t)
	if !isolateLeader {
		target = (target + 1) % 3
	}

	// pin eps[target]
	watchCli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{eps[target]},
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer watchCli.Close()

	// wait for eps[target] to be pinned
	mustWaitPinReady(t, watchCli)

	// add all eps to the list, so that when the originally pinned one fails
	// the client can switch to other available eps
	watchCli.SetEndpoints(eps...)

	wch := watchCli.Watch(clientv3.WithRequireLeader(context.Background()), "foo", clientv3.WithCreatedNotify())
	select {
	case <-wch:
	case <-time.After(integration.RequestWaitTimeout):
		t.Fatal("took too long to create watch")
	}

	// isolate eps[target]
	clus.Members[target].InjectPartition(t,
		clus.Members[(target+1)%3],
		clus.Members[(target+2)%3],
	)

	select {
	case ev := <-wch:
		if len(ev.Events) != 0 {
			t.Fatal("expected no event")
		}
		if err = ev.Err(); err != rpctypes.ErrNoLeader {
			t.Fatalf("expected %v, got %v", rpctypes.ErrNoLeader, err)
		}
	case <-time.After(integration.RequestWaitTimeout): // enough time to detect leader lost
		t.Fatal("took too long to detect leader lost")
	}
}

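// TestDropReadUnderNetworkPartition ensures that a linearizable Get sent to a
// follower while the leader is isolated first fails with ErrLeaderChanged, and
// that subsequent reads succeed once a new leader is established on the
// majority side of the partition.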
func TestDropReadUnderNetworkPartition(t *testing.T) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)
	leaderIndex := clus.WaitLeader(t)
	// get a follower endpoint
	eps := []string{clus.Members[(leaderIndex+1)%3].GRPCAddr()}
	ccfg := clientv3.Config{
		Endpoints:   eps,
		DialTimeout: 10 * time.Second,
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	}
	cli, err := clientv3.New(ccfg)
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()

	// wait for eps[0] to be pinned
	mustWaitPinReady(t, cli)

	// re-apply the endpoint list (eps contains only the follower endpoint here)
	cli.SetEndpoints(eps...)
	time.Sleep(time.Second * 2)
	conn, err := cli.Dial(clus.Members[(leaderIndex+1)%3].GRPCAddr())
	if err != nil {
		t.Fatal(err)
	}
	defer conn.Close()

	clus.Members[leaderIndex].InjectPartition(t, clus.Members[(leaderIndex+1)%3], clus.Members[(leaderIndex+2)%3])
	kvc := clientv3.NewKVFromKVClient(pb.NewKVClient(conn), nil)
	ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second)
	_, err = kvc.Get(ctx, "a")
	cancel()
	if err.Error() != rpctypes.ErrLeaderChanged.Error() {
		t.Fatalf("expected %v, got %v", rpctypes.ErrLeaderChanged, err)
	}

	for i := 0; i < 5; i++ {
		ctx, cancel = context.WithTimeout(context.TODO(), 10*time.Second)
		_, err = kvc.Get(ctx, "a")
		cancel()
		if err != nil {
			if err == rpctypes.ErrTimeout {
				<-time.After(time.Second)
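				// note: i is also advanced by the loop's post statement,
				// so each timeout consumes two of the five attempts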
				i++
				continue
			}
			t.Fatalf("expected nil or timeout, got %v", err)
		}
		// No error returned and no retry required
		break
	}
}