github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/integration/clientv3/connectivity/network_partition_test.go

// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build !cluster_proxy
// +build !cluster_proxy

package connectivity_test

import (
	"context"
	"errors"
	"testing"
	"time"

	pb "github.com/lfch/etcd-io/api/v3/etcdserverpb"
	"github.com/lfch/etcd-io/api/v3/v3rpc/rpctypes"
	"github.com/lfch/etcd-io/client/v3"
	integration2 "github.com/lfch/etcd-io/tests/v3/framework/integration"
	"github.com/lfch/etcd-io/tests/v3/integration/clientv3"
	"google.golang.org/grpc"
)

var errExpected = errors.New("expected error")

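// isErrorExpected reports whether err is one of the errors these tests expect
// while a member is partitioned: a client-side or server-side context timeout,
// or an etcd request timeout (possibly due to leader failure).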
func isErrorExpected(err error) bool {
	return clientv3test.IsClientTimeout(err) || clientv3test.IsServerCtxTimeout(err) ||
		err == rpctypes.ErrTimeout || err == rpctypes.ErrTimeoutDueToLeaderFail
}

// TestBalancerUnderNetworkPartitionPut tests that when one member becomes isolated,
// the first Put request fails, and the following retry succeeds once the client
// balancer has switched to another endpoint.
func TestBalancerUnderNetworkPartitionPut(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Put(ctx, "a", "b")
		if isErrorExpected(err) {
			return errExpected
		}
		return err
	}, time.Second)
}

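// TestBalancerUnderNetworkPartitionDelete exercises the same scenario as
// TestBalancerUnderNetworkPartitionPut, using a Delete request.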
func TestBalancerUnderNetworkPartitionDelete(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Delete(ctx, "a")
		if isErrorExpected(err) {
			return errExpected
		}
		return err
	}, time.Second)
}

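// TestBalancerUnderNetworkPartitionTxn exercises the same scenario as
// TestBalancerUnderNetworkPartitionPut, using a write Txn request.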
func TestBalancerUnderNetworkPartitionTxn(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Txn(ctx).
			If(clientv3.Compare(clientv3.Version("foo"), "=", 0)).
			Then(clientv3.OpPut("foo", "bar")).
			Else(clientv3.OpPut("foo", "baz")).Commit()
		if isErrorExpected(err) {
			return errExpected
		}
		return err
	}, time.Second)
}

// TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout tests that
// when one member becomes isolated, the first linearizable (quorum) Get request
// still succeeds, because the timeout is long enough to cover the endpoint switch.
func TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "a")
		if isErrorExpected(err) {
			return errExpected
		}
		return err
	}, 7*time.Second)
}

// TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout tests that
// when one member becomes isolated, the first linearizable (quorum) Get request
// fails, and the following retry succeeds once the client balancer has switched
// to another endpoint.
func TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "a")
		if clientv3test.IsClientTimeout(err) || clientv3test.IsServerCtxTimeout(err) {
			return errExpected
		}
		return err
	}, time.Second)
}

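// TestBalancerUnderNetworkPartitionSerializableGet tests serializable Get
// requests under the same partition; since serializable reads are served
// locally by the pinned member and do not require a quorum, no error is
// tolerated here.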
func TestBalancerUnderNetworkPartitionSerializableGet(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "a", clientv3.WithSerializable())
		return err
	}, time.Second)
}

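// testBalancerUnderNetworkPartition pins the client to the first member,
// expands the endpoint list to all members, isolates the first member from the
// others, and then retries op until it succeeds, tolerating only errExpected
// (the errors recognized by the caller) while the balancer switches to a
// healthy endpoint.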
func testBalancerUnderNetworkPartition(t *testing.T, op func(*clientv3.Client, context.Context) error, timeout time.Duration) {
	integration2.BeforeTest(t)

	clus := integration2.NewCluster(t, &integration2.ClusterConfig{
		Size: 3,
	})
	defer clus.Terminate(t)

	eps := []string{clus.Members[0].GRPCURL(), clus.Members[1].GRPCURL(), clus.Members[2].GRPCURL()}

	// expect to pin eps[0]
	ccfg := clientv3.Config{
		Endpoints:   []string{eps[0]},
		DialTimeout: 3 * time.Second,
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	}
	cli, err := integration2.NewClient(t, ccfg)
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()
	// wait for eps[0] to be pinned
	clientv3test.MustWaitPinReady(t, cli)

	// add the other endpoints for the later endpoint switch
	cli.SetEndpoints(eps...)
	time.Sleep(time.Second * 2)
	clus.Members[0].InjectPartition(t, clus.Members[1:]...)

	for i := 0; i < 5; i++ {
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		err = op(cli, ctx)
		t.Logf("Op returned error: %v", err)
		t.Log("Cancelling...")
		cancel()
		if err == nil {
			break
		}
		if err != errExpected {
			t.Errorf("#%d: expected '%v', got '%v'", i, errExpected, err)
		}
		// give enough time for endpoint switch
		// TODO: remove random sleep by syncing directly with balancer
		if i == 0 {
			time.Sleep(5 * time.Second)
		}
	}
	if err != nil {
		t.Errorf("balancer did not switch in time (%v)", err)
	}
}

// TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection ensures the
// balancer switches endpoints when the leader fails and a linearizable Get
// returns "etcdserver: request timed out".
func TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection(t *testing.T) {
	integration2.BeforeTest(t)

	clus := integration2.NewCluster(t, &integration2.ClusterConfig{
		Size: 3,
	})
	defer clus.Terminate(t)
	eps := []string{clus.Members[0].GRPCURL(), clus.Members[1].GRPCURL(), clus.Members[2].GRPCURL()}

	lead := clus.WaitLeader(t)

	timeout := 3 * clus.Members[(lead+1)%2].ServerConfig.ReqTimeout()

	cli, err := integration2.NewClient(t, clientv3.Config{
		Endpoints:   []string{eps[(lead+1)%2]},
		DialTimeout: 2 * time.Second,
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()

	// add the other eps to the list, so that when the originally pinned one
	// fails the client can switch to another available endpoint
	cli.SetEndpoints(eps[lead], eps[(lead+1)%2])

	// isolate leader
	clus.Members[lead].InjectPartition(t, clus.Members[(lead+1)%3], clus.Members[(lead+2)%3])

	// expect the balancer to round-robin to the leader within two attempts
	for i := 0; i < 2; i++ {
		ctx, cancel := context.WithTimeout(context.TODO(), timeout)
		_, err = cli.Get(ctx, "a")
		cancel()
		if err == nil {
			break
		}
	}
	if err != nil {
		t.Fatal(err)
	}
}

func TestBalancerUnderNetworkPartitionWatchLeader(t *testing.T) {
	testBalancerUnderNetworkPartitionWatch(t, true)
}

func TestBalancerUnderNetworkPartitionWatchFollower(t *testing.T) {
	testBalancerUnderNetworkPartitionWatch(t, false)
}

// testBalancerUnderNetworkPartitionWatch ensures that a watch stream
// to a partitioned node is closed when the context requires a leader.
func testBalancerUnderNetworkPartitionWatch(t *testing.T, isolateLeader bool) {
	integration2.BeforeTest(t)

	clus := integration2.NewCluster(t, &integration2.ClusterConfig{
		Size: 3,
	})
	defer clus.Terminate(t)

	eps := []string{clus.Members[0].GRPCURL(), clus.Members[1].GRPCURL(), clus.Members[2].GRPCURL()}

	target := clus.WaitLeader(t)
	if !isolateLeader {
		target = (target + 1) % 3
	}

	// pin eps[target]
	watchCli, err := integration2.NewClient(t, clientv3.Config{Endpoints: []string{eps[target]}})
	if err != nil {
		t.Fatal(err)
	}
	t.Logf("watchCli created to: %v", target)
	defer watchCli.Close()

	// wait for eps[target] to be connected
	clientv3test.MustWaitPinReady(t, watchCli)
	t.Logf("successful connection with server: %v", target)

	// We stick to the original endpoint, so when it fails we don't switch
	// under the covers to other available eps, but instead expose the failure
	// to the caller (test assertion).

	wch := watchCli.Watch(clientv3.WithRequireLeader(context.Background()), "foo", clientv3.WithCreatedNotify())
	select {
	case <-wch:
	case <-time.After(integration2.RequestWaitTimeout):
		t.Fatal("took too long to create watch")
	}

	t.Logf("watch established")

	// isolate eps[target]
	clus.Members[target].InjectPartition(t,
		clus.Members[(target+1)%3],
		clus.Members[(target+2)%3],
	)

	select {
	case ev := <-wch:
		if len(ev.Events) != 0 {
			t.Fatal("expected no event")
		}
		if err = ev.Err(); err != rpctypes.ErrNoLeader {
			t.Fatalf("expected %v, got %v", rpctypes.ErrNoLeader, err)
		}
	case <-time.After(integration2.RequestWaitTimeout): // enough time to detect leader lost
		t.Fatal("took too long to detect leader lost")
	}
}

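// TestDropReadUnderNetworkPartition ensures that a linearizable Get sent over
// a connection to a follower fails with ErrLeaderChanged once the leader is
// partitioned away, and that subsequent retries eventually succeed after a new
// leader is elected.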
func TestDropReadUnderNetworkPartition(t *testing.T) {
	integration2.BeforeTest(t)

	clus := integration2.NewCluster(t, &integration2.ClusterConfig{
		Size: 3,
	})
	defer clus.Terminate(t)
	leaderIndex := clus.WaitLeader(t)
	// get a follower endpoint
	eps := []string{clus.Members[(leaderIndex+1)%3].GRPCURL()}
	ccfg := clientv3.Config{
		Endpoints:   eps,
		DialTimeout: 10 * time.Second,
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	}
	cli, err := integration2.NewClient(t, ccfg)
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()

	// wait for eps[0] to be pinned
	clientv3test.MustWaitPinReady(t, cli)

	// re-apply the endpoint list (eps contains only the follower endpoint)
	cli.SetEndpoints(eps...)
	time.Sleep(time.Second * 2)
	conn, err := cli.Dial(clus.Members[(leaderIndex+1)%3].GRPCURL())
	if err != nil {
		t.Fatal(err)
	}
	defer conn.Close()

	clus.Members[leaderIndex].InjectPartition(t, clus.Members[(leaderIndex+1)%3], clus.Members[(leaderIndex+2)%3])
	kvc := clientv3.NewKVFromKVClient(pb.NewKVClient(conn), nil)
	ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second)
	_, err = kvc.Get(ctx, "a")
	cancel()
	if err != rpctypes.ErrLeaderChanged {
		t.Fatalf("expected %v, got %v", rpctypes.ErrLeaderChanged, err)
	}

	for i := 0; i < 5; i++ {
		ctx, cancel = context.WithTimeout(context.TODO(), 10*time.Second)
		_, err = kvc.Get(ctx, "a")
		cancel()
		if err != nil {
			if err == rpctypes.ErrTimeout {
				<-time.After(time.Second)
				i++
				continue
			}
			t.Fatalf("expected nil or timeout, got %v", err)
		}
		// No error returned and no retry required
		break
	}
}