go.etcd.io/etcd@v3.3.27+incompatible/clientv3/integration/server_shutdown_test.go

// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package integration

import (
	"bytes"
	"context"
	"strings"
	"testing"
	"time"

	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
	"github.com/coreos/etcd/integration"
	"github.com/coreos/etcd/pkg/testutil"

	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)
// TestBalancerUnderServerShutdownWatch expects that the watch client
// switches its endpoints when the member of the pinned endpoint fails.
func TestBalancerUnderServerShutdownWatch(t *testing.T) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)

	eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}

	lead := clus.WaitLeader(t)

	// pin eps[lead]
	watchCli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[lead]}})
	if err != nil {
		t.Fatal(err)
	}
	defer watchCli.Close()

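	// mustWaitPinReady is a helper defined elsewhere in this test package; it
	// issues a request on the single configured endpoint and fails the test if
	// it does not complete in time, i.e. it blocks until that endpoint is pinned.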
	// wait for eps[lead] to be pinned
	mustWaitPinReady(t, watchCli)

	// add all eps to the list, so that when the originally pinned one fails
	// the client can switch to other available eps
	watchCli.SetEndpoints(eps...)

	key, val := "foo", "bar"
	wch := watchCli.Watch(context.Background(), key, clientv3.WithCreatedNotify())
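	// WithCreatedNotify makes the server send an empty "created" response once
	// the watcher is registered; the select below waits for that notification.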
	select {
	case <-wch:
	case <-time.After(integration.RequestWaitTimeout):
		t.Fatal("took too long to create watch")
	}

	donec := make(chan struct{})
	go func() {
		defer close(donec)

		// switch to others when eps[lead] is shut down
		select {
		case ev := <-wch:
			if werr := ev.Err(); werr != nil {
				t.Error(werr)
			}
			if len(ev.Events) != 1 {
				t.Errorf("expected one event, got %+v", ev)
			}
			if !bytes.Equal(ev.Events[0].Kv.Value, []byte(val)) {
				t.Errorf("expected %q, got %+v", val, ev.Events[0].Kv)
			}
		case <-time.After(7 * time.Second):
			t.Error("took too long to receive events")
		}
	}()

	// shut down eps[lead]
	clus.Members[lead].Terminate(t)

	// writes to eps[(lead+1)%3]
	putCli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{eps[(lead+1)%3]},
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer putCli.Close()
	for {
		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
		_, err = putCli.Put(ctx, key, val)
		cancel()
		if err == nil {
			break
		}
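		// retry on timeouts: the cluster may still be electing a new leader
		// after eps[lead] was terminated, so a Put can time out on the client
		// side or the server side before the write is applied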
		if isClientTimeout(err) || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout || err == rpctypes.ErrTimeoutDueToLeaderFail {
			continue
		}
		t.Fatal(err)
	}

	select {
	case <-donec:
	case <-time.After(5 * time.Second): // enough time for balancer switch
		t.Fatal("took too long to receive events")
	}
}

func TestBalancerUnderServerShutdownPut(t *testing.T) {
	testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Put(ctx, "foo", "bar")
		return err
	})
}

func TestBalancerUnderServerShutdownDelete(t *testing.T) {
	testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Delete(ctx, "foo")
		return err
	})
}

func TestBalancerUnderServerShutdownTxn(t *testing.T) {
	testBalancerUnderServerShutdownMutable(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Txn(ctx).
			If(clientv3.Compare(clientv3.Version("foo"), "=", 0)).
			Then(clientv3.OpPut("foo", "bar")).
			Else(clientv3.OpPut("foo", "baz")).Commit()
		return err
	})
}

// testBalancerUnderServerShutdownMutable expects that when the member of
// the pinned endpoint is shut down, the balancer switches its endpoints
// and all subsequent put/delete/txn requests succeed with new endpoints.
func testBalancerUnderServerShutdownMutable(t *testing.T, op func(*clientv3.Client, context.Context) error) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)

	eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}

	// pin eps[0]
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{eps[0]},
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()

	// wait for eps[0] to be pinned
	mustWaitPinReady(t, cli)

	// add all eps to the list, so that when the originally pinned one fails
	// the client can switch to other available eps
	cli.SetEndpoints(eps...)

	// shut down eps[0]
	clus.Members[0].Terminate(t)

	// give the balancer time to switch to another endpoint now that eps[0]
	// is shut down, so that the following request succeeds
	// TODO: remove this (expose client connection state?)
	time.Sleep(time.Second)

	cctx, ccancel := context.WithTimeout(context.Background(), time.Second)
	err = op(cli, cctx)
	ccancel()
	if err != nil {
		t.Fatal(err)
	}
}

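// A linearizable Get must be served through the current leader (which may
// itself need to be re-elected after a member is shut down), so it gets a
// longer timeout than the serializable variant below, which any member can
// answer from its local store.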
func TestBalancerUnderServerShutdownGetLinearizable(t *testing.T) {
	testBalancerUnderServerShutdownImmutable(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "foo")
		return err
	}, 7*time.Second) // give enough time for leader election, balancer switch
}

func TestBalancerUnderServerShutdownGetSerializable(t *testing.T) {
	testBalancerUnderServerShutdownImmutable(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "foo", clientv3.WithSerializable())
		return err
	}, 2*time.Second)
}

// testBalancerUnderServerShutdownImmutable expects that when the member of
// the pinned endpoint is shut down, the balancer switches its endpoints
// and all subsequent range requests succeed with new endpoints.
func testBalancerUnderServerShutdownImmutable(t *testing.T, op func(*clientv3.Client, context.Context) error, timeout time.Duration) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)

	eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}

	// pin eps[0]
	cli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[0]}})
	if err != nil {
		t.Fatalf("failed to create client: %v", err)
	}
	defer cli.Close()

	// wait for eps[0] to be pinned
	mustWaitPinReady(t, cli)

	// add all eps to the list, so that when the originally pinned one fails
	// the client can switch to other available eps
	cli.SetEndpoints(eps...)

	// shut down eps[0]
	clus.Members[0].Terminate(t)

	// the balancer switches to another endpoint once eps[0] is explicitly shut
	// down, so the following request should succeed
	cctx, ccancel := context.WithTimeout(context.Background(), timeout)
	err = op(cli, cctx)
	ccancel()
	if err != nil {
		t.Errorf("range request did not finish in time: %v (timeout %v)", err, timeout)
	}
}

func TestBalancerUnderServerStopInflightLinearizableGetOnRestart(t *testing.T) {
	tt := []pinTestOpt{
		{pinLeader: true, stopPinFirst: true},
		{pinLeader: true, stopPinFirst: false},
		{pinLeader: false, stopPinFirst: true},
		{pinLeader: false, stopPinFirst: false},
	}
	for i := range tt {
		testBalancerUnderServerStopInflightRangeOnRestart(t, true, tt[i])
	}
}

func TestBalancerUnderServerStopInflightSerializableGetOnRestart(t *testing.T) {
	tt := []pinTestOpt{
		{pinLeader: true, stopPinFirst: true},
		{pinLeader: true, stopPinFirst: false},
		{pinLeader: false, stopPinFirst: true},
		{pinLeader: false, stopPinFirst: false},
	}
	for i := range tt {
		testBalancerUnderServerStopInflightRangeOnRestart(t, false, tt[i])
	}
}

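// pinTestOpt controls which member the client pins first (the leader or a
// follower) and the order in which the members are stopped.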
type pinTestOpt struct {
	pinLeader    bool
	stopPinFirst bool
}

// testBalancerUnderServerStopInflightRangeOnRestart expects that an
// in-flight range request reconnects and succeeds after the server restarts.
func testBalancerUnderServerStopInflightRangeOnRestart(t *testing.T, linearizable bool, opt pinTestOpt) {
	defer testutil.AfterTest(t)

	cfg := &integration.ClusterConfig{
		Size:               2,
		SkipCreatingClient: true,
	}
	if linearizable {
		cfg.Size = 3
	}

	clus := integration.NewClusterV3(t, cfg)
	defer clus.Terminate(t)
	eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr()}
	if linearizable {
		eps = append(eps, clus.Members[2].GRPCAddr())
	}

	lead := clus.WaitLeader(t)

	target := lead
	if !opt.pinLeader {
		target = (target + 1) % 2
	}

	// pin eps[target]
	cli, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[target]}})
	if err != nil {
		t.Fatalf("failed to create client: %v", err)
	}
	defer cli.Close()

	// wait for eps[target] to be pinned
	mustWaitPinReady(t, cli)

	// add all eps to the list, so that when the originally pinned one fails
	// the client can switch to other available eps
	cli.SetEndpoints(eps...)

	if opt.stopPinFirst {
		clus.Members[target].Stop(t)
		// give some time for balancer switch before stopping the other
		time.Sleep(time.Second)
		clus.Members[(target+1)%2].Stop(t)
	} else {
		clus.Members[(target+1)%2].Stop(t)
		// the balancer cannot pin the other member since it is already stopped
		clus.Members[target].Stop(t)
	}

	// 3 seconds is the minimum interval between an endpoint being marked
	// unhealthy and being removed from the unhealthy list, so it can take
	// more than 5 seconds to unpin and repin an endpoint
	// TODO: decrease timeout once the balancer switch is rewritten
	clientTimeout := 7 * time.Second

	var gops []clientv3.OpOption
	if !linearizable {
		gops = append(gops, clientv3.WithSerializable())
	}

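	// readyc lets the main goroutine know that the in-flight Get is about to
	// be issued, so it can restart the stopped member while the request is pending.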
	donec, readyc := make(chan struct{}), make(chan struct{}, 1)
	go func() {
		defer close(donec)
		ctx, cancel := context.WithTimeout(context.TODO(), clientTimeout)
		readyc <- struct{}{}

		// TODO: the new gRPC load balancer will not pin to an endpoint
		// as intended by this test, but it will round-robin to another
		// member within two attempts.
		// Remove the retry loop once the new gRPC load balancer provides retries.
		for i := 0; i < 2; i++ {
			_, err = cli.Get(ctx, "abc", gops...)
			if err == nil {
				break
			}
		}
		cancel()
		if err != nil {
			t.Errorf("unexpected error: %v", err)
		}
	}()

	<-readyc
	clus.Members[target].Restart(t)

	select {
	case <-time.After(clientTimeout + integration.RequestWaitTimeout):
		t.Fatalf("timed out waiting for Get [linearizable: %v, opt: %+v]", linearizable, opt)
	case <-donec:
	}
}

// isServerCtxTimeout reports whether the error looks like a server-side
// context timeout: e.g. due to clock drift on the server, the request context
// can time out on the server side while the original client-side context has
// not yet expired.
func isServerCtxTimeout(err error) bool {
	if err == nil {
		return false
	}
	ev, ok := status.FromError(err)
	if !ok {
		return false
	}
	code := ev.Code()
	return code == codes.DeadlineExceeded && strings.Contains(err.Error(), "context deadline exceeded")
}

// In grpc v1.11.3+ dial timeouts can error out with transport.ErrConnClosing. Previously dial timeouts
// would always error out with context.DeadlineExceeded.
func isClientTimeout(err error) bool {
	if err == nil {
		return false
	}
	if err == context.DeadlineExceeded {
		return true
	}
	ev, ok := status.FromError(err)
	if !ok {
		return false
	}
	code := ev.Code()
	return code == codes.DeadlineExceeded
}

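// isCanceled reports whether the error is context.Canceled or carries the
// gRPC status code codes.Canceled.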
func isCanceled(err error) bool {
	if err == nil {
		return false
	}
	if err == context.Canceled {
		return true
	}
	ev, ok := status.FromError(err)
	if !ok {
		return false
	}
	code := ev.Code()
	return code == codes.Canceled
}

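// isUnavailable reports whether the error carries the gRPC status code
// codes.Unavailable; a canceled client context is treated the same way here.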
func isUnavailable(err error) bool {
	if err == nil {
		return false
	}
	if err == context.Canceled {
		return true
	}
	ev, ok := status.FromError(err)
	if !ok {
		return false
	}
	code := ev.Code()
	return code == codes.Unavailable
}