go.etcd.io/etcd@v3.3.27+incompatible/clientv3/integration/network_partition_test.go

// Copyright 2017 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build !cluster_proxy

package integration

import (
	"context"
	"errors"
	"testing"
	"time"

	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
	pb "github.com/coreos/etcd/etcdserver/etcdserverpb"
	"github.com/coreos/etcd/integration"
	"github.com/coreos/etcd/pkg/testutil"
	"google.golang.org/grpc"
)

var errExpected = errors.New("expected error")

// TestBalancerUnderNetworkPartitionPut tests that when one member becomes
// isolated, the first Put request fails, and the following retry succeeds
// once the client balancer switches to another endpoint.
func TestBalancerUnderNetworkPartitionPut(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Put(ctx, "a", "b")
		if isClientTimeout(err) || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout {
			return errExpected
		}
		return err
	}, time.Second)
}

func TestBalancerUnderNetworkPartitionDelete(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Delete(ctx, "a")
		if isClientTimeout(err) || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout {
			return errExpected
		}
		return err
	}, time.Second)
}

func TestBalancerUnderNetworkPartitionTxn(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Txn(ctx).
			If(clientv3.Compare(clientv3.Version("foo"), "=", 0)).
			Then(clientv3.OpPut("foo", "bar")).
			Else(clientv3.OpPut("foo", "baz")).Commit()
		if isClientTimeout(err) || isServerCtxTimeout(err) || err == rpctypes.ErrTimeout {
			return errExpected
		}
		return err
	}, time.Second)
}

// TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout tests that
// when one member becomes isolated, the first linearizable (quorum) Get request
// succeeds by switching endpoints within the timeout, which is long enough to
// cover the endpoint switch.
func TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "a")
		if err == rpctypes.ErrTimeout {
			return errExpected
		}
		return err
	}, 7*time.Second)
}
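
// Note: isClientTimeout, isServerCtxTimeout, and mustWaitPinReady are test
// helpers defined elsewhere in this package, not in this file. Purely as an
// illustrative sketch (not the actual implementation), a client-side timeout
// classifier of the kind used above could look roughly like the following,
// assuming the "google.golang.org/grpc/codes" and
// "google.golang.org/grpc/status" imports:
//
//	// isClientTimeoutSketch is a hypothetical helper for illustration only.
//	func isClientTimeoutSketch(err error) bool {
//		if err == nil {
//			return false
//		}
//		if err == context.DeadlineExceeded {
//			return true
//		}
//		// gRPC surfaces client-observed deadlines as codes.DeadlineExceeded.
//		ev, ok := status.FromError(err)
//		return ok && ev.Code() == codes.DeadlineExceeded
//	}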

// TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout tests that
// when one member becomes isolated, the first linearizable (quorum) Get request
// fails, and the following retry succeeds once the client balancer switches to
// another endpoint.
func TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "a")
		if isClientTimeout(err) || isServerCtxTimeout(err) {
			return errExpected
		}
		return err
	}, time.Second)
}

func TestBalancerUnderNetworkPartitionSerializableGet(t *testing.T) {
	testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error {
		_, err := cli.Get(ctx, "a", clientv3.WithSerializable())
		return err
	}, time.Second)
}

func testBalancerUnderNetworkPartition(t *testing.T, op func(*clientv3.Client, context.Context) error, timeout time.Duration) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)

	eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}

	// expect to pin eps[0]
	ccfg := clientv3.Config{
		Endpoints:   []string{eps[0]},
		DialTimeout: 3 * time.Second,
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	}
	cli, err := clientv3.New(ccfg)
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()

	// wait for eps[0] to be pinned
	mustWaitPinReady(t, cli)

	// add other endpoints for the later endpoint switch
	cli.SetEndpoints(eps...)
	time.Sleep(time.Second * 2)
	clus.Members[0].InjectPartition(t, clus.Members[1:]...)

	for i := 0; i < 5; i++ {
		ctx, cancel := context.WithTimeout(context.Background(), timeout)
		err = op(cli, ctx)
		cancel()
		if err == nil {
			break
		}
		if err != errExpected {
			t.Errorf("#%d: expected '%v', got '%v'", i, errExpected, err)
		}
		// give enough time for the endpoint switch
		// TODO: remove random sleep by syncing directly with balancer
		if i == 0 {
			time.Sleep(5 * time.Second)
		}
	}
	if err != nil {
		t.Errorf("balancer did not switch in time (%v)", err)
	}
}
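
// mustWaitPinReady (defined elsewhere in this package) blocks until the
// client's single configured endpoint is serving requests. As a minimal
// sketch, assuming it only needs to confirm that one request can complete
// against the pinned endpoint, it might look like:
//
//	// mustWaitPinReadySketch is a hypothetical helper for illustration only.
//	func mustWaitPinReadySketch(t *testing.T, cli *clientv3.Client) {
//		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
//		_, err := cli.Get(ctx, "foo")
//		cancel()
//		if err != nil {
//			t.Fatal(err)
//		}
//	}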

// TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection ensures the
// balancer switches endpoints when the leader fails and a linearizable Get
// request returns "etcdserver: request timed out".
func TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection(t *testing.T) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)
	eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}

	lead := clus.WaitLeader(t)

	timeout := 3 * clus.Members[(lead+1)%2].ServerConfig.ReqTimeout()

	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{eps[(lead+1)%2]},
		DialTimeout: 2 * time.Second,
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()

	// add all eps to the list, so that when the originally pinned one fails
	// the client can switch to other available eps
	cli.SetEndpoints(eps[lead], eps[(lead+1)%2])

	// isolate leader
	clus.Members[lead].InjectPartition(t, clus.Members[(lead+1)%3], clus.Members[(lead+2)%3])

	// expect the balancer to round-robin to the leader within two attempts
	for i := 0; i < 2; i++ {
		ctx, cancel := context.WithTimeout(context.TODO(), timeout)
		_, err = cli.Get(ctx, "a")
		cancel()
		if err == nil {
			break
		}
	}
	if err != nil {
		t.Fatal(err)
	}
}

func TestBalancerUnderNetworkPartitionWatchLeader(t *testing.T) {
	testBalancerUnderNetworkPartitionWatch(t, true)
}

func TestBalancerUnderNetworkPartitionWatchFollower(t *testing.T) {
	testBalancerUnderNetworkPartitionWatch(t, false)
}

// testBalancerUnderNetworkPartitionWatch ensures that a watch stream to a
// partitioned node is closed when the watch context requires a leader.
func testBalancerUnderNetworkPartitionWatch(t *testing.T, isolateLeader bool) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)

	eps := []string{clus.Members[0].GRPCAddr(), clus.Members[1].GRPCAddr(), clus.Members[2].GRPCAddr()}

	target := clus.WaitLeader(t)
	if !isolateLeader {
		target = (target + 1) % 3
	}

	// pin eps[target]
	watchCli, err := clientv3.New(clientv3.Config{
		Endpoints:   []string{eps[target]},
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	})
	if err != nil {
		t.Fatal(err)
	}
	defer watchCli.Close()

	// wait for eps[target] to be pinned
	mustWaitPinReady(t, watchCli)

	// add all eps to the list, so that when the originally pinned one fails
	// the client can switch to other available eps
	watchCli.SetEndpoints(eps...)
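
	// clientv3.WithRequireLeader wraps the context so that the server fails
	// the watch stream with rpctypes.ErrNoLeader once it can no longer reach
	// a leader, which the partition injected below is expected to trigger.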
	wch := watchCli.Watch(clientv3.WithRequireLeader(context.Background()), "foo", clientv3.WithCreatedNotify())
	select {
	case <-wch:
	case <-time.After(integration.RequestWaitTimeout):
		t.Fatal("took too long to create watch")
	}

	// isolate eps[target]
	clus.Members[target].InjectPartition(t,
		clus.Members[(target+1)%3],
		clus.Members[(target+2)%3],
	)

	select {
	case ev := <-wch:
		if len(ev.Events) != 0 {
			t.Fatal("expected no event")
		}
		if err = ev.Err(); err != rpctypes.ErrNoLeader {
			t.Fatalf("expected %v, got %v", rpctypes.ErrNoLeader, err)
		}
	case <-time.After(integration.RequestWaitTimeout): // enough time to detect leader lost
		t.Fatal("took too long to detect leader lost")
	}
}

func TestDropReadUnderNetworkPartition(t *testing.T) {
	defer testutil.AfterTest(t)

	clus := integration.NewClusterV3(t, &integration.ClusterConfig{
		Size:               3,
		SkipCreatingClient: true,
	})
	defer clus.Terminate(t)
	leaderIndex := clus.WaitLeader(t)
	// get a follower endpoint
	eps := []string{clus.Members[(leaderIndex+1)%3].GRPCAddr()}
	ccfg := clientv3.Config{
		Endpoints:   eps,
		DialTimeout: 10 * time.Second,
		DialOptions: []grpc.DialOption{grpc.WithBlock()},
	}
	cli, err := clientv3.New(ccfg)
	if err != nil {
		t.Fatal(err)
	}
	defer cli.Close()

	// wait for eps[0] to be pinned
	mustWaitPinReady(t, cli)

	// add other endpoints for later endpoint switch
	cli.SetEndpoints(eps...)
	time.Sleep(time.Second * 2)
	conn, err := cli.Dial(clus.Members[(leaderIndex+1)%3].GRPCAddr())
	if err != nil {
		t.Fatal(err)
	}
	defer conn.Close()

	clus.Members[leaderIndex].InjectPartition(t, clus.Members[(leaderIndex+1)%3], clus.Members[(leaderIndex+2)%3])
	kvc := clientv3.NewKVFromKVClient(pb.NewKVClient(conn), nil)
	ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second)
	_, err = kvc.Get(ctx, "a")
	cancel()
	if err.Error() != rpctypes.ErrLeaderChanged.Error() {
		t.Fatalf("expected %v, got %v", rpctypes.ErrLeaderChanged, err)
	}

	for i := 0; i < 5; i++ {
		ctx, cancel = context.WithTimeout(context.TODO(), 10*time.Second)
		_, err = kvc.Get(ctx, "a")
		cancel()
		if err != nil {
			if err == rpctypes.ErrTimeout {
				<-time.After(time.Second)
				i++
				continue
			}
			t.Fatalf("expected nil or timeout, got %v", err)
		}
		// No error returned and no retry required
		break
	}
}
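
// These tests are compiled only when the "cluster_proxy" build tag is not set
// (see the build constraint at the top of the file). As a rough example, an
// invocation from a checkout of the etcd repository might look like:
//
//	go test -v -run 'TestBalancerUnderNetworkPartition|TestDropReadUnderNetworkPartition' ./clientv3/integration/...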