github.com/lfch/etcd-io/tests/v3@v3.0.0-20221004140520-eac99acd3e9d/integration/clientv3/connectivity/network_partition_test.go (about) 1 // Copyright 2017 The etcd Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //go:build !cluster_proxy 16 // +build !cluster_proxy 17 18 package connectivity_test 19 20 import ( 21 "context" 22 "errors" 23 "testing" 24 "time" 25 26 pb "github.com/lfch/etcd-io/api/v3/etcdserverpb" 27 "github.com/lfch/etcd-io/api/v3/v3rpc/rpctypes" 28 "github.com/lfch/etcd-io/client/v3" 29 integration2 "github.com/lfch/etcd-io/tests/v3/framework/integration" 30 "github.com/lfch/etcd-io/tests/v3/integration/clientv3" 31 "google.golang.org/grpc" 32 ) 33 34 var errExpected = errors.New("expected error") 35 36 func isErrorExpected(err error) bool { 37 return clientv3test.IsClientTimeout(err) || clientv3test.IsServerCtxTimeout(err) || 38 err == rpctypes.ErrTimeout || err == rpctypes.ErrTimeoutDueToLeaderFail 39 } 40 41 // TestBalancerUnderNetworkPartitionPut tests when one member becomes isolated, 42 // first Put request fails, and following retry succeeds with client balancer 43 // switching to others. 44 func TestBalancerUnderNetworkPartitionPut(t *testing.T) { 45 testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error { 46 _, err := cli.Put(ctx, "a", "b") 47 if isErrorExpected(err) { 48 return errExpected 49 } 50 return err 51 }, time.Second) 52 } 53 54 func TestBalancerUnderNetworkPartitionDelete(t *testing.T) { 55 testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error { 56 _, err := cli.Delete(ctx, "a") 57 if isErrorExpected(err) { 58 return errExpected 59 } 60 return err 61 }, time.Second) 62 } 63 64 func TestBalancerUnderNetworkPartitionTxn(t *testing.T) { 65 testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error { 66 _, err := cli.Txn(ctx). 67 If(clientv3.Compare(clientv3.Version("foo"), "=", 0)). 68 Then(clientv3.OpPut("foo", "bar")). 69 Else(clientv3.OpPut("foo", "baz")).Commit() 70 if isErrorExpected(err) { 71 return errExpected 72 } 73 return err 74 }, time.Second) 75 } 76 77 // TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout tests 78 // when one member becomes isolated, first quorum Get request succeeds 79 // by switching endpoints within the timeout (long enough to cover endpoint switch). 80 func TestBalancerUnderNetworkPartitionLinearizableGetWithLongTimeout(t *testing.T) { 81 testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error { 82 _, err := cli.Get(ctx, "a") 83 if isErrorExpected(err) { 84 return errExpected 85 } 86 return err 87 }, 7*time.Second) 88 } 89 90 // TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout tests 91 // when one member becomes isolated, first quorum Get request fails, 92 // and following retry succeeds with client balancer switching to others. 93 func TestBalancerUnderNetworkPartitionLinearizableGetWithShortTimeout(t *testing.T) { 94 testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error { 95 _, err := cli.Get(ctx, "a") 96 if clientv3test.IsClientTimeout(err) || clientv3test.IsServerCtxTimeout(err) { 97 return errExpected 98 } 99 return err 100 }, time.Second) 101 } 102 103 func TestBalancerUnderNetworkPartitionSerializableGet(t *testing.T) { 104 testBalancerUnderNetworkPartition(t, func(cli *clientv3.Client, ctx context.Context) error { 105 _, err := cli.Get(ctx, "a", clientv3.WithSerializable()) 106 return err 107 }, time.Second) 108 } 109 110 func testBalancerUnderNetworkPartition(t *testing.T, op func(*clientv3.Client, context.Context) error, timeout time.Duration) { 111 integration2.BeforeTest(t) 112 113 clus := integration2.NewCluster(t, &integration2.ClusterConfig{ 114 Size: 3, 115 }) 116 defer clus.Terminate(t) 117 118 eps := []string{clus.Members[0].GRPCURL(), clus.Members[1].GRPCURL(), clus.Members[2].GRPCURL()} 119 120 // expect pin eps[0] 121 ccfg := clientv3.Config{ 122 Endpoints: []string{eps[0]}, 123 DialTimeout: 3 * time.Second, 124 DialOptions: []grpc.DialOption{grpc.WithBlock()}, 125 } 126 cli, err := integration2.NewClient(t, ccfg) 127 if err != nil { 128 t.Fatal(err) 129 } 130 defer cli.Close() 131 // wait for eps[0] to be pinned 132 clientv3test.MustWaitPinReady(t, cli) 133 134 // add other endpoints for later endpoint switch 135 cli.SetEndpoints(eps...) 136 time.Sleep(time.Second * 2) 137 clus.Members[0].InjectPartition(t, clus.Members[1:]...) 138 139 for i := 0; i < 5; i++ { 140 ctx, cancel := context.WithTimeout(context.Background(), timeout) 141 err = op(cli, ctx) 142 t.Logf("Op returned error: %v", err) 143 t.Log("Cancelling...") 144 cancel() 145 if err == nil { 146 break 147 } 148 if err != errExpected { 149 t.Errorf("#%d: expected '%v', got '%v'", i, errExpected, err) 150 } 151 // give enough time for endpoint switch 152 // TODO: remove random sleep by syncing directly with balancer 153 if i == 0 { 154 time.Sleep(5 * time.Second) 155 } 156 } 157 if err != nil { 158 t.Errorf("balancer did not switch in time (%v)", err) 159 } 160 } 161 162 // TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection ensures balancer 163 // switches endpoint when leader fails and linearizable get requests returns 164 // "etcdserver: request timed out". 165 func TestBalancerUnderNetworkPartitionLinearizableGetLeaderElection(t *testing.T) { 166 integration2.BeforeTest(t) 167 168 clus := integration2.NewCluster(t, &integration2.ClusterConfig{ 169 Size: 3, 170 }) 171 defer clus.Terminate(t) 172 eps := []string{clus.Members[0].GRPCURL(), clus.Members[1].GRPCURL(), clus.Members[2].GRPCURL()} 173 174 lead := clus.WaitLeader(t) 175 176 timeout := 3 * clus.Members[(lead+1)%2].ServerConfig.ReqTimeout() 177 178 cli, err := integration2.NewClient(t, clientv3.Config{ 179 Endpoints: []string{eps[(lead+1)%2]}, 180 DialTimeout: 2 * time.Second, 181 DialOptions: []grpc.DialOption{grpc.WithBlock()}, 182 }) 183 if err != nil { 184 t.Fatal(err) 185 } 186 defer cli.Close() 187 188 // add all eps to list, so that when the original pined one fails 189 // the client can switch to other available eps 190 cli.SetEndpoints(eps[lead], eps[(lead+1)%2]) 191 192 // isolate leader 193 clus.Members[lead].InjectPartition(t, clus.Members[(lead+1)%3], clus.Members[(lead+2)%3]) 194 195 // expects balancer to round robin to leader within two attempts 196 for i := 0; i < 2; i++ { 197 ctx, cancel := context.WithTimeout(context.TODO(), timeout) 198 _, err = cli.Get(ctx, "a") 199 cancel() 200 if err == nil { 201 break 202 } 203 } 204 if err != nil { 205 t.Fatal(err) 206 } 207 } 208 209 func TestBalancerUnderNetworkPartitionWatchLeader(t *testing.T) { 210 testBalancerUnderNetworkPartitionWatch(t, true) 211 } 212 213 func TestBalancerUnderNetworkPartitionWatchFollower(t *testing.T) { 214 testBalancerUnderNetworkPartitionWatch(t, false) 215 } 216 217 // testBalancerUnderNetworkPartitionWatch ensures watch stream 218 // to a partitioned node be closed when context requires leader. 219 func testBalancerUnderNetworkPartitionWatch(t *testing.T, isolateLeader bool) { 220 integration2.BeforeTest(t) 221 222 clus := integration2.NewCluster(t, &integration2.ClusterConfig{ 223 Size: 3, 224 }) 225 defer clus.Terminate(t) 226 227 eps := []string{clus.Members[0].GRPCURL(), clus.Members[1].GRPCURL(), clus.Members[2].GRPCURL()} 228 229 target := clus.WaitLeader(t) 230 if !isolateLeader { 231 target = (target + 1) % 3 232 } 233 234 // pin eps[target] 235 watchCli, err := integration2.NewClient(t, clientv3.Config{Endpoints: []string{eps[target]}}) 236 if err != nil { 237 t.Fatal(err) 238 } 239 t.Logf("watchCli created to: %v", target) 240 defer watchCli.Close() 241 242 // wait for eps[target] to be connected 243 clientv3test.MustWaitPinReady(t, watchCli) 244 t.Logf("successful connection with server: %v", target) 245 246 // We stick to the original endpoint, so when the one fails we don't switch 247 // under the cover to other available eps, but expose the failure to the 248 // caller (test assertion). 249 250 wch := watchCli.Watch(clientv3.WithRequireLeader(context.Background()), "foo", clientv3.WithCreatedNotify()) 251 select { 252 case <-wch: 253 case <-time.After(integration2.RequestWaitTimeout): 254 t.Fatal("took too long to create watch") 255 } 256 257 t.Logf("watch established") 258 259 // isolate eps[target] 260 clus.Members[target].InjectPartition(t, 261 clus.Members[(target+1)%3], 262 clus.Members[(target+2)%3], 263 ) 264 265 select { 266 case ev := <-wch: 267 if len(ev.Events) != 0 { 268 t.Fatal("expected no event") 269 } 270 if err = ev.Err(); err != rpctypes.ErrNoLeader { 271 t.Fatalf("expected %v, got %v", rpctypes.ErrNoLeader, err) 272 } 273 case <-time.After(integration2.RequestWaitTimeout): // enough time to detect leader lost 274 t.Fatal("took too long to detect leader lost") 275 } 276 } 277 278 func TestDropReadUnderNetworkPartition(t *testing.T) { 279 integration2.BeforeTest(t) 280 281 clus := integration2.NewCluster(t, &integration2.ClusterConfig{ 282 Size: 3, 283 }) 284 defer clus.Terminate(t) 285 leaderIndex := clus.WaitLeader(t) 286 // get a follower endpoint 287 eps := []string{clus.Members[(leaderIndex+1)%3].GRPCURL()} 288 ccfg := clientv3.Config{ 289 Endpoints: eps, 290 DialTimeout: 10 * time.Second, 291 DialOptions: []grpc.DialOption{grpc.WithBlock()}, 292 } 293 cli, err := integration2.NewClient(t, ccfg) 294 if err != nil { 295 t.Fatal(err) 296 } 297 defer cli.Close() 298 299 // wait for eps[0] to be pinned 300 clientv3test.MustWaitPinReady(t, cli) 301 302 // add other endpoints for later endpoint switch 303 cli.SetEndpoints(eps...) 304 time.Sleep(time.Second * 2) 305 conn, err := cli.Dial(clus.Members[(leaderIndex+1)%3].GRPCURL()) 306 if err != nil { 307 t.Fatal(err) 308 } 309 defer conn.Close() 310 311 clus.Members[leaderIndex].InjectPartition(t, clus.Members[(leaderIndex+1)%3], clus.Members[(leaderIndex+2)%3]) 312 kvc := clientv3.NewKVFromKVClient(pb.NewKVClient(conn), nil) 313 ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second) 314 _, err = kvc.Get(ctx, "a") 315 cancel() 316 if err != rpctypes.ErrLeaderChanged { 317 t.Fatalf("expected %v, got %v", rpctypes.ErrLeaderChanged, err) 318 } 319 320 for i := 0; i < 5; i++ { 321 ctx, cancel = context.WithTimeout(context.TODO(), 10*time.Second) 322 _, err = kvc.Get(ctx, "a") 323 cancel() 324 if err != nil { 325 if err == rpctypes.ErrTimeout { 326 <-time.After(time.Second) 327 i++ 328 continue 329 } 330 t.Fatalf("expected nil or timeout, got %v", err) 331 } 332 // No error returned and no retry required 333 break 334 } 335 }